From 687c2267d4dfb69138483f90deb4d4f5921a2965 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 15 Oct 2021 13:48:39 -0700 Subject: [PATCH] use irange for loops (#66234) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/66234 Modified loops in files under fbsource/fbcode/caffe2/ from the format `for(TYPE var=x0;var #include +#include #include #include @@ -97,7 +98,7 @@ static at::Tensor newAtTensor( std::vector shapeVec{}; shapeVec.reserve(rank); auto numel = 1; - for (auto i = 0; i < rank; ++i) { + for (const auto i : c10::irange(rank)) { shapeVec.push_back(shapeArr[i]); numel *= shapeArr[i]; } @@ -521,7 +522,7 @@ at::IValue JIValue::JIValueToAtIValue( std::vector elements; elements.reserve(n); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto jivalue_element = jarray->getElement(i); auto element = JIValue::JIValueToAtIValue(jivalue_element); elements.push_back(std::move(element)); @@ -535,7 +536,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArrayPinned.size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back(jArrayPinned[i]); } return at::IValue{std::move(list)}; @@ -547,7 +548,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArrayPinned.size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back(jArrayPinned[i]); } return at::IValue{std::move(list)}; @@ -559,7 +560,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArrayPinned.size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back(jArrayPinned[i]); } return at::IValue{std::move(list)}; @@ -572,7 +573,7 @@ at::IValue JIValue::JIValueToAtIValue( size_t n = jArray->size(); c10::List list{}; list.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { list.push_back( 
TensorHybrid::newAtTensorFromJTensor(jArray->getElement(i))); } @@ -594,7 +595,7 @@ at::IValue JIValue::JIValueToAtIValue( c10::impl::GenericList list{c10::unshapedType(first_element.type())}; list.reserve(n); list.push_back(first_element); - for (auto i = 1; i < n; ++i) { + for (const auto i : c10::irange(1, n)) { auto jivalue_element = jarray->getElement(i); auto element = JIValue::JIValueToAtIValue(jivalue_element); list.push_back(element); diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp index b67799672ec291..86fd1e2260f9ca 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni_lite.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -157,7 +158,7 @@ class PytorchJni : public facebook::jni::HybridClass { std::vector inputs{}; size_t n = jinputs->size(); inputs.reserve(n); - for (size_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); if (at::kVulkan == deviceType_) { inputs.push_back( @@ -186,7 +187,7 @@ class PytorchJni : public facebook::jni::HybridClass { std::vector inputs{}; size_t n = jinputs->size(); inputs.reserve(n); - for (size_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { at::IValue atIValue = JIValue::JIValueToAtIValue(jinputs->getElement(i)); if (at::kVulkan == deviceType_) { inputs.push_back( diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp index b2dcaa04b12c7d..e2292e1964e029 100644 --- a/aten/src/ATen/BatchingRegistrations.cpp +++ b/aten/src/ATen/BatchingRegistrations.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { @@ -329,7 +330,7 @@ Tensor permute_batching_rule(const Tensor& self, IntArrayRef dims) { VmapDimVector all_dims_physical; all_dims_physical.reserve(self_physical.tensor().dim()); - for 
(int64_t bdim = 0; bdim < self_physical.numBatchDims(); bdim++) { + for (const auto bdim : c10::irange(self_physical.numBatchDims())) { all_dims_physical.push_back(bdim); } all_dims_physical.insert( diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 229eaf48be5020..d2fb3ac96305fa 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -130,7 +131,7 @@ inline Tensor sort_strides(Tensor& tensor_) { IntArrayRef strides = tensor_.strides(); std::vector indices; indices.reserve(tensor_.ndimension()); - for (int64_t i = 0; i < tensor_.ndimension(); i++) { + for (const auto i : c10::irange(tensor_.ndimension())) { indices.push_back(i); } std::sort(indices.begin(), indices.end(), [&strides](int64_t i1, int64_t i2) { @@ -196,7 +197,7 @@ inline bool _all_equal_numel(at::ArrayRef tensors) { if (tensors.size() == 0) return true; int64_t all_numel = tensors[0].numel(); - for (size_t i = 1; i < tensors.size(); i++) { + for (const auto i : c10::irange(1, tensors.size())) { if (tensors[i].numel() != all_numel) return false; } diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index dd74df5895f7c6..1bd46299175a5e 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -349,7 +350,7 @@ static inline void manual_seed(uint64_t seed) { // available. In that case, we must not seed CUDA; it will fail! 
const auto num_gpus = detail::getCUDAHooks().getNumGPUs(); if (hasCUDA() && num_gpus > 0) { - for (int i = 0; i < num_gpus; i++) { + for (const auto i : c10::irange(num_gpus)) { auto cuda_gen = globalContext().defaultGenerator( Device(at::kCUDA, static_cast(i)) ); diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index 710c27170958af..35588ac62a29cc 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -197,7 +197,7 @@ std::vector infer_dense_strides(IntArrayRef tensor_sizes, IntArrayRef t // compute output strides which preserves the input tensor's memory layout std::vector out_strides(ndim); int64_t curr_stride = 1; - for (size_t i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { int64_t idx = perm[i]; out_strides[idx] = curr_stride; // Note: for size 0, we simply treated it as 1, it really doesn't matter here diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 3f15e778618b68..55a392c8d9cc3a 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -266,7 +267,7 @@ inline std::vector expand_outplace(TensorList to_expand) { // expands a list of Tensors; ignores undefined (null) tensors bool first = true; DimVector sizes; - for (size_t i = 0; i < to_expand.size(); ++i) { + for (const auto i : c10::irange(to_expand.size())) { if (!to_expand[i].defined()) { continue; } else if (first) { @@ -278,7 +279,7 @@ inline std::vector expand_outplace(TensorList to_expand) { } std::vector result(to_expand.size()); - for (size_t i = 0; i < to_expand.size(); ++i) { + for (const auto i : c10::irange(to_expand.size())) { if (!to_expand[i].defined()) { continue; } else if (to_expand[i].sizes().equals(sizes)) { @@ -299,7 +300,7 @@ static inline Tensor sum_to(Tensor tensor, const IntArrayRef shape) { c10::SmallVector reduce_dims; const at::IntArrayRef sizes = tensor.sizes(); const int64_t leading_dims = 
sizes.size() - shape.size(); - for (int64_t i = 0; i < leading_dims; ++i) { + for (const auto i : c10::irange(leading_dims)) { reduce_dims.push_back(i); } for (int64_t i = leading_dims; i < static_cast(sizes.size()); ++i) { @@ -320,7 +321,7 @@ static inline bool is_expandable_to(IntArrayRef shape, IntArrayRef desired) { if (ndim > target_dim) { return false; } - for (size_t i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { int64_t size = shape[ndim - i - 1]; int64_t target = desired[target_dim - i - 1]; if (size != target && size != 1) { diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp index 232fda6bac10f0..edeca5e4bac1a4 100644 --- a/aten/src/ATen/MemoryOverlap.cpp +++ b/aten/src/ATen/MemoryOverlap.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { @@ -17,7 +18,7 @@ MemOverlap has_internal_overlap(TensorImpl* t) { auto strides = t->strides(); auto sizes = t->sizes(); - for (size_t i = 0; i < strides.size(); ++i) { + for (const auto i : c10::irange(strides.size())) { if (strides[i] == 0 && sizes[i] > 1) { return MemOverlap::YES; } diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 782a266a8aeda3..24a85b4ce7085a 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -225,7 +225,7 @@ std::vector compute_squeeze_outnames(const Tensor& tensor) { } std::vector outnames; auto tensor_names = tensor.names(); - for (int64_t d = 0; d < tensor.dim(); d++) { + for (const auto d : c10::irange(tensor.dim())) { if (tensor.sizes()[d] != 1) { outnames.push_back(tensor_names[d]); } @@ -242,7 +242,7 @@ std::vector compute_diagonal_outnames( } std::vector outnames; auto tensor_names = tensor.names(); - for (int64_t d = 0; d < tensor.dim(); d++) { + for (const auto d : c10::irange(tensor.dim())) { if (d == dim1 || d == dim2) { continue; } diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 
753dbdb751e68e..bade0b26d54d81 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -6,6 +6,7 @@ #ifndef C10_MOBILE #include +#include #else #include #endif // C10_MOBILE @@ -87,7 +88,7 @@ TaskThreadPoolBase& _get_intraop_pool() { // `fn` will be called with params: (thread_pool_task_id, task_id). void _run_with_pool(const std::function& fn, size_t range) { #ifndef C10_MOBILE - for (size_t i = 1; i < range; ++i) { + for (const auto i : c10::irange(1, range)) { _get_intraop_pool().run([fn, i]() { fn((int)i, i); }); } // Run the first task on the current thread directly. diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index e2fc89a9db8498..78ebb25e15b1f5 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { struct TORCH_API SparseTensorImpl : public TensorImpl { @@ -109,7 +110,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { bool shrinking_dense_dim = false; auto sparse_size_original = sizes().slice(0, sparse_dim); auto sparse_size_new = size.slice(0, sparse_dim); - for (int64_t i = 0; i < sparse_dim; i++) { + for (const auto i : c10::irange(sparse_dim)) { if (sparse_size_new[i] < sparse_size_original[i]) { shrinking_sparse_dims = true; break; @@ -117,7 +118,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { } auto dense_size_original = sizes().slice(sparse_dim); auto dense_size_new = size.slice(sparse_dim); - for (int64_t i = 0; i < dense_dim; i++) { + for (const auto i : c10::irange(dense_dim)) { if (dense_size_new[i] < dense_size_original[i]) { shrinking_dense_dim = true; break; diff --git a/aten/src/ATen/SparseTensorUtils.cpp b/aten/src/ATen/SparseTensorUtils.cpp index 564eeda03c3daa..d5811b933e7ca5 100644 --- a/aten/src/ATen/SparseTensorUtils.cpp +++ b/aten/src/ATen/SparseTensorUtils.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace sparse { @@ 
-98,7 +99,7 @@ Tensor coo_to_csr(const int64_t* indices, int64_t dim, int64_t nnz) { at::parallel_for(0, nnz, 10000, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t h, hp0, hp1; - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { hp0 = indices[i]; hp1 = (i+1 == nnz) ? dim : indices[i+1]; if (hp0 != hp1) { diff --git a/aten/src/ATen/TensorIndexing.cpp b/aten/src/ATen/TensorIndexing.cpp index 5c402155c6feaa..95d70132f43f95 100644 --- a/aten/src/ATen/TensorIndexing.cpp +++ b/aten/src/ATen/TensorIndexing.cpp @@ -1,6 +1,7 @@ #include #include +#include namespace at { namespace indexing { @@ -31,7 +32,7 @@ std::ostream& operator<<(std::ostream& stream, const TensorIndex& tensor_index) std::ostream& operator<<(std::ostream& stream, const std::vector& tensor_indices) { stream << "("; - for (size_t i = 0; i < tensor_indices.size(); i++) { + for (const auto i : c10::irange(tensor_indices.size())) { stream << tensor_indices[i]; if (i < tensor_indices.size() - 1) stream << ", "; } diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index 263f4914e0d5e2..71c9c3feb9e76b 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -335,7 +336,7 @@ static inline Tensor scalarToTensor(const Scalar& v, const TensorOptions& option // strip away unit dimensions from the left of 'src' static inline IntArrayRef slicePrefix1sSize(const IntArrayRef& sizes) { size_t first_non1_src = sizes.size(); - for (size_t i = 0; i < sizes.size(); ++i) { + for (const auto i : c10::irange(sizes.size())) { if (sizes[i] != 1) { first_non1_src = i; break; @@ -439,7 +440,7 @@ static inline Tensor applySlicing( "too many indices for tensor of dimension ", (int)self_sizes.size()); Tensor result = self; - for (size_t i = 0; i < indices.size(); i++) { + for (const auto i : c10::irange(indices.size())) { auto& 
obj = indices[i]; result = handleDimInMultiDimIndexing( /*prev_dim_result=*/result, diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 84298660aedfa1..9a6021acf800c2 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -36,8 +36,8 @@ inline void get_base_ptrs(char** ptrs, ArrayRef operands) { } inline void get_strides(int64_t* strides, ArrayRef operands, int64_t ndim) { - for (int64_t dim = 0; dim < ndim; ++dim) { - for (size_t arg = 0; arg < operands.size(); ++arg) { + for (const auto dim : c10::irange(ndim)) { + for (const auto arg : c10::irange(operands.size())) { *strides++ = operands[arg].stride_bytes[dim]; } } @@ -214,7 +214,7 @@ void TensorIteratorBase::reorder_dimensions() { // returns 1 if the dim0 should come after dim1, -1 if dim0 should come // before dim1, and 0 if the comparison is ambiguous. auto should_swap = [&](size_t dim0, size_t dim1) { - for (int arg = 0; arg < ntensors(); arg++) { + for (const auto arg : c10::irange(ntensors())) { // ignore undefined or incorrectly sized tensors if (operands_[arg].stride_bytes.empty() || operands_[arg].will_resize) { continue; @@ -251,7 +251,7 @@ void TensorIteratorBase::reorder_dimensions() { }; // insertion sort with support for ambiguous comparisons - for (int i = 1; i < ndim(); i++) { + for (const auto i : c10::irange(1, ndim())) { int dim1 = i; for (int dim0 = i - 1; dim0 >= 0; dim0--) { int comparison = should_swap(perm_[dim0], perm_[dim1]); @@ -497,7 +497,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { StrideVector TensorIteratorBase::compatible_stride(int element_size) const { auto stride = StrideVector(); int64_t next_stride = element_size; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { stride.push_back(next_stride); next_stride *= shape_[dim]; } @@ -510,14 +510,14 @@ DimVector TensorIteratorBase::invert_perm(IntArrayRef input) const { 
TORCH_INTERNAL_ASSERT(!has_coalesced_dimensions_); TORCH_INTERNAL_ASSERT(input.size()==perm_.size()); auto res = DimVector(input.size()); //no initialization needed, every value in res should be written to. - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { res[perm_[dim]] = input[dim]; } return res; } void TensorIteratorBase::allocate_or_resize_outputs() { - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined() || op.will_resize) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); @@ -525,7 +525,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() { op.stride_bytes = compatible_stride(element_size); // check if permutation is just an inverted order bool inverted = true; - for (int i = 0; i < ndim(); i++) { + for (const auto i : c10::irange(ndim())) { if (perm_[i] != ndim() - i - 1) { inverted = false; break; @@ -539,7 +539,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() { set_output(i, tensor_shape, {}, original_options(op), names_); } else { auto tensor_stride = invert_perm(op.stride_bytes); - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { tensor_stride[dim] /= element_size; } set_output(i, tensor_shape, tensor_stride, original_options(op), names_); @@ -593,7 +593,7 @@ void TensorIteratorBase::coalesce_dimensions() { if (shape0 == 1 || shape1 == 1) { return true; } - for (int i = 0; i < ntensors(); i++) { + for (const auto i : c10::irange(ntensors())) { auto& stride = operands_[i].stride_bytes; if (shape0 * stride[dim0] != stride[dim1]) { return false; @@ -604,14 +604,14 @@ void TensorIteratorBase::coalesce_dimensions() { // replace each operands stride at dim0 with its stride at dim1 auto replace_stride = [&](int dim0, int dim1) { - for (int i = 0; i < ntensors(); i++) { + for (const auto i : c10::irange(ntensors())) { auto& stride = 
operands_[i].stride_bytes; stride[dim0] = stride[dim1]; } }; int prev_dim = 0; - for (int dim = 1; dim < ndim(); dim++) { + for (const auto dim : c10::irange(1, ndim())) { if (can_coalesce(prev_dim, dim)) { if (shape_[prev_dim] == 1) { replace_stride(prev_dim, dim); @@ -627,7 +627,7 @@ void TensorIteratorBase::coalesce_dimensions() { } shape_.resize(prev_dim + 1); - for (int i = 0; i < ntensors(); i++) { + for (const auto i : c10::irange(ntensors())) { operands_[i].stride_bytes.resize(ndim()); } has_coalesced_dimensions_ = true; @@ -670,7 +670,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { auto reorder = [perm](IntArrayRef data) { auto res = DimVector(data.size(), 0); - for (size_t i = 0; i < perm.size(); i++) { + for (const auto i : c10::irange(perm.size())) { res[i] = data[perm[i]]; } return res; @@ -687,7 +687,7 @@ void TensorIteratorBase::permute_dimensions(IntArrayRef perm) { int64_t TensorIteratorBase::num_output_elements() const { int64_t elem = 1; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { if (operands_[0].stride_bytes[dim] != 0 || shape_[dim] == 0) { elem *= shape_[dim]; } @@ -697,7 +697,7 @@ int64_t TensorIteratorBase::num_output_elements() const { int TensorIteratorBase::num_reduce_dims() const { int count = 0; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { if (operands_[0].stride_bytes[dim] == 0) { count++; } @@ -760,7 +760,7 @@ bool TensorIteratorBase::is_contiguous() const { bool TensorIteratorBase::is_scalar(int arg) const { const auto& stride = operands_[arg].stride_bytes; - for (int i = 0; i < ndim(); i++) { + for (const auto i : c10::irange(ndim())) { if (stride[i] != 0 && shape_[i] != 1) { return false; } @@ -815,7 +815,7 @@ void TensorIteratorBase::narrow(int dim, int64_t start, int64_t size) { void TensorIteratorBase::select_all_keeping_dim(int start_dim, IntArrayRef indices) { TORCH_INTERNAL_ASSERT(start_dim <= ndim()); - for (int i 
= start_dim; i < ndim(); ++i) { + for (const auto i : c10::irange(start_dim, ndim())) { for (auto& op : operands_) { op.data = ((char*)op.data) + op.stride_bytes[i] * indices[i - start_dim]; } @@ -1063,13 +1063,13 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { void TensorIteratorBase::mark_outputs() { // TODO: merge this into populate_operands - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { operands_[i].is_output = true; const auto& output = tensor(i); if (!output.defined()) continue; // check if output is also an input - for (int arg = num_outputs_; arg < ntensors(); arg++) { + for (const auto arg : c10::irange(num_outputs_, ntensors())) { const auto& input = tensor(arg); if (output.is_same(input)) { operands_[i].is_read_write = true; @@ -1086,7 +1086,7 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config) if (config.static_shape_.has_value()) { return; } - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { const auto& output = tensor(i); if (output.defined() && !output.sizes().equals(shape_)) { if (config.resize_outputs_ && !operands_[i].is_read_write) { @@ -1104,11 +1104,11 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config if (!config.check_mem_overlap_) { return; } - for (int i = 0; i < num_outputs_; i++) { + for (const auto i : c10::irange(num_outputs_)) { const auto& output = tensor_base(i); if (!output.defined()) continue; assert_no_internal_overlap(output); - for (int j = num_outputs_; j < ntensors(); j++) { + for (const auto j : c10::irange(num_outputs_, ntensors())) { const auto& input = tensor_base(j); if (!input.is_same(output)) { assert_no_partial_overlap(output, input); @@ -1164,7 +1164,7 @@ void TensorIteratorBase::compute_strides(const TensorIteratorConfig& config) { op.stride_bytes.resize(ndim(), 0); else op.stride_bytes.resize(ndim()); - for (size_t i = 0; i < 
original_shape.size(); i++) { + for (const auto i : c10::irange(original_shape.size())) { // see NOTE: [Computing output strides] if (original_shape[i] == 1 && shape_[offset + i] !=1) { op.stride_bytes[offset + i] = 0; @@ -1183,7 +1183,7 @@ bool TensorIteratorBase::can_use_32bit_indexing() const { } for (auto& op : operands_) { int64_t max_offset = 1; - for (int dim = 0; dim < ndim(); dim++) { + for (const auto dim : c10::irange(ndim())) { max_offset += (shape_[dim] - 1) * op.stride_bytes[dim]; } if (max_offset > max_value) { @@ -1245,7 +1245,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { switch (setup_type) { case FastSetupType::CONTIGUOUS: { - for (int i = 0; i < num_outputs_; i++){ + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); @@ -1256,7 +1256,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { } case FastSetupType::CHANNELS_LAST: { - for (int i = 0; i < num_outputs_; i++){ + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); @@ -1273,7 +1273,7 @@ bool TensorIteratorBase::fast_set_up(const TensorIteratorConfig& config) { if (tensor(i_defined).defined()) break; } TORCH_CHECK(i_defined >= 0, "Can not find a defined tensor when fast allocating memory to outputs"); - for (int i = 0; i < num_outputs_; i++){ + for (const auto i : c10::irange(num_outputs_)) { auto& op = operands_[i]; if (!op.tensor_base().defined()) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index 329598695dc9cc..6a35650c96d203 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include 
#include @@ -322,9 +323,9 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { char** base, const int64_t* strides, int64_t size0, int64_t size1) { PtrVector data(base, base + ntensor); const int64_t* outer_strides = &strides[ntensor]; - for (int64_t i = 0; i < size1; i++) { + for (const auto i : c10::irange(size1)) { if (i > 0) { - for (int64_t arg = 0; arg < ntensor; arg++) { + for (const auto arg : c10::irange(ntensor)) { data[arg] += outer_strides[arg]; } } @@ -397,7 +398,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { bool has_contiguous_first_dim() const { int num_tensors = ntensors(); - for (int i = 0; i < num_tensors; i++) { + for (const auto i : c10::irange(num_tensors)) { if (strides(i)[0] != element_size(i)) { return false; } diff --git a/aten/src/ATen/TensorIteratorInternal.h b/aten/src/ATen/TensorIteratorInternal.h index 57477bcb1d4030..72e5939b351798 100644 --- a/aten/src/ATen/TensorIteratorInternal.h +++ b/aten/src/ATen/TensorIteratorInternal.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace at { @@ -24,9 +25,9 @@ inline void get_data_ptrs( const int64_t ntensors = base.size(); const int64_t ndim = counter.size(); std::copy(base.begin(), base.end(), ptrs); - for (int64_t dim = 0; dim < ndim; ++dim) { + for (const auto dim : c10::irange(ndim)) { int64_t value = counter[dim]; - for (int64_t arg = 0; arg < ntensors; ++arg) { + for (const auto arg : c10::irange(ntensors)) { ptrs[arg] += value * strides[dim * ntensors + arg]; } } diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index 9c28924dc0aee2..683de258a2ebd8 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -56,7 +56,7 @@ TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { start = maybe_wrap_dim(start, names.size()); end = maybe_wrap_dim(end, names.size()); names_.reserve(end - start); - for (int64_t idx = start; idx < end; ++idx) { + for (const auto idx : c10::irange(start, 
end)) { names_.emplace_back(names, idx); } } diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 1ec9f9c291c0ab..3426bff7b4b8da 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -323,7 +324,7 @@ size_t computeStorageNbytes( // size of the underlying storage is 1 bigger than the offset // of the last element according to stride size_t size = 1; - for(size_t i = 0; i < sizes.size(); i++) { + for (const auto i : c10::irange(sizes.size())) { if(sizes[i] == 0) { return 0; } diff --git a/aten/src/ATen/VmapTransforms.cpp b/aten/src/ATen/VmapTransforms.cpp index 07ff77fe2b746f..4bda903545fdf8 100644 --- a/aten/src/ATen/VmapTransforms.cpp +++ b/aten/src/ATen/VmapTransforms.cpp @@ -83,7 +83,7 @@ VmapDimVector VmapPhysicalView::getPhysicalShape(IntArrayRef logical_shape) cons static BatchDims computeFrontBatchDimsFromLevels(std::bitset levels_bitset) { BatchDims bdims; int64_t dim = 0; - for (int64_t level = 0; level < kVmapNumLevels; level++) { + for (const auto level : c10::irange(kVmapNumLevels)) { if (!levels_bitset[level]) { continue; } @@ -208,7 +208,7 @@ MultiBatchVmapTransform::logicalToPhysical(TensorList logical_tensors) { VmapDimVector batch_sizes(num_batch_dims, 1); for (const auto& physical_tensor : physical_tensors) { auto physical_sizes = physical_tensor.sizes(); - for (int64_t dim = 0; dim < num_batch_dims; dim++) { + for (const auto dim : c10::irange(num_batch_dims)) { if (physical_sizes[dim] != 1) { batch_sizes[dim] = physical_sizes[dim]; } diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 13e605c920ec13..24fe684c6dc61c 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { @@ -40,7 +41,7 @@ static inline void maybe_wrap_dims_n(int64_t* dims, int64_t ndims, int64_t dim_p } int64_t min = -dim_post_expr; int64_t 
max = dim_post_expr - 1; - for (int64_t i = 0; i < ndims; ++i) { + for (const auto i : c10::irange(ndims)) { auto &dim = dims[i]; if (dim < min || dim > max) { TORCH_CHECK_INDEX(false, @@ -85,7 +86,7 @@ static inline int64_t legacy_cat_wrap_dim(int64_t dim, TensorList tensors) { // wrap negative dims in a vector static inline void wrap_all_dims(std::vector& dims_to_wrap, int64_t tensor_total_dims) { - for (size_t i = 0; i < dims_to_wrap.size(); i++) { + for (const auto i : c10::irange(dims_to_wrap.size())) { dims_to_wrap[i] = maybe_wrap_dim(dims_to_wrap[i], tensor_total_dims); } } diff --git a/aten/src/ATen/WrapDimUtilsMulti.h b/aten/src/ATen/WrapDimUtilsMulti.h index a2af1b0dcd7195..e1d2266e24efba 100644 --- a/aten/src/ATen/WrapDimUtilsMulti.h +++ b/aten/src/ATen/WrapDimUtilsMulti.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -15,7 +16,7 @@ constexpr size_t dim_bitset_size = 64; static inline std::bitset dim_list_to_bitset(IntArrayRef dims, int64_t ndims) { TORCH_CHECK(ndims <= (int64_t) dim_bitset_size, "only tensors with up to ", dim_bitset_size, " dims are supported"); std::bitset seen; - for (size_t i = 0; i < dims.size(); i++) { + for (const auto i : c10::irange(dims.size())) { size_t dim = maybe_wrap_dim(dims[i], ndims); TORCH_CHECK(!seen[dim], "dim ", dim, " appears multiple times in the list of dims"); seen[dim] = true; diff --git a/aten/src/ATen/benchmarks/stateful_conv1d.cpp b/aten/src/ATen/benchmarks/stateful_conv1d.cpp index 60502773ca57a0..527dcc439dcdce 100644 --- a/aten/src/ATen/benchmarks/stateful_conv1d.cpp +++ b/aten/src/ATen/benchmarks/stateful_conv1d.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -33,7 +34,7 @@ static void stateful_conv1d(benchmark::State& state) { )"); std::vector> inputs; - for (int i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { std::vector input; // NOLINTNEXTLINE(modernize-use-emplace) input.push_back(torch::rand({batch_size, input_channels, width})); 
@@ -70,8 +71,8 @@ static void GenerateSizes(benchmark::internal::Benchmark* b) { for (size_t input_channels = 32; input_channels < 256; input_channels *= 2) { for (size_t output_channels = 32; output_channels < 256; output_channels *= 2) { - for (size_t kernel = 3; kernel < 8; ++kernel) { - for (size_t batch_size = 1; batch_size < 5; ++batch_size) { + for (const auto kernel : c10::irange(3, 8)) { + for (const auto batch_size : c10::irange(1, 5)) { for (size_t width = 32; width < 256; width *= 2) { b->Args({input_channels, output_channels, kernel, batch_size, width, true}); b->Args({input_channels, output_channels, kernel, batch_size, width, false}); diff --git a/aten/src/ATen/core/Array.h b/aten/src/ATen/core/Array.h index 6e0fce606efc80..4754f72cda0f74 100644 --- a/aten/src/ATen/core/Array.h +++ b/aten/src/ATen/core/Array.h @@ -4,6 +4,7 @@ // device code. #include +#include namespace at { namespace detail { diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index dbbed6e3b07858..ff1894c5486375 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -44,7 +45,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } bool intMode = true; auto self_p = self.data_ptr(); - for(int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { auto z = self_p[i]; if(std::isfinite(z)) { if(z != std::ceil(z)) { @@ -70,7 +71,7 @@ static std::tuple __printFormat(std::ostream& stream, const Ten } else { expMin = fabs(self_p[offset]); expMax = fabs(self_p[offset]); - for(int64_t i = offset; i < size; i++) { + for (const auto i : c10::irange(offset, size)) { double z = fabs(self_p[i]); if(std::isfinite(z)) { if(z < expMin) { @@ -130,7 +131,8 @@ static std::tuple __printFormat(std::ostream& stream, const Ten static void __printIndent(std::ostream &stream, int64_t indent) { - for(int64_t i = 0; i < indent; i++) { + for (const auto i : 
c10::irange(indent)) { + (void)i; //Suppress unused variable warning stream << " "; } } @@ -168,7 +170,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line printScale(stream,scale); __printIndent(stream, indent); } - for(int64_t l = 0; l < self.size(0); l++) { + for (const auto l : c10::irange(self.size(0))) { Tensor row = self.select(0,l); double *row_ptr = row.data_ptr(); for(int64_t c = firstColumn; c < lastColumn+1; c++) { @@ -198,8 +200,7 @@ void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) bool start = true; bool finished = false; counter[0] = -1; - for(size_t i = 1; i < counter.size(); i++) - counter[i] = 0; + for (const auto i : c10::irange(1, counter.size()))counter[i] = 0; while(true) { for(int64_t i = 0; self.ndimension()-2; i++) { counter[i] = counter[i] + 1; @@ -269,7 +270,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi printScale(stream, scale); } double* tensor_p = tensor.data_ptr(); - for (int64_t i = 0; i < tensor.size(0); i++) { + for (const auto i : c10::irange(tensor.size(0))) { stream << std::setw(sz) << tensor_p[i]/scale << std::endl; } } @@ -284,7 +285,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi __printTensor(stream, tensor, linesize); } stream << "[ " << tensor_.toString() << "{" << tensor.size(0); - for(int64_t i = 1; i < tensor.ndimension(); i++) { + for (const auto i : c10::irange(1, tensor.ndimension())) { stream << "," << tensor.size(i); } stream << "}"; diff --git a/aten/src/ATen/core/MT19937RNGEngine.h b/aten/src/ATen/core/MT19937RNGEngine.h index 40c1ba5f584ade..6c071cbc448ade 100644 --- a/aten/src/ATen/core/MT19937RNGEngine.h +++ b/aten/src/ATen/core/MT19937RNGEngine.h @@ -155,7 +155,7 @@ class mt19937_engine { data_.seed_ = seed; data_.seeded_ = true; data_.state_[0] = seed & 0xffffffff; - for(int j = 1; j < MERSENNE_STATE_N; j++) { + for (const auto j : c10::irange(1, MERSENNE_STATE_N)) { 
data_.state_[j] = (1812433253 * (data_.state_[j-1] ^ (data_.state_[j-1] >> 30)) + j); } data_.left_ = 1; diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index e18f9d35ca2f04..9d65522b5d96b2 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -134,7 +135,7 @@ class GenericPackedTensorAccessorBase { const source_index_t* sizes_, const source_index_t* strides_) : data_(data_) { - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { this->sizes_[i] = sizes_[i]; this->strides_[i] = strides_[i]; } diff --git a/aten/src/ATen/core/boxing/impl/test_helpers.h b/aten/src/ATen/core/boxing/impl/test_helpers.h index 9ca06878f1539f..93b11dc853f00f 100644 --- a/aten/src/ATen/core/boxing/impl/test_helpers.h +++ b/aten/src/ATen/core/boxing/impl/test_helpers.h @@ -7,6 +7,7 @@ #include #include #include +#include template inline std::vector makeStack(Inputs&&... 
inputs) { @@ -87,7 +88,7 @@ inline void expectThrows(Functor&& functor, const char* expectMessageContains) { template void expectListEquals(c10::ArrayRef expected, std::array actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual[i]); } } @@ -95,7 +96,7 @@ void expectListEquals(c10::ArrayRef expected, std::array actual) { template void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual[i]); } } @@ -103,7 +104,7 @@ void expectListEquals(c10::ArrayRef expected, c10::ArrayRef actual) { template void expectListEquals(c10::ArrayRef expected, c10::List actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual.get(i)); } } @@ -111,7 +112,7 @@ void expectListEquals(c10::ArrayRef expected, c10::List actual) { template void expectListEquals(c10::ArrayRef expected, std::vector actual) { EXPECT_EQ(expected.size(), actual.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { EXPECT_EQ(expected[i], actual[i]); } } diff --git a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h index 5289f9fa01142f..31dd09836cbe66 100644 --- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h +++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -171,7 +172,7 @@ struct TORCH_API DispatchKeyExtractor final { "The function schema has ", schema.arguments().size(), " arguments but this PyTorch build only supports ", c10::utils::bitset::NUM_BITS()); c10::utils::bitset 
dispatch_arg_indices_reverse; - for (size_t index = 0; index < schema.arguments().size(); ++index) { + for (const auto index : c10::irange(schema.arguments().size())) { if (schema.arguments()[index].type()->isSubtypeOf(*TensorType::get()) || schema.arguments()[index].type()->isSubtypeOf( *ListType::ofTensors()) || diff --git a/aten/src/ATen/core/dispatch/backend_fallback_test.cpp b/aten/src/ATen/core/dispatch/backend_fallback_test.cpp index 1fb14cf205b94c..19981988962a95 100644 --- a/aten/src/ATen/core/dispatch/backend_fallback_test.cpp +++ b/aten/src/ATen/core/dispatch/backend_fallback_test.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include using namespace at; @@ -51,7 +52,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack* // Unwrap all arguments auto args = torch::jit::pop(*stack, num_arguments); - for (size_t i = 0; i < num_arguments; i++) { + for (const auto i : c10::irange(num_arguments)) { // TODO: Handle tensor list if (args[i].isTensor()) { auto* impl = args[i].unsafeToTensorImpl(); @@ -70,7 +71,7 @@ void generic_wrapper_fallback(const c10::OperatorHandle& op, torch::jit::Stack* // Rewrap outputs auto rets = torch::jit::pop(*stack, num_returns); - for (size_t i = 0; i < num_returns; i++) { + for (const auto i : c10::irange(num_returns)) { // TODO: Handle tensor list if (rets[i].isTensor()) { torch::jit::push(*stack, at::detail::make_tensor(std::move(rets[i]).toTensor())); // yes move! 
diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index 3da6958eaf3441..211c55662f2b82 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include diff --git a/aten/src/ATen/core/function_schema_inl.h b/aten/src/ATen/core/function_schema_inl.h index 712192b0823062..b7aab0730c7d5c 100644 --- a/aten/src/ATen/core/function_schema_inl.h +++ b/aten/src/ATen/core/function_schema_inl.h @@ -16,7 +16,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) out << "("; bool seen_kwarg_only = false; - for(size_t i = 0; i < schema.arguments().size(); ++i) { + for (const auto i : c10::irange(schema.arguments().size())) { if (i > 0) out << ", "; if (schema.arguments()[i].kwarg_only() && !seen_kwarg_only) { out << "*, "; @@ -35,7 +35,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) const auto& returns = schema.returns(); out << "("; - for(size_t i = 0; i < returns.size(); ++i) { + for (const auto i : c10::irange(returns.size())) { if (i > 0) { out << ", "; } @@ -53,7 +53,7 @@ inline std::ostream& operator<<(std::ostream& out, const FunctionSchema& schema) inline size_t findFirstOutArg(const std::vector& args) { // find the start of out args in the schema - for (size_t out_start_idx = 0; out_start_idx < args.size(); out_start_idx++) { + for (const auto out_start_idx : c10::irange(args.size())) { if (args.at(out_start_idx).is_out()) { return out_start_idx; } @@ -122,7 +122,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( && arguments().size() >= old.arguments().size())) { return false; } - for (size_t i = 0; i < returns().size(); ++i) { + for (const auto i : c10::irange(returns().size())) { // Backwards compatibility requires covariance on argument types // (i.e. more generic), and contravariance on return types (i.e. // more specific). 
@@ -138,7 +138,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( size_t new_out_start_idx = findFirstOutArg(arguments()); // make sure among the default args, they are backward compatible - for (size_t i = 0; i < old_out_start_idx; i++) { + for (const auto i : c10::irange(old_out_start_idx)) { if (!arguments().at(i).isBackwardCompatibleWith( old.arguments().at(i), why_not)) { return false; @@ -146,7 +146,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } // // Validate that all new arguments provided has a default value - for (size_t i = old_out_start_idx; i < new_out_start_idx; ++i) { + for (const auto i : c10::irange(old_out_start_idx, new_out_start_idx)) { if (!arguments().at(i).default_value()) { if (why_not) { *why_not @@ -160,7 +160,7 @@ inline bool FunctionSchema::isBackwardCompatibleWith( } // now compare the out args - for (size_t i = old_out_start_idx; i < old.arguments().size(); i++) { + for (const auto i : c10::irange(old_out_start_idx, old.arguments().size())) { if (!arguments() .at(i - old_out_start_idx + new_out_start_idx) .isBackwardCompatibleWith(old.arguments().at(i), why_not)) { @@ -238,7 +238,7 @@ inline void FunctionSchema::checkAndNormalizeInputs( *this); size_t consumed_kwargs = 0; - for (size_t pos = 0; pos < arguments().size(); ++pos) { + for (const auto pos : c10::irange(arguments().size())) { const auto& argument = arguments()[pos]; if (pos < inputs.size()) { checkArg(inputs[pos], argument, pos); @@ -298,7 +298,7 @@ inline bool isSubtypeOfList( if (child.size() != parent.size()) { return false; } - for (size_t i = 0; i < child.size(); ++i) { + for (const auto i : c10::irange(child.size())) { const Argument& c = child[i]; const Argument& p = parent[i]; if (c.name() != p.name()) { diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index 062af97af793b8..09092edfbc1b7a 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -22,6 +22,7 @@ #include #include #include 
+#include namespace torch { namespace jit { @@ -1114,7 +1115,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { } std::ostringstream oss; oss << devices[0]; - for (size_t idx = 1; idx < devices.size(); idx++) { + for (const auto idx : c10::irange(1, devices.size())) { if (idx == devices.size() - 1) { oss << " and "; } else { @@ -1131,7 +1132,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { return c10::kCPU; } c10::DeviceType deviceType = devices[0].type(); - for (size_t idx = 1; idx < devices.size(); idx++) { + for (const auto idx : c10::irange(1, devices.size())) { TORCH_CHECK_VALUE( devices[idx].type() == deviceType, "Expected all devices to be of the same type, but got a mismatch between ", @@ -1151,7 +1152,7 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { [](const c10::Device& a, const c10::Device& b) { return a.index() < b.index(); }); // Deduplicate by compacting. size_t targetIdx = 0; - for (size_t sourceIdx = 0; sourceIdx < devices.size(); sourceIdx++) { + for (const auto sourceIdx : c10::irange(devices.size())) { TORCH_CHECK_VALUE( devices[sourceIdx].has_index(), "Expected devices to have indices, got ", devices[sourceIdx]); diff --git a/aten/src/ATen/core/op_registration/infer_schema.cpp b/aten/src/ATen/core/op_registration/infer_schema.cpp index 3807e420086a7f..df1925aba5ed1a 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.cpp +++ b/aten/src/ATen/core/op_registration/infer_schema.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace c10 { @@ -20,7 +21,7 @@ std::string fastToString(size_t x) { std::vector createArgumentVector(c10::ArrayRef args) { std::vector result; result.reserve(args.size()); - for (size_t i = 0; i < args.size(); ++i) { + for (const auto i : c10::irange(args.size())) { // Arguments are named "_" result.emplace_back(fastToString(i), (*args[i].getTypeFn)()); } @@ -49,7 +50,7 @@ C10_EXPORT c10::optional findSchemaDifferences(const FunctionSchema " vs 
" + guts::to_string(rhs.returns().size()); } - for (size_t i = 0; i < lhs.arguments().size(); ++i) { + for (const auto i : c10::irange(lhs.arguments().size())) { const TypePtr& leftType = lhs.arguments()[i].type(); const TypePtr& rightType = rhs.arguments()[i].type(); // Type::operator== is virtual. Comparing pointers first is @@ -61,7 +62,7 @@ C10_EXPORT c10::optional findSchemaDifferences(const FunctionSchema } } - for (size_t i = 0; i < lhs.returns().size(); ++i) { + for (const auto i : c10::irange(lhs.returns().size())) { const TypePtr& leftType = lhs.returns()[i].type(); const TypePtr& rightType = rhs.returns()[i].type(); // See above about comparing pointers first. diff --git a/aten/src/ATen/core/qualified_name.h b/aten/src/ATen/core/qualified_name.h index 4770a3cf334080..b8065d9d5085f7 100644 --- a/aten/src/ATen/core/qualified_name.h +++ b/aten/src/ATen/core/qualified_name.h @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace c10 { @@ -69,7 +70,7 @@ struct QualifiedName { // Can't be a prefix if it's bigger return false; } - for (size_t i = 0; i < thisAtoms.size(); i++) { + for (const auto i : c10::irange(thisAtoms.size())) { if (thisAtoms[i] != otherAtoms[i]) { return false; } @@ -116,7 +117,7 @@ struct QualifiedName { reserve += e.size() + 1; } out.reserve(reserve); - for (size_t i = 0; i < v.size(); ++i) { + for (const auto i : c10::irange(v.size())) { if (i != 0) { out.push_back(delimiter); } diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index 021e8a02104f22..35bb9964eb398e 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -4,6 +4,7 @@ #include #include +#include // TODO move this to c10 namespace @@ -108,7 +109,7 @@ static inline IValue pop(Stack* stack) { static inline std::vector pop(Stack& stack, size_t n) { std::vector result; result.reserve(n); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { result.push_back(std::move(peek(stack, i, n))); } drop(stack, 
n); diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index 7bd04e637c7e3c..eb160577e8694e 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include namespace at { namespace vec { @@ -16,7 +17,7 @@ inline scalar_t vec_reduce_all( using Vec = vec::Vectorized; scalar_t acc_arr[Vec::size()]; acc_vec.store(acc_arr); - for (int64_t i = 1; i < size; i++) { + for (const auto i : c10::irange(1, size)) { std::array acc_arr_next = {0}; acc_arr_next[0] = acc_arr[i]; Vec acc_vec_next = Vec::loadu(acc_arr_next.data()); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index 40276ba8365d51..f6db6fdc49a4a9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include @@ -109,7 +110,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -293,7 +294,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index f40196320022bc..a4181a8abb8b21 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include 
+#include #include #include #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) @@ -144,7 +145,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -327,7 +328,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index f92f44e562a9d4..b64f910fbb6d8a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -5,6 +5,7 @@ #include #include +#include #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) #include #endif @@ -72,7 +73,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -103,7 +104,7 @@ template <> class Vectorized { Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -180,7 +181,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -190,7 +191,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index deb95429843738..57a594f6354c49 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -5,6 +5,7 @@ #include #include +#include #if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) #include #endif @@ -80,7 +81,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -109,7 +110,7 @@ template <> class Vectorized { Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -217,7 +218,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -227,7 +228,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h index 2aac442d2123d9..465266b8b55dac 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h @@ -5,6 +5,7 @@ #include #include +#include // Sleef offers vectorized versions of some transcedentals // such as sin, cos, tan etc.. 
// However for now opting for STL, since we are not building @@ -221,7 +222,7 @@ template <> class Vectorized { } else { __at_align__ float tmp_values[size()]; - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -287,7 +288,7 @@ template <> class Vectorized { __at_align__ float tmp[size()]; __at_align__ float res[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (_isnan(tmp[i])) { std::memset(static_cast(&res[i]), 0xFF, sizeof(float)); } else { @@ -299,7 +300,7 @@ template <> class Vectorized { Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -336,7 +337,7 @@ template <> class Vectorized { __at_align__ float tmp_exp[size()]; store(tmp); exp.store(tmp_exp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::atan2(tmp[i], tmp_exp[i]); } return loadu(tmp); @@ -371,7 +372,7 @@ template <> class Vectorized { __at_align__ float tmp_q[size()]; store(tmp); q.store(tmp_q); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::fmod(tmp[i], tmp_q[i]); } return loadu(tmp); @@ -381,7 +382,7 @@ template <> class Vectorized { __at_align__ float tmp_b[size()]; store(tmp); b.store(tmp_b); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::hypot(tmp[i], tmp_b[i]); } return loadu(tmp); @@ -397,7 +398,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -407,7 +408,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); 
x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -429,7 +430,7 @@ template <> class Vectorized { __at_align__ float tmp_b[size()]; store(tmp); b.store(tmp_b); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::nextafter(tmp[i], tmp_b[i]); } return loadu(tmp); @@ -494,7 +495,7 @@ template <> class Vectorized { __at_align__ float tmp_exp[size()]; store(tmp); exp.store(tmp_exp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = std::pow(tmp[i], tmp_exp[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 5ee9919abca02c..2808c19bb3bb3d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -6,6 +6,7 @@ #include #include #include +#include #include namespace at { @@ -98,7 +99,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); @@ -221,7 +222,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); @@ -435,7 +436,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); @@ -684,7 +685,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int8_t)); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index 8cde485c90d7d9..db81f4a97ae526 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -6,6 +6,8 @@ #include #include #include + +#include #include #include #include @@ -739,7 +741,7 @@ struct VectorizedQuantizedConverter { std::array vals; VectorizedQuantizedConverter(T val) { - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { vals[i] = val.val_; } } @@ -757,9 +759,9 @@ struct VectorizedQuantizedConverter { Vectorized zero_point, Vectorized scale_zp_premul) const { float_vec_return_type rv; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { float tmp_vals[8]; - for (int j = 0; j < 8; ++j) { + for (const auto j : c10::irange(8)) { tmp_vals[j] = at::native::dequantize_val( scale[j], zero_point[j], T(vals[8 * i + j])); } @@ -816,7 +818,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 8], 8); } @@ -832,7 +834,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -840,7 +842,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -855,7 +857,7 @@ struct 
Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -864,7 +866,7 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval[0].vals[i] = vals[i] - b.vals[i]; } return retval; @@ -875,7 +877,7 @@ struct Vectorized : public VectorizedQuantizedConverter< float multiplier, int32_t zero_point) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = nearbyint(static_cast(inp[0].vals[i]) * multiplier) + zero_point; @@ -948,7 +950,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 8], 8); } @@ -964,7 +966,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -972,7 +974,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -986,7 +988,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], 
zero_point.vals[i]), q_six.vals[i]); } @@ -996,8 +998,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1013,8 +1015,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; @@ -1068,7 +1070,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 8], 8); } @@ -1084,7 +1086,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -1092,7 +1094,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -1107,7 +1109,7 @@ struct Vectorized : public VectorizedQuantizedConverter< 
Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -1117,8 +1119,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1134,8 +1136,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h index 3a3e0daade098b..fefe5a0a4c9aba 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace vec { @@ -167,7 +168,7 @@ class Vectorized { Vectorized map(ComplexDbl (*const f)(ComplexDbl)) const { __at_align__ ComplexDbl tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -176,7 +177,7 @@ class Vectorized { Vectorized 
map(ComplexDbl (*const f)(const ComplexDbl&)) const { __at_align__ ComplexDbl tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -454,7 +455,7 @@ class Vectorized { __at_align__ ComplexDbl y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h index 712de24597dcfa..92beb6bc227ff2 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { namespace vec { @@ -222,7 +223,7 @@ class Vectorized { Vectorized map(ComplexFlt (*const f)(ComplexFlt)) const { __at_align__ ComplexFlt tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -231,7 +232,7 @@ class Vectorized { Vectorized map(ComplexFlt (*const f)(const ComplexFlt&)) const { __at_align__ ComplexFlt tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -430,7 +431,7 @@ class Vectorized { __at_align__ ComplexFlt y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h index 2482af6ec2324f..7a80c24e42c6a9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -3,6 +3,8 @@ #include #include 
#include + +#include #include #include diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 6fc22f0f7d3362..7dc3fdc6eafc38 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) @@ -149,7 +150,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -357,7 +358,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index dfd070604c40c5..a9876dd5fcadc5 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -4,6 +4,7 @@ // See Note [Do not compile initializers with AVX] #include +#include #include #include #if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) @@ -667,7 +668,7 @@ template <> class Vectorized> { Vectorized> map(c10::complex (*const f)(const c10::complex &)) const { __at_align__ c10::complex tmp[size()]; store(tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -858,7 +859,7 @@ template <> class Vectorized> { __at_align__ c10::complex y_tmp[size()]; store(x_tmp); exp.store(y_tmp); - for (int i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { 
x_tmp[i] = std::pow(x_tmp[i], y_tmp[i]); } return loadu(x_tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 7128219748a061..7035b3e0f5d4b8 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -5,6 +5,7 @@ #include #include +#include #if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) #include #endif @@ -87,7 +88,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -120,7 +121,7 @@ template <> class Vectorized { Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -200,7 +201,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -210,7 +211,7 @@ template <> class Vectorized { __at_align__ double tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 1a2b113de9d367..70866b15eb7085 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -5,6 +5,7 @@ #include #include +#include #if 
defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) #include #endif @@ -104,7 +105,7 @@ template <> class Vectorized { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0.0; } std::memcpy( @@ -135,7 +136,7 @@ template <> class Vectorized { Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = f(tmp[i]); } return loadu(tmp); @@ -246,7 +247,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igamma(tmp[i], tmp_x[i]); } return loadu(tmp); @@ -256,7 +257,7 @@ template <> class Vectorized { __at_align__ float tmp_x[size()]; store(tmp); x.store(tmp_x); - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { tmp[i] = calc_igammac(tmp[i], tmp_x[i]); } return loadu(tmp); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index f28c14ed3f73f4..92cbe6b6abd6d5 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace vec { @@ -100,7 +101,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int64_t)); @@ -253,7 +254,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int32_t)); @@ -485,7 +486,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. - for (auto i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int16_t)); @@ -761,7 +762,7 @@ class Vectorized : public Vectorizedi { // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502 // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two // instructions while a loop would be compiled to one instruction. 
- for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { tmp_values[i] = 0; } std::memcpy(tmp_values, ptr, count * sizeof(int8_t)); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index 3a1eda8874f1af..3ed7899bb75b60 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -6,6 +6,8 @@ #include #include #include + +#include #include #include #include @@ -744,7 +746,7 @@ struct VectorizedQuantizedConverter { std::array vals; VectorizedQuantizedConverter(T val) { - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { vals[i] = val.val_; } } @@ -762,9 +764,9 @@ struct VectorizedQuantizedConverter { Vectorized zero_point, Vectorized scale_zp_premul) const { float_vec_return_type rv; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { float tmp_vals[16]; - for (int j = 0; j < 16; ++j) { + for (const auto j : c10::irange(16)) { tmp_vals[j] = at::native::dequantize_val( scale[j], zero_point[j], T(vals[16 * i + j])); } @@ -829,7 +831,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 16], 16); } @@ -845,7 +847,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -853,7 +855,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -868,7 +870,7 @@ struct 
Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -877,7 +879,7 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval[0].vals[i] = vals[i] - b.vals[i]; } return retval; @@ -888,7 +890,7 @@ struct Vectorized : public VectorizedQuantizedConverter< float multiplier, int32_t zero_point) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = nearbyint(static_cast(inp[0].vals[i]) * multiplier) + zero_point; @@ -961,7 +963,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 16], 16); } @@ -977,7 +979,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -985,7 +987,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -999,7 +1001,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], 
zero_point.vals[i]), q_six.vals[i]); } @@ -1009,8 +1011,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1026,8 +1028,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; @@ -1081,7 +1083,7 @@ struct Vectorized : public VectorizedQuantizedConverter< std::array qvals; std::array float_vals; - for (int i = 0; i < float_num_vecs(); ++i) { + for (const auto i : c10::irange(float_num_vecs())) { rhs[i].store(&float_vals[i * 16], 16); } @@ -1097,7 +1099,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized maximum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::max(vals[i], b.vals[i]); } return retval; @@ -1105,7 +1107,7 @@ struct Vectorized : public VectorizedQuantizedConverter< Vectorized minimum(Vectorized b) const { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min(vals[i], b.vals[i]); } return retval; @@ -1120,7 +1122,7 @@ struct Vectorized : public VectorizedQuantizedConverter< 
Vectorized zero_point, Vectorized q_six) { Vectorized retval; - for (size_t i = 0; i < size(); ++i) { + for (const auto i : c10::irange(size())) { retval.vals[i] = std::min( std::max(vals[i], zero_point.vals[i]), q_six.vals[i]); } @@ -1130,8 +1132,8 @@ struct Vectorized : public VectorizedQuantizedConverter< int_vec_return_type widening_subtract(Vectorized b) const { int_vec_return_type retval; constexpr int elem_per_int_vec = size() / int_num_vecs(); - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { retval[i].vals[j] = static_cast(vals[i * elem_per_int_vec + j]) - static_cast(b.vals[i * elem_per_int_vec + j]); @@ -1147,8 +1149,8 @@ struct Vectorized : public VectorizedQuantizedConverter< constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); Vectorized retval; - for (size_t i = 0; i < int_num_vecs(); ++i) { - for (size_t j = 0; j < elem_per_int_vec; ++j) { + for (const auto i : c10::irange(int_num_vecs())) { + for (const auto j : c10::irange(elem_per_int_vec)) { int32_t rounded = nearbyint(static_cast(inp[i].vals[j]) * multiplier) + zero_point; diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index da5f318bf530cc..d6c921eddde262 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ -31,6 +31,7 @@ #include #include #include +#include // These macros helped us unify vec_base.h #ifdef CPU_CAPABILITY_AVX512 @@ -150,7 +151,7 @@ struct Vectorized { static Vectorized blend(const Vectorized& a, const Vectorized& b) { int64_t mask = mask_; Vectorized vector; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (mask & 0x01) { vector[i] = b[i]; } else { @@ -165,7 +166,7 @@ struct Vectorized { Vectorized vector; int_same_size_t buffer[size()]; mask.store(buffer); - for 
(int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (buffer[i] & 0x01) { vector[i] = b[i]; @@ -178,14 +179,14 @@ struct Vectorized { template // step sometimes requires a higher precision type (e.g., T=int, step_t=double) static Vectorized arange(T base = static_cast(0), step_t step = static_cast(1)) { Vectorized vector; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { vector.values[i] = base + i * step; } return vector; } static Vectorized set(const Vectorized& a, const Vectorized& b, int64_t count = size()) { Vectorized vector; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { if (i < count) { vector[i] = b[i]; } else { @@ -340,7 +341,7 @@ struct Vectorized { } Vectorized atan2(const Vectorized &exp) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::atan2(values[i], exp[i]); } return ret; @@ -380,7 +381,7 @@ struct Vectorized { // U is for SFINAE purposes only. Make sure it is not changed. 
static_assert(std::is_same::value, "U must be T"); Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::fmod(values[i], q[i]); } return ret; @@ -423,7 +424,7 @@ struct Vectorized { } Vectorized hypot(const Vectorized &b) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::hypot(values[i], b[i]); } return ret; @@ -436,14 +437,14 @@ struct Vectorized { } Vectorized igamma(const Vectorized &x) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = calc_igamma(values[i], x[i]); } return ret; } Vectorized igammac(const Vectorized &x) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = calc_igammac(values[i], x[i]); } return ret; @@ -456,7 +457,7 @@ struct Vectorized { } Vectorized nextafter(const Vectorized &b) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::nextafter(values[i], b[i]); } return ret; @@ -494,7 +495,7 @@ struct Vectorized { } Vectorized pow(const Vectorized &exp) const { Vectorized ret; - for (int64_t i = 0; i < size(); i++) { + for (const auto i : c10::irange(size())) { ret[i] = std::pow(values[i], exp[i]); } return ret; @@ -808,7 +809,7 @@ inline gather(T const* base_addr, const Vectorized>& vindex) int_same_size_t index_arr[size]; vindex.store(static_cast(index_arr)); T buffer[size]; - for (int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; } return Vectorized::loadu(static_cast(buffer)); @@ -826,7 +827,7 @@ inline mask_gather(const Vectorized& src, T const* base_addr, mask.store(static_cast(mask_arr)); vindex.store(static_cast(index_arr)); T buffer[size]; - for (int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { 
if (mask_arr[i] & 0x01) { // check highest bit buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; } else { @@ -872,7 +873,7 @@ inline Vectorized> convert_to_int_of_same_size(const Vectoriz T src_arr[size]; src.store(static_cast(src_arr)); int_same_size_t buffer[size]; - for (int64_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { buffer[i] = static_cast>(src_arr[i]); } return Vectorized>::loadu(static_cast(buffer)); @@ -899,7 +900,7 @@ deinterleave2(const Vectorized& a, const Vectorized& b) { T buffer2[size]; a.store(static_cast(a_arr)); b.store(static_cast(b_arr)); - for (int64_t i = 0; i < half_size; i++) { + for (const auto i : c10::irange(half_size)) { buffer1[i] = a_arr[i * 2]; buffer1[half_size + i] = b_arr[i * 2]; buffer2[i] = a_arr[i * 2 + 1]; @@ -931,7 +932,7 @@ interleave2(const Vectorized& a, const Vectorized& b) { T buffer2[size]; a.store(static_cast(a_arr)); b.store(static_cast(b_arr)); - for (int64_t i = 0; i < half_size; i++) { + for (const auto i : c10::irange(half_size)) { buffer1[i * 2] = a_arr[i]; buffer1[i * 2 + 1] = b_arr[i]; buffer2[i * 2] = a_arr[half_size + i]; @@ -946,7 +947,8 @@ inline void convert(const src_T *src, dst_T *dst, int64_t n) { #ifndef _MSC_VER # pragma unroll #endif - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { + (void)i; //Suppress unused variable warning *dst = c10::static_cast_with_inter_type::apply(*src); src++; dst++; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index ab542cb3bdab04..d6a6205ab1c249 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -4,6 +4,7 @@ #include #include +#include #define CUDABLAS_POSINT_CHECK(FD, X) \ TORCH_CHECK( \ @@ -295,7 +296,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { c, CUDA_R_16F, ldc, stridec, num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } else { - for (int64_t i = 0; i < num_batches; ++i) { + for (const auto i : c10::irange(num_batches)) { 
at::cuda::blas::gemm( transa, transb, m, n, k, diff --git a/aten/src/ATen/cudnn/Descriptors.cpp b/aten/src/ATen/cudnn/Descriptors.cpp index f52280e9d2401d..6a617edaf2777f 100644 --- a/aten/src/ATen/cudnn/Descriptors.cpp +++ b/aten/src/ATen/cudnn/Descriptors.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -47,11 +48,11 @@ void TensorDescriptor::set(cudnnDataType_t datatype, IntArrayRef t_sizes, IntArr #undef STR int size[CUDNN_DIM_MAX]; int stride[CUDNN_DIM_MAX]; - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = static_cast(t_sizes[i]); stride[i] = static_cast(t_strides[i]); } - for (size_t i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = 1; stride[i] = 1; } @@ -126,10 +127,10 @@ void FilterDescriptor::set(const at::Tensor &t, const at::MemoryFormat memory_fo "cuDNN filters (a.k.a. weights) must be contiguous in desired memory_format"); int size[CUDNN_DIM_MAX]; - for (int i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = (int) t.size(i); } - for (int i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = (int) 1; } dim = std::max(dim, pad); diff --git a/aten/src/ATen/miopen/Descriptors.cpp b/aten/src/ATen/miopen/Descriptors.cpp index 6911b1ad216bd3..ead45a52dad1a1 100644 --- a/aten/src/ATen/miopen/Descriptors.cpp +++ b/aten/src/ATen/miopen/Descriptors.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -39,11 +40,11 @@ void TensorDescriptor::set(miopenDataType_t datatype, IntArrayRef t_sizes, IntAr #undef STR int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; - for (size_t i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = static_cast(t_sizes[i]); stride[i] = static_cast(t_strides[i]); } - for (size_t i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = 1; stride[i] = 1; } @@ -103,10 +104,10 @@ void FilterDescriptor::set(const at::Tensor &t, const 
at::MemoryFormat memory_fo int size[MIOPEN_DIM_MAX]; int stride[MIOPEN_DIM_MAX]; - for (int i = 0; i < dim; ++i) { + for (const auto i : c10::irange(dim)) { size[i] = (int) t.size(i); } - for (int i = dim; i < pad; ++i) { + for (const auto i : c10::irange(dim, pad)) { size[i] = (int) 1; } diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 37700bb5867939..72414065de0f24 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -500,7 +500,7 @@ inline void _rrelu_with_noise_train( scalar_t* noise_data = noise.data_ptr(); auto gen = at::get_generator_or_default(generator, detail::getDefaultCPUGenerator()); std::lock_guard lock(gen->mutex_); - for (int64_t i = 0; i < input.numel(); i++) { + for (const auto i : c10::irange(input.numel())) { if (input_data[i] <= 0) { at::uniform_real_distribution uniform(lower, upper); const scalar_t r = (scalar_t)uniform(gen); @@ -610,7 +610,7 @@ void inline prelu_cpu_kernel_share_weights( auto weight_val = weight.data_ptr()[0]; at::parallel_for(0, input_numel, 1000, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { scalar_t input_data_val = input_data[i]; // to allow for compiler optimization, here splitting into two lines: scalar_t r = (input_data_val > 0) ? 
scalar_t(1) : weight_val; @@ -725,7 +725,7 @@ void inline prelu_cpu_backward_kernel_share_weights( scalar_t sum = at::parallel_reduce(0, input_numel, 1000, scalar_t(0), [&](int64_t start, int64_t end, scalar_t ident) -> scalar_t { scalar_t partial_sum = ident; - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { scalar_t input_data_val = input_data[i]; scalar_t grad_out_data_val = grad_out_data[i]; // to allow for compiler optimization, here splitting into two lines: @@ -839,7 +839,7 @@ std::tuple prelu_backward_cpu(const Tensor& grad_out_, const Ten std::vector reduce_dims; reduce_dims.push_back(0); if (dims > 2) { - for(int64_t i = 2; i < dims; i++) reduce_dims.push_back(i); + for (const auto i : c10::irange(2, dims))reduce_dims.push_back(i); } weight_grad = weight_grad_collector.sum(reduce_dims); } diff --git a/aten/src/ATen/native/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/AdaptiveAveragePooling.cpp index 2324b958b34f51..b0be043e30692b 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { @@ -16,7 +17,7 @@ namespace { { TORCH_CHECK(output_size.size() == 2, "adaptive_avg_pool2d: output_size must be 2"); int64_t ndim = input.ndimension(); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(input.size(i) > 0, "adaptive_avg_pool2d(): Expected input to have non-zero size for non-batch dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " @@ -52,7 +53,7 @@ namespace { const Tensor& input) { int64_t ndim = grad_output.ndimension(); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(grad_output.size(i) > 0, "adaptive_avg_pool2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, " "but grad_output has sizes ", grad_output.sizes(), " with dimension ", 
i, " being " diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index f7565b554d896e..41515259c33e1a 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace native { @@ -33,19 +34,19 @@ static void adaptive_avg_pool3d_out_frame( int64_t istrideH, int64_t istrideW) { at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) { - for (int64_t d = start; d < end; d++) { + for (const auto d : c10::irange(start, end)) { /* loop over output */ - for (int64_t ot = 0; ot < osizeT; ot++) { + for (const auto ot : c10::irange(osizeT)) { int istartT = start_index(ot, osizeT, isizeT); int iendT = end_index(ot, osizeT, isizeT); int kT = iendT - istartT; - for (int64_t oh = 0; oh < osizeH; oh++) { + for (const auto oh : c10::irange(osizeH)) { int istartH = start_index(oh, osizeH, isizeH); int iendH = end_index(oh, osizeH, isizeH); int kH = iendH - istartH; - for (int64_t ow = 0; ow < osizeW; ow++) { + for (const auto ow : c10::irange(osizeW)) { int istartW = start_index(ow, osizeW, isizeW); int iendW = end_index(ow, osizeW, isizeW); int kW = iendW - istartW; @@ -58,9 +59,9 @@ static void adaptive_avg_pool3d_out_frame( /* compute local average: */ scalar_t sum = 0; - for (int it = 0; it < kT; it++) { - for (int ih = 0; ih < kH; ih++) { - for (int iw = 0; iw < kW; iw++) { + for (const auto it : c10::irange(kT)) { + for (const auto ih : c10::irange(kH)) { + for (const auto iw : c10::irange(kW)) { scalar_t val = *(ip + it * istrideT + ih * istrideH + iw * istrideW); sum += val; @@ -83,7 +84,7 @@ void adaptive_avg_pool3d_out_cpu_template( IntArrayRef output_size) { TORCH_CHECK(output_size.size() == 3, "adaptive_avg_pool3d: output_size must be 3"); - for (int64_t i = 1; i < input.ndimension(); i++) { + for (const auto i : c10::irange(1, input.ndimension())) { TORCH_CHECK( 
input.size(i) > 0, "adaptive_avg_pool3d(): Expected input to have non-zero size for non-batch dimensions, " @@ -148,7 +149,7 @@ void adaptive_avg_pool3d_out_cpu_template( auto input_data = input.data_ptr(); auto output_data = output.data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; ++b) { + for (const auto b : c10::irange(start, end)) { adaptive_avg_pool3d_out_frame( input_data + b * input.stride(0), output_data + b * sizeD * osizeT * osizeH * osizeW, @@ -181,22 +182,22 @@ static void adaptive_avg_pool3d_backward_out_frame( int64_t osizeH, int64_t osizeW) { at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) { - for (int64_t d = start; d < end; d++) { + for (const auto d : c10::irange(start, end)) { scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH; scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; /* calculate average */ - for (int64_t ot = 0; ot < osizeT; ot++) { + for (const auto ot : c10::irange(osizeT)) { int istartT = start_index(ot, osizeT, isizeT); int iendT = end_index(ot, osizeT, isizeT); int kT = iendT - istartT; - for (int64_t oh = 0; oh < osizeH; oh++) { + for (const auto oh : c10::irange(osizeH)) { int istartH = start_index(oh, osizeH, isizeH); int iendH = end_index(oh, osizeH, isizeH); int kH = iendH - istartH; - for (int64_t ow = 0; ow < osizeW; ow++) { + for (const auto ow : c10::irange(osizeW)) { int istartW = start_index(ow, osizeW, isizeW); int iendW = end_index(ow, osizeW, isizeW); int kW = iendW - istartW; @@ -205,9 +206,9 @@ static void adaptive_avg_pool3d_backward_out_frame( gradOutput_p_d[ot * osizeH * osizeW + oh * osizeW + ow] / kT / kH / kW; - for (int it = istartT; it < iendT; it++) { - for (int ih = istartH; ih < iendH; ih++) { - for (int iw = istartW; iw < iendW; iw++) { + for (const auto it : c10::irange(istartT, iendT)) { + for (const auto ih : c10::irange(istartH, iendH)) { + for (const auto iw : c10::irange(istartW, iendW)) 
{ /* update gradient */ gradInput_p_d[it * isizeH * isizeW + ih * isizeW + iw] += grad_delta; @@ -265,7 +266,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( scalar_t* gradInput_data = gradInput.data_ptr(); scalar_t* gradOutput_data = gradOutput.data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { adaptive_avg_pool3d_backward_out_frame( gradInput_data + b * sizeD * isizeT * isizeH * isizeW, gradOutput_data + b * sizeD * osizeT * osizeH * osizeW, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp index bc9bc60b9da957..6634d74a2e3f84 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling2d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { @@ -10,7 +11,7 @@ TORCH_META_FUNC(adaptive_max_pool2d) (const Tensor& input, IntArrayRef output_si TORCH_CHECK(ndim == 3 || ndim == 4, "adaptive_max_pool2d(): Expected 3D or 4D tensor, but got: ", input.sizes()); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(input.size(i) > 0, "adaptive_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, @@ -51,7 +52,7 @@ TORCH_META_FUNC(adaptive_max_pool2d_backward) int64_t ndim = grad_output.ndimension(); TORCH_CHECK(ndim == 3 || ndim == 4, "adaptive_max_pooling2d_backward(): Expected 3D or 4D grad_output, but got: ", grad_output.sizes()); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(grad_output.size(i) > 0, "adaptive_max_pooling2d_backward(): Expected grad_output to have non-zero size for non-batch dimensions, " "but grad_output has sizes ", grad_output.sizes(), " with dimension ", i, diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp 
b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 257670fc7c9d09..3bf1186b3bce82 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -11,7 +12,7 @@ TORCH_META_FUNC(adaptive_max_pool3d) (const Tensor& input, IntArrayRef output_si TORCH_CHECK( ndim == 4 || ndim == 5, "adaptive_max_pool3d(): Expected 4D or 5D tensor, but got: ", input.sizes()); - for (int64_t i = 1; i < ndim; i++) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK( input.size(i) > 0, "adaptive_max_pool3d(): Expected input to have non-zero size for non-batch dimensions, " @@ -96,8 +97,7 @@ static void adaptive_max_pool3d_single_out_frame( int64_t istrideW) { at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { + for (const auto d : c10::irange(start, end)) { /* loop over output */ int64_t ot, oh, ow; for(ot = 0; ot < osizeT; ot++) @@ -176,8 +176,7 @@ static void adaptive_max_pool3d_out_frame( int64_t istrideW) { at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) - { + for (const auto b : c10::irange(start, end)) { adaptive_max_pool3d_single_out_frame(input_data+b*istrideB, output_data+b*sizeD*osizeT*osizeH*osizeW, indices_data+b*sizeD*osizeT*osizeH*osizeW, sizeD, @@ -203,8 +202,7 @@ static void adaptive_max_pool3d_backward_single_out_frame( int64_t osizeW) { at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { - for (auto d = start; d < end; d++) - { + for (const auto d : c10::irange(start, end)) { scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW; scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; @@ -244,8 +242,7 @@ static void adaptive_max_pool3d_backward_out_frame( int64_t osizeW) { at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < 
end; b++) - { + for (const auto b : c10::irange(start, end)) { adaptive_max_pool3d_backward_single_out_frame(gradInput_data+b*sizeD*isizeT*isizeH*isizeW, gradOutput_data+b*sizeD*osizeT*osizeH*osizeW, indices_data+b*sizeD*osizeT*osizeH*osizeW, sizeD, diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 658936f329fcd6..7d3febede6f9a5 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -169,8 +170,7 @@ static void avg_pool3d_out_frame( c10::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i, j, ti; @@ -315,7 +315,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( scalar_t *output_data = output.data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { avg_pool3d_out_frame( input_data + p * istride, output_data + p * ostride, nslices, itime, iwidth, iheight, @@ -358,8 +358,7 @@ static void avg_pool3d_backward_out_frame( c10::optional divisor_override) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i, j, ti; @@ -500,8 +499,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( scalar_t *gradOutput_data = gradOutput.data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { avg_pool3d_backward_out_frame( gradInput_data + p * istride, gradOutput_data + p * ostride, nslices, itime, iwidth, iheight, diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp 
b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index dee835a5f88dc4..58e42876da0f7c 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -63,7 +63,7 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo std::function loop = [](int64_t, int64_t){}; if (upper) { loop = [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { for (int64_t j = i + 1; j < n; j++) { self[i * stride + j] = conj_impl(self[j * stride + i]); } @@ -71,8 +71,8 @@ void apply_reflect_conj_tri_single(scalar_t* self, int64_t n, int64_t stride, bo }; } else { loop = [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; i++) { - for (int64_t j = 0; j < i; j++) { + for (const auto i : c10::irange(start, end)) { + for (const auto j : c10::irange(i)) { self[i * stride + j] = conj_impl(self[j * stride + i]); } } @@ -106,7 +106,7 @@ void apply_cholesky_inverse(Tensor& input, Tensor& infos, bool upper) { auto n = input.size(-2); auto lda = std::max(1, n); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; int* info_working_ptr = &infos_data[i]; lapackCholeskyInverse(uplo, n, input_working_ptr, lda, info_working_ptr); @@ -501,7 +501,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { lwork = std::max(1, real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index b52a0f20a35c9c..9cf1f995f3ca9a 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -2,6 
+2,7 @@ #include #include #include +#include #if AT_BUILD_WITH_BLAS() extern "C" double ddot_(int *n, double *x, int *incx, double *y, int *incy); @@ -151,7 +152,7 @@ inline void scal(int64_t n, scalar_t a, scalar_t *x, int64_t incx) blas_impl::scal_fast_path(&i_n, &a, x, &i_incx); return; } - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { if (a == scalar_t(0)) { x[i * incx] = 0; } else { @@ -176,11 +177,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t } if ((trans == 'T') || (trans == 't')) { - for (int64_t i = 0; i < n; i++) - { + for (const auto i : c10::irange(n)) { scalar_t sum = 0; scalar_t *row_ = a + lda * i; - for (int64_t j = 0; j < m; j++) { + for (const auto j : c10::irange(m)) { sum += x[j * incx] * row_[j]; } if (beta == scalar_t(0)) { @@ -192,10 +192,10 @@ void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t } else { if (beta != scalar_t(1) && beta != scalar_t(0)) scal(m, beta, y, incy); - for (int64_t j = 0; j < n; j++) { + for (const auto j : c10::irange(n)) { scalar_t *column_ = a + lda * j; scalar_t z = alpha * x[j * incx]; - for (int64_t i = 0; i < m; i++) { + for (const auto i : c10::irange(m)) { //output values are ignored if beta is 0, and set to 0, nans and infs are not propagated if (j==0 && beta==scalar_t(0)) { y[i * incy] = scalar_t(0); diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index e4ad35f59fc698..63b88510a6f4e9 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -2,6 +2,7 @@ #include #include #include +#include /* Implement a TF like searchsorted and a bucketize function running on cpu * @@ -58,7 +59,7 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens bool is_1d_boundaries = boundaries.dim() == 1; at::parallel_for(0, numel_in, SEARCHSORTED_GRAIN_SIZE, [&](int64_t start, int64_t end) { - for (int64_t i = start; i < 
end; ++i) { + for (const auto i : c10::irange(start, end)) { // If boundaries tensor is 1d, we always search the entire boundary tensor int64_t start_bd = is_1d_boundaries ? 0 : i / idim_in * idim_bd; const input_t *data_bd_start = &data_bd[start_bd]; diff --git a/aten/src/ATen/native/Col2Im.cpp b/aten/src/ATen/native/Col2Im.cpp index efc41bea0c207a..f1e08a887c841e 100644 --- a/aten/src/ATen/native/Col2Im.cpp +++ b/aten/src/ATen/native/Col2Im.cpp @@ -5,6 +5,7 @@ #include #include +#include // Note [im2col/col2im output padding] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -150,7 +151,7 @@ static void col2im_out_cpu_template( stride_width + 1; - for (int64_t elt = 0; elt < batch_size; elt++) { + for (const auto elt : c10::irange(batch_size)) { input_n = input.select(0, elt); output_n = output.select(0, elt); diff --git a/aten/src/ATen/native/ComplexHelper.h b/aten/src/ATen/native/ComplexHelper.h index 1b9a94e9089068..e9efd4b7c88db2 100644 --- a/aten/src/ATen/native/ComplexHelper.h +++ b/aten/src/ATen/native/ComplexHelper.h @@ -24,7 +24,7 @@ inline Tensor view_tensor( inline DimVector computeStrideForViewAsReal(IntArrayRef oldstride) { DimVector res(oldstride.size() + 1); - for(size_t i = 0; i < oldstride.size(); i++) { + for (const auto i : c10::irange(oldstride.size())) { res[i] = oldstride[i] * 2; } res.back() = 1; diff --git a/aten/src/ATen/native/ConstantPadNd.cpp b/aten/src/ATen/native/ConstantPadNd.cpp index 71bbfde152895a..f7a2d76ed52280 100644 --- a/aten/src/ATen/native/ConstantPadNd.cpp +++ b/aten/src/ATen/native/ConstantPadNd.cpp @@ -47,7 +47,7 @@ Tensor constant_pad_nd(const Tensor& self, IntArrayRef pad, const Scalar& value) new_shape.emplace_back(input_sizes[i]); } - for (size_t i = 0; i < (size_t)l_pad; i++) { + for (const auto i : c10::irange((size_t)l_pad)) { auto pad_idx = pad.size() - ((i + 1) * 2); auto new_dim = input_sizes[l_diff + i] + pad[pad_idx] + pad[pad_idx + 1]; TORCH_CHECK(new_dim > 0, "The input size ", input_sizes[l_diff + i], ", plus 
negative padding ", diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 9f0abaf7e61cc3..7a1dcde7b8ce37 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace at { namespace native { @@ -35,7 +36,7 @@ static inline std::vector conv_output_size( std::vector output_size(dim); output_size[0] = input_size[input_batch_size_dim]; output_size[1] = weight_size[weight_output_channels_dim]; - for (size_t d = 2; d < dim; ++d) { + for (const auto d : c10::irange(2, dim)) { auto dilation_ = has_dilation ? dilation[d - 2] : 1; auto kernel = dilation_ * (weight_size[d] - 1) + 1; output_size[d] = (input_size[d] + (2 * padding[d - 2]) - kernel) / stride[d - 2] + 1; @@ -53,7 +54,7 @@ static inline std::vector conv_input_size( std::vector input_size(dim); input_size[0] = output_size[output_batch_size_dim]; input_size[1] = weight_size[weight_input_channels_dim] * groups; - for (size_t d = 2; d < dim; ++d) { + for (const auto d : c10::irange(2, dim)) { int kernel = dilation[d - 2] * (weight_size[d] - 1) + 1; input_size[d] = (output_size[d] - 1) * stride[d - 2] - (2 * padding[d - 2]) + kernel + output_padding[d - 2]; @@ -69,7 +70,7 @@ static inline std::vector conv_weight_size( std::vector weight_size(dim); weight_size[0] = output_size[1]; weight_size[1] = input_size[1] / groups; - for (size_t d = 2; d < dim; ++d) { + for (const auto d : c10::irange(2, dim)) { int kernel = input_size[d] - (output_size[d] - 1) * stride[d - 2] + 2 * padding[d - 2] - output_padding[d - 2]; weight_size[d] = (kernel - 1) / dilation[d - 2] + 1; diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index 78eb889f8cfa6e..e8baf42b8c9bb1 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -975,7 +975,7 @@ at::Tensor _convolution( } else { std::vector outputs(params.groups); input = input.contiguous(); - for 
(int g = 0; g < params.groups; ++g) { + for (const auto g : c10::irange(params.groups)) { auto input_g = subtensor(input, 1, params.groups, g); auto weight_g = subtensor(weight, 0, params.groups, g); auto bias_g = subtensor(bias, 0, params.groups, g); @@ -1212,7 +1212,7 @@ std::tuple _convolution_double_backward( const c10::option } } else { std::vector gWt_list(groups); - for (int g = 0; g < groups; ++g) { + for (const auto g : c10::irange(groups)) { auto ggIt_g = subvariable(ggIt, 0, groups, g); auto gOt_g = subvariable(gOt, 0, groups, g); if (gOt_g.is_cuda()) { @@ -1239,7 +1239,7 @@ std::tuple _convolution_double_backward( const c10::option // the ConvForward kernels don't support asymmetric padding. auto gW_size = gW.sizes(); auto w_size = weight.sizes(); - for (size_t i = 2; i < gW_size.size(); ++i) { + for (const auto i : c10::irange(2, gW_size.size())) { if (gW_size[i] > w_size[i]) { gW = gW.narrow(i, 0, w_size[i]); gW_size = gW.sizes(); @@ -1268,7 +1268,7 @@ std::tuple _convolution_double_backward( const c10::option // rather than narrowing the computed gI auto gI_size = gI.sizes(); auto i_size = input.sizes(); - for (size_t i = 2; i < gI_size.size(); ++i) { + for (const auto i : c10::irange(2, gI_size.size())) { if (gI_size[i] > i_size[i]) { gI = gI.narrow(i, 0, i_size[i]); gI_size = gI.sizes(); @@ -1289,7 +1289,7 @@ std::tuple _convolution_double_backward( const c10::option gi_conv_params.output_padding[1] = input_shape[0] - expected_input_shape; } } else { - for(size_t i = 0; i < kernel_size.size(); ++i) { + for (const auto i : c10::irange(kernel_size.size())) { // Check if whole input has been used or not auto expected_input_shape = (kernel_size[i] - 1) * gi_conv_params.dilation[i] - 2 * gi_conv_params.padding[i] diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 67a045ad1a198d..f9f2bb88daf1c2 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -7,6 
+7,7 @@ #include #include #include +#include namespace at { namespace native { @@ -299,7 +300,7 @@ void slow_conv2d_backward_out_cpu_template( at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { auto fgrad_input = std::make_unique( c10::multiply_integers(finput.sizes().slice(1))); - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto grad_input_t = grad_input_a[t]; auto grad_output_t = grad_output_a[t]; slow_conv2d_backward_update_grad_input_frame( @@ -478,7 +479,7 @@ std::tuple slow_conv2d_forward_out_cpu( auto weight_2d_a = weight_2d.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto input_t = input_a[t]; auto output_t = output_a[t]; auto finput_t = finput_a[t]; diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index cd8ba16903f0b6..88d4245f9d93c8 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -6,6 +6,7 @@ #include #include #include +#include constexpr int64_t CONV3D_GRAIN_SALT = 20; @@ -358,7 +359,7 @@ void slow_conv3d_backward_out_cpu_template( auto fgrad_input_a = fgrad_input.accessor(); auto weight_2d_a = weight2d.accessor(); - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto grad_input_t = grad_input_a[t]; auto grad_output_t = grad_output_a[t]; auto fgrad_input_t = fgrad_input_a[t]; @@ -462,7 +463,7 @@ static void slow_conv3d_backward_parameters_out_cpu_template( auto grad_weight_2d_a = grad_weight_2d.accessor(); auto grad_output_a = grad_output_contiguous.accessor(); auto finput_a = finput.accessor(); - for (int64_t t = 0; t < batch_size; t++) { + for (const auto t : c10::irange(batch_size)) { auto grad_output_t = grad_output_a[t]; auto finput_t = finput_a[t]; slow_conv3d_backward_weight_frame( @@ -564,7 +565,7 @@ std::tuple 
slow_conv3d_forward_out_cpu(const Tensor& at::parallel_for( 0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { - for (int64_t t = start; t < end; t++) { + for (const auto t : c10::irange(start, end)) { auto input_t = input_a[t]; auto output_t = output_a[t]; auto finput_t = finput_a[t]; diff --git a/aten/src/ATen/native/ConvolutionTBC.cpp b/aten/src/ATen/native/ConvolutionTBC.cpp index 2bd0f5ae4b9e3b..c90577822218e9 100644 --- a/aten/src/ATen/native/ConvolutionTBC.cpp +++ b/aten/src/ATen/native/ConvolutionTBC.cpp @@ -1,5 +1,6 @@ #include #include +#include #include namespace at { @@ -39,7 +40,7 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in weight_size[2], }, self.options()); output.copy_(bias.expand(output.sizes())); - for (int k = 0; k < kw; k++) { + for (const auto k : c10::irange(kw)) { int iShift = std::max(0, static_cast(k - real_pad)); int oShift = std::max(0, static_cast(real_pad - k)); // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index e2c2a1150c5729..c28ca2b66ef8f7 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #ifdef USE_FBGEMM @@ -65,16 +66,16 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { int nc = std::min(NC - C, BLOCK_SZ); // 1. copy columns from src to buf - for (int c = 0; c < nc; c++) { + for (const auto c : c10::irange(nc)) { memcpy(bp + c * BLOCK_SZ, spo + c * NR, nr * sizeof(scalar_t)); } // 2. 
transpose buf in place int rc_max = std::max(nr, nc); int rc_min = std::min(nr, nc); - for (int r = 0; r < rc_max; r++) { + for (const auto r : c10::irange(rc_max)) { int end = std::min(r, rc_min); - for (int c = 0; c < end; c++) { + for (const auto c : c10::irange(end)) { scalar_t tmp = bp[r + BLOCK_SZ * c]; bp[r + BLOCK_SZ * c] = bp[r * BLOCK_SZ + c]; bp[r * BLOCK_SZ + c] = tmp; @@ -82,7 +83,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { } // 3. copy rows from buf to dst - for (int r = 0; r < nr; r++) { + for (const auto r : c10::irange(nr)) { memcpy(rpo + r * NC, bp + r * BLOCK_SZ, nc * sizeof(scalar_t)); } } diff --git a/aten/src/ATen/native/Cross.cpp b/aten/src/ATen/native/Cross.cpp index 70dba97520c008..49f3c80e27d509 100644 --- a/aten/src/ATen/native/Cross.cpp +++ b/aten/src/ATen/native/Cross.cpp @@ -3,6 +3,7 @@ #include #include +#include namespace at { namespace native { @@ -30,7 +31,7 @@ Tensor & cross_out(const Tensor & input, const Tensor & other, const c10::option int64_t dim = -1; if(!dimension.has_value()) { - for(int64_t i = 0; i < input.dim(); i++) { + for (const auto i : c10::irange(input.dim())) { if(input.size(i) == 3) { dim = i; break; diff --git a/aten/src/ATen/native/DilatedConvolutionUtils.h b/aten/src/ATen/native/DilatedConvolutionUtils.h index 0f9bf90ab5a169..2d4815799b10f2 100644 --- a/aten/src/ATen/native/DilatedConvolutionUtils.h +++ b/aten/src/ATen/native/DilatedConvolutionUtils.h @@ -5,6 +5,7 @@ #include #include +#include #define TORCH_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \ TORCH_CHECK( \ @@ -43,7 +44,7 @@ std::vector get_output_size( IntArrayRef pad_size, IntArrayRef dilation_size) { std::vector sizes; - for (int index = 0; index < dim; index++) { + for (const auto index : c10::irange(dim)) { sizes.push_back( div_rtn( input.size(index + input.dim() - dim) + 2 * pad_size[index] - diff --git a/aten/src/ATen/native/DilatedMaxPool3d.cpp b/aten/src/ATen/native/DilatedMaxPool3d.cpp index 
21398c09067598..57fa6f9ea691cf 100644 --- a/aten/src/ATen/native/DilatedMaxPool3d.cpp +++ b/aten/src/ATen/native/DilatedMaxPool3d.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -37,8 +38,7 @@ static void max_pool3d_with_indices_single_out_frame( int dilationH) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { /* loop over output */ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t i, j, ti; @@ -120,8 +120,7 @@ static void max_pool3d_with_indices_out_frame( int dilationT, int dilationW, int dilationH) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { max_pool3d_with_indices_single_out_frame( input_data + p * istride, output_data + p * ostride, @@ -285,8 +284,7 @@ static void max_pool3d_with_indices_backward_single_out_frame( int dilationH) { at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { scalar_t *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; scalar_t *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; int64_t *indz_p_k = indz_p + k * otime * owidth * oheight; @@ -330,8 +328,7 @@ static void max_pool3d_with_indices_backward_out_frame( int dilationT, int dilationW, int dilationH) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { max_pool3d_with_indices_backward_single_out_frame( gradInput_data + p * istride, gradOutput_data + p * ostride, diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index ac56071edb8054..c4a6ec6cef5561 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -1,6 +1,7 @@ #include #include #include 
+#include namespace at { namespace native { @@ -16,8 +17,10 @@ Tensor make_feature_noise(const Tensor& input) { sizes.reserve(input.dim()); sizes.push_back(input_sizes[0]); sizes.push_back(input_sizes[1]); - for (int64_t i = 2; i < input.dim(); ++i) + for (const auto i : c10::irange(2, input.dim())) { + (void)i; //Suppress unused variable warning sizes.push_back(1); + } return at::empty(sizes, input.options()); } diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 959005c52b2fc4..cac0cbe7130f26 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -123,7 +123,7 @@ Tensor embedding_dense_backward_cpu( auto parallel_section = [&](index_t start, index_t end) { TensorIterator iter(add_iter); - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { if (indices_data[i] != padding_idx) { index_t k = indices_data[i]; if (k >= start && k < end) { @@ -167,7 +167,7 @@ Tensor & embedding_renorm_cpu_( // Note that we cannot use at::parallel_for here because we perform operations on // Tensor inside the loop. See github.com/pytorch/pytorch/issues/28370 for more details. 
- for (auto i = 0; i < num_indices; i++) { + for (const auto i : c10::irange(num_indices)) { if (i > 0 && sorted_indices[i] == sorted_indices[i - 1]) { continue; } diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 53477e2e20e989..66ae4b4f7956cb 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -107,7 +107,7 @@ index_select_add(const Tensor &select_indices, auto output_stride0 = output.strides()[0]; auto output_stride1 = output.strides()[1]; - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { @@ -247,7 +247,7 @@ index_select_add(const Tensor &select_indices, auto output_stride0 = output.strides()[0]; auto output_stride1 = output.strides()[1]; auto numel = add_indices.numel(); - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { @@ -302,14 +302,14 @@ index_select_scale_add(const Tensor &select_indices, auto* scale_data = scale.data_ptr(); auto scale_stride = scale.strides()[0]; - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { auto* src_base = src_data + src_stride0 * select_indices_data[i]; auto* output_base = output_data + output_stride0 * add_indices_data[i]; auto scale = scale_data[i * scale_stride]; - for (int64_t j = 0; j < ddim; j++) { + for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; } } else if (bag_size.defined()) { @@ -419,14 +419,14 @@ index_select_scale_add(const Tensor &select_indices, auto numel = 
add_indices.numel(); - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // We can skip indices equal to padding_idx so they are not included in // the reduction if (select_indices_data[i] != padding_idx) { auto* src_base = src_data + src_stride0 * select_indices_data[i]; auto* output_base = output_data + output_stride0 * add_indices_data[i]; auto scale = scale_data[i * scale_stride]; - for (int64_t j = 0; j < ddim; j++) { + for (const auto j : c10::irange(ddim)) { output_base[j * output_stride1] += src_base[j * src_stride1] * scale; } } else if (bag_size.defined()) { diff --git a/aten/src/ATen/native/Fill.cpp b/aten/src/ATen/native/Fill.cpp index 91f0534d5fa1f6..acaad52a299591 100644 --- a/aten/src/ATen/native/Fill.cpp +++ b/aten/src/ATen/native/Fill.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace native { @@ -63,7 +64,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { if (nDims > 2) { int64_t dim1 = height; - for (int64_t i = 1; i < nDims; i++) { + for (const auto i : c10::irange(1, nDims)) { if (self.size(i) != dim1) { AT_ERROR("all dimensions of input must be of equal length"); } @@ -76,7 +77,7 @@ Tensor& fill_diagonal_(Tensor& self, const Scalar& fill_value, bool wrap) { int64_t size = std::min(height, width); int64_t stride = 0; - for (int64_t i = 0; i < nDims; i++) { + for (const auto i : c10::irange(nDims)) { stride += self.stride(i); } strides.push_back(stride); diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index e7eacd3e76e0f7..bdff052e94b001 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,7 @@ TORCH_META_FUNC(fractional_max_pool2d) ( int64_t ndims = input.ndimension(); TORCH_CHECK(ndims == 3 || ndims == 4, "fractional_max_pool2d(): Expected 3D or 4D tensor, 
but got: ", input.sizes()); - for (int64_t i = 1; i < ndims; ++i) { + for (const auto i : c10::irange(1, ndims)) { TORCH_CHECK(input.size(i) > 0, "fractional_max_pool2d(): Expected input to have non-zero size for non-batch dimensions, but got", input.sizes(), " with dimension ", i, " being empty."); @@ -106,7 +107,7 @@ static void fractional_max_pool2d_out_single_batch_frame( int outputW, int outputH, int poolSizeW, int poolSizeH) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; ++plane) { + for (const auto plane : c10::irange(start, end)) { /* each plane contains 2 random samples, one for W and one for H */ scalar_t* randomSamplesForPlane = randomSamples + plane * 2; @@ -177,7 +178,7 @@ static void fractional_max_pool2d_out_frame( return; } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool2d_out_single_batch_frame( input + batch * numPlanes * inputH * inputW, output + batch * numPlanes * outputH * outputW, @@ -254,7 +255,7 @@ static void fractional_max_pool2d_backward_out_single_batch_frame( int inputW, int inputH, int outputW, int outputH) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; plane++) { + for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane = gradInput + plane * inputW * inputH; scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH; int64_t* indicesForPlane = indices + plane * outputW * outputH; @@ -291,7 +292,7 @@ static void fractional_max_pool2d_backward_out_frame( return; } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool2d_backward_out_single_batch_frame( gradInput + batch * numPlanes * inputH * inputW, 
gradOutput + batch * numPlanes * outputH * outputW, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 279ff92467733d..237f9d4395bcea 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -44,7 +44,7 @@ static void fractional_max_pool3d_out_single_batch_frame( int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; ++plane) { + for (const auto plane : c10::irange(start, end)) { /* each plane contains 3 random samples, one for T, one for W, and one for H */ scalar_t* randomSamplesForPlane = randomSamples + plane * 3; @@ -126,7 +126,7 @@ static void fractional_max_pool3d_out_frame( } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool3d_out_single_batch_frame( input + batch * numPlanes * inputW * inputH * inputT, output + batch * numPlanes * outputW * outputH * outputT, @@ -171,7 +171,7 @@ void fractional_max_pool3d_out_cpu_template( TORCH_CHECK(ndims == 4 || ndims == 5, "fractional_max_pool3d_out(): Expected 4D or 5D tensor, but got: ", input_.sizes()); - for (int64_t i = 1; i < ndims; ++i) { + for (const auto i : c10::irange(1, ndims)) { TORCH_CHECK(input_.size(i) > 0, "fractional_max_pool3d_out(): Expected input to have non-zero size for non-batch dimensions, but got", input_.sizes(), " with dimension ", i, " being empty."); @@ -243,7 +243,7 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( int64_t outputT, int64_t outputH, int64_t outputW) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { - for (auto plane = start; plane < end; plane++) { + for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * 
inputW; scalar_t* gradOutputForPlane = gradOutput + plane * outputT * outputH * outputW; @@ -284,7 +284,7 @@ static void fractional_max_pool3d_backward_out_frame( } at::parallel_for(0, numBatch, 0, [&](int64_t start, int64_t end) { - for (auto batch = start; batch < end; ++batch) { + for (const auto batch : c10::irange(start, end)) { fractional_max_pool3d_backward_out_single_batch_frame( gradInput + batch * numPlanes * inputW * inputH * inputT, gradOutput + batch * numPlanes * outputW * outputH * outputT, diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index df3ff5d73f7ebb..740f725167a63a 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace at { namespace native { @@ -51,12 +52,12 @@ namespace { scalar_t *grid_ptr = grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { + for (const auto d : c10::irange(out_D)) { + for (const auto h : c10::irange(out_H)) { + for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; @@ -222,12 +223,12 @@ namespace { scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; - for 
(int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { + for (const auto d : c10::irange(out_D)) { + for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y, z co-ordinates from grid scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; @@ -416,11 +417,11 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, scalar_t *grid_ptr = grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { + for (const auto h : c10::irange(out_H)) { + for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; @@ -505,7 +506,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, scalar_t coefficients[4]; // Interpolate 4 values in the x directon - for (int64_t i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { coefficients[i] = cubic_interp1d( get_value_bounded(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), get_value_bounded(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), @@ -578,11 +579,11 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { scalar_t *grid_ptr_N = grid_ptr + n * 
grid_sN; scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { + for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y co-ordinates from grid scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; @@ -703,8 +704,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) { scalar_t gOut = *gOut_ptr_NCHW; - for (int64_t i = 0; i < 4; ++i) { - for (int64_t j = 0; j < 4; ++j) { + for (const auto i : c10::irange(4)) { + for (const auto j : c10::irange(4)) { // set input gradient add_value_bounded(gInp_ptr_NC, ix_nw - 1 + i, iy_nw - 1 + j, @@ -857,7 +858,7 @@ Tensor grid_sampler(const Tensor& input, const Tensor& grid, !(input.dim() == 5 && static_cast(interpolation_mode) == GridSamplerInterpolation::Bicubic), "grid_sampler(): bicubic interpolation only supports 4D input" ); - for (int64_t i = 2; i < input.dim(); i++) { + for (const auto i : c10::irange(2, input.dim())) { TORCH_CHECK(input.size(i) > 0, "grid_sampler(): expected input to have non-empty spatial dimensions, " "but input has sizes ", input.sizes(), " with dimension ", i, " being " diff --git a/aten/src/ATen/native/Im2Col.cpp b/aten/src/ATen/native/Im2Col.cpp index f66e4d44544616..c4b05bc18b566f 100644 --- a/aten/src/ATen/native/Im2Col.cpp +++ b/aten/src/ATen/native/Im2Col.cpp @@ -5,6 +5,7 @@ #include #include +#include namespace at { namespace native { @@ -91,7 +92,7 @@ static void im2col_out_cpu_template( Tensor input_n; Tensor output_n; - for (int64_t elt = 0; elt < batch_size; elt++) { + for (const auto elt : c10::irange(batch_size)) { input_n = input.select(0, elt); output_n = output.select(0, elt); diff --git a/aten/src/ATen/native/IndexingUtils.h 
b/aten/src/ATen/native/IndexingUtils.h index 5b938e9536c486..2dea9a0e94d416 100644 --- a/aten/src/ATen/native/IndexingUtils.h +++ b/aten/src/ATen/native/IndexingUtils.h @@ -2,6 +2,7 @@ #include #include #include +#include #include @@ -31,7 +32,7 @@ static C10_UNUSED std::vector expandTensors(const Tensor & self, const t } // The sizes of the ByteTensor mask or bool tensor must match the sizes of the // corresponding dimensions in self - for (int64_t j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { int64_t srcIdx = result.size() + j; if (index.size(j) != self.size(srcIdx)) { invalid_mask(self, srcIdx, index, j); @@ -39,7 +40,7 @@ static C10_UNUSED std::vector expandTensors(const Tensor & self, const t } // Replace with nonzeros auto nonzero = index.nonzero(); - for (int64_t j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { result.emplace_back(nonzero.select(1, j)); } } else { diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 2cb8825a3f335b..942a15dfc26bdd 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -1158,7 +1158,7 @@ static void addbmm_impl_( } auto adjusted_beta(beta); - for (int64_t batch = 0; batch < num_batches; ++batch) { + for (const auto batch : c10::irange(num_batches)) { result.addmm_(batch1[batch], batch2[batch], adjusted_beta, alpha); adjusted_beta = 1; // accumulate output once } @@ -1215,23 +1215,23 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T int64_t grain_size = std::min(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); parallel_for(0, bs, grain_size, [&](int64_t b_begin, int64_t b_end) { - for (int64_t b = b_begin; b < b_end; b++) { + for (const auto b : c10::irange(b_begin, b_end)) { auto r1 = r0[b]; auto s1 = s0[b]; auto m1 = m0[b]; - for (int64_t i = 0; i < is; i++) { + for (const auto i : c10::irange(is)) { auto r2 = r1[i]; auto s2 = s1[i]; 
- for (int64_t j = 0; j < js; j++) { + for (const auto j : c10::irange(js)) { scalar_t &r = r2[j]; if (is_bmm) { r = 0; - for (int64_t k = 0; k < ks; k++) { + for (const auto k : c10::irange(ks)) { r += s2[k] * m1[k][j]; } } else { r *= beta; - for (int64_t k = 0; k < ks; k++) { + for (const auto k : c10::irange(ks)) { r += alpha * s2[k] * m1[k][j]; } } @@ -1994,10 +1994,11 @@ void compute_T18_scale_square( auto mexp_scaled = at::native::compute_T18(a_scaled); auto s_cpu = (s.device().type() == at::kCPU) ? s : s.to(at::kCPU); - for (int64_t i = 0; i < mexp_scaled.size(0); ++i) { + for (const auto i : c10::irange(mexp_scaled.size(0))) { auto s_val = s_cpu.select(0, i).template item(); auto mexp = mexp_scaled.select(0, i); - for (int64_t p = 0; p < s_val; ++p) { + for (const auto p : c10::irange(s_val)) { + (void)p; //Suppress unused variable warning mexp = at::matmul(mexp, mexp); } mexp_out.select(0, i).copy_(mexp); @@ -2265,7 +2266,7 @@ Tensor& nuclear_norm_out(const Tensor& self, IntArrayRef dim, bool keepdim, Tens // (e.g. [0, 1, 2, ..., ndim-1]) static std::vector make_dim_list(int64_t ndim) { std::vector dim_list(ndim); - for (int64_t ind = 0; ind < ndim; ind++) { + for (const auto ind : c10::irange(ndim)) { dim_list[ind] = ind; } return dim_list; @@ -2818,7 +2819,7 @@ struct KronImpl final { a_reshape = c10::SmallVector(2 * maxdim); b_reshape = c10::SmallVector(2 * maxdim); result_reshape = c10::SmallVector(maxdim); - for (int64_t i = 0; i < maxdim; i++) { + for (const auto i : c10::irange(maxdim)) { a_reshape[2 * i] = (i >= pad_self ? self.sizes()[i - pad_self] : 1); a_reshape[2 * i + 1] = 1; b_reshape[2 * i] = 1; @@ -2833,7 +2834,7 @@ struct KronImpl final { TORCH_INTERNAL_ASSERT(result.defined(), "Cannot call kron_out with an undefined result tensor as the out argument. 
Please allocate a Tensor before calling kron_out with it."); c10::SmallVector mul_shape(2 * maxdim); - for (int64_t i = 0; i < maxdim; i++) { + for (const auto i : c10::irange(maxdim)) { mul_shape[2 * i] = a_reshape[2 * i]; mul_shape[2 * i + 1] = b_reshape[2 * i + 1]; } diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 3c37a49d4b70ba..d34d9355ece6c6 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -169,7 +170,8 @@ void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const fu auto* b_batch_idx_ptr = data[0]; auto* a_batch_idx_ptr = data[1]; - for (int64_t elem = 0; elem < nelems; ++elem) { + for (const auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning auto b_curr_linear_batch_idx = *reinterpret_cast(b_batch_idx_ptr); auto a_curr_linear_batch_idx = *reinterpret_cast(a_batch_idx_ptr); @@ -332,7 +334,7 @@ static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) { const int64_t ndim = self.ndimension(); std::vector perm; - for (int64_t i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { auto it = std::find(a.begin(), a.end(), i); if (it == a.end()) { perm.push_back(i); @@ -476,7 +478,7 @@ static inline std::vector create_dim_backshift_permutation(int64_t dim0 "duplicate or invalid dimensions"); std::vector permutation(ndim); int64_t cur_permuted_dim = 0; - for (int64_t dim_ind = 0; dim_ind < ndim; dim_ind++) { + for (const auto dim_ind : c10::irange(ndim)) { if ((dim_ind != dim0) && (dim_ind != dim1)) { permutation[cur_permuted_dim++] = dim_ind; } @@ -493,7 +495,7 @@ static inline std::vector create_dim_backshift_permutation(int64_t dim0 static inline std::vector create_reverse_permutation(std::vector permutation) { int64_t ndim = permutation.size(); std::vector reverse_permutation(ndim); - for (int64_t 
dim_ind = 0; dim_ind < ndim; dim_ind++) { + for (const auto dim_ind : c10::irange(ndim)) { reverse_permutation[permutation[dim_ind]] = dim_ind; } return reverse_permutation; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index dbf0e2cc990950..19af04b9731de4 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -60,7 +61,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const std::vector tg_batch_offsets(batch_size); if (targets.dim() == 1) { // concatenated targets int64_t pos = 0; - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { tg_batch_offsets[i] = pos; pos += target_lengths[i]; if (max_target_length < target_lengths[i]) @@ -72,7 +73,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const else { // batch x max_target_length // dim is 2 int64_t tg_batch_stride = targets.stride(0); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { tg_batch_offsets[i] = i * tg_batch_stride; if (max_target_length < target_lengths[i]) max_target_length = target_lengths[i]; @@ -84,7 +85,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const " (while checking arguments for ", c, ")"); } int64_t max_input_length = log_probs.size(0); - for (int64_t b = 0; b < batch_size; b++) { + for (const auto b : c10::irange(batch_size)) { TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b], " (while checking arguments for ", c, ")"); @@ -103,7 +104,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const // first the default log_alpha.narrow(1, 0, 1).fill_(neginf); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { 
int64_t input_length = input_lengths[b]; int64_t target_length = target_lengths[b]; auto log_probs_a = log_probs_a_global[b]; @@ -116,7 +117,7 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const log_alpha_a[0][1] = log_probs_a[0][get_target_prime(targets_data, tg_batch_offset, tg_target_stride, 1, BLANK)]; // now the loop over the inputs - for (int64_t t=1; t()[b]; auto grad_a = grad_a_global[b]; if (zero_infinity && nll == std::numeric_limits::infinity()) { @@ -322,8 +323,8 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ // this could be a great target for further vectorization. // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) scalar_t gr = grad_out.accessor()[b]; - for (int64_t t = 0; t < input_length; t++) { // or go for the full thing? - for (int64_t c = 0; c < num_labels; c++) { + for (const auto t : c10::irange(input_length)) { // or go for the full thing? + for (const auto c : c10::irange(num_labels)) { scalar_t& res = grad_a[t][c]; scalar_t lp = log_probs_a[t][c]; res = (std::exp(lp)-std::exp(res + nll - lp)) * gr; diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index fa71663716f5d3..f59de5c8817a42 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -17,21 +18,21 @@ inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( int64_t dim) { using accscalar_t = at::acc_type; accscalar_t sum = 0; - for (int64_t ddt = 0; ddt < dim; ddt++) { + for (const auto ddt : c10::irange(dim)) { int64_t target_idx = target_data[ddt]; if (target_idx < 0) { break; } is_target_data[target_idx] = 1; } - for (int64_t dt = 0; dt < dim; dt++) { + for (const auto dt : c10::irange(dim)) { int64_t target_idx = target_data[dt]; if (target_idx < 0) { break; } scalar_t 
input_target = input_data[target_idx]; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { if (!is_target_data[d]) { scalar_t z = 1 - input_target + input_data[d]; if (z > 0) { @@ -63,7 +64,8 @@ static void multilabel_margin_loss_forward_out_frame( accscalar_t sum = 0; - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { + (void)t; //Suppress unused variable warning sum += multilabel_margin_loss_forward_inner_sum_cpu( input_data, target_data, is_target_data, dim); @@ -81,7 +83,7 @@ static void multilabel_margin_loss_forward_out_frame( } else { auto output_acc = output.accessor(); - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { scalar_t sum = multilabel_margin_loss_forward_inner_sum_cpu( input_data, target_data, is_target_data, dim); @@ -171,15 +173,16 @@ static void multilabel_margin_loss_backward_out_frame( reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim); scalar_t* grad_input_row_data = grad_input.data_ptr(); - for (int64_t t = 0; t < nframe; t++) { - for (int64_t dt = 0; dt < dim; dt++) { + for (const auto t : c10::irange(nframe)) { + (void)t; //Suppress unused variable warning + for (const auto dt : c10::irange(dim)) { int64_t target_idx = target_data[dt]; if (target_idx < 0) { break; } scalar_t input_target = input_data[target_idx]; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { if (!is_target_data[d]) { scalar_t z = 1 - input_target + input_data[d]; if (z > 0) { @@ -206,8 +209,8 @@ static void multilabel_margin_loss_backward_out_frame( } else { check_dim_size(grad_output, 1, 0, nframe); auto grad_output_acc = grad_output.accessor(); - for (int64_t t = 0; t < nframe; t++) { - for (int64_t d = 0; d < dim; d++) { + for (const auto t : c10::irange(nframe)) { + for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; } } diff --git a/aten/src/ATen/native/LossMultiMargin.cpp 
b/aten/src/ATen/native/LossMultiMargin.cpp index b65aaf9b6adce2..c7ab53f1d211b7 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { namespace native { @@ -18,7 +19,7 @@ inline scalar_t multi_margin_inner_sum_cpu( const int64_t target_idx) { const scalar_t input_target = input_data[target_idx]; scalar_t sum = 0; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { if (d == target_idx) { continue; } @@ -63,7 +64,7 @@ static inline void multi_margin_loss_cpu_kernel( // cannot be handled by TensorAccessor) if (reduction == Reduction::None && output.dim() > 0) { auto output_acc = output.accessor(); - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { const auto idx = target_index_checked(target_data, t, dim); auto sum = multi_margin_inner_sum_cpu( input_data, weight_data, p, margin, dim, idx); @@ -73,7 +74,7 @@ static inline void multi_margin_loss_cpu_kernel( } else { accscalar_t sum = 0; auto output_acc = output.data_ptr(); - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { const auto idx = target_index_checked(target_data, t, dim); sum += multi_margin_inner_sum_cpu( input_data, weight_data, p, margin, dim, idx); @@ -149,11 +150,11 @@ static void multi_margin_loss_backward_cpu_kernel( int64_t dim, int64_t reduction) { scalar_t* grad_input_row_data = grad_input_data; - for (int64_t t = 0; t < nframe; t++) { + for (const auto t : c10::irange(nframe)) { int64_t target_idx = target_index_checked(target_data, t, dim); scalar_t input_target = input_data[target_idx]; scalar_t grad_input_target = 0; - for (int64_t d = 0; d < dim; d++) { + for (const auto d : c10::irange(dim)) { scalar_t z = margin - input_target + input_data[d]; if (d == target_idx) { continue; @@ -186,8 +187,8 @@ static void multi_margin_loss_backward_cpu_kernel( } } else { auto grad_output_acc = 
grad_output.accessor(); - for (int64_t t = 0; t < nframe; t++) { - for (int64_t d = 0; d < dim; d++) { + for (const auto t : c10::irange(nframe)) { + for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; } } diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 78f982afcd871d..dfb4aced85c41f 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace at { namespace meta { @@ -155,7 +156,7 @@ static void nll_loss_out_frame( auto output_acc = output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { const auto cur_target = target_acc[i]; if (cur_target == ignore_index) { @@ -215,7 +216,7 @@ static void nll_loss_out_frame( scalar_t weight_partial_sums[cascade_sum_num_levels] = {0}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) scalar_t loss_partial_sums[cascade_sum_num_levels] = {0}; - for (int64_t b = 0; b < batch_size; b++) { + for (const auto b : c10::irange(batch_size)) { const int64_t cur_target = target_data[b]; if (cur_target == ignore_index) { ++num_ignored; @@ -330,7 +331,7 @@ static void nll_loss_backward_out_frame( auto grad_input_acc = grad_input.accessor(); auto grad_output_acc = grad_output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { auto cur_target = target_acc[i]; if (cur_target == ignore_index) { continue; diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 0e3e9e6fec77ba..d7ebf65231f1ed 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -109,9 +110,9 @@ static void 
nll_loss2d_forward_out_frame( auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { - for (int64_t h = 0; h < H; h++) { - for (int64_t w = 0; w < W; w++) { + for (const auto b : c10::irange(start, end)) { + for (const auto h : c10::irange(H)) { + for (const auto w : c10::irange(W)) { const int64_t cur_target = (int64_t)target_acc[b][h][w]; if (cur_target == ignore_index) { @@ -176,8 +177,8 @@ static void nll_loss2d_forward_out_frame( const int64_t level_mask = level_step - 1; int64_t num_ignored = 0; - for (int64_t b = 0; b < batch_size; b++) { - for (int64_t elem = 0; elem < map_size; elem++) { + for (const auto b : c10::irange(batch_size)) { + for (const auto elem : c10::irange(map_size)) { const int64_t cur_target = target_data[b * map_size + elem]; if (cur_target == ignore_index) { ++num_ignored; @@ -286,9 +287,9 @@ static void nll_loss2d_backward_out_frame( auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { - for (int64_t h = 0; h < H; h++) { - for (int64_t w = 0; w < W; w++) { + for (const auto b : c10::irange(start, end)) { + for (const auto h : c10::irange(H)) { + for (const auto w : c10::irange(W)) { const int64_t cur_target = target_acc[b][h][w]; if (cur_target == ignore_index) { continue; @@ -329,8 +330,8 @@ static void nll_loss2d_backward_out_frame( : grad_output_value); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { - for (int64_t b = start; b < end; b++) { - for (int64_t elem = 0; elem < map_size; elem++) { + for (const auto b : c10::irange(start, end)) { + for (const auto elem : c10::irange(map_size)) { const int64_t t = target_data[b * map_size + elem]; if (t != ignore_index) { diff --git a/aten/src/ATen/native/NNPACK.cpp b/aten/src/ATen/native/NNPACK.cpp index fa1d1d86c6930d..e83320e09fa6eb 100644 --- a/aten/src/ATen/native/NNPACK.cpp +++ 
b/aten/src/ATen/native/NNPACK.cpp @@ -60,6 +60,7 @@ bool _nnpack_available() { #include #include #include +#include namespace at { namespace native { @@ -238,7 +239,7 @@ Tensor _nnpack_spatial_convolution( const size_t input_size_per_batch = input_channels * input_size.width * input_size.height; const size_t output_size_per_batch = output_channels * output_size.width * output_size.height; - for (size_t batch = 0u; batch < batch_size; ++batch) { + for (const auto batch : c10::irange(0u, batch_size)) { const nnp_status status = nnp_convolution_inference( algorithm, nnp_convolution_transform_strategy_compute, diff --git a/aten/src/ATen/native/NamedTensor.cpp b/aten/src/ATen/native/NamedTensor.cpp index 1d5d8e4a4a6982..c987f72261ab47 100644 --- a/aten/src/ATen/native/NamedTensor.cpp +++ b/aten/src/ATen/native/NamedTensor.cpp @@ -100,7 +100,7 @@ Tensor refine_names(const Tensor& self, DimnameList names) { self_names.size(), " and ", names.size(), " respectively)."); check_names_valid_for(self, names); - for (size_t idx = 0; idx < self_names.size(); idx++) { + for (const auto idx : c10::irange(self_names.size())) { const auto& self_name = self_names[idx]; const auto& out_name = names[idx]; if (self_name == out_name || self_name.isWildcard()) { @@ -221,7 +221,7 @@ Tensor align_to(const Tensor& tensor, DimnameList order, int64_t ellipsis_idx) { }; // Fill in the non-ellipsis dimensions - for (auto order_idx = 0U; order_idx < order.size(); ++order_idx) { + for (const auto order_idx : c10::irange(0U, order.size())) { auto out_idx = order_idx; if (order_idx >= ellipsis_idx) { out_idx = order_idx + num_ellipsis_names; diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 25ae1a765e85ff..fdce903c0806d9 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -156,7 +157,7 @@ std::tuple batch_norm_cpu_update_stats_template( // 
Reduce all dimensions except dim=1 DimVector reduce_dims(ndim - 1); reduce_dims[0] = 0; - for (int64_t i = 2; i < ndim; ++i) { + for (const auto i : c10::irange(2, ndim)) { reduce_dims[i - 1] = i; } @@ -178,7 +179,7 @@ std::tuple batch_norm_cpu_update_stats_template( batch_norm_cpu_collect_stats_stub(kCPU, _mean, _var_sum, input); parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { - for (int64_t f = b_begin; f < b_end; ++f) { + for (const auto f : c10::irange(b_begin, b_end)) { save_mean_a[f] = _mean_a[f]; save_var_transform_a[f] = VarTransform{}(_var_sum_a[f] / n, eps); @@ -206,7 +207,7 @@ std::tuple batch_norm_cpu_update_stats_template( parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { TensorIterator iter(reduce_iter); - for (int64_t f = b_begin; f < b_end; ++f) { + for (const auto f : c10::irange(b_begin, b_end)) { // compute variance per input iter.unsafe_replace_operand(0, in_data + channel_stride * f); accscalar_t var_sum = 0; @@ -283,7 +284,7 @@ std::tuple batch_norm_backward_cpu_template( // Reduce all dimensions except dim=1 DimVector reduce_dims(ndim - 1); reduce_dims[0] = 0; - for (int64_t i = 2; i < ndim; ++i) { + for (const auto i : c10::irange(2, ndim)) { reduce_dims[i - 1] = i; } @@ -330,7 +331,7 @@ std::tuple batch_norm_backward_cpu_template( TensorIterator unary_iter_local(unary_iter); TensorIterator binary_iter_local(binary_iter); - for (int64_t f = b_begin; f < b_end; ++f) { + for (const auto f : c10::irange(b_begin, b_end)) { scalar_t w = weight.defined() ? weight_a[f] : 1; scalar_t mean, invstd; diff --git a/aten/src/ATen/native/PackedSequence.cpp b/aten/src/ATen/native/PackedSequence.cpp index 798672ccdeaeff..ec997d86aa1b59 100644 --- a/aten/src/ATen/native/PackedSequence.cpp +++ b/aten/src/ATen/native/PackedSequence.cpp @@ -77,7 +77,7 @@ std::tuple _pack_padded_sequence(const Tensor& _input, const Ten // more elements below in our column, we lower the counter (prev_l), and append the new // block to the output. 
int64_t prev_l = 0; - for (int64_t i = 0; i < batch_size; ++i) { + for (const auto i : c10::irange(batch_size)) { int64_t l = lengths[batch_size - 1 - i]; if (l > prev_l) { auto current_batch_size = batch_size - i; @@ -109,7 +109,7 @@ Tensor _pack_padded_sequence_backward(const Tensor& grad, at::IntArrayRef input_ int64_t offset = 0; int64_t max_seq_len = batch_sizes_t.size(0); int64_t * batch_sizes = batch_sizes_t.data_ptr(); - for (int64_t i = 0; i < max_seq_len; ++i) { + for (const auto i : c10::irange(max_seq_len)) { grad_input[i].slice(0, 0, batch_sizes[i]).copy_(grad.slice(0, offset, offset + batch_sizes[i])); offset += batch_sizes[i]; } @@ -170,7 +170,8 @@ std::tuple _pad_packed_sequence(const Tensor& data, const Tensor } int64_t dec = prev_batch_size - batch_size; if (dec > 0) { - for (int64_t j = 0; j < dec; ++j) { + for (const auto j : c10::irange(dec)) { + (void)j; //Suppress unused variable warning (*lengths--) = i; } } @@ -206,7 +207,7 @@ Tensor pad_sequence(TensorList sequences, bool batch_first, double padding_value out_dims.insert(out_dims.end(), trailing_dims.begin(), trailing_dims.end()); Tensor out = at::full(out_dims, padding_value, sequences[0].options()); - for (int64_t i = 0; i < sequences_size; i++) { + for (const auto i : c10::irange(sequences_size)) { const Tensor currseq = sequences[i]; const int64_t length_i = currseq.size(0); // use index notation to prevent duplicate references to the tensor diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index da774911b5737e..3db102ad855053 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -2,6 +2,7 @@ #include #include #include +#include #pragma once @@ -212,7 +213,7 @@ pool3d_shape_check( TORCH_CHECK(ndim == 4 || ndim == 5, fn_name, ": Expected 4D or 5D tensor for input, but got: ", input.sizes()); - for (int64_t i = 1; i < ndim; ++i) { + for (const auto i : c10::irange(1, ndim)) { TORCH_CHECK(input.size(i) > 0, fn_name, "Expected input to have 
non-zero size for non-batch dimensions, but got", input.sizes(), " with dimension ", i, " being empty."); diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index 0c4256f5272041..e3030f71d16517 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -206,9 +206,9 @@ void CalcColOffsetsTranspose( const int8_t* Bint8, int32_t B_zero_point, int32_t* col_offsets) { - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { int32_t sum = 0; - for (int j = 0; j < K; ++j) { + for (const auto j : c10::irange(K)) { sum += Bint8[i * K + j]; } col_offsets[i] = sum - B_zero_point * K; @@ -353,7 +353,7 @@ bool CheckAndSaturate(T max_val, T* element) { void HandleWeightsSaturation(int64_t N, float* weight) { const float kFp16Max = RawUint16ToFp16(0x7BFF); bool found_out_of_range = false; - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { if (CheckAndSaturate(kFp16Max, weight + i)) { found_out_of_range = true; } diff --git a/aten/src/ATen/native/RNN.cpp b/aten/src/ATen/native/RNN.cpp index c30bb6f3f46261..cf49cd239a2b20 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -539,7 +540,7 @@ template static std::vector unpair_vec(std::vector>&& vals) { std::vector result; result.reserve(vals.size() * 2); - for (size_t i = 0; i < vals.size(); i++) { + for (const auto i : c10::irange(vals.size())) { result.push_back(std::move(vals[i].first)); result.push_back(std::move(vals[i].second)); } @@ -986,7 +987,7 @@ struct PackedLayer : Layer { // are completed now). The sliced parts are also saved, because we will need // to return a tensor of final hidden state. 
auto hidden = input_hidden; - for (int64_t i = 0; i < num_steps; ++i) { + for (const auto i : c10::irange(num_steps)) { const int64_t batch_size = batch_sizes[i]; auto step_input = input_ptr->narrow(0, input_offset, batch_size); input_offset += batch_size; @@ -1121,7 +1122,7 @@ apply_layer_stack(const Layer& layer, const i auto hidden_it = hiddens.begin(); auto weight_it = weights.begin(); std::vector final_hiddens; - for (int64_t l = 0; l < num_layers; ++l) { + for (const auto l : c10::irange(num_layers)) { auto layer_output = layer(layer_input, *(hidden_it++), *(weight_it++)); final_hiddens.push_back(layer_output.final_hidden); layer_input = layer_output.outputs; @@ -1177,7 +1178,7 @@ std::tuple _lstm_impl( int64_t total_layers = layer_hx.size(); std::vector::hidden_type> hiddens; hiddens.reserve(total_layers); - for (int64_t i = 0; i < total_layers; ++i) { + for (const auto i : c10::irange(total_layers)) { hiddens.emplace_back(std::move(layer_hx[i]), std::move(layer_cx[i])); } diff --git a/aten/src/ATen/native/RangeFactories.cpp b/aten/src/ATen/native/RangeFactories.cpp index b6afc38dd1b1c9..244eed2e847571 100644 --- a/aten/src/ATen/native/RangeFactories.cpp +++ b/aten/src/ATen/native/RangeFactories.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -104,7 +105,7 @@ Tensor& logspace_out(const Scalar& start, const Scalar& end, c10::optional(scalar_end - scalar_start) / (steps - 1); const int64_t halfway = steps / 2; at::parallel_for(0, steps, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) { - for (int64_t i=p_begin; i < p_end; i++) { + for (const auto i : c10::irange(p_begin, p_end)) { if (i < halfway) { data_ptr[i] = std::pow(scalar_base, scalar_start + step*i); } else { diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index c912910356f83d..9e31294d88fdf1 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -627,7 +627,7 @@ void 
cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data Operation op; T1 out = self_data[0]; int idx = 0; - for(int i = 0; i < self_dim_size; i++) { + for (const auto i : c10::irange(self_dim_size)) { T1 curr_elem = self_data[i*self_stride]; if(isnan_(curr_elem) || (!isnan_(out) && op(curr_elem, out))) { out = self_data[i*self_stride]; @@ -738,7 +738,7 @@ static inline void diff_check_compatible_shape(const Tensor& self, const c10::op other.value().dim() == self.dim(), "diff expects prepend or append to be the same dimension as input"); - for (int i = 0; i < other.value().dim(); i++) { + for (const auto i : c10::irange(other.value().dim())) { TORCH_CHECK( other.value().size(i) == self.size(i) || i == wrapped_dim, "diff expects the shape of tensor to prepend or append to match that of" @@ -1065,7 +1065,7 @@ Tensor trace_cpu(const Tensor& self) { t_stride_1 = self.stride(1); t_diag_size = std::min(self.size(0), self.size(1)); - for (int64_t i = 0; i < t_diag_size; i++) { + for (const auto i : c10::irange(t_diag_size)) { sum += t_data[i * (t_stride_0 + t_stride_1)]; } @@ -1478,9 +1478,9 @@ static double std_var_all_cpu(const Tensor& self, int64_t correction, bool take_ const int64_t outer_stride = strides[1]; double local_sum = 0.0; - for (int64_t i = 0; i < size1; ++i) { + for (const auto i : c10::irange(size1)) { const char* row_ptr = data[0] + outer_stride * i; - for (int64_t j = 0; j < size0; ++j) { + for (const auto j : c10::irange(size0)) { const auto ptr = reinterpret_cast(row_ptr + inner_stride * j); auto dx = (static_cast(*ptr) - local_mean); local_sum += dx * dx; @@ -1908,7 +1908,8 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { } char* self_data = data[0]; char* other_data = data[1]; - for (int64_t i = 0; i < dim_size; ++i) { + for (const auto i : c10::irange(dim_size)) { + (void)i; //Suppress unused variable warning if (*((scalar_t*)self_data) != *((scalar_t*)other_data)) { result = false; return; diff --git 
a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 28aa87761bb10f..0808d1e7e7d1b8 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -168,7 +168,7 @@ static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, } auto shape = DimVector(result.sizes()); auto stride = DimVector(result.strides()); - for (int dim = 0; dim < ndim; dim++) { + for (const auto dim : c10::irange(ndim)) { if (mask[dim]) { shape.insert(shape.begin() + dim, 1); stride.insert(stride.begin() + dim, 0); diff --git a/aten/src/ATen/native/ReflectionPad.cpp b/aten/src/ATen/native/ReflectionPad.cpp index 07058c81dcf809..81eba80af1dd46 100644 --- a/aten/src/ATen/native/ReflectionPad.cpp +++ b/aten/src/ATen/native/ReflectionPad.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { @@ -226,8 +227,8 @@ static void reflection_pad1d_out_frame( at::parallel_for(0, nplane, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x; - for (auto k = start; k < end; k++) { - for (int64_t j = 0; j < output_w; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto j : c10::irange(output_w)) { if (j < pad_l) { ip_x = pad_l * 2 - j; } else if (j >= pad_l && j < input_w + pad_l) { @@ -252,7 +253,7 @@ inline void reflection_pad1d_out_loop( int64_t input_w, int64_t output_w, int64_t pad_l) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { reflection_pad1d_out_frame( input_p + p * nplane * input_w, output_p + p * nplane * output_w, @@ -352,8 +353,8 @@ static void reflection_pad1d_backward_out_frame( at::parallel_for(0, nplane, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x; - for (auto k = start; k < end; k++) { - for (int64_t j = 0; j < output_w; j++) { + for (const auto k : 
c10::irange(start, end)) { + for (const auto j : c10::irange(output_w)) { if (j < pad_l) { ip_x = pad_l * 2 - j; } else if (j >= pad_l && j < input_w + pad_l) { @@ -378,7 +379,7 @@ inline void reflection_pad1d_backward_out_loop( int64_t input_w, int64_t output_w, int64_t pad_l) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { reflection_pad1d_backward_out_frame( grad_input + p * nplane * input_w, grad_output + p * nplane * output_w, @@ -404,9 +405,9 @@ static void reflection_pad2d_out_frame( at::parallel_for(0, nplane, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y; - for (auto k = start; k < end; k++) { - for (int64_t i = 0; i < output_h; i++) { - for (int64_t j = 0; j < output_w; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto i : c10::irange(output_h)) { + for (const auto j : c10::irange(output_w)) { if (j < pad_l) { ip_x = pad_l * 2 - j; } else if (j >= pad_l && j < input_w + pad_l) { @@ -442,7 +443,7 @@ inline void reflection_pad2d_out_loop( int64_t output_w, int64_t output_h, int64_t pad_l, int64_t pad_t) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { reflection_pad2d_out_frame( input_p + p * nplane * input_w * input_h, output_p + p * nplane * output_w * output_h, @@ -560,9 +561,9 @@ static void reflection_pad2d_backward_out_frame( at::parallel_for(0, nplane, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y; - for (auto k = start; k < end; k++) { - for (int64_t i = 0; i < output_h; i++) { - for (int64_t j = 0; j < output_w; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto i : c10::irange(output_h)) { + for (const auto j : c10::irange(output_w)) { if (j < pad_l) { ip_x = pad_l * 2 - 
j; } else if (j >= pad_l && j < input_w + pad_l) { @@ -600,7 +601,7 @@ inline void reflection_pad2d_backward_out_loop( int64_t output_w, int64_t output_h, int64_t pad_l, int64_t pad_t) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { reflection_pad2d_backward_out_frame( grad_input + p * nplane * input_h * input_w, grad_output + p * nplane * output_h * output_w, @@ -690,10 +691,10 @@ inline void parallel_reflection_pad3d( at::parallel_for(0, nplane, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y, ip_z; - for (int64_t k = start; k < end; k++) { - for (int64_t op_z = 0; op_z < output_d; op_z++) { - for (int64_t op_y = 0; op_y < output_h; op_y++) { - for (int64_t op_x = 0; op_x < output_w; op_x++) { + for (const auto k : c10::irange(start, end)) { + for (const auto op_z : c10::irange(output_d)) { + for (const auto op_y : c10::irange(output_h)) { + for (const auto op_x : c10::irange(output_w)) { if (op_x < pad_left) { ip_x = pad_left * 2 - op_x; } else if (op_x >= pad_left && op_x < input_w + pad_left) { @@ -772,7 +773,7 @@ static void reflection_pad3d_out_loop( int64_t pad_left, int64_t pad_top, int64_t pad_front) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (int64_t p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { reflection_pad3d_out_frame( input_p + p * nplane * input_w * input_h * input_d, output_p + p * nplane * output_w * output_h * output_d, @@ -833,7 +834,7 @@ static void reflection_pad3d_backward_out_loop( int64_t pad_left, int64_t pad_top, int64_t pad_front ) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (int64_t p = start; p < end; p++) { + for (const auto p : c10::irange(start, end)) { reflection_pad3d_backward_out_frame( grad_input + p * nplane * input_w * input_h * input_d, grad_output + p * nplane * output_w * 
output_h * output_d, diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index 2c67c8099f7d60..97723eefa0fe18 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -1,6 +1,7 @@ #include #include #include +#include template static void compute_cpu( @@ -13,12 +14,12 @@ static void compute_cpu( (result_size == cumsum_ptr[size - 1]), "allocated size does not match required size"); at::parallel_for(0, size, 1, [&](int64_t i_begin, int64_t i_end) { - for (int64_t i = i_begin; i < i_end; i++) { + for (const auto i : c10::irange(i_begin, i_end)) { int64_t end = cumsum_ptr[i]; index_t size = repeat_ptr[i]; TORCH_CHECK((size >= 0), "repeats can not be negative"); int64_t start = end - size; - for (int64_t j = start; j < end; j++) { + for (const auto j : c10::irange(start, end)) { result_ptr[j] = i; } } diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 9ee656e96bba2b..d89150cee2676e 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include namespace at { @@ -237,8 +238,7 @@ static void replication_pad1d_out_frame( at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) long ip_x; - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { for (long j = 0; j < owidth; j++) { if (j < pad_l) { ip_x = pad_l; @@ -267,8 +267,7 @@ static void replication_pad1d_out_batch( int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { scalar_t *input_p = input_data+p*nslices*iwidth; scalar_t *output_p = output_data+p*nslices*owidth; replication_pad1d_out_frame(input_p, output_p, nslices, iwidth, owidth, pad_l, pad_r); @@ -290,8 +289,7 @@ static void 
replication_pad1d_backward_out_frame( at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) long ip_x; - for (auto k = start; k < end; k++) - { + for (const auto k : c10::irange(start, end)) { for (long j = 0; j < owidth; j++) { if (j < pad_l) { ip_x = pad_l; @@ -320,8 +318,7 @@ static void replication_pad1d_backward_out_batch( int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { scalar_t *ginput_p = ginput_data + p * nslices * iwidth; scalar_t *goutput_p = goutput_data + p * nslices * owidth; replication_pad1d_backward_out_frame(ginput_p, goutput_p, @@ -347,10 +344,9 @@ static void replication_pad2d_out_frame( at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y; - for (auto k = start; k < end; k++) - { - for (int64_t i = 0; i < oheight; i++) { - for (int64_t j = 0; j < owidth; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto i : c10::irange(oheight)) { + for (const auto j : c10::irange(owidth)) { if (j < pad_l) { ip_x = pad_l; } else if (j >= pad_l && j < iwidth + pad_l) { @@ -389,8 +385,7 @@ static void replication_pad2d_out_batch( int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { scalar_t *input_p = input_data+p*nslices*iwidth*iheight; scalar_t *output_p = output_data+p*nslices*owidth*oheight; replication_pad2d_out_frame(input_p, output_p, nslices, @@ -416,10 +411,9 @@ static void replication_pad2d_backward_out_frame( at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y; - for (auto k = start; k < end; k++) - { - for (int64_t i = 0; i < oheight; i++) { - for (int64_t j = 0; j < 
owidth; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto i : c10::irange(oheight)) { + for (const auto j : c10::irange(owidth)) { if (j < pad_l) { ip_x = pad_l; } else if (j >= pad_l && j < iwidth + pad_l) { @@ -458,8 +452,7 @@ static void replication_pad2d_backward_out_batch( int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { scalar_t *ginput_p = ginput_data + p * nslices * iheight * iwidth; scalar_t *goutput_p = goutput_data + p * nslices * oheight * owidth; replication_pad2d_backward_out_frame(ginput_p, goutput_p, nslices, @@ -572,10 +565,10 @@ static void replication_pad3d_out_frame( at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y, ip_z; - for (auto k = start; k < end; k++) { - for (int64_t z = 0; z < odepth; z++) { - for (int64_t i = 0; i < oheight; i++) { - for (int64_t j = 0; j < owidth; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto z : c10::irange(odepth)) { + for (const auto i : c10::irange(oheight)) { + for (const auto j : c10::irange(owidth)) { if (j < pleft) { ip_x = pleft; } else if (j >= pleft && j < iwidth + pleft) { @@ -627,8 +620,7 @@ static void replication_pad3d_out_batch( int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { scalar_t *input_p = input_data + p * nslices * iwidth * iheight * idepth; scalar_t *output_p = output_data + p * nslices * owidth * oheight * odepth; replication_pad3d_out_frame(input_p, output_p, nslices, @@ -658,10 +650,10 @@ static void replication_pad3d_backward_out_frame( at::parallel_for(0, nslices, 0, [&](int64_t start, int64_t end) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ip_x, ip_y, ip_z; - for (auto k = start; k < end; k++) { 
- for (int64_t z = 0; z < odepth; z++) { - for (int64_t i = 0; i < oheight; i++) { - for (int64_t j = 0; j < owidth; j++) { + for (const auto k : c10::irange(start, end)) { + for (const auto z : c10::irange(odepth)) { + for (const auto i : c10::irange(oheight)) { + for (const auto j : c10::irange(owidth)) { if (j < pleft) { ip_x = pleft; } else if (j >= pleft && j < iwidth + pleft) { @@ -713,8 +705,7 @@ static void replication_pad3d_backward_out_batch( int nbatch) { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; p++) - { + for (const auto p : c10::irange(start, end)) { scalar_t *ginput_p = ginput_data + p * nslices * idepth * iheight * iwidth; scalar_t *goutput_p = goutput_data + p * nslices * odepth * oheight * owidth; replication_pad3d_backward_out_frame(ginput_p, goutput_p, nslices, diff --git a/aten/src/ATen/native/ResizeCommon.h b/aten/src/ATen/native/ResizeCommon.h index 8c934e7872e229..df3c0e441245a2 100644 --- a/aten/src/ATen/native/ResizeCommon.h +++ b/aten/src/ATen/native/ResizeCommon.h @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace native { @@ -10,7 +11,7 @@ inline int64_t storage_size_for(IntArrayRef size, IntArrayRef stride) { "storage_size_for(size, stride) requires that size and stride ", "have the same size as a precondition."); int64_t storage_size = 1; - for (size_t dim = 0; dim < size.size(); ++dim) { + for (const auto dim : c10::irange(size.size())) { if (size[dim] == 0) { storage_size = 0; break; diff --git a/aten/src/ATen/native/RowwisePrune.cpp b/aten/src/ATen/native/RowwisePrune.cpp index 319b94c7e1048a..40ae2215cbccc6 100644 --- a/aten/src/ATen/native/RowwisePrune.cpp +++ b/aten/src/ATen/native/RowwisePrune.cpp @@ -1,6 +1,7 @@ // Copyright 2004-present Facebook. All Rights Reserved. 
#include +#include namespace at { @@ -15,7 +16,7 @@ std::tuple _rowwise_prune_helper( int num_non_masked_rows = 0; auto mask_contig = mask.contiguous(); auto mask_data = mask_contig.data_ptr(); - for (int i = 0; i < mask.numel(); ++i) { + for (const auto i : c10::irange(mask.numel())) { num_non_masked_rows += (((mask_data[i] == true)) ? 1 : 0); } int num_cols = weights.size(1); @@ -32,7 +33,7 @@ std::tuple _rowwise_prune_helper( compressed_indices_mapping.data_ptr(); auto weights_data = weights.data_ptr(); int last_row_kept = 0; - for (int i = 0; i < mask.numel(); i++) { + for (const auto i : c10::irange(mask.numel())) { if (mask_data[i]) { memcpy(pruned_2d_tensor_data + last_row_kept * num_cols, weights_data + i * num_cols, diff --git a/aten/src/ATen/native/ScatterGatherChecks.h b/aten/src/ATen/native/ScatterGatherChecks.h index 4518952cd3a806..1b71eb40975db7 100644 --- a/aten/src/ATen/native/ScatterGatherChecks.h +++ b/aten/src/ATen/native/ScatterGatherChecks.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -45,7 +46,7 @@ static C10_UNUSED void gather_shape_check(const Tensor& self, int64_t dim, "Index tensor must have the same number of dimensions as input tensor" ); - for (int64_t i = 0; i < self_dims; ++i) { + for (const auto i : c10::irange(self_dims)) { if (i != dim) { TORCH_CHECK( ensure_nonempty_size(index, i) <= ensure_nonempty_size(self, i), @@ -77,7 +78,7 @@ static C10_UNUSED void scatter_shape_check( int64_t self_dims = ensure_nonempty_dim(self.dim()); // Check: index.size(d) <= self.size(d) for all d != dim - for (int64_t d = 0; d < self_dims; ++d) { + for (const auto d : c10::irange(self_dims)) { int64_t index_d_size = ensure_nonempty_size(index, d); if (d == dim) continue; if (index_d_size > ensure_nonempty_size(self, d)) { @@ -89,7 +90,7 @@ static C10_UNUSED void scatter_shape_check( // Check: index.size(d) <= src.size(d) for all d if src is Tensor if (!is_wrong_shape && src_opt.has_value()) { auto src = 
src_opt.value(); - for (int64_t d = 0; d < self_dims; ++d) { + for (const auto d : c10::irange(self_dims)) { int64_t index_d_size = ensure_nonempty_size(index, d); if (index_d_size > ensure_nonempty_size(src, d)) { is_wrong_shape = true; diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 8e5c846bd38e39..143783779f085f 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -41,8 +42,8 @@ void _segment_reduce_cpu_kernel1( auto* output_data = output.data_ptr(); const auto* values_data = data.data_ptr(); int64_t lengths_cum_sum = 0; - for (int64_t i = 0; i < segment_count; ++i) { - for (int64_t l = 0; l < stride_count; ++l) { + for (const auto i : c10::irange(segment_count)) { + for (const auto l : c10::irange(stride_count)) { // ===== step1: initialize starting value scalar_t initial_value; if (initial.has_value()) { @@ -141,12 +142,12 @@ void _segment_reduce_cpu_backward_kernel1( const auto* values_data = data_contig.data_ptr(); int64_t lengths_cum_sum = 0; - for (int64_t i = 0; i < segment_count; ++i) { + for (const auto i : c10::irange(segment_count)) { if (lengths_data[i] == 0) { continue; } - for (int64_t l = 0; l < stride_count; ++l) { + for (const auto l : c10::irange(stride_count)) { int64_t output_index = (i * stride_count) + l; if (reduction == SegmentReductionType::MAX || diff --git a/aten/src/ATen/native/SobolEngineOps.cpp b/aten/src/ATen/native/SobolEngineOps.cpp index 6cb069adafe9d9..48366976a2e705 100644 --- a/aten/src/ATen/native/SobolEngineOps.cpp +++ b/aten/src/ATen/native/SobolEngineOps.cpp @@ -3,6 +3,7 @@ #include #include +#include #include @@ -40,7 +41,7 @@ std::tuple _sobol_engine_draw(const Tensor& quasi, int64_t n, co for (int64_t i = 0; i < n; i++, num_generated++) { l = rightmost_zero(num_generated); - for (int64_t j = 0; j < dimension; j++) { + for (const auto j : 
c10::irange(dimension)) { wquasi_data[j * wquasi_stride] ^= sobolstate_data[j * sobolstate_row_stride + l * sobolstate_col_stride]; result_data[i * result_row_stride + j * result_col_stride] = wquasi_data[j * wquasi_stride]; } @@ -73,7 +74,7 @@ Tensor& _sobol_engine_ff_(Tensor& quasi, int64_t n, const Tensor& sobolstate, for (int64_t i = 0; i < n; i++, num_generated++) { l = rightmost_zero(num_generated); - for (int64_t j = 0; j < dimension; j++) { + for (const auto j : c10::irange(dimension)) { quasi_data[j * quasi_stride] ^= sobolstate_data[j * sobolstate_row_stride + l * sobolstate_col_stride]; } } @@ -102,13 +103,13 @@ Tensor& _sobol_engine_scramble_(Tensor& sobolstate, const Tensor& ltm, int64_t d auto ltm_d_a = ltm_dots.accessor(); /// Main scrambling loop - for (int64_t d = 0; d < dimension; ++d) { - for (int64_t j = 0; j < MAXBIT; ++j) { + for (const auto d : c10::irange(dimension)) { + for (const auto j : c10::irange(MAXBIT)) { int64_t vdj = ss_a[d][j], l = 1, t2 = 0; for (int64_t p = MAXBIT - 1; p >= 0; --p) { int64_t lsmdp = ltm_d_a[d][p]; int64_t t1 = 0; - for (int64_t k = 0; k < MAXBIT; ++k) { + for (const auto k : c10::irange(MAXBIT)) { t1 += (bitsubseq(lsmdp, k, 1) * bitsubseq(vdj, k, 1)); } t1 = t1 % 2; @@ -131,17 +132,17 @@ Tensor& _sobol_engine_initialize_state_(Tensor& sobolstate, int64_t dimension) { auto ss_a = sobolstate.accessor(); /// First row of `sobolstate` is all 1s - for (int64_t m = 0; m < MAXBIT; ++m) { + for (const auto m : c10::irange(MAXBIT)) { ss_a[0][m] = 1; } /// Remaining rows of sobolstate (row 2 through dim, indexed by [1:dim]) - for (int64_t d = 1; d < dimension; ++d) { + for (const auto d : c10::irange(1, dimension)) { int64_t p = poly[d]; int64_t m = bit_length(p) - 1; // First m elements of row d comes from initsobolstate - for (int64_t i = 0; i < m; ++i) { + for (const auto i : c10::irange(m)) { ss_a[d][i] = initsobolstate[d][i]; } @@ -149,10 +150,10 @@ Tensor& _sobol_engine_initialize_state_(Tensor& sobolstate, int64_t 
dimension) { // P. Bratley and B. L. Fox. Algorithm 659: Implementing sobol's // quasirandom sequence generator. ACM Trans. // Math. Softw., 14(1):88-100, Mar. 1988. - for (int64_t j = m; j < MAXBIT; ++j) { + for (const auto j : c10::irange(m, MAXBIT)) { int64_t newv = ss_a[d][j - m]; int64_t pow2 = 1; - for (int64_t k = 0; k < m; ++k) { + for (const auto k : c10::irange(m)) { pow2 <<= 1; if ((p >> (m - 1 - k)) & 1) { newv = newv ^ (pow2 * ss_a[d][j - k - 1]); diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 4e56a833d473dc..5128d6bf698fd5 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -9,6 +9,7 @@ #include #include +#include namespace at { namespace meta { @@ -129,8 +130,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { int64_t outer_size = 1; int64_t dim_size = input.size(dim); int64_t inner_size = 1; - for (int64_t i = 0; i < dim; ++i) - outer_size *= input.size(i); + for (const auto i : c10::irange(dim))outer_size *= input.size(i); for (int64_t i = dim + 1; i < input.dim(); ++i) inner_size *= input.size(i); int64_t dim_stride = inner_size; @@ -141,7 +141,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { parallel_for( 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; scalar_t* input_data = @@ -149,11 +149,10 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; scalar_t max_input = input_data[0]; - for (int64_t d = 1; d < dim_size; d++) - max_input = std::max(max_input, input_data[d * dim_stride]); + for (const auto d : c10::irange(1, dim_size))max_input = std::max(max_input, input_data[d * dim_stride]); acc_type tmpsum = 0; - for (int64_t 
d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { scalar_t z = std::exp(input_data[d * dim_stride] - max_input); if (!LogSoftMax) { output_data[d * dim_stride] = z; @@ -166,8 +165,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) { else tmpsum = 1 / tmpsum; - for (int64_t d = 0; d < dim_size; d++) - if (LogSoftMax) + for (const auto d : c10::irange(dim_size))if (LogSoftMax) output_data[d * dim_stride] = input_data[d * dim_stride] - max_input - tmpsum; else @@ -186,8 +184,7 @@ void host_softmax_backward( int64_t outer_size = 1; int64_t dim_size = grad.size(dim); int64_t inner_size = 1; - for (int64_t i = 0; i < dim; ++i) - outer_size *= grad.size(i); + for (const auto i : c10::irange(dim))outer_size *= grad.size(i); for (int64_t i = dim + 1; i < grad.dim(); ++i) inner_size *= grad.size(i); int64_t dim_stride = inner_size; @@ -198,7 +195,7 @@ void host_softmax_backward( int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1); parallel_for( 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t outer_idx = i / inner_size; int64_t inner_idx = i % inner_size; scalar_t* gradInput_data = @@ -209,14 +206,13 @@ void host_softmax_backward( gradOutput_data_base + outer_idx * outer_stride + inner_idx; acc_type sum = 0; - for (int64_t d = 0; d < dim_size; d++) - if (LogSoftMax) + for (const auto d : c10::irange(dim_size))if (LogSoftMax) sum += gradOutput_data[d * dim_stride]; else sum += gradOutput_data[d * dim_stride] * output_data[d * dim_stride]; - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { if (LogSoftMax) { gradInput_data[d * dim_stride] = gradOutput_data[d * dim_stride] - std::exp(output_data[d * dim_stride]) * sum; diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index dd41cb5e12f829..6dc76242a5f087 100644 --- 
a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -315,7 +316,7 @@ std::tuple kthvalue_out_impl_cpu( AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, self.scalar_type(), "kthvalue_cpu", [&] { auto loop = [&](char** data, const int64_t* strides, int64_t n) { - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { TensorAccessor tmp_values( reinterpret_cast(data[0] + i * strides[0]), &sizes[dim], &tmp_values_stride); @@ -325,7 +326,7 @@ std::tuple kthvalue_out_impl_cpu( auto mode_value = reinterpret_cast(data[2] + i * strides[2]); auto mode_index = reinterpret_cast(data[3] + i * strides[3]); - for (int64_t j = 0; j < tmp_indices.size(0); j++) { + for (const auto j : c10::irange(tmp_indices.size(0))) { tmp_indices[j] = j; } @@ -411,7 +412,7 @@ std::tuple median_with_indices_impl( AT_DISPATCH_ALL_TYPES_AND(ScalarType::BFloat16, in.scalar_type(), "median_out", [&] { auto loop = [&](char** data, const int64_t* strides, int64_t n) { - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto valp = reinterpret_cast(data[0] + i * strides[0]); auto indp = reinterpret_cast(data[1] + i * strides[1]); auto ip = reinterpret_cast(data[2] + i * strides[2]); diff --git a/aten/src/ATen/native/SortingUtils.h b/aten/src/ATen/native/SortingUtils.h index f1dc3b5d570e08..c6f4f919a4bad3 100644 --- a/aten/src/ATen/native/SortingUtils.h +++ b/aten/src/ATen/native/SortingUtils.h @@ -2,6 +2,7 @@ #include #include +#include namespace at { namespace native { @@ -97,7 +98,7 @@ void topk_impl_loop( const bool largest, const bool sorted, char** data, const int64_t* strides, const int64_t n) { - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { TensorAccessor mode_values( reinterpret_cast(data[0] + i * strides[0]), &k, &mode_values_stride); @@ -113,7 +114,7 @@ void topk_impl_loop( using elem_t = std::pair; std::vector queue(n); - for (int64_t j = 
0; j < n; j++) { + for (const auto j : c10::irange(n)) { queue[j].first = tmp_values[j]; queue[j].second = j; } @@ -157,7 +158,7 @@ void topk_impl_loop( } } - for (int64_t j = 0; j < k; j++) { + for (const auto j : c10::irange(k)) { mode_values[j] = queue[j].first; mode_indices[j] = queue[j].second; } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 57c88474f748a5..2f5789a8f387ce 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -252,7 +252,7 @@ ShapeAndDims canonicalize_fft_shape_and_dim_args( // Translate shape of -1 to the default length ret.shape.resize(transform_ndim); - for (int64_t i = 0; i < transform_ndim; ++i) { + for (const auto i : c10::irange(transform_ndim)) { const auto n = (*shape)[i]; ret.shape[i] = n == -1 ? input_sizes[ret.dim[i]] : n; } diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp index 6d37c904a98d9d..cf86225460ea0a 100644 --- a/aten/src/ATen/native/SummaryOps.cpp +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -2,6 +2,7 @@ #include #include +#include #include @@ -45,13 +46,13 @@ Tensor _bincount_cpu_template( weights.options().pinned_memory_opt()); weights_t* output_p = output.data_ptr(); const weights_t* weights_p = weights.data_ptr(); - for (int64_t i = 0; i < self_size; i++) { + for (const auto i : c10::irange(self_size)) { output_p[self_p[i]] += weights_p[i]; } } else { output = native::zeros({nbins}, kLong); int64_t* output_p = output.data_ptr(); - for (int64_t i = 0; i < self_size; i++) { + for (const auto i : c10::irange(self_size)) { output_p[self_p[i]] += 1L; } } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index aecc98cd4b9270..1cf619ec96a3e4 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -255,7 +255,7 @@ AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList 
indices_list) int64_t element_size_bytes = src.element_size(); int64_t dims_before = 0, dims_after = 0, dims_indexed = 0; IntArrayRef replacement_shape; - for (size_t dim = 0; dim < indices_list.size(); dim++) { + for (const auto dim : c10::irange(indices_list.size())) { if (!indices_list[dim].defined()) { if (dims_indexed == 0) { dims_before++; @@ -319,7 +319,7 @@ const Tensor& value){ return std::make_tuple(false, Tensor()); } else { mask = index; - for (int64_t j = 0; j < index.dim(); j++) { + for (const auto j : c10::irange(index.dim())) { int64_t srcIdx = num_ind + j; TORCH_CHECK_INDEX(index.size(j) == self.size(srcIdx), "The shape of the mask ", index.sizes(), " at index ", j, " does not match the shape of the indexed tensor ", self.sizes(), " at index ", srcIdx); @@ -328,7 +328,8 @@ const Tensor& value){ } } } - for (int64_t i = num_ind; i< self.ndimension(); i++){ + for (const auto i : c10::irange(num_ind, self.ndimension())) { + (void)i; //Suppress unused variable warning mask = mask.unsqueeze(-1); } return std::make_tuple(true, mask); @@ -729,7 +730,7 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const T if (self.dim() > 1) { // Equivalent to: - // for (auto i = 0; i < numel; i++) { + // for (const auto i : c10::irange(numel)) { // auto selfSlice = self.select(dim, index_data[i]); // auto sourceSlice = source.select(dim, i); // selfSlice.add_(sourceSlice); @@ -747,7 +748,7 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const Tensor & index, const T AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cpu_", [&] () { auto index_data = index_contig.data_ptr(); - for (auto i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; @@ -775,7 +776,7 @@ Tensor& index_add_cpu_(Tensor & self, int64_t dim, const 
Tensor & index, const T AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_add_cpu_", [&index_contig, &numel, &self, &self_ptr, &self_stride, &source_ptr, &source_stride, alpha_value] { auto index_data = index_contig.data_ptr(); - for (auto i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self.numel()), "index out of range in self"); scalar_t *self_ip = self_ptr + self_i * self_stride; @@ -847,12 +848,12 @@ Tensor & index_select_out_cpu_dim1_( // Special-case single-float copy for efficiency if (self.scalar_type() == ScalarType::Float && block_size == 1) { - for (auto batch = 0; batch < outer_dims_product; ++batch) { + for (const auto batch : c10::irange(outer_dims_product)) { const float* src_floats = (const float*)(src_base + batch * src_batch_bytesize); float* dst_floats = (float*)(out + batch * gathered_batch_bytesize); - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (idx < 0) { idx = idx + src_indexing_axis_dim; @@ -863,8 +864,8 @@ Tensor & index_select_out_cpu_dim1_( } else { // outer_dims_product specifies how many times we repeat inner dimensions, // so we just iterate over it to cover all outer dimensions. 
- for (auto batch = 0; batch < outer_dims_product; ++batch) { - for (auto i = 0; i < N; ++i) { + for (const auto batch : c10::irange(outer_dims_product)) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (idx < 0) { idx = idx + src_indexing_axis_dim; @@ -946,7 +947,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & [&index_contig, &start, &end, &sub_iter, &self_dim_size, &selfSlice_data, &self_stride_bytes, &resultSlice_data, &result_stride_bytes] () { auto index_data = index_contig.data_ptr(); - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; @@ -975,7 +976,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & [&index_contig, &slice_size_bytes, &self_dim_size, &selfSlice_data, &self_stride_bytes, &resultSlice_data, &result_stride_bytes, &start, &end] () { auto index_data = index_contig.data_ptr(); - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; @@ -1003,7 +1004,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_quant_", [&index_contig, &numel, &self_numel, &self_data_ptr, &self_stride, &result_data_ptr, &result_stride] { auto index_data = index_contig.data_ptr(); - for (auto i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_numel), "index out of range in self"); scalar_t *self_ip = self_data_ptr + self_i * 
self_stride; @@ -1023,7 +1024,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &numel, &self_numel, &self_data_ptr, &self_stride, &result_data_ptr, &result_stride] { auto index_data = index_contig.data_ptr(); - for (auto i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_numel), "index out of range in self"); scalar_t *self_ip = self_data_ptr + self_i * self_stride; @@ -1523,7 +1524,7 @@ Tensor _gather_sparse_backward(const Tensor& self, int64_t dim, const Tensor& in int64_t n_above = grad.numel(); int64_t n_below = 1; if (dim < 0) dim += self.ndimension(); - for (int i=0; i(ptr); // If nonzero, write index if (val != scalar_t(0)) { - for (int64_t k = 0; k < ndim; ++k) { + for (const auto k : c10::irange(ndim)) { *out = local_idx[k]; out += out_stride1; } diff --git a/aten/src/ATen/native/TensorDimApply.h b/aten/src/ATen/native/TensorDimApply.h index 7b077ab3644cc5..ad9ca857eeab8c 100644 --- a/aten/src/ATen/native/TensorDimApply.h +++ b/aten/src/ATen/native/TensorDimApply.h @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { @@ -20,7 +21,7 @@ namespace at { func(self_data, values_data, indices_data, self_dim_size, self_stride, values_stride, indices_stride); if(ndims == 1) break; - for(int dim_i = 0; dim_i < ndims; dim_i++) { + for (const auto dim_i : c10::irange(ndims)) { if(dim_i == dim) { if(dim_i == (ndims - 1)) { tensor_dim_apply_has_finished = 1; diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 763c5b5b483013..a95f2ebd43792c 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -423,8 +424,7 @@ Tensor& eye_out_cpu(int64_t n, int64_t m, Tensor& 
result) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::Half, at::ScalarType::Bool, result.scalar_type(), "eye", [&]() -> void { scalar_t* result_data = result.data_ptr(); at::parallel_for(0, sz, internal::GRAIN_SIZE, [&](int64_t p_begin, int64_t p_end) { - for(int64_t i = p_begin; i < p_end; i++) - result_data[i*(result.strides()[0] + result.strides()[1])] = 1; + for (const auto i : c10::irange(p_begin, p_end))result_data[i*(result.strides()[0] + result.strides()[1])] = 1; }); }); @@ -864,8 +864,7 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) { at::parallel_for(0, n, internal::GRAIN_SIZE, [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) { - for(int64_t i = p_begin; i < p_end; i++) - r__data[i*r__stride_0] = static_cast(i); + for (const auto i : c10::irange(p_begin, p_end))r__data[i*r__stride_0] = static_cast(i); }); for(int64_t i = 0; i < n - 1; i++) diff --git a/aten/src/ATen/native/TensorProperties.cpp b/aten/src/ATen/native/TensorProperties.cpp index 40a4d6219e7e5b..63d928749e0910 100644 --- a/aten/src/ATen/native/TensorProperties.cpp +++ b/aten/src/ATen/native/TensorProperties.cpp @@ -5,6 +5,7 @@ #include #include +#include namespace at { namespace native { @@ -72,7 +73,7 @@ bool is_set_to(const Tensor& self, const Tensor& src) { if (self.storage().unsafeGetStorageImpl() == src.storage().unsafeGetStorageImpl() && self.storage_offset() == src.storage_offset() && self.dim() == src.dim()) { - for (int64_t d = 0; d < self.dim(); ++d) { + for (const auto d : c10::irange(self.dim())) { if (self.size(d) != src.size(d) || self.stride(d) != src.stride(d)) { return false; } diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 5bebd678c46de2..aec3f3aa97e838 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -1421,7 +1421,7 @@ static inline std::vector get_stack_inputs(TensorList tensors, int64_t d std::vector inputs(tensors.size()); 
at::IntArrayRef entry_shape = tensors[0].sizes(); inputs[0] = tensors[0].unsqueeze(dim); - for (size_t i = 1; i < tensors.size(); ++i) { + for (const auto i : c10::irange(1, tensors.size())) { TORCH_CHECK(tensors[i].sizes() == entry_shape, "stack expects each tensor to be equal size, but got ", entry_shape, " at entry 0 and ", tensors[i].sizes(), " at entry ", i); @@ -1449,7 +1449,7 @@ bool inline can_use_native_serial_stack(Tensor& result, TensorList tensors, int6 if (result.dtype() != firstTensor.dtype()) return false; // Inputs cannot alias the output tensor - for (size_t i = 0; i < tensors.size(); i++) { + for (const auto i : c10::irange(tensors.size())) { auto lap = at::get_overlap_status(result, tensors[i]); TORCH_CHECK(lap != at::MemOverlapStatus::PARTIAL && lap != at::MemOverlapStatus::FULL, 0, @@ -1471,7 +1471,7 @@ bool inline can_use_native_serial_stack(Tensor& result, TensorList tensors, int6 // check remainder of inputs auto const &first_tensor_shape = firstTensor.sizes(); - for (size_t i = 1; i < tensors.size(); i++) { + for (const auto i : c10::irange(1, tensors.size())) { auto const &tensor = tensors[i]; TORCH_CHECK(tensors[i].sizes() == firstTensor.sizes(), "stack expects each tensor to be equal size, but got ", first_tensor_shape, diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index bdff5a1e9db053..3f5111a884b737 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -22,7 +23,7 @@ Tensor flip(const Tensor& self, IntArrayRef dims) { // Count dimensions in which we need to do work int n = 0; auto strides = DimVector(self.strides()); - for(int64_t i = 0; i < total_dims; i++) { + for (const auto i : c10::irange(total_dims)) { if(flip_dims_b[i] && self.size(i) > 1 && self.stride(i) != 0) { n++; strides[i] = 0; @@ -61,7 +62,7 @@ Tensor flip(const Tensor& self, 
IntArrayRef dims) { // - We move the pointer to the opposite vertex of the cube // - We iterate in the opposite direction (invert the strides) - for (int i=0; i #include +#include namespace at { namespace native { @@ -23,7 +24,7 @@ static void apply_triu_tril_single( if (upper) { at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { for (int64_t j = 0; j < std::min(m, i + k); j++) { result[i * res_row_stride + j * res_col_stride] = 0; } @@ -36,7 +37,7 @@ static void apply_triu_tril_single( }); } else { at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { for (int64_t j = std::max(zero, i + k + 1); j < m; j++) { result[i * res_row_stride + j * res_col_stride] = 0; } @@ -74,7 +75,7 @@ void apply_triu_tril(Tensor& result, const Tensor& self, bool inplace, int64_t k } at::parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { scalar_t* self_batch = &self_data[b * self_stride]; scalar_t* result_batch = &result_data[b * result_stride]; apply_triu_tril_single( diff --git a/aten/src/ATen/native/Unfold3d.cpp b/aten/src/ATen/native/Unfold3d.cpp index 812bacf5e1eea2..3495f92dc3ce64 100644 --- a/aten/src/ATen/native/Unfold3d.cpp +++ b/aten/src/ATen/native/Unfold3d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #if AT_MKL_ENABLED() #include @@ -17,7 +18,7 @@ bool IsAGeZeroAndALtB(int64_t a, int64_t b) { template void MatCopy(int64_t M, int64_t N, int64_t lda, int64_t ldb, const T* A, T* B) { - for (int64_t i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { std::memcpy(B + i * ldb, A + i * lda, N * sizeof(T)); } } @@ -32,10 +33,10 @@ void MatCopy( int64_t strideb, const T* A, T* B) { - for (int64_t i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { const T* A_ptr = A + 
i * lda; T* B_ptr = B + i * ldb; - for (int64_t j = 0; j < N; ++j) { + for (const auto j : c10::irange(N)) { B_ptr[j * strideb] = A_ptr[j * stridea]; } } @@ -44,8 +45,8 @@ void MatCopy( // Y += X template void MatAdd(int64_t M, int64_t N, int64_t ldx, int64_t ldy, const T* X, T* Y) { - for (int64_t i = 0; i < M; ++i) { - for (int64_t j = 0; j < N; ++j) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { Y[i * ldy + j] += X[i * ldx + j]; } } @@ -62,8 +63,8 @@ void MatAdd( int64_t stridey, const T* X, T* Y) { - for (int64_t i = 0; i < M; ++i) { - for (int64_t j = 0; j < N; ++j) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { Y[i * ldy + j * stridey] += X[i * ldx + j * stridex]; } } @@ -151,7 +152,7 @@ void MatAdd( int64_t stridey, const float* X, float* Y) { - for (int64_t i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { cblas_saxpy(N, 1.0f, X + i * ldx, stridex, Y + i * ldy, stridey); } } @@ -166,7 +167,7 @@ void MatAdd( int64_t stridey, const double* X, double* Y) { - for (int64_t i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { cblas_daxpy(N, 1.0, X + i * ldx, stridex, Y + i * ldy, stridey); } } @@ -194,7 +195,7 @@ void Unfold3dZeroPaddingCopyKernelImpl( const int64_t X_size = X_D * X_H * X_W; const int64_t Y_size = Y_D * Y_H * Y_W; at::parallel_for(0, n, 0, [=](int64_t begin, int64_t end) { - for (int64_t p = begin; p < end; ++p) { + for (const auto p : c10::irange(begin, end)) { int64_t c = p; const int64_t kw = c % kernel_w; c /= kernel_w; @@ -202,7 +203,7 @@ void Unfold3dZeroPaddingCopyKernelImpl( c /= kernel_h; const int64_t kd = c % kernel_d; c /= kernel_d; - for (int64_t yd = 0; yd < Y_D; ++yd) { + for (const auto yd : c10::irange(Y_D)) { const int64_t xd = yd * stride_d + kd; const T* src_ptr = src + c * X_size + xd * X_H * X_W + kh * X_W + kw; T* dst_ptr = dst + p * Y_size + yd * Y_H * Y_W; @@ -261,7 +262,7 @@ void Unfold3dCopyKernelImpl( const int64_t X_size = X_D * 
X_H * X_W; const int64_t Y_size = Y_D * Y_H * Y_W; at::parallel_for(0, n, 0, [=](int64_t begin, int64_t end) { - for (int64_t p = begin; p < end; ++p) { + for (const auto p : c10::irange(begin, end)) { int64_t c = p; const int64_t kw = c % kernel_w; c /= kernel_w; @@ -271,20 +272,20 @@ void Unfold3dCopyKernelImpl( c /= kernel_d; const T* src_ptr = src + c * X_size; T* dst_ptr = dst + p * Y_size; - for (int64_t yd = 0; yd < Y_D; ++yd) { + for (const auto yd : c10::irange(Y_D)) { const int64_t xd = yd * stride_d - pad_d + kd; if (!IsAGeZeroAndALtB(xd, X_D)) { std::memset(dst_ptr + yd * Y_H * Y_W, 0, Y_H * Y_W * sizeof(T)); continue; } - for (int64_t yh = 0; yh < Y_H; ++yh) { + for (const auto yh : c10::irange(Y_H)) { const int64_t xh = yh * stride_h - pad_h + kh; if (!IsAGeZeroAndALtB(xh, X_H)) { std::memset( dst_ptr + yd * Y_H * Y_W + yh * Y_W, 0, Y_W * sizeof(T)); continue; } - for (int64_t yw = 0; yw < Y_W; ++yw) { + for (const auto yw : c10::irange(Y_W)) { const int64_t xw = yw * stride_w - pad_w + kw; dst_ptr[yd * Y_H * Y_W + yh * Y_W + yw] = IsAGeZeroAndALtB(xw, X_W) ? 
src_ptr[xd * X_H * X_W + xh * X_W + xw] @@ -318,13 +319,13 @@ void Unfold3dZeroPaddingAccKernelImpl( const int64_t kernel_size = kernel_d * kernel_h * kernel_w; at::parallel_for(0, C, 0, [=](int64_t begin, int64_t end) { std::memset(dst + begin * X_size, 0, (end - begin) * X_size * sizeof(T)); - for (int64_t c = begin; c < end; ++c) { - for (int64_t kd = 0; kd < kernel_d; ++kd) { - for (int64_t kh = 0; kh < kernel_h; ++kh) { - for (int64_t kw = 0; kw < kernel_w; ++kw) { + for (const auto c : c10::irange(begin, end)) { + for (const auto kd : c10::irange(kernel_d)) { + for (const auto kh : c10::irange(kernel_h)) { + for (const auto kw : c10::irange(kernel_w)) { const int64_t p = c * kernel_size + kd * kernel_h * kernel_w + kh * kernel_w + kw; - for (int64_t yd = 0; yd < Y_D; ++yd) { + for (const auto yd : c10::irange(Y_D)) { const int64_t xd = yd * stride_d + kd; const T* src_ptr = src + p * Y_size + yd * Y_H * Y_W; T* dst_ptr = dst + c * X_size + xd * X_H * X_W + kh * X_W + kw; @@ -393,25 +394,25 @@ void Unfold3dAccKernelImpl( const int64_t kernel_size = kernel_d * kernel_h * kernel_w; at::parallel_for(0, C, 0, [=](int64_t begin, int64_t end) { std::memset(dst + begin * X_size, 0, (end - begin) * X_size * sizeof(T)); - for (int64_t c = begin; c < end; ++c) { + for (const auto c : c10::irange(begin, end)) { T* dst_ptr = dst + c * X_size; - for (int64_t kd = 0; kd < kernel_d; ++kd) { - for (int64_t kh = 0; kh < kernel_h; ++kh) { - for (int64_t kw = 0; kw < kernel_w; ++kw) { + for (const auto kd : c10::irange(kernel_d)) { + for (const auto kh : c10::irange(kernel_h)) { + for (const auto kw : c10::irange(kernel_w)) { const int64_t p = c * kernel_size + kd * kernel_h * kernel_w + kh * kernel_w + kw; const T* src_ptr = src + p * Y_size; - for (int64_t yd = 0; yd < Y_D; ++yd) { + for (const auto yd : c10::irange(Y_D)) { const int64_t xd = yd * stride_d - pad_d + kd; if (!IsAGeZeroAndALtB(xd, X_D)) { continue; } - for (int64_t yh = 0; yh < Y_H; ++yh) { + for (const auto yh 
: c10::irange(Y_H)) { const int64_t xh = yh * stride_h - pad_h + kh; if (!IsAGeZeroAndALtB(xh, X_H)) { continue; } - for (int64_t yw = 0; yw < Y_W; ++yw) { + for (const auto yw : c10::irange(Y_W)) { const int64_t xw = yw * stride_w - pad_w + kw; if (IsAGeZeroAndALtB(xw, X_W)) { dst_ptr[xd * X_H * X_W + xh * X_W + xw] += diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index 4d35bbf57e6a62..c146bfe246a017 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -51,25 +52,25 @@ std::tuple unique_cpu_template( int64_t* inverse_indices_data = inverse_indices.data_ptr(); std::unordered_map inverse_map; inverse_map.reserve(output.numel()); - for (int64_t i = 0; i < output.numel(); ++i) { + for (const auto i : c10::irange(output.numel())) { inverse_map[output_data[i]] = i; } - for(int64_t i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { inverse_indices_data[i] = inverse_map[input_data[i]]; } if (return_counts) { std::unordered_map counts_map; counts_map.reserve(output.numel()); - for (int64_t i = 0; i < output.numel(); ++i) { + for (const auto i : c10::irange(output.numel())) { counts_map[output_data[i]] = 0; } - for(int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { counts_map[input_data[i]] += 1; } counts.resize_(output.sizes()); counts.fill_(0); int64_t *counts_data = counts.data_ptr(); - for(int64_t i = 0; i < output.numel(); i++) { + for (const auto i : c10::irange(output.numel())) { counts_data[i] = counts_map[output_data[i]]; } } @@ -106,7 +107,7 @@ std::tuple unique_consecutive_cpu_template( scalar_t *p = output_data; int64_t *q = counts_data; int64_t last = 0; - for (int64_t i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { if (input_data[i] != *p) { *(++p) = input_data[i]; if (return_counts) { @@ -202,7 +203,7 @@ std::tuple _unique_dim_cpu_template( if (!consecutive) { 
std::sort(indices.begin(), indices.end(), [&](int64_t a, int64_t b) -> bool { - for (int64_t i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { scalar_t lhs = input_flat_ptr[i + a * numel]; scalar_t rhs = input_flat_ptr[i + b * numel]; if (lhs < rhs) { @@ -218,7 +219,7 @@ std::tuple _unique_dim_cpu_template( Tensor input_sorted; if (!consecutive) { input_sorted = at::empty(input_flat.sizes(), input_flat.options()); - for (size_t i = 0; i < indices.size(); ++i) { + for (const auto i : c10::irange(indices.size())) { input_sorted[i] = input_flat[indices[i]]; } } else { diff --git a/aten/src/ATen/native/UpSample.cpp b/aten/src/ATen/native/UpSample.cpp index f703f8a4f56645..bcc8891de8dcd7 100644 --- a/aten/src/ATen/native/UpSample.cpp +++ b/aten/src/ATen/native/UpSample.cpp @@ -1,6 +1,7 @@ // Copyright 2004-present Facebook. All Rights Reserved. #include +#include namespace at { namespace native { @@ -20,7 +21,7 @@ TORCH_API c10::SmallVector compute_output_size( TORCH_CHECK(!output_size, "Must specify exactly one of output_size and scale_factors"); TORCH_CHECK(static_cast(scale_factors->size()) == spatial_dimensions); c10::SmallVector ret; - for (int i = 0; i < spatial_dimensions; ++i) { + for (const auto i : c10::irange(spatial_dimensions)) { ret.push_back(static_cast(input_size[i+2]) * scale_factors.value()[i]); } return ret; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index 72fd73c8b23eab..5db6212e9d9b65 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace meta { @@ -33,7 +34,7 @@ TORCH_META_FUNC(upsample_bicubic2d_backward) ( grad_output.dim() == 4, "Expected grad_output to be a tensor of dimension 4 but got: dimension ", grad_output.dim()); - for (int i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { TORCH_CHECK( grad_output.size(i) == 
full_output_size[i], "Expected grad_output to have the same shape as output;", @@ -65,11 +66,12 @@ static void upsample_bicubic2d_backward_out_frame( // Special case: input/output same size, just copy if (input_height == output_height && input_width == output_width) { - for (int64_t output_y = 0; output_y < output_height; output_y++) { - for (int64_t output_x = 0; output_x < output_width; output_x++) { + for (const auto output_y : c10::irange(output_height)) { + for (const auto output_x : c10::irange(output_width)) { scalar_t* in = &idata[output_y * input_width + output_x]; scalar_t* out = &odata[output_y * output_width + output_x]; - for (int64_t c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { + (void)c; //Suppress unused variable warning in[0] = out[0]; in += input_width * input_height; out += output_width * output_height; @@ -84,8 +86,8 @@ static void upsample_bicubic2d_backward_out_frame( const scalar_t width_scale = area_pixel_compute_scale( input_width, output_width, align_corners, scales_w); - for (int64_t output_y = 0; output_y < output_height; output_y++) { - for (int64_t output_x = 0; output_x < output_width; output_x++) { + for (const auto output_y : c10::irange(output_height)) { + for (const auto output_x : c10::irange(output_width)) { scalar_t* in = idata; scalar_t* out = odata; @@ -105,11 +107,12 @@ static void upsample_bicubic2d_backward_out_frame( get_cubic_upsample_coefficients(x_coeffs, t_x); get_cubic_upsample_coefficients(y_coeffs, t_y); - for (int64_t c = 0; c < channels; c++) { + for (const auto c : c10::irange(channels)) { + (void)c; //Suppress unused variable warning scalar_t out_value = out[output_y * output_width + output_x]; - for (int64_t i = 0; i < 4; i++) { - for (int64_t j = 0; j < 4; j++) { + for (const auto i : c10::irange(4)) { + for (const auto j : c10::irange(4)) { upsample_increment_value_bounded( in, input_width, diff --git a/aten/src/ATen/native/UpSampleBilinear2d.cpp 
b/aten/src/ATen/native/UpSampleBilinear2d.cpp index 043de6ac3682ae..a080a627993f21 100644 --- a/aten/src/ATen/native/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/UpSampleBilinear2d.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { namespace meta { @@ -36,7 +37,7 @@ TORCH_META_FUNC(upsample_bilinear2d_backward) ( grad_output.dim() == 4, "Expected grad_output to be a tensor of dimension 4 but got: dimension ", grad_output.dim()); - for (int i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { TORCH_CHECK( grad_output.size(i) == full_output_size[i], "Expected grad_output to have the same shape as output;", diff --git a/aten/src/ATen/native/UpSampleNearest2d.cpp b/aten/src/ATen/native/UpSampleNearest2d.cpp index 31bad2f57e8420..cda2528e1ed63c 100644 --- a/aten/src/ATen/native/UpSampleNearest2d.cpp +++ b/aten/src/ATen/native/UpSampleNearest2d.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { namespace meta { @@ -33,7 +34,7 @@ TORCH_META_FUNC(upsample_nearest2d_backward) ( grad_output.dim() == 4, "Expected grad_output to be a tensor of dimension 4 but got: dimension ", grad_output.dim()); - for (int i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { TORCH_CHECK( grad_output.size(i) == full_output_size[i], "Expected grad_output to have the same shape as output;", diff --git a/aten/src/ATen/native/UpSampleNearest3d.cpp b/aten/src/ATen/native/UpSampleNearest3d.cpp index df7cffb80bb991..95c5e52aa4e737 100644 --- a/aten/src/ATen/native/UpSampleNearest3d.cpp +++ b/aten/src/ATen/native/UpSampleNearest3d.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace at { namespace meta { @@ -37,7 +38,7 @@ TORCH_META_FUNC(upsample_nearest3d_backward) ( grad_output.dim() == 5, "Expected grad_output to be a tensor of dimension 5 but got: dimension ", grad_output.dim()); - for (int i = 0; i < 5; ++i) { + for (const auto i : c10::irange(5)) { TORCH_CHECK( grad_output.size(i) == full_output_size[i], "Expected 
grad_output to have the same shape as output;", diff --git a/aten/src/ATen/native/UpSampleTrilinear3d.cpp b/aten/src/ATen/native/UpSampleTrilinear3d.cpp index cf39a0b0bc3508..75a77a76c623d2 100644 --- a/aten/src/ATen/native/UpSampleTrilinear3d.cpp +++ b/aten/src/ATen/native/UpSampleTrilinear3d.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { namespace meta { @@ -42,7 +43,7 @@ TORCH_META_FUNC(upsample_trilinear3d_backward) ( grad_output.dim() == 5, "Expected grad_output to be a tensor of dimension 5 but got: dimension ", grad_output.dim()); - for (int i = 0; i < 5; ++i) { + for (const auto i : c10::irange(5)) { TORCH_CHECK( grad_output.size(i) == full_output_size[i], "Expected grad_output to have the same shape as output;", diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp index ec1700665596a8..6628eb4c6192b9 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear.cpp @@ -5,6 +5,7 @@ #include #include +#include namespace ao { namespace sparse { @@ -64,7 +65,7 @@ at::Tensor PackedLinearWeight::apply_impl( // Process the per channel quantization. 
output_multiplier_float.resize(out_channels, 0.0); act_times_w_scale.resize(out_channels, 1.0f); - for (int i = 0; i < out_channels; ++i) { + for (const auto i : c10::irange(out_channels)) { act_times_w_scale[i] = (input_scale_float * w_scale[i]); output_multiplier_float[i] = act_times_w_scale[i] / static_cast(output_scale); @@ -126,7 +127,7 @@ at::Tensor PackedLinearWeight::apply_impl( int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { - for (int task_id = begin; task_id < end; ++task_id) { + for (const auto task_id : c10::irange(begin, end)) { fbgemm::trRequantizationParams_t reqParams = { input_zero_point_int32, w_zp.data(), diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index 282eabcfc75027..5d16201f1c2351 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -31,24 +32,24 @@ void cpu_adaptive_avg_pool( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t* input_ptr = input_data + c * input_height * input_width; scalar_t* output_ptr = output_data + c * output_height * output_width; - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto oh : c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); int64_t ih1 = end_index(oh, output_height, input_height); int64_t kh = ih1 - ih0; - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { int64_t iw0 = start_index(ow, output_width, input_width); int64_t iw1 = end_index(ow, output_width, input_width); int64_t kw = iw1 - iw0; // compute local average scalar_t sum = 0; - for (int64_t ih = ih0; ih < ih1; ih++) { - 
for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { sum += input_ptr[ih * input_width + iw]; } } @@ -90,7 +91,7 @@ void cpu_adaptive_avg_pool_channels_last( int64_t ow = 0; data_index_init(begin, n, nbatch, oh, output_height, ow, output_width); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t ih0 = start_index(oh, output_height, input_height); int64_t ih1 = end_index(oh, output_height, input_height); int64_t kh = ih1 - ih0; @@ -114,8 +115,8 @@ void cpu_adaptive_avg_pool_channels_last( out[d1] = scalar_t(0); } // Pass II: compute local sum - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; @@ -169,23 +170,23 @@ void cpu_adaptive_avg_pool_backward( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto oh : c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); int64_t ih1 = end_index(oh, output_height, input_height); int64_t kh = ih1 - ih0; - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { int64_t iw0 = start_index(ow, output_width, input_width); int64_t iw1 = end_index(ow, output_width, input_width); int64_t kw = iw1 - iw0; scalar_t grad_delta = grad_output_ptr[oh * output_width + ow] / kh / kw; - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t 
iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { grad_input_ptr[ih * input_width + iw] += grad_delta; } } @@ -220,24 +221,24 @@ void cpu_adaptive_avg_pool_backward_channels_last( using Vec = vec::Vectorized; // parallel on dim N at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { - for (int64_t n = begin; n < end; n++) { + for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto oh : c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); int64_t ih1 = end_index(oh, output_height, input_height); int64_t kh = ih1 - ih0; - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { int64_t iw0 = start_index(ow, output_width, input_width); int64_t iw1 = end_index(ow, output_width, input_width); int64_t kw = iw1 - iw0; scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t size = channels; - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { scalar_t* gin = grad_input_ptr + ih * input_width * channels + iw * channels; int64_t d = 0; diff --git a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp index b8687a6d7347ab..89024b0bbfdefa 100644 --- a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -34,16 +35,16 @@ void cpu_adaptive_max_pool( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, 
int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t* input_ptr = input_data + c * input_height * input_width; scalar_t* output_ptr = output_data + c * output_height * output_width; int64_t* indices_ptr = indices_data + c * output_height * output_width; - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto oh : c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); int64_t ih1 = end_index(oh, output_height, input_height); - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { int64_t iw0 = start_index(ow, output_width, input_width); int64_t iw1 = end_index(ow, output_width, input_width); @@ -121,7 +122,7 @@ void cpu_adaptive_max_pool_channels_last( // temp buffer holding index with integer_t std::unique_ptr index_buffer(new integer_t[len]); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t ih0 = start_index(oh, output_height, input_height); int64_t ih1 = end_index(oh, output_height, input_height); @@ -216,13 +217,13 @@ void cpu_adaptive_max_pool_backward( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; int64_t* indices_ptr = indices_data + c * output_height * output_width; - for (int64_t oh = 0; oh < output_height; oh++) { - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { // retrieve position of max int64_t index = oh * output_width + ow; int64_t maxindex = indices_ptr[index]; @@ -264,17 +265,17 @@ void cpu_adaptive_max_pool_backward_channels_last( // parallel on dim N 
at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { - for (int64_t n = begin; n < end; n++) { + for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; int64_t* indices_ptr = indices_data + n * output_height * output_width * channels; - for (int64_t oh = 0; oh < output_height; oh++) { - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels; // TODO: gcc vectorization - for (int64_t c = 0; c < channels; c++) { + for (const auto c : c10::irange(channels)) { int64_t maxindex = ind[c]; grad_input_ptr[maxindex * channels + c] += gout[c]; } diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index 2bee0206ff6b5c..87107e95ed8845 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -41,7 +42,7 @@ void cpu_avg_pool( int64_t ow = 0; data_index_init(begin, c, channels, oh, output_height, ow, output_width); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { output_data[i] = static_cast(0); // local pointers @@ -77,8 +78,8 @@ void cpu_avg_pool( } } - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { sum += input_ptr[ih * input_width + iw]; } } @@ -129,7 +130,7 @@ void cpu_avg_pool_channels_last( int64_t size = channels; int64_t len = size - (size % Vec::size()); - for (int64_t i = begin; i < 
end; i++) { + for (const auto i : c10::irange(begin, end)) { // compute the mean of the input image... int64_t ih0 = oh * dH - padH; int64_t iw0 = ow * dW - padW; @@ -171,8 +172,8 @@ void cpu_avg_pool_channels_last( } // Pass II: compute local sum - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; @@ -232,12 +233,12 @@ void cpu_avg_pool_backward( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; - for (int64_t oh = 0; oh < output_height; oh++) { - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { int64_t ih0 = oh * dH - padH; int64_t iw0 = ow * dW - padW; int64_t ih1 = std::min(ih0 + kH, input_height + padH); @@ -260,8 +261,8 @@ void cpu_avg_pool_backward( } scalar_t grad_delta = grad_output_ptr[oh * output_width + ow] / divide_factor; - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { grad_input_ptr[ih * input_width + iw] += grad_delta; } } @@ -301,12 +302,12 @@ void cpu_avg_pool_backward_channels_last( using Vec = vec::Vectorized; // parallel on dim N at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { - for (int64_t n = begin; n < end; n++) { + for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; scalar_t* 
grad_output_ptr = grad_output_data + n * output_height * output_width * channels; - for (int64_t oh = 0; oh < output_height; oh++) { - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { int64_t ih0 = oh * dH - padH; int64_t iw0 = ow * dW - padW; int64_t ih1 = std::min(ih0 + kH, input_height + padH); @@ -331,8 +332,8 @@ void cpu_avg_pool_backward_channels_last( scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t size = channels; int64_t len = size - (size % Vec::size()); - for (int64_t ih = ih0; ih < ih1; ih++) { - for (int64_t iw = iw0; iw < iw1; iw++) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { scalar_t* gin = grad_input_ptr + ih * input_width * channels + iw * channels; int64_t d = 0; diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index 1897ff8aead334..ece9f38c0b9b19 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace at { namespace native { @@ -13,16 +14,16 @@ void scale_(int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda) { } if (alpha == scalar_t(0)) { - for (int64_t j = 0; j < n; j++) { - for (int64_t i = 0; i < m; i++) { + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { a[j * lda + i] = scalar_t(0); } } return; } - for (int64_t j = 0; j < n; j++) { - for (int64_t i = 0; i < m; i++) { + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { a[j * lda + i] *= alpha; } } @@ -41,11 +42,11 @@ void gemm_notrans_( scale_(m, n, beta, c, ldc); // c += alpha * (a @ b) - for (int64_t l = 0; l < k; l++) { - for (int64_t j = 0; j < n; j++) { + for (const auto l : c10::irange(k)) { + for (const auto j : c10::irange(n)) { scalar_t val = b[l + j * ldb] * alpha; int64_t i_m = 
m / 4; - for (int64_t i_i = 0; i_i < i_m; i_i++) { + for (const auto i_i : c10::irange(i_m)) { c[j * ldc + i_i * 4 + 0] += a[i_i * 4 + 0 + l * lda] * val; c[j * ldc + i_i * 4 + 1] += a[i_i * 4 + 1 + l * lda] * val; c[j * ldc + i_i * 4 + 2] += a[i_i * 4 + 2 + l * lda] * val; @@ -68,14 +69,11 @@ void gemm_transa_( scalar_t *c, int64_t ldc) { // c = alpha * (a.T @ b) + beta * c const scalar_t *a_ = a; - for (int64_t i = 0; i < m; i++) - { + for (const auto i : c10::irange(m)) { const scalar_t *b_ = b; - for (int64_t j = 0; j < n; j++) - { + for (const auto j : c10::irange(n)) { scalar_t sum = 0; - for(int64_t l = 0; l < k; l++) - sum += a_[l]*b_[l]; + for (const auto l : c10::irange(k))sum += a_[l]*b_[l]; b_ += ldb; if (beta == scalar_t(0)) c[j*ldc+i] = alpha*sum; @@ -98,11 +96,11 @@ void gemm_transb_( scale_(m, n, beta, c, ldc); // c += alpha * (a @ b.T) - for (int64_t l = 0; l < k; l++) { - for (int64_t j = 0; j < n; j++) { + for (const auto l : c10::irange(k)) { + for (const auto j : c10::irange(n)) { scalar_t val = b[j + l * ldb] * alpha; int64_t i_m = m / 4; - for (int64_t i_i = 0; i_i < i_m; i_i++) { + for (const auto i_i : c10::irange(i_m)) { c[j * ldc + i_i * 4 + 0] += a[i_i * 4 + 0 + l * lda] * val; c[j * ldc + i_i * 4 + 1] += a[i_i * 4 + 1 + l * lda] * val; c[j * ldc + i_i * 4 + 2] += a[i_i * 4 + 2 + l * lda] * val; @@ -127,10 +125,10 @@ void gemm_transab_( scale_(m, n, beta, c, ldc); // c += alpha * (a.T @ b.T) - for (int64_t i = 0; i < m; i++) { - for (int64_t j = 0; j < n; j++) { + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(n)) { int64_t l_k = k / 4; - for (int64_t l_l = 0; l_l < l_k; l_l++) { + for (const auto l_l : c10::irange(l_k)) { c[j * ldc + i] += a[i * lda + l_l * 4 + 0] // * b[(l_l * 4 + 0) * ldb + j] * alpha; c[j * ldc + i] += a[i * lda + l_l * 4 + 1] // diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index 0659bb8c3e963d..f9ddc5ef329c8b 100644 --- 
a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace at { namespace native { @@ -33,8 +34,8 @@ void cat_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { using Vec = vec::Vectorized; scalar_t* result_ptr = result_data; - for (int64_t i = 0; i < outer; ++i) { - for (int64_t j = 0; j < ninputs; j++) { + for (const auto i : c10::irange(outer)) { + for (const auto j : c10::irange(ninputs)) { int64_t local_inner = inputs[j].inner_size; scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; int64_t d = 0; diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index d5bbc81864a791..9977ea670950c0 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace at { namespace native { namespace { template @@ -28,7 +29,7 @@ static void apply_cross(Tensor& result, const Tensor& a, const Tensor& b, const int64_t a_start = 0; int64_t b_start = 0; int64_t r_start = 0; - for (int64_t i = 0; i < a.dim(); i++) { + for (const auto i : c10::irange(a.dim())) { if (i == dim) continue; position_in_dims[i] = index_in_curr_dim % a.size(i); a_start += (index_in_curr_dim % a.size(i)) * a.stride(i); @@ -43,7 +44,7 @@ static void apply_cross(Tensor& result, const Tensor& a, const Tensor& b, const r_ptr[r_start+2*r_stride] = a_ptr[a_start+0*a_stride]*b_ptr[b_start+1*b_stride] - a_ptr[a_start+1*a_stride]*b_ptr[b_start+0*b_stride]; s++; - for (int i = 0; i < a.dim(); i++) { + for (const auto i : c10::irange(a.dim())) { if (i == dim) { continue; } diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index e0dcb873ea5f20..9ab2e860d8959c 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -1,6 +1,7 @@ 
#include #include #include +#include #ifdef __ARM_NEON__ #include @@ -152,7 +153,7 @@ void convolution_depthwise3x3_winograd_impl( &input_tile.val[2], \ &input_tile.val[3]); \ \ - for (int64_t row = 0; row < 4; ++row) { \ + for (const auto row : c10::irange(4)) { \ input_tile.val[row] = \ vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \ } \ @@ -186,22 +187,22 @@ void convolution_depthwise3x3_winograd_impl( 2 * otw + 1 < args.out_cols )) { float32x4x4_t input_tile; - for (int64_t row = 0; row < 4; ++row) { + for (const auto row : c10::irange(4)) { input_tile.val[row] = vld1q_f32(input + (ih + row) * args.in_cols + iw); } TILE; - for (size_t row = 0; row < 2; ++row) { + for (const auto row : c10::irange(2)) { vst1_f32( output + (oth * 2 + row) * args.out_cols + otw * 2, vget_low_f32(input_tile.val[row])); } } else { float block[4][4]; - for (int64_t row = 0; row < 4; ++row) { - for (int64_t col = 0; col < 4; ++col) { + for (const auto row : c10::irange(4)) { + for (const auto col : c10::irange(4)) { if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows && iw + col < args.in_cols) { block[row][col] = input[(ih + row) * args.in_cols + iw + col]; @@ -212,18 +213,18 @@ void convolution_depthwise3x3_winograd_impl( } float32x4x4_t input_tile; - for (int64_t row = 0; row < 4; ++row) { + for (const auto row : c10::irange(4)) { input_tile.val[row] = vld1q_f32(&block[row][0]); } TILE; float oblock[2][2]; - for (int64_t row = 0; row < 2; ++row) { + for (const auto row : c10::irange(2)) { vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row])); } - for (int64_t row = 0; row < 2; ++row) { - for (int64_t col = 0; col < 2; ++col) { + for (const auto row : c10::irange(2)) { + for (const auto col : c10::irange(2)) { if (2 * oth + row < args.out_rows && 2 * otw + col < args.out_cols) { output[(2 * oth + row) * args.out_cols + 2 * otw + col] = @@ -285,7 +286,7 @@ Tensor _convolution_depthwise3x3_winograd( at::zeros({kernel_sizes[0]}, input.options()); 
at::parallel_for(0, args.batch * args.out_channels, 0, [&](int64_t start, int64_t end) { - for (int64_t k = start; k < end; ++k) { + for (const auto k : c10::irange(start, end)) { const int64_t g = k % args.out_channels; const int64_t i = k / (args.out_channels / groups); convolution_depthwise3x3_winograd_impl( diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index 0b2ff59b5c420a..2058ca482ea0b1 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace at { namespace native { namespace { @@ -227,7 +228,7 @@ struct Dist { const scalar_t * self_j = t2_start + size2 * l + j; scalar_t agg = 0; - for (int x = 0; x < m; x++) { + for (const auto x : c10::irange(m)) { scalar_t a = *(self_i + x); scalar_t b = *(self_j + x); agg = F::red(agg, F::map(std::abs(a-b), p)); @@ -392,7 +393,8 @@ struct Dist { const scalar_t * t1_end = t1 + l1_size; const scalar_t * t2_end = t2 + l2_size; - for (int64_t l = 0; l < d; l++) { + for (const auto l : c10::irange(d)) { + (void)l; //Suppress unused variable warning for (; t1 != t1_end; t1 += m, res += m) { const Vec vec_t1 = Vec::loadu(t1, count); Vec res_vec = Vec::loadu(res, count); diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 15b1916b9892c1..6c017e15c46198 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -10,6 +10,7 @@ #ifdef CPU_CAPABILITY_AVX2 #include +#include #endif @@ -108,7 +109,7 @@ void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG gener float *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); - for (int64_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { at::uniform_real_distribution uniform(0, 1); data[i] = uniform(generator); 
} @@ -125,7 +126,7 @@ void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG gener if (size % 16 != 0) { // Recompute the last 16 values. data = data + size - 16; - for (int64_t i = 0; i < 16; ++i) { + for (const auto i : c10::irange(16)) { at::uniform_real_distribution uniform(0, 1); data[i] = uniform(generator); } @@ -136,7 +137,7 @@ void normal_fill_AVX2(Tensor& self, const float mean, const float std, RNG gener template static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t std) { - for (int j = 0; j < 8; ++j) { + for (const auto j : c10::irange(8)) { const scalar_t u1 = 1 - data[j]; // [0, 1) -> (0, 1] for log. const scalar_t u2 = data[j + 8]; const scalar_t radius = std::sqrt(-2 * std::log(u1)); @@ -151,7 +152,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene scalar_t *data = self.data_ptr(); auto size = self.numel(); std::lock_guard lock(generator->mutex_); - for (int64_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { at::uniform_real_distribution uniform(0, 1); data[i] = uniform(generator); } @@ -162,7 +163,7 @@ void normal_fill(Tensor& self, const scalar_t mean, const scalar_t std, RNG gene if (size % 16 != 0) { // Recompute the last 16 values. 
data = data + size - 16; - for (int64_t i = 0; i < 16; ++i) { + for (const auto i : c10::irange(16)) { at::uniform_real_distribution uniform(0, 1); data[i] = uniform(generator); } diff --git a/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp b/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp index 3e3bae5a5ec69c..2e0cc33c3f5146 100644 --- a/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp +++ b/aten/src/ATen/native/cpu/FunctionOfAMatrixUtilsKernel.cpp @@ -1,6 +1,7 @@ #include #include +#include #if (defined(_WIN32) || defined(_WIN64)) #define RESTRICT __restrict @@ -27,14 +28,15 @@ void _compute_linear_combination_cpu_kernel( auto* RESTRICT in_ptr = data[1]; auto* RESTRICT coeff_ptr = data[2]; - for (int64_t elem = 0; elem < n; ++elem) { + for (const auto elem : c10::irange(n)) { + (void)elem; //Suppress unused variable warning auto* RESTRICT out_data = reinterpret_cast(out_ptr); auto* RESTRICT in_data = reinterpret_cast(in_ptr); using primitive_t = typename scalar_value_type::type; auto* RESTRICT coeff_data = reinterpret_cast(coeff_ptr); // perform summation - for (int32_t i = 0; i < num_summations; ++i) { + for (const auto i : c10::irange(num_summations)) { *out_data += in_data[i * in_stride] * coeff_data[i * coeff_stride]; } diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index a706f7bd0946fd..4e89a499d233e3 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -50,7 +51,7 @@ namespace at { namespace native { namespace { * // from the beginning of this slice. * // iii. `len` as the number of valid locations in the vectors. * // (There might not be enough near boundary.) 
- * for (int n = 0; n < input_accessor.size(0); n++) { + * for (const auto n : c10::irange(input_accessor.size(0))) { * grid_sample_2d_grid_slice_iterator( * grid_accessor[n], * [&](const Vectorized& grid_x, @@ -443,7 +444,7 @@ mask_scatter_add(const scalar_t *src, scalar_t* base_addr, #if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE) # pragma unroll #endif - for (int64_t i = 0; i < len; i++) { + for (const auto i : c10::irange(len)) { if (mask[i] & 0x01) { base_addr[offsets[i]] += src[i]; } @@ -568,7 +569,7 @@ struct ApplyGridSample \ grid_sample(inp_acc); \ parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ - for (int64_t n = begin; n < end; n++) { \ + for (const auto n : c10::irange(begin, end)) { \ auto out_slice = out_acc[n]; \ auto inp_slice = inp_acc[n]; \ grid_sample_2d_grid_slice_iterator( \ @@ -1246,7 +1247,7 @@ grid_sampler_2d_backward_cpu_kernel_impl(const Tensor& grad_output_, ApplyGridSample \ grid_sample(inp_acc); \ parallel_for(0, N, grain_size, [&](int64_t begin, int64_t end) { \ - for (int64_t n = begin; n < end; n++) { \ + for (const auto n : c10::irange(begin, end)) { \ GINP_SLICE_PTR(input_requires_grad) \ auto gGrid_slice = gGrid_acc[n]; \ auto gOut_slice = gOut_acc[n]; \ diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index c3e8e0856a58a7..672aa0dff06c3e 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -98,7 +99,7 @@ void histogram_cpu_contiguous(Tensor& hist, const Tensor& bin_edges, // Allocates a buffer for the thread's local results std::vector data_out_local(numel_be - 1, input_t(0)); - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { const input_t elt = accessor_in[i]; // Skips elements which fall outside the specified bins diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp 
b/aten/src/ATen/native/cpu/IndexKernel.cpp index 152a3bc847ccb2..2f0b9f2da820c0 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace at { namespace native { namespace { @@ -36,7 +37,7 @@ struct Indexer { int64_t get(int64_t idx) { int64_t offset = 0; - for (int j = 0; j < num_indexers; j++) { + for (const auto j : c10::irange(num_indexers)) { int64_t value = *(int64_t*)&indexers[j][idx * indexer_strides[j]]; int64_t size = original_sizes[j]; TORCH_CHECK_INDEX(value >= -size && value < size, @@ -52,7 +53,7 @@ struct Indexer { static bool is_constant_index(int ntensor, const int64_t* strides) { AT_ASSERT(ntensor >= 3); - for (int arg = 2; arg < ntensor; arg++) { + for (const auto arg : c10::irange(2, ntensor)) { if (strides[arg] != 0) { return false; } @@ -77,16 +78,16 @@ void cpu_index_kernel(TensorIterator& iter, IntArrayRef index_size, IntArrayRef // specialization for when every element uses the same index int64_t offset = indexer.get(0); if (strides[0] == sizeof(scalar_t) && strides[1] == sizeof(scalar_t)) { - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { f(dst + strides[0] * i, src + strides[1] * i, offset); } } else { - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { f(dst + strides[0] * i, src + strides[1] * i, offset); } } } else { - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { int64_t offset = indexer.get(i); f(dst + strides[0] * i, src + strides[1] * i, offset); } @@ -153,7 +154,8 @@ void cpu_take_put_kernel( auto loop = [&](char** data, const int64_t* strides, int64_t n) { auto* iterated_data_bytes = data[0]; auto* index_data_bytes = data[1]; - for (int64_t elem = 0; elem < n; ++elem) { + for (const auto elem : c10::irange(n)) { + (void)elem; //Suppress unused variable warning auto idx = *reinterpret_cast(index_data_bytes); auto& iterated = 
*reinterpret_cast(iterated_data_bytes); @@ -270,7 +272,8 @@ void index_fill_kernel( auto handle_nonzero_idx_stride = [&](char** data, const int64_t* strides, int64_t n) { auto* self_data_bytes = data[0]; auto* index_data_bytes = data[1]; - for (int64_t elem = 0; elem < n; ++elem) { + for (const auto elem : c10::irange(n)) { + (void)elem; //Suppress unused variable warning auto* self_data = reinterpret_cast(self_data_bytes); auto idx = *reinterpret_cast(index_data_bytes); TORCH_CHECK_INDEX(idx >= -self_dim_size && idx < self_dim_size, @@ -296,7 +299,8 @@ void index_fill_kernel( if (idx < 0) { idx += self_dim_size; } - for (int64_t elem = 0; elem < n; ++elem) { + for (const auto elem : c10::irange(n)) { + (void)elem; //Suppress unused variable warning auto* self_data = reinterpret_cast(self_data_bytes); self_data[idx * self_dim_stride] = fill_val; @@ -329,7 +333,8 @@ void index_copy_kernel( auto* self_data_bytes = data[0]; auto* index_data_bytes = data[1]; auto* source_data_bytes = data[2]; - for (int64_t elem = 0; elem < n; ++elem) { + for (const auto elem : c10::irange(n)) { + (void)elem; //Suppress unused variable warning auto* self_data = reinterpret_cast(self_data_bytes); auto idx = *reinterpret_cast(index_data_bytes); auto* source_data = reinterpret_cast(source_data_bytes); @@ -352,7 +357,8 @@ void index_copy_kernel( TORCH_CHECK_INDEX(idx >= 0 && idx < self_dim_size, "index_copy_(): index ", idx, " is out of bounds for dimension ", dim, " with size ", self_dim_size); - for (int64_t elem = 0; elem < n; ++elem) { + for (const auto elem : c10::irange(n)) { + (void)elem; //Suppress unused variable warning auto* self_data = reinterpret_cast(self_data_bytes); auto* source_data = reinterpret_cast(source_data_bytes); @@ -387,7 +393,7 @@ void cpu_masked_fill_kernel(TensorIterator& iter, scalar_t value) { auto loop = [&](char** data, const int64_t* strides, int64_t n) { char* dst = data[0]; char* mask = data[1]; - for (int64_t i = 0; i < n; i++) { + for (const auto i : 
c10::irange(n)) { mask_t mask_value = *(mask_t*)(mask + strides[1] * i); if (!is_mask_bool) { TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only"); @@ -425,7 +431,7 @@ void cpu_masked_scatter_kernel(TensorIterator& iter, const Tensor& source) { const int64_t dst_stride = strides[0]; char* mask = data[1]; const int64_t mask_stride = strides[1]; - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { mask_t mask_value = *(mask_t*)(mask + mask_stride * i); if (!is_mask_bool) { TORCH_CHECK(mask_value <= static_cast(1), "Mask tensor can take 0 and 1 values only"); @@ -466,7 +472,7 @@ void cpu_masked_select_serial_kernel(TensorIterator& iter, const func_t& f) { char* dst = data[0]; char* src = data[1]; char* mask = data[2]; - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { mask_t mask_value = *(mask_t*)(mask + strides[2] * i); if (!is_mask_bool) { TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only"); @@ -505,7 +511,7 @@ void cpu_masked_select_kernel(TensorIterator& iter, const func_t& f) { char* src = data[1]; char* mask = data[2]; char* mask_prefix_sum = data[3]; - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { mask_t mask_value = *(mask_t*)(mask + strides[2] * i); if (!is_mask_bool) { TORCH_CHECK(mask_value == 0 || mask_value == 1, "Mask tensor can take 0 and 1 values only"); diff --git a/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp b/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp index ba7cf73576d491..0bb92a158aa2e0 100644 --- a/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/cpu/LinearAlgebraKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { namespace { @@ -135,13 +136,14 @@ void unpack_pivots_cpu_kernel( auto* unpacked_pivots_ptr = data[0]; const auto* pivots_ptr = data[1]; - for (int64_t elem = 0; elem < nelems; ++elem) { + for (const 
auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning // WARNING: torch.lu returns int32 pivots, // this behavior could change in the future. auto* unpacked_pivots_data = reinterpret_cast(unpacked_pivots_ptr); auto* pivots_data = reinterpret_cast(pivots_ptr); - for (int64_t i = 0; i < dim_size; ++i) { + for (const auto i : c10::irange(dim_size)) { std::swap( unpacked_pivots_data[i], unpacked_pivots_data[pivots_data[i]] diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 7e16afc0af6fc1..3b585972d99a9c 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -28,6 +28,7 @@ #include #include +#include #include #include #include @@ -120,7 +121,7 @@ basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_ // Copying strides to temporary array helps auto vectorization in older GCC // versions. int64_t strides[ntensors]; - for (int arg = 0; arg < ntensors; arg++) { + for (const auto arg : c10::irange(ntensors)) { strides[arg] = strides_[arg]; } @@ -178,7 +179,7 @@ multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_ // Copying strides to temporary array helps auto vectorization in older GCC // versions. int64_t strides[ntensors]; - for (int arg = 0; arg < ntensors; arg++) { + for (const auto arg : c10::irange(ntensors)) { strides[arg] = strides_[arg]; } @@ -204,7 +205,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve constexpr int ntensors = traits::arity + 1; char* C10_RESTRICT data[ntensors]; - for (int arg = 0; arg < ntensors; arg++) { + for (const auto arg : c10::irange(ntensors)) { data[arg] = data_[arg]; } @@ -220,7 +221,7 @@ vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, ve } if (i < n) { int64_t strides[ntensors]; - for (int arg = 0; arg < ntensors; arg++) { + for (const auto arg : c10::irange(ntensors)) { strides[arg] = (S > 0 && arg == S) ? 
0 : sizeof(scalar_t); } basic_loop(data, strides, i, n, std::forward(op)); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index a556ea37806439..77be0982e831c7 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -43,7 +44,7 @@ void cpu_max_pool( int64_t ow = 0; data_index_init(begin, c, channels, oh, output_height, ow, output_width); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t ih0 = oh * dH - padH; int64_t iw0 = ow * dW - padW; int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height); @@ -133,7 +134,7 @@ void cpu_max_pool_channels_last( // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) std::unique_ptr index_buffer(new integer_t[len]); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t ih0 = oh * dH - padH; int64_t iw0 = ow * dW - padW; int64_t ih1 = std::min(ih0 + (kH - 1) * dilationH + 1, input_height); @@ -229,13 +230,13 @@ void cpu_max_pool_backward( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; int64_t * indices_ptr = indices_data + c * output_height * output_width; - for (int64_t oh = 0; oh < output_height; oh++) { - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { // retrieve position of max int64_t index = oh * output_width + ow; int64_t maxindex = indices_ptr[index]; @@ -278,17 +279,17 @@ void cpu_max_pool_backward_channels_last( // 
parallel on dim N at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { - for (int64_t n = begin; n < end; n++) { + for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; int64_t* indices_ptr = indices_data + n * output_height * output_width * channels; - for (int64_t oh = 0; oh < output_height; oh++) { - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels; // TODO: gcc vectorization - for (int64_t c = 0; c < channels; c++) { + for (const auto c : c10::irange(channels)) { int64_t maxindex = ind[c]; if (maxindex != -1) { grad_input_ptr[maxindex * channels + c] += gout[c]; diff --git a/aten/src/ATen/native/cpu/MaxPooling.cpp b/aten/src/ATen/native/cpu/MaxPooling.cpp index ebfb59775d0317..d70b6ef6e70d05 100644 --- a/aten/src/ATen/native/cpu/MaxPooling.cpp +++ b/aten/src/ATen/native/cpu/MaxPooling.cpp @@ -2,6 +2,7 @@ #include #include #include +#include namespace at { namespace native { @@ -13,7 +14,7 @@ inline void max_pool1d_kernel( scalar_t* C10_RESTRICT op, const scalar_t* C10_RESTRICT ip, const PoolingParams1D& p) { - for (int64_t kj = 0; kj < p.KW; ++kj) { + for (const auto kj : c10::irange(p.KW)) { int64_t oj = p.valid_output_start(kj); int64_t oe = p.valid_output_end(kj); int64_t ij = p.index(kj, oj); @@ -40,7 +41,7 @@ void max_pool1d_impl( : std::numeric_limits::lowest(); at::parallel_for(0, p.NB * p.NC, 0, [&](int64_t begin, int64_t end) { - for (int64_t it = begin; it < end; ++it) { + for (const auto it : c10::irange(begin, end)) { scalar_t* op = OP + it * p.OW; const scalar_t* ip = IP + it * p.IW; std::fill_n(op, p.OW, FILL); 
diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index 2d0e39249c5d10..222274e83b8412 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -60,7 +61,7 @@ void cpu_max_unpool( int64_t ip = 0; data_index_init(begin, c, channels, ip, input_image_size); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* output_ptr = output_data + c * output_image_size; int64_t maxp = indices_data[i]; @@ -124,13 +125,13 @@ void cpu_max_unpool_channels_last( int64_t ip = 0; data_index_init(begin, n, nbatch, ip, input_image_size); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* input_ptr = input_data + i * channels; int64_t* indices_ptr = indices_data + i * channels; scalar_t* output_ptr = output_data + n * output_image_size * channels; // can't do scatter on avx2 (only available on avx512) - for (int64_t c = 0; c < channels; c++) { + for (const auto c : c10::irange(channels)) { int64_t maxp = indices_ptr[c]; if (maxp < 0 || maxp >= output_image_size) { optional_error_index = maxp; @@ -197,7 +198,7 @@ void cpu_max_unpool_backward( int64_t ip = 0; data_index_init(begin, c, channels, ip, input_image_size); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* grad_output_ptr = grad_output_data + c * output_image_size; int64_t maxp = indices_data[i]; @@ -262,12 +263,12 @@ void cpu_max_unpool_backward_channels_last( int64_t ip = 0; data_index_init(begin, n, nbatch, ip, input_image_size); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* grad_output_ptr = grad_output_data + n * output_image_size * channels; scalar_t* grad_input_ptr = grad_input_data + i * channels; int64_t* indices_ptr = indices_data + i * channels; - for 
(int64_t c = 0; c < channels; c++) { + for (const auto c : c10::irange(channels)) { int64_t maxp = indices_ptr[c]; if (maxp < 0 || maxp >= output_image_size) { optional_error_index = maxp; diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index 3dc68d0e765961..f181572f51afef 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace native { @@ -39,12 +40,12 @@ void multinomial_with_replacement_apply( auto result_dist_stride_0 = result.dim() > 1 ? result.stride(-2) : 0; auto result_dist_stride_1 = result.stride(-1); - for (int64_t i = 0; i < n_dist; i++) { + for (const auto i : c10::irange(n_dist)) { /* Get normalized cumulative distribution from prob distribution */ scalar_t sum = 0; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) scalar_t val; - for (int64_t j = 0; j < n_categories; j++) { + for (const auto j : c10::irange(n_categories)) { val = self_ptr[i * self_stride_0 + j * self_stride_1]; TORCH_CHECK(val >= 0, "invalid multinomial distribution (encountering probability entry < 0)"); // NB: std::isfinite doesn't bode well with libc++ for half datatypes, @@ -66,12 +67,12 @@ void multinomial_with_replacement_apply( /* normalize cumulative probability distribution so that last val is 1 i.e. 
doesn't assume original self row sums to one */ if ((sum > 0) || ((sum < 1.00001) && (sum > 0.99999))) { - for (int64_t j = 0; j < n_categories; j++) { + for (const auto j : c10::irange(n_categories)) { cum_dist_ptr[j * cum_dist_stride_0] /= sum; } } - for (int64_t j = 0; j < n_sample; j++) { + for (const auto j : c10::irange(n_sample)) { /* sample a probability mass from a uniform distribution */ at::uniform_real_distribution uniform(0, 1); double uniform_sample = uniform(gen); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index 899099ed79452f..dbf83487180377 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -38,10 +39,10 @@ static inline void vectorized_reduction(char** data, int64_t n, int64_t stride, VEC_LOOP_HEADER(func_t, data) const char* in1_ptr = data[1]; Vec acc[4]; - for (int j = 0; j < 4; j++) { + for (const auto j : c10::irange(4)) { acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t)); } - for (int64_t i = 1; i < n; i++) { + for (const auto i : c10::irange(1, n)) { const char* ptr = in1_ptr + stride * i; acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t)))); acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t)))); @@ -58,7 +59,7 @@ static inline void vectorized_reduction(char** data, int64_t n, int64_t stride, auto dst = (scalar_t*)out_ptr; *dst = op(*dst, buffer[0]); } else { - for (int j = 0; j < 4; j++) { + for (const auto j : c10::irange(4)) { auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t); acc[j] = vop(acc[j], Vec::loadu(dst)); acc[j].store(dst); @@ -68,7 +69,8 @@ static inline void vectorized_reduction(char** data, int64_t n, int64_t stride, template static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) { - for (int j = 0; j < n; j++) { + for (const auto j : c10::irange(n)) { + (void)j; //Suppress unused 
variable warning f(); data[0] += strides[0]; data[1] += strides[1]; @@ -215,7 +217,7 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { AT_ASSERT(ntensors - num_outputs == 1); char *in = data[ntensors - 1]; int64_t stride = strides[ntensors - 1]; - for (int64_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { acc = ops.reduce(acc, *(data_t*)in, begin + i); in += stride; } @@ -241,7 +243,7 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) { acc = reduction_body(acc, begin, end); } ); - for (int i = 0; i < max_threads; ++i) { + for (const auto i : c10::irange(max_threads)) { total_acc = ops.combine(total_acc, buffer[i]); } } diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index a07a0587b80e31..90bac8aab63fcf 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace at { namespace native { namespace { @@ -51,7 +52,7 @@ inline void reduce_all_impl( scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t { scalar_t partial_out = ident; - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { partial_out = op(partial_out, input_data[i]); } return partial_out; @@ -124,7 +125,7 @@ inline void reduce_all_impl_two_outputs( scalar_t_pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t_pair& ident) -> scalar_t_pair { scalar_t_pair partial_out(ident); - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { partial_out = reduce_chunk_func(partial_out, input_data[i]); } return partial_out; diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp 
b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 01ed54e56fc738..65a54340ec0a87 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -13,6 +13,7 @@ #include #include +#include #include namespace at { namespace native { namespace { @@ -54,7 +55,8 @@ static inline void cpu_cum_base_kernel(const Tensor& result, auto* result_data_bytes = data[0]; const auto* self_data_bytes = data[1]; - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { + (void)i; //Suppress unused variable warning f( (scalar_t*)result_data_bytes, result_dim_stride, (scalar_t*)self_data_bytes, self_dim_stride, init_val @@ -77,7 +79,7 @@ static void cumsum_cpu_kernel(const Tensor& result, const Tensor& self, int64_t const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { // NOLINTNEXTLINE(bugprone-signed-char-misuse) auto cum_number = (at::acc_type)init_val; - for (int64_t i = 0; i < self_dim_size; ++i) { + for (const auto i : c10::irange(self_dim_size)) { cum_number += self_data[i * self_dim_stride]; result_data[i * result_dim_stride] = (scalar_t)cum_number; } @@ -96,7 +98,7 @@ static void cumprod_cpu_kernel(const Tensor& result, const Tensor& self, int64_t const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { // NOLINTNEXTLINE(bugprone-signed-char-misuse) auto cum_number = (at::acc_type)init_val; - for (int64_t i = 0; i < self_dim_size; ++i) { + for (const auto i : c10::irange(self_dim_size)) { cum_number *= self_data[i * self_dim_stride]; result_data[i * result_dim_stride] = (scalar_t)cum_number; } @@ -114,7 +116,7 @@ static void logcumsumexp_cpu_kernel(Tensor& result, const Tensor& self, int64_t scalar_t* result_data, auto result_dim_stride, const scalar_t* self_data, auto self_dim_stride, scalar_t init_val) { scalar_t cum_number = (at::acc_type)init_val; - for (int64_t i = 0; i < self_dim_size; ++i) { + for (const auto i : c10::irange(self_dim_size)) { scalar_t x = self_data[i * 
self_dim_stride]; // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index 2ab92fbdb2bb22..dc8df42852c405 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -52,7 +53,7 @@ struct _cpu_scatter_gather_dim_loop { func_t& f ) { - for (int64_t i = 0; i < index_dim_size; ++i) { + for (const auto i : c10::irange(index_dim_size)) { int64_t idx_dim = index_data[i * index_dim_stride]; // we are not putting idx_dim in the error message because it disables // loop optimization in clang-7 @@ -79,7 +80,7 @@ struct _cpu_scatter_gather_dim_loop { func_t& f ) { - for (int64_t i = 0; i < index_dim_size; ++i) { + for (const auto i : c10::irange(index_dim_size)) { int64_t idx_dim = index_data[i * index_dim_stride]; // we are not putting idx_dim in the error message because it disables // loop optimization in clang-7 @@ -146,7 +147,8 @@ struct cpu_scatter_gather_base_kernel { // whether `n` is smaller than `index_dim_size` if ((dim== self.dim() - 1) || (n < index_dim_size)) { - for (int64_t nelem = 0; nelem < n; ++nelem) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning // dim loop is a separate code block // for better performance _cpu_scatter_gather_dim_loop()( @@ -160,10 +162,11 @@ struct cpu_scatter_gather_base_kernel { } } else { - for (int64_t i = 0; i < index_dim_size; ++i) { + for (const auto i : c10::irange(index_dim_size)) { auto* self_data = self_data_bytes; auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); - for (int64_t nelem = 0; nelem < n; ++nelem) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning int64_t idx_dim = *(int64_t*)index_data; // we are not 
putting idx_dim in the error message because it disables // loop optimization in clang-7 @@ -227,7 +230,8 @@ struct cpu_scatter_gather_base_kernel { // whether dim is the last dimension and/or // whether `n` is smaller than `index_dim_size` if ((dim== self.dim() - 1) || (n < index_dim_size)) { - for (int64_t nelem = 0; nelem < n; ++nelem) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning // dim loop is a separate code block // for better performance _cpu_scatter_gather_dim_loop()( @@ -244,11 +248,12 @@ struct cpu_scatter_gather_base_kernel { } } else { - for (int64_t i = 0; i < index_dim_size; ++i) { + for (const auto i : c10::irange(index_dim_size)) { auto* self_data = self_data_bytes; auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride); auto* src_data = src_data_bytes; - for (int64_t nelem = 0; nelem < n; ++nelem) { + for (const auto nelem : c10::irange(n)) { + (void)nelem; //Suppress unused variable warning int64_t idx_dim = *(int64_t*)index_data; // we are not putting idx_dim in the error message because it disables // loop optimization in clang-7 diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 790ac743153df4..ed4a3a7664509b 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include // [Note AVX-SSE transitions] In general we avoid calls into cmath for code @@ -48,7 +49,7 @@ inline void _vec_log_softmax_lastdim( int64_t loop_end = CHUNK_SIZE; if (ii + CHUNK_SIZE > end) loop_end = end - ii; - for (int64_t j = 0; j < loop_end; j++) { + for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; scalar_t* input_data = input_data_base + i * dim_size; max_input_arr[j] = vec::reduce_all( @@ -56,7 +57,7 @@ inline void _vec_log_softmax_lastdim( input_data, dim_size); } - for (int64_t j = 0; j < loop_end; j++) { + for (const auto 
j : c10::irange(loop_end)) { int64_t i = ii + j; scalar_t* input_data = input_data_base + i * dim_size; scalar_t max_input = max_input_arr[j]; @@ -73,7 +74,7 @@ inline void _vec_log_softmax_lastdim( tmp_sum_scalar, tmp_sum_scalar, loop_end); - for (int64_t j = 0; j < loop_end; j++) { + for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; @@ -111,7 +112,7 @@ inline void _vec_softmax_lastdim( outer_size, grain_size, [&](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t max_input = vec::reduce_all( @@ -152,7 +153,7 @@ inline void _vec_host_softmax_backward_lastdim( outer_size, grain_size, [&](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* grad_input_data = grad_input_data_base + i * dim_size; scalar_t* grad_data = grad_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; @@ -242,7 +243,7 @@ inline void _vec_softmax( Vec max_vec_o2 = std::get<1>(convert_result); std::get<0>(convert_result).store(temp_vec_input_data); std::get<1>(convert_result).store(temp_vec_input_data + vectorized_step); - for (int64_t d = 1; d < dim_size; d++) { + for (const auto d : c10::irange(1, dim_size)) { Vec_bf16 input_vec_bf16 = Vec_bf16::loadu(input_data + d * dim_stride); convert_result = convert_bfloat16_float(input_vec_bf16); max_vec_o1 = vec::maximum(max_vec_o1, std::get<0>(convert_result)); @@ -253,7 +254,7 @@ inline void _vec_softmax( // Step2: Calculate sum Vec sum_vec_o1 = Vec(0.0); Vec sum_vec_o2 = Vec(0.0); - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { Vec output_vec_o1 = Vec::loadu(temp_vec_input_data + 
d*vectorized_step*2); Vec output_vec_o2 = Vec::loadu(temp_vec_input_data + d*vectorized_step*2 + vectorized_step); output_vec_o1 = (output_vec_o1 - max_vec_o1).exp(); @@ -265,7 +266,7 @@ inline void _vec_softmax( sum_vec_o2 = sum_vec_o2 + output_vec_o2; } // Step3: Unify - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { Vec output_vec_o1 = Vec::loadu(temp_vec_output_data + d*vectorized_step*2); Vec output_vec_o2 = Vec::loadu(temp_vec_output_data + d*vectorized_step*2 + vectorized_step); output_vec_o1 = output_vec_o1/sum_vec_o1; @@ -281,7 +282,7 @@ inline void _vec_softmax( // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallization. // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallization. int64_t tail_number = ((idx+vectorized_step) > end) ? /*Case1*/ (end - idx) : /*Case2*/ (inner_size - inner_idx); - for (int64_t i=0; i < tail_number; i++) { + for (const auto i : c10::irange(tail_number)) { outer_idx = (idx + i) / inner_size; inner_idx = (idx + i) % inner_size; BFloat16* input_data = @@ -290,19 +291,19 @@ inline void _vec_softmax( output_data_base + outer_idx * outer_stride + inner_idx; // Step1: Get max score float max_input = float(input_data[0]); - for (int64_t d = 1; d < dim_size; d++) { + for (const auto d : c10::irange(1, dim_size)) { max_input = std::max(max_input, float(input_data[d * dim_stride])); } // Step2: Calculate the Sum float sum_data = 0.0; float temp_output_data = 0.0; - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { temp_output_data = std::exp(input_data[d * dim_stride] - max_input); sum_data += temp_output_data; output_data[d * dim_stride] = c10::BFloat16(temp_output_data); } // Step3: Unify - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { output_data[d * dim_stride] = c10::BFloat16(float(output_data[d * 
dim_stride])/sum_data); } @@ -339,20 +340,20 @@ inline void _vec_softmax( output_data_base + outer_idx * outer_stride + inner_idx; // Step 1: Get max Score Vec max_vec = Vec::loadu(input_data); - for (int64_t d = 1; d < dim_size; d++) { + for (const auto d : c10::irange(1, dim_size)) { Vec input_vec = Vec::loadu(input_data + d * dim_stride); max_vec = vec::maximum(max_vec, input_vec); } // Step2: Calculate sum Vec sum_vec = Vec(0.0); - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { Vec output_vec = (Vec::loadu(input_data + d * dim_stride) - max_vec).exp(); output_vec.store(output_data + d * dim_stride); sum_vec = sum_vec + output_vec; } // Step3: Unify - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { Vec output_vec = Vec::loadu(output_data + d * dim_stride) / sum_vec; output_vec.store(output_data + d * dim_stride); @@ -365,7 +366,7 @@ inline void _vec_softmax( // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallization. // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallization. int64_t tail_number = ((idx+vectorized_step) > end) ? 
/*Case1*/ (end - idx) : /*Case2*/ (inner_size - inner_idx); - for (int64_t i=0; i < tail_number; i++) { + for (const auto i : c10::irange(tail_number)) { outer_idx = (idx + i) / inner_size; inner_idx = (idx + i) % inner_size; scalar_t* input_data = @@ -374,18 +375,18 @@ inline void _vec_softmax( output_data_base + outer_idx * outer_stride + inner_idx; // Step1: Get max score scalar_t max_input = input_data[0]; - for (int64_t d = 1; d < dim_size; d++) { + for (const auto d : c10::irange(1, dim_size)) { max_input = std::max(max_input, input_data[d * dim_stride]); } // Step2: Calculate the Sum scalar_t sum_data = 0; - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { output_data[d * dim_stride] = std::exp(input_data[d * dim_stride] - max_input); sum_data += output_data[d * dim_stride]; } // Step3: Unify - for (int64_t d = 0; d < dim_size; d++) { + for (const auto d : c10::irange(dim_size)) { output_data[d * dim_stride] = output_data[d * dim_stride]/sum_data; } @@ -402,8 +403,7 @@ struct vec_softmax { int64_t outer_size = 1; int64_t dim_size = input.size(dim); int64_t inner_size = 1; - for (int64_t i = 0; i < dim; ++i) - outer_size *= input.size(i); + for (const auto i : c10::irange(dim))outer_size *= input.size(i); for (int64_t i = dim + 1; i < input.dim(); ++i) inner_size *= input.size(i); scalar_t* input_data_base = input.data_ptr(); diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 23bbc1cf7fecaf..8eab924407d135 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace at { namespace native { @@ -55,7 +56,8 @@ void _dim_apply( auto* values_data_bytes = data[0]; auto* indices_data_bytes = data[1]; - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { + (void)i; //Suppress unused variable warning f( reinterpret_cast(values_data_bytes), 
values_dim_stride, diff --git a/aten/src/ATen/native/cpu/StackKernel.cpp b/aten/src/ATen/native/cpu/StackKernel.cpp index ab808da84d51be..4427d999f3fd22 100644 --- a/aten/src/ATen/native/cpu/StackKernel.cpp +++ b/aten/src/ATen/native/cpu/StackKernel.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace at { namespace native { @@ -37,8 +38,8 @@ void stack_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { using Vec = vec::Vectorized; scalar_t* result_ptr = result_data; - for (int64_t i = 0; i < outer; ++i) { - for (int64_t j = 0; j < ninputs; j++) { + for (const auto i : c10::irange(outer)) { + for (const auto j : c10::irange(ninputs)) { int64_t local_inner = inputs[j].inner_size; scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; @@ -46,7 +47,7 @@ void stack_serial_kernel_impl(Tensor& result, TensorList tensors, int64_t dim) { #if !defined(_MSC_VER) && !defined(COMPILING_FOR_MIN_SIZE) #pragma unroll #endif - for (int64_t k = 0; k < local_inner; k++) { + for (const auto k : c10::irange(local_inner)) { result_ptr[k] = input_ptr[k]; } } else { diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 87ed221ea64377..8318314054f84a 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -26,7 +27,7 @@ Vectorized load_reduce_vec(const scalar_t* data, F reduce, acc_t ident) { constexpr int vstride = vec_t::size() / vacc_t::size(); alignas(64) std::array acc; acc.fill(ident); - for (int k = 0; k < vstride; ++k) { + for (const auto k : c10::irange(vstride)) { for (int i = 0; i < vacc_t::size(); ++i) { acc[i] = reduce(acc[i], values[i * vstride + k]); } @@ -280,7 +281,7 @@ template static void store(char * C10_RESTRICT data, int64_t stride, int64_t index, const std::array &values) { auto *base_ptr = data + stride * index; - for (size_t k = 0; k < numel; ++k) { + for (const 
auto k : c10::irange(numel)) { auto val = values[k]; StorePolicy::store(base_ptr, stride, k, val); } @@ -314,7 +315,7 @@ A simplified recursive implementation would look like this: scalar_t sum = 0; if (n <= min_chunk_size) { // Recursive base case, calculate a simple running sum - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { sum += data[i]; } return sum; @@ -352,16 +353,16 @@ std::array multi_row_sum( #if !defined(COMPILING_FOR_MIN_SIZE) # pragma unroll #endif - for (int64_t k = 0; k < nrows; ++k) { + for (const auto k : c10::irange(nrows)) { acc[0][k] += LoadPolicy::load(sum_base, col_stride, k); } } - for (int64_t j = 1; j < num_levels; ++j) { + for (const auto j : c10::irange(1, num_levels)) { #if !defined(COMPILING_FOR_MIN_SIZE) # pragma unroll #endif - for (int64_t k = 0; k < nrows; ++k) { + for (const auto k : c10::irange(nrows)) { acc[j][k] += acc[j-1][k]; acc[j-1][k] = scalar_t(0); } @@ -378,23 +379,23 @@ std::array multi_row_sum( #if !defined(COMPILING_FOR_MIN_SIZE) # pragma unroll #endif - for (int64_t k = 0; k < nrows; ++k) { + for (const auto k : c10::irange(nrows)) { acc[0][k] += LoadPolicy::load(sum_base, col_stride, k); } } - for (int64_t j = 1; j < num_levels; ++j) { + for (const auto j : c10::irange(1, num_levels)) { #if !defined(COMPILING_FOR_MIN_SIZE) # pragma unroll #endif - for (int64_t k = 0; k < nrows; ++k) { + for (const auto k : c10::irange(nrows)) { acc[0][k] += acc[j][k]; } } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) std::array ret; - for (int64_t k = 0; k < nrows; ++k) { + for (const auto k : c10::irange(nrows)) { ret[k] = acc[0][k]; } return ret; @@ -414,7 +415,7 @@ scalar_t row_sum(const char * C10_RESTRICT in_data, partial_sums[0] += LoadPolicy::load(in_data, in_stride, i); } - for (int64_t k = 1; k < ilp_factor; ++k) { + for (const auto k : c10::irange(1, ilp_factor)) { partial_sums[0] += partial_sums[k]; } @@ -433,7 +434,7 @@ void vectorized_inner_sum( const int64_t vec_size = size0 / 
vec_numel; // Input is contiguous over the first (reduced) dimension - for (int64_t j = 0; j < size1; ++j) { + for (const auto j : c10::irange(size1)) { const auto *row_in = data[1] + j * outer_stride; auto vec_acc = row_sum(row_in, vec_stride, vec_size); @@ -444,7 +445,7 @@ void vectorized_inner_sum( alignas(64) std::array partials{}; vec_acc.store(partials.data()); - for (size_t k = 0; k < partials.size(); ++k) { + for (const auto k : c10::irange(partials.size())) { final_acc += partials[k]; } store(data[0], out_stride, j, final_acc); @@ -456,7 +457,7 @@ void scalar_inner_sum( // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) char * C10_RESTRICT data[2], int64_t in_strides[2], int64_t out_stride, int64_t size0, int64_t size1) { - for (int64_t j = 0; j < size1; ++j) { + for (const auto j : c10::irange(size1)) { const auto *row_in = data[1] + j * in_strides[1]; auto ans = row_sum(row_in, in_strides[0], size0); store(data[0], out_stride, j, ans); @@ -480,7 +481,7 @@ void vectorized_outer_sum( auto sums = multi_row_sum( row_in, inner_stride, vec_stride, size0); - for (int64_t i = 0; i < nrows; ++i) { + for (const auto i : c10::irange(nrows)) { const int64_t base_idx = j + i * vacc_t::size(); store(data[0], out_stride, base_idx, sums[i]); } diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index 589f135bbf58b3..ab173bfc69d5b7 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -73,7 +74,8 @@ static inline void compare_base_kernel(const Tensor& result1, const Tensor& resu auto* result1_data_bytes = data[0]; auto* result2_data_bytes = data[1]; const auto* self_data_bytes = data[2]; - for (int64_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { + (void)i; //Suppress unused variable warning f((scalar_t*)result1_data_bytes, 
(scalar_t_2*)result2_data_bytes, (scalar_t*)self_data_bytes, @@ -104,7 +106,7 @@ static void min_kernel_impl( value_t (*zabs_)(scalar_t) = zabs; scalar_t min_number = self_data[0]; int64_t index = 0; - for (int64_t i = 0; i < self_dim_size; ++i) { + for (const auto i : c10::irange(self_dim_size)) { scalar_t value = self_data[i * self_dim_stride]; if (!(zabs_(value) >= zabs_(min_number))) { min_number = value; @@ -137,7 +139,7 @@ static void max_kernel_impl( value_t (*zabs_)(scalar_t) = zabs; scalar_t max_number = self_data[0]; int64_t index = 0; - for (int64_t i = 0; i < self_dim_size; ++i) { + for (const auto i : c10::irange(self_dim_size)) { scalar_t value = self_data[i * self_dim_stride]; if (!(zabs_(value) <= zabs_(max_number))) { max_number = value; @@ -173,7 +175,7 @@ static void aminmax_kernel( const scalar_t* self_data, auto self_dim_stride) { scalar_t min_number = self_data[0]; scalar_t max_number = self_data[0]; - for (int64_t i = 0; i < self_dim_size; ++i) { + for (const auto i : c10::irange(self_dim_size)) { scalar_t value = self_data[i * self_dim_stride]; // note: comparison is written this way to handle NaN correctly if (!(value >= min_number)) { @@ -242,7 +244,8 @@ static void mode_kernel_impl( std::vector> elements(self_dim_size); - for (int64_t k = 0; k < n; ++k) { + for (const auto k : c10::irange(n)) { + (void)k; //Suppress unused variable warning scalar_t* values_data = (scalar_t*)values_data_bytes; int64_t* indices_data = (int64_t*)indices_data_bytes; const scalar_t* self_data = (scalar_t*)self_data_bytes; @@ -252,7 +255,7 @@ static void mode_kernel_impl( int64_t temp_freq = 0; int64_t max_freq = 0; - for (int64_t i = 0; i < self_dim_size; i++) { + for (const auto i : c10::irange(self_dim_size)) { elements[i] = std::make_pair(self_data[i * self_dim_stride], i); } @@ -267,7 +270,7 @@ static void mode_kernel_impl( return i.first < j.first; }); - for (int64_t i = 0; i < self_dim_size; i++) { + for (const auto i : c10::irange(self_dim_size)) { 
temp_freq++; if ((i == self_dim_size - 1) || (elements[i].first != elements[i + 1].first)) { @@ -315,7 +318,7 @@ static void isin_default_kernel_cpu( AT_DISPATCH_ALL_TYPES(iter.dtype(1), "isin_default_cpu", [&]() { cpu_kernel(iter, [&](scalar_t element_val) -> bool { const auto* test_element_data = reinterpret_cast(test_elements_flat.data_ptr()); - for (auto j = 0; j < test_elements_flat.numel(); ++j) { + for (const auto j : c10::irange(test_elements_flat.numel())) { if (element_val == test_element_data[j]) { return !invert; } diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index 8e0f8cacf01481..944c48ec4d69f1 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -18,6 +18,7 @@ #include #include +#include #if AT_MKL_ENABLED() #include @@ -103,7 +104,7 @@ void LogitMKLKernel(T eps, TensorIteratorBase* it) { T* Y_data = static_cast(it->data_ptr(0)); if (eps < T(0)) { at::parallel_for(0, N, K, [=](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; ++i) { + for (const auto i : c10::irange(begin, end)) { Y_data[i] = X_data[i] == T(1) ? std::numeric_limits::infinity() : X_data[i] / (T(1) - X_data[i]); } @@ -113,7 +114,7 @@ void LogitMKLKernel(T eps, TensorIteratorBase* it) { const T lo = eps; const T hi = T(1) - eps; at::parallel_for(0, N, K, [=](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; ++i) { + for (const auto i : c10::irange(begin, end)) { const T x = X_data[i] < lo ? lo : (X_data[i] > hi ? hi : X_data[i]); Y_data[i] = x == T(1) ? 
std::numeric_limits::infinity() : (x / (T(1) - x)); @@ -552,10 +553,10 @@ static void erfcx_kernel(TensorIteratorBase& iter){ scalar_t buffer[WIDTH]; \ int64_t width = WIDTH; \ width = std::min(width, n - i); \ - for (int64_t j = 0; j < width; j++) \ + for (const auto j : c10::irange(width))\ buffer[j] = in_data[in_stride * (i + j)]; \ vml::v##op(buffer, buffer, width); \ - for (int64_t j = 0; j < width; j++) \ + for (const auto j : c10::irange(width))\ out_data[out_stride * (i + j)] = buffer[j]; \ } \ } \ diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index 862c83502e46e5..cc3a6b68d43e00 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include namespace at { @@ -46,7 +47,7 @@ static void unfolded2d_acc( int64_t output_height, int64_t output_width) { at::parallel_for(0, n_input_plane, 0, [&](int64_t start, int64_t end) { - for (auto nip = start; nip < end; nip++) { + for (const auto nip : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t kw, kh, y, x; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -172,7 +173,7 @@ static void unfolded2d_copy( int64_t output_width) { at::parallel_for( 0, (int64_t)n_input_plane * kH * kW, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) { + for (const auto k : c10::irange(start, end)) { int64_t nip = k / (kH * kW); int64_t rest = k % (kH * kW); int64_t kh = rest / kW; diff --git a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp index 00367c5fd10380..b226b68bbca15a 100644 --- a/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp +++ b/aten/src/ATen/native/cpu/UnfoldBackwardKernel.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #if (defined(_WIN32) || defined(_WIN64)) #define RESTRICT __restrict @@ -77,7 +78,8 @@ void _unfold_backward_internal_kernel( if 
(is_step_ge_size) { auto* RESTRICT idx_last_dim_ptr = data[3]; - for (int64_t elem = 0; elem < nelems; ++elem) { + for (const auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning auto* RESTRICT grad_out_data = reinterpret_cast(grad_out_ptr); auto* RESTRICT grad_in_data = reinterpret_cast(grad_in_ptr); @@ -94,7 +96,8 @@ void _unfold_backward_internal_kernel( } } else { - for (int64_t elem = 0; elem < nelems; ++elem) { + for (const auto elem : c10::irange(nelems)) { + (void)elem; //Suppress unused variable warning auto* RESTRICT grad_out_data = reinterpret_cast(grad_out_ptr); auto* RESTRICT grad_in_data = reinterpret_cast(grad_in_ptr); diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index ad54778229d026..2e5c6b300dde97 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace at { namespace native { @@ -45,7 +46,7 @@ struct Interpolate { scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; scalar_t t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); scalar_t output = t * wts; - for (int j=1; j::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); @@ -62,7 +63,7 @@ struct Interpolate<1, scalar_t, index_t, interp_size> { scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; scalar_t t = *(scalar_t *)&src[ids]; scalar_t output = t * wts; - for (int j=1; j static inline void basic_loop(char** data, const int64_t* strides, int64_t n) { char* dst = data[0]; char* src = data[1]; - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { *(scalar_t*)&dst[i * strides[0]] = interpolate( src + i * strides[1], &data[2], &strides[2], i); } @@ -297,7 +298,7 @@ void cpu_upsample_nearest_channels_last( int64_t ow = 0; data_index_init(begin, n, num_batches, oh, output_height, ow, output_width); - for (int64_t i = begin; 
i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t ih = nearest_idx(oh, input_height, output_height, scales[0]); int64_t iw = nearest_idx(ow, input_width, output_width, scales[1]); scalar_t* output_ptr = output_data + i * channels; @@ -315,7 +316,7 @@ void cpu_upsample_nearest_channels_last( int64_t ow = 0; data_index_init(begin, n, num_batches, od, output_depth, oh, output_height, ow, output_width); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t id = nearest_idx(od, input_depth, output_depth, scales[0]); int64_t ih = nearest_idx(oh, input_height, output_height, scales[1]); int64_t iw = nearest_idx(ow, input_width, output_width, scales[2]); @@ -390,11 +391,11 @@ void cpu_upsample_linear_channels_last( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ih0, ih1, iw0, iw1; scalar_t h0lambda, h1lambda, w0lambda, w1lambda; - for (int64_t n = begin; n < end; n++) { - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto n : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { compute_source_index_and_lambda( ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); @@ -444,14 +445,14 @@ void cpu_upsample_linear_channels_last( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t id0, id1, ih0, ih1, iw0, iw1; scalar_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda; - for (int64_t n = begin; n < end; n++) { - for (int64_t od = 0; od < output_depth; od++) { + for (const auto n : c10::irange(begin, end)) { + for (const auto od : c10::irange(output_depth)) { compute_source_index_and_lambda( id0, id1, d0lambda, d1lambda, depth_scale, od, input_depth, output_depth, align_corners); - 
for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto oh : c10::irange(output_height)) { compute_source_index_and_lambda( ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); @@ -523,7 +524,8 @@ struct HelperInterpBase { auto new_shape = std::vector(ndims, 1); new_shape[reshape_dim] = output_size; - for (int j=0; j()))); output.emplace_back(empty(new_shape, CPU(output_type))); } @@ -543,7 +545,8 @@ struct HelperInterpNearest : public HelperInterpBase { auto new_shape = std::vector(ndims, 1); new_shape[reshape_dim] = output_size; - for (int j=0; j()))); // Defines weights for consistency, but not used output.emplace_back(at::ones(new_shape, CPU(output_type))); @@ -578,7 +581,7 @@ struct HelperInterpNearest : public HelperInterpBase { auto input_index_ptr = output[0].data_ptr(); int64_t input_index; - for (int64_t i=0; i( scale, i, /*align_corners=*/true, /*cubic=*/false); input_index = static_cast(floorf(real_input_index)); @@ -625,7 +628,7 @@ struct HelperInterpLinear : public HelperInterpBase { auto input_index1_ptr = output[2].data_ptr(); auto lambda1_ptr = output[3].data_ptr(); - for (int64_t i=0; i( input_index0_ptr[i], input_index1_ptr[i], @@ -683,14 +686,14 @@ struct HelperInterpCubic : public HelperInterpBase { int64_t * idx_ptr; scalar_t * wt_ptr; - for (int64_t i=0; i( scale, i, align_corners, /*cubic=*/true); input_index = static_cast(floorf(real_input_index)); get_cubic_upsample_coefficients(coeffs, real_input_index - input_index); - for (int j=0; j(); idx_ptr[i] = static_cast(std::max(std::min(input_index + j - 1, input_size - 1), zero)) * stride; wt_ptr = output[2 * j + 1].data_ptr(); @@ -728,7 +731,7 @@ void upsample_generic_Nd_kernel_impl( ); 
TORCH_INTERNAL_ASSERT(strides.size() == 2 + out_ndims); - for (int i=0; i #include #include +#include namespace at { namespace native { @@ -55,8 +56,8 @@ void cpu_upsample_linear_backward( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t iw0, iw1; scalar_t w0lambda, w1lambda; - for (int64_t c = begin; c < end; c++){ - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto c : c10::irange(begin, end)) { + for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); scalar_t grad_output_value = grad_output_data[c * output_slice_size + ow]; @@ -79,11 +80,11 @@ void cpu_upsample_linear_backward( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t ih0, ih1, iw0, iw1; scalar_t h0lambda, h1lambda, w0lambda, w1lambda; - for (int64_t c = begin; c < end; c++) { - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto c : c10::irange(begin, end)) { + for (const auto oh : c10::irange(output_height)) { compute_source_index_and_lambda( ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); scalar_t grad_output_value = grad_output_data[c * output_slice_size + oh * output_width + ow]; @@ -112,14 +113,14 @@ void cpu_upsample_linear_backward( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t id0, id1, ih0, ih1, iw0, iw1; scalar_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda; - for (int64_t c = begin; c < end; c++) { - for (int64_t od = 0; od < output_depth; od++) { + for (const auto c : c10::irange(begin, end)) { + for (const auto od : c10::irange(output_depth)) { compute_source_index_and_lambda( id0, id1, d0lambda, d1lambda, depth_scale, od, 
input_depth, output_depth, align_corners); - for (int64_t oh = 0; oh < output_height; oh++) { + for (const auto oh : c10::irange(output_height)) { compute_source_index_and_lambda( ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners); - for (int64_t ow = 0; ow < output_width; ow++) { + for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); scalar_t grad_output_value = grad_output_data[c * output_slice_size + diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 75037606d3ff46..302edc1e1d0ae5 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace at { namespace native { namespace { @@ -42,7 +43,7 @@ void batch_norm_cpu_collect_linear_and_constant_terms( /// the constant term beta(c) = bias(c) - mean(c) * inv_var(c) * weight(c) /// Note that this is only a good idea if (input_size >> c), in degenerate /// cases where image_size == 1 && batch_size == 1, it is slow. 
- for (int64_t c = 0; c < n_channel; c++) { + for (const auto c : c10::irange(n_channel)) { scalar_t mean, invstd; if (train) { mean = save_mean_a[c]; @@ -90,7 +91,7 @@ void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, int64_t c = 0; data_index_init(begin, n, n_batch, c, n_channel); - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { const Vec alpha_vec(alpha_data[c]); const Vec beta_vec(beta_data[c]); int64_t offset = i * image_size; @@ -113,7 +114,7 @@ void batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, // image_size == 1 const int64_t loop_size = n_channel - (n_channel % Vec::size()); at::parallel_for(0, n_batch, 1, [&](int64_t begin, int64_t end) { - for (int64_t n = begin; n < end; n++) { + for (const auto n : c10::irange(begin, end)) { int64_t offset = n * n_channel; int64_t d = 0; for (; d < loop_size; d += Vec::size()) { @@ -161,7 +162,7 @@ void batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) const int64_t loop_size = n_channel - (n_channel % Vec::size()); at::parallel_for(0, n_batch * image_size, 1, [&](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { int64_t offset = i * n_channel; int64_t d = 0; // vectorize on channel dimension, for normal batch_norm input size, @@ -200,11 +201,11 @@ void batch_norm_cpu_collect_stats_contiguous_impl( // parallel dim reduce on 'channel' at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { // compute mean per input accscalar_t sum = 0; - for (int64_t n = 0; n < n_batch; n++) { - for (int64_t i = 0; i < image_size; i++) { + for (const auto n : c10::irange(n_batch)) { + for (const auto i : c10::irange(image_size)) { auto offset = n * n_channel * image_size + c * image_size + i; sum 
+= input_data[offset]; } @@ -214,8 +215,8 @@ void batch_norm_cpu_collect_stats_contiguous_impl( // compute variance per input accscalar_t _var_sum = 0; - for (int64_t n = 0; n < n_batch; n++) { - for (int64_t i = 0; i < image_size; i++) { + for (const auto n : c10::irange(n_batch)) { + for (const auto i : c10::irange(image_size)) { auto offset = n * n_channel * image_size + c * image_size + i; auto x = input_data[offset]; _var_sum += (x - mean) * (x - mean); @@ -259,7 +260,7 @@ void batch_norm_cpu_collect_stats_channels_last_impl( TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); scalar_t* buffer_ptr = buffer_data + tid * n_channel; - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { const scalar_t* x_ptr = input_data + i * n_channel; vec::map2( [](Vec x, Vec y) { return x + y; }, @@ -271,9 +272,9 @@ void batch_norm_cpu_collect_stats_channels_last_impl( }); at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { accscalar_t sum = 0; - for (int64_t t = 0; t < num_threads; t++) { + for (const auto t : c10::irange(num_threads)) { sum += buffer_data[t * n_channel + c]; } scalar_t mean = sum / N; @@ -287,7 +288,7 @@ void batch_norm_cpu_collect_stats_channels_last_impl( int tid = at::get_thread_num(); TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); scalar_t* buffer_ptr = buffer_data + tid * n_channel; - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { const scalar_t* x_ptr = input_data + i * n_channel; vec::map3( [](Vec x, Vec y, Vec mean) { return y + (x - mean) * (x - mean); }, @@ -300,9 +301,9 @@ void batch_norm_cpu_collect_stats_channels_last_impl( }); at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c 
: c10::irange(begin, end)) { accscalar_t _var_sum = 0; - for (int64_t t = 0; t < num_threads; t++) { + for (const auto t : c10::irange(num_threads)) { _var_sum += buffer_data[t * n_channel + c]; } var_sum_data[c] = _var_sum; @@ -341,7 +342,7 @@ void batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_we // parallel dim reduce on 'channel' at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { scalar_t w = weight.defined() ? weight_a[c] : 1; scalar_t mean, invstd; @@ -359,7 +360,7 @@ void batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_we // accscalar_t sum = 0; accscalar_t dotp = 0; - for (int64_t n = 0; n < n_batch; n++) { + for (const auto n : c10::irange(n_batch)) { const scalar_t* x_ptr = input_data + n * n_channel * image_size + c * image_size; const scalar_t* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; @@ -381,13 +382,13 @@ void batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_we scalar_t k = (scalar_t) dotp * invstd * invstd / N; scalar_t grad_mean = sum / N; - for (int64_t n = 0; n < n_batch; n++) { + for (const auto n : c10::irange(n_batch)) { const scalar_t* x_ptr = input_data + n * n_channel * image_size + c * image_size; scalar_t* dx_ptr = grad_input_data + n * n_channel * image_size + c * image_size; const scalar_t* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; // Scalar math: - // for (int64_t j = 0; j < image_size; ++j) { + // for (const auto j : c10::irange(image_size)) { // scalar_t dx = (x_ptr[j] - mean) * k; // dx_ptr[j] = (dy_ptr[j] - grad_mean - dx) * invstd * w; // } @@ -402,12 +403,12 @@ void batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_we image_size); } } else { // evaluation mode - for (int64_t n = 0; n < n_batch; n++) { + for (const auto n : c10::irange(n_batch)) { scalar_t* dx_ptr = 
grad_input_data + n * n_channel * image_size + c * image_size; const scalar_t* dy_ptr = grad_output_data + n * n_channel * image_size + c * image_size; // Scalar math: - // for (int64_t j = 0; j < image_size; ++j) { + // for (const auto j : c10::irange(image_size)) { // dx_ptr[j] = dy_ptr[j] * invstd * w; // } vec::map( @@ -467,7 +468,7 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad invstd.resize_({n_channel}); invstd_ptr = invstd.data_ptr(); - for (int64_t c = 0; c < n_channel; c++) { + for (const auto c : c10::irange(n_channel)) { invstd_ptr[c] = 1 / std::sqrt(running_var_data[c] + eps); } } @@ -491,7 +492,7 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad TORCH_CHECK(tid < num_threads, "expect thread id smaller than ", num_threads, ", got thread id ", tid); scalar_t* sum_ptr = sum_data + tid * n_channel; scalar_t* dotp_ptr = dotp_data + tid * n_channel; - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { const scalar_t* x_ptr = input_data + i * n_channel; const scalar_t* dy_ptr = grad_output_data + i * n_channel; @@ -514,17 +515,17 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad }); at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { - for (int64_t c = begin; c < end; c++) { + for (const auto c : c10::irange(begin, end)) { // store the final result of sum and dotp in the 1st lane of immediate buffer, // so that we won't need to allocate anther buffer to store the temp values. 
accscalar_t _sum = 0; - for (int64_t t = 0; t < num_threads; t++) { + for (const auto t : c10::irange(num_threads)) { _sum += sum_data[t * n_channel + c]; } sum_data[/* 0 * n_channel + */c] = _sum; accscalar_t _dotp = 0; - for (int64_t t = 0; t < num_threads; t++) { + for (const auto t : c10::irange(num_threads)) { _dotp += dotp_data[t * n_channel + c]; } dotp_data[/* 0 * n_channel + */c] = _dotp; @@ -535,7 +536,7 @@ void batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad const int64_t loop_size = n_channel - (n_channel % Vec::size()); if (grad_input.defined()) { at::parallel_for(0, N, 1, [&](int64_t begin, int64_t end) { - for (int64_t i = begin; i < end; i++) { + for (const auto i : c10::irange(begin, end)) { scalar_t* dx_ptr = grad_input_data + i * n_channel; const scalar_t* x_ptr = input_data + i * n_channel; const scalar_t* dy_ptr = grad_output_data + i * n_channel; diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index fb8db7e61800fa..6f98b58a3c0e5a 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace at { namespace native { @@ -44,7 +45,7 @@ void GroupNormKernelImplInternal( const int64_t inner_size = D * HxW; at::parallel_for(0, N * G, 1, [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { const T* X_ptr = X_data + i * inner_size; T mean_val; T rstd_val; @@ -52,18 +53,18 @@ void GroupNormKernelImplInternal( rstd_val = T(1) / std::sqrt(std::max(rstd_val, T(0)) + eps); if (gamma_null && beta_null) { T* Y_ptr = Y_data + i * inner_size; - for (int j = 0; j < inner_size; ++j) { + for (const auto j : c10::irange(inner_size)) { Y_ptr[j] = (X_ptr[j] - mean_val) * rstd_val; } } else { const int64_t g = i % G; - for (int64_t j = 0; j < D; ++j) { + for (const auto j : c10::irange(D)) { const 
int64_t c = g * D + j; const T scale = rstd_val * (gamma_null ? T(1) : gamma_data[c]); const T bias = -scale * mean_val + (beta_null ? T(0) : beta_data[c]); X_ptr = X_data + (i * D + j) * HxW; T* Y_ptr = Y_data + (i * D + j) * HxW; - for (int64_t k = 0; k < HxW; ++k) { + for (const auto k : c10::irange(HxW)) { Y_ptr[k] = scale * X_ptr[k] + bias; } } @@ -110,10 +111,10 @@ void GroupNormKernelImplChannelsLastInternal( at::parallel_for(0, N, 1, [&](int64_t start, int64_t end) { constexpr int64_t K = Vec::size(); const int64_t inner_size = C / K * K; - for (int64_t n = start; n < end; ++n) { + for (const auto n : c10::irange(start, end)) { T* mean_ptr = buffer_data + n * 2 * C; T* rstd_ptr = mean_ptr + C; - for (int64_t i = 0; i < HxW; ++i) { + for (const auto i : c10::irange(HxW)) { const T* X_ptr = X_data + n * HxW * C + i * C; for (int64_t j = 0; j < inner_size; j += K) { const Vec x_vec = Vec::loadu(X_ptr + j); @@ -122,16 +123,16 @@ void GroupNormKernelImplChannelsLastInternal( mean_vec.store(mean_ptr + j); rstd_vec.store(rstd_ptr + j); } - for (int64_t j = inner_size; j < C; ++j) { + for (const auto j : c10::irange(inner_size, C)) { mean_ptr[j] += X_ptr[j]; rstd_ptr[j] += X_ptr[j] * X_ptr[j]; } } - for (int64_t g = 0; g < G; ++g) { + for (const auto g : c10::irange(G)) { T mean_val = T(0); T rstd_val = T(0); - for (int64_t d = 0; d < D; ++d) { + for (const auto d : c10::irange(D)) { mean_val += mean_ptr[g * D + d]; rstd_val += rstd_ptr[g * D + d]; } @@ -141,7 +142,7 @@ void GroupNormKernelImplChannelsLastInternal( // continue to use the temp buffer for mean and rstd value, // so that we can vectorize the following math on entire C dimension. - for (int64_t d = 0; d < D; ++d) { + for (const auto d : c10::irange(D)) { mean_ptr[g * D + d] = mean_val; rstd_ptr[g * D + d] = rstd_val; } @@ -152,7 +153,7 @@ void GroupNormKernelImplChannelsLastInternal( // expand gamma_null and beta_null to reduce if-else on critial path. 
if (!gamma_null && !beta_null) { - for (int64_t i = 0; i < HxW; ++i) { + for (const auto i : c10::irange(HxW)) { const T* X_ptr = X_data + n * HxW * C + i * C; T* Y_ptr = Y_data + n * HxW * C + i * C; for (int64_t j = 0; j < inner_size; j += K) { @@ -161,14 +162,14 @@ void GroupNormKernelImplChannelsLastInternal( Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; y_vec.store(Y_ptr + j); } - for (int64_t j = inner_size; j < C; ++j) { + for (const auto j : c10::irange(inner_size, C)) { T scale = rstd_ptr[j] * gamma_data[j]; T bias = -scale * mean_ptr[j] + beta_data[j]; Y_ptr[j] = scale * X_ptr[j] + bias; } } } else if (gamma_null && beta_null) { - for (int64_t i = 0; i < HxW; ++i) { + for (const auto i : c10::irange(HxW)) { const T* X_ptr = X_data + n * HxW * C + i * C; T* Y_ptr = Y_data + n * HxW * C + i * C; for (int64_t j = 0; j < inner_size; j += K) { @@ -176,13 +177,13 @@ void GroupNormKernelImplChannelsLastInternal( Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) - scale_vec * Vec::loadu(mean_ptr + j); y_vec.store(Y_ptr + j); } - for (int64_t j = inner_size; j < C; ++j) { + for (const auto j : c10::irange(inner_size, C)) { T scale = rstd_ptr[j]; Y_ptr[j] = scale * X_ptr[j] -scale * mean_ptr[j]; } } } else { - for (int64_t i = 0; i < HxW; ++i) { + for (const auto i : c10::irange(HxW)) { const T* X_ptr = X_data + n * HxW * C + i * C; T* Y_ptr = Y_data + n * HxW * C + i * C; for (int64_t j = 0; j < inner_size; j += K) { @@ -193,7 +194,7 @@ void GroupNormKernelImplChannelsLastInternal( Vec y_vec = scale_vec * Vec::loadu(X_ptr + j) + bias_vec; y_vec.store(Y_ptr + j); } - for (int64_t j = inner_size; j < C; ++j) { + for (const auto j : c10::irange(inner_size, C)) { T scale = rstd_ptr[j] * (gamma_null ? T(1) : gamma_data[j]); T bias = -scale * mean_ptr[j] + (beta_null ? 
T(0) : beta_data[j]); Y_ptr[j] = scale * X_ptr[j] + bias; @@ -252,7 +253,7 @@ void ComputeInternalGradients( std::array ds_arr; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) std::array db_arr; - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { const T* dY_ptr = dY + i * HxW; const T* X_ptr = X + i * HxW; vec::Vectorized ds_vec(0); @@ -267,7 +268,7 @@ void ComputeInternalGradients( db_vec.store(db_arr.data()); T ds_val = std::accumulate(ds_arr.cbegin(), ds_arr.cend(), T(0)); T db_val = std::accumulate(db_arr.cbegin(), db_arr.cend(), T(0)); - for (int64_t j = inner_size; j < HxW; ++j) { + for (const auto j : c10::irange(inner_size, HxW)) { ds_val += dY_ptr[j] * X_ptr[j]; db_val += dY_ptr[j]; } @@ -302,7 +303,7 @@ void GroupNormInputBackward( std::array ds_arr; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) std::array db_arr; - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { const int64_t g = i % G; const T* ds_ptr = ds + i * D; const T* db_ptr = db + i * D; @@ -319,7 +320,7 @@ void GroupNormInputBackward( db_vec.store(db_arr.data()); T ds_val = std::accumulate(ds_arr.cbegin(), ds_arr.cend(), T(0)); T db_val = std::accumulate(db_arr.cbegin(), db_arr.cend(), T(0)); - for (int64_t j = d; j < D; ++j) { + for (const auto j : c10::irange(d, D)) { const T gamma_v = gamma_null ? T(1) : gamma[g * D + j]; ds_val += ds_ptr[j] * gamma_v; db_val += db_ptr[j] * gamma_v; @@ -327,13 +328,13 @@ void GroupNormInputBackward( const T c2 = (db_val * mean[i] - ds_val) * rstd[i] * rstd[i] * rstd[i] * s; const T c3 = -c2 * mean[i] - db_val * rstd[i] * s; - for (int64_t j = 0; j < D; ++j) { + for (const auto j : c10::irange(D)) { const int64_t c = g * D + j; const T* dY_ptr = dY + (i * D + j) * HxW; const T* X_ptr = X + (i * D + j) * HxW; T* dX_ptr = dX + (i * D + j) * HxW; const T c1 = rstd[i] * (gamma_null ? 
T(1) : gamma[c]); - for (int64_t k = 0; k < HxW; ++k) { + for (const auto k : c10::irange(HxW)) { dX_ptr[k] = c1 * dY_ptr[k] + c2 * X_ptr[k] + c3; } } @@ -355,14 +356,14 @@ void GammaBackward( const int64_t D = C / G; constexpr int64_t K = vec::Vectorized::size(); at::parallel_for(0, D, K, [=](int64_t start, int64_t end) { - for (int64_t i = 0; i < G; ++i) { + for (const auto i : c10::irange(G)) { std::memset(dgamma + i * D + start, 0, (end - start) * sizeof(T)); } for (int64_t i = 0; i < N * G; ++i) { const T* ds_ptr = ds + i * D; const T* db_ptr = db + i * D; const int64_t g = i % G; - for (int64_t j = start; j < end; ++j) { + for (const auto j : c10::irange(start, end)) { const int64_t c = g * D + j; dgamma[c] += (ds_ptr[j] - db_ptr[j] * mean[i]) * rstd[i]; } @@ -375,9 +376,9 @@ void BetaBackward(int64_t N, int64_t C, const T* db, T* dbeta) { constexpr int64_t K = vec::Vectorized::size(); at::parallel_for(0, C, K, [=](int64_t start, int64_t end) { std::memset(dbeta + start, 0, (end - start) * sizeof(T)); - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { const T* db_ptr = db + i * C; - for (int64_t j = start; j < end; ++j) { + for (const auto j : c10::irange(start, end)) { dbeta[j] += db_ptr[j]; } } diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp index a065d3b61b730d..887b7a1dcdc9ff 100644 --- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace at { namespace native { @@ -40,7 +41,7 @@ void LayerNormKernelImplInternal( const bool gamma_null = gamma_data == nullptr; const bool beta_null = beta_data == nullptr; at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { const T* X_ptr = X_data + i * N; T* Y_ptr = Y_data + i * N; T mean_val; @@ -50,7 +51,7 @@ void 
LayerNormKernelImplInternal( const T_ACC scale = rstd_val; const T_ACC bias = -rstd_val * mean_val; if (gamma_null || beta_null) { - for (int64_t j = 0; j < N; ++j) { + for (const auto j : c10::irange(N)) { const T gamma_v = gamma_null ? T(1) : gamma_data[j]; const T beta_v = beta_null ? T(0) : beta_data[j]; Y_ptr[j] = (X_ptr[j] * scale + bias) * gamma_v + beta_v; @@ -153,14 +154,14 @@ void LayerNormBackwardKernelImplInternal( T* dgamma_buffer_ptr = dgamma_null ? nullptr : buffer_data + tid * N; T* dbeta_buffer_ptr = dbeta_null ? nullptr : buffer_data + num_threads * N + tid * N; - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { const T* dY_ptr = dY_data + i * N; const T* X_ptr = X_data + i * N; if (!dgamma_null) { const T_ACC a = rstd_data[i]; const T_ACC b = -a * mean_data[i]; // Scalar math: - // for (int64_t j = 0; j < N; ++j) { + // for (const auto j : c10::irange(N)) { // dgamma_data[j] += dY_ptr[j] * (a * X_ptr[j] + b); // } vec::map3( @@ -175,7 +176,7 @@ void LayerNormBackwardKernelImplInternal( } if (!dbeta_null) { // Scalar math: - // for (int64_t j = 0; j < N; ++j) { + // for (const auto j : c10::irange(N)) { // dbeta_data[j] += dY_ptr[j]; // } vec::map2( @@ -190,7 +191,7 @@ void LayerNormBackwardKernelImplInternal( T_ACC ds = T_ACC(0); T_ACC db = T_ACC(0); // Scalar math: - // for (int64_t j = 0; j < N; ++j) { + // for (const auto j : c10::irange(N)) { // const T gamma_v = gamma_null ? T(1) : gamma_data[j]; // ds += dY_ptr[j] * X_ptr[j] * gamma_v; // db += dY_ptr[j] * gamma_v; @@ -223,7 +224,7 @@ void LayerNormBackwardKernelImplInternal( const T_ACC b = (db * mean_data[i] - ds) * a * a * a * scale; const T_ACC c = -b * mean_data[i] - db * a * scale; // Scalar math: - // for (int64_t j = 0; j < N; ++j) { + // for (const auto j : c10::irange(N)) { // const T gamma_v = gamma_null ? 
T(1) : gamma_data[j]; // dX_ptr[j] = a * dY_ptr[j] * gamma_v + b * X_ptr[j] + c; // } @@ -254,10 +255,10 @@ void LayerNormBackwardKernelImplInternal( // Second path of dgamma/dbeta if (buffer_data != nullptr) { parallel_for(0, N, 1, [&](int64_t start, int64_t end) { - for (int64_t j = start; j < end; ++j) { + for (const auto j : c10::irange(start, end)) { T_ACC dgamma_v = T_ACC(0); T_ACC dbeta_v = T_ACC(0); - for (int64_t i = 0; i < num_threads; ++i) { + for (const auto i : c10::irange(num_threads)) { dgamma_v += buffer_data[i * N + j]; dbeta_v += buffer_data[num_threads * N + i * N + j]; } diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h index 786957c2b914cc..9a0c883bfb961d 100644 --- a/aten/src/ATen/native/cpu/moments_utils.h +++ b/aten/src/ATen/native/cpu/moments_utils.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace at { namespace native { @@ -69,12 +70,12 @@ std::pair RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { c10::SmallVector m1_stk(depth, kZeroVec); c10::SmallVector m2_stk(depth, kZeroVec); - for (int64_t i = 0; i < m; ++i) { + for (const auto i : c10::irange(m)) { const T* X_ptr = X + i * kChunkSize * kVecSize; const int64_t m0 = std::min(kChunkSize, n - i * kChunkSize); Vec m1_vec(0); Vec m2_vec(0); - for (int64_t j = 0; j < m0; ++j) { + for (const auto j : c10::irange(m0)) { const Vec x_vec = Vec::loadu(X_ptr + j * kVecSize); const Vec delta_vec = x_vec - m1_vec; const Vec c_vec = Vec(T(1) / static_cast(j + 1)); @@ -97,7 +98,7 @@ std::pair RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { mask >>= 1; } } - for (int64_t i = 1; i < depth; ++i) { + for (const auto i : c10::irange(1, depth)) { AddMomentsVec( m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]); } @@ -116,7 +117,7 @@ std::pair RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { m1 += delta / static_cast(m0); m2 += delta * (X[i] - m1); } - for (int64_t i = 0; i < kVecSize; 
++i) { + for (const auto i : c10::irange(kVecSize)) { AddMoments(n, m1_arr[i], m2_arr[i], m0, m1, m2); } diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index 6337e130d66a2c..ad7ca2ac5a1425 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -275,7 +275,7 @@ class CuFFTConfig { "cuFFT doesn't support signals of half type with compute " "capability less than SM_53, but the device containing input half " "tensor only has SM_", dev_prop->major, dev_prop->minor); - for (int64_t i = 0; i < signal_ndim; i++) { + for (const auto i : c10::irange(signal_ndim)) { TORCH_CHECK(is_pow_of_two(sizes[i + 1]), "cuFFT only supports dimensions whose sizes are powers of two when" " computing in half precision, but got a signal size of", diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 941513b885249a..95fef7d09150b7 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -229,7 +230,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ const auto batch_size = input.sizes()[0]; DimVector signal_size(signal_ndim + 1); signal_size[0] = batch_size; - for (int64_t i = 0; i < signal_ndim; ++i) { + for (const auto i : c10::irange(signal_ndim)) { auto in_size = input.sizes()[i + 1]; auto out_size = out_sizes[dim[i]]; signal_size[i + 1] = std::max(in_size, out_size); @@ -241,7 +242,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ batched_sizes[0] = batch_size; DimVector batched_out_sizes(batched_sizes.begin(), batched_sizes.end()); - for (size_t i = 0; i < dim.size(); ++i) { + for (const auto i : c10::irange(dim.size())) { batched_out_sizes[i + 1] = out_sizes[dim[i]]; } out.resize_(batched_out_sizes, MemoryFormat::Contiguous); @@ -303,7 +304,7 
@@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ out_strides[dim_permute[i]] = batch_numel * out.strides()[0]; batch_numel *= out_sizes[dim_permute[i]]; } - for (int64_t i = batch_dims; i < ndim; ++i) { + for (const auto i : c10::irange(batch_dims, ndim)) { out_strides[dim_permute[i]] = out.strides()[1 + (i - batch_dims)]; } return out.as_strided_(out_sizes, out_strides, out.storage_offset()); diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index 87a0b34e96a308..673dfb35c28d98 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -204,7 +205,7 @@ size_t getMaxWorkspaceSize( THCudaCheck(cudaGetDevice(&device)); c10::cuda::CUDACachingAllocator::cacheInfo(device, &tmp_bytes, &max_block_size); - for (int i = 0; i < n_algo; i++) { + for (const auto i : c10::irange(n_algo)) { cudnnStatus_t err; size_t sz; err = getWorkspaceSize(args, algo[i], &sz); @@ -229,7 +230,7 @@ std::vector getValidAlgorithms(perf_t *perfResults, const ConvolutionArg std::vector result; result.reserve(n_algo); - for (int i = 0; i < n_algo; i++) { + for (const auto i : c10::irange(n_algo)) { perf_t perf = perfResults[i]; // TODO: Shouldn't all returned results be successful? 
@@ -579,7 +580,7 @@ static inline void split_batch_dim_to_32bit_out( int64_t split_size = std::max(max_worksize / max_inner_size, 1L); int64_t num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { - for (int64_t i = 0; i < num_splits; i++) { + for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); Tensor input_ = input.narrow(0, start, split_size_); @@ -805,7 +806,7 @@ void raw_cudnn_convolution_backward_weight_out( int64_t split_size = std::max(1024 * 1024 * 512 / max_inner_size, 1L); int64_t num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { - for (int64_t i = 0; i < num_splits; i++) { + for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); Tensor input_ = input.narrow(0, start, split_size_); diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index c5960f6a1c5594..38bde06aa6cc0c 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -30,6 +30,7 @@ std::tuple cudnn_grid_sampler_backward( #include #include +#include // TODO: descriptor checking @@ -41,7 +42,7 @@ namespace { void setSamplerDescriptor(SpatialTransformerDescriptor& desc, cudnnDataType_t dataType, const at::Tensor& tensor) { int inputSize[4] = {0}; - for (int i = 0; i < tensor.dim(); ++i) { + for (const auto i : c10::irange(tensor.dim())) { inputSize[i] = (int) tensor.size(i); } desc.set(dataType, 4, inputSize); diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index 543083bd73a811..37c5277428b7f4 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -34,6 +34,7 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor #include #include +#include namespace at { 
namespace native { @@ -57,7 +58,7 @@ bool _use_cudnn_ctc_loss( for (const auto input_length : input_lengths) { use_cudnn &= ((input_length == max_input_length) ? 1 : 0); } - for (size_t b = 0; b < target_lengths.size(); b++) { + for (const auto b : c10::irange(target_lengths.size())) { // target length < 256 is documented, but we see illegal memory accesses // when target lengths > input lengths for CuDNN use_cudnn &= diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 8eceed0212ec31..639c741685b6d9 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #if !AT_CUDNN_ENABLED() @@ -191,7 +192,7 @@ namespace { std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { std::vector descriptors(N); - for (int64_t i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { descriptors[i].set(tensor, 5); } return descriptors; @@ -470,10 +471,10 @@ namespace { int64_t num_layers = rnn.num_directions() * rnn.num_layers; size_t cur_offset = 0; size_t global_layer_params_count = 0; - for (int64_t layer = 0; layer < num_layers; layer++) { + for (const auto layer : c10::irange(num_layers)) { size_t layer_params_count = 0; for (auto cudnn_method : cudnn_methods) { - for (int64_t linear_id = 0; linear_id < num_linear_layers; linear_id++) { + for (const auto linear_id : c10::irange(num_linear_layers)) { FilterDescriptor lin_layer_mat_desc; void* matrix_pointer; AT_CUDNN_CHECK(cudnn_method( @@ -566,7 +567,7 @@ namespace { } else { data_ptrs.reserve(num_dir_layers * 2 * 2); } - for (int64_t layer = 0; layer < num_dir_layers; layer++) { + for (const auto layer : c10::irange(num_dir_layers)) { for (auto cudnn_method : cudnn_methods) { // This API returns a separate pointer for weight of every gate, // but we represent them as a single tensor, so we're only interested @@ -629,7 +630,7 @@ namespace { void _viewOrCopyParams(MatrixRef params_from, MatrixRef 
params_to, bool copy, bool allow_type_change=false) { TORCH_INTERNAL_ASSERT(params_from.size(0) == params_to.size(0), "number of layers mismatch"); - for (size_t i = 0; i < params_from.size(0); i++) { + for (const auto i : c10::irange(params_from.size(0))) { auto layer_params_from = params_from[i]; auto layer_params_to = params_to[i]; // NOTE: these lists have all weights before all biases, so if the layer @@ -845,7 +846,7 @@ copy_weights_to_flat_buf_views( _viewOrCopyParams(weight, params, /*copy=*/true, allow_type_change); if (set_orig_weights_to_flat_buf) { // Update the storage - for (size_t i = 0; i < weight.size(0); i++) { + for (const auto i : c10::irange(weight.size(0))) { // There is a special case for LSTM with projections and no bias, // where weight copy is done in 0->0, 1->1, 2->4 layout if (weight[i].size() == 3 && params[i].size() == 5) { @@ -1525,7 +1526,7 @@ Tensor try_get_weight_buf( AT_ASSERT(num_ptrs % 5 == 0); if (has_biases) { AT_ASSERT(num_ptrs == num_parameters); - for (int64_t i = 0; i < num_parameters; i++) { + for (const auto i : c10::irange(num_parameters)) { if (expected_data_ptrs[i] != parameters[i].data_ptr()) return {}; } } else { diff --git a/aten/src/ATen/native/im2col.h b/aten/src/ATen/native/im2col.h index d4b7ecce3b3a06..854052145d5430 100644 --- a/aten/src/ATen/native/im2col.h +++ b/aten/src/ATen/native/im2col.h @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -30,15 +31,15 @@ static void im2col( const int64_t width_col = output_width; const int64_t channels_col = channels * kernel_h * kernel_w; - for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + for (const auto c_col : c10::irange(channels_col)) { int64_t w_offset = c_col % kernel_w; int64_t h_offset = (c_col / kernel_w) % kernel_h; int64_t c_im = c_col / kernel_h / kernel_w; - for (int64_t h_col = 0; h_col < height_col; ++h_col) { + for (const auto h_col : c10::irange(height_col)) { int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - 
for (int64_t w_col = 0; w_col < width_col; ++w_col) { + for (const auto w_col : c10::irange(width_col)) { int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; data_col[(c_col * height_col + h_col) * width_col + w_col] = (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) @@ -72,15 +73,15 @@ static void col2im( const int64_t width_col = output_width; const int64_t channels_col = channels * kernel_h * kernel_w; - for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + for (const auto c_col : c10::irange(channels_col)) { int64_t w_offset = c_col % kernel_w; int64_t h_offset = (c_col / kernel_w) % kernel_h; int64_t c_im = c_col / kernel_h / kernel_w; - for (int64_t h_col = 0; h_col < height_col; ++h_col) { + for (const auto h_col : c10::irange(height_col)) { int64_t h_im = h_col * stride_h - pad_h + h_offset * dilation_h; - for (int64_t w_col = 0; w_col < width_col; ++w_col) { + for (const auto w_col : c10::irange(width_col)) { int64_t w_im = w_col * stride_w - pad_w + w_offset * dilation_w; if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) diff --git a/aten/src/ATen/native/metal/MetalShaders.h b/aten/src/ATen/native/metal/MetalShaders.h index 0ee703f2ee261b..c36b99ae7c6f49 100644 --- a/aten/src/ATen/native/metal/MetalShaders.h +++ b/aten/src/ATen/native/metal/MetalShaders.h @@ -516,7 +516,7 @@ kernel void reshape(texture2d_array in_arr[[texture(0), func const ushort n2 = gid.z / slices2; //image index const ushort s2 = gid.z - n2 * slices2; // slice offest half4 value; - for (int idx = 0; idx < 4; ++idx){ + for (const auto idx : c10::irange(4)) { // we compute the "linear index" of the output element, // and convert it to the equivalent "linear index" of the input element. 
ushort offset = 4 * s2 + idx; @@ -590,7 +590,7 @@ kernel void transpose(texture2d_arrayin_arr[[texture(0),func half4 value; ushort4 threadIndexBufferLower{1, 1, 1, 1}; ushort4 threadIndexBufferUpper{1, 1, 1 ,1}; - for (int idx = 0; idx < 4; ++idx){ + for (const auto idx : c10::irange(4)) { ushort offset = 4 * s2 + idx; size_t linear_idx2 = n2 * C2 * H2 * W2 + offset * H2 * W2 + gid.y * W2 + gid.x; if(linear_idx2 >= numel) { @@ -810,8 +810,8 @@ kernel void roi_align(texture2d_array ina[[texture(0), fun constexpr sampler s2(coord::pixel, address::clamp_to_edge, filter::linear); - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { + for (const auto iy : c10::irange(roi_bin_grid_h)) { + for (const auto ix : c10::irange(roi_bin_grid_w)) { // Shift the pixel by 0.5. This is critical to achieve high accuracy. const half y = roi_start_h + ph * bin_size_h + (iy+0.5) * bin_size_h / static_cast(roi_bin_grid_h); diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 967001750c1ec6..d26f0b479debee 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -112,6 +112,7 @@ std::tuple miopen_depthwise_convolution_backwa #include #include +#include #include @@ -255,7 +256,7 @@ struct ParamsHash { std::size_t operator()(const ConvolutionParams& params) const { auto ptr = reinterpret_cast(¶ms); uint32_t value = 0x811C9DC5; - for (int i = 0; i < (int)sizeof(ConvolutionParams); ++i) { + for (const auto i : c10::irange((int)sizeof(ConvolutionParams))) { value ^= ptr[i]; value *= 0x01000193; } diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index ec92c0654d66b6..4bd0a26f353e75 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -8,6 +8,7 @@ #include #include +#include #if !AT_ROCM_ENABLED() @@ -136,7 +137,7 @@ std::vector 
rnn_descriptor_sequence(const Tensor& tensor, IntA std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { std::vector descriptors(N); - for (int64_t i = 0; i < N ; i++) { + for (const auto i : c10::irange(N)) { descriptors[i].set(tensor, 5); } @@ -246,7 +247,7 @@ Tensor permute_wei_for_miopen(Tensor wei, int64_t mode) void _viewOrCopyParams(MatrixRef params_from, MatrixRef params_to, bool copy) { TORCH_CHECK(params_from.size(0) == params_to.size(0), "number of layers mismatch"); - for (size_t i = 0; i < params_from.size(0); i++) { + for (const auto i : c10::irange(params_from.size(0))) { auto layer_params_from = params_from[i]; auto layer_params_to = params_to[i]; // NOTE: these lists have all weights before all biases, so if the layer @@ -268,7 +269,7 @@ void _viewOrCopyParams(MatrixRef params_from, MatrixRef params_t void _copyParams_and_permute(MatrixRef params_from, MatrixRef params_to, int64_t mode) { TORCH_CHECK(params_from.size(0) == params_to.size(0), "number of layers mismatch"); - for (size_t i = 0; i < params_from.size(0); i++) { + for (const auto i : c10::irange(params_from.size(0))) { auto layer_params_from = params_from[i]; auto layer_params_to = params_to[i]; for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); @@ -327,11 +328,11 @@ std::pair, size_t> get_parameters(miopenHandle_t handle, con auto elem_size = dataSize(getMiopenDataType(weight_buf)); auto bias_mode = rnn.bias_mode; - for (int64_t layer = 0; layer < num_layers; layer++) { + for (const auto layer : c10::irange(num_layers)) { size_t layer_params_count = 0; // Get layer params - for (int64_t linear_id = 0; linear_id < num_linear_layers; linear_id++) { + for (const auto linear_id : c10::irange(num_linear_layers)) { FilterDescriptor lin_layer_mat_desc; size_t offset; MIOPEN_CHECK(miopenGetRNNLayerParamOffset( @@ -366,7 +367,7 @@ std::pair, size_t> get_parameters(miopenHandle_t handle, con // Get bias params if (bias_mode == miopenRNNwithBias) { - for (int64_t 
linear_id = 0; linear_id < num_linear_layers; linear_id++) { + for (const auto linear_id : c10::irange(num_linear_layers)) { FilterDescriptor lin_layer_mat_desc; size_t offset; MIOPEN_CHECK(miopenGetRNNLayerBiasOffset( @@ -776,7 +777,7 @@ std::tuple> miopen_rnn_backward( if (output_mask[3]) { dw = at::native::miopen_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, ws); if (mode > 1) { - for (int i = 0; i < dw.size(); i++) { + for (const auto i : c10::irange(dw.size())) { dw[i] = permute_wei_for_miopen(dw[i], mode); } } diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index ee64b3bd3facff..2790f1e8b3f276 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -42,6 +42,7 @@ void mkl_gemm_batched( #else // AT_MKL_ENABLED #include +#include namespace at { namespace native { diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index de10ec90cc87d9..858b7a2601e60f 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -29,7 +29,7 @@ void _fft_fill_with_conjugate_symmetry_slice( // n-dimensions. This advances iter_index by one row, while updating in_ptr // and out_ptr to point to the new row of data. 
auto advance_index = [&] () __ubsan_ignore_undefined__ { - for (size_t i = 1; i < iter_index.size(); ++i) { + for (const auto i : c10::irange(1, iter_index.size())) { if (iter_index[i] + 1 < signal_half_sizes[i]) { ++iter_index[i]; in_ptr += in_strides[i]; @@ -93,7 +93,7 @@ void _fft_fill_with_conjugate_symmetry_slice( while (numel_remaining > 0) { auto end = std::min(signal_half_sizes[0], numel_remaining); out_ptr[0] = std::conj(in_ptr[0]); - for (int64_t i = 1; i < end; ++i) { + for (const auto i : c10::irange(1, end)) { out_ptr[(signal_half_sizes[0] - i) * out_strides[0]] = std::conj(in_ptr[i * in_strides[0]]); } numel_remaining -= end; @@ -448,7 +448,7 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, const auto batch_size = input.sizes()[0]; DimVector signal_size(signal_ndim + 1); signal_size[0] = batch_size; - for (int64_t i = 0; i < signal_ndim; ++i) { + for (const auto i : c10::irange(signal_ndim)) { auto in_size = input.sizes()[i + 1]; auto out_size = out_sizes[dim[i]]; signal_size[i + 1] = std::max(in_size, out_size); @@ -460,7 +460,7 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, batched_sizes[0] = batch_size; DimVector batched_out_sizes(batched_sizes.begin(), batched_sizes.end()); - for (size_t i = 0; i < dim.size(); ++i) { + for (const auto i : c10::irange(dim.size())) { batched_out_sizes[i + 1] = out_sizes[dim[i]]; } @@ -485,7 +485,7 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, out_strides[dim_permute[i]] = batch_numel * out.strides()[0]; batch_numel *= out_sizes[dim_permute[i]]; } - for (int64_t i = batch_dims; i < ndim; ++i) { + for (const auto i : c10::irange(batch_dims, ndim)) { out_strides[dim_permute[i]] = out.strides()[1 + (i - batch_dims)]; } out.as_strided_(out_sizes, out_strides, out.storage_offset()); diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index e6945e52da904f..fac656fea0d4c7 
100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -229,7 +229,7 @@ static Tensor _mkldnn_pooling( false /*ceil_mode */); all_equal = true; - for (size_t i = 2; i < input.sizes().size(); ++i) { + for (const auto i : c10::irange(2, input.sizes().size())) { if (output_sizes[i] < output_sizes_ceil[i]) { padding_vec_r[i - 2]++; all_equal = false; @@ -318,7 +318,7 @@ static Tensor _mkldnn_pooling_backward( false /*ceil_mode */); all_equal = true; - for (size_t i = 2; i < input.sizes().size(); ++i) { + for (const auto i : c10::irange(2, input.sizes().size())) { if (output_sizes[i] < output_sizes_ceil[i]) { padding_vec_r[i - 2]++; all_equal = false; @@ -479,7 +479,7 @@ Tensor mkldnn_adaptive_avg_pool2d( auto output_size_vec = expand_param_if_needed(output_size, "output_size", input.dim() - 2); std::vector kernel_size(input.dim() - 2); - for (int64_t i = 2; i < input.dim(); ++i) { + for (const auto i : c10::irange(2, input.dim())) { auto s1 = input.size(i); auto s2 = output_size_vec[i - 2]; TORCH_CHECK(s2 != 0, "output size can not be zero"); diff --git a/aten/src/ATen/native/mkldnn/Utils.cpp b/aten/src/ATen/native/mkldnn/Utils.cpp index 7c4183585d3678..62aeee4078088f 100644 --- a/aten/src/ATen/native/mkldnn/Utils.cpp +++ b/aten/src/ATen/native/mkldnn/Utils.cpp @@ -1,5 +1,6 @@ #include #include +#include namespace at { namespace native { @@ -16,7 +17,7 @@ std::vector pool_output_sizes( output_size[0] = input_size[0]; output_size[1] = input_size[1]; - for (size_t i = 2; i < input_size.size(); ++i) { + for (const auto i : c10::irange(2, input_size.size())) { output_size[i] = pooling_output_shape_pad_lr( input_size[i], kernel_size[i - 2], diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp index da95ff3c7738a6..347742f2297220 100644 --- a/aten/src/ATen/native/quantized/Copy.cpp +++ b/aten/src/ATen/native/quantized/Copy.cpp @@ -2,6 +2,7 @@ #include #include +#include namespace at { 
namespace native { @@ -23,7 +24,7 @@ Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) { AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() { float* src_data = src.data_ptr(); scalar_t* self_data = self.data_ptr(); - for (int i = 0; i < self.numel(); ++i) { + for (const auto i : c10::irange(self.numel())) { self_data[i] = quantize_val( self.q_scale(), self.q_zero_point(), src_data[i]); } diff --git a/aten/src/ATen/native/quantized/affine_quantizer_base.cpp b/aten/src/ATen/native/quantized/affine_quantizer_base.cpp index f83731fd467796..bba09aedbb795d 100644 --- a/aten/src/ATen/native/quantized/affine_quantizer_base.cpp +++ b/aten/src/ATen/native/quantized/affine_quantizer_base.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -145,7 +146,7 @@ void quantize_vec( T* dst, size_t count) { checkZeroPoint("quantize_vec", zero_point); - for (size_t i = 0; i < count; ++i) { + for (const auto i : c10::irange(count)) { dst[i] = quantize_val(scale, zero_point, src[i]); } } diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index 1d933741c7eee6..ace415ed183f30 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include torch::class_ register_linear_params(); @@ -47,9 +48,9 @@ void CopyToChannelsLast3dTensor( const T* src, T* dst) { const int64_t inner_size = D * H * W; - for (int64_t i = 0; i < N; ++i) { - for (int64_t j = 0; j < inner_size; ++j) { - for (int64_t k = 0; k < C; ++k) { + for (const auto i : c10::irange(N)) { + for (const auto j : c10::irange(inner_size)) { + for (const auto k : c10::irange(C)) { dst[(i * inner_size + j) * C + k] = src[(i * C + k) * inner_size + j]; } } @@ -69,8 +70,8 @@ void CopyICFirst3dTensorToChannelsLast3dTensor( // IC OC/G THW -> G OC/G THW IC/G const int64_t inner_size = D * H * W; for (int64_t i = 0; i < G * 
OC_G; ++i) { - for (int64_t j = 0; j < inner_size; ++j) { - for (int64_t ic = 0; ic < IC_G; ++ic) { + for (const auto j : c10::irange(inner_size)) { + for (const auto ic : c10::irange(IC_G)) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) int g = i / OC_G; // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h index 8854f2dce0b247..36234da4f17214 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.h @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef USE_FBGEMM #include @@ -240,7 +241,7 @@ inline void convert_uint8_int8( int len, const uint8_t* src_uint8, int8_t* dst_int8) { - for (int i = 0; i < len; ++i) { + for (const auto i : c10::irange(len)) { dst_int8[i] = static_cast(static_cast(src_uint8[i]) - 128); } } @@ -250,7 +251,7 @@ inline void convert_int8_uint8( int len, const int8_t* src_int8, uint8_t* dst_uint8) { - for (int i = 0; i < len; ++i) { + for (const auto i : c10::irange(len)) { dst_uint8[i] = static_cast(static_cast(src_int8[i]) + 128); } diff --git a/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp b/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp index fb9cafec9e7b26..b3735ddb236d25 100644 --- a/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp +++ b/aten/src/ATen/native/quantized/cpu/int_repr_quant.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -22,7 +23,7 @@ Tensor int_repr_quantized_cpu(const Tensor& self) { self.options().dtype(UNDERLYING_TYPE), self.suggest_memory_format()); const underlying_t* qdata = reinterpret_cast(self.data_ptr()); - for (int64_t i = 0; i < dst.numel(); ++i) { + for (const auto i : c10::irange(dst.numel())) { dst[i] = static_cast(qdata[i]); } } else { diff --git 
a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index c0641097149566..cccab018e75ce4 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -104,9 +104,9 @@ Tensor qcat_nhwc_kernel( // which causes an internal compiler error if they're not AT_DISPATCH_QINT_TYPES(output.scalar_type(), "qcat_nhwc", [&, N, H, W]() { using Vec = Vectorized; - for (int64_t batch = 0; batch < N; ++batch) { - for (int64_t row = 0; row < H; ++row) { - for (int64_t col = 0; col < W; ++col) { + for (const auto batch : c10::irange(N)) { + for (const auto row : c10::irange(H)) { + for (const auto col : c10::irange(W)) { // loop over input tensors for (const auto tidx : c10::irange(Cs_in.size())) { scalar_t::underlying* optr = @@ -1294,13 +1294,13 @@ void qmaxpool_2d_nhwc_kernel( scalar_t* odata = static_cast(qy.data_ptr()); // Loop over N - for (int64_t b = 0; b < qx.size(0); ++b) { + for (const auto b : c10::irange(qx.size(0))) { // Loop over H auto* i_p = reinterpret_cast(idata + b * iW * iH * iC); - for (int64_t row = 0; row < oH; ++row) { + for (const auto row : c10::irange(oH)) { // Loop over W - for (int64_t col = 0; col < oW; ++col) { + for (const auto col : c10::irange(oW)) { // Pointer to output data for this specific N,H,W position auto* o_p = reinterpret_cast( odata + b * oH * oW * iC + row * oW * iC + col * iC); @@ -1328,7 +1328,7 @@ void qmaxpool_2d_nhwc_kernel( int64_t x, y; for (y = h_start; y < h_end; y += dH) { for (x = w_start; x < w_end; x += dW) { - for (int i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { tcntr = y * iW + x; auto vals = Vectorized::loadu( i_p + tcntr * iC + c + Vectorized::size() * i); @@ -1336,7 +1336,7 @@ void qmaxpool_2d_nhwc_kernel( } } // for x } // for y - for (int i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { accs[i].store(o_p + c + 
Vectorized::size() * i); } } // for c @@ -1417,18 +1417,18 @@ void do_avg_pool_nhwc_on_AVX_n( for (int c = c_start; c < csize; c += cb_step) { int cend = std::min(cb_size, (csize - c) / vec_width); // initialize loop - for (int ic = 0; ic < cend; ic++) { + for (const auto ic : c10::irange(cend)) { acc_buffer[ic] = Vectorized(input_zero_point_m_size); } // compute loop - for (int id = dstart; id < dend; id++) { - for (int ih = hstart; ih < hend; ih++) { - for (int iw = wstart; iw < wend; iw++) { + for (const auto id : c10::irange(dstart, dend)) { + for (const auto ih : c10::irange(hstart, hend)) { + for (const auto iw : c10::irange(wstart, wend)) { const int i_idx = (id * wsize * hsize + ih * wsize + iw) * csize + c; - for (int ic = 0; ic < cend; ic++) { + for (const auto ic : c10::irange(cend)) { auto vals = vec::convert_to_int32( i_p + i_idx + ic * vec_width); acc_buffer[ic] = acc_buffer[ic] + vals; @@ -1493,9 +1493,9 @@ void do_avg_pool_on_AVX_n( int64_t tcntr = 0; Vectorized acc(input_zero_point_m_size); - for (int64_t id = dstart; id < dend; id++) { - for (int64_t ih = hstart; ih < hend; ih++) { - for (int64_t iw = wstart; iw < wend; iw++) { + for (const auto id : c10::irange(dstart, dend)) { + for (const auto ih : c10::irange(hstart, hend)) { + for (const auto iw : c10::irange(wstart, wend)) { tcntr = id * stride_D + ih * stride_H + iw * stride_W; auto vals = vec::convert_to_int32( i_p + tcntr * channel_multiplier + c * stride_C); @@ -1546,11 +1546,11 @@ void _qadaptive_avg_pool_kernel( int input_zero_point = qx.q_zero_point(); int output_zero_point = qy.q_zero_point(); - for (int64_t od = 0; od < osizeD; od++) { + for (const auto od : c10::irange(osizeD)) { int istartD = (int)std::floor((float)(od * isizeD) / osizeD); int iendD = (int)std::ceil((float)((od + 1) * isizeD) / osizeD); int kD = iendD - istartD; - for (int64_t oh = 0; oh < osizeH; oh++) { + for (const auto oh : c10::irange(osizeH)) { int istartH = (int)std::floor((float)(oh * isizeH) / osizeH); 
int iendH = (int)std::ceil((float)((oh + 1) * isizeH) / osizeH); int kH = iendH - istartH; @@ -1603,9 +1603,9 @@ void _qadaptive_avg_pool_kernel( for (; c < sizeC; ++c) { int32_t acc_int32 = input_zero_point_m_size; int64_t tcntr = 0; - for (int64_t id = 0; id < kD; ++id) { - for (int64_t ih = 0; ih < kH; ++ih) { - for (int64_t iw = 0; iw < kW; ++iw) { + for (const auto id : c10::irange(kD)) { + for (const auto ih : c10::irange(kH)) { + for (const auto iw : c10::irange(kW)) { tcntr = id * istrideD + ih * istrideH + iw * istrideW; @@ -1945,7 +1945,7 @@ int64_t do_quantized_bilinear_on_AVX_n( pos1 + h1p * input_width * channels); pos1_int_v[3] = vec::convert_to_int32( pos1 + (h1p * input_width + w1p) * channels); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { int32_t pos1_int[vec_width]; float pos1_fp[vec_width]; pos1_int_v[i].store(pos1_int); @@ -1999,13 +1999,13 @@ void qupsample_bilinear2d_nhwc_kernel( const auto rwidth = area_pixel_compute_scale( input_width, output_width, align_corners, scales_w); - for (int64_t b = 0; b < nbatch; ++b) { + for (const auto b : c10::irange(nbatch)) { auto* i_p = reinterpret_cast( idata + b * input_height * input_width * channels); auto* o_p = reinterpret_cast( odata + b * output_height * output_width * channels); - for (int64_t h2 = 0; h2 < output_height; ++h2) { + for (const auto h2 : c10::irange(output_height)) { const auto h1r = area_pixel_compute_source_index( rheight, h2, align_corners, /*cubic=*/false); @@ -2014,7 +2014,7 @@ void qupsample_bilinear2d_nhwc_kernel( const float h1lambda = h1r - h1; const float h0lambda = static_cast(1.) 
- h1lambda; - for (int64_t w2 = 0; w2 < output_width; ++w2) { + for (const auto w2 : c10::irange(output_width)) { const auto w1r = area_pixel_compute_source_index( rwidth, w2, align_corners, /*cubic=*/false); const int64_t w1 = w1r; @@ -2250,7 +2250,7 @@ void _fake_quantize_tensor_helper( AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "fake_quantize_tensor_cachemask_kernel_type_handling", [&] { iter_combined.for_each([&](char** data, const int64_t* strides, int64_t n) { - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { scalar_t* output_val = (scalar_t*)(data[0] + i * strides[0]); bool* mask_val = (bool*)(data[1] + i * strides[1]); scalar_t* input_val = (scalar_t*)(data[2] + i * strides[2]); @@ -2319,7 +2319,7 @@ void fake_quantize_learnable_tensor_grad_kernel_cpu( (to move onto different elements), can allow accessing of the input and assignment to the right output. */ - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { float* dXOutput = (float*)(data[0] + i * strides[0]); float* dScaleOutput = (float*)(data[1] + i * strides[1]); float* dZeroPointOutput = (float*)(data[2] + i * strides[2]); @@ -2429,7 +2429,7 @@ void fake_quantize_learnable_channel_grad_kernel_cpu( please see the implemenetation of fake_quantize_learnable_tensor_grad_kernel_cpu. 
*/ - for (int64_t i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { float* dx_output = (float*)(data[0] + i * strides[0]); float* dscale_output = (float*)(data[1] + i * strides[1]); float* dzero_point_output = (float*)(data[2] + i * strides[2]); @@ -2516,7 +2516,7 @@ void quantized_normalize_kernel( int64_t kNonVecRemInChannel = NPerChannel % kIntVLen; at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { - for (int64_t i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { scalar_t* X_ptr = X_data + i * N; scalar_t* Y_ptr = Y_data + i * N; @@ -2546,7 +2546,7 @@ void quantized_normalize_kernel( // if scaling per channel, scaling parameters can be pre-multiplied // with normalization parameters - for (int64_t chIdx = 0; chIdx < channels_per_group; chIdx++) { + for (const auto chIdx : c10::irange(channels_per_group)) { int scalingIdx = (i * channels_per_group + chIdx) % (num_channels); float gamma = gamma_null ? 1.0f : gamma_data[scalingIdx]; // scale_x / layer_std * gamma @@ -2558,7 +2558,7 @@ void quantized_normalize_kernel( int64_t chStartIdx = chIdx * NPerChannel; int64_t chEndIdx = chStartIdx + NPerChannel; - for (int64_t vecIdx = 0; vecIdx < kNumIntVecInChannel; vecIdx++) { + for (const auto vecIdx : c10::irange(kNumIntVecInChannel)) { int64_t vecStartIdx = chStartIdx + vecIdx * kIntVLen; auto qXVec = qVec::loadu(X_ptr + vecStartIdx); auto dqXVec = qXVec.dequantize(x_fake_scale_vec, x_zp_vec, @@ -2584,7 +2584,7 @@ void quantized_normalize_kernel( } else { - for (int64_t vecIdx = 0; vecIdx < kNumIntVecInLayer; vecIdx++) { + for (const auto vecIdx : c10::irange(kNumIntVecInLayer)) { int64_t vecStartIdx = vecIdx * kIntVLen; auto qXVec = qVec::loadu(X_ptr + vecStartIdx); auto dqXVec = qXVec.dequantize(x_fake_scale_vec, x_zp_vec, @@ -2638,7 +2638,7 @@ void quantize_tensor_per_tensor_affine_cpu( qparams.precision = CHAR_BIT * sizeof(underlying_t); int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, 
[&](int64_t begin, int64_t end) { - for (int task_id = begin; task_id < end; ++task_id) { + for (const auto task_id : c10::irange(begin, end)) { fbgemm::Quantize( // NOLINTNEXTLINE(bugprone-argument-comment) rd, /*src=*/ @@ -2672,7 +2672,7 @@ void dequantize_tensor_per_tensor_affine_cpu( float* rd = rtensor.data_ptr(); int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { - for (int task_id = begin; task_id < end; ++task_id) { + for (const auto task_id : c10::irange(begin, end)) { fbgemm::Dequantize( // NOLINTNEXTLINE(bugprone-argument-comment) qd, /*src=*/ @@ -2700,7 +2700,7 @@ void quantize_tensor_arm( const float scale, const int32_t zero_point) { auto out = qtensor.data_ptr(); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { out[i] = at::native::quantize_val(scale, zero_point, in[i]); } } @@ -2802,7 +2802,7 @@ void quantize_tensor_per_tensor_affine_cpu( const float* const rdata = rtensor.data_ptr(); auto qdata = qtensor.data_ptr(); auto numel = rtensor.numel(); - for (int i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { qdata[i] = quantize_val(scale, zero_point, rdata[i]); } }); @@ -2820,7 +2820,7 @@ void dequantize_tensor_per_tensor_affine_cpu( const auto* qd = qtensor.data_ptr(); float* rd = rtensor.data_ptr(); auto numel = qtensor.numel(); - for (auto i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { rd[i] = dequantize_val(scale, zero_point, qd[i]); } }); @@ -2857,9 +2857,9 @@ void quantize_tensor_per_channel_impl( // channels_last contig. // If axis = 0 and channels_last contig, implementation for channels // first (NCHW) works. 
- for (auto b = 0; b < batches; ++b) { - for (auto e = 0; e < elements_per_channel; ++e) { - for (auto c = 0; c < channels; ++c) { + for (const auto b : c10::irange(batches)) { + for (const auto e : c10::irange(elements_per_channel)) { + for (const auto c : c10::irange(channels)) { auto i = b * channels * elements_per_channel + e * channels + c; out[i] = at::native::quantize_val( scales_data[c], zero_points_data[c], in[i]); @@ -2867,9 +2867,9 @@ void quantize_tensor_per_channel_impl( } } } else { - for (auto b = 0; b < batches; ++b) { - for (auto c = 0; c < channels; ++c) { - for (auto e = 0; e < elements_per_channel; ++e) { + for (const auto b : c10::irange(batches)) { + for (const auto c : c10::irange(channels)) { + for (const auto e : c10::irange(elements_per_channel)) { auto i = b * channels * elements_per_channel + c * elements_per_channel + e; out[i] = at::native::quantize_val( @@ -2919,7 +2919,7 @@ void quantize_tensor_per_channel_impl( // Copy zero_points with magic int (int64_t) into int32_t array std::vector inv_scales(channels); std::vector zero_points_int32t(channels); - for (int i = 0; i < channels; ++i) { + for (const auto i : c10::irange(channels)) { inv_scales[i] = 1.0f / (float)scales_data[i]; zero_points_int32t[i] = (int32_t)(uint32_t)zero_points_data[i] - 0x4B400000; } @@ -2930,8 +2930,8 @@ void quantize_tensor_per_channel_impl( // channels_last contig. // If axis = 0 and channels_last contig, implementation for channels // first (NCHW) works. 
- for (uint32_t b = 0; b < batches; ++b) { - for (uint32_t e = 0; e < elements_per_channel; ++e) { + for (const auto b : c10::irange(batches)) { + for (const auto e : c10::irange(elements_per_channel)) { uint32_t c = 0; while (c + 8 < channels) { const int32x4_t voffset0123 = vld1q_s32(&zero_points_int32t[c]); @@ -2965,8 +2965,8 @@ void quantize_tensor_per_channel_impl( } } } else { - for (uint32_t b = 0; b < batches; ++b) { - for (uint32_t c = 0; c < channels; ++c) { + for (const auto b : c10::irange(batches)) { + for (const auto c : c10::irange(channels)) { uint32_t e = 0; const int32x4_t voffset = vdupq_n_s32(zero_points_int32t[c]); const float32x4_t vinv_scale = vdupq_n_f32(inv_scales[c]); @@ -3001,7 +3001,7 @@ void quantize_tensor_per_channel_impl( // Copy zero_points (int64_t) into int16_t array std::vector inv_scales(channels); std::vector zero_points_int16t(channels); - for (int i = 0; i < channels; ++i) { + for (const auto i : c10::irange(channels)) { inv_scales[i] = 1.0f / (float)scales_data[i]; zero_points_int16t[i] = (int16_t)(uint16_t)zero_points_data[i]; } @@ -3012,8 +3012,8 @@ void quantize_tensor_per_channel_impl( // channels_last contig. // If axis = 0 and channels_last contig, implementation for channels // first (NCHW) works. 
- for (uint32_t b = 0; b < batches; ++b) { - for (uint32_t e = 0; e < elements_per_channel; ++e) { + for (const auto b : c10::irange(batches)) { + for (const auto e : c10::irange(elements_per_channel)) { uint32_t c = 0; while (c + 8 < channels) { const int16x8_t vzero_point = vld1q_s16(&zero_points_int16t[c]); @@ -3043,8 +3043,8 @@ void quantize_tensor_per_channel_impl( } } } else { - for (uint32_t b = 0; b < batches; ++b) { - for (uint32_t c = 0; c < channels; ++c) { + for (const auto b : c10::irange(batches)) { + for (const auto c : c10::irange(channels)) { uint32_t e = 0; const int16x8_t vzero_point = vdupq_n_s16(zero_points_int16t[c]); const float32x4_t vinv_scale = vdupq_n_f32(inv_scales[c]); @@ -3123,9 +3123,9 @@ void dequantize_per_channel_affine_kernel( const auto elem_per_byte = 8 / bit_width; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || rtensor.is_contiguous(MemoryFormat::ChannelsLast3d))) { - for (auto b = 0; b < batches; ++b) { - for (auto e = 0; e < elements_per_channel; ++e) { - for (auto c = 0; c < channel; ++c) { + for (const auto b : c10::irange(batches)) { + for (const auto e : c10::irange(elements_per_channel)) { + for (const auto c : c10::irange(channel)) { auto i = b * channel * elements_per_channel + e * channel + c; // We need to convert the qint8 value to float to ensure the // subtraction subexpression returns a float @@ -3139,9 +3139,9 @@ void dequantize_per_channel_affine_kernel( } } } else { - for (auto b = 0; b < batches; ++b) { - for (auto c = 0; c < channel; ++c) { - for (auto e = 0; e < elements_per_channel; ++e) { + for (const auto b : c10::irange(batches)) { + for (const auto c : c10::irange(channel)) { + for (const auto e : c10::irange(elements_per_channel)) { auto i = b * channel * elements_per_channel + c * elements_per_channel + e; // We need to convert the qint8 value to float to ensure the @@ -3201,9 +3201,9 @@ void quantize_tensor_per_channel_float_qparams_cpu( int qvalue = 0; if (axis == 1 && 
(rtensor.is_contiguous(MemoryFormat::ChannelsLast) || rtensor.is_contiguous(MemoryFormat::ChannelsLast3d))) { - for (auto b = 0; b < batches; ++b) { - for (auto e = 0; e < elements_per_channel; ++e) { - for (auto c = 0; c < channel; ++c) { + for (const auto b : c10::irange(batches)) { + for (const auto e : c10::irange(elements_per_channel)) { + for (const auto c : c10::irange(channel)) { auto i = b * channel * elements_per_channel + e * channel + c; qvalue = quantize_val_float_qparams( scales_data[c], zero_points_data[c], rdata[i], quant_min, quant_max); @@ -3217,9 +3217,9 @@ void quantize_tensor_per_channel_float_qparams_cpu( } } } else { - for (auto b = 0; b < batches; ++b) { - for (auto c = 0; c < channel; ++c) { - for (auto e = 0; e < elements_per_channel; ++e) { + for (const auto b : c10::irange(batches)) { + for (const auto c : c10::irange(channel)) { + for (const auto e : c10::irange(elements_per_channel)) { auto i = b * channel * elements_per_channel + c * elements_per_channel + e; qvalue = quantize_val_float_qparams( @@ -3263,7 +3263,7 @@ void quantize_tensor_per_tensor_affine_sub_byte_cpu( auto qdata = reinterpret_cast(qtensor.data_ptr()); auto numel = rtensor.numel(); const auto elem_per_byte = CHAR_BIT / bit_width; - for (int i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { float inv_scale = scale == 0 ? 
1.0f : 1.0f / scale; int64_t qvalue = lrintf(std::nearbyint(rdata[i] * inv_scale) + zero_point); qvalue = std::max(quant_min, std::min(qvalue, quant_max)); @@ -3296,7 +3296,7 @@ void dequantize_tensor_per_tensor_affine_sub_byte_cpu( auto numel = rtensor.numel(); const auto elem_per_byte = CHAR_BIT / bit_width; - for (int i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { // NOLINTNEXTLINE(clang-analyzer-core.DivideZero) underlying_t qvalue = qdata[i / elem_per_byte]; qvalue >>= (i % elem_per_byte) * bit_width; diff --git a/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp b/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp index c220c4541969b0..584d29d2bac3d7 100644 --- a/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp +++ b/aten/src/ATen/native/quantized/cpu/q_adaavgpool.cpp @@ -2,6 +2,8 @@ #include #include #include + +#include #include #include @@ -58,7 +60,7 @@ static void adaptive_avg_pool_single_out_frame( int64_t istrideH, int64_t istrideW) { at::parallel_for(0, sizeC, 0, [&](int64_t start, int64_t end) { - for (auto c = start; c < end; c++) { + for (const auto c : c10::irange(start, end)) { /* loop over output */ // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t od, oh, ow; @@ -126,7 +128,7 @@ template std::vector get_output_shape( const Tensor& input, IntArrayRef output_size) { - for (int64_t i = 1; i < input.dim(); i++) { + for (const auto i : c10::irange(1, input.dim())) { // Allow for empty batch. 
TORCH_CHECK( input.size(i) > 0, @@ -215,7 +217,7 @@ Tensor _adaptive_avg_pool(const Tensor& input, } else { int64_t istrideB = input.stride(-(kSpatialDim + 2)); at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { qadaptive_avg_pool3d_ndhwc_stub( input.device().type(), input, @@ -262,7 +264,7 @@ Tensor _adaptive_avg_pool(const Tensor& input, } else { int64_t istrideB = input.stride(-(kSpatialDim + 2)); at::parallel_for(0, sizeB, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { adaptive_avg_pool_single_out_frame( input_data + b * istrideB, output_data + b * sizeC * osizeD * osizeH * osizeW, diff --git a/aten/src/ATen/native/quantized/cpu/q_avgpool.cpp b/aten/src/ATen/native/quantized/cpu/q_avgpool.cpp index 32767e5b38a3a6..699ddbfd891290 100644 --- a/aten/src/ATen/native/quantized/cpu/q_avgpool.cpp +++ b/aten/src/ATen/native/quantized/cpu/q_avgpool.cpp @@ -6,6 +6,8 @@ #include #include #include + +#include #include #include @@ -39,7 +41,7 @@ static void avg_pool2d_out_frame( bool count_include_pad, c10::optional divisor_override) { at::parallel_for(0, nInputPlane, 0, [&](int64_t start, int64_t end) { - for (auto k = start; k < end; k++) { + for (const auto k : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t xx, yy; /* For all output pixels... 
*/ @@ -224,7 +226,7 @@ Tensor q_avg_pool2d( divisor_override); } else { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { qavg_pool2d_nhwc_stub( input.device().type(), input, @@ -270,7 +272,7 @@ Tensor q_avg_pool2d( divisor_override); } else { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { avg_pool2d_out_frame( input, output, diff --git a/aten/src/ATen/native/quantized/cpu/q_avgpool3d.cpp b/aten/src/ATen/native/quantized/cpu/q_avgpool3d.cpp index 003bf8692782df..ce4e8cc8468479 100644 --- a/aten/src/ATen/native/quantized/cpu/q_avgpool3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/q_avgpool3d.cpp @@ -5,6 +5,8 @@ #include #include #include + +#include #include #include @@ -154,7 +156,7 @@ Tensor q_avg_pool3d( divisor_override); } else { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto b = start; b < end; b++) { + for (const auto b : c10::irange(start, end)) { qavg_pool3d_nhwc_stub( input_nhwc.device().type(), input_nhwc, diff --git a/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp b/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp index ab5d79faccd46e..60fdfb3fabd94a 100644 --- a/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp +++ b/aten/src/ATen/native/quantized/cpu/qbatch_norm.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -30,7 +31,7 @@ void compute_fused_params( // = (input(n, c, h, w) - mean(c)) / sqrt(var(c) + eps) * weight(c) // + bias(c) // We factor out inv_sigma(c) = 1 / sqrt(var(c) + eps). - for (int64_t c = 0; c < channels; c++) { + for (const auto c : c10::irange(channels)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) float inv_sigma = 1.0 / std::sqrt(var_data[c] + static_cast(eps)); float weight_v = weight_data ? 
weight_data[c] : 1; diff --git a/aten/src/ATen/native/quantized/cpu/qclamp.cpp b/aten/src/ATen/native/quantized/cpu/qclamp.cpp index fac56bc70066d7..68d8322730fe8b 100644 --- a/aten/src/ATen/native/quantized/cpu/qclamp.cpp +++ b/aten/src/ATen/native/quantized/cpu/qclamp.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,7 @@ Tensor qnnpack_clamp(Tensor input, const Scalar& min, const Scalar& max) { Tensor input_contig = input.contiguous(input.suggest_memory_format()); size_t num_elems = 1; - for (int i = 1; i < input_contig.ndimension(); ++i) { + for (const auto i : c10::irange(1, input_contig.ndimension())) { num_elems *= input_contig.size(i); } diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 06907885c383fd..37487b613fbe3e 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -95,7 +95,7 @@ at::SmallVector MakeDeConvOutputShape( output_shape.resize(kSpatialDim + 2); output_shape[0] = N; // Batch size output_shape[1] = M; // Output channels - for (int64_t idx = 0; idx < kSpatialDim; ++idx) { + for (const auto idx : c10::irange(kSpatialDim)) { output_shape[idx + 2] = compute_deconv_shape(input_shape[idx], kernel[idx], stride[idx], @@ -250,7 +250,7 @@ void PackedConvWeight::GetQuantizationParams( const int M = w->outputChannels(); output_multiplier_float->resize(M); act_times_w_scale->resize(M); - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { act_times_w_scale->at(i) = (act_scale * w_scale[i]); output_multiplier_float->at(i) = act_times_w_scale->at(i) / out_scale; } @@ -653,7 +653,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl( c10::nullopt); auto* qnnp_w_data = qnnp_weight.template data_ptr(); auto wt_numel = weight_contig.numel(); - for (int i = 0; i < wt_numel; ++i) { + for (const auto i : c10::irange(wt_numel)) { qnnp_w_data[i] = static_cast(w_data[i] + 128); } at::Tensor qbias; 
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 5e6deb76b75814..3cb5d9ef1a18cc 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -91,7 +91,7 @@ c10::intrusive_ptr> PackedConvWeight< !transpose, "Per Channel Quantization is currently disabled for transposed conv"); zero_points.resize(output_channels); - for (int i = 0; i < output_channels; ++i) { + for (const auto i : c10::irange(output_channels)) { zero_points[i] = weight.q_per_channel_zero_points()[i].item(); } } else { @@ -120,11 +120,11 @@ c10::intrusive_ptr> PackedConvWeight< const int inner_size = kernel_d * kernel_h * kernel_w * input_channels_per_group; for (const auto g : c10::irange(groups)) { - for (int i = 0; i < output_channels_per_group; ++i) { + for (const auto i : c10::irange(output_channels_per_group)) { // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) const int c = g * output_channels_per_group + i; int32_t sum = 0; - for (int j = 0; j < inner_size; ++j) { + for (const auto j : c10::irange(inner_size)) { sum += static_cast(weight_data_int8[c * inner_size + j]); } if (qtype == c10::kPerTensorAffine) { @@ -140,7 +140,7 @@ c10::intrusive_ptr> PackedConvWeight< scales = {static_cast(weight.q_scale())}; } else if (qtype == c10::kPerChannelAffine) { scales.resize(output_channels); - for (int i = 0; i < output_channels; ++i) { + for (const auto i : c10::irange(output_channels)) { scales[i] = weight.q_per_channel_scales()[i].item(); } } @@ -330,7 +330,8 @@ class QConvPackWeightInt8 final { int64_t groups) { torch::List output_padding; output_padding.reserve(kSpatialDim); - for (int idx = 0; idx < kSpatialDim; ++idx) { + for (const auto idx : c10::irange(kSpatialDim)) { + (void)idx; //Suppress unused variable warning output_padding.push_back((int64_t)0); } return _run(weight, bias, stride, padding, 
output_padding, dilation, groups, diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index a7c9e9ea13093d..f526a7ac13973b 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -9,6 +9,7 @@ #endif #include +#include torch::class_ register_embedding_params(); @@ -44,7 +45,7 @@ at::Tensor& embedding_lookup_fallback_impl( std::vector lengths_data; int64_t lower = accessor[0]; - for (int64_t i = 1; i < offsets.numel(); ++i) { + for (const auto i : c10::irange(1, offsets.numel())) { lengths_data.push_back(accessor[i] - lower); lower = accessor[i]; } @@ -58,7 +59,7 @@ at::Tensor& embedding_lookup_fallback_impl( if (per_sample_weights_.has_value()) { per_sample_weights_data = per_sample_weights_.value().data_ptr(); } - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(output_data, 0, block_size * sizeof(float)); TORCH_CHECK( current + lengths_data[m] <= index_size, @@ -126,7 +127,7 @@ at::Tensor& embedding_lookup_fallback_impl( bias = weight_val * bias_val; } - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { uint8_t quantized = weight_data[idx * weight_size + j / NUM_ELEM_PER_BYTE]; quantized >>= (j % NUM_ELEM_PER_BYTE) * BIT_RATE; diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index a5bd74ef64a9d7..f03c24e59ca600 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -67,7 +67,7 @@ c10::intrusive_ptr PackedEmbeddingBagWeight::prepack( weight_scales_tensor.copy_(qweight.q_per_channel_scales()); weight_zero_points_tensor.copy_(qweight.q_per_channel_zero_points()); - for (int64_t i = 0; i < embedding_rows; ++i) { + for (const auto i : c10::irange(embedding_rows)) { 
weight_bias[i] = weight_zero_points[i] * weight_scales[i] * -1; } @@ -88,14 +88,14 @@ c10::intrusive_ptr PackedEmbeddingBagWeight::prepack( if (bit_width == 8) { at::parallel_for( 0, embedding_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { const uint8_t* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; float* output_row_scale_bias = reinterpret_cast(output_row + embedding_cols); output_row_scale_bias[0] = weight_scales[row]; output_row_scale_bias[1] = weight_bias[row]; - for (int64_t col = 0; col < embedding_cols; ++col) { + for (const auto col : c10::irange(embedding_cols)) { output_row[col] = input_row[col]; } } @@ -107,14 +107,14 @@ c10::intrusive_ptr PackedEmbeddingBagWeight::prepack( (embedding_cols + num_elem_per_byte - 1) / num_elem_per_byte; at::parallel_for( 0, embedding_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { const uint8_t* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; at::Half* output_row_scale_bias = reinterpret_cast(output_row + embedding_cols); output_row_scale_bias[0] = weight_scales[row]; output_row_scale_bias[1] = weight_bias[row]; - for (int64_t col = 0; col < embedding_cols; ++col) { + for (const auto col : c10::irange(embedding_cols)) { // The weight values have already been packed, so here we just // store it in the output tensor. 
output_row[col] = input_row[col]; @@ -229,7 +229,7 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) { const auto weight_data = static_cast(weight.data_ptr()); at::parallel_for( 0, embedding_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( weight_data + row * embedding_cols, 1, embedding_cols, output_data + row * output_columns); @@ -240,7 +240,7 @@ Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight) { const auto weight_data = weight.data_ptr(); at::parallel_for( 0, embedding_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { fbgemm::FloatOrHalfToFused8BitRowwiseQuantizedSBFloat( weight_data + row * embedding_cols, 1, embedding_cols, output_data + row * output_columns); @@ -344,7 +344,7 @@ Tensor _qembeddingbag_nbit_prepack_helper( const auto weight_data = static_cast(weight.data_ptr()); at::parallel_for( 0, embedding_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf( bit_width, weight_data + row * embedding_cols, 1, embedding_cols, output_data + row * output_shape[1]); @@ -355,7 +355,7 @@ Tensor _qembeddingbag_nbit_prepack_helper( const auto weight_data = weight.data_ptr(); at::parallel_for( 0, embedding_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { fbgemm::FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf( bit_width, weight_data + row * embedding_cols, 1, embedding_cols, output_data + row * output_shape[1]); @@ -369,7 +369,7 @@ Tensor 
_qembeddingbag_nbit_prepack_helper( ? weight_contig.to(at::ScalarType::Float) : weight_contig; const auto weight_data = float_weight.data_ptr(); - for (int row = 0; row < embedding_rows; ++row) { + for (const auto row : c10::irange(embedding_rows)) { const float* input_row = weight_data + row * embedding_cols; std::uint8_t* output_row = output_data + row * output_columns; diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index b2e6b5217fd6d0..65365a3d709ccd 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include torch::class_ register_embedding_params(); @@ -67,7 +68,7 @@ at::Tensor PackedEmbeddingBagWeight::unpack() { // For sub-byte tensors this will copy the packed bytes over since the // sub_byte qtensors are expected to store data in packed format. at::parallel_for(0, input_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { const std::uint8_t* input_row = input + row * input_columns; uint8_t* output_row = output_data + row * output_columns / num_elem_per_byte; @@ -126,7 +127,7 @@ Tensor qembeddingbag_byte_unpack(const Tensor& packed_weight) { #ifdef USE_FBGEMM at::parallel_for( 0, input_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { fbgemm::Fused8BitRowwiseQuantizedSBFloatToFloatOrHalf( input_data + row * input_columns, 1, @@ -173,7 +174,7 @@ Tensor _qembeddingbag_nbit_unpack_helper( #ifdef USE_FBGEMM at::parallel_for( 0, input_rows, 1, [&](int32_t start_idx, int32_t end_idx) { - for (int64_t row = start_idx; row < end_idx; ++row) { + for (const auto row : c10::irange(start_idx, end_idx)) { 
fbgemm::FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf(BIT_RATE, input_data + row * input_columns, 1, @@ -192,7 +193,7 @@ Tensor _qembeddingbag_nbit_unpack_helper( float scale = input_row_scale_zp[0]; float zero_point = input_row_scale_zp[1]; - for (int col = 0; col < output_columns; ++col) { + for (const auto col : c10::irange(output_columns)) { std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; quantized &= (1 << BIT_RATE) - 1; diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 9e6ddc0068c43c..52b2d01f1ba6c9 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -303,7 +303,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl( w_zero_points[0]); auto* qnnp_w_data = qnnp_weight.data_ptr(); auto wt_numel = weight_contig.numel(); - for (int i = 0; i < wt_numel; ++i) { + for (const auto i : c10::irange(wt_numel)) { qnnp_w_data[i] = static_cast(w_data[i] + 128); } // Original bias was float, so we requantize it here. 
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index 09f42286a216d6..9a126fedcec138 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -290,7 +290,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl(at::Tensor input) { auto* qnnp_w_data = qnnp_weight.data_ptr(); int8_t* w_data = (int8_t*)weight_contig.data_ptr(); auto wt_numel = weight_contig.numel(); - for (int i = 0; i < wt_numel; ++i) { + for (const auto i : c10::irange(wt_numel)) { qnnp_w_data[i] = static_cast(w_data[i] + 128); } diff --git a/aten/src/ATen/native/quantized/cpu/qpool.cpp b/aten/src/ATen/native/quantized/cpu/qpool.cpp index 17a40245a981d9..adee30b4470e76 100644 --- a/aten/src/ATen/native/quantized/cpu/qpool.cpp +++ b/aten/src/ATen/native/quantized/cpu/qpool.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -43,7 +44,7 @@ void spatial_dilated_max_pooling( int64_t dW, // dilation T* oData) { // output arrays (data and max-index) at::parallel_for(0, iC, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; ++p) { + for (const auto p : c10::irange(start, end)) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t row, col; const T* i_p = iData + p * iW * iH; @@ -195,7 +196,7 @@ Tensor q_maxpool_2d( oData); } else { at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { - for (auto p = start; p < end; ++p) { + for (const auto p : c10::irange(start, end)) { auto* iData = qxd + p * iC * iW * iH; auto* oData = qyd + p * oC * oW * oH; spatial_dilated_max_pooling( diff --git a/aten/src/ATen/native/quantized/cpu/qrelu.cpp b/aten/src/ATen/native/quantized/cpu/qrelu.cpp index c282b8ac1c501e..beba6a90acdfc3 100644 --- a/aten/src/ATen/native/quantized/cpu/qrelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qrelu.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ 
-31,7 +32,7 @@ Tensor qnnpack_relu(Tensor input) { initQNNPACK(); size_t num_elems = 1; - for (int i = 1; i < input_contig.ndimension(); ++i) { + for (const auto i : c10::irange(1, input_contig.ndimension())) { num_elems *= input_contig.size(i); } diff --git a/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp b/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp index f152bd2a1839e4..756b617af8d8ad 100644 --- a/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -26,7 +27,7 @@ Tensor qnnpack_sigmoid( Tensor input_contig = input.contiguous(input.suggest_memory_format()); size_t num_elems = 1; - for (int i = 1; i < input_contig.ndimension(); ++i) { + for (const auto i : c10::irange(1, input_contig.ndimension())) { num_elems *= input_contig.size(i); } diff --git a/aten/src/ATen/native/quantized/cpu/qtanh.cpp b/aten/src/ATen/native/quantized/cpu/qtanh.cpp index 9c290016c6c8bf..3c21d1cf4c8479 100644 --- a/aten/src/ATen/native/quantized/cpu/qtanh.cpp +++ b/aten/src/ATen/native/quantized/cpu/qtanh.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,7 @@ Tensor qnnpack_tanh(Tensor input) { Tensor input_contig = input.contiguous(input.suggest_memory_format()); size_t num_elems = 1; - for (int i = 1; i < input_contig.ndimension(); ++i) { + for (const auto i : c10::irange(1, input_contig.ndimension())) { num_elems *= input_contig.size(i); } const auto zero_point = input_contig.q_zero_point(); diff --git a/aten/src/ATen/native/quantized/cpu/quant_utils.h b/aten/src/ATen/native/quantized/cpu/quant_utils.h index e14914df42e694..8ebcea45883c6c 100644 --- a/aten/src/ATen/native/quantized/cpu/quant_utils.h +++ b/aten/src/ATen/native/quantized/cpu/quant_utils.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -193,7 +194,7 @@ static C10_UNUSED torch::List MakeArgForConv1d(const torch::List(kFp16Max, weight + i); 
if (saturate) { found_out_of_range = true; diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp index 6bd5ef990c1873..ab30cd7d381010 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_bilinear2d.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -57,7 +58,7 @@ static void upsample_bilinear2d_out_frame( const int64_t input_q_zero_point = input.q_zero_point(); const int64_t output_q_zero_point = output.q_zero_point(); - for (int64_t h2 = 0; h2 < output_height; ++h2) { + for (const auto h2 : c10::irange(output_height)) { const auto h1r = area_pixel_compute_source_index( rheight, h2, align_corners, /*cubic=*/false); @@ -67,7 +68,7 @@ static void upsample_bilinear2d_out_frame( const float h1lambda = h1r - h1; const float h0lambda = static_cast(1.) - h1lambda; - for (int64_t w2 = 0; w2 < output_width; ++w2) { + for (const auto w2 : c10::irange(output_width)) { const auto w1r = area_pixel_compute_source_index( rwidth, w2, align_corners, /*cubic=*/false); @@ -79,7 +80,8 @@ static void upsample_bilinear2d_out_frame( const typename scalar_t::underlying* pos1 = i_p + h1 * input_width + w1; typename scalar_t::underlying* pos2 = o_p + h2 * output_width + w2; - for (int64_t c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { + (void)c; //Suppress unused variable warning float result = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + h1lambda * (w0lambda * pos1[h1p * input_width] + diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp index 753f272f014896..7d2e5db61b106b 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_nearest2d.cpp @@ -41,18 +41,19 @@ static void upsample_nearest2d_out_frame( return; } - for (int64_t h2 
= 0; h2 < output_height; ++h2) { + for (const auto h2 : c10::irange(output_height)) { const int64_t h1 = nearest_neighbor_compute_source_index(height_scale, h2, input_height); - for (int64_t w2 = 0; w2 < output_width; ++w2) { + for (const auto w2 : c10::irange(output_width)) { const int64_t w1 = nearest_neighbor_compute_source_index(width_scale, w2, input_width); const auto* pos1 = &i_p[h1 * input_width + w1]; auto* pos2 = &o_p[h2 * output_width + w2]; - for (int64_t c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { + (void)c; //Suppress unused variable warning pos2[0] = pos1[0]; pos1 += input_height * input_width; pos2 += output_height * output_width; @@ -85,11 +86,11 @@ static void upsample_nearest2d_out_frame_nhwc( return; } - for (int64_t h2 = 0; h2 < output_height; ++h2) { + for (const auto h2 : c10::irange(output_height)) { const int64_t h1 = nearest_neighbor_compute_source_index(height_scale, h2, input_height); - for (int64_t w2 = 0; w2 < output_width; ++w2) { + for (const auto w2 : c10::irange(output_width)) { const int64_t w1 = nearest_neighbor_compute_source_index(width_scale, w2, input_width); diff --git a/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp b/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp index 9876635537088d..604ec909406ffa 100644 --- a/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/qupsample_nearest3d.cpp @@ -45,22 +45,23 @@ static void upsample_nearest3d_out_frame( return; } - for (int64_t d2 = 0; d2 < output_depth; ++d2) { + for (const auto d2 : c10::irange(output_depth)) { const int64_t d1 = nearest_neighbor_compute_source_index(depth_scale, d2, input_depth); - for (int64_t h2 = 0; h2 < output_height; ++h2) { + for (const auto h2 : c10::irange(output_height)) { const int64_t h1 = nearest_neighbor_compute_source_index(height_scale, h2, input_height); - for (int64_t w2 = 0; w2 < output_width; ++w2) { + for (const auto w2 : 
c10::irange(output_width)) { const int64_t w1 = nearest_neighbor_compute_source_index(width_scale, w2, input_width); const auto* pos1 = &i_p[d1 * input_height * input_width + h1 * input_width + w1]; auto* pos2 = &o_p[d2 * output_height * output_width + h2 * output_width + w2]; - for (int64_t c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { + (void)c; //Suppress unused variable warning pos2[0] = pos1[0]; pos1 += input_depth * input_height * input_width; pos2 += output_depth * output_height * output_width; @@ -98,14 +99,14 @@ static void upsample_nearest3d_out_frame_nhwc( return; } - for (int64_t d2 = 0; d2 < output_depth; ++d2) { + for (const auto d2 : c10::irange(output_depth)) { const int64_t d1 = nearest_neighbor_compute_source_index(depth_scale, d2, input_depth); - for (int64_t h2 = 0; h2 < output_height; ++h2) { + for (const auto h2 : c10::irange(output_height)) { const int64_t h1 = nearest_neighbor_compute_source_index(height_scale, h2, input_height); - for (int64_t w2 = 0; w2 < output_width; ++w2) { + for (const auto w2 : c10::irange(output_width)) { const int64_t w1 = nearest_neighbor_compute_source_index(width_scale, w2, input_width); diff --git a/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp b/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp index b86518d2d4c8e5..1e9dbdea4b77df 100644 --- a/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp +++ b/aten/src/ATen/native/quantized/fake_quant_per_channel_affine.cpp @@ -217,7 +217,7 @@ std::tuple _fake_quantize_learnable_per_channel_affine_b // into the same shapes as X along the channel axis. // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) int64_t* axis_mask = (int64_t *) calloc(numDimensions, sizeof(int64_t)); - for (int i = 0; i < numDimensions; ++i) { + for (const auto i : c10::irange(numDimensions)) { axis_mask[i] = (i == axis) ? 
X.size(axis) : 1; } auto X_shape = X.sizes(); diff --git a/aten/src/ATen/native/sparse/SoftMax.cpp b/aten/src/ATen/native/sparse/SoftMax.cpp index 29662b00a1837d..04619cb245df02 100644 --- a/aten/src/ATen/native/sparse/SoftMax.cpp +++ b/aten/src/ATen/native/sparse/SoftMax.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -71,9 +72,9 @@ std::vector get_offsets(const Tensor& indices, const IntArrayRef& sizes } } - for (int64_t i=0; i < nnz; i++) { + for (const auto i : c10::irange(nnz)) { int64_t acc = 0; - for (int64_t j=0; j < ndim; j++) { + for (const auto j : c10::irange(ndim)) { auto indices_row = indices_accessor[j]; auto stride = strides[j]; if (j != dim) { @@ -119,9 +120,9 @@ std::vector> get_pools(const Tensor& indices, const IntArra } } - for (int64_t i=0; i < nnz; i++) { + for (const auto i : c10::irange(nnz)) { int64_t pool_index = 0; - for (int64_t j=0; j < ndim; j++) { + for (const auto j : c10::irange(ndim)) { if (j != dim) { const auto indices_row = indices_accessor[j]; const auto stride = strides[j]; @@ -315,7 +316,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di int64_t grain_size = 1; parallel_for(0, pools.size(), grain_size, [&](int64_t begin, int64_t end) { - for (auto p = begin; p < end; p++) { + for (const auto p : c10::irange(begin, end)) { auto pool_indices = pools[p]; // Skip empty pools @@ -329,7 +330,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di /* Compute mx */ for (int64_t i : pool_indices) { auto values_row = values_accessor[i]; - for (int64_t j=0; j < nvalues; j++) { + for (const auto j : c10::irange(nvalues)) { mx_row[j] = std::max(mx_row[j], values_row[j]); } } @@ -338,7 +339,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di for (int64_t i : pool_indices) { auto values_row = values_accessor[i]; auto out_values_row = out_values_accessor[i]; - for (int64_t j=0; j < nvalues; j++) { + for (const auto j : 
c10::irange(nvalues)) { auto v = std::exp(values_row[j] - mx_row[j]); if (!LogSoftMax) { out_values_row[j] = v; @@ -347,7 +348,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di } } - for (int64_t j=0; j < nvalues; j++) { + for (const auto j : c10::irange(nvalues)) { if (LogSoftMax) { mx_row[j] += std::log(exp_sums_row[j]); } else { @@ -359,7 +360,7 @@ void cpu_sparse_coo_softmax(Tensor output, const Tensor& input, const int64_t di for (int64_t i : pool_indices) { auto values_row = values_accessor[i]; auto out_values_row = out_values_accessor[i]; - for (int64_t j=0; j < nvalues; j++) { + for (const auto j : c10::irange(nvalues)) { if (LogSoftMax) { out_values_row[j] = values_row[j] - mx_row[j]; } else { @@ -421,7 +422,7 @@ void cpu_sparse_coo_softmax_backward(const Tensor& grad_input, const Tensor& gra values.set_(r); } } else { - for(int64_t i=0; i #include #include +#include #include @@ -48,7 +49,7 @@ void convert_indices_from_coo_to_csr_cpu(const Tensor& result, const Tensor& inp at::parallel_for(0, numel - 1, GRAIN_SIZE, [&](int64_t start, int64_t end) { input_t curr_value = data_in[start], next_value; - for (int64_t i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { next_value = data_in[i + 1]; for (; curr_value < next_value; curr_value++) data_out[curr_value + 1] = static_cast(i + 1); diff --git a/aten/src/ATen/native/sparse/SparseMatMul.cpp b/aten/src/ATen/native/sparse/SparseMatMul.cpp index 2783f9e9000766..647d9788fd5504 100644 --- a/aten/src/ATen/native/sparse/SparseMatMul.cpp +++ b/aten/src/ATen/native/sparse/SparseMatMul.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace at { namespace native { @@ -30,7 +31,7 @@ void csr_to_coo(const int64_t n_row, const int64_t Ap[], int64_t Bi[]) { Output: `Bi` is the row indices */ - for (int64_t i = 0; i < n_row; i++) { + for (const auto i : c10::irange(n_row)) { for (int64_t jj = Ap[i]; jj < Ap[i + 1]; jj++) { Bi[jj] = i; } @@ 
-56,7 +57,7 @@ int64_t _csr_matmult_maxnnz( */ std::vector mask(n_col, -1); int64_t nnz = 0; - for (int64_t i = 0; i < n_row; i++) { + for (const auto i : c10::irange(n_row)) { int64_t row_nnz = 0; for (int64_t jj = Ap[i]; jj < Ap[i + 1]; jj++) { @@ -127,19 +128,19 @@ void _csr_matmult( Cp[0] = 0; - for (int64_t i = 0; i < n_row; i++) { + for (const auto i : c10::irange(n_row)) { int64_t head = -2; int64_t length = 0; int64_t jj_start = Ap[i]; int64_t jj_end = Ap[i + 1]; - for (int64_t jj = jj_start; jj < jj_end; jj++) { + for (const auto jj : c10::irange(jj_start, jj_end)) { int64_t j = Aj[jj]; scalar_t v = Ax[jj]; int64_t kk_start = Bp[j]; int64_t kk_end = Bp[j + 1]; - for (int64_t kk = kk_start; kk < kk_end; kk++) { + for (const auto kk : c10::irange(kk_start, kk_end)) { int64_t k = Bj[kk]; sums[k] += v * Bx[kk]; @@ -152,7 +153,8 @@ void _csr_matmult( } } - for (int64_t jj = 0; jj < length; jj++) { + for (const auto jj : c10::irange(length)) { + (void)jj; //Suppress unused variable warning Cj[nnz] = head; Cx[nnz] = sums[head]; nnz++; diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index fa60582a66840d..72b801a0089ec1 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -12,6 +12,7 @@ #include #include +#include namespace at { namespace native { @@ -229,7 +230,7 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, auto cpu_min_indices_accessor = cpu_min_indices.accessor(); auto cpu_computed_indices_sizes_accessor = cpu_computed_indices_sizes.accessor(); - for (int64_t d = 0; d < sparse_dim; d++) { + for (const auto d : c10::irange(sparse_dim)) { int64_t min_index_in_dim = cpu_min_indices_accessor[d]; TORCH_CHECK( min_index_in_dim >= 0, @@ -244,11 +245,11 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values_, // If the indices doesn't have elements in it, there is not enough // information to know what the 
minimum sparse dimension sizes should be, // and in this case we set them to 0 - for (int64_t d = 0; d < sparse_dim; d++) { + for (const auto d : c10::irange(sparse_dim)) { computed_sizes[static_cast(d)] = 0; } } - for (int64_t d = 0; d < dense_dim; d++) { + for (const auto d : c10::irange(dense_dim)) { computed_sizes[static_cast(sparse_dim + d)] = values.size(d + 1); } @@ -305,7 +306,7 @@ void _validate_sparse_coo_tensor_args( } auto cpu_min_indices_accessor = cpu_min_indices.accessor(); auto cpu_max_indices_accessor = cpu_max_indices.accessor(); - for (int64_t d = 0; d < sparse_dim; d++) { + for (const auto d : c10::irange(sparse_dim)) { // NB: This used to sync ndim times to access each entry; now we copy // everything to CPU first and then access it. int64_t min_index_in_dim = cpu_min_indices_accessor[d]; @@ -597,7 +598,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { int64_t blockSize = values.stride(0); scalar_t* values_ptr = values.data_ptr(); scalar_t* newValues_ptr = newValues.data_ptr(); - for (int64_t j = 0; j < nnz; j++) { + for (const auto j : c10::irange(nnz)) { int64_t pos = indicesPermutationAccessor[j]; int64_t curr = indicesBufferAccessor[j]; if (curr == prev) { @@ -613,7 +614,7 @@ SparseTensor _coalesce_sparse_cpu(const SparseTensor& self) { } } else { ++i; - for (int64_t d = 0; d < sparse_dim; d++) { + for (const auto d : c10::irange(sparse_dim)) { newIndicesAccessor[d][i] = indicesAccessor[d][pos]; } if (values.numel() > @@ -656,9 +657,9 @@ void inline sparse_mask_out_cpu_kernel( auto t_strides = t.strides(); at::parallel_for(0, r_nnz, 1000, [&](int64_t start, int64_t end) { - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { int64_t idx = 0; - for (int64_t d = 0; d < sparse_dim; d++) { + for (const auto d : c10::irange(sparse_dim)) { idx += mask_indices_accessor[d][i] * t_strides[d]; } r_values_accessor[i] = t_ptr[idx]; @@ -706,14 +707,14 @@ SparseTensor& sparse_mask_out_cpu( // ]. 
Keeping this implementation because it is faster than // flatten_indices() Tensor indices = at::zeros({mask._nnz()}, mask_indices.options()); - for (int64_t d = 0; d < mask.sparse_dim(); d++) { + for (const auto d : c10::irange(mask.sparse_dim())) { indices.mul_(mask.size(d)); indices.add_(mask_indices.select(0, d)); } std::vector view_size(1 + mask.dense_dim()); view_size[0] = -1; - for (int64_t d = 0; d < mask.dense_dim(); d++) { + for (const auto d : c10::irange(mask.dense_dim())) { view_size[d + 1] = mask.size(mask.sparse_dim() + d); } @@ -777,7 +778,7 @@ Tensor sparse_mask_helper_cpu( // Step 1: flatten the sparse indices `t._indices()` tensor and then map this // flatten value `index` to the original position `i` - for (int64_t i = 0; i < t_nnz; i++) { + for (const auto i : c10::irange(t_nnz)) { int64_t index = ti_flattened_indices.data_ptr()[i]; t_flatten_indices[index] = i; } @@ -802,7 +803,7 @@ Tensor sparse_mask_helper_cpu( const auto r_values_stride = r_values.strides()[0] * r_values.element_size(); const auto t_values_stride = t_v.strides()[0] * t_v.element_size(); - for (auto i = start; i < end; i++) { + for (const auto i : c10::irange(start, end)) { int64_t index = flattened_mask_indices.data_ptr()[i]; auto iter = t_flatten_indices.find(index); if (iter != t_flatten_indices.end()) { diff --git a/aten/src/ATen/native/sparse/SparseTensorMath.cpp b/aten/src/ATen/native/sparse/SparseTensorMath.cpp index 5051ce009e231a..6f0c4035c2e93a 100644 --- a/aten/src/ATen/native/sparse/SparseTensorMath.cpp +++ b/aten/src/ATen/native/sparse/SparseTensorMath.cpp @@ -700,9 +700,9 @@ Tensor& add_out_dense_sparse_cpu(Tensor& r, const Tensor& dense, const SparseTen // accessors rely on nnz test if (nDim > nDimI) { auto indices_accessor = indices.accessor(); - for (int64_t k = 0; k < sparse._nnz(); k++) { + for (const auto k : c10::irange(sparse._nnz())) { Tensor dstBuffer = resultBuffer; - for (int64_t d = 0; d < sparse.sparse_dim(); d++) { + for (const auto d : 
c10::irange(sparse.sparse_dim())) { dstBuffer = dstBuffer.select(0, indices_accessor[d][k]); } Tensor srcBuffer = valuesBuffer.select(0, k); @@ -1069,7 +1069,7 @@ SparseTensor& hspmm_out_sparse_cpu(const SparseTensor& sparse_, const Tensor& de auto indices_accessor = indices.accessor(); int64_t i = -1, prevIdx = -1; - for (int64_t j = 0; j < nnz; j++) { + for (const auto j : c10::irange(nnz)) { int64_t currIdx = valueIndices_accessor[j]; if (currIdx != prevIdx) { indices_accessor[0][++i] = currIdx; @@ -1185,10 +1185,10 @@ SparseTensor& _sspaddmm_out_cpu( scalar_t* newv_ptr = newv.data_ptr(); scalar_t cast_alpha = alpha.to(); - for (int64_t h = 0; h < dim_i; h++) { + for (const auto h : c10::irange(dim_i)) { int64_t i_start = csr_accessor[h]; int64_t i_end = csr_accessor[h+1]; - for (int64_t i = i_start; i < i_end; i++) { + for (const auto i : c10::irange(i_start, i_end)) { scalar_t val = values_accessor[i]; int64_t col = indices_accessor[1][i]; if (col >= 0 && col < dim_j) { @@ -1202,7 +1202,7 @@ SparseTensor& _sspaddmm_out_cpu( } // Fill up the indices with the right values if (i_start != i_end) { - for (int64_t i = 0; i < dim_k; i++) { + for (const auto i : c10::irange(dim_k)) { newi_accessor[0][p+i] = h; newi_accessor[1][p+i] = i; } @@ -1277,7 +1277,7 @@ Tensor _sparse_sum(const SparseTensor& input, IntArrayRef dims_to_sum) { auto dims_to_keep_v = std::vector(); auto dense_dims_to_sum_v = std::vector(); - for (int64_t d = 0; d < input_dim; d++) { + for (const auto d : c10::irange(input_dim)) { if (dims_to_sum_b[d]) { if (d >= sparse_dim) dense_dims_to_sum_v.emplace_back(d + 1 - sparse_dim); } diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp index 1a3650e6880af6..5705c19a4a1aeb 100644 --- a/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp +++ b/aten/src/ATen/native/sparse/cuda/SparseCUDATensor.cpp @@ -3,6 +3,7 @@ #include #include +#include namespace at { namespace native { @@ -34,7 
+35,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars // Get a flattened sparse indices, similar to NOTE [ Flatten Sparse Indices ]. // Keeping this implementation because it is faster than flatten_indices() Tensor indices = at::zeros({mask._nnz()}, mask_indices.options()); - for (int64_t d = 0; d < mask.sparse_dim(); d++) { + for (const auto d : c10::irange(mask.sparse_dim())) { indices.mul_(mask.size(d)); // This used to use a buffer but I deoptimized it indices.add_(mask_indices.select(0, d)); @@ -42,7 +43,7 @@ SparseTensor& sparse_mask_out_cuda(SparseTensor& r, const Tensor& t, const Spars std::vector view_size(1 + mask.dense_dim()); view_size[0] = -1; - for (int64_t d = 0; d < mask.dense_dim(); d++) { + for (const auto d : c10::irange(mask.dense_dim())) { view_size[d + 1] = mask.size(mask.sparse_dim() + d); } diff --git a/aten/src/ATen/native/utils/ParamsHash.h b/aten/src/ATen/native/utils/ParamsHash.h index 3b42b61b34eb37..76bb4de53d633b 100644 --- a/aten/src/ATen/native/utils/ParamsHash.h +++ b/aten/src/ATen/native/utils/ParamsHash.h @@ -17,7 +17,7 @@ struct ParamsHash { size_t operator()(const Params& params) const { auto ptr = reinterpret_cast(¶ms); uint32_t value = 0x811C9DC5; - for (int i = 0; i < (int)sizeof(Params); ++i) { + for (const auto i : c10::irange((int)sizeof(Params))) { value ^= ptr[i]; value *= 0x01000193; } diff --git a/aten/src/ATen/native/vulkan/Vulkan.cpp b/aten/src/ATen/native/vulkan/Vulkan.cpp index 4b86f0b9cd816c..6d253206bafd78 100644 --- a/aten/src/ATen/native/vulkan/Vulkan.cpp +++ b/aten/src/ATen/native/vulkan/Vulkan.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #ifdef USE_VULKAN_WRAPPER #include @@ -192,7 +193,7 @@ uint32_t VContext::getComputeQueueFamilyIndex() { vkGetPhysicalDeviceQueueFamilyProperties( physicalDevice_, &queueFamilyCount, queueFamilies.data()); - for (uint32_t i = 0; i < queueFamilies.size(); ++i) { + for (const auto i : c10::irange(queueFamilies.size())) { 
VkQueueFamilyProperties props = queueFamilies[i]; if (props.queueCount > 0 && (props.queueFlags & VK_QUEUE_COMPUTE_BIT)) { return i; @@ -274,7 +275,7 @@ uint32_t findMemoryType( const VkMemoryPropertyFlags properties) { VkPhysicalDeviceMemoryProperties memoryProperties{}; vkGetPhysicalDeviceMemoryProperties(physicalDevice, &memoryProperties); - for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; ++i) { + for (const auto i : c10::irange(memoryProperties.memoryTypeCount)) { if ((memoryTypeBits & (1 << i)) && ((memoryProperties.memoryTypes[i].propertyFlags & properties) == properties)) { diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 1693d9616b46ca..768ce081b353a7 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace at { namespace native { @@ -265,13 +266,13 @@ Tensor cat(const TensorList tensors, int64_t dim) { int64_t cat_dim_size = 0; std::vector vTensors{}; - for (int i = 0; i < tensors.size(); ++i) { + for (const auto i : c10::irange(tensors.size())) { const auto& t = tensors[i]; TORCH_INTERNAL_ASSERT( t.dim() == 4, "Vulkan cat expects 4 dimensional inputs"); TORCH_INTERNAL_ASSERT(t.is_vulkan(), "Vulkan cat expects Vulkan inputs"); - for (int d = 0; d < 4; ++d) { + for (const auto d : c10::irange(4)) { if (d == dim) { continue; } diff --git a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp index ba5d8bc88c58ba..7cbd7479e256f3 100644 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ b/aten/src/ATen/native/vulkan/VulkanOps.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -629,17 +630,17 @@ VBuffer kernelNCHW_OCHW_repack_O4C4HWi4o4( memset(basePtr, 0, size); const float* src = weights; int ridx = 0; - for (int oc = 0; oc < OC; ++oc) { + for (const auto oc : c10::irange(OC)) { int oc_4 = oc / 4; int oc_4_i = oc % 4; 
float* dst_oc = basePtr + oc_4 * oc_4SizeNumel; - for (int ic = 0; ic < C; ++ic) { + for (const auto ic : c10::irange(C)) { int ic_4 = ic / 4; int ic_4_i = ic % 4; float* dst_ic = dst_oc + ic_4 * KW * KH * 16; - for (int ky = 0; ky < KH; ++ky) { + for (const auto ky : c10::irange(KH)) { float* dst_ky = dst_ic + ky * KW * 16; - for (int kx = 0; kx < KW; ++kx) { + for (const auto kx : c10::irange(KW)) { float* dst_kx = dst_ky + kx * 16; dst_kx[4 * ic_4_i + oc_4_i] = src[ridx++]; } diff --git a/aten/src/ATen/native/vulkan/api/Runtime.cpp b/aten/src/ATen/native/vulkan/api/Runtime.cpp index dd2de28d593c2b..b40466858f9cc2 100644 --- a/aten/src/ATen/native/vulkan/api/Runtime.cpp +++ b/aten/src/ATen/native/vulkan/api/Runtime.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -244,7 +245,7 @@ uint32_t query_compute_queue_family_index(const VkPhysicalDevice physical_device &queue_family_count, queue_families_properties.data()); - for (uint32_t i = 0; i < queue_families_properties.size(); ++i) { + for (const auto i : c10::irange(queue_families_properties.size())) { const VkQueueFamilyProperties& properties = queue_families_properties[i]; if (properties.queueCount > 0 && (properties.queueFlags & VK_QUEUE_COMPUTE_BIT)) { return i; diff --git a/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h b/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h index 5e3a227a917ba0..e75cd56979be7d 100644 --- a/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h +++ b/aten/src/ATen/native/vulkan/api/vk_mem_alloc.h @@ -1,19320 +1,19258 @@ -// -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. -// - -#ifndef AMD_VULKAN_MEMORY_ALLOCATOR_H -#define AMD_VULKAN_MEMORY_ALLOCATOR_H - -/** \mainpage Vulkan Memory Allocator - -Version 3.0.0-development (2021-02-16) - -Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. 
\n -License: MIT - -Documentation of all members: vk_mem_alloc.h - -\section main_table_of_contents Table of contents - -- User guide - - \subpage quick_start - - [Project setup](@ref quick_start_project_setup) - - [Initialization](@ref quick_start_initialization) - - [Resource allocation](@ref quick_start_resource_allocation) - - \subpage choosing_memory_type - - [Usage](@ref choosing_memory_type_usage) - - [Required and preferred flags](@ref choosing_memory_type_required_preferred_flags) - - [Explicit memory types](@ref choosing_memory_type_explicit_memory_types) - - [Custom memory pools](@ref choosing_memory_type_custom_memory_pools) - - [Dedicated allocations](@ref choosing_memory_type_dedicated_allocations) - - \subpage memory_mapping - - [Mapping functions](@ref memory_mapping_mapping_functions) - - [Persistently mapped memory](@ref memory_mapping_persistently_mapped_memory) - - [Cache flush and invalidate](@ref memory_mapping_cache_control) - - [Finding out if memory is mappable](@ref memory_mapping_finding_if_memory_mappable) - - \subpage staying_within_budget - - [Querying for budget](@ref staying_within_budget_querying_for_budget) - - [Controlling memory usage](@ref staying_within_budget_controlling_memory_usage) - - \subpage resource_aliasing - - \subpage custom_memory_pools - - [Choosing memory type index](@ref custom_memory_pools_MemTypeIndex) - - [Linear allocation algorithm](@ref linear_algorithm) - - [Free-at-once](@ref linear_algorithm_free_at_once) - - [Stack](@ref linear_algorithm_stack) - - [Double stack](@ref linear_algorithm_double_stack) - - [Ring buffer](@ref linear_algorithm_ring_buffer) - - [Buddy allocation algorithm](@ref buddy_algorithm) - - \subpage defragmentation - - [Defragmenting CPU memory](@ref defragmentation_cpu) - - [Defragmenting GPU memory](@ref defragmentation_gpu) - - [Additional notes](@ref defragmentation_additional_notes) - - [Writing custom allocation algorithm](@ref defragmentation_custom_algorithm) - - \subpage 
lost_allocations - - \subpage statistics - - [Numeric statistics](@ref statistics_numeric_statistics) - - [JSON dump](@ref statistics_json_dump) - - \subpage allocation_annotation - - [Allocation user data](@ref allocation_user_data) - - [Allocation names](@ref allocation_names) - - \subpage debugging_memory_usage - - [Memory initialization](@ref debugging_memory_usage_initialization) - - [Margins](@ref debugging_memory_usage_margins) - - [Corruption detection](@ref debugging_memory_usage_corruption_detection) - - \subpage record_and_replay -- \subpage usage_patterns - - [Common mistakes](@ref usage_patterns_common_mistakes) - - [Simple patterns](@ref usage_patterns_simple) - - [Advanced patterns](@ref usage_patterns_advanced) -- \subpage configuration - - [Pointers to Vulkan functions](@ref config_Vulkan_functions) - - [Custom host memory allocator](@ref custom_memory_allocator) - - [Device memory allocation callbacks](@ref allocation_callbacks) - - [Device heap memory limit](@ref heap_memory_limit) - - \subpage vk_khr_dedicated_allocation - - \subpage enabling_buffer_device_address - - \subpage vk_amd_device_coherent_memory -- \subpage general_considerations - - [Thread safety](@ref general_considerations_thread_safety) - - [Validation layer warnings](@ref general_considerations_validation_layer_warnings) - - [Allocation algorithm](@ref general_considerations_allocation_algorithm) - - [Features not supported](@ref general_considerations_features_not_supported) - -\section main_see_also See also - -- [Product page on GPUOpen](https://gpuopen.com/gaming-product/vulkan-memory-allocator/) -- [Source repository on GitHub](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator) - - - - -\page quick_start Quick start - -\section quick_start_project_setup Project setup - -Vulkan Memory Allocator comes in form of a "stb-style" single header file. -You don't need to build it as a separate library project. 
-You can add this file directly to your project and submit it to code repository next to your other source files. - -"Single header" doesn't mean that everything is contained in C/C++ declarations, -like it tends to be in case of inline functions or C++ templates. -It means that implementation is bundled with interface in a single file and needs to be extracted using preprocessor macro. -If you don't do it properly, you will get linker errors. - -To do it properly: - --# Include "vk_mem_alloc.h" file in each CPP file where you want to use the library. - This includes declarations of all members of the library. --# In exactly one CPP file define following macro before this include. - It enables also internal definitions. - -\code -#define VMA_IMPLEMENTATION -#include "vk_mem_alloc.h" -\endcode - -It may be a good idea to create dedicated CPP file just for this purpose. - -Note on language: This library is written in C++, but has C-compatible interface. -Thus you can include and use vk_mem_alloc.h in C or C++ code, but full -implementation with `VMA_IMPLEMENTATION` macro must be compiled as C++, NOT as C. - -Please note that this library includes header `<vulkan/vulkan.h>`, which in turn -includes `<windows.h>` on Windows. If you need some specific macros defined -before including these headers (like `WIN32_LEAN_AND_MEAN` or -`WINVER` for Windows, `VK_USE_PLATFORM_WIN32_KHR` for Vulkan), you must define -them before every `#include` of this library. - -You may need to configure the way you import Vulkan functions. - -- By default, VMA assumes you link statically with Vulkan API. If this is not the case, - `#define VMA_STATIC_VULKAN_FUNCTIONS 0` before `#include` of the VMA implementation and use another way. -- You can `#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1` and make sure `vkGetInstanceProcAddr` and `vkGetDeviceProcAddr` globals are defined. - All the remaining Vulkan functions will be fetched automatically.
-- Finally, you can provide your own pointers to all Vulkan functions needed by VMA using structure member - VmaAllocatorCreateInfo::pVulkanFunctions, if you fetched them in some custom way e.g. using some loader like [Volk](https://github.com/zeux/volk). - - -\section quick_start_initialization Initialization - -At program startup: - --# Initialize Vulkan to have `VkPhysicalDevice`, `VkDevice` and `VkInstance` objects. --# Fill VmaAllocatorCreateInfo structure and create #VmaAllocator object by - calling vmaCreateAllocator(). - -\code -VmaAllocatorCreateInfo allocatorInfo = {}; -allocatorInfo.vulkanApiVersion = VK_API_VERSION_1_2; -allocatorInfo.physicalDevice = physicalDevice; -allocatorInfo.device = device; -allocatorInfo.instance = instance; - -VmaAllocator allocator; -vmaCreateAllocator(&allocatorInfo, &allocator); -\endcode - -Only members `physicalDevice`, `device`, `instance` are required. -However, you should inform the library which Vulkan version you use by setting -VmaAllocatorCreateInfo::vulkanApiVersion and which extensions you enabled -by setting VmaAllocatorCreateInfo::flags (like #VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT for VK_KHR_buffer_device_address). -Otherwise, VMA would use only features of Vulkan 1.0 core with no extensions. - - -\section quick_start_resource_allocation Resource allocation - -When you want to create a buffer or image: - --# Fill `VkBufferCreateInfo` / `VkImageCreateInfo` structure. --# Fill VmaAllocationCreateInfo structure. --# Call vmaCreateBuffer() / vmaCreateImage() to get `VkBuffer`/`VkImage` with memory - already allocated and bound to it.
- -\code -VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -bufferInfo.size = 65536; -bufferInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - -VmaAllocationCreateInfo allocInfo = {}; -allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; - -VkBuffer buffer; -VmaAllocation allocation; -vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); -\endcode - -Don't forget to destroy your objects when no longer needed: - -\code -vmaDestroyBuffer(allocator, buffer, allocation); -vmaDestroyAllocator(allocator); -\endcode - - -\page choosing_memory_type Choosing memory type - -Physical devices in Vulkan support various combinations of memory heaps and -types. Help with choosing correct and optimal memory type for your specific -resource is one of the key features of this library. You can use it by filling -appropriate members of VmaAllocationCreateInfo structure, as described below. -You can also combine multiple methods. - --# If you just want to find memory type index that meets your requirements, you - can use function: vmaFindMemoryTypeIndex(), vmaFindMemoryTypeIndexForBufferInfo(), - vmaFindMemoryTypeIndexForImageInfo(). --# If you want to allocate a region of device memory without association with any - specific image or buffer, you can use function vmaAllocateMemory(). Usage of - this function is not recommended and usually not needed. - vmaAllocateMemoryPages() function is also provided for creating multiple allocations at once, - which may be useful for sparse binding. --# If you already have a buffer or an image created, you want to allocate memory - for it and then you will bind it yourself, you can use function - vmaAllocateMemoryForBuffer(), vmaAllocateMemoryForImage(). - For binding you should use functions: vmaBindBufferMemory(), vmaBindImageMemory() - or their extended versions: vmaBindBufferMemory2(), vmaBindImageMemory2(). 
--# If you want to create a buffer or an image, allocate memory for it and bind - them together, all in one call, you can use function vmaCreateBuffer(), - vmaCreateImage(). This is the easiest and recommended way to use this library. - -When using 3. or 4., the library internally queries Vulkan for memory types -supported for that buffer or image (function `vkGetBufferMemoryRequirements()`) -and uses only one of these types. - -If no memory type can be found that meets all the requirements, these functions -return `VK_ERROR_FEATURE_NOT_PRESENT`. - -You can leave VmaAllocationCreateInfo structure completely filled with zeros. -It means no requirements are specified for memory type. -It is valid, although not very useful. - -\section choosing_memory_type_usage Usage - -The easiest way to specify memory requirements is to fill member -VmaAllocationCreateInfo::usage using one of the values of enum #VmaMemoryUsage. -It defines high level, common usage types. -For more details, see description of this enum. - -For example, if you want to create a uniform buffer that will be filled using -transfer only once or infrequently and used for rendering every frame, you can -do it using following code: - -\code -VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -bufferInfo.size = 65536; -bufferInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - -VmaAllocationCreateInfo allocInfo = {}; -allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; - -VkBuffer buffer; -VmaAllocation allocation; -vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); -\endcode - -\section choosing_memory_type_required_preferred_flags Required and preferred flags - -You can specify more detailed requirements by filling members -VmaAllocationCreateInfo::requiredFlags and VmaAllocationCreateInfo::preferredFlags -with a combination of bits from enum `VkMemoryPropertyFlags`. 
For example, -if you want to create a buffer that will be persistently mapped on host (so it -must be `HOST_VISIBLE`) and preferably will also be `HOST_COHERENT` and `HOST_CACHED`, -use following code: - -\code -VmaAllocationCreateInfo allocInfo = {}; -allocInfo.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; -allocInfo.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; -allocInfo.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - -VkBuffer buffer; -VmaAllocation allocation; -vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); -\endcode - -A memory type is chosen that has all the required flags and as many preferred -flags set as possible. - -If you use VmaAllocationCreateInfo::usage, it is just internally converted to -a set of required and preferred flags. - -\section choosing_memory_type_explicit_memory_types Explicit memory types - -If you inspected memory types available on the physical device and you have -a preference for memory types that you want to use, you can fill member -VmaAllocationCreateInfo::memoryTypeBits. It is a bit mask, where each bit set -means that a memory type with that index is allowed to be used for the -allocation. Special value 0, just like `UINT32_MAX`, means there are no -restrictions to memory type index. - -Please note that this member is NOT just a memory type index. -Still you can use it to choose just one, specific memory type. 
-For example, if you already determined that your buffer should be created in -memory type 2, use following code: - -\code -uint32_t memoryTypeIndex = 2; - -VmaAllocationCreateInfo allocInfo = {}; -allocInfo.memoryTypeBits = 1u << memoryTypeIndex; - -VkBuffer buffer; -VmaAllocation allocation; -vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); -\endcode - - -\section choosing_memory_type_custom_memory_pools Custom memory pools - -If you allocate from custom memory pool, all the ways of specifying memory -requirements described above are not applicable and the aforementioned members -of VmaAllocationCreateInfo structure are ignored. Memory type is selected -explicitly when creating the pool and then used to make all the allocations from -that pool. For further details, see \ref custom_memory_pools. - -\section choosing_memory_type_dedicated_allocations Dedicated allocations - -Memory for allocations is reserved out of larger block of `VkDeviceMemory` -allocated from Vulkan internally. That's the main feature of this whole library. -You can still request a separate memory block to be created for an allocation, -just like you would do in a trivial solution without using any allocator. -In that case, a buffer or image is always bound to that memory at offset 0. -This is called a "dedicated allocation". -You can explicitly request it by using flag #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. -The library can also internally decide to use dedicated allocation in some cases, e.g.: - -- When the size of the allocation is large. -- When [VK_KHR_dedicated_allocation](@ref vk_khr_dedicated_allocation) extension is enabled - and it reports that dedicated allocation is required or recommended for the resource. -- When allocation of next big memory block fails due to not enough device memory, - but allocation with the exact requested size succeeds. 
- - -\page memory_mapping Memory mapping - -To "map memory" in Vulkan means to obtain a CPU pointer to `VkDeviceMemory`, -to be able to read from it or write to it in CPU code. -Mapping is possible only of memory allocated from a memory type that has -`VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` flag. -Functions `vkMapMemory()`, `vkUnmapMemory()` are designed for this purpose. -You can use them directly with memory allocated by this library, -but it is not recommended because of following issue: -Mapping the same `VkDeviceMemory` block multiple times is illegal - only one mapping at a time is allowed. -This includes mapping disjoint regions. Mapping is not reference-counted internally by Vulkan. -Because of this, Vulkan Memory Allocator provides following facilities: - -\section memory_mapping_mapping_functions Mapping functions - -The library provides following functions for mapping of a specific #VmaAllocation: vmaMapMemory(), vmaUnmapMemory(). -They are safer and more convenient to use than standard Vulkan functions. -You can map an allocation multiple times simultaneously - mapping is reference-counted internally. -You can also map different allocations simultaneously regardless of whether they use the same `VkDeviceMemory` block. -The way it's implemented is that the library always maps entire memory block, not just region of the allocation. -For further details, see description of vmaMapMemory() function. -Example: - -\code -// Having these objects initialized: - -struct ConstantBuffer -{ - ... 
-}; -ConstantBuffer constantBufferData; - -VmaAllocator allocator; -VkBuffer constantBuffer; -VmaAllocation constantBufferAllocation; - -// You can map and fill your buffer using following code: - -void* mappedData; -vmaMapMemory(allocator, constantBufferAllocation, &mappedData); -memcpy(mappedData, &constantBufferData, sizeof(constantBufferData)); -vmaUnmapMemory(allocator, constantBufferAllocation); -\endcode - -When mapping, you may see a warning from Vulkan validation layer similar to this one: - -Mapping an image with layout VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL can result in undefined behavior if this memory is used by the device. Only GENERAL or PREINITIALIZED should be used. - -It happens because the library maps entire `VkDeviceMemory` block, where different -types of images and buffers may end up together, especially on GPUs with unified memory like Intel. -You can safely ignore it if you are sure you access only memory of the intended -object that you wanted to map. - - -\section memory_mapping_persistently_mapped_memory Persistently mapped memory - -Keeping your memory persistently mapped is generally OK in Vulkan. -You don't need to unmap it before using its data on the GPU. -The library provides a special feature designed for that: -Allocations made with #VMA_ALLOCATION_CREATE_MAPPED_BIT flag set in -VmaAllocationCreateInfo::flags stay mapped all the time, -so you can just access CPU pointer to it any time -without a need to call any "map" or "unmap" function.
-Example: - -\code -VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -bufCreateInfo.size = sizeof(ConstantBuffer); -bufCreateInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT; - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_CPU_ONLY; -allocCreateInfo.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - -VkBuffer buf; -VmaAllocation alloc; -VmaAllocationInfo allocInfo; -vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); - -// Buffer is already mapped. You can access its memory. -memcpy(allocInfo.pMappedData, &constantBufferData, sizeof(constantBufferData)); -\endcode - -There are some exceptions though, when you should consider mapping memory only for a short period of time: - -- When operating system is Windows 7 or 8.x (Windows 10 is not affected because it uses WDDM2), - device is discrete AMD GPU, - and memory type is the special 256 MiB pool of `DEVICE_LOCAL + HOST_VISIBLE` memory - (selected when you use #VMA_MEMORY_USAGE_CPU_TO_GPU), - then whenever a memory block allocated from this memory type stays mapped - for the time of any call to `vkQueueSubmit()` or `vkQueuePresentKHR()`, this - block is migrated by WDDM to system RAM, which degrades performance. It doesn't - matter if that particular memory block is actually used by the command buffer - being submitted. -- On Mac/MoltenVK there is a known bug - [Issue #175](https://github.com/KhronosGroup/MoltenVK/issues/175) - which requires unmapping before GPU can see updated texture. -- Keeping many large memory blocks mapped may impact performance or stability of some debugging tools. 
- -\section memory_mapping_cache_control Cache flush and invalidate - -Memory in Vulkan doesn't need to be unmapped before using it on GPU, -but unless a memory type has `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` flag set, -you need to manually **invalidate** cache before reading of mapped pointer -and **flush** cache after writing to mapped pointer. -Map/unmap operations don't do that automatically. -Vulkan provides following functions for this purpose `vkFlushMappedMemoryRanges()`, -`vkInvalidateMappedMemoryRanges()`, but this library provides more convenient -functions that refer to given allocation object: vmaFlushAllocation(), -vmaInvalidateAllocation(), -or multiple objects at once: vmaFlushAllocations(), vmaInvalidateAllocations(). - -Regions of memory specified for flush/invalidate must be aligned to -`VkPhysicalDeviceLimits::nonCoherentAtomSize`. This is automatically ensured by the library. -In any memory type that is `HOST_VISIBLE` but not `HOST_COHERENT`, all allocations -within blocks are aligned to this value, so their offsets are always a multiple of -`nonCoherentAtomSize` and two different allocations never share same "line" of this size. - -Please note that memory allocated with #VMA_MEMORY_USAGE_CPU_ONLY is guaranteed to be `HOST_COHERENT`. - -Also, Windows drivers from all 3 **PC** GPU vendors (AMD, Intel, NVIDIA) -currently provide `HOST_COHERENT` flag on all memory types that are -`HOST_VISIBLE`, so on this platform you may not need to bother. - -\section memory_mapping_finding_if_memory_mappable Finding out if memory is mappable - -It may happen that your allocation ends up in memory that is `HOST_VISIBLE` (available for mapping) -even though it wasn't explicitly requested. -For example, application may work on integrated graphics with unified memory (like Intel) or -allocation from video memory might have failed, so the library chose system memory as fallback.
- -You can detect this case and map such allocation to access its memory on CPU directly, -instead of launching a transfer operation. -In order to do that: inspect `allocInfo.memoryType`, call vmaGetMemoryTypeProperties(), -and look for `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` flag in properties of that memory type. - -\code -VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -bufCreateInfo.size = sizeof(ConstantBuffer); -bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; -allocCreateInfo.preferredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - -VkBuffer buf; -VmaAllocation alloc; -VmaAllocationInfo allocInfo; -vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); - -VkMemoryPropertyFlags memFlags; -vmaGetMemoryTypeProperties(allocator, allocInfo.memoryType, &memFlags); -if((memFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) -{ - // Allocation ended up in mappable memory. You can map it and access it directly. - void* mappedData; - vmaMapMemory(allocator, alloc, &mappedData); - memcpy(mappedData, &constantBufferData, sizeof(constantBufferData)); - vmaUnmapMemory(allocator, alloc); -} -else -{ - // Allocation ended up in non-mappable memory. - // You need to create CPU-side buffer in VMA_MEMORY_USAGE_CPU_ONLY and make a transfer. -} -\endcode - -You can even use #VMA_ALLOCATION_CREATE_MAPPED_BIT flag while creating allocations -that are not necessarily `HOST_VISIBLE` (e.g. using #VMA_MEMORY_USAGE_GPU_ONLY). -If the allocation ends up in memory type that is `HOST_VISIBLE`, it will be persistently mapped and you can use it directly. -If not, the flag is just ignored. 
-Example: - -\code -VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -bufCreateInfo.size = sizeof(ConstantBuffer); -bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; -allocCreateInfo.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; - -VkBuffer buf; -VmaAllocation alloc; -VmaAllocationInfo allocInfo; -vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); - -if(allocInfo.pMappedData != nullptr) -{ - // Allocation ended up in mappable memory. - // It's persistently mapped. You can access it directly. - memcpy(allocInfo.pMappedData, &constantBufferData, sizeof(constantBufferData)); -} -else -{ - // Allocation ended up in non-mappable memory. - // You need to create CPU-side buffer in VMA_MEMORY_USAGE_CPU_ONLY and make a transfer. -} -\endcode - - -\page staying_within_budget Staying within budget - -When developing a graphics-intensive game or program, it is important to avoid allocating -more GPU memory than it's physically available. When the memory is over-committed, -various bad things can happen, depending on the specific GPU, graphics driver, and -operating system: - -- It may just work without any problems. -- The application may slow down because some memory blocks are moved to system RAM - and the GPU has to access them through PCI Express bus. -- A new allocation may take very long time to complete, even few seconds, and possibly - freeze entire system. -- The new allocation may fail with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. -- It may even result in GPU crash (TDR), observed as `VK_ERROR_DEVICE_LOST` - returned somewhere later. - -\section staying_within_budget_querying_for_budget Querying for budget - -To query for current memory usage and available budget, use function vmaGetBudget(). 
-Returned structure #VmaBudget contains quantities expressed in bytes, per Vulkan memory heap. - -Please note that this function returns different information and works faster than -vmaCalculateStats(). vmaGetBudget() can be called every frame or even before every -allocation, while vmaCalculateStats() is intended to be used rarely, -only to obtain statistical information, e.g. for debugging purposes. - -It is recommended to use VK_EXT_memory_budget device extension to obtain information -about the budget from Vulkan device. VMA is able to use this extension automatically. -When not enabled, the allocator behaves same way, but then it estimates current usage -and available budget based on its internal information and Vulkan memory heap sizes, -which may be less precise. In order to use this extension: - -1. Make sure extensions VK_EXT_memory_budget and VK_KHR_get_physical_device_properties2 - required by it are available and enable them. Please note that the first is a device - extension and the second is instance extension! -2. Use flag #VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT when creating #VmaAllocator object. -3. Make sure to call vmaSetCurrentFrameIndex() every frame. Budget is queried from - Vulkan inside of it to avoid overhead of querying it with every allocation. - -\section staying_within_budget_controlling_memory_usage Controlling memory usage - -There are many ways in which you can try to stay within the budget. - -First, when making new allocation requires allocating a new memory block, the library -tries not to exceed the budget automatically. If a block with default recommended size -(e.g. 256 MB) would go over budget, a smaller block is allocated, possibly even -dedicated memory for just this resource. - -If the size of the requested resource plus current memory usage is more than the -budget, by default the library still tries to create it, leaving it to the Vulkan -implementation whether the allocation succeeds or fails. 
You can change this behavior -by using #VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT flag. With it, the allocation is -not made if it would exceed the budget or if the budget is already exceeded. -Some other allocations become lost instead to make room for it, if the mechanism of -[lost allocations](@ref lost_allocations) is used. -If that is not possible, the allocation fails with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. -Example usage pattern may be to pass the #VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT flag -when creating resources that are not essential for the application (e.g. the texture -of a specific object) and not to pass it when creating critically important resources -(e.g. render targets). - -Finally, you can also use #VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT flag to make sure -a new allocation is created only when it fits inside one of the existing memory blocks. -If it would require to allocate a new block, it fails instead with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. -This also ensures that the function call is very fast because it never goes to Vulkan -to obtain a new block. - -Please note that creating \ref custom_memory_pools with VmaPoolCreateInfo::minBlockCount -set to more than 0 will try to allocate memory blocks without checking whether they -fit within budget. - - -\page resource_aliasing Resource aliasing (overlap) - -New explicit graphics APIs (Vulkan and Direct3D 12), thanks to manual memory -management, give an opportunity to alias (overlap) multiple resources in the -same region of memory - a feature not available in the old APIs (Direct3D 11, OpenGL). -It can be useful to save video memory, but it must be used with caution.
- -For example, if you know the flow of your whole render frame in advance, you -are going to use some intermediate textures or buffers only during a small range of render passes, -and you know these ranges don't overlap in time, you can bind these resources to -the same place in memory, even if they have completely different parameters (width, height, format etc.). - -![Resource aliasing (overlap)](../gfx/Aliasing.png) - -Such scenario is possible using VMA, but you need to create your images manually. -Then you need to calculate parameters of an allocation to be made using formula: - -- allocation size = max(size of each image) -- allocation alignment = max(alignment of each image) -- allocation memoryTypeBits = bitwise AND(memoryTypeBits of each image) - -Following example shows two different images bound to the same place in memory, -allocated to fit largest of them. - -\code -// A 512x512 texture to be sampled. -VkImageCreateInfo img1CreateInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; -img1CreateInfo.imageType = VK_IMAGE_TYPE_2D; -img1CreateInfo.extent.width = 512; -img1CreateInfo.extent.height = 512; -img1CreateInfo.extent.depth = 1; -img1CreateInfo.mipLevels = 10; -img1CreateInfo.arrayLayers = 1; -img1CreateInfo.format = VK_FORMAT_R8G8B8A8_SRGB; -img1CreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; -img1CreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; -img1CreateInfo.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; -img1CreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; - -// A full screen texture to be used as color attachment. 
-VkImageCreateInfo img2CreateInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; -img2CreateInfo.imageType = VK_IMAGE_TYPE_2D; -img2CreateInfo.extent.width = 1920; -img2CreateInfo.extent.height = 1080; -img2CreateInfo.extent.depth = 1; -img2CreateInfo.mipLevels = 1; -img2CreateInfo.arrayLayers = 1; -img2CreateInfo.format = VK_FORMAT_R8G8B8A8_UNORM; -img2CreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; -img2CreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; -img2CreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; -img2CreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; - -VkImage img1; -res = vkCreateImage(device, &img1CreateInfo, nullptr, &img1); -VkImage img2; -res = vkCreateImage(device, &img2CreateInfo, nullptr, &img2); - -VkMemoryRequirements img1MemReq; -vkGetImageMemoryRequirements(device, img1, &img1MemReq); -VkMemoryRequirements img2MemReq; -vkGetImageMemoryRequirements(device, img2, &img2MemReq); - -VkMemoryRequirements finalMemReq = {}; -finalMemReq.size = std::max(img1MemReq.size, img2MemReq.size); -finalMemReq.alignment = std::max(img1MemReq.alignment, img2MemReq.alignment); -finalMemReq.memoryTypeBits = img1MemReq.memoryTypeBits & img2MemReq.memoryTypeBits; -// Validate if(finalMemReq.memoryTypeBits != 0) - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; - -VmaAllocation alloc; -res = vmaAllocateMemory(allocator, &finalMemReq, &allocCreateInfo, &alloc, nullptr); - -res = vmaBindImageMemory(allocator, alloc, img1); -res = vmaBindImageMemory(allocator, alloc, img2); - -// You can use img1, img2 here, but not at the same time! - -vmaFreeMemory(allocator, alloc); -vkDestroyImage(device, img2, nullptr); -vkDestroyImage(device, img1, nullptr); -\endcode - -Remember that using resources that alias in memory requires proper synchronization. -You need to issue a memory barrier to make sure commands that use `img1` and `img2` -don't overlap on GPU timeline.
-You also need to treat a resource after aliasing as uninitialized - containing garbage data. -For example, if you use `img1` and then want to use `img2`, you need to issue -an image memory barrier for `img2` with `oldLayout` = `VK_IMAGE_LAYOUT_UNDEFINED`. - -Additional considerations: - -- Vulkan also allows to interpret contents of memory between aliasing resources consistently in some cases. -See chapter 11.8. "Memory Aliasing" of Vulkan specification or `VK_IMAGE_CREATE_ALIAS_BIT` flag. -- You can create more complex layout where different images and buffers are bound -at different offsets inside one large allocation. For example, one can imagine -a big texture used in some render passes, aliasing with a set of many small buffers -used between in some further passes. To bind a resource at non-zero offset of an allocation, -use vmaBindBufferMemory2() / vmaBindImageMemory2(). -- Before allocating memory for the resources you want to alias, check `memoryTypeBits` -returned in memory requirements of each resource to make sure the bits overlap. -Some GPUs may expose multiple memory types suitable e.g. only for buffers or -images with `COLOR_ATTACHMENT` usage, so the sets of memory types supported by your -resources may be disjoint. Aliasing them is not possible in that case. - - -\page custom_memory_pools Custom memory pools - -A memory pool contains a number of `VkDeviceMemory` blocks. -The library automatically creates and manages default pool for each memory type available on the device. -Default memory pool automatically grows in size. -Size of allocated blocks is also variable and managed automatically. - -You can create custom pool and allocate memory out of it. -It can be useful if you want to: - -- Keep certain kind of allocations separate from others. -- Enforce particular, fixed size of Vulkan memory blocks. -- Limit maximum amount of Vulkan memory allocated for that pool. 
-- Reserve minimum or fixed amount of Vulkan memory always preallocated for that pool. - -To use custom memory pools: - --# Fill VmaPoolCreateInfo structure. --# Call vmaCreatePool() to obtain #VmaPool handle. --# When making an allocation, set VmaAllocationCreateInfo::pool to this handle. - You don't need to specify any other parameters of this structure, like `usage`. - -Example: - -\code -// Create a pool that can have at most 2 blocks, 128 MiB each. -VmaPoolCreateInfo poolCreateInfo = {}; -poolCreateInfo.memoryTypeIndex = ... -poolCreateInfo.blockSize = 128ull * 1024 * 1024; -poolCreateInfo.maxBlockCount = 2; - -VmaPool pool; -vmaCreatePool(allocator, &poolCreateInfo, &pool); - -// Allocate a buffer out of it. -VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -bufCreateInfo.size = 1024; -bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.pool = pool; - -VkBuffer buf; -VmaAllocation alloc; -VmaAllocationInfo allocInfo; -vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); -\endcode - -You have to free all allocations made from this pool before destroying it. - -\code -vmaDestroyBuffer(allocator, buf, alloc); -vmaDestroyPool(allocator, pool); -\endcode - -\section custom_memory_pools_MemTypeIndex Choosing memory type index - -When creating a pool, you must explicitly specify memory type index. -To find the one suitable for your buffers or images, you can use helper functions -vmaFindMemoryTypeIndexForBufferInfo(), vmaFindMemoryTypeIndexForImageInfo(). -You need to provide structures with example parameters of buffers or images -that you are going to create in that pool. - -\code -VkBufferCreateInfo exampleBufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -exampleBufCreateInfo.size = 1024; // Whatever. 
-exampleBufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; // Change if needed. - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; // Change if needed. - -uint32_t memTypeIndex; -vmaFindMemoryTypeIndexForBufferInfo(allocator, &exampleBufCreateInfo, &allocCreateInfo, &memTypeIndex); - -VmaPoolCreateInfo poolCreateInfo = {}; -poolCreateInfo.memoryTypeIndex = memTypeIndex; -// ... -\endcode - -When creating buffers/images allocated in that pool, provide following parameters: - -- `VkBufferCreateInfo`: Prefer to pass same parameters as above. - Otherwise you risk creating resources in a memory type that is not suitable for them, which may result in undefined behavior. - Using different `VK_BUFFER_USAGE_` flags may work, but you shouldn't create images in a pool intended for buffers - or the other way around. -- VmaAllocationCreateInfo: You don't need to pass same parameters. Fill only `pool` member. - Other members are ignored anyway. - -\section linear_algorithm Linear allocation algorithm - -Each Vulkan memory block managed by this library has accompanying metadata that -keeps track of used and unused regions. By default, the metadata structure and -algorithm tries to find best place for new allocations among free regions to -optimize memory usage. This way you can allocate and free objects in any order. - -![Default allocation algorithm](../gfx/Linear_allocator_1_algo_default.png) - -Sometimes there is a need to use simpler, linear allocation algorithm. You can -create custom pool that uses such algorithm by adding flag -#VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT to VmaPoolCreateInfo::flags while creating -#VmaPool object. Then an alternative metadata management is used. It always -creates new allocations after last one and doesn't reuse free regions after -allocations freed in the middle. It results in better allocation performance and -less memory consumed by metadata. 
- -![Linear allocation algorithm](../gfx/Linear_allocator_2_algo_linear.png) - -With this one flag, you can create a custom pool that can be used in many ways: -free-at-once, stack, double stack, and ring buffer. See below for details. - -\subsection linear_algorithm_free_at_once Free-at-once - -In a pool that uses linear algorithm, you still need to free all the allocations -individually, e.g. by using vmaFreeMemory() or vmaDestroyBuffer(). You can free -them in any order. New allocations are always made after last one - free space -in the middle is not reused. However, when you release all the allocation and -the pool becomes empty, allocation starts from the beginning again. This way you -can use linear algorithm to speed up creation of allocations that you are going -to release all at once. - -![Free-at-once](../gfx/Linear_allocator_3_free_at_once.png) - -This mode is also available for pools created with VmaPoolCreateInfo::maxBlockCount -value that allows multiple memory blocks. - -\subsection linear_algorithm_stack Stack - -When you free an allocation that was created last, its space can be reused. -Thanks to this, if you always release allocations in the order opposite to their -creation (LIFO - Last In First Out), you can achieve behavior of a stack. - -![Stack](../gfx/Linear_allocator_4_stack.png) - -This mode is also available for pools created with VmaPoolCreateInfo::maxBlockCount -value that allows multiple memory blocks. - -\subsection linear_algorithm_double_stack Double stack - -The space reserved by a custom pool with linear algorithm may be used by two -stacks: - -- First, default one, growing up from offset 0. -- Second, "upper" one, growing down from the end towards lower offsets. - -To make allocation from upper stack, add flag #VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT -to VmaAllocationCreateInfo::flags. 
- -![Double stack](../gfx/Linear_allocator_7_double_stack.png) - -Double stack is available only in pools with one memory block - -VmaPoolCreateInfo::maxBlockCount must be 1. Otherwise behavior is undefined. - -When the two stacks' ends meet so there is not enough space between them for a -new allocation, such allocation fails with usual -`VK_ERROR_OUT_OF_DEVICE_MEMORY` error. - -\subsection linear_algorithm_ring_buffer Ring buffer - -When you free some allocations from the beginning and there is not enough free space -for a new one at the end of a pool, allocator's "cursor" wraps around to the -beginning and starts allocation there. Thanks to this, if you always release -allocations in the same order as you created them (FIFO - First In First Out), -you can achieve behavior of a ring buffer / queue. - -![Ring buffer](../gfx/Linear_allocator_5_ring_buffer.png) - -Pools with linear algorithm support [lost allocations](@ref lost_allocations) when used as ring buffer. -If there is not enough free space for a new allocation, but existing allocations -from the front of the queue can become lost, they become lost and the allocation -succeeds. - -![Ring buffer with lost allocations](../gfx/Linear_allocator_6_ring_buffer_lost.png) - -Ring buffer is available only in pools with one memory block - -VmaPoolCreateInfo::maxBlockCount must be 1. Otherwise behavior is undefined. - -\section buddy_algorithm Buddy allocation algorithm - -There is another allocation algorithm that can be used with custom pools, called -"buddy". Its internal data structure is based on a tree of blocks, each having -size that is a power of two and a half of its parent's size. When you want to -allocate memory of certain size, a free node in the tree is located. If it's too -large, it is recursively split into two halves (called "buddies"). However, if -requested allocation size is not a power of two, the size of a tree node is -aligned up to the nearest power of two and the remaining space is wasted. 
When -two buddy nodes become free, they are merged back into one larger node. - -![Buddy allocator](../gfx/Buddy_allocator.png) - -The advantage of buddy allocation algorithm over default algorithm is faster -allocation and deallocation, as well as smaller external fragmentation. The -disadvantage is more wasted space (internal fragmentation). - -For more information, please read ["Buddy memory allocation" on Wikipedia](https://en.wikipedia.org/wiki/Buddy_memory_allocation) -or other sources that describe this concept in general. - -To use buddy allocation algorithm with a custom pool, add flag -#VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT to VmaPoolCreateInfo::flags while creating -#VmaPool object. - -Several limitations apply to pools that use buddy algorithm: - -- It is recommended to use VmaPoolCreateInfo::blockSize that is a power of two. - Otherwise, only largest power of two smaller than the size is used for - allocations. The remaining space always stays unused. -- [Margins](@ref debugging_memory_usage_margins) and - [corruption detection](@ref debugging_memory_usage_corruption_detection) - don't work in such pools. -- [Lost allocations](@ref lost_allocations) don't work in such pools. You can - use them, but they never become lost. Support may be added in the future. -- [Defragmentation](@ref defragmentation) doesn't work with allocations made from - such pool. - -\page defragmentation Defragmentation - -Interleaved allocations and deallocations of many objects of varying size can -cause fragmentation over time, which can lead to a situation where the library is unable -to find a continuous range of free memory for a new allocation despite there is -enough free space, just scattered across many small free ranges between existing -allocations. - -To mitigate this problem, you can use defragmentation feature: -structure #VmaDefragmentationInfo2, function vmaDefragmentationBegin(), vmaDefragmentationEnd(). 
-Given set of allocations, -this function can move them to compact used memory, ensure more continuous free -space and possibly also free some `VkDeviceMemory` blocks. - -What the defragmentation does is: - -- Updates #VmaAllocation objects to point to new `VkDeviceMemory` and offset. - After allocation has been moved, its VmaAllocationInfo::deviceMemory and/or - VmaAllocationInfo::offset changes. You must query them again using - vmaGetAllocationInfo() if you need them. -- Moves actual data in memory. - -What it doesn't do, so you need to do it yourself: - -- Recreate buffers and images that were bound to allocations that were defragmented and - bind them with their new places in memory. - You must use `vkDestroyBuffer()`, `vkDestroyImage()`, - `vkCreateBuffer()`, `vkCreateImage()`, vmaBindBufferMemory(), vmaBindImageMemory() - for that purpose and NOT vmaDestroyBuffer(), - vmaDestroyImage(), vmaCreateBuffer(), vmaCreateImage(), because you don't need to - destroy or create allocation objects! -- Recreate views and update descriptors that point to these buffers and images. - -\section defragmentation_cpu Defragmenting CPU memory - -Following example demonstrates how you can run defragmentation on CPU. -Only allocations created in memory types that are `HOST_VISIBLE` can be defragmented. -Others are ignored. - -The way it works is: - -- It temporarily maps entire memory blocks when necessary. -- It moves data using `memmove()` function. - -\code -// Given following variables already initialized: -VkDevice device; -VmaAllocator allocator; -std::vector buffers; -std::vector allocations; - - -const uint32_t allocCount = (uint32_t)allocations.size(); -std::vector allocationsChanged(allocCount); - -VmaDefragmentationInfo2 defragInfo = {}; -defragInfo.allocationCount = allocCount; -defragInfo.pAllocations = allocations.data(); -defragInfo.pAllocationsChanged = allocationsChanged.data(); -defragInfo.maxCpuBytesToMove = VK_WHOLE_SIZE; // No limit. 
-defragInfo.maxCpuAllocationsToMove = UINT32_MAX; // No limit. - -VmaDefragmentationContext defragCtx; -vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &defragCtx); -vmaDefragmentationEnd(allocator, defragCtx); - -for(uint32_t i = 0; i < allocCount; ++i) -{ - if(allocationsChanged[i]) - { - // Destroy buffer that is immutably bound to memory region which is no longer valid. - vkDestroyBuffer(device, buffers[i], nullptr); - - // Create new buffer with same parameters. - VkBufferCreateInfo bufferInfo = ...; - vkCreateBuffer(device, &bufferInfo, nullptr, &buffers[i]); - - // You can make dummy call to vkGetBufferMemoryRequirements here to silence validation layer warning. - - // Bind new buffer to new memory region. Data contained in it is already moved. - VmaAllocationInfo allocInfo; - vmaGetAllocationInfo(allocator, allocations[i], &allocInfo); - vmaBindBufferMemory(allocator, allocations[i], buffers[i]); - } -} -\endcode - -Setting VmaDefragmentationInfo2::pAllocationsChanged is optional. -This output array tells whether particular allocation in VmaDefragmentationInfo2::pAllocations at the same index -has been modified during defragmentation. -You can pass null, but you then need to query every allocation passed to defragmentation -for new parameters using vmaGetAllocationInfo() if you might need to recreate and rebind a buffer or image associated with it. - -If you use [Custom memory pools](@ref choosing_memory_type_custom_memory_pools), -you can fill VmaDefragmentationInfo2::poolCount and VmaDefragmentationInfo2::pPools -instead of VmaDefragmentationInfo2::allocationCount and VmaDefragmentationInfo2::pAllocations -to defragment all allocations in given pools. -You cannot use VmaDefragmentationInfo2::pAllocationsChanged in that case. -You can also combine both methods. - -\section defragmentation_gpu Defragmenting GPU memory - -It is also possible to defragment allocations created in memory types that are not `HOST_VISIBLE`. 
-To do that, you need to pass a command buffer that meets requirements as described in -VmaDefragmentationInfo2::commandBuffer. The way it works is: - -- It creates temporary buffers and binds them to entire memory blocks when necessary. -- It issues `vkCmdCopyBuffer()` to passed command buffer. - -Example: - -\code -// Given following variables already initialized: -VkDevice device; -VmaAllocator allocator; -VkCommandBuffer commandBuffer; -std::vector buffers; -std::vector allocations; - - -const uint32_t allocCount = (uint32_t)allocations.size(); -std::vector allocationsChanged(allocCount); - -VkCommandBufferBeginInfo cmdBufBeginInfo = ...; -vkBeginCommandBuffer(commandBuffer, &cmdBufBeginInfo); - -VmaDefragmentationInfo2 defragInfo = {}; -defragInfo.allocationCount = allocCount; -defragInfo.pAllocations = allocations.data(); -defragInfo.pAllocationsChanged = allocationsChanged.data(); -defragInfo.maxGpuBytesToMove = VK_WHOLE_SIZE; // Notice it's "GPU" this time. -defragInfo.maxGpuAllocationsToMove = UINT32_MAX; // Notice it's "GPU" this time. -defragInfo.commandBuffer = commandBuffer; - -VmaDefragmentationContext defragCtx; -vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &defragCtx); - -vkEndCommandBuffer(commandBuffer); - -// Submit commandBuffer. -// Wait for a fence that ensures commandBuffer execution finished. - -vmaDefragmentationEnd(allocator, defragCtx); - -for(uint32_t i = 0; i < allocCount; ++i) -{ - if(allocationsChanged[i]) - { - // Destroy buffer that is immutably bound to memory region which is no longer valid. - vkDestroyBuffer(device, buffers[i], nullptr); - - // Create new buffer with same parameters. - VkBufferCreateInfo bufferInfo = ...; - vkCreateBuffer(device, &bufferInfo, nullptr, &buffers[i]); - - // You can make dummy call to vkGetBufferMemoryRequirements here to silence validation layer warning. - - // Bind new buffer to new memory region. Data contained in it is already moved. 
- VmaAllocationInfo allocInfo; - vmaGetAllocationInfo(allocator, allocations[i], &allocInfo); - vmaBindBufferMemory(allocator, allocations[i], buffers[i]); - } -} -\endcode - -You can combine these two methods by specifying non-zero `maxGpu*` as well as `maxCpu*` parameters. -The library automatically chooses best method to defragment each memory pool. - -You may try not to block your entire program to wait until defragmentation finishes, -but do it in the background, as long as you carefully fulfill requirements described -in function vmaDefragmentationBegin(). - -\section defragmentation_additional_notes Additional notes - -It is only legal to defragment allocations bound to: - -- buffers -- images created with `VK_IMAGE_CREATE_ALIAS_BIT`, `VK_IMAGE_TILING_LINEAR`, and - being currently in `VK_IMAGE_LAYOUT_GENERAL` or `VK_IMAGE_LAYOUT_PREINITIALIZED`. - -Defragmentation of images created with `VK_IMAGE_TILING_OPTIMAL` or in any other -layout may give undefined results. - -If you defragment allocations bound to images, new images to be bound to new -memory region after defragmentation should be created with `VK_IMAGE_LAYOUT_PREINITIALIZED` -and then transitioned to their original layout from before defragmentation if -needed using an image memory barrier. - -While using defragmentation, you may experience validation layer warnings, which you just need to ignore. -See [Validation layer warnings](@ref general_considerations_validation_layer_warnings). - -Please don't expect memory to be fully compacted after defragmentation. -Algorithms inside are based on some heuristics that try to maximize number of Vulkan -memory blocks to make totally empty to release them, as well as to maximize continuous -empty space inside remaining blocks, while minimizing the number and size of allocations that -need to be moved. Some fragmentation may still remain - this is normal.
- -\section defragmentation_custom_algorithm Writing custom defragmentation algorithm - -If you want to implement your own, custom defragmentation algorithm, -there is infrastructure prepared for that, -but it is not exposed through the library API - you need to hack its source code. -Here are steps needed to do this: - --# Main thing you need to do is to define your own class derived from base abstract - class `VmaDefragmentationAlgorithm` and implement your version of its pure virtual methods. - See definition and comments of this class for details. --# Your code needs to interact with device memory block metadata. - If you need more access to its data than is provided by its public interface, - declare your new class as a friend class e.g. in class `VmaBlockMetadata_Generic`. --# If you want to create a flag that would enable your algorithm or pass some additional - flags to configure it, add them to `VmaDefragmentationFlagBits` and use them in - VmaDefragmentationInfo2::flags. --# Modify function `VmaBlockVectorDefragmentationContext::Begin` to create object - of your new class whenever needed. - - -\page lost_allocations Lost allocations - -If your game oversubscribes video memory, it may work OK in previous-generation -graphics APIs (DirectX 9, 10, 11, OpenGL) because resources are automatically -paged to system RAM. In Vulkan you can't do it because when you run out of -memory, an allocation just fails. If you have more data (e.g. textures) than can -fit into VRAM and you don't need it all at once, you may want to upload them to -GPU on demand and "push out" ones that are not used for a long time to make room -for the new ones, effectively using VRAM (or a certain memory pool) as a form of -cache. Vulkan Memory Allocator can help you with that by supporting a concept of -"lost allocations". - -To create an allocation that can become lost, include #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT -flag in VmaAllocationCreateInfo::flags.
Before using a buffer or image bound to -such allocation in every new frame, you need to query it if it's not lost. -To check it, call vmaTouchAllocation(). -If the allocation is lost, you should not use it or buffer/image bound to it. -You mustn't forget to destroy this allocation and this buffer/image. -vmaGetAllocationInfo() can also be used for checking status of the allocation. -Allocation is lost when returned VmaAllocationInfo::deviceMemory == `VK_NULL_HANDLE`. - -To create an allocation that can make some other allocations lost to make room -for it, use #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flag. You will -usually use both flags #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT and -#VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT at the same time. - -Warning! Current implementation uses quite naive, brute force algorithm, -which can make allocation calls that use #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT -flag quite slow. A new, more optimal algorithm and data structure to speed this -up is planned for the future. - -Q: When interleaving creation of new allocations with usage of existing ones, -how do you make sure that an allocation won't become lost while it's used in the -current frame? - -It is ensured because vmaTouchAllocation() / vmaGetAllocationInfo() not only returns allocation -status/parameters and checks whether it's not lost, but when it's not, it also -atomically marks it as used in the current frame, which makes it impossible to -become lost in that frame. It uses lockless algorithm, so it works fast and -doesn't involve locking any internal mutex. - -Q: What if my allocation may still be in use by the GPU when it's rendering a -previous frame while I already submit new frame on the CPU? 
- -You can make sure that allocations "touched" by vmaTouchAllocation() / vmaGetAllocationInfo() will not -become lost for a number of additional frames back from the current one by -specifying this number as VmaAllocatorCreateInfo::frameInUseCount (for default -memory pool) and VmaPoolCreateInfo::frameInUseCount (for custom pool). - -Q: How do you inform the library when new frame starts? - -You need to call function vmaSetCurrentFrameIndex(). - -Example code: - -\code -struct MyBuffer -{ - VkBuffer m_Buf = nullptr; - VmaAllocation m_Alloc = nullptr; - - // Called when the buffer is really needed in the current frame. - void EnsureBuffer(); -}; - -void MyBuffer::EnsureBuffer() -{ - // Buffer has been created. - if(m_Buf != VK_NULL_HANDLE) - { - // Check if its allocation is not lost + mark it as used in current frame. - if(vmaTouchAllocation(allocator, m_Alloc)) - { - // It's all OK - safe to use m_Buf. - return; - } - } - - // Buffer not yet exists or lost - destroy and recreate it. - - vmaDestroyBuffer(allocator, m_Buf, m_Alloc); - - VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; - bufCreateInfo.size = 1024; - bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - - VmaAllocationCreateInfo allocCreateInfo = {}; - allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; - allocCreateInfo.flags = VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT | - VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT; - - vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &m_Buf, &m_Alloc, nullptr); -} -\endcode - -When using lost allocations, you may see some Vulkan validation layer warnings -about overlapping regions of memory bound to different kinds of buffers and -images. This is still valid as long as you implement proper handling of lost -allocations (like in the example above) and don't use them. 
- -You can create an allocation that is already in lost state from the beginning using function -vmaCreateLostAllocation(). It may be useful if you need a "dummy" allocation that is not null. - -You can call function vmaMakePoolAllocationsLost() to set all eligible allocations -in a specified custom pool to lost state. -Allocations that have been "touched" in current frame or VmaPoolCreateInfo::frameInUseCount frames back -cannot become lost. - -Q: Can I touch allocation that cannot become lost? - -Yes, although it has no visible effect. -Calls to vmaGetAllocationInfo() and vmaTouchAllocation() update last use frame index -also for allocations that cannot become lost, but the only way to observe it is to dump -internal allocator state using vmaBuildStatsString(). -You can use this feature for debugging purposes to explicitly mark allocations that you use -in current frame and then analyze JSON dump to see for how long each allocation stays unused. - - -\page statistics Statistics - -This library contains functions that return information about its internal state, -especially the amount of memory allocated from Vulkan. -Please keep in mind that these functions need to traverse all internal data structures -to gather this information, so they may be quite time-consuming. -Don't call them too often. - -\section statistics_numeric_statistics Numeric statistics - -You can query for overall statistics of the allocator using function vmaCalculateStats(). -Information is returned using structure #VmaStats. -It contains #VmaStatInfo - number of allocated blocks, number of allocations -(occupied ranges in these blocks), number of unused (free) ranges in these blocks, -number of bytes used and unused (but still allocated from Vulkan) and other information. -They are summed across memory heaps, memory types and total for whole allocator. - -You can query for statistics of a custom pool using function vmaGetPoolStats(). -Information is returned using structure #VmaPoolStats.
- -You can query for information about specific allocation using function vmaGetAllocationInfo(). -It fills structure #VmaAllocationInfo. - -\section statistics_json_dump JSON dump - -You can dump internal state of the allocator to a string in JSON format using function vmaBuildStatsString(). -The result is guaranteed to be correct JSON. -It uses ANSI encoding. -Any strings provided by user (see [Allocation names](@ref allocation_names)) -are copied as-is and properly escaped for JSON, so if they use UTF-8, ISO-8859-2 or any other encoding, -this JSON string can be treated as using this encoding. -It must be freed using function vmaFreeStatsString(). - -The format of this JSON string is not part of official documentation of the library, -but it will not change in backward-incompatible way without increasing library major version number -and appropriate mention in changelog. - -The JSON string contains all the data that can be obtained using vmaCalculateStats(). -It can also contain detailed map of allocated memory blocks and their regions - -free and occupied by allocations. -This allows e.g. to visualize the memory or assess fragmentation. - - -\page allocation_annotation Allocation names and user data - -\section allocation_user_data Allocation user data - -You can annotate allocations with your own information, e.g. for debugging purposes. -To do that, fill VmaAllocationCreateInfo::pUserData field when creating -an allocation. It's an opaque `void*` pointer. You can use it e.g. as a pointer, -some handle, index, key, ordinal number or any other value that would associate -the allocation with your custom metadata. - -\code -VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; -// Fill bufferInfo...
- -MyBufferMetadata* pMetadata = CreateBufferMetadata(); - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; -allocCreateInfo.pUserData = pMetadata; - -VkBuffer buffer; -VmaAllocation allocation; -vmaCreateBuffer(allocator, &bufferInfo, &allocCreateInfo, &buffer, &allocation, nullptr); -\endcode - -The pointer may be later retrieved as VmaAllocationInfo::pUserData: - -\code -VmaAllocationInfo allocInfo; -vmaGetAllocationInfo(allocator, allocation, &allocInfo); -MyBufferMetadata* pMetadata = (MyBufferMetadata*)allocInfo.pUserData; -\endcode - -It can also be changed using function vmaSetAllocationUserData(). - -Values of (non-zero) allocations' `pUserData` are printed in JSON report created by -vmaBuildStatsString(), in hexadecimal form. - -\section allocation_names Allocation names - -There is alternative mode available where `pUserData` pointer is used to point to -a null-terminated string, giving a name to the allocation. To use this mode, -set #VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT flag in VmaAllocationCreateInfo::flags. -Then `pUserData` passed as VmaAllocationCreateInfo::pUserData or argument to -vmaSetAllocationUserData() must be either null or pointer to a null-terminated string. -The library creates internal copy of the string, so the pointer you pass doesn't need -to be valid for whole lifetime of the allocation. You can free it after the call. - -\code -VkImageCreateInfo imageInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; -// Fill imageInfo... 
- -std::string imageName = "Texture: "; -imageName += fileName; - -VmaAllocationCreateInfo allocCreateInfo = {}; -allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; -allocCreateInfo.flags = VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT; -allocCreateInfo.pUserData = imageName.c_str(); - -VkImage image; -VmaAllocation allocation; -vmaCreateImage(allocator, &imageInfo, &allocCreateInfo, &image, &allocation, nullptr); -\endcode - -The value of `pUserData` pointer of the allocation will be different than the one -you passed when setting allocation's name - pointing to a buffer managed -internally that holds copy of the string. - -\code -VmaAllocationInfo allocInfo; -vmaGetAllocationInfo(allocator, allocation, &allocInfo); -const char* imageName = (const char*)allocInfo.pUserData; -printf("Image name: %s\n", imageName); -\endcode - -That string is also printed in JSON report created by vmaBuildStatsString(). - -\note Passing string name to VMA allocation doesn't automatically set it to the Vulkan buffer or image created with it. -You must do it manually using an extension like VK_EXT_debug_utils, which is independent of this library. - - -\page debugging_memory_usage Debugging incorrect memory usage - -If you suspect a bug with memory usage, like usage of uninitialized memory or -memory being overwritten out of bounds of an allocation, -you can use debug features of this library to verify this. - -\section debugging_memory_usage_initialization Memory initialization - -If you experience a bug with incorrect and nondeterministic data in your program and you suspect uninitialized memory to be used, -you can enable automatic memory initialization to verify this. -To do it, define macro `VMA_DEBUG_INITIALIZE_ALLOCATIONS` to 1. - -\code -#define VMA_DEBUG_INITIALIZE_ALLOCATIONS 1 -#include -\endcode - -It makes memory of all new allocations initialized to bit pattern `0xDCDCDCDC`. -Before an allocation is destroyed, its memory is filled with bit pattern `0xEFEFEFEF`. 
-Memory is automatically mapped and unmapped if necessary. - -If you find these values while debugging your program, good chances are that you incorrectly -read Vulkan memory that is allocated but not initialized, or already freed, respectively. - -Memory initialization works only with memory types that are `HOST_VISIBLE`. -It works also with dedicated allocations. -It doesn't work with allocations created with #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag, -as they cannot be mapped. - -\section debugging_memory_usage_margins Margins - -By default, allocations are laid out in memory blocks next to each other if possible -(considering required alignment, `bufferImageGranularity`, and `nonCoherentAtomSize`). - -![Allocations without margin](../gfx/Margins_1.png) - -Define macro `VMA_DEBUG_MARGIN` to some non-zero value (e.g. 16) to enforce specified -number of bytes as a margin before and after every allocation. - -\code -#define VMA_DEBUG_MARGIN 16 -#include -\endcode - -![Allocations with margin](../gfx/Margins_2.png) - -If your bug goes away after enabling margins, it means it may be caused by memory -being overwritten outside of allocation boundaries. It is not 100% certain though. -Change in application behavior may also be caused by different order and distribution -of allocations across memory blocks after margins are applied. - -The margin is applied also before first and after last allocation in a block. -It may occur only once between two adjacent allocations. - -Margins work with all types of memory. - -Margin is applied only to allocations made out of memory blocks and not to dedicated -allocations, which have their own memory block of specific size. -It is thus not applied to allocations made using #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT flag -or those automatically decided to put into dedicated allocations, e.g. due to its -large size or recommended by VK_KHR_dedicated_allocation extension. 
-Margins are also not active in custom pools created with #VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT flag. - -Margins appear in [JSON dump](@ref statistics_json_dump) as part of free space. - -Note that enabling margins increases memory usage and fragmentation. - -\section debugging_memory_usage_corruption_detection Corruption detection - -You can additionally define macro `VMA_DEBUG_DETECT_CORRUPTION` to 1 to enable validation -of contents of the margins. - -\code -#define VMA_DEBUG_MARGIN 16 -#define VMA_DEBUG_DETECT_CORRUPTION 1 -#include -\endcode - -When this feature is enabled, number of bytes specified as `VMA_DEBUG_MARGIN` -(it must be multiple of 4) before and after every allocation is filled with a magic number. -This idea is also known as "canary". -Memory is automatically mapped and unmapped if necessary. - -This number is validated automatically when the allocation is destroyed. -If it's not equal to the expected value, `VMA_ASSERT()` is executed. -It clearly means that either CPU or GPU has overwritten the memory outside of boundaries of the allocation, -which indicates a serious bug. - -You can also explicitly request checking margins of all allocations in all memory blocks -that belong to specified memory types by using function vmaCheckCorruption(), -or in memory blocks that belong to specified custom pool, by using function -vmaCheckPoolCorruption(). - -Margin validation (corruption detection) works only for memory types that are -`HOST_VISIBLE` and `HOST_COHERENT`. - - -\page record_and_replay Record and replay - -\section record_and_replay_introduction Introduction - -While using the library, sequence of calls to its functions together with their -parameters can be recorded to a file and later replayed using standalone player -application. It can be useful to: - -- Test correctness - check if same sequence of calls will not cause crash or - failures on a target platform.
-- Gather statistics - see number of allocations, peak memory usage, number of - calls etc. -- Benchmark performance - see how much time it takes to replay the whole - sequence. - -\section record_and_replay_usage Usage - -Recording functionality is disabled by default. -To enable it, define following macro before every include of this library: - -\code -#define VMA_RECORDING_ENABLED 1 -\endcode - -To record sequence of calls to a file: Fill in -VmaAllocatorCreateInfo::pRecordSettings member while creating #VmaAllocator -object. File is opened and written during whole lifetime of the allocator. - -To replay file: Use VmaReplay - standalone command-line program. -Precompiled binary can be found in "bin" directory. -Its source can be found in "src/VmaReplay" directory. -Its project is generated by Premake. -Command line syntax is printed when the program is launched without parameters. -Basic usage: - - VmaReplay.exe MyRecording.csv - -Documentation of file format can be found in file: "docs/Recording file format.md". -It's a human-readable, text file in CSV format (Comma Separated Values). - -\section record_and_replay_additional_considerations Additional considerations - -- Replaying file that was recorded on a different GPU (with different parameters - like `bufferImageGranularity`, `nonCoherentAtomSize`, and especially different - set of memory heaps and types) may give different performance and memory usage - results, as well as issue some warnings and errors. -- Current implementation of recording in VMA, as well as VmaReplay application, is - coded and tested only on Windows. Inclusion of recording code is driven by - `VMA_RECORDING_ENABLED` macro. Support for other platforms should be easy to - add. Contributions are welcomed. - - -\page usage_patterns Recommended usage patterns - -See also slides from talk: -[Sawicki, Adam. Advanced Graphics Techniques Tutorial: Memory management in Vulkan and DX12. 
Game Developers Conference, 2018](https://www.gdcvault.com/play/1025458/Advanced-Graphics-Techniques-Tutorial-New) - - -\section usage_patterns_common_mistakes Common mistakes - -Use of CPU_TO_GPU instead of CPU_ONLY memory - -#VMA_MEMORY_USAGE_CPU_TO_GPU is recommended only for resources that will be -mapped and written by the CPU, as well as read directly by the GPU - like some -buffers or textures updated every frame (dynamic). If you create a staging copy -of a resource to be written by CPU and then used as a source of transfer to -another resource placed in the GPU memory, that staging resource should be -created with #VMA_MEMORY_USAGE_CPU_ONLY. Please read the descriptions of these -enums carefully for details. - -Unnecessary use of custom pools - -\ref custom_memory_pools may be useful for special purposes - when you want to -keep certain type of resources separate e.g. to reserve minimum amount of memory -for them, limit maximum amount of memory they can occupy, or make some of them -push out the other through the mechanism of \ref lost_allocations. For most -resources this is not needed and so it is not recommended to create #VmaPool -objects and allocations out of them. Allocating from the default pool is sufficient. - -\section usage_patterns_simple Simple patterns - -\subsection usage_patterns_simple_render_targets Render targets - -When: -Any resources that you frequently write and read on GPU, -e.g. images used as color attachments (aka "render targets"), depth-stencil attachments, -images/buffers used as storage image/buffer (aka "Unordered Access View (UAV)"). - -What to do: -Create them in video memory that is fastest to access from GPU using -#VMA_MEMORY_USAGE_GPU_ONLY. - -Consider using [VK_KHR_dedicated_allocation](@ref vk_khr_dedicated_allocation) extension -and/or manually creating them as dedicated allocations using #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT, -especially if they are large or if you plan to destroy and recreate them e.g. 
when -display resolution changes. -Prefer to create such resources first and all other GPU resources (like textures and vertex buffers) later. - -\subsection usage_patterns_simple_immutable_resources Immutable resources - -When: -Any resources that you fill on CPU only once (aka "immutable") or infrequently -and then read frequently on GPU, -e.g. textures, vertex and index buffers, constant buffers that don't change often. - -What to do: -Create them in video memory that is fastest to access from GPU using -#VMA_MEMORY_USAGE_GPU_ONLY. - -To initialize content of such resource, create a CPU-side (aka "staging") copy of it -in system memory - #VMA_MEMORY_USAGE_CPU_ONLY, map it, fill it, -and submit a transfer from it to the GPU resource. -You can keep the staging copy if you need it for another upload transfer in the future. -If you don't, you can destroy it or reuse this buffer for uploading different resource -after the transfer finishes. - -Prefer to create just buffers in system memory rather than images, even for uploading textures. -Use `vkCmdCopyBufferToImage()`. -Don't use images with `VK_IMAGE_TILING_LINEAR`. - -\subsection usage_patterns_dynamic_resources Dynamic resources - -When: -Any resources that change frequently (aka "dynamic"), e.g. every frame or every draw call, -written on CPU, read on GPU. - -What to do: -Create them using #VMA_MEMORY_USAGE_CPU_TO_GPU. -You can map it and write to it directly on CPU, as well as read from it on GPU. - -This is a more complex situation. Different solutions are possible, -and the best one depends on specific GPU type, but you can use this simple approach for the start. -Prefer to write to such resource sequentially (e.g. using `memcpy`). -Don't perform random access or any reads from it on CPU, as it may be very slow. -Also note that textures written directly from the host through a mapped pointer need to be in LINEAR not OPTIMAL layout.
- -\subsection usage_patterns_readback Readback - -When: -Resources that contain data written by GPU that you want to read back on CPU, -e.g. results of some computations. - -What to do: -Create them using #VMA_MEMORY_USAGE_GPU_TO_CPU. -You can write to them directly on GPU, as well as map and read them on CPU. - -\section usage_patterns_advanced Advanced patterns - -\subsection usage_patterns_integrated_graphics Detecting integrated graphics - -You can support integrated graphics (like Intel HD Graphics, AMD APU) better -by detecting it in Vulkan. -To do it, call `vkGetPhysicalDeviceProperties()`, inspect -`VkPhysicalDeviceProperties::deviceType` and look for `VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU`. -When you find it, you can assume that memory is unified and all memory types are comparably fast -to access from GPU, regardless of `VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT`. - -You can then sum up sizes of all available memory heaps and treat them as useful for -your GPU resources, instead of only `DEVICE_LOCAL` ones. -You can also prefer to create your resources in memory types that are `HOST_VISIBLE` to map them -directly instead of submitting explicit transfer (see below). - -\subsection usage_patterns_direct_vs_transfer Direct access versus transfer - -For resources that you frequently write on CPU and read on GPU, many solutions are possible: - --# Create one copy in video memory using #VMA_MEMORY_USAGE_GPU_ONLY, - second copy in system memory using #VMA_MEMORY_USAGE_CPU_ONLY and submit explicit transfer each time. --# Create just a single copy using #VMA_MEMORY_USAGE_CPU_TO_GPU, map it and fill it on CPU, - read it directly on GPU. --# Create just a single copy using #VMA_MEMORY_USAGE_CPU_ONLY, map it and fill it on CPU, - read it directly on GPU. - -Which solution is the most efficient depends on your resource and especially on the GPU. -It is best to measure it and then make the decision. 
-Some general recommendations: - -- On integrated graphics use (2) or (3) to avoid unnecessary time and memory overhead - related to using a second copy and making transfer. -- For small resources (e.g. constant buffers) use (2). - Discrete AMD cards have special 256 MiB pool of video memory that is directly mappable. - Even if the resource ends up in system memory, its data may be cached on GPU after first - fetch over PCIe bus. -- For larger resources (e.g. textures), decide between (1) and (2). - You may want to differentiate NVIDIA and AMD, e.g. by looking for memory type that is - both `DEVICE_LOCAL` and `HOST_VISIBLE`. When you find it, use (2), otherwise use (1). - -Similarly, for resources that you frequently write on GPU and read on CPU, multiple -solutions are possible: - --# Create one copy in video memory using #VMA_MEMORY_USAGE_GPU_ONLY, - second copy in system memory using #VMA_MEMORY_USAGE_GPU_TO_CPU and submit explicit transfer each time. --# Create just single copy using #VMA_MEMORY_USAGE_GPU_TO_CPU, write to it directly on GPU, - map it and read it on CPU. - -You should take some measurements to decide which option is faster in case of your specific -resource. - -Note that textures accessed directly from the host through a mapped pointer need to be in LINEAR layout, -which may slow down their usage on the device. -Textures accessed only by the device and transfer operations can use OPTIMAL layout. - -If you don't want to specialize your code for specific types of GPUs, you can still make -a simple optimization for cases when your resource ends up in mappable memory to use it -directly in this case instead of creating CPU-side staging copy. -For details see [Finding out if memory is mappable](@ref memory_mapping_finding_if_memory_mappable).
- - -\page configuration Configuration - -Please check "CONFIGURATION SECTION" in the code to find macros that you can define -before each include of this file or change directly in this file to provide -your own implementation of basic facilities like assert, `min()` and `max()` functions, -mutex, atomic etc. -The library uses its own implementation of containers by default, but you can switch to using -STL containers instead. - -For example, define `VMA_ASSERT(expr)` before including the library to provide -custom implementation of the assertion, compatible with your project. -By default it is defined to standard C `assert(expr)` in `_DEBUG` configuration -and empty otherwise. - -\section config_Vulkan_functions Pointers to Vulkan functions - -There are multiple ways to import pointers to Vulkan functions in the library. -In the simplest case you don't need to do anything. -If the compilation or linking of your program or the initialization of the #VmaAllocator -doesn't work for you, you can try to reconfigure it. - -First, the allocator tries to fetch pointers to Vulkan functions linked statically, -like this: - -\code -m_VulkanFunctions.vkAllocateMemory = (PFN_vkAllocateMemory)vkAllocateMemory; -\endcode - -If you want to disable this feature, set configuration macro: `#define VMA_STATIC_VULKAN_FUNCTIONS 0`. - -Second, you can provide the pointers yourself by setting member VmaAllocatorCreateInfo::pVulkanFunctions. -You can fetch them e.g. using functions `vkGetInstanceProcAddr` and `vkGetDeviceProcAddr` or -by using a helper library like [volk](https://github.com/zeux/volk). - -Third, VMA tries to fetch remaining pointers that are still null by calling -`vkGetInstanceProcAddr` and `vkGetDeviceProcAddr` on its own. -If you want to disable this feature, set configuration macro: `#define VMA_DYNAMIC_VULKAN_FUNCTIONS 0`. 
- -Finally, all the function pointers required by the library (considering selected -Vulkan version and enabled extensions) are checked with `VMA_ASSERT` if they are not null. - - -\section custom_memory_allocator Custom host memory allocator - -If you use custom allocator for CPU memory rather than default operator `new` -and `delete` from C++, you can make this library using your allocator as well -by filling optional member VmaAllocatorCreateInfo::pAllocationCallbacks. These -functions will be passed to Vulkan, as well as used by the library itself to -make any CPU-side allocations. - -\section allocation_callbacks Device memory allocation callbacks - -The library makes calls to `vkAllocateMemory()` and `vkFreeMemory()` internally. -You can setup callbacks to be informed about these calls, e.g. for the purpose -of gathering some statistics. To do it, fill optional member -VmaAllocatorCreateInfo::pDeviceMemoryCallbacks. - -\section heap_memory_limit Device heap memory limit - -When device memory of certain heap runs out of free space, new allocations may -fail (returning error code) or they may succeed, silently pushing some existing -memory blocks from GPU VRAM to system RAM (which degrades performance). This -behavior is implementation-dependent - it depends on GPU vendor and graphics -driver. - -On AMD cards it can be controlled while creating Vulkan device object by using -VK_AMD_memory_overallocation_behavior extension, if available. - -Alternatively, if you want to test how your program behaves with limited amount of Vulkan device -memory available without switching your graphics card to one that really has -smaller VRAM, you can use a feature of this library intended for this purpose. -To do it, fill optional member VmaAllocatorCreateInfo::pHeapSizeLimit. - - - -\page vk_khr_dedicated_allocation VK_KHR_dedicated_allocation - -VK_KHR_dedicated_allocation is a Vulkan extension which can be used to improve -performance on some GPUs. 
It augments Vulkan API with possibility to query -driver whether it prefers particular buffer or image to have its own, dedicated -allocation (separate `VkDeviceMemory` block) for better efficiency - to be able -to do some internal optimizations. - -The extension is supported by this library. It will be used automatically when -enabled. To enable it: - -1 . When creating Vulkan device, check if following 2 device extensions are -supported (call `vkEnumerateDeviceExtensionProperties()`). -If yes, enable them (fill `VkDeviceCreateInfo::ppEnabledExtensionNames`). - -- VK_KHR_get_memory_requirements2 -- VK_KHR_dedicated_allocation - -If you enabled these extensions: - -2 . Use #VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT flag when creating -your #VmaAllocator`to inform the library that you enabled required extensions -and you want the library to use them. - -\code -allocatorInfo.flags |= VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT; - -vmaCreateAllocator(&allocatorInfo, &allocator); -\endcode - -That's all. The extension will be automatically used whenever you create a -buffer using vmaCreateBuffer() or image using vmaCreateImage(). - -When using the extension together with Vulkan Validation Layer, you will receive -warnings like this: - - vkBindBufferMemory(): Binding memory to buffer 0x33 but vkGetBufferMemoryRequirements() has not been called on that buffer. - -It is OK, you should just ignore it. It happens because you use function -`vkGetBufferMemoryRequirements2KHR()` instead of standard -`vkGetBufferMemoryRequirements()`, while the validation layer seems to be -unaware of it. 
- -To learn more about this extension, see: - -- [VK_KHR_dedicated_allocation in Vulkan specification](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap44.html#VK_KHR_dedicated_allocation) -- [VK_KHR_dedicated_allocation unofficial manual](http://asawicki.info/articles/VK_KHR_dedicated_allocation.php5) - - - -\page vk_amd_device_coherent_memory VK_AMD_device_coherent_memory - -VK_AMD_device_coherent_memory is a device extension that enables access to -additional memory types with `VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD` and -`VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD` flag. It is useful mostly for -allocation of buffers intended for writing "breadcrumb markers" in between passes -or draw calls, which in turn are useful for debugging GPU crash/hang/TDR cases. - -When the extension is available but has not been enabled, Vulkan physical device -still exposes those memory types, but their usage is forbidden. VMA automatically -takes care of that - it returns `VK_ERROR_FEATURE_NOT_PRESENT` when an attempt -to allocate memory of such type is made. - -If you want to use this extension in connection with VMA, follow these steps: - -\section vk_amd_device_coherent_memory_initialization Initialization - -1) Call `vkEnumerateDeviceExtensionProperties` for the physical device. -Check if the extension is supported - if returned array of `VkExtensionProperties` contains "VK_AMD_device_coherent_memory". - -2) Call `vkGetPhysicalDeviceFeatures2` for the physical device instead of old `vkGetPhysicalDeviceFeatures`. -Attach additional structure `VkPhysicalDeviceCoherentMemoryFeaturesAMD` to `VkPhysicalDeviceFeatures2::pNext` to be returned. -Check if the device feature is really supported - check if `VkPhysicalDeviceCoherentMemoryFeaturesAMD::deviceCoherentMemory` is true. - -3) While creating device with `vkCreateDevice`, enable this extension - add "VK_AMD_device_coherent_memory" -to the list passed as `VkDeviceCreateInfo::ppEnabledExtensionNames`. 
- -4) While creating the device, also don't set `VkDeviceCreateInfo::pEnabledFeatures`. -Fill in `VkPhysicalDeviceFeatures2` structure instead and pass it as `VkDeviceCreateInfo::pNext`. -Enable this device feature - attach additional structure `VkPhysicalDeviceCoherentMemoryFeaturesAMD` to -`VkPhysicalDeviceFeatures2::pNext` and set its member `deviceCoherentMemory` to `VK_TRUE`. - -5) While creating #VmaAllocator with vmaCreateAllocator() inform VMA that you -have enabled this extension and feature - add #VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT -to VmaAllocatorCreateInfo::flags. - -\section vk_amd_device_coherent_memory_usage Usage - -After following steps described above, you can create VMA allocations and custom pools -out of the special `DEVICE_COHERENT` and `DEVICE_UNCACHED` memory types on eligible -devices. There are multiple ways to do it, for example: - -- You can request or prefer to allocate out of such memory types by adding - `VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD` to VmaAllocationCreateInfo::requiredFlags - or VmaAllocationCreateInfo::preferredFlags. Those flags can be freely mixed with - other ways of \ref choosing_memory_type, like setting VmaAllocationCreateInfo::usage. -- If you manually found memory type index to use for this purpose, force allocation - from this specific index by setting VmaAllocationCreateInfo::memoryTypeBits `= 1u << index`. - -\section vk_amd_device_coherent_memory_more_information More information - -To learn more about this extension, see [VK_AMD_device_coherent_memory in Vulkan specification](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap44.html#VK_AMD_device_coherent_memory) - -Example use of this extension can be found in the code of the sample and test suite -accompanying this library. 
- - -\page enabling_buffer_device_address Enabling buffer device address - -Device extension VK_KHR_buffer_device_address -allow to fetch raw GPU pointer to a buffer and pass it for usage in a shader code. -It is promoted to core Vulkan 1.2. - -If you want to use this feature in connection with VMA, follow these steps: - -\section enabling_buffer_device_address_initialization Initialization - -1) (For Vulkan version < 1.2) Call `vkEnumerateDeviceExtensionProperties` for the physical device. -Check if the extension is supported - if returned array of `VkExtensionProperties` contains -"VK_KHR_buffer_device_address". - -2) Call `vkGetPhysicalDeviceFeatures2` for the physical device instead of old `vkGetPhysicalDeviceFeatures`. -Attach additional structure `VkPhysicalDeviceBufferDeviceAddressFeatures*` to `VkPhysicalDeviceFeatures2::pNext` to be returned. -Check if the device feature is really supported - check if `VkPhysicalDeviceBufferDeviceAddressFeatures*::bufferDeviceAddress` is true. - -3) (For Vulkan version < 1.2) While creating device with `vkCreateDevice`, enable this extension - add -"VK_KHR_buffer_device_address" to the list passed as `VkDeviceCreateInfo::ppEnabledExtensionNames`. - -4) While creating the device, also don't set `VkDeviceCreateInfo::pEnabledFeatures`. -Fill in `VkPhysicalDeviceFeatures2` structure instead and pass it as `VkDeviceCreateInfo::pNext`. -Enable this device feature - attach additional structure `VkPhysicalDeviceBufferDeviceAddressFeatures*` to -`VkPhysicalDeviceFeatures2::pNext` and set its member `bufferDeviceAddress` to `VK_TRUE`. - -5) While creating #VmaAllocator with vmaCreateAllocator() inform VMA that you -have enabled this feature - add #VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT -to VmaAllocatorCreateInfo::flags. - -\section enabling_buffer_device_address_usage Usage - -After following steps described above, you can create buffers with `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT*` using VMA. 
-The library automatically adds `VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT*` to -allocated memory blocks wherever it might be needed. - -Please note that the library supports only `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT*`. -The second part of this functionality related to "capture and replay" is not supported, -as it is intended for usage in debugging tools like RenderDoc, not in everyday Vulkan usage. - -\section enabling_buffer_device_address_more_information More information - -To learn more about this extension, see [VK_KHR_buffer_device_address in Vulkan specification](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap46.html#VK_KHR_buffer_device_address) - -Example use of this extension can be found in the code of the sample and test suite -accompanying this library. - -\page general_considerations General considerations - -\section general_considerations_thread_safety Thread safety - -- The library has no global state, so separate #VmaAllocator objects can be used - independently. - There should be no need to create multiple such objects though - one per `VkDevice` is enough. -- By default, all calls to functions that take #VmaAllocator as first parameter - are safe to call from multiple threads simultaneously because they are - synchronized internally when needed. -- When the allocator is created with #VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT - flag, calls to functions that take such #VmaAllocator object must be - synchronized externally. -- Access to a #VmaAllocation object must be externally synchronized. For example, - you must not call vmaGetAllocationInfo() and vmaMapMemory() from different - threads at the same time if you pass the same #VmaAllocation object to these - functions. - -\section general_considerations_validation_layer_warnings Validation layer warnings - -When using this library, you can meet following types of warnings issued by -Vulkan validation layer. 
They don't necessarily indicate a bug, so you may need -to just ignore them. - -- *vkBindBufferMemory(): Binding memory to buffer 0xeb8e4 but vkGetBufferMemoryRequirements() has not been called on that buffer.* - - It happens when VK_KHR_dedicated_allocation extension is enabled. - `vkGetBufferMemoryRequirements2KHR` function is used instead, while validation layer seems to be unaware of it. -- *Mapping an image with layout VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL can result in undefined behavior if this memory is used by the device. Only GENERAL or PREINITIALIZED should be used.* - - It happens when you map a buffer or image, because the library maps entire - `VkDeviceMemory` block, where different types of images and buffers may end - up together, especially on GPUs with unified memory like Intel. -- *Non-linear image 0xebc91 is aliased with linear buffer 0xeb8e4 which may indicate a bug.* - - It happens when you use lost allocations, and a new image or buffer is - created in place of an existing object that bacame lost. - - It may happen also when you use [defragmentation](@ref defragmentation). - -\section general_considerations_allocation_algorithm Allocation algorithm - -The library uses following algorithm for allocation, in order: - --# Try to find free range of memory in existing blocks. --# If failed, try to create a new block of `VkDeviceMemory`, with preferred block size. --# If failed, try to create such block with size/2, size/4, size/8. --# If failed and #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flag was - specified, try to find space in existing blocks, possilby making some other - allocations lost. --# If failed, try to allocate separate `VkDeviceMemory` for this allocation, - just like when you use #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. --# If failed, choose other memory type that meets the requirements specified in - VmaAllocationCreateInfo and go to point 1. --# If failed, return `VK_ERROR_OUT_OF_DEVICE_MEMORY`. 
- -\section general_considerations_features_not_supported Features not supported - -Features deliberately excluded from the scope of this library: - -- Data transfer. Uploading (straming) and downloading data of buffers and images - between CPU and GPU memory and related synchronization is responsibility of the user. - Defining some "texture" object that would automatically stream its data from a - staging copy in CPU memory to GPU memory would rather be a feature of another, - higher-level library implemented on top of VMA. -- Allocations for imported/exported external memory. They tend to require - explicit memory type index and dedicated allocation anyway, so they don't - interact with main features of this library. Such special purpose allocations - should be made manually, using `vkCreateBuffer()` and `vkAllocateMemory()`. -- Sub-allocation of parts of one large buffer. Although recommended as a good practice, - it is the user's responsibility to implement such logic on top of VMA. -- Recreation of buffers and images. Although the library has functions for - buffer and image creation (vmaCreateBuffer(), vmaCreateImage()), you need to - recreate these objects yourself after defragmentation. That's because the big - structures `VkBufferCreateInfo`, `VkImageCreateInfo` are not stored in - #VmaAllocation object. -- Handling CPU memory allocation failures. When dynamically creating small C++ - objects in CPU memory (not Vulkan memory), allocation failures are not checked - and handled gracefully, because that would complicate code significantly and - is usually not needed in desktop PC applications anyway. - Success of an allocation is just checked with an assert. -- Code free of any compiler warnings. Maintaining the library to compile and - work correctly on so many different platforms is hard enough. Being free of - any warnings, on any version of any compiler, is simply not feasible. -- This is a C++ library with C interface. 
- Bindings or ports to any other programming languages are welcomed as external projects and - are not going to be included into this repository. - -*/ - -#ifdef __cplusplus -extern "C" { -#endif - -/* -Define this macro to 0/1 to disable/enable support for recording functionality, -available through VmaAllocatorCreateInfo::pRecordSettings. -*/ -#ifndef VMA_RECORDING_ENABLED - #define VMA_RECORDING_ENABLED 0 -#endif - -#if !defined(NOMINMAX) && defined(VMA_IMPLEMENTATION) - #define NOMINMAX // For windows.h -#endif - -#if defined(__ANDROID__) && defined(VK_NO_PROTOTYPES) && VMA_STATIC_VULKAN_FUNCTIONS - extern PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr; - extern PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr; - extern PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties; - extern PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties; - extern PFN_vkAllocateMemory vkAllocateMemory; - extern PFN_vkFreeMemory vkFreeMemory; - extern PFN_vkMapMemory vkMapMemory; - extern PFN_vkUnmapMemory vkUnmapMemory; - extern PFN_vkFlushMappedMemoryRanges vkFlushMappedMemoryRanges; - extern PFN_vkInvalidateMappedMemoryRanges vkInvalidateMappedMemoryRanges; - extern PFN_vkBindBufferMemory vkBindBufferMemory; - extern PFN_vkBindImageMemory vkBindImageMemory; - extern PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements; - extern PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements; - extern PFN_vkCreateBuffer vkCreateBuffer; - extern PFN_vkDestroyBuffer vkDestroyBuffer; - extern PFN_vkCreateImage vkCreateImage; - extern PFN_vkDestroyImage vkDestroyImage; - extern PFN_vkCmdCopyBuffer vkCmdCopyBuffer; - #if VMA_VULKAN_VERSION >= 1001000 - extern PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2; - extern PFN_vkGetImageMemoryRequirements2 vkGetImageMemoryRequirements2; - extern PFN_vkBindBufferMemory2 vkBindBufferMemory2; - extern PFN_vkBindImageMemory2 vkBindImageMemory2; - extern 
PFN_vkGetPhysicalDeviceMemoryProperties2 vkGetPhysicalDeviceMemoryProperties2; - #endif // #if VMA_VULKAN_VERSION >= 1001000 -#endif // #if defined(__ANDROID__) && VMA_STATIC_VULKAN_FUNCTIONS && VK_NO_PROTOTYPES - -#ifndef VULKAN_H_ - #include -#endif - -// Define this macro to declare maximum supported Vulkan version in format AAABBBCCC, -// where AAA = major, BBB = minor, CCC = patch. -// If you want to use version > 1.0, it still needs to be enabled via VmaAllocatorCreateInfo::vulkanApiVersion. -#if !defined(VMA_VULKAN_VERSION) - #if defined(VK_VERSION_1_2) - #define VMA_VULKAN_VERSION 1002000 - #elif defined(VK_VERSION_1_1) - #define VMA_VULKAN_VERSION 1001000 - #else - #define VMA_VULKAN_VERSION 1000000 - #endif -#endif - -#if !defined(VMA_DEDICATED_ALLOCATION) - #if VK_KHR_get_memory_requirements2 && VK_KHR_dedicated_allocation - #define VMA_DEDICATED_ALLOCATION 1 - #else - #define VMA_DEDICATED_ALLOCATION 0 - #endif -#endif - -#if !defined(VMA_BIND_MEMORY2) - #if VK_KHR_bind_memory2 - #define VMA_BIND_MEMORY2 1 - #else - #define VMA_BIND_MEMORY2 0 - #endif -#endif - -#if !defined(VMA_MEMORY_BUDGET) - #if VK_EXT_memory_budget && (VK_KHR_get_physical_device_properties2 || VMA_VULKAN_VERSION >= 1001000) - #define VMA_MEMORY_BUDGET 1 - #else - #define VMA_MEMORY_BUDGET 0 - #endif -#endif - -// Defined to 1 when VK_KHR_buffer_device_address device extension or equivalent core Vulkan 1.2 feature is defined in its headers. -#if !defined(VMA_BUFFER_DEVICE_ADDRESS) - #if VK_KHR_buffer_device_address || VMA_VULKAN_VERSION >= 1002000 - #define VMA_BUFFER_DEVICE_ADDRESS 1 - #else - #define VMA_BUFFER_DEVICE_ADDRESS 0 - #endif -#endif - -// Defined to 1 when VK_EXT_memory_priority device extension is defined in Vulkan headers. 
-#if !defined(VMA_MEMORY_PRIORITY) - #if VK_EXT_memory_priority - #define VMA_MEMORY_PRIORITY 1 - #else - #define VMA_MEMORY_PRIORITY 0 - #endif -#endif - -// Define these macros to decorate all public functions with additional code, -// before and after returned type, appropriately. This may be useful for -// exporting the functions when compiling VMA as a separate library. Example: -// #define VMA_CALL_PRE __declspec(dllexport) -// #define VMA_CALL_POST __cdecl -#ifndef VMA_CALL_PRE - #define VMA_CALL_PRE -#endif -#ifndef VMA_CALL_POST - #define VMA_CALL_POST -#endif - -// Define this macro to decorate pointers with an attribute specifying the -// length of the array they point to if they are not null. -// -// The length may be one of -// - The name of another parameter in the argument list where the pointer is declared -// - The name of another member in the struct where the pointer is declared -// - The name of a member of a struct type, meaning the value of that member in -// the context of the call. For example -// VMA_LEN_IF_NOT_NULL("VkPhysicalDeviceMemoryProperties::memoryHeapCount"), -// this means the number of memory heaps available in the device associated -// with the VmaAllocator being dealt with. -#ifndef VMA_LEN_IF_NOT_NULL - #define VMA_LEN_IF_NOT_NULL(len) -#endif - -// The VMA_NULLABLE macro is defined to be _Nullable when compiling with Clang. -// see: https://clang.llvm.org/docs/AttributeReference.html#nullable -#ifndef VMA_NULLABLE - #ifdef __clang__ - #define VMA_NULLABLE _Nullable - #else - #define VMA_NULLABLE - #endif -#endif - -// The VMA_NOT_NULL macro is defined to be _Nonnull when compiling with Clang. 
-// see: https://clang.llvm.org/docs/AttributeReference.html#nonnull -#ifndef VMA_NOT_NULL - #ifdef __clang__ - #define VMA_NOT_NULL _Nonnull - #else - #define VMA_NOT_NULL - #endif -#endif - -// If non-dispatchable handles are represented as pointers then we can give -// then nullability annotations -#ifndef VMA_NOT_NULL_NON_DISPATCHABLE - #if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__) ) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__) - #define VMA_NOT_NULL_NON_DISPATCHABLE VMA_NOT_NULL - #else - #define VMA_NOT_NULL_NON_DISPATCHABLE - #endif -#endif - -#ifndef VMA_NULLABLE_NON_DISPATCHABLE - #if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__) ) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__) - #define VMA_NULLABLE_NON_DISPATCHABLE VMA_NULLABLE - #else - #define VMA_NULLABLE_NON_DISPATCHABLE - #endif -#endif - -/** \struct VmaAllocator -\brief Represents main object of this library initialized. - -Fill structure #VmaAllocatorCreateInfo and call function vmaCreateAllocator() to create it. -Call function vmaDestroyAllocator() to destroy it. - -It is recommended to create just one object of this type per `VkDevice` object, -right after Vulkan is initialized and keep it alive until before Vulkan device is destroyed. -*/ -VK_DEFINE_HANDLE(VmaAllocator) - -/// Callback function called after successful vkAllocateMemory. -typedef void (VKAPI_PTR *PFN_vmaAllocateDeviceMemoryFunction)( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t memoryType, - VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory, - VkDeviceSize size, - void* VMA_NULLABLE pUserData); -/// Callback function called before vkFreeMemory. 
-typedef void (VKAPI_PTR *PFN_vmaFreeDeviceMemoryFunction)( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t memoryType, - VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory, - VkDeviceSize size, - void* VMA_NULLABLE pUserData); - -/** \brief Set of callbacks that the library will call for `vkAllocateMemory` and `vkFreeMemory`. - -Provided for informative purpose, e.g. to gather statistics about number of -allocations or total amount of memory allocated in Vulkan. - -Used in VmaAllocatorCreateInfo::pDeviceMemoryCallbacks. -*/ -typedef struct VmaDeviceMemoryCallbacks { - /// Optional, can be null. - PFN_vmaAllocateDeviceMemoryFunction VMA_NULLABLE pfnAllocate; - /// Optional, can be null. - PFN_vmaFreeDeviceMemoryFunction VMA_NULLABLE pfnFree; - /// Optional, can be null. - void* VMA_NULLABLE pUserData; -} VmaDeviceMemoryCallbacks; - -/// Flags for created #VmaAllocator. -typedef enum VmaAllocatorCreateFlagBits { - /** \brief Allocator and all objects created from it will not be synchronized internally, so you must guarantee they are used from only one thread at a time or synchronized externally by you. - - Using this flag may increase performance because internal mutexes are not used. - */ - VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT = 0x00000001, - /** \brief Enables usage of VK_KHR_dedicated_allocation extension. - - The flag works only if VmaAllocatorCreateInfo::vulkanApiVersion `== VK_API_VERSION_1_0`. - When it's `VK_API_VERSION_1_1`, the flag is ignored because the extension has been promoted to Vulkan 1.1. - - Using this extenion will automatically allocate dedicated blocks of memory for - some buffers and images instead of suballocating place for them out of bigger - memory blocks (as if you explicitly used #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT - flag) when it is recommended by the driver. It may improve performance on some - GPUs. 
- - You may set this flag only if you found out that following device extensions are - supported, you enabled them while creating Vulkan device passed as - VmaAllocatorCreateInfo::device, and you want them to be used internally by this - library: - - - VK_KHR_get_memory_requirements2 (device extension) - - VK_KHR_dedicated_allocation (device extension) - - When this flag is set, you can experience following warnings reported by Vulkan - validation layer. You can ignore them. - - > vkBindBufferMemory(): Binding memory to buffer 0x2d but vkGetBufferMemoryRequirements() has not been called on that buffer. - */ - VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT = 0x00000002, - /** - Enables usage of VK_KHR_bind_memory2 extension. - - The flag works only if VmaAllocatorCreateInfo::vulkanApiVersion `== VK_API_VERSION_1_0`. - When it's `VK_API_VERSION_1_1`, the flag is ignored because the extension has been promoted to Vulkan 1.1. - - You may set this flag only if you found out that this device extension is supported, - you enabled it while creating Vulkan device passed as VmaAllocatorCreateInfo::device, - and you want it to be used internally by this library. - - The extension provides functions `vkBindBufferMemory2KHR` and `vkBindImageMemory2KHR`, - which allow to pass a chain of `pNext` structures while binding. - This flag is required if you use `pNext` parameter in vmaBindBufferMemory2() or vmaBindImageMemory2(). - */ - VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT = 0x00000004, - /** - Enables usage of VK_EXT_memory_budget extension. - - You may set this flag only if you found out that this device extension is supported, - you enabled it while creating Vulkan device passed as VmaAllocatorCreateInfo::device, - and you want it to be used internally by this library, along with another instance extension - VK_KHR_get_physical_device_properties2, which is required by it (or Vulkan 1.1, where this extension is promoted). 
- - The extension provides query for current memory usage and budget, which will probably - be more accurate than an estimation used by the library otherwise. - */ - VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT = 0x00000008, - /** - Enables usage of VK_AMD_device_coherent_memory extension. - - You may set this flag only if you: - - - found out that this device extension is supported and enabled it while creating Vulkan device passed as VmaAllocatorCreateInfo::device, - - checked that `VkPhysicalDeviceCoherentMemoryFeaturesAMD::deviceCoherentMemory` is true and set it while creating the Vulkan device, - - want it to be used internally by this library. - - The extension and accompanying device feature provide access to memory types with - `VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD` and `VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD` flags. - They are useful mostly for writing breadcrumb markers - a common method for debugging GPU crash/hang/TDR. - - When the extension is not enabled, such memory types are still enumerated, but their usage is illegal. - To protect from this error, if you don't create the allocator with this flag, it will refuse to allocate any memory or create a custom pool in such memory type, - returning `VK_ERROR_FEATURE_NOT_PRESENT`. - */ - VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT = 0x00000010, - /** - Enables usage of "buffer device address" feature, which allows you to use function - `vkGetBufferDeviceAddress*` to get raw GPU pointer to a buffer and pass it for usage inside a shader. - - You may set this flag only if you: - - 1. (For Vulkan version < 1.2) Found as available and enabled device extension - VK_KHR_buffer_device_address. - This extension is promoted to core Vulkan 1.2. - 2. Found as available and enabled device feature `VkPhysicalDeviceBufferDeviceAddressFeatures::bufferDeviceAddress`. - - When this flag is set, you can create buffers with `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT` using VMA. 
- The library automatically adds `VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT` to - allocated memory blocks wherever it might be needed. - - For more information, see documentation chapter \ref enabling_buffer_device_address. - */ - VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT = 0x00000020, - /** - Enables usage of VK_EXT_memory_priority extension in the library. - - You may set this flag only if you found available and enabled this device extension, - along with `VkPhysicalDeviceMemoryPriorityFeaturesEXT::memoryPriority == VK_TRUE`, - while creating Vulkan device passed as VmaAllocatorCreateInfo::device. - - When this flag is used, VmaAllocationCreateInfo::priority and VmaPoolCreateInfo::priority - are used to set priorities of allocated Vulkan memory. Without it, these variables are ignored. - - A priority must be a floating-point value between 0 and 1, indicating the priority of the allocation relative to other memory allocations. - Larger values are higher priority. The granularity of the priorities is implementation-dependent. - It is automatically passed to every call to `vkAllocateMemory` done by the library using structure `VkMemoryPriorityAllocateInfoEXT`. - The value to be used for default priority is 0.5. - For more details, see the documentation of the VK_EXT_memory_priority extension. - */ - VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT = 0x00000040, - - VMA_ALLOCATOR_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VmaAllocatorCreateFlagBits; -typedef VkFlags VmaAllocatorCreateFlags; - -/** \brief Pointers to some Vulkan functions - a subset used by the library. - -Used in VmaAllocatorCreateInfo::pVulkanFunctions. 
-*/ -typedef struct VmaVulkanFunctions { - PFN_vkGetPhysicalDeviceProperties VMA_NULLABLE vkGetPhysicalDeviceProperties; - PFN_vkGetPhysicalDeviceMemoryProperties VMA_NULLABLE vkGetPhysicalDeviceMemoryProperties; - PFN_vkAllocateMemory VMA_NULLABLE vkAllocateMemory; - PFN_vkFreeMemory VMA_NULLABLE vkFreeMemory; - PFN_vkMapMemory VMA_NULLABLE vkMapMemory; - PFN_vkUnmapMemory VMA_NULLABLE vkUnmapMemory; - PFN_vkFlushMappedMemoryRanges VMA_NULLABLE vkFlushMappedMemoryRanges; - PFN_vkInvalidateMappedMemoryRanges VMA_NULLABLE vkInvalidateMappedMemoryRanges; - PFN_vkBindBufferMemory VMA_NULLABLE vkBindBufferMemory; - PFN_vkBindImageMemory VMA_NULLABLE vkBindImageMemory; - PFN_vkGetBufferMemoryRequirements VMA_NULLABLE vkGetBufferMemoryRequirements; - PFN_vkGetImageMemoryRequirements VMA_NULLABLE vkGetImageMemoryRequirements; - PFN_vkCreateBuffer VMA_NULLABLE vkCreateBuffer; - PFN_vkDestroyBuffer VMA_NULLABLE vkDestroyBuffer; - PFN_vkCreateImage VMA_NULLABLE vkCreateImage; - PFN_vkDestroyImage VMA_NULLABLE vkDestroyImage; - PFN_vkCmdCopyBuffer VMA_NULLABLE vkCmdCopyBuffer; -#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - PFN_vkGetBufferMemoryRequirements2KHR VMA_NULLABLE vkGetBufferMemoryRequirements2KHR; - PFN_vkGetImageMemoryRequirements2KHR VMA_NULLABLE vkGetImageMemoryRequirements2KHR; -#endif -#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000 - PFN_vkBindBufferMemory2KHR VMA_NULLABLE vkBindBufferMemory2KHR; - PFN_vkBindImageMemory2KHR VMA_NULLABLE vkBindImageMemory2KHR; -#endif -#if VMA_MEMORY_BUDGET || VMA_VULKAN_VERSION >= 1001000 - PFN_vkGetPhysicalDeviceMemoryProperties2KHR VMA_NULLABLE vkGetPhysicalDeviceMemoryProperties2KHR; -#endif -} VmaVulkanFunctions; - -/// Flags to be used in VmaRecordSettings::flags. -typedef enum VmaRecordFlagBits { - /** \brief Enables flush after recording every function call. - - Enable it if you expect your application to crash, which may leave recording file truncated. - It may degrade performance though. 
- */ - VMA_RECORD_FLUSH_AFTER_CALL_BIT = 0x00000001, - - VMA_RECORD_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VmaRecordFlagBits; -typedef VkFlags VmaRecordFlags; - -/// Parameters for recording calls to VMA functions. To be used in VmaAllocatorCreateInfo::pRecordSettings. -typedef struct VmaRecordSettings -{ - /// Flags for recording. Use #VmaRecordFlagBits enum. - VmaRecordFlags flags; - /** \brief Path to the file that should be written by the recording. - - Suggested extension: "csv". - If the file already exists, it will be overwritten. - It will be opened for the whole time #VmaAllocator object is alive. - If opening this file fails, creation of the whole allocator object fails. - */ - const char* VMA_NOT_NULL pFilePath; -} VmaRecordSettings; - -/// Description of a Allocator to be created. -typedef struct VmaAllocatorCreateInfo -{ - /// Flags for created allocator. Use #VmaAllocatorCreateFlagBits enum. - VmaAllocatorCreateFlags flags; - /// Vulkan physical device. - /** It must be valid throughout whole lifetime of created allocator. */ - VkPhysicalDevice VMA_NOT_NULL physicalDevice; - /// Vulkan device. - /** It must be valid throughout whole lifetime of created allocator. */ - VkDevice VMA_NOT_NULL device; - /// Preferred size of a single `VkDeviceMemory` block to be allocated from large heaps > 1 GiB. Optional. - /** Set to 0 to use default, which is currently 256 MiB. */ - VkDeviceSize preferredLargeHeapBlockSize; - /// Custom CPU memory allocation callbacks. Optional. - /** Optional, can be null. When specified, will also be used for all CPU-side memory allocations. */ - const VkAllocationCallbacks* VMA_NULLABLE pAllocationCallbacks; - /// Informative callbacks for `vkAllocateMemory`, `vkFreeMemory`. Optional. - /** Optional, can be null. */ - const VmaDeviceMemoryCallbacks* VMA_NULLABLE pDeviceMemoryCallbacks; - /** \brief Maximum number of additional frames that are in use at the same time as current frame. 
- - This value is used only when you make allocations with - VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag. Such allocation cannot become - lost if allocation.lastUseFrameIndex >= allocator.currentFrameIndex - frameInUseCount. - - For example, if you double-buffer your command buffers, so resources used for - rendering in previous frame may still be in use by the GPU at the moment you - allocate resources needed for the current frame, set this value to 1. - - If you want to allow any allocations other than used in the current frame to - become lost, set this value to 0. - */ - uint32_t frameInUseCount; - /** \brief Either null or a pointer to an array of limits on maximum number of bytes that can be allocated out of particular Vulkan memory heap. - - If not NULL, it must be a pointer to an array of - `VkPhysicalDeviceMemoryProperties::memoryHeapCount` elements, defining limit on - maximum number of bytes that can be allocated out of particular Vulkan memory - heap. - - Any of the elements may be equal to `VK_WHOLE_SIZE`, which means no limit on that - heap. This is also the default in case of `pHeapSizeLimit` = NULL. - - If there is a limit defined for a heap: - - - If user tries to allocate more memory from that heap using this allocator, - the allocation fails with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. - - If the limit is smaller than heap size reported in `VkMemoryHeap::size`, the - value of this limit will be reported instead when using vmaGetMemoryProperties(). - - Warning! Using this feature may not be equivalent to installing a GPU with - smaller amount of memory, because graphics driver doesn't necessary fail new - allocations with `VK_ERROR_OUT_OF_DEVICE_MEMORY` result when memory capacity is - exceeded. It may return success and just silently migrate some device memory - blocks to system RAM. This driver behavior can also be controlled using - VK_AMD_memory_overallocation_behavior extension. 
- */ - const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL("VkPhysicalDeviceMemoryProperties::memoryHeapCount") pHeapSizeLimit; - - /** \brief Pointers to Vulkan functions. Can be null. - - For details see [Pointers to Vulkan functions](@ref config_Vulkan_functions). - */ - const VmaVulkanFunctions* VMA_NULLABLE pVulkanFunctions; - /** \brief Parameters for recording of VMA calls. Can be null. - - If not null, it enables recording of calls to VMA functions to a file. - If support for recording is not enabled using `VMA_RECORDING_ENABLED` macro, - creation of the allocator object fails with `VK_ERROR_FEATURE_NOT_PRESENT`. - */ - const VmaRecordSettings* VMA_NULLABLE pRecordSettings; - /** \brief Handle to Vulkan instance object. - - Starting from version 3.0.0 this member is no longer optional, it must be set! - */ - VkInstance VMA_NOT_NULL instance; - /** \brief Optional. The highest version of Vulkan that the application is designed to use. - - It must be a value in the format as created by macro `VK_MAKE_VERSION` or a constant like: `VK_API_VERSION_1_1`, `VK_API_VERSION_1_0`. - The patch version number specified is ignored. Only the major and minor versions are considered. - It must be less or equal (preferably equal) to value as passed to `vkCreateInstance` as `VkApplicationInfo::apiVersion`. - Only versions 1.0, 1.1, 1.2 are supported by the current implementation. - Leaving it initialized to zero is equivalent to `VK_API_VERSION_1_0`. - */ - uint32_t vulkanApiVersion; -} VmaAllocatorCreateInfo; - -/// Creates Allocator object. -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateAllocator( - const VmaAllocatorCreateInfo* VMA_NOT_NULL pCreateInfo, - VmaAllocator VMA_NULLABLE * VMA_NOT_NULL pAllocator); - -/// Destroys allocator object. -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyAllocator( - VmaAllocator VMA_NULLABLE allocator); - -/** \brief Information about existing #VmaAllocator object. 
-*/ -typedef struct VmaAllocatorInfo -{ - /** \brief Handle to Vulkan instance object. - - This is the same value as has been passed through VmaAllocatorCreateInfo::instance. - */ - VkInstance VMA_NOT_NULL instance; - /** \brief Handle to Vulkan physical device object. - - This is the same value as has been passed through VmaAllocatorCreateInfo::physicalDevice. - */ - VkPhysicalDevice VMA_NOT_NULL physicalDevice; - /** \brief Handle to Vulkan device object. - - This is the same value as has been passed through VmaAllocatorCreateInfo::device. - */ - VkDevice VMA_NOT_NULL device; -} VmaAllocatorInfo; - -/** \brief Returns information about existing #VmaAllocator object - handle to Vulkan device etc. - -It might be useful if you want to keep just the #VmaAllocator handle and fetch other required handles to -`VkPhysicalDevice`, `VkDevice` etc. every time using this function. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocatorInfo(VmaAllocator VMA_NOT_NULL allocator, VmaAllocatorInfo* VMA_NOT_NULL pAllocatorInfo); - -/** -PhysicalDeviceProperties are fetched from physicalDevice by the allocator. -You can access it here, without fetching it again on your own. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetPhysicalDeviceProperties( - VmaAllocator VMA_NOT_NULL allocator, - const VkPhysicalDeviceProperties* VMA_NULLABLE * VMA_NOT_NULL ppPhysicalDeviceProperties); - -/** -PhysicalDeviceMemoryProperties are fetched from physicalDevice by the allocator. -You can access it here, without fetching it again on your own. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryProperties( - VmaAllocator VMA_NOT_NULL allocator, - const VkPhysicalDeviceMemoryProperties* VMA_NULLABLE * VMA_NOT_NULL ppPhysicalDeviceMemoryProperties); - -/** -\brief Given Memory Type Index, returns Property Flags of this memory type. - -This is just a convenience function. Same information can be obtained using -vmaGetMemoryProperties(). 
-*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryTypeProperties( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t memoryTypeIndex, - VkMemoryPropertyFlags* VMA_NOT_NULL pFlags); - -/** \brief Sets index of the current frame. - -This function must be used if you make allocations with -#VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT and -#VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flags to inform the allocator -when a new frame begins. Allocations queried using vmaGetAllocationInfo() cannot -become lost in the current frame. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaSetCurrentFrameIndex( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t frameIndex); - -/** \brief Calculated statistics of memory usage in entire allocator. -*/ -typedef struct VmaStatInfo -{ - /// Number of `VkDeviceMemory` Vulkan memory blocks allocated. - uint32_t blockCount; - /// Number of #VmaAllocation allocation objects allocated. - uint32_t allocationCount; - /// Number of free ranges of memory between allocations. - uint32_t unusedRangeCount; - /// Total number of bytes occupied by all allocations. - VkDeviceSize usedBytes; - /// Total number of bytes occupied by unused ranges. - VkDeviceSize unusedBytes; - VkDeviceSize allocationSizeMin, allocationSizeAvg, allocationSizeMax; - VkDeviceSize unusedRangeSizeMin, unusedRangeSizeAvg, unusedRangeSizeMax; -} VmaStatInfo; - -/// General statistics from current state of Allocator. -typedef struct VmaStats -{ - VmaStatInfo memoryType[VK_MAX_MEMORY_TYPES]; - VmaStatInfo memoryHeap[VK_MAX_MEMORY_HEAPS]; - VmaStatInfo total; -} VmaStats; - -/** \brief Retrieves statistics from current state of the Allocator. - -This function is called "calculate" not "get" because it has to traverse all -internal data structures, so it may be quite slow. For faster but more brief statistics -suitable to be called every frame or every allocation, use vmaGetBudget(). - -Note that when using allocator from multiple threads, returned information may immediately -become outdated. 
-*/ -VMA_CALL_PRE void VMA_CALL_POST vmaCalculateStats( - VmaAllocator VMA_NOT_NULL allocator, - VmaStats* VMA_NOT_NULL pStats); - -/** \brief Statistics of current memory usage and available budget, in bytes, for specific memory heap. -*/ -typedef struct VmaBudget -{ - /** \brief Sum size of all `VkDeviceMemory` blocks allocated from particular heap, in bytes. - */ - VkDeviceSize blockBytes; - - /** \brief Sum size of all allocations created in particular heap, in bytes. - - Usually less or equal than `blockBytes`. - Difference `blockBytes - allocationBytes` is the amount of memory allocated but unused - - available for new allocations or wasted due to fragmentation. - - It might be greater than `blockBytes` if there are some allocations in lost state, as they account - to this value as well. - */ - VkDeviceSize allocationBytes; - - /** \brief Estimated current memory usage of the program, in bytes. - - Fetched from system using `VK_EXT_memory_budget` extension if enabled. - - It might be different than `blockBytes` (usually higher) due to additional implicit objects - also occupying the memory, like swapchain, pipelines, descriptor heaps, command buffers, or - `VkDeviceMemory` blocks allocated outside of this library, if any. - */ - VkDeviceSize usage; - - /** \brief Estimated amount of memory available to the program, in bytes. - - Fetched from system using `VK_EXT_memory_budget` extension if enabled. - - It might be different (most probably smaller) than `VkMemoryHeap::size[heapIndex]` due to factors - external to the program, like other programs also consuming system resources. - Difference `budget - usage` is the amount of additional memory that can probably - be allocated without problems. Exceeding the budget may result in various problems. - */ - VkDeviceSize budget; -} VmaBudget; - -/** \brief Retrieves information about current memory budget for all memory heaps. 
- -\param[out] pBudget Must point to array with number of elements at least equal to number of memory heaps in physical device used. - -This function is called "get" not "calculate" because it is very fast, suitable to be called -every frame or every allocation. For more detailed statistics use vmaCalculateStats(). - -Note that when using allocator from multiple threads, returned information may immediately -become outdated. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetBudget( - VmaAllocator VMA_NOT_NULL allocator, - VmaBudget* VMA_NOT_NULL pBudget); - -#ifndef VMA_STATS_STRING_ENABLED -#define VMA_STATS_STRING_ENABLED 1 -#endif - -#if VMA_STATS_STRING_ENABLED - -/// Builds and returns statistics as string in JSON format. -/** @param[out] ppStatsString Must be freed using vmaFreeStatsString() function. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaBuildStatsString( - VmaAllocator VMA_NOT_NULL allocator, - char* VMA_NULLABLE * VMA_NOT_NULL ppStatsString, - VkBool32 detailedMap); - -VMA_CALL_PRE void VMA_CALL_POST vmaFreeStatsString( - VmaAllocator VMA_NOT_NULL allocator, - char* VMA_NULLABLE pStatsString); - -#endif // #if VMA_STATS_STRING_ENABLED - -/** \struct VmaPool -\brief Represents custom memory pool - -Fill structure VmaPoolCreateInfo and call function vmaCreatePool() to create it. -Call function vmaDestroyPool() to destroy it. - -For more information see [Custom memory pools](@ref choosing_memory_type_custom_memory_pools). -*/ -VK_DEFINE_HANDLE(VmaPool) - -typedef enum VmaMemoryUsage -{ - /** No intended memory usage specified. - Use other members of VmaAllocationCreateInfo to specify your requirements. - */ - VMA_MEMORY_USAGE_UNKNOWN = 0, - /** Memory will be used on device only, so fast access from the device is preferred. - It usually means device-local GPU (video) memory. - No need to be mappable on host. - It is roughly equivalent of `D3D12_HEAP_TYPE_DEFAULT`. - - Usage: - - - Resources written and read by device, e.g. images used as attachments. 
- - Resources transferred from host once (immutable) or infrequently and read by - device multiple times, e.g. textures to be sampled, vertex buffers, uniform - (constant) buffers, and majority of other types of resources used on GPU. - - Allocation may still end up in `HOST_VISIBLE` memory on some implementations. - In such case, you are free to map it. - You can use #VMA_ALLOCATION_CREATE_MAPPED_BIT with this usage type. - */ - VMA_MEMORY_USAGE_GPU_ONLY = 1, - /** Memory will be mappable on host. - It usually means CPU (system) memory. - Guarantees to be `HOST_VISIBLE` and `HOST_COHERENT`. - CPU access is typically uncached. Writes may be write-combined. - Resources created in this pool may still be accessible to the device, but access to them can be slow. - It is roughly equivalent of `D3D12_HEAP_TYPE_UPLOAD`. - - Usage: Staging copy of resources used as transfer source. - */ - VMA_MEMORY_USAGE_CPU_ONLY = 2, - /** - Memory that is both mappable on host (guarantees to be `HOST_VISIBLE`) and preferably fast to access by GPU. - CPU access is typically uncached. Writes may be write-combined. - - Usage: Resources written frequently by host (dynamic), read by device. E.g. textures (with LINEAR layout), vertex buffers, uniform buffers updated every frame or every draw call. - */ - VMA_MEMORY_USAGE_CPU_TO_GPU = 3, - /** Memory mappable on host (guarantees to be `HOST_VISIBLE`) and cached. - It is roughly equivalent of `D3D12_HEAP_TYPE_READBACK`. - - Usage: - - - Resources written by device, read by host - results of some computations, e.g. screen capture, average scene luminance for HDR tone mapping. - - Any resources read or accessed randomly on host, e.g. CPU-side copy of vertex buffer used as source of transfer, but also used for collision detection. - */ - VMA_MEMORY_USAGE_GPU_TO_CPU = 4, - /** CPU memory - memory that is preferably not `DEVICE_LOCAL`, but also not guaranteed to be `HOST_VISIBLE`. 
- - Usage: Staging copy of resources moved from GPU memory to CPU memory as part - of custom paging/residency mechanism, to be moved back to GPU memory when needed. - */ - VMA_MEMORY_USAGE_CPU_COPY = 5, - /** Lazily allocated GPU memory having `VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT`. - Exists mostly on mobile platforms. Using it on desktop PC or other GPUs with no such memory type present will fail the allocation. - - Usage: Memory for transient attachment images (color attachments, depth attachments etc.), created with `VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT`. - - Allocations with this usage are always created as dedicated - it implies #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. - */ - VMA_MEMORY_USAGE_GPU_LAZILY_ALLOCATED = 6, - - VMA_MEMORY_USAGE_MAX_ENUM = 0x7FFFFFFF -} VmaMemoryUsage; - -/// Flags to be passed as VmaAllocationCreateInfo::flags. -typedef enum VmaAllocationCreateFlagBits { - /** \brief Set this flag if the allocation should have its own memory block. - - Use it for special, big resources, like fullscreen images used as attachments. - - You should not use this flag if VmaAllocationCreateInfo::pool is not null. - */ - VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT = 0x00000001, - - /** \brief Set this flag to only try to allocate from existing `VkDeviceMemory` blocks and never create new such block. - - If new allocation cannot be placed in any of the existing blocks, allocation - fails with `VK_ERROR_OUT_OF_DEVICE_MEMORY` error. - - You should not use #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT and - #VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT at the same time. It makes no sense. - - If VmaAllocationCreateInfo::pool is not null, this flag is implied and ignored. */ - VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT = 0x00000002, - /** \brief Set this flag to use a memory that will be persistently mapped and retrieve pointer to it. - - Pointer to mapped memory will be returned through VmaAllocationInfo::pMappedData. 
- - It is valid to use this flag for allocation made from memory type that is not - `HOST_VISIBLE`. This flag is then ignored and memory is not mapped. This is - useful if you need an allocation that is efficient to use on GPU - (`DEVICE_LOCAL`) and still want to map it directly if possible on platforms that - support it (e.g. Intel GPU). - - You should not use this flag together with #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT. - */ - VMA_ALLOCATION_CREATE_MAPPED_BIT = 0x00000004, - /** Allocation created with this flag can become lost as a result of another - allocation with #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flag, so you - must check it before use. - - To check if allocation is not lost, call vmaGetAllocationInfo() and check if - VmaAllocationInfo::deviceMemory is not `VK_NULL_HANDLE`. - - For details about supporting lost allocations, see Lost Allocations - chapter of User Guide on Main Page. - - You should not use this flag together with #VMA_ALLOCATION_CREATE_MAPPED_BIT. - */ - VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT = 0x00000008, - /** While creating allocation using this flag, other allocations that were - created with flag #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT can become lost. - - For details about supporting lost allocations, see Lost Allocations - chapter of User Guide on Main Page. - */ - VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT = 0x00000010, - /** Set this flag to treat VmaAllocationCreateInfo::pUserData as pointer to a - null-terminated string. Instead of copying pointer value, a local copy of the - string is made and stored in allocation's `pUserData`. The string is automatically - freed together with the allocation. It is also used in vmaBuildStatsString(). - */ - VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT = 0x00000020, - /** Allocation will be created from upper stack in a double stack pool. - - This flag is only allowed for custom pools created with #VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT flag. 
- */ - VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT = 0x00000040, - /** Create both buffer/image and allocation, but don't bind them together. - It is useful when you want to bind yourself to do some more advanced binding, e.g. using some extensions. - The flag is meaningful only with functions that bind by default: vmaCreateBuffer(), vmaCreateImage(). - Otherwise it is ignored. - */ - VMA_ALLOCATION_CREATE_DONT_BIND_BIT = 0x00000080, - /** Create allocation only if additional device memory required for it, if any, won't exceed - memory budget. Otherwise return `VK_ERROR_OUT_OF_DEVICE_MEMORY`. - */ - VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT = 0x00000100, - - /** Allocation strategy that chooses smallest possible free range for the - allocation. - */ - VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT = 0x00010000, - /** Allocation strategy that chooses biggest possible free range for the - allocation. - */ - VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT = 0x00020000, - /** Allocation strategy that chooses first suitable free range for the - allocation. - - "First" doesn't necessarily means the one with smallest offset in memory, - but rather the one that is easiest and fastest to find. - */ - VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT = 0x00040000, - - /** Allocation strategy that tries to minimize memory usage. - */ - VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT = VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT, - /** Allocation strategy that tries to minimize allocation time. - */ - VMA_ALLOCATION_CREATE_STRATEGY_MIN_TIME_BIT = VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT, - /** Allocation strategy that tries to minimize memory fragmentation. - */ - VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT = VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT, - - /** A bit mask to extract only `STRATEGY` bits from entire set of flags. 
- */ - VMA_ALLOCATION_CREATE_STRATEGY_MASK = - VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT | - VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT | - VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT, - - VMA_ALLOCATION_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VmaAllocationCreateFlagBits; -typedef VkFlags VmaAllocationCreateFlags; - -typedef struct VmaAllocationCreateInfo -{ - /// Use #VmaAllocationCreateFlagBits enum. - VmaAllocationCreateFlags flags; - /** \brief Intended usage of memory. - - You can leave #VMA_MEMORY_USAGE_UNKNOWN if you specify memory requirements in other way. \n - If `pool` is not null, this member is ignored. - */ - VmaMemoryUsage usage; - /** \brief Flags that must be set in a Memory Type chosen for an allocation. - - Leave 0 if you specify memory requirements in other way. \n - If `pool` is not null, this member is ignored.*/ - VkMemoryPropertyFlags requiredFlags; - /** \brief Flags that preferably should be set in a memory type chosen for an allocation. - - Set to 0 if no additional flags are preferred. \n - If `pool` is not null, this member is ignored. */ - VkMemoryPropertyFlags preferredFlags; - /** \brief Bitmask containing one bit set for every memory type acceptable for this allocation. - - Value 0 is equivalent to `UINT32_MAX` - it means any memory type is accepted if - it meets other requirements specified by this structure, with no further - restrictions on memory type index. \n - If `pool` is not null, this member is ignored. - */ - uint32_t memoryTypeBits; - /** \brief Pool that this allocation should be created in. - - Leave `VK_NULL_HANDLE` to allocate from default pool. If not null, members: - `usage`, `requiredFlags`, `preferredFlags`, `memoryTypeBits` are ignored. - */ - VmaPool VMA_NULLABLE pool; - /** \brief Custom general-purpose pointer that will be stored in #VmaAllocation, can be read as VmaAllocationInfo::pUserData and changed using vmaSetAllocationUserData(). 
- - If #VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT is used, it must be either - null or pointer to a null-terminated string. The string will be then copied to - internal buffer, so it doesn't need to be valid after allocation call. - */ - void* VMA_NULLABLE pUserData; - /** \brief A floating-point value between 0 and 1, indicating the priority of the allocation relative to other memory allocations. - - It is used only when #VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT flag was used during creation of the #VmaAllocator object - and this allocation ends up as dedicated or is explicitly forced as dedicated using #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. - Otherwise, it has the priority of a memory block where it is placed and this variable is ignored. - */ - float priority; -} VmaAllocationCreateInfo; - -/** -\brief Helps to find memoryTypeIndex, given memoryTypeBits and VmaAllocationCreateInfo. - -This algorithm tries to find a memory type that: - -- Is allowed by memoryTypeBits. -- Contains all the flags from pAllocationCreateInfo->requiredFlags. -- Matches intended usage. -- Has as many flags from pAllocationCreateInfo->preferredFlags as possible. - -\return Returns VK_ERROR_FEATURE_NOT_PRESENT if not found. Receiving such result -from this function or any other allocating function probably means that your -device doesn't support any memory type with requested features for the specific -type of resource you want to use it for. Please check parameters of your -resource, like image layout (OPTIMAL versus LINEAR) or mip level count. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndex( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t memoryTypeBits, - const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, - uint32_t* VMA_NOT_NULL pMemoryTypeIndex); - -/** -\brief Helps to find memoryTypeIndex, given VkBufferCreateInfo and VmaAllocationCreateInfo. - -It can be useful e.g. 
to determine value to be used as VmaPoolCreateInfo::memoryTypeIndex. -It internally creates a temporary, dummy buffer that never has memory bound. -It is just a convenience function, equivalent to calling: - -- `vkCreateBuffer` -- `vkGetBufferMemoryRequirements` -- `vmaFindMemoryTypeIndex` -- `vkDestroyBuffer` -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForBufferInfo( - VmaAllocator VMA_NOT_NULL allocator, - const VkBufferCreateInfo* VMA_NOT_NULL pBufferCreateInfo, - const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, - uint32_t* VMA_NOT_NULL pMemoryTypeIndex); - -/** -\brief Helps to find memoryTypeIndex, given VkImageCreateInfo and VmaAllocationCreateInfo. - -It can be useful e.g. to determine value to be used as VmaPoolCreateInfo::memoryTypeIndex. -It internally creates a temporary, dummy image that never has memory bound. -It is just a convenience function, equivalent to calling: - -- `vkCreateImage` -- `vkGetImageMemoryRequirements` -- `vmaFindMemoryTypeIndex` -- `vkDestroyImage` -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForImageInfo( - VmaAllocator VMA_NOT_NULL allocator, - const VkImageCreateInfo* VMA_NOT_NULL pImageCreateInfo, - const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, - uint32_t* VMA_NOT_NULL pMemoryTypeIndex); - -/// Flags to be passed as VmaPoolCreateInfo::flags. -typedef enum VmaPoolCreateFlagBits { - /** \brief Use this flag if you always allocate only buffers and linear images or only optimal images out of this pool and so Buffer-Image Granularity can be ignored. - - This is an optional optimization flag. - - If you always allocate using vmaCreateBuffer(), vmaCreateImage(), - vmaAllocateMemoryForBuffer(), then you don't need to use it because allocator - knows exact type of your allocations so it can handle Buffer-Image Granularity - in the optimal way. 
- - If you also allocate using vmaAllocateMemoryForImage() or vmaAllocateMemory(), - exact type of such allocations is not known, so allocator must be conservative - in handling Buffer-Image Granularity, which can lead to suboptimal allocation - (wasted memory). In that case, if you can make sure you always allocate only - buffers and linear images or only optimal images out of this pool, use this flag - to make allocator disregard Buffer-Image Granularity and so make allocations - faster and more optimal. - */ - VMA_POOL_CREATE_IGNORE_BUFFER_IMAGE_GRANULARITY_BIT = 0x00000002, - - /** \brief Enables alternative, linear allocation algorithm in this pool. - - Specify this flag to enable linear allocation algorithm, which always creates - new allocations after last one and doesn't reuse space from allocations freed in - between. It trades memory consumption for simplified algorithm and data - structure, which has better performance and uses less memory for metadata. - - By using this flag, you can achieve behavior of free-at-once, stack, - ring buffer, and double stack. For details, see documentation chapter - \ref linear_algorithm. - - When using this flag, you must specify VmaPoolCreateInfo::maxBlockCount == 1 (or 0 for default). - - For more details, see [Linear allocation algorithm](@ref linear_algorithm). - */ - VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT = 0x00000004, - - /** \brief Enables alternative, buddy allocation algorithm in this pool. - - It operates on a tree of blocks, each having size that is a power of two and - a half of its parent's size. Comparing to default algorithm, this one provides - faster allocation and deallocation and decreased external fragmentation, - at the expense of more memory wasted (internal fragmentation). - - For more details, see [Buddy allocation algorithm](@ref buddy_algorithm). - */ - VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT = 0x00000008, - - /** Bit mask to extract only `ALGORITHM` bits from entire set of flags. 
- */ - VMA_POOL_CREATE_ALGORITHM_MASK = - VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT | - VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT, - - VMA_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VmaPoolCreateFlagBits; -typedef VkFlags VmaPoolCreateFlags; - -/** \brief Describes parameter of created #VmaPool. -*/ -typedef struct VmaPoolCreateInfo { - /** \brief Vulkan memory type index to allocate this pool from. - */ - uint32_t memoryTypeIndex; - /** \brief Use combination of #VmaPoolCreateFlagBits. - */ - VmaPoolCreateFlags flags; - /** \brief Size of a single `VkDeviceMemory` block to be allocated as part of this pool, in bytes. Optional. - - Specify nonzero to set explicit, constant size of memory blocks used by this - pool. - - Leave 0 to use default and let the library manage block sizes automatically. - Sizes of particular blocks may vary. - */ - VkDeviceSize blockSize; - /** \brief Minimum number of blocks to be always allocated in this pool, even if they stay empty. - - Set to 0 to have no preallocated blocks and allow the pool be completely empty. - */ - size_t minBlockCount; - /** \brief Maximum number of blocks that can be allocated in this pool. Optional. - - Set to 0 to use default, which is `SIZE_MAX`, which means no limit. - - Set to same value as VmaPoolCreateInfo::minBlockCount to have fixed amount of memory allocated - throughout whole lifetime of this pool. - */ - size_t maxBlockCount; - /** \brief Maximum number of additional frames that are in use at the same time as current frame. - - This value is used only when you make allocations with - #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag. Such allocation cannot become - lost if allocation.lastUseFrameIndex >= allocator.currentFrameIndex - frameInUseCount. - - For example, if you double-buffer your command buffers, so resources used for - rendering in previous frame may still be in use by the GPU at the moment you - allocate resources needed for the current frame, set this value to 1. 
- - If you want to allow any allocations other than used in the current frame to - become lost, set this value to 0. - */ - uint32_t frameInUseCount; - /** \brief A floating-point value between 0 and 1, indicating the priority of the allocations in this pool relative to other memory allocations. - - It is used only when #VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT flag was used during creation of the #VmaAllocator object. - Otherwise, this variable is ignored. - */ - float priority; -} VmaPoolCreateInfo; - -/** \brief Describes parameter of existing #VmaPool. -*/ -typedef struct VmaPoolStats { - /** \brief Total amount of `VkDeviceMemory` allocated from Vulkan for this pool, in bytes. - */ - VkDeviceSize size; - /** \brief Total number of bytes in the pool not used by any #VmaAllocation. - */ - VkDeviceSize unusedSize; - /** \brief Number of #VmaAllocation objects created from this pool that were not destroyed or lost. - */ - size_t allocationCount; - /** \brief Number of continuous memory ranges in the pool not used by any #VmaAllocation. - */ - size_t unusedRangeCount; - /** \brief Size of the largest continuous free memory region available for new allocation. - - Making a new allocation of that size is not guaranteed to succeed because of - possible additional margin required to respect alignment and buffer/image - granularity. - */ - VkDeviceSize unusedRangeSizeMax; - /** \brief Number of `VkDeviceMemory` blocks allocated for this pool. - */ - size_t blockCount; -} VmaPoolStats; - -/** \brief Allocates Vulkan device memory and creates #VmaPool object. - -@param allocator Allocator object. -@param pCreateInfo Parameters of pool to create. -@param[out] pPool Handle to created pool. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreatePool( - VmaAllocator VMA_NOT_NULL allocator, - const VmaPoolCreateInfo* VMA_NOT_NULL pCreateInfo, - VmaPool VMA_NULLABLE * VMA_NOT_NULL pPool); - -/** \brief Destroys #VmaPool object and frees Vulkan device memory. 
-*/ -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyPool( - VmaAllocator VMA_NOT_NULL allocator, - VmaPool VMA_NULLABLE pool); - -/** \brief Retrieves statistics of existing #VmaPool object. - -@param allocator Allocator object. -@param pool Pool object. -@param[out] pPoolStats Statistics of specified pool. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolStats( - VmaAllocator VMA_NOT_NULL allocator, - VmaPool VMA_NOT_NULL pool, - VmaPoolStats* VMA_NOT_NULL pPoolStats); - -/** \brief Marks all allocations in given pool as lost if they are not used in current frame or VmaPoolCreateInfo::frameInUseCount back from now. - -@param allocator Allocator object. -@param pool Pool. -@param[out] pLostAllocationCount Number of allocations marked as lost. Optional - pass null if you don't need this information. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaMakePoolAllocationsLost( - VmaAllocator VMA_NOT_NULL allocator, - VmaPool VMA_NOT_NULL pool, - size_t* VMA_NULLABLE pLostAllocationCount); - -/** \brief Checks magic number in margins around all allocations in given memory pool in search for corruptions. - -Corruption detection is enabled only when `VMA_DEBUG_DETECT_CORRUPTION` macro is defined to nonzero, -`VMA_DEBUG_MARGIN` is defined to nonzero and the pool is created in memory type that is -`HOST_VISIBLE` and `HOST_COHERENT`. For more information, see [Corruption detection](@ref debugging_memory_usage_corruption_detection). - -Possible return values: - -- `VK_ERROR_FEATURE_NOT_PRESENT` - corruption detection is not enabled for specified pool. -- `VK_SUCCESS` - corruption detection has been performed and succeeded. -- `VK_ERROR_VALIDATION_FAILED_EXT` - corruption detection has been performed and found memory corruptions around one of the allocations. - `VMA_ASSERT` is also fired in that case. -- Other value: Error returned by Vulkan, e.g. memory mapping failure. 
-*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckPoolCorruption(VmaAllocator VMA_NOT_NULL allocator, VmaPool VMA_NOT_NULL pool); - -/** \brief Retrieves name of a custom pool. - -After the call `ppName` is either null or points to an internally-owned null-terminated string -containing name of the pool that was previously set. The pointer becomes invalid when the pool is -destroyed or its name is changed using vmaSetPoolName(). -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolName( - VmaAllocator VMA_NOT_NULL allocator, - VmaPool VMA_NOT_NULL pool, - const char* VMA_NULLABLE * VMA_NOT_NULL ppName); - -/** \brief Sets name of a custom pool. - -`pName` can be either null or pointer to a null-terminated string with new name for the pool. -Function makes internal copy of the string, so it can be changed or freed immediately after this call. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaSetPoolName( - VmaAllocator VMA_NOT_NULL allocator, - VmaPool VMA_NOT_NULL pool, - const char* VMA_NULLABLE pName); - -/** \struct VmaAllocation -\brief Represents single memory allocation. - -It may be either dedicated block of `VkDeviceMemory` or a specific region of a bigger block of this type -plus unique offset. - -There are multiple ways to create such object. -You need to fill structure VmaAllocationCreateInfo. -For more information see [Choosing memory type](@ref choosing_memory_type). - -Although the library provides convenience functions that create Vulkan buffer or image, -allocate memory for it and bind them together, -binding of the allocation to a buffer or an image is out of scope of the allocation itself. -Allocation object can exist without buffer/image bound, -binding can be done manually by the user, and destruction of it can be done -independently of destruction of the allocation. - -The object also remembers its size and some other information. -To retrieve this information, use function vmaGetAllocationInfo() and inspect -returned structure VmaAllocationInfo. 
- -Some kinds allocations can be in lost state. -For more information, see [Lost allocations](@ref lost_allocations). -*/ -VK_DEFINE_HANDLE(VmaAllocation) - -/** \brief Parameters of #VmaAllocation objects, that can be retrieved using function vmaGetAllocationInfo(). -*/ -typedef struct VmaAllocationInfo { - /** \brief Memory type index that this allocation was allocated from. - - It never changes. - */ - uint32_t memoryType; - /** \brief Handle to Vulkan memory object. - - Same memory object can be shared by multiple allocations. - - It can change after call to vmaDefragment() if this allocation is passed to the function, or if allocation is lost. - - If the allocation is lost, it is equal to `VK_NULL_HANDLE`. - */ - VkDeviceMemory VMA_NULLABLE_NON_DISPATCHABLE deviceMemory; - /** \brief Offset in `VkDeviceMemory` object to the beginning of this allocation, in bytes. `(deviceMemory, offset)` pair is unique to this allocation. - - You usually don't need to use this offset. If you create a buffer or an image together with the allocation using e.g. function - vmaCreateBuffer(), vmaCreateImage(), functions that operate on these resources refer to the beginning of the buffer or image, - not entire device memory block. Functions like vmaMapMemory(), vmaBindBufferMemory() also refer to the beginning of the allocation - and apply this offset automatically. - - It can change after call to vmaDefragment() if this allocation is passed to the function, or if allocation is lost. - */ - VkDeviceSize offset; - /** \brief Size of this allocation, in bytes. - - It never changes, unless allocation is lost. - - \note Allocation size returned in this variable may be greater than the size - requested for the resource e.g. as `VkBufferCreateInfo::size`. Whole size of the - allocation is accessible for operations on memory e.g. using a pointer after - mapping with vmaMapMemory(), but operations on the resource e.g. using - `vkCmdCopyBuffer` must be limited to the size of the resource. 
- */ - VkDeviceSize size; - /** \brief Pointer to the beginning of this allocation as mapped data. - - If the allocation hasn't been mapped using vmaMapMemory() and hasn't been - created with #VMA_ALLOCATION_CREATE_MAPPED_BIT flag, this value is null. - - It can change after call to vmaMapMemory(), vmaUnmapMemory(). - It can also change after call to vmaDefragment() if this allocation is passed to the function. - */ - void* VMA_NULLABLE pMappedData; - /** \brief Custom general-purpose pointer that was passed as VmaAllocationCreateInfo::pUserData or set using vmaSetAllocationUserData(). - - It can change after call to vmaSetAllocationUserData() for this allocation. - */ - void* VMA_NULLABLE pUserData; -} VmaAllocationInfo; - -/** \brief General purpose memory allocation. - -@param[out] pAllocation Handle to allocated memory. -@param[out] pAllocationInfo Optional. Information about allocated memory. It can be later fetched using function vmaGetAllocationInfo(). - -You should free the memory using vmaFreeMemory() or vmaFreeMemoryPages(). - -It is recommended to use vmaAllocateMemoryForBuffer(), vmaAllocateMemoryForImage(), -vmaCreateBuffer(), vmaCreateImage() instead whenever possible. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemory( - VmaAllocator VMA_NOT_NULL allocator, - const VkMemoryRequirements* VMA_NOT_NULL pVkMemoryRequirements, - const VmaAllocationCreateInfo* VMA_NOT_NULL pCreateInfo, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, - VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); - -/** \brief General purpose memory allocation for multiple allocation objects at once. - -@param allocator Allocator object. -@param pVkMemoryRequirements Memory requirements for each allocation. -@param pCreateInfo Creation parameters for each alloction. -@param allocationCount Number of allocations to make. -@param[out] pAllocations Pointer to array that will be filled with handles to created allocations. -@param[out] pAllocationInfo Optional. 
Pointer to array that will be filled with parameters of created allocations. - -You should free the memory using vmaFreeMemory() or vmaFreeMemoryPages(). - -Word "pages" is just a suggestion to use this function to allocate pieces of memory needed for sparse binding. -It is just a general purpose allocation function able to make multiple allocations at once. -It may be internally optimized to be more efficient than calling vmaAllocateMemory() `allocationCount` times. - -All allocations are made using same parameters. All of them are created out of the same memory pool and type. -If any allocation fails, all allocations already made within this function call are also freed, so that when -returned result is not `VK_SUCCESS`, `pAllocation` array is always entirely filled with `VK_NULL_HANDLE`. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryPages( - VmaAllocator VMA_NOT_NULL allocator, - const VkMemoryRequirements* VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pVkMemoryRequirements, - const VmaAllocationCreateInfo* VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pCreateInfo, - size_t allocationCount, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations, - VmaAllocationInfo* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocationInfo); - -/** -@param[out] pAllocation Handle to allocated memory. -@param[out] pAllocationInfo Optional. Information about allocated memory. It can be later fetched using function vmaGetAllocationInfo(). - -You should free the memory using vmaFreeMemory(). -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForBuffer( - VmaAllocator VMA_NOT_NULL allocator, - VkBuffer VMA_NOT_NULL_NON_DISPATCHABLE buffer, - const VmaAllocationCreateInfo* VMA_NOT_NULL pCreateInfo, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, - VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); - -/// Function similar to vmaAllocateMemoryForBuffer(). 
-VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForImage( - VmaAllocator VMA_NOT_NULL allocator, - VkImage VMA_NOT_NULL_NON_DISPATCHABLE image, - const VmaAllocationCreateInfo* VMA_NOT_NULL pCreateInfo, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, - VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); - -/** \brief Frees memory previously allocated using vmaAllocateMemory(), vmaAllocateMemoryForBuffer(), or vmaAllocateMemoryForImage(). - -Passing `VK_NULL_HANDLE` as `allocation` is valid. Such function call is just skipped. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemory( - VmaAllocator VMA_NOT_NULL allocator, - const VmaAllocation VMA_NULLABLE allocation); - -/** \brief Frees memory and destroys multiple allocations. - -Word "pages" is just a suggestion to use this function to free pieces of memory used for sparse binding. -It is just a general purpose function to free memory and destroy allocations made using e.g. vmaAllocateMemory(), -vmaAllocateMemoryPages() and other functions. -It may be internally optimized to be more efficient than calling vmaFreeMemory() `allocationCount` times. - -Allocations in `pAllocations` array can come from any memory pools and types. -Passing `VK_NULL_HANDLE` as elements of `pAllocations` array is valid. Such entries are just skipped. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemoryPages( - VmaAllocator VMA_NOT_NULL allocator, - size_t allocationCount, - const VmaAllocation VMA_NULLABLE * VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations); - -/** \brief Returns current information about specified allocation and atomically marks it as used in current frame. - -Current paramteres of given allocation are returned in `pAllocationInfo`. - -This function also atomically "touches" allocation - marks it as used in current frame, -just like vmaTouchAllocation(). -If the allocation is in lost state, `pAllocationInfo->deviceMemory == VK_NULL_HANDLE`. 
- -Although this function uses atomics and doesn't lock any mutex, so it should be quite efficient, -you can avoid calling it too often. - -- You can retrieve same VmaAllocationInfo structure while creating your resource, from function - vmaCreateBuffer(), vmaCreateImage(). You can remember it if you are sure parameters don't change - (e.g. due to defragmentation or allocation becoming lost). -- If you just want to check if allocation is not lost, vmaTouchAllocation() will work faster. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocationInfo( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VmaAllocationInfo* VMA_NOT_NULL pAllocationInfo); - -/** \brief Returns `VK_TRUE` if allocation is not lost and atomically marks it as used in current frame. - -If the allocation has been created with #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag, -this function returns `VK_TRUE` if it's not in lost state, so it can still be used. -It then also atomically "touches" the allocation - marks it as used in current frame, -so that you can be sure it won't become lost in current frame or next `frameInUseCount` frames. - -If the allocation is in lost state, the function returns `VK_FALSE`. -Memory of such allocation, as well as buffer or image bound to it, should not be used. -Lost allocation and the buffer/image still need to be destroyed. - -If the allocation has been created without #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag, -this function always returns `VK_TRUE`. -*/ -VMA_CALL_PRE VkBool32 VMA_CALL_POST vmaTouchAllocation( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation); - -/** \brief Sets pUserData in given allocation to new value. - -If the allocation was created with VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT, -pUserData must be either null, or pointer to a null-terminated string. The function -makes local copy of the string and sets it as allocation's `pUserData`. 
String -passed as pUserData doesn't need to be valid for whole lifetime of the allocation - -you can free it after this call. String previously pointed by allocation's -pUserData is freed from memory. - -If the flag was not used, the value of pointer `pUserData` is just copied to -allocation's `pUserData`. It is opaque, so you can use it however you want - e.g. -as a pointer, ordinal number or some handle to you own data. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaSetAllocationUserData( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - void* VMA_NULLABLE pUserData); - -/** \brief Creates new allocation that is in lost state from the beginning. - -It can be useful if you need a dummy, non-null allocation. - -You still need to destroy created object using vmaFreeMemory(). - -Returned allocation is not tied to any specific memory pool or memory type and -not bound to any image or buffer. It has size = 0. It cannot be turned into -a real, non-empty allocation. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaCreateLostAllocation( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation); - -/** \brief Maps memory represented by given allocation and returns pointer to it. - -Maps memory represented by given allocation to make it accessible to CPU code. -When succeeded, `*ppData` contains pointer to first byte of this memory. -If the allocation is part of bigger `VkDeviceMemory` block, the pointer is -correctly offseted to the beginning of region assigned to this particular -allocation. - -Mapping is internally reference-counted and synchronized, so despite raw Vulkan -function `vkMapMemory()` cannot be used to map same block of `VkDeviceMemory` -multiple times simultaneously, it is safe to call this function on allocations -assigned to the same memory block. Actual Vulkan memory will be mapped on first -mapping and unmapped on last unmapping. 
- -If the function succeeded, you must call vmaUnmapMemory() to unmap the -allocation when mapping is no longer needed or before freeing the allocation, at -the latest. - -It also safe to call this function multiple times on the same allocation. You -must call vmaUnmapMemory() same number of times as you called vmaMapMemory(). - -It is also safe to call this function on allocation created with -#VMA_ALLOCATION_CREATE_MAPPED_BIT flag. Its memory stays mapped all the time. -You must still call vmaUnmapMemory() same number of times as you called -vmaMapMemory(). You must not call vmaUnmapMemory() additional time to free the -"0-th" mapping made automatically due to #VMA_ALLOCATION_CREATE_MAPPED_BIT flag. - -This function fails when used on allocation made in memory type that is not -`HOST_VISIBLE`. - -This function always fails when called for allocation that was created with -#VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag. Such allocations cannot be -mapped. - -This function doesn't automatically flush or invalidate caches. -If the allocation is made from a memory types that is not `HOST_COHERENT`, -you also need to use vmaInvalidateAllocation() / vmaFlushAllocation(), as required by Vulkan specification. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaMapMemory( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - void* VMA_NULLABLE * VMA_NOT_NULL ppData); - -/** \brief Unmaps memory represented by given allocation, mapped previously using vmaMapMemory(). - -For details, see description of vmaMapMemory(). - -This function doesn't automatically flush or invalidate caches. -If the allocation is made from a memory types that is not `HOST_COHERENT`, -you also need to use vmaInvalidateAllocation() / vmaFlushAllocation(), as required by Vulkan specification. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaUnmapMemory( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation); - -/** \brief Flushes memory of given allocation. 
- -Calls `vkFlushMappedMemoryRanges()` for memory associated with given range of given allocation. -It needs to be called after writing to a mapped memory for memory types that are not `HOST_COHERENT`. -Unmap operation doesn't do that automatically. - -- `offset` must be relative to the beginning of allocation. -- `size` can be `VK_WHOLE_SIZE`. It means all memory from `offset` the the end of given allocation. -- `offset` and `size` don't have to be aligned. - They are internally rounded down/up to multiply of `nonCoherentAtomSize`. -- If `size` is 0, this call is ignored. -- If memory type that the `allocation` belongs to is not `HOST_VISIBLE` or it is `HOST_COHERENT`, - this call is ignored. - -Warning! `offset` and `size` are relative to the contents of given `allocation`. -If you mean whole allocation, you can pass 0 and `VK_WHOLE_SIZE`, respectively. -Do not pass allocation's offset as `offset`!!! - -This function returns the `VkResult` from `vkFlushMappedMemoryRanges` if it is -called, otherwise `VK_SUCCESS`. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocation( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VkDeviceSize offset, - VkDeviceSize size); - -/** \brief Invalidates memory of given allocation. - -Calls `vkInvalidateMappedMemoryRanges()` for memory associated with given range of given allocation. -It needs to be called before reading from a mapped memory for memory types that are not `HOST_COHERENT`. -Map operation doesn't do that automatically. - -- `offset` must be relative to the beginning of allocation. -- `size` can be `VK_WHOLE_SIZE`. It means all memory from `offset` the the end of given allocation. -- `offset` and `size` don't have to be aligned. - They are internally rounded down/up to multiply of `nonCoherentAtomSize`. -- If `size` is 0, this call is ignored. -- If memory type that the `allocation` belongs to is not `HOST_VISIBLE` or it is `HOST_COHERENT`, - this call is ignored. - -Warning! 
`offset` and `size` are relative to the contents of given `allocation`. -If you mean whole allocation, you can pass 0 and `VK_WHOLE_SIZE`, respectively. -Do not pass allocation's offset as `offset`!!! - -This function returns the `VkResult` from `vkInvalidateMappedMemoryRanges` if -it is called, otherwise `VK_SUCCESS`. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocation( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VkDeviceSize offset, - VkDeviceSize size); - -/** \brief Flushes memory of given set of allocations. - -Calls `vkFlushMappedMemoryRanges()` for memory associated with given ranges of given allocations. -For more information, see documentation of vmaFlushAllocation(). - -\param allocator -\param allocationCount -\param allocations -\param offsets If not null, it must point to an array of offsets of regions to flush, relative to the beginning of respective allocations. Null means all ofsets are zero. -\param sizes If not null, it must point to an array of sizes of regions to flush in respective allocations. Null means `VK_WHOLE_SIZE` for all allocations. - -This function returns the `VkResult` from `vkFlushMappedMemoryRanges` if it is -called, otherwise `VK_SUCCESS`. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocations( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t allocationCount, - const VmaAllocation VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) allocations, - const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) offsets, - const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) sizes); - -/** \brief Invalidates memory of given set of allocations. - -Calls `vkInvalidateMappedMemoryRanges()` for memory associated with given ranges of given allocations. -For more information, see documentation of vmaInvalidateAllocation(). 
- -\param allocator -\param allocationCount -\param allocations -\param offsets If not null, it must point to an array of offsets of regions to flush, relative to the beginning of respective allocations. Null means all ofsets are zero. -\param sizes If not null, it must point to an array of sizes of regions to flush in respective allocations. Null means `VK_WHOLE_SIZE` for all allocations. - -This function returns the `VkResult` from `vkInvalidateMappedMemoryRanges` if it is -called, otherwise `VK_SUCCESS`. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocations( - VmaAllocator VMA_NOT_NULL allocator, - uint32_t allocationCount, - const VmaAllocation VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) allocations, - const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) offsets, - const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) sizes); - -/** \brief Checks magic number in margins around all allocations in given memory types (in both default and custom pools) in search for corruptions. - -@param memoryTypeBits Bit mask, where each bit set means that a memory type with that index should be checked. - -Corruption detection is enabled only when `VMA_DEBUG_DETECT_CORRUPTION` macro is defined to nonzero, -`VMA_DEBUG_MARGIN` is defined to nonzero and only for memory types that are -`HOST_VISIBLE` and `HOST_COHERENT`. For more information, see [Corruption detection](@ref debugging_memory_usage_corruption_detection). - -Possible return values: - -- `VK_ERROR_FEATURE_NOT_PRESENT` - corruption detection is not enabled for any of specified memory types. -- `VK_SUCCESS` - corruption detection has been performed and succeeded. -- `VK_ERROR_VALIDATION_FAILED_EXT` - corruption detection has been performed and found memory corruptions around one of the allocations. - `VMA_ASSERT` is also fired in that case. -- Other value: Error returned by Vulkan, e.g. memory mapping failure. 
-*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckCorruption(VmaAllocator VMA_NOT_NULL allocator, uint32_t memoryTypeBits); - -/** \struct VmaDefragmentationContext -\brief Represents Opaque object that represents started defragmentation process. - -Fill structure #VmaDefragmentationInfo2 and call function vmaDefragmentationBegin() to create it. -Call function vmaDefragmentationEnd() to destroy it. -*/ -VK_DEFINE_HANDLE(VmaDefragmentationContext) - -/// Flags to be used in vmaDefragmentationBegin(). None at the moment. Reserved for future use. -typedef enum VmaDefragmentationFlagBits { - VMA_DEFRAGMENTATION_FLAG_INCREMENTAL = 0x1, - VMA_DEFRAGMENTATION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF -} VmaDefragmentationFlagBits; -typedef VkFlags VmaDefragmentationFlags; - -/** \brief Parameters for defragmentation. - -To be used with function vmaDefragmentationBegin(). -*/ -typedef struct VmaDefragmentationInfo2 { - /** \brief Reserved for future use. Should be 0. - */ - VmaDefragmentationFlags flags; - /** \brief Number of allocations in `pAllocations` array. - */ - uint32_t allocationCount; - /** \brief Pointer to array of allocations that can be defragmented. - - The array should have `allocationCount` elements. - The array should not contain nulls. - Elements in the array should be unique - same allocation cannot occur twice. - It is safe to pass allocations that are in the lost state - they are ignored. - All allocations not present in this array are considered non-moveable during this defragmentation. - */ - const VmaAllocation VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations; - /** \brief Optional, output. Pointer to array that will be filled with information whether the allocation at certain index has been changed during defragmentation. - - The array should have `allocationCount` elements. - You can pass null if you are not interested in this information. 
- */ - VkBool32* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocationsChanged; - /** \brief Numer of pools in `pPools` array. - */ - uint32_t poolCount; - /** \brief Either null or pointer to array of pools to be defragmented. - - All the allocations in the specified pools can be moved during defragmentation - and there is no way to check if they were really moved as in `pAllocationsChanged`, - so you must query all the allocations in all these pools for new `VkDeviceMemory` - and offset using vmaGetAllocationInfo() if you might need to recreate buffers - and images bound to them. - - The array should have `poolCount` elements. - The array should not contain nulls. - Elements in the array should be unique - same pool cannot occur twice. - - Using this array is equivalent to specifying all allocations from the pools in `pAllocations`. - It might be more efficient. - */ - const VmaPool VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(poolCount) pPools; - /** \brief Maximum total numbers of bytes that can be copied while moving allocations to different places using transfers on CPU side, like `memcpy()`, `memmove()`. - - `VK_WHOLE_SIZE` means no limit. - */ - VkDeviceSize maxCpuBytesToMove; - /** \brief Maximum number of allocations that can be moved to a different place using transfers on CPU side, like `memcpy()`, `memmove()`. - - `UINT32_MAX` means no limit. - */ - uint32_t maxCpuAllocationsToMove; - /** \brief Maximum total numbers of bytes that can be copied while moving allocations to different places using transfers on GPU side, posted to `commandBuffer`. - - `VK_WHOLE_SIZE` means no limit. - */ - VkDeviceSize maxGpuBytesToMove; - /** \brief Maximum number of allocations that can be moved to a different place using transfers on GPU side, posted to `commandBuffer`. - - `UINT32_MAX` means no limit. - */ - uint32_t maxGpuAllocationsToMove; - /** \brief Optional. Command buffer where GPU copy commands will be posted. 
- - If not null, it must be a valid command buffer handle that supports Transfer queue type. - It must be in the recording state and outside of a render pass instance. - You need to submit it and make sure it finished execution before calling vmaDefragmentationEnd(). - - Passing null means that only CPU defragmentation will be performed. - */ - VkCommandBuffer VMA_NULLABLE commandBuffer; -} VmaDefragmentationInfo2; - -typedef struct VmaDefragmentationPassMoveInfo { - VmaAllocation VMA_NOT_NULL allocation; - VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory; - VkDeviceSize offset; -} VmaDefragmentationPassMoveInfo; - -/** \brief Parameters for incremental defragmentation steps. - -To be used with function vmaBeginDefragmentationPass(). -*/ -typedef struct VmaDefragmentationPassInfo { - uint32_t moveCount; - VmaDefragmentationPassMoveInfo* VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(moveCount) pMoves; -} VmaDefragmentationPassInfo; - -/** \brief Deprecated. Optional configuration parameters to be passed to function vmaDefragment(). - -\deprecated This is a part of the old interface. It is recommended to use structure #VmaDefragmentationInfo2 and function vmaDefragmentationBegin() instead. -*/ -typedef struct VmaDefragmentationInfo { - /** \brief Maximum total numbers of bytes that can be copied while moving allocations to different places. - - Default is `VK_WHOLE_SIZE`, which means no limit. - */ - VkDeviceSize maxBytesToMove; - /** \brief Maximum number of allocations that can be moved to different place. - - Default is `UINT32_MAX`, which means no limit. - */ - uint32_t maxAllocationsToMove; -} VmaDefragmentationInfo; - -/** \brief Statistics returned by function vmaDefragment(). */ -typedef struct VmaDefragmentationStats { - /// Total number of bytes that have been copied while moving allocations to different places. - VkDeviceSize bytesMoved; - /// Total number of bytes that have been released to the system by freeing empty `VkDeviceMemory` objects. 
- VkDeviceSize bytesFreed; - /// Number of allocations that have been moved to different places. - uint32_t allocationsMoved; - /// Number of empty `VkDeviceMemory` objects that have been released to the system. - uint32_t deviceMemoryBlocksFreed; -} VmaDefragmentationStats; - -/** \brief Begins defragmentation process. - -@param allocator Allocator object. -@param pInfo Structure filled with parameters of defragmentation. -@param[out] pStats Optional. Statistics of defragmentation. You can pass null if you are not interested in this information. -@param[out] pContext Context object that must be passed to vmaDefragmentationEnd() to finish defragmentation. -@return `VK_SUCCESS` and `*pContext == null` if defragmentation finished within this function call. `VK_NOT_READY` and `*pContext != null` if defragmentation has been started and you need to call vmaDefragmentationEnd() to finish it. Negative value in case of error. - -Use this function instead of old, deprecated vmaDefragment(). - -Warning! Between the call to vmaDefragmentationBegin() and vmaDefragmentationEnd(): - -- You should not use any of allocations passed as `pInfo->pAllocations` or - any allocations that belong to pools passed as `pInfo->pPools`, - including calling vmaGetAllocationInfo(), vmaTouchAllocation(), or access - their data. -- Some mutexes protecting internal data structures may be locked, so trying to - make or free any allocations, bind buffers or images, map memory, or launch - another simultaneous defragmentation in between may cause stall (when done on - another thread) or deadlock (when done on the same thread), unless you are - 100% sure that defragmented allocations are in different pools. -- Information returned via `pStats` and `pInfo->pAllocationsChanged` are undefined. - They become valid after call to vmaDefragmentationEnd(). -- If `pInfo->commandBuffer` is not null, you must submit that command buffer - and make sure it finished execution before calling vmaDefragmentationEnd(). 
- -For more information and important limitations regarding defragmentation, see documentation chapter: -[Defragmentation](@ref defragmentation). -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationBegin( - VmaAllocator VMA_NOT_NULL allocator, - const VmaDefragmentationInfo2* VMA_NOT_NULL pInfo, - VmaDefragmentationStats* VMA_NULLABLE pStats, - VmaDefragmentationContext VMA_NULLABLE * VMA_NOT_NULL pContext); - -/** \brief Ends defragmentation process. - -Use this function to finish defragmentation started by vmaDefragmentationBegin(). -It is safe to pass `context == null`. The function then does nothing. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationEnd( - VmaAllocator VMA_NOT_NULL allocator, - VmaDefragmentationContext VMA_NULLABLE context); - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBeginDefragmentationPass( - VmaAllocator VMA_NOT_NULL allocator, - VmaDefragmentationContext VMA_NULLABLE context, - VmaDefragmentationPassInfo* VMA_NOT_NULL pInfo -); -VMA_CALL_PRE VkResult VMA_CALL_POST vmaEndDefragmentationPass( - VmaAllocator VMA_NOT_NULL allocator, - VmaDefragmentationContext VMA_NULLABLE context -); - -/** \brief Deprecated. Compacts memory by moving allocations. - -@param pAllocations Array of allocations that can be moved during this compation. -@param allocationCount Number of elements in pAllocations and pAllocationsChanged arrays. -@param[out] pAllocationsChanged Array of boolean values that will indicate whether matching allocation in pAllocations array has been moved. This parameter is optional. Pass null if you don't need this information. -@param pDefragmentationInfo Configuration parameters. Optional - pass null to use default values. -@param[out] pDefragmentationStats Statistics returned by the function. Optional - pass null if you don't need this information. -@return `VK_SUCCESS` if completed, negative error code in case of error. - -\deprecated This is a part of the old interface. 
It is recommended to use structure #VmaDefragmentationInfo2 and function vmaDefragmentationBegin() instead. - -This function works by moving allocations to different places (different -`VkDeviceMemory` objects and/or different offsets) in order to optimize memory -usage. Only allocations that are in `pAllocations` array can be moved. All other -allocations are considered nonmovable in this call. Basic rules: - -- Only allocations made in memory types that have - `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` and `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` - flags can be compacted. You may pass other allocations but it makes no sense - - these will never be moved. -- Custom pools created with #VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT or - #VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT flag are not defragmented. Allocations - passed to this function that come from such pools are ignored. -- Allocations created with #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT or - created as dedicated allocations for any other reason are also ignored. -- Both allocations made with or without #VMA_ALLOCATION_CREATE_MAPPED_BIT - flag can be compacted. If not persistently mapped, memory will be mapped - temporarily inside this function if needed. -- You must not pass same #VmaAllocation object multiple times in `pAllocations` array. - -The function also frees empty `VkDeviceMemory` blocks. - -Warning: This function may be time-consuming, so you shouldn't call it too often -(like after every resource creation/destruction). -You can call it on special occasions (like when reloading a game level or -when you just destroyed a lot of objects). Calling it every frame may be OK, but -you should measure that on your platform. - -For more information, see [Defragmentation](@ref defragmentation) chapter. 
-*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragment( - VmaAllocator VMA_NOT_NULL allocator, - const VmaAllocation VMA_NOT_NULL * VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations, - size_t allocationCount, - VkBool32* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocationsChanged, - const VmaDefragmentationInfo* VMA_NULLABLE pDefragmentationInfo, - VmaDefragmentationStats* VMA_NULLABLE pDefragmentationStats); - -/** \brief Binds buffer to allocation. - -Binds specified buffer to region of memory represented by specified allocation. -Gets `VkDeviceMemory` handle and offset from the allocation. -If you want to create a buffer, allocate memory for it and bind them together separately, -you should use this function for binding instead of standard `vkBindBufferMemory()`, -because it ensures proper synchronization so that when a `VkDeviceMemory` object is used by multiple -allocations, calls to `vkBind*Memory()` or `vkMapMemory()` won't happen from multiple threads simultaneously -(which is illegal in Vulkan). - -It is recommended to use function vmaCreateBuffer() instead of this one. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VkBuffer VMA_NOT_NULL_NON_DISPATCHABLE buffer); - -/** \brief Binds buffer to allocation with additional parameters. - -@param allocationLocalOffset Additional offset to be added while binding, relative to the beginnig of the `allocation`. Normally it should be 0. -@param pNext A chain of structures to be attached to `VkBindBufferMemoryInfoKHR` structure used internally. Normally it should be null. - -This function is similar to vmaBindBufferMemory(), but it provides additional parameters. - -If `pNext` is not null, #VmaAllocator object must have been created with #VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT flag -or with VmaAllocatorCreateInfo::vulkanApiVersion `>= VK_API_VERSION_1_1`. Otherwise the call fails. 
-*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory2( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VkDeviceSize allocationLocalOffset, - VkBuffer VMA_NOT_NULL_NON_DISPATCHABLE buffer, - const void* VMA_NULLABLE pNext); - -/** \brief Binds image to allocation. - -Binds specified image to region of memory represented by specified allocation. -Gets `VkDeviceMemory` handle and offset from the allocation. -If you want to create an image, allocate memory for it and bind them together separately, -you should use this function for binding instead of standard `vkBindImageMemory()`, -because it ensures proper synchronization so that when a `VkDeviceMemory` object is used by multiple -allocations, calls to `vkBind*Memory()` or `vkMapMemory()` won't happen from multiple threads simultaneously -(which is illegal in Vulkan). - -It is recommended to use function vmaCreateImage() instead of this one. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VkImage VMA_NOT_NULL_NON_DISPATCHABLE image); - -/** \brief Binds image to allocation with additional parameters. - -@param allocationLocalOffset Additional offset to be added while binding, relative to the beginnig of the `allocation`. Normally it should be 0. -@param pNext A chain of structures to be attached to `VkBindImageMemoryInfoKHR` structure used internally. Normally it should be null. - -This function is similar to vmaBindImageMemory(), but it provides additional parameters. - -If `pNext` is not null, #VmaAllocator object must have been created with #VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT flag -or with VmaAllocatorCreateInfo::vulkanApiVersion `>= VK_API_VERSION_1_1`. Otherwise the call fails. 
-*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory2( - VmaAllocator VMA_NOT_NULL allocator, - VmaAllocation VMA_NOT_NULL allocation, - VkDeviceSize allocationLocalOffset, - VkImage VMA_NOT_NULL_NON_DISPATCHABLE image, - const void* VMA_NULLABLE pNext); - -/** -@param[out] pBuffer Buffer that was created. -@param[out] pAllocation Allocation that was created. -@param[out] pAllocationInfo Optional. Information about allocated memory. It can be later fetched using function vmaGetAllocationInfo(). - -This function automatically: - --# Creates buffer. --# Allocates appropriate memory for it. --# Binds the buffer with the memory. - -If any of these operations fail, buffer and allocation are not created, -returned value is negative error code, *pBuffer and *pAllocation are null. - -If the function succeeded, you must destroy both buffer and allocation when you -no longer need them using either convenience function vmaDestroyBuffer() or -separately, using `vkDestroyBuffer()` and vmaFreeMemory(). - -If #VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT flag was used, -VK_KHR_dedicated_allocation extension is used internally to query driver whether -it requires or prefers the new buffer to have dedicated allocation. If yes, -and if dedicated allocation is possible (VmaAllocationCreateInfo::pool is null -and #VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT is not used), it creates dedicated -allocation for this buffer, just like when using -#VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. - -\note This function creates a new `VkBuffer`. Sub-allocation of parts of one large buffer, -although recommended as a good practice, is out of scope of this library and could be implemented -by the user as a higher-level logic on top of VMA. 
-*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateBuffer( - VmaAllocator VMA_NOT_NULL allocator, - const VkBufferCreateInfo* VMA_NOT_NULL pBufferCreateInfo, - const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, - VkBuffer VMA_NULLABLE_NON_DISPATCHABLE * VMA_NOT_NULL pBuffer, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, - VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); - -/** \brief Destroys Vulkan buffer and frees allocated memory. - -This is just a convenience function equivalent to: - -\code -vkDestroyBuffer(device, buffer, allocationCallbacks); -vmaFreeMemory(allocator, allocation); -\endcode - -It it safe to pass null as buffer and/or allocation. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyBuffer( - VmaAllocator VMA_NOT_NULL allocator, - VkBuffer VMA_NULLABLE_NON_DISPATCHABLE buffer, - VmaAllocation VMA_NULLABLE allocation); - -/// Function similar to vmaCreateBuffer(). -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateImage( - VmaAllocator VMA_NOT_NULL allocator, - const VkImageCreateInfo* VMA_NOT_NULL pImageCreateInfo, - const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, - VkImage VMA_NULLABLE_NON_DISPATCHABLE * VMA_NOT_NULL pImage, - VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, - VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); - -/** \brief Destroys Vulkan image and frees allocated memory. - -This is just a convenience function equivalent to: - -\code -vkDestroyImage(device, image, allocationCallbacks); -vmaFreeMemory(allocator, allocation); -\endcode - -It it safe to pass null as image and/or allocation. -*/ -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyImage( - VmaAllocator VMA_NOT_NULL allocator, - VkImage VMA_NULLABLE_NON_DISPATCHABLE image, - VmaAllocation VMA_NULLABLE allocation); - -#ifdef __cplusplus -} -#endif - -#endif // AMD_VULKAN_MEMORY_ALLOCATOR_H - -// For Visual Studio IntelliSense. 
-#if defined(__cplusplus) && defined(__INTELLISENSE__) -#define VMA_IMPLEMENTATION -#endif - -#ifdef VMA_IMPLEMENTATION -#undef VMA_IMPLEMENTATION - -#include -#include -#include -#include - -#if VMA_RECORDING_ENABLED - #include - #if defined(_WIN32) - #include - #else - #include - #include - #endif -#endif - -/******************************************************************************* -CONFIGURATION SECTION - -Define some of these macros before each #include of this header or change them -here if you need other then default behavior depending on your environment. -*/ - -/* -Define this macro to 1 to make the library fetch pointers to Vulkan functions -internally, like: - - vulkanFunctions.vkAllocateMemory = &vkAllocateMemory; -*/ -#if !defined(VMA_STATIC_VULKAN_FUNCTIONS) && !defined(VK_NO_PROTOTYPES) - #define VMA_STATIC_VULKAN_FUNCTIONS 1 -#endif - -/* -Define this macro to 1 to make the library fetch pointers to Vulkan functions -internally, like: - - vulkanFunctions.vkAllocateMemory = (PFN_vkAllocateMemory)vkGetDeviceProcAddr(m_hDevice, vkAllocateMemory); -*/ -#if !defined(VMA_DYNAMIC_VULKAN_FUNCTIONS) - #define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 - #if defined(VK_NO_PROTOTYPES) - extern PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr; - extern PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr; - #endif -#endif - -// Define this macro to 1 to make the library use STL containers instead of its own implementation. -//#define VMA_USE_STL_CONTAINERS 1 - -/* Set this macro to 1 to make the library including and using STL containers: -std::pair, std::vector, std::list, std::unordered_map. - -Set it to 0 or undefined to make the library using its own implementation of -the containers. -*/ -#if VMA_USE_STL_CONTAINERS - #define VMA_USE_STL_VECTOR 1 - #define VMA_USE_STL_UNORDERED_MAP 1 - #define VMA_USE_STL_LIST 1 -#endif - -#ifndef VMA_USE_STL_SHARED_MUTEX - // Compiler conforms to C++17. 
- #if __cplusplus >= 201703L - #define VMA_USE_STL_SHARED_MUTEX 1 - // Visual studio defines __cplusplus properly only when passed additional parameter: /Zc:__cplusplus - // Otherwise it's always 199711L, despite shared_mutex works since Visual Studio 2015 Update 2. - // See: https://blogs.msdn.microsoft.com/vcblog/2018/04/09/msvc-now-correctly-reports-__cplusplus/ - #elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 190023918 && __cplusplus == 199711L && _MSVC_LANG >= 201703L - #define VMA_USE_STL_SHARED_MUTEX 1 - #else - #define VMA_USE_STL_SHARED_MUTEX 0 - #endif -#endif - -/* -THESE INCLUDES ARE NOT ENABLED BY DEFAULT. -Library has its own container implementation. -*/ -#if VMA_USE_STL_VECTOR - #include -#endif - -#if VMA_USE_STL_UNORDERED_MAP - #include -#endif - -#if VMA_USE_STL_LIST - #include -#endif - -/* -Following headers are used in this CONFIGURATION section only, so feel free to -remove them if not needed. -*/ -#include // for assert -#include // for min, max -#include - -#ifndef VMA_NULL - // Value used as null pointer. Define it to e.g.: nullptr, NULL, 0, (void*)0. 
- #define VMA_NULL nullptr -#endif - -#if defined(__ANDROID_API__) && (__ANDROID_API__ < 16) -#include -static void* vma_aligned_alloc(size_t alignment, size_t size) -{ - // alignment must be >= sizeof(void*) - if(alignment < sizeof(void*)) - { - alignment = sizeof(void*); - } - - return memalign(alignment, size); -} -#elif defined(__APPLE__) || defined(__ANDROID__) || (defined(__linux__) && defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)) -#include - -#if defined(__APPLE__) -#include -#endif - -static void* vma_aligned_alloc(size_t alignment, size_t size) -{ -#if defined(__APPLE__) && (defined(MAC_OS_X_VERSION_10_16) || defined(__IPHONE_14_0)) -#if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_16 || __IPHONE_OS_VERSION_MAX_ALLOWED >= __IPHONE_14_0 - // For C++14, usr/include/malloc/_malloc.h declares aligned_alloc()) only - // with the MacOSX11.0 SDK in Xcode 12 (which is what adds - // MAC_OS_X_VERSION_10_16), even though the function is marked - // availabe for 10.15. That's why the preprocessor checks for 10.16 but - // the __builtin_available checks for 10.15. - // People who use C++17 could call aligned_alloc with the 10.15 SDK already. 
- if (__builtin_available(macOS 10.15, iOS 13, *)) - return aligned_alloc(alignment, size); -#endif -#endif - // alignment must be >= sizeof(void*) - if(alignment < sizeof(void*)) - { - alignment = sizeof(void*); - } - - void *pointer; - if(posix_memalign(&pointer, alignment, size) == 0) - return pointer; - return VMA_NULL; -} -#elif defined(_WIN32) -static void* vma_aligned_alloc(size_t alignment, size_t size) -{ - return _aligned_malloc(size, alignment); -} -#else -static void* vma_aligned_alloc(size_t alignment, size_t size) -{ - return aligned_alloc(alignment, size); -} -#endif - -#if defined(_WIN32) -static void vma_aligned_free(void* ptr) -{ - _aligned_free(ptr); -} -#else -static void vma_aligned_free(void* ptr) -{ - free(ptr); -} -#endif - -// If your compiler is not compatible with C++11 and definition of -// aligned_alloc() function is missing, uncommeting following line may help: - -//#include - -// Normal assert to check for programmer's errors, especially in Debug configuration. -#ifndef VMA_ASSERT - #ifdef NDEBUG - #define VMA_ASSERT(expr) - #else - #define VMA_ASSERT(expr) assert(expr) - #endif -#endif - -// Assert that will be called very often, like inside data structures e.g. operator[]. -// Making it non-empty can make program slow. 
-#ifndef VMA_HEAVY_ASSERT - #ifdef NDEBUG - #define VMA_HEAVY_ASSERT(expr) - #else - #define VMA_HEAVY_ASSERT(expr) //VMA_ASSERT(expr) - #endif -#endif - -#ifndef VMA_ALIGN_OF - #define VMA_ALIGN_OF(type) (__alignof(type)) -#endif - -#ifndef VMA_SYSTEM_ALIGNED_MALLOC - #define VMA_SYSTEM_ALIGNED_MALLOC(size, alignment) vma_aligned_alloc((alignment), (size)) -#endif - -#ifndef VMA_SYSTEM_ALIGNED_FREE - // VMA_SYSTEM_FREE is the old name, but might have been defined by the user - #if defined(VMA_SYSTEM_FREE) - #define VMA_SYSTEM_ALIGNED_FREE(ptr) VMA_SYSTEM_FREE(ptr) - #else - #define VMA_SYSTEM_ALIGNED_FREE(ptr) vma_aligned_free(ptr) - #endif -#endif - -#ifndef VMA_MIN - #define VMA_MIN(v1, v2) (std::min((v1), (v2))) -#endif - -#ifndef VMA_MAX - #define VMA_MAX(v1, v2) (std::max((v1), (v2))) -#endif - -#ifndef VMA_SWAP - #define VMA_SWAP(v1, v2) std::swap((v1), (v2)) -#endif - -#ifndef VMA_SORT - #define VMA_SORT(beg, end, cmp) std::sort(beg, end, cmp) -#endif - -#ifndef VMA_DEBUG_LOG - #define VMA_DEBUG_LOG(format, ...) - /* - #define VMA_DEBUG_LOG(format, ...) do { \ - printf(format, __VA_ARGS__); \ - printf("\n"); \ - } while(false) - */ -#endif - -// Define this macro to 1 to enable functions: vmaBuildStatsString, vmaFreeStatsString. 
-#if VMA_STATS_STRING_ENABLED - static inline void VmaUint32ToStr(char* outStr, size_t strLen, uint32_t num) - { - snprintf(outStr, strLen, "%u", static_cast(num)); - } - static inline void VmaUint64ToStr(char* outStr, size_t strLen, uint64_t num) - { - snprintf(outStr, strLen, "%llu", static_cast(num)); - } - static inline void VmaPtrToStr(char* outStr, size_t strLen, const void* ptr) - { - snprintf(outStr, strLen, "%p", ptr); - } -#endif - -#ifndef VMA_MUTEX - class VmaMutex - { - public: - void Lock() { m_Mutex.lock(); } - void Unlock() { m_Mutex.unlock(); } - bool TryLock() { return m_Mutex.try_lock(); } - private: - std::mutex m_Mutex; - }; - #define VMA_MUTEX VmaMutex -#endif - -// Read-write mutex, where "read" is shared access, "write" is exclusive access. -#ifndef VMA_RW_MUTEX - #if VMA_USE_STL_SHARED_MUTEX - // Use std::shared_mutex from C++17. - #include - class VmaRWMutex - { - public: - void LockRead() { m_Mutex.lock_shared(); } - void UnlockRead() { m_Mutex.unlock_shared(); } - bool TryLockRead() { return m_Mutex.try_lock_shared(); } - void LockWrite() { m_Mutex.lock(); } - void UnlockWrite() { m_Mutex.unlock(); } - bool TryLockWrite() { return m_Mutex.try_lock(); } - private: - std::shared_mutex m_Mutex; - }; - #define VMA_RW_MUTEX VmaRWMutex - #elif defined(_WIN32) && defined(WINVER) && WINVER >= 0x0600 - // Use SRWLOCK from WinAPI. - // Minimum supported client = Windows Vista, server = Windows Server 2008. 
- class VmaRWMutex - { - public: - VmaRWMutex() { InitializeSRWLock(&m_Lock); } - void LockRead() { AcquireSRWLockShared(&m_Lock); } - void UnlockRead() { ReleaseSRWLockShared(&m_Lock); } - bool TryLockRead() { return TryAcquireSRWLockShared(&m_Lock) != FALSE; } - void LockWrite() { AcquireSRWLockExclusive(&m_Lock); } - void UnlockWrite() { ReleaseSRWLockExclusive(&m_Lock); } - bool TryLockWrite() { return TryAcquireSRWLockExclusive(&m_Lock) != FALSE; } - private: - SRWLOCK m_Lock; - }; - #define VMA_RW_MUTEX VmaRWMutex - #else - // Less efficient fallback: Use normal mutex. - class VmaRWMutex - { - public: - void LockRead() { m_Mutex.Lock(); } - void UnlockRead() { m_Mutex.Unlock(); } - bool TryLockRead() { return m_Mutex.TryLock(); } - void LockWrite() { m_Mutex.Lock(); } - void UnlockWrite() { m_Mutex.Unlock(); } - bool TryLockWrite() { return m_Mutex.TryLock(); } - private: - VMA_MUTEX m_Mutex; - }; - #define VMA_RW_MUTEX VmaRWMutex - #endif // #if VMA_USE_STL_SHARED_MUTEX -#endif // #ifndef VMA_RW_MUTEX - -/* -If providing your own implementation, you need to implement a subset of std::atomic. -*/ -#ifndef VMA_ATOMIC_UINT32 - #include - #define VMA_ATOMIC_UINT32 std::atomic -#endif - -#ifndef VMA_ATOMIC_UINT64 - #include - #define VMA_ATOMIC_UINT64 std::atomic -#endif - -#ifndef VMA_DEBUG_ALWAYS_DEDICATED_MEMORY - /** - Every allocation will have its own memory block. - Define to 1 for debugging purposes only. - */ - #define VMA_DEBUG_ALWAYS_DEDICATED_MEMORY (0) -#endif - -#ifndef VMA_DEBUG_ALIGNMENT - /** - Minimum alignment of all allocations, in bytes. - Set to more than 1 for debugging purposes only. Must be power of two. - */ - #define VMA_DEBUG_ALIGNMENT (1) -#endif - -#ifndef VMA_DEBUG_MARGIN - /** - Minimum margin before and after every allocation, in bytes. - Set nonzero for debugging purposes only. 
- */ - #define VMA_DEBUG_MARGIN (0) -#endif - -#ifndef VMA_DEBUG_INITIALIZE_ALLOCATIONS - /** - Define this macro to 1 to automatically fill new allocations and destroyed - allocations with some bit pattern. - */ - #define VMA_DEBUG_INITIALIZE_ALLOCATIONS (0) -#endif - -#ifndef VMA_DEBUG_DETECT_CORRUPTION - /** - Define this macro to 1 together with non-zero value of VMA_DEBUG_MARGIN to - enable writing magic value to the margin before and after every allocation and - validating it, so that memory corruptions (out-of-bounds writes) are detected. - */ - #define VMA_DEBUG_DETECT_CORRUPTION (0) -#endif - -#ifndef VMA_DEBUG_GLOBAL_MUTEX - /** - Set this to 1 for debugging purposes only, to enable single mutex protecting all - entry calls to the library. Can be useful for debugging multithreading issues. - */ - #define VMA_DEBUG_GLOBAL_MUTEX (0) -#endif - -#ifndef VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY - /** - Minimum value for VkPhysicalDeviceLimits::bufferImageGranularity. - Set to more than 1 for debugging purposes only. Must be power of two. - */ - #define VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY (1) -#endif - -#ifndef VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT - /* - Set this to 1 to make VMA never exceed VkPhysicalDeviceLimits::maxMemoryAllocationCount - and return error instead of leaving up to Vulkan implementation what to do in such cases. - */ - #define VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT (0) -#endif - -#ifndef VMA_SMALL_HEAP_MAX_SIZE - /// Maximum size of a memory heap in Vulkan to consider it "small". - #define VMA_SMALL_HEAP_MAX_SIZE (1024ull * 1024 * 1024) -#endif - -#ifndef VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE - /// Default size of a block allocated as single VkDeviceMemory from a "large" heap. 
- #define VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE (256ull * 1024 * 1024) -#endif - -#ifndef VMA_CLASS_NO_COPY - #define VMA_CLASS_NO_COPY(className) \ - private: \ - className(const className&) = delete; \ - className& operator=(const className&) = delete; -#endif - -static const uint32_t VMA_FRAME_INDEX_LOST = UINT32_MAX; - -// Decimal 2139416166, float NaN, little-endian binary 66 E6 84 7F. -static const uint32_t VMA_CORRUPTION_DETECTION_MAGIC_VALUE = 0x7F84E666; - -static const uint8_t VMA_ALLOCATION_FILL_PATTERN_CREATED = 0xDC; -static const uint8_t VMA_ALLOCATION_FILL_PATTERN_DESTROYED = 0xEF; - -/******************************************************************************* -END OF CONFIGURATION -*/ - -// # Copy of some Vulkan definitions so we don't need to check their existence just to handle few constants. - -static const uint32_t VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY = 0x00000040; -static const uint32_t VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD_COPY = 0x00000080; -static const uint32_t VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_COPY = 0x00020000; - -static const uint32_t VMA_ALLOCATION_INTERNAL_STRATEGY_MIN_OFFSET = 0x10000000u; - -static VkAllocationCallbacks VmaEmptyAllocationCallbacks = { - VMA_NULL, VMA_NULL, VMA_NULL, VMA_NULL, VMA_NULL, VMA_NULL }; - -// Returns number of bits set to 1 in (v). -static inline uint32_t VmaCountBitsSet(uint32_t v) -{ - uint32_t c = v - ((v >> 1) & 0x55555555); - c = ((c >> 2) & 0x33333333) + (c & 0x33333333); - c = ((c >> 4) + c) & 0x0F0F0F0F; - c = ((c >> 8) + c) & 0x00FF00FF; - c = ((c >> 16) + c) & 0x0000FFFF; - return c; -} - -/* -Returns true if given number is a power of two. -T must be unsigned integer number or signed integer but always nonnegative. -For 0 returns true. -*/ -template -inline bool VmaIsPow2(T x) -{ - return (x & (x-1)) == 0; -} - -// Aligns given value up to nearest multiply of align value. For example: VmaAlignUp(11, 8) = 16. -// Use types like uint32_t, uint64_t as T. 
-template -static inline T VmaAlignUp(T val, T alignment) -{ - VMA_HEAVY_ASSERT(VmaIsPow2(alignment)); - return (val + alignment - 1) & ~(alignment - 1); -} -// Aligns given value down to nearest multiply of align value. For example: VmaAlignUp(11, 8) = 8. -// Use types like uint32_t, uint64_t as T. -template -static inline T VmaAlignDown(T val, T alignment) -{ - VMA_HEAVY_ASSERT(VmaIsPow2(alignment)); - return val & ~(alignment - 1); -} - -// Division with mathematical rounding to nearest number. -template -static inline T VmaRoundDiv(T x, T y) -{ - return (x + (y / (T)2)) / y; -} - -// Returns smallest power of 2 greater or equal to v. -static inline uint32_t VmaNextPow2(uint32_t v) -{ - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v++; - return v; -} -static inline uint64_t VmaNextPow2(uint64_t v) -{ - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - v++; - return v; -} - -// Returns largest power of 2 less or equal to v. 
-static inline uint32_t VmaPrevPow2(uint32_t v) -{ - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v = v ^ (v >> 1); - return v; -} -static inline uint64_t VmaPrevPow2(uint64_t v) -{ - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - v = v ^ (v >> 1); - return v; -} - -static inline bool VmaStrIsEmpty(const char* pStr) -{ - return pStr == VMA_NULL || *pStr == '\0'; -} - -#if VMA_STATS_STRING_ENABLED - -static const char* VmaAlgorithmToStr(uint32_t algorithm) -{ - switch(algorithm) - { - case VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT: - return "Linear"; - case VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT: - return "Buddy"; - case 0: - return "Default"; - default: - VMA_ASSERT(0); - return ""; - } -} - -#endif // #if VMA_STATS_STRING_ENABLED - -#ifndef VMA_SORT - -template -Iterator VmaQuickSortPartition(Iterator beg, Iterator end, Compare cmp) -{ - Iterator centerValue = end; --centerValue; - Iterator insertIndex = beg; - for(Iterator memTypeIndex = beg; memTypeIndex < centerValue; ++memTypeIndex) - { - if(cmp(*memTypeIndex, *centerValue)) - { - if(insertIndex != memTypeIndex) - { - VMA_SWAP(*memTypeIndex, *insertIndex); - } - ++insertIndex; - } - } - if(insertIndex != centerValue) - { - VMA_SWAP(*insertIndex, *centerValue); - } - return insertIndex; -} - -template -void VmaQuickSort(Iterator beg, Iterator end, Compare cmp) -{ - if(beg < end) - { - Iterator it = VmaQuickSortPartition(beg, end, cmp); - VmaQuickSort(beg, it, cmp); - VmaQuickSort(it + 1, end, cmp); - } -} - -#define VMA_SORT(beg, end, cmp) VmaQuickSort(beg, end, cmp) - -#endif // #ifndef VMA_SORT - -/* -Returns true if two memory blocks occupy overlapping pages. -ResourceA must be in less memory offset than ResourceB. - -Algorithm is based on "Vulkan 1.0.39 - A Specification (with all registered Vulkan extensions)" -chapter 11.6 "Resource Memory Association", paragraph "Buffer-Image Granularity". 
-*/ -static inline bool VmaBlocksOnSamePage( - VkDeviceSize resourceAOffset, - VkDeviceSize resourceASize, - VkDeviceSize resourceBOffset, - VkDeviceSize pageSize) -{ - VMA_ASSERT(resourceAOffset + resourceASize <= resourceBOffset && resourceASize > 0 && pageSize > 0); - VkDeviceSize resourceAEnd = resourceAOffset + resourceASize - 1; - VkDeviceSize resourceAEndPage = resourceAEnd & ~(pageSize - 1); - VkDeviceSize resourceBStart = resourceBOffset; - VkDeviceSize resourceBStartPage = resourceBStart & ~(pageSize - 1); - return resourceAEndPage == resourceBStartPage; -} - -enum VmaSuballocationType -{ - VMA_SUBALLOCATION_TYPE_FREE = 0, - VMA_SUBALLOCATION_TYPE_UNKNOWN = 1, - VMA_SUBALLOCATION_TYPE_BUFFER = 2, - VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN = 3, - VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR = 4, - VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL = 5, - VMA_SUBALLOCATION_TYPE_MAX_ENUM = 0x7FFFFFFF -}; - -/* -Returns true if given suballocation types could conflict and must respect -VkPhysicalDeviceLimits::bufferImageGranularity. They conflict if one is buffer -or linear image and another one is optimal image. If type is unknown, behave -conservatively. 
-*/ -static inline bool VmaIsBufferImageGranularityConflict( - VmaSuballocationType suballocType1, - VmaSuballocationType suballocType2) -{ - if(suballocType1 > suballocType2) - { - VMA_SWAP(suballocType1, suballocType2); - } - - switch(suballocType1) - { - case VMA_SUBALLOCATION_TYPE_FREE: - return false; - case VMA_SUBALLOCATION_TYPE_UNKNOWN: - return true; - case VMA_SUBALLOCATION_TYPE_BUFFER: - return - suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN || - suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL; - case VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN: - return - suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN || - suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR || - suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL; - case VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR: - return - suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL; - case VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL: - return false; - default: - VMA_ASSERT(0); - return true; - } -} - -static void VmaWriteMagicValue(void* pData, VkDeviceSize offset) -{ -#if VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_DETECT_CORRUPTION - uint32_t* pDst = (uint32_t*)((char*)pData + offset); - const size_t numberCount = VMA_DEBUG_MARGIN / sizeof(uint32_t); - for(size_t i = 0; i < numberCount; ++i, ++pDst) - { - *pDst = VMA_CORRUPTION_DETECTION_MAGIC_VALUE; - } -#else - // no-op -#endif -} - -static bool VmaValidateMagicValue(const void* pData, VkDeviceSize offset) -{ -#if VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_DETECT_CORRUPTION - const uint32_t* pSrc = (const uint32_t*)((const char*)pData + offset); - const size_t numberCount = VMA_DEBUG_MARGIN / sizeof(uint32_t); - for(size_t i = 0; i < numberCount; ++i, ++pSrc) - { - if(*pSrc != VMA_CORRUPTION_DETECTION_MAGIC_VALUE) - { - return false; - } - } -#endif - return true; -} - -/* -Fills structure with parameters of an example buffer to be used for transfers -during GPU memory defragmentation. 
-*/ -static void VmaFillGpuDefragmentationBufferCreateInfo(VkBufferCreateInfo& outBufCreateInfo) -{ - memset(&outBufCreateInfo, 0, sizeof(outBufCreateInfo)); - outBufCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; - outBufCreateInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; - outBufCreateInfo.size = (VkDeviceSize)VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE; // Example size. -} - -// Helper RAII class to lock a mutex in constructor and unlock it in destructor (at the end of scope). -struct VmaMutexLock -{ - VMA_CLASS_NO_COPY(VmaMutexLock) -public: - VmaMutexLock(VMA_MUTEX& mutex, bool useMutex = true) : - m_pMutex(useMutex ? &mutex : VMA_NULL) - { if(m_pMutex) { m_pMutex->Lock(); } } - ~VmaMutexLock() - { if(m_pMutex) { m_pMutex->Unlock(); } } -private: - VMA_MUTEX* m_pMutex; -}; - -// Helper RAII class to lock a RW mutex in constructor and unlock it in destructor (at the end of scope), for reading. -struct VmaMutexLockRead -{ - VMA_CLASS_NO_COPY(VmaMutexLockRead) -public: - VmaMutexLockRead(VMA_RW_MUTEX& mutex, bool useMutex) : - m_pMutex(useMutex ? &mutex : VMA_NULL) - { if(m_pMutex) { m_pMutex->LockRead(); } } - ~VmaMutexLockRead() { if(m_pMutex) { m_pMutex->UnlockRead(); } } -private: - VMA_RW_MUTEX* m_pMutex; -}; - -// Helper RAII class to lock a RW mutex in constructor and unlock it in destructor (at the end of scope), for writing. -struct VmaMutexLockWrite -{ - VMA_CLASS_NO_COPY(VmaMutexLockWrite) -public: - VmaMutexLockWrite(VMA_RW_MUTEX& mutex, bool useMutex) : - m_pMutex(useMutex ? 
&mutex : VMA_NULL) - { if(m_pMutex) { m_pMutex->LockWrite(); } } - ~VmaMutexLockWrite() { if(m_pMutex) { m_pMutex->UnlockWrite(); } } -private: - VMA_RW_MUTEX* m_pMutex; -}; - -#if VMA_DEBUG_GLOBAL_MUTEX - static VMA_MUTEX gDebugGlobalMutex; - #define VMA_DEBUG_GLOBAL_MUTEX_LOCK VmaMutexLock debugGlobalMutexLock(gDebugGlobalMutex, true); -#else - #define VMA_DEBUG_GLOBAL_MUTEX_LOCK -#endif - -// Minimum size of a free suballocation to register it in the free suballocation collection. -static const VkDeviceSize VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER = 16; - -/* -Performs binary search and returns iterator to first element that is greater or -equal to (key), according to comparison (cmp). - -Cmp should return true if first argument is less than second argument. - -Returned value is the found element, if present in the collection or place where -new element with value (key) should be inserted. -*/ -template -static IterT VmaBinaryFindFirstNotLess(IterT beg, IterT end, const KeyT &key, const CmpLess& cmp) -{ - size_t down = 0, up = (end - beg); - while(down < up) - { - const size_t mid = down + (up - down) / 2; // Overflow-safe midpoint calculation - if(cmp(*(beg+mid), key)) - { - down = mid + 1; - } - else - { - up = mid; - } - } - return beg + down; -} - -template -IterT VmaBinaryFindSorted(const IterT& beg, const IterT& end, const KeyT& value, const CmpLess& cmp) -{ - IterT it = VmaBinaryFindFirstNotLess( - beg, end, value, cmp); - if(it == end || - (!cmp(*it, value) && !cmp(value, *it))) - { - return it; - } - return end; -} - -/* -Returns true if all pointers in the array are not-null and unique. -Warning! O(n^2) complexity. Use only inside VMA_HEAVY_ASSERT. -T must be pointer type, e.g. VmaAllocation, VmaPool. 
-*/ -template -static bool VmaValidatePointerArray(uint32_t count, const T* arr) -{ - for(uint32_t i = 0; i < count; ++i) - { - const T iPtr = arr[i]; - if(iPtr == VMA_NULL) - { - return false; - } - for(uint32_t j = i + 1; j < count; ++j) - { - if(iPtr == arr[j]) - { - return false; - } - } - } - return true; -} - -template -static inline void VmaPnextChainPushFront(MainT* mainStruct, NewT* newStruct) -{ - newStruct->pNext = mainStruct->pNext; - mainStruct->pNext = newStruct; -} - -//////////////////////////////////////////////////////////////////////////////// -// Memory allocation - -static void* VmaMalloc(const VkAllocationCallbacks* pAllocationCallbacks, size_t size, size_t alignment) -{ - void* result = VMA_NULL; - if((pAllocationCallbacks != VMA_NULL) && - (pAllocationCallbacks->pfnAllocation != VMA_NULL)) - { - result = (*pAllocationCallbacks->pfnAllocation)( - pAllocationCallbacks->pUserData, - size, - alignment, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - } - else - { - result = VMA_SYSTEM_ALIGNED_MALLOC(size, alignment); - } - VMA_ASSERT(result != VMA_NULL && "CPU memory allocation failed."); - return result; -} - -static void VmaFree(const VkAllocationCallbacks* pAllocationCallbacks, void* ptr) -{ - if((pAllocationCallbacks != VMA_NULL) && - (pAllocationCallbacks->pfnFree != VMA_NULL)) - { - (*pAllocationCallbacks->pfnFree)(pAllocationCallbacks->pUserData, ptr); - } - else - { - VMA_SYSTEM_ALIGNED_FREE(ptr); - } -} - -template -static T* VmaAllocate(const VkAllocationCallbacks* pAllocationCallbacks) -{ - return (T*)VmaMalloc(pAllocationCallbacks, sizeof(T), VMA_ALIGN_OF(T)); -} - -template -static T* VmaAllocateArray(const VkAllocationCallbacks* pAllocationCallbacks, size_t count) -{ - return (T*)VmaMalloc(pAllocationCallbacks, sizeof(T) * count, VMA_ALIGN_OF(T)); -} - -#define vma_new(allocator, type) new(VmaAllocate(allocator))(type) - -#define vma_new_array(allocator, type, count) new(VmaAllocateArray((allocator), (count)))(type) - -template -static void 
vma_delete(const VkAllocationCallbacks* pAllocationCallbacks, T* ptr) -{ - ptr->~T(); - VmaFree(pAllocationCallbacks, ptr); -} - -template -static void vma_delete_array(const VkAllocationCallbacks* pAllocationCallbacks, T* ptr, size_t count) -{ - if(ptr != VMA_NULL) - { - for(size_t i = count; i--; ) - { - ptr[i].~T(); - } - VmaFree(pAllocationCallbacks, ptr); - } -} - -static char* VmaCreateStringCopy(const VkAllocationCallbacks* allocs, const char* srcStr) -{ - if(srcStr != VMA_NULL) - { - const size_t len = strlen(srcStr); - char* const result = vma_new_array(allocs, char, len + 1); - memcpy(result, srcStr, len + 1); - return result; - } - else - { - return VMA_NULL; - } -} - -static void VmaFreeString(const VkAllocationCallbacks* allocs, char* str) -{ - if(str != VMA_NULL) - { - const size_t len = strlen(str); - vma_delete_array(allocs, str, len + 1); - } -} - -// STL-compatible allocator. -template -class VmaStlAllocator -{ -public: - const VkAllocationCallbacks* const m_pCallbacks; - typedef T value_type; - - VmaStlAllocator(const VkAllocationCallbacks* pCallbacks) : m_pCallbacks(pCallbacks) { } - template VmaStlAllocator(const VmaStlAllocator& src) : m_pCallbacks(src.m_pCallbacks) { } - - T* allocate(size_t n) { return VmaAllocateArray(m_pCallbacks, n); } - void deallocate(T* p, size_t n) { VmaFree(m_pCallbacks, p); } - - template - bool operator==(const VmaStlAllocator& rhs) const - { - return m_pCallbacks == rhs.m_pCallbacks; - } - template - bool operator!=(const VmaStlAllocator& rhs) const - { - return m_pCallbacks != rhs.m_pCallbacks; - } - - VmaStlAllocator& operator=(const VmaStlAllocator& x) = delete; -}; - -#if VMA_USE_STL_VECTOR - -#define VmaVector std::vector - -template -static void VmaVectorInsert(std::vector& vec, size_t index, const T& item) -{ - vec.insert(vec.begin() + index, item); -} - -template -static void VmaVectorRemove(std::vector& vec, size_t index) -{ - vec.erase(vec.begin() + index); -} - -#else // #if VMA_USE_STL_VECTOR - -/* 
Class with interface compatible with subset of std::vector. -T must be POD because constructors and destructors are not called and memcpy is -used for these objects. */ -template -class VmaVector -{ -public: - typedef T value_type; - - VmaVector(const AllocatorT& allocator) : - m_Allocator(allocator), - m_pArray(VMA_NULL), - m_Count(0), - m_Capacity(0) - { - } - - VmaVector(size_t count, const AllocatorT& allocator) : - m_Allocator(allocator), - m_pArray(count ? (T*)VmaAllocateArray(allocator.m_pCallbacks, count) : VMA_NULL), - m_Count(count), - m_Capacity(count) - { - } - - // This version of the constructor is here for compatibility with pre-C++14 std::vector. - // value is unused. - VmaVector(size_t count, const T& value, const AllocatorT& allocator) - : VmaVector(count, allocator) {} - - VmaVector(const VmaVector& src) : - m_Allocator(src.m_Allocator), - m_pArray(src.m_Count ? (T*)VmaAllocateArray(src.m_Allocator.m_pCallbacks, src.m_Count) : VMA_NULL), - m_Count(src.m_Count), - m_Capacity(src.m_Count) - { - if(m_Count != 0) - { - memcpy(m_pArray, src.m_pArray, m_Count * sizeof(T)); - } - } - - ~VmaVector() - { - VmaFree(m_Allocator.m_pCallbacks, m_pArray); - } - - VmaVector& operator=(const VmaVector& rhs) - { - if(&rhs != this) - { - resize(rhs.m_Count); - if(m_Count != 0) - { - memcpy(m_pArray, rhs.m_pArray, m_Count * sizeof(T)); - } - } - return *this; - } - - bool empty() const { return m_Count == 0; } - size_t size() const { return m_Count; } - T* data() { return m_pArray; } - const T* data() const { return m_pArray; } - - T& operator[](size_t index) - { - VMA_HEAVY_ASSERT(index < m_Count); - return m_pArray[index]; - } - const T& operator[](size_t index) const - { - VMA_HEAVY_ASSERT(index < m_Count); - return m_pArray[index]; - } - - T& front() - { - VMA_HEAVY_ASSERT(m_Count > 0); - return m_pArray[0]; - } - const T& front() const - { - VMA_HEAVY_ASSERT(m_Count > 0); - return m_pArray[0]; - } - T& back() - { - VMA_HEAVY_ASSERT(m_Count > 0); - return 
m_pArray[m_Count - 1]; - } - const T& back() const - { - VMA_HEAVY_ASSERT(m_Count > 0); - return m_pArray[m_Count - 1]; - } - - void reserve(size_t newCapacity, bool freeMemory = false) - { - newCapacity = VMA_MAX(newCapacity, m_Count); - - if((newCapacity < m_Capacity) && !freeMemory) - { - newCapacity = m_Capacity; - } - - if(newCapacity != m_Capacity) - { - T* const newArray = newCapacity ? VmaAllocateArray(m_Allocator, newCapacity) : VMA_NULL; - if(m_Count != 0) - { - memcpy(newArray, m_pArray, m_Count * sizeof(T)); - } - VmaFree(m_Allocator.m_pCallbacks, m_pArray); - m_Capacity = newCapacity; - m_pArray = newArray; - } - } - - void resize(size_t newCount, bool freeMemory = false) - { - size_t newCapacity = m_Capacity; - if(newCount > m_Capacity) - { - newCapacity = VMA_MAX(newCount, VMA_MAX(m_Capacity * 3 / 2, (size_t)8)); - } - else if(freeMemory) - { - newCapacity = newCount; - } - - if(newCapacity != m_Capacity) - { - T* const newArray = newCapacity ? VmaAllocateArray(m_Allocator.m_pCallbacks, newCapacity) : VMA_NULL; - const size_t elementsToCopy = VMA_MIN(m_Count, newCount); - if(elementsToCopy != 0) - { - memcpy(newArray, m_pArray, elementsToCopy * sizeof(T)); - } - VmaFree(m_Allocator.m_pCallbacks, m_pArray); - m_Capacity = newCapacity; - m_pArray = newArray; - } - - m_Count = newCount; - } - - void clear(bool freeMemory = false) - { - resize(0, freeMemory); - } - - void insert(size_t index, const T& src) - { - VMA_HEAVY_ASSERT(index <= m_Count); - const size_t oldCount = size(); - resize(oldCount + 1); - if(index < oldCount) - { - memmove(m_pArray + (index + 1), m_pArray + index, (oldCount - index) * sizeof(T)); - } - m_pArray[index] = src; - } - - void remove(size_t index) - { - VMA_HEAVY_ASSERT(index < m_Count); - const size_t oldCount = size(); - if(index < oldCount - 1) - { - memmove(m_pArray + index, m_pArray + (index + 1), (oldCount - index - 1) * sizeof(T)); - } - resize(oldCount - 1); - } - - void push_back(const T& src) - { - const size_t 
newIndex = size(); - resize(newIndex + 1); - m_pArray[newIndex] = src; - } - - void pop_back() - { - VMA_HEAVY_ASSERT(m_Count > 0); - resize(size() - 1); - } - - void push_front(const T& src) - { - insert(0, src); - } - - void pop_front() - { - VMA_HEAVY_ASSERT(m_Count > 0); - remove(0); - } - - typedef T* iterator; - - iterator begin() { return m_pArray; } - iterator end() { return m_pArray + m_Count; } - -private: - AllocatorT m_Allocator; - T* m_pArray; - size_t m_Count; - size_t m_Capacity; -}; - -template -static void VmaVectorInsert(VmaVector& vec, size_t index, const T& item) -{ - vec.insert(index, item); -} - -template -static void VmaVectorRemove(VmaVector& vec, size_t index) -{ - vec.remove(index); -} - -#endif // #if VMA_USE_STL_VECTOR - -template -size_t VmaVectorInsertSorted(VectorT& vector, const typename VectorT::value_type& value) -{ - const size_t indexToInsert = VmaBinaryFindFirstNotLess( - vector.data(), - vector.data() + vector.size(), - value, - CmpLess()) - vector.data(); - VmaVectorInsert(vector, indexToInsert, value); - return indexToInsert; -} - -template -bool VmaVectorRemoveSorted(VectorT& vector, const typename VectorT::value_type& value) -{ - CmpLess comparator; - typename VectorT::iterator it = VmaBinaryFindFirstNotLess( - vector.begin(), - vector.end(), - value, - comparator); - if((it != vector.end()) && !comparator(*it, value) && !comparator(value, *it)) - { - size_t indexToRemove = it - vector.begin(); - VmaVectorRemove(vector, indexToRemove); - return true; - } - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -// class VmaSmallVector - -/* -This is a vector (a variable-sized array), optimized for the case when the array is small. - -It contains some number of elements in-place, which allows it to avoid heap allocation -when the actual number of elements is below that threshold. This allows normal "small" -cases to be fast without losing generality for large inputs. 
-*/ - -template -class VmaSmallVector -{ -public: - typedef T value_type; - - VmaSmallVector(const AllocatorT& allocator) : - m_Count(0), - m_DynamicArray(allocator) - { - } - VmaSmallVector(size_t count, const AllocatorT& allocator) : - m_Count(count), - m_DynamicArray(count > N ? count : 0, allocator) - { - } - template - VmaSmallVector(const VmaSmallVector& src) = delete; - template - VmaSmallVector& operator=(const VmaSmallVector& rhs) = delete; - - bool empty() const { return m_Count == 0; } - size_t size() const { return m_Count; } - T* data() { return m_Count > N ? m_DynamicArray.data() : m_StaticArray; } - const T* data() const { return m_Count > N ? m_DynamicArray.data() : m_StaticArray; } - - T& operator[](size_t index) - { - VMA_HEAVY_ASSERT(index < m_Count); - return data()[index]; - } - const T& operator[](size_t index) const - { - VMA_HEAVY_ASSERT(index < m_Count); - return data()[index]; - } - - T& front() - { - VMA_HEAVY_ASSERT(m_Count > 0); - return data()[0]; - } - const T& front() const - { - VMA_HEAVY_ASSERT(m_Count > 0); - return data()[0]; - } - T& back() - { - VMA_HEAVY_ASSERT(m_Count > 0); - return data()[m_Count - 1]; - } - const T& back() const - { - VMA_HEAVY_ASSERT(m_Count > 0); - return data()[m_Count - 1]; - } - - void resize(size_t newCount, bool freeMemory = false) - { - if(newCount > N && m_Count > N) - { - // Any direction, staying in m_DynamicArray - m_DynamicArray.resize(newCount, freeMemory); - } - else if(newCount > N && m_Count <= N) - { - // Growing, moving from m_StaticArray to m_DynamicArray - m_DynamicArray.resize(newCount, freeMemory); - if(m_Count > 0) - { - memcpy(m_DynamicArray.data(), m_StaticArray, m_Count * sizeof(T)); - } - } - else if(newCount <= N && m_Count > N) - { - // Shrinking, moving from m_DynamicArray to m_StaticArray - if(newCount > 0) - { - memcpy(m_StaticArray, m_DynamicArray.data(), newCount * sizeof(T)); - } - m_DynamicArray.resize(0, freeMemory); - } - else - { - // Any direction, staying in 
m_StaticArray - nothing to do here - } - m_Count = newCount; - } - - void clear(bool freeMemory = false) - { - m_DynamicArray.clear(freeMemory); - m_Count = 0; - } - - void insert(size_t index, const T& src) - { - VMA_HEAVY_ASSERT(index <= m_Count); - const size_t oldCount = size(); - resize(oldCount + 1); - T* const dataPtr = data(); - if(index < oldCount) - { - // I know, this could be more optimal for case where memmove can be memcpy directly from m_StaticArray to m_DynamicArray. - memmove(dataPtr + (index + 1), dataPtr + index, (oldCount - index) * sizeof(T)); - } - dataPtr[index] = src; - } - - void remove(size_t index) - { - VMA_HEAVY_ASSERT(index < m_Count); - const size_t oldCount = size(); - if(index < oldCount - 1) - { - // I know, this could be more optimal for case where memmove can be memcpy directly from m_DynamicArray to m_StaticArray. - T* const dataPtr = data(); - memmove(dataPtr + index, dataPtr + (index + 1), (oldCount - index - 1) * sizeof(T)); - } - resize(oldCount - 1); - } - - void push_back(const T& src) - { - const size_t newIndex = size(); - resize(newIndex + 1); - data()[newIndex] = src; - } - - void pop_back() - { - VMA_HEAVY_ASSERT(m_Count > 0); - resize(size() - 1); - } - - void push_front(const T& src) - { - insert(0, src); - } - - void pop_front() - { - VMA_HEAVY_ASSERT(m_Count > 0); - remove(0); - } - - typedef T* iterator; - - iterator begin() { return data(); } - iterator end() { return data() + m_Count; } - -private: - size_t m_Count; - T m_StaticArray[N]; // Used when m_Size <= N - VmaVector m_DynamicArray; // Used when m_Size > N -}; - -//////////////////////////////////////////////////////////////////////////////// -// class VmaPoolAllocator - -/* -Allocator for objects of type T using a list of arrays (pools) to speed up -allocation. Number of elements that can be allocated is not bounded because -allocator can create multiple blocks. 
-*/ -template -class VmaPoolAllocator -{ - VMA_CLASS_NO_COPY(VmaPoolAllocator) -public: - VmaPoolAllocator(const VkAllocationCallbacks* pAllocationCallbacks, uint32_t firstBlockCapacity); - ~VmaPoolAllocator(); - template T* Alloc(Types... args); - void Free(T* ptr); - -private: - union Item - { - uint32_t NextFreeIndex; - alignas(T) char Value[sizeof(T)]; - }; - - struct ItemBlock - { - Item* pItems; - uint32_t Capacity; - uint32_t FirstFreeIndex; - }; - - const VkAllocationCallbacks* m_pAllocationCallbacks; - const uint32_t m_FirstBlockCapacity; - VmaVector< ItemBlock, VmaStlAllocator > m_ItemBlocks; - - ItemBlock& CreateNewBlock(); -}; - -template -VmaPoolAllocator::VmaPoolAllocator(const VkAllocationCallbacks* pAllocationCallbacks, uint32_t firstBlockCapacity) : - m_pAllocationCallbacks(pAllocationCallbacks), - m_FirstBlockCapacity(firstBlockCapacity), - m_ItemBlocks(VmaStlAllocator(pAllocationCallbacks)) -{ - VMA_ASSERT(m_FirstBlockCapacity > 1); -} - -template -VmaPoolAllocator::~VmaPoolAllocator() -{ - for(size_t i = m_ItemBlocks.size(); i--; ) - vma_delete_array(m_pAllocationCallbacks, m_ItemBlocks[i].pItems, m_ItemBlocks[i].Capacity); - m_ItemBlocks.clear(); -} - -template -template T* VmaPoolAllocator::Alloc(Types... args) -{ - for(size_t i = m_ItemBlocks.size(); i--; ) - { - ItemBlock& block = m_ItemBlocks[i]; - // This block has some free items: Use first one. - if(block.FirstFreeIndex != UINT32_MAX) - { - Item* const pItem = &block.pItems[block.FirstFreeIndex]; - block.FirstFreeIndex = pItem->NextFreeIndex; - T* result = (T*)&pItem->Value; - new(result)T(std::forward(args)...); // Explicit constructor call. - return result; - } - } - - // No block has free item: Create new one and use it. - ItemBlock& newBlock = CreateNewBlock(); - Item* const pItem = &newBlock.pItems[0]; - newBlock.FirstFreeIndex = pItem->NextFreeIndex; - T* result = (T*)&pItem->Value; - new(result)T(std::forward(args)...); // Explicit constructor call. 
- return result; -} - -template -void VmaPoolAllocator::Free(T* ptr) -{ - // Search all memory blocks to find ptr. - for(size_t i = m_ItemBlocks.size(); i--; ) - { - ItemBlock& block = m_ItemBlocks[i]; - - // Casting to union. - Item* pItemPtr; - memcpy(&pItemPtr, &ptr, sizeof(pItemPtr)); - - // Check if pItemPtr is in address range of this block. - if((pItemPtr >= block.pItems) && (pItemPtr < block.pItems + block.Capacity)) - { - ptr->~T(); // Explicit destructor call. - const uint32_t index = static_cast(pItemPtr - block.pItems); - pItemPtr->NextFreeIndex = block.FirstFreeIndex; - block.FirstFreeIndex = index; - return; - } - } - VMA_ASSERT(0 && "Pointer doesn't belong to this memory pool."); -} - -template -typename VmaPoolAllocator::ItemBlock& VmaPoolAllocator::CreateNewBlock() -{ - const uint32_t newBlockCapacity = m_ItemBlocks.empty() ? - m_FirstBlockCapacity : m_ItemBlocks.back().Capacity * 3 / 2; - - const ItemBlock newBlock = { - vma_new_array(m_pAllocationCallbacks, Item, newBlockCapacity), - newBlockCapacity, - 0 }; - - m_ItemBlocks.push_back(newBlock); - - // Setup singly-linked list of all free items in this block. - for(uint32_t i = 0; i < newBlockCapacity - 1; ++i) - newBlock.pItems[i].NextFreeIndex = i + 1; - newBlock.pItems[newBlockCapacity - 1].NextFreeIndex = UINT32_MAX; - return m_ItemBlocks.back(); -} - -//////////////////////////////////////////////////////////////////////////////// -// class VmaRawList, VmaList - -#if VMA_USE_STL_LIST - -#define VmaList std::list - -#else // #if VMA_USE_STL_LIST - -template -struct VmaListItem -{ - VmaListItem* pPrev; - VmaListItem* pNext; - T Value; -}; - -// Doubly linked list. 
-template -class VmaRawList -{ - VMA_CLASS_NO_COPY(VmaRawList) -public: - typedef VmaListItem ItemType; - - VmaRawList(const VkAllocationCallbacks* pAllocationCallbacks); - ~VmaRawList(); - void Clear(); - - size_t GetCount() const { return m_Count; } - bool IsEmpty() const { return m_Count == 0; } - - ItemType* Front() { return m_pFront; } - const ItemType* Front() const { return m_pFront; } - ItemType* Back() { return m_pBack; } - const ItemType* Back() const { return m_pBack; } - - ItemType* PushBack(); - ItemType* PushFront(); - ItemType* PushBack(const T& value); - ItemType* PushFront(const T& value); - void PopBack(); - void PopFront(); - - // Item can be null - it means PushBack. - ItemType* InsertBefore(ItemType* pItem); - // Item can be null - it means PushFront. - ItemType* InsertAfter(ItemType* pItem); - - ItemType* InsertBefore(ItemType* pItem, const T& value); - ItemType* InsertAfter(ItemType* pItem, const T& value); - - void Remove(ItemType* pItem); - -private: - const VkAllocationCallbacks* const m_pAllocationCallbacks; - VmaPoolAllocator m_ItemAllocator; - ItemType* m_pFront; - ItemType* m_pBack; - size_t m_Count; -}; - -template -VmaRawList::VmaRawList(const VkAllocationCallbacks* pAllocationCallbacks) : - m_pAllocationCallbacks(pAllocationCallbacks), - m_ItemAllocator(pAllocationCallbacks, 128), - m_pFront(VMA_NULL), - m_pBack(VMA_NULL), - m_Count(0) -{ -} - -template -VmaRawList::~VmaRawList() -{ - // Intentionally not calling Clear, because that would be unnecessary - // computations to return all items to m_ItemAllocator as free. 
-} - -template -void VmaRawList::Clear() -{ - if(IsEmpty() == false) - { - ItemType* pItem = m_pBack; - while(pItem != VMA_NULL) - { - ItemType* const pPrevItem = pItem->pPrev; - m_ItemAllocator.Free(pItem); - pItem = pPrevItem; - } - m_pFront = VMA_NULL; - m_pBack = VMA_NULL; - m_Count = 0; - } -} - -template -VmaListItem* VmaRawList::PushBack() -{ - ItemType* const pNewItem = m_ItemAllocator.Alloc(); - pNewItem->pNext = VMA_NULL; - if(IsEmpty()) - { - pNewItem->pPrev = VMA_NULL; - m_pFront = pNewItem; - m_pBack = pNewItem; - m_Count = 1; - } - else - { - pNewItem->pPrev = m_pBack; - m_pBack->pNext = pNewItem; - m_pBack = pNewItem; - ++m_Count; - } - return pNewItem; -} - -template -VmaListItem* VmaRawList::PushFront() -{ - ItemType* const pNewItem = m_ItemAllocator.Alloc(); - pNewItem->pPrev = VMA_NULL; - if(IsEmpty()) - { - pNewItem->pNext = VMA_NULL; - m_pFront = pNewItem; - m_pBack = pNewItem; - m_Count = 1; - } - else - { - pNewItem->pNext = m_pFront; - m_pFront->pPrev = pNewItem; - m_pFront = pNewItem; - ++m_Count; - } - return pNewItem; -} - -template -VmaListItem* VmaRawList::PushBack(const T& value) -{ - ItemType* const pNewItem = PushBack(); - pNewItem->Value = value; - return pNewItem; -} - -template -VmaListItem* VmaRawList::PushFront(const T& value) -{ - ItemType* const pNewItem = PushFront(); - pNewItem->Value = value; - return pNewItem; -} - -template -void VmaRawList::PopBack() -{ - VMA_HEAVY_ASSERT(m_Count > 0); - ItemType* const pBackItem = m_pBack; - ItemType* const pPrevItem = pBackItem->pPrev; - if(pPrevItem != VMA_NULL) - { - pPrevItem->pNext = VMA_NULL; - } - m_pBack = pPrevItem; - m_ItemAllocator.Free(pBackItem); - --m_Count; -} - -template -void VmaRawList::PopFront() -{ - VMA_HEAVY_ASSERT(m_Count > 0); - ItemType* const pFrontItem = m_pFront; - ItemType* const pNextItem = pFrontItem->pNext; - if(pNextItem != VMA_NULL) - { - pNextItem->pPrev = VMA_NULL; - } - m_pFront = pNextItem; - m_ItemAllocator.Free(pFrontItem); - --m_Count; -} - 
-template -void VmaRawList::Remove(ItemType* pItem) -{ - VMA_HEAVY_ASSERT(pItem != VMA_NULL); - VMA_HEAVY_ASSERT(m_Count > 0); - - if(pItem->pPrev != VMA_NULL) - { - pItem->pPrev->pNext = pItem->pNext; - } - else - { - VMA_HEAVY_ASSERT(m_pFront == pItem); - m_pFront = pItem->pNext; - } - - if(pItem->pNext != VMA_NULL) - { - pItem->pNext->pPrev = pItem->pPrev; - } - else - { - VMA_HEAVY_ASSERT(m_pBack == pItem); - m_pBack = pItem->pPrev; - } - - m_ItemAllocator.Free(pItem); - --m_Count; -} - -template -VmaListItem* VmaRawList::InsertBefore(ItemType* pItem) -{ - if(pItem != VMA_NULL) - { - ItemType* const prevItem = pItem->pPrev; - ItemType* const newItem = m_ItemAllocator.Alloc(); - newItem->pPrev = prevItem; - newItem->pNext = pItem; - pItem->pPrev = newItem; - if(prevItem != VMA_NULL) - { - prevItem->pNext = newItem; - } - else - { - VMA_HEAVY_ASSERT(m_pFront == pItem); - m_pFront = newItem; - } - ++m_Count; - return newItem; - } - else - return PushBack(); -} - -template -VmaListItem* VmaRawList::InsertAfter(ItemType* pItem) -{ - if(pItem != VMA_NULL) - { - ItemType* const nextItem = pItem->pNext; - ItemType* const newItem = m_ItemAllocator.Alloc(); - newItem->pNext = nextItem; - newItem->pPrev = pItem; - pItem->pNext = newItem; - if(nextItem != VMA_NULL) - { - nextItem->pPrev = newItem; - } - else - { - VMA_HEAVY_ASSERT(m_pBack == pItem); - m_pBack = newItem; - } - ++m_Count; - return newItem; - } - else - return PushFront(); -} - -template -VmaListItem* VmaRawList::InsertBefore(ItemType* pItem, const T& value) -{ - ItemType* const newItem = InsertBefore(pItem); - newItem->Value = value; - return newItem; -} - -template -VmaListItem* VmaRawList::InsertAfter(ItemType* pItem, const T& value) -{ - ItemType* const newItem = InsertAfter(pItem); - newItem->Value = value; - return newItem; -} - -template -class VmaList -{ - VMA_CLASS_NO_COPY(VmaList) -public: - class iterator - { - public: - iterator() : - m_pList(VMA_NULL), - m_pItem(VMA_NULL) - { - } - - T& 
operator*() const - { - VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); - return m_pItem->Value; - } - T* operator->() const - { - VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); - return &m_pItem->Value; - } - - iterator& operator++() - { - VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); - m_pItem = m_pItem->pNext; - return *this; - } - iterator& operator--() - { - if(m_pItem != VMA_NULL) - { - m_pItem = m_pItem->pPrev; - } - else - { - VMA_HEAVY_ASSERT(!m_pList->IsEmpty()); - m_pItem = m_pList->Back(); - } - return *this; - } - - iterator operator++(int) - { - iterator result = *this; - ++*this; - return result; - } - iterator operator--(int) - { - iterator result = *this; - --*this; - return result; - } - - bool operator==(const iterator& rhs) const - { - VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); - return m_pItem == rhs.m_pItem; - } - bool operator!=(const iterator& rhs) const - { - VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); - return m_pItem != rhs.m_pItem; - } - - private: - VmaRawList* m_pList; - VmaListItem* m_pItem; - - iterator(VmaRawList* pList, VmaListItem* pItem) : - m_pList(pList), - m_pItem(pItem) - { - } - - friend class VmaList; - }; - - class const_iterator - { - public: - const_iterator() : - m_pList(VMA_NULL), - m_pItem(VMA_NULL) - { - } - - const_iterator(const iterator& src) : - m_pList(src.m_pList), - m_pItem(src.m_pItem) - { - } - - const T& operator*() const - { - VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); - return m_pItem->Value; - } - const T* operator->() const - { - VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); - return &m_pItem->Value; - } - - const_iterator& operator++() - { - VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); - m_pItem = m_pItem->pNext; - return *this; - } - const_iterator& operator--() - { - if(m_pItem != VMA_NULL) - { - m_pItem = m_pItem->pPrev; - } - else - { - VMA_HEAVY_ASSERT(!m_pList->IsEmpty()); - m_pItem = m_pList->Back(); - } - return *this; - } - - const_iterator operator++(int) - { - const_iterator result = *this; - ++*this; - return result; - } - 
const_iterator operator--(int) - { - const_iterator result = *this; - --*this; - return result; - } - - bool operator==(const const_iterator& rhs) const - { - VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); - return m_pItem == rhs.m_pItem; - } - bool operator!=(const const_iterator& rhs) const - { - VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); - return m_pItem != rhs.m_pItem; - } - - private: - const_iterator(const VmaRawList* pList, const VmaListItem* pItem) : - m_pList(pList), - m_pItem(pItem) - { - } - - const VmaRawList* m_pList; - const VmaListItem* m_pItem; - - friend class VmaList; - }; - - VmaList(const AllocatorT& allocator) : m_RawList(allocator.m_pCallbacks) { } - - bool empty() const { return m_RawList.IsEmpty(); } - size_t size() const { return m_RawList.GetCount(); } - - iterator begin() { return iterator(&m_RawList, m_RawList.Front()); } - iterator end() { return iterator(&m_RawList, VMA_NULL); } - - const_iterator cbegin() const { return const_iterator(&m_RawList, m_RawList.Front()); } - const_iterator cend() const { return const_iterator(&m_RawList, VMA_NULL); } - - void clear() { m_RawList.Clear(); } - void push_back(const T& value) { m_RawList.PushBack(value); } - void erase(iterator it) { m_RawList.Remove(it.m_pItem); } - iterator insert(iterator it, const T& value) { return iterator(&m_RawList, m_RawList.InsertBefore(it.m_pItem, value)); } - -private: - VmaRawList m_RawList; -}; - -#endif // #if VMA_USE_STL_LIST - -//////////////////////////////////////////////////////////////////////////////// -// class VmaMap - -// Unused in this version. 
-#if 0 - -#if VMA_USE_STL_UNORDERED_MAP - -#define VmaPair std::pair - -#define VMA_MAP_TYPE(KeyT, ValueT) \ - std::unordered_map< KeyT, ValueT, std::hash, std::equal_to, VmaStlAllocator< std::pair > > - -#else // #if VMA_USE_STL_UNORDERED_MAP - -template -struct VmaPair -{ - T1 first; - T2 second; - - VmaPair() : first(), second() { } - VmaPair(const T1& firstSrc, const T2& secondSrc) : first(firstSrc), second(secondSrc) { } -}; - -/* Class compatible with subset of interface of std::unordered_map. -KeyT, ValueT must be POD because they will be stored in VmaVector. -*/ -template -class VmaMap -{ -public: - typedef VmaPair PairType; - typedef PairType* iterator; - - VmaMap(const VmaStlAllocator& allocator) : m_Vector(allocator) { } - - iterator begin() { return m_Vector.begin(); } - iterator end() { return m_Vector.end(); } - - void insert(const PairType& pair); - iterator find(const KeyT& key); - void erase(iterator it); - -private: - VmaVector< PairType, VmaStlAllocator > m_Vector; -}; - -#define VMA_MAP_TYPE(KeyT, ValueT) VmaMap - -template -struct VmaPairFirstLess -{ - bool operator()(const VmaPair& lhs, const VmaPair& rhs) const - { - return lhs.first < rhs.first; - } - bool operator()(const VmaPair& lhs, const FirstT& rhsFirst) const - { - return lhs.first < rhsFirst; - } -}; - -template -void VmaMap::insert(const PairType& pair) -{ - const size_t indexToInsert = VmaBinaryFindFirstNotLess( - m_Vector.data(), - m_Vector.data() + m_Vector.size(), - pair, - VmaPairFirstLess()) - m_Vector.data(); - VmaVectorInsert(m_Vector, indexToInsert, pair); -} - -template -VmaPair* VmaMap::find(const KeyT& key) -{ - PairType* it = VmaBinaryFindFirstNotLess( - m_Vector.data(), - m_Vector.data() + m_Vector.size(), - key, - VmaPairFirstLess()); - if((it != m_Vector.end()) && (it->first == key)) - { - return it; - } - else - { - return m_Vector.end(); - } -} - -template -void VmaMap::erase(iterator it) -{ - VmaVectorRemove(m_Vector, it - m_Vector.begin()); -} - -#endif // #if 
VMA_USE_STL_UNORDERED_MAP - -#endif // #if 0 - -//////////////////////////////////////////////////////////////////////////////// - -class VmaDeviceMemoryBlock; - -enum VMA_CACHE_OPERATION { VMA_CACHE_FLUSH, VMA_CACHE_INVALIDATE }; - -struct VmaAllocation_T -{ -private: - static const uint8_t MAP_COUNT_FLAG_PERSISTENT_MAP = 0x80; - - enum FLAGS - { - FLAG_USER_DATA_STRING = 0x01, - }; - -public: - enum ALLOCATION_TYPE - { - ALLOCATION_TYPE_NONE, - ALLOCATION_TYPE_BLOCK, - ALLOCATION_TYPE_DEDICATED, - }; - - /* - This struct is allocated using VmaPoolAllocator. - */ - - VmaAllocation_T(uint32_t currentFrameIndex, bool userDataString) : - m_Alignment{1}, - m_Size{0}, - m_pUserData{VMA_NULL}, - m_LastUseFrameIndex{currentFrameIndex}, - m_MemoryTypeIndex{0}, - m_Type{(uint8_t)ALLOCATION_TYPE_NONE}, - m_SuballocationType{(uint8_t)VMA_SUBALLOCATION_TYPE_UNKNOWN}, - m_MapCount{0}, - m_Flags{userDataString ? (uint8_t)FLAG_USER_DATA_STRING : (uint8_t)0} - { -#if VMA_STATS_STRING_ENABLED - m_CreationFrameIndex = currentFrameIndex; - m_BufferImageUsage = 0; -#endif - } - - ~VmaAllocation_T() - { - VMA_ASSERT((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) == 0 && "Allocation was not unmapped before destruction."); - - // Check if owned string was freed. - VMA_ASSERT(m_pUserData == VMA_NULL); - } - - void InitBlockAllocation( - VmaDeviceMemoryBlock* block, - VkDeviceSize offset, - VkDeviceSize alignment, - VkDeviceSize size, - uint32_t memoryTypeIndex, - VmaSuballocationType suballocationType, - bool mapped, - bool canBecomeLost) - { - VMA_ASSERT(m_Type == ALLOCATION_TYPE_NONE); - VMA_ASSERT(block != VMA_NULL); - m_Type = (uint8_t)ALLOCATION_TYPE_BLOCK; - m_Alignment = alignment; - m_Size = size; - m_MemoryTypeIndex = memoryTypeIndex; - m_MapCount = mapped ? 
MAP_COUNT_FLAG_PERSISTENT_MAP : 0; - m_SuballocationType = (uint8_t)suballocationType; - m_BlockAllocation.m_Block = block; - m_BlockAllocation.m_Offset = offset; - m_BlockAllocation.m_CanBecomeLost = canBecomeLost; - } - - void InitLost() - { - VMA_ASSERT(m_Type == ALLOCATION_TYPE_NONE); - VMA_ASSERT(m_LastUseFrameIndex.load() == VMA_FRAME_INDEX_LOST); - m_Type = (uint8_t)ALLOCATION_TYPE_BLOCK; - m_MemoryTypeIndex = 0; - m_BlockAllocation.m_Block = VMA_NULL; - m_BlockAllocation.m_Offset = 0; - m_BlockAllocation.m_CanBecomeLost = true; - } - - void ChangeBlockAllocation( - VmaAllocator hAllocator, - VmaDeviceMemoryBlock* block, - VkDeviceSize offset); - - void ChangeOffset(VkDeviceSize newOffset); - - // pMappedData not null means allocation is created with MAPPED flag. - void InitDedicatedAllocation( - uint32_t memoryTypeIndex, - VkDeviceMemory hMemory, - VmaSuballocationType suballocationType, - void* pMappedData, - VkDeviceSize size) - { - VMA_ASSERT(m_Type == ALLOCATION_TYPE_NONE); - VMA_ASSERT(hMemory != VK_NULL_HANDLE); - m_Type = (uint8_t)ALLOCATION_TYPE_DEDICATED; - m_Alignment = 0; - m_Size = size; - m_MemoryTypeIndex = memoryTypeIndex; - m_SuballocationType = (uint8_t)suballocationType; - m_MapCount = (pMappedData != VMA_NULL) ? 
MAP_COUNT_FLAG_PERSISTENT_MAP : 0; - m_DedicatedAllocation.m_hMemory = hMemory; - m_DedicatedAllocation.m_pMappedData = pMappedData; - } - - ALLOCATION_TYPE GetType() const { return (ALLOCATION_TYPE)m_Type; } - VkDeviceSize GetAlignment() const { return m_Alignment; } - VkDeviceSize GetSize() const { return m_Size; } - bool IsUserDataString() const { return (m_Flags & FLAG_USER_DATA_STRING) != 0; } - void* GetUserData() const { return m_pUserData; } - void SetUserData(VmaAllocator hAllocator, void* pUserData); - VmaSuballocationType GetSuballocationType() const { return (VmaSuballocationType)m_SuballocationType; } - - VmaDeviceMemoryBlock* GetBlock() const - { - VMA_ASSERT(m_Type == ALLOCATION_TYPE_BLOCK); - return m_BlockAllocation.m_Block; - } - VkDeviceSize GetOffset() const; - VkDeviceMemory GetMemory() const; - uint32_t GetMemoryTypeIndex() const { return m_MemoryTypeIndex; } - bool IsPersistentMap() const { return (m_MapCount & MAP_COUNT_FLAG_PERSISTENT_MAP) != 0; } - void* GetMappedData() const; - bool CanBecomeLost() const; - - uint32_t GetLastUseFrameIndex() const - { - return m_LastUseFrameIndex.load(); - } - bool CompareExchangeLastUseFrameIndex(uint32_t& expected, uint32_t desired) - { - return m_LastUseFrameIndex.compare_exchange_weak(expected, desired); - } - /* - - If hAllocation.LastUseFrameIndex + frameInUseCount < allocator.CurrentFrameIndex, - makes it lost by setting LastUseFrameIndex = VMA_FRAME_INDEX_LOST and returns true. - - Else, returns false. - - If hAllocation is already lost, assert - you should not call it then. - If hAllocation was not created with CAN_BECOME_LOST_BIT, assert. 
- */ - bool MakeLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); - - void DedicatedAllocCalcStatsInfo(VmaStatInfo& outInfo) - { - VMA_ASSERT(m_Type == ALLOCATION_TYPE_DEDICATED); - outInfo.blockCount = 1; - outInfo.allocationCount = 1; - outInfo.unusedRangeCount = 0; - outInfo.usedBytes = m_Size; - outInfo.unusedBytes = 0; - outInfo.allocationSizeMin = outInfo.allocationSizeMax = m_Size; - outInfo.unusedRangeSizeMin = UINT64_MAX; - outInfo.unusedRangeSizeMax = 0; - } - - void BlockAllocMap(); - void BlockAllocUnmap(); - VkResult DedicatedAllocMap(VmaAllocator hAllocator, void** ppData); - void DedicatedAllocUnmap(VmaAllocator hAllocator); - -#if VMA_STATS_STRING_ENABLED - uint32_t GetCreationFrameIndex() const { return m_CreationFrameIndex; } - uint32_t GetBufferImageUsage() const { return m_BufferImageUsage; } - - void InitBufferImageUsage(uint32_t bufferImageUsage) - { - VMA_ASSERT(m_BufferImageUsage == 0); - m_BufferImageUsage = bufferImageUsage; - } - - void PrintParameters(class VmaJsonWriter& json) const; -#endif - -private: - VkDeviceSize m_Alignment; - VkDeviceSize m_Size; - void* m_pUserData; - VMA_ATOMIC_UINT32 m_LastUseFrameIndex; - uint32_t m_MemoryTypeIndex; - uint8_t m_Type; // ALLOCATION_TYPE - uint8_t m_SuballocationType; // VmaSuballocationType - // Bit 0x80 is set when allocation was created with VMA_ALLOCATION_CREATE_MAPPED_BIT. - // Bits with mask 0x7F are reference counter for vmaMapMemory()/vmaUnmapMemory(). - uint8_t m_MapCount; - uint8_t m_Flags; // enum FLAGS - - // Allocation out of VmaDeviceMemoryBlock. - struct BlockAllocation - { - VmaDeviceMemoryBlock* m_Block; - VkDeviceSize m_Offset; - bool m_CanBecomeLost; - }; - - // Allocation for an object that has its own private VkDeviceMemory. - struct DedicatedAllocation - { - VkDeviceMemory m_hMemory; - void* m_pMappedData; // Not null means memory is mapped. - }; - - union - { - // Allocation out of VmaDeviceMemoryBlock. 
- BlockAllocation m_BlockAllocation; - // Allocation for an object that has its own private VkDeviceMemory. - DedicatedAllocation m_DedicatedAllocation; - }; - -#if VMA_STATS_STRING_ENABLED - uint32_t m_CreationFrameIndex; - uint32_t m_BufferImageUsage; // 0 if unknown. -#endif - - void FreeUserDataString(VmaAllocator hAllocator); -}; - -/* -Represents a region of VmaDeviceMemoryBlock that is either assigned and returned as -allocated memory block or free. -*/ -struct VmaSuballocation -{ - VkDeviceSize offset; - VkDeviceSize size; - VmaAllocation hAllocation; - VmaSuballocationType type; -}; - -// Comparator for offsets. -struct VmaSuballocationOffsetLess -{ - bool operator()(const VmaSuballocation& lhs, const VmaSuballocation& rhs) const - { - return lhs.offset < rhs.offset; - } -}; -struct VmaSuballocationOffsetGreater -{ - bool operator()(const VmaSuballocation& lhs, const VmaSuballocation& rhs) const - { - return lhs.offset > rhs.offset; - } -}; - -typedef VmaList< VmaSuballocation, VmaStlAllocator > VmaSuballocationList; - -// Cost of one additional allocation lost, as equivalent in bytes. -static const VkDeviceSize VMA_LOST_ALLOCATION_COST = 1048576; - -enum class VmaAllocationRequestType -{ - Normal, - // Used by "Linear" algorithm. - UpperAddress, - EndOf1st, - EndOf2nd, -}; - -/* -Parameters of planned allocation inside a VmaDeviceMemoryBlock. - -If canMakeOtherLost was false: -- item points to a FREE suballocation. -- itemsToMakeLostCount is 0. - -If canMakeOtherLost was true: -- item points to first of sequence of suballocations, which are either FREE, - or point to VmaAllocations that can become lost. -- itemsToMakeLostCount is the number of VmaAllocations that need to be made lost for - the requested allocation to succeed. -*/ -struct VmaAllocationRequest -{ - VkDeviceSize offset; - VkDeviceSize sumFreeSize; // Sum size of free items that overlap with proposed allocation. 
- VkDeviceSize sumItemSize; // Sum size of items to make lost that overlap with proposed allocation. - VmaSuballocationList::iterator item; - size_t itemsToMakeLostCount; - void* customData; - VmaAllocationRequestType type; - - VkDeviceSize CalcCost() const - { - return sumItemSize + itemsToMakeLostCount * VMA_LOST_ALLOCATION_COST; - } -}; - -/* -Data structure used for bookkeeping of allocations and unused ranges of memory -in a single VkDeviceMemory block. -*/ -class VmaBlockMetadata -{ -public: - VmaBlockMetadata(VmaAllocator hAllocator); - virtual ~VmaBlockMetadata() { } - virtual void Init(VkDeviceSize size) { m_Size = size; } - - // Validates all data structures inside this object. If not valid, returns false. - virtual bool Validate() const = 0; - VkDeviceSize GetSize() const { return m_Size; } - virtual size_t GetAllocationCount() const = 0; - virtual VkDeviceSize GetSumFreeSize() const = 0; - virtual VkDeviceSize GetUnusedRangeSizeMax() const = 0; - // Returns true if this block is empty - contains only single free suballocation. - virtual bool IsEmpty() const = 0; - - virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const = 0; - // Shouldn't modify blockCount. - virtual void AddPoolStats(VmaPoolStats& inoutStats) const = 0; - -#if VMA_STATS_STRING_ENABLED - virtual void PrintDetailedMap(class VmaJsonWriter& json) const = 0; -#endif - - // Tries to find a place for suballocation with given parameters inside this block. - // If succeeded, fills pAllocationRequest and returns true. - // If failed, returns false. - virtual bool CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool canMakeOtherLost, - // Always one of VMA_ALLOCATION_CREATE_STRATEGY_* or VMA_ALLOCATION_INTERNAL_STRATEGY_* flags. 
- uint32_t strategy, - VmaAllocationRequest* pAllocationRequest) = 0; - - virtual bool MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest) = 0; - - virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) = 0; - - virtual VkResult CheckCorruption(const void* pBlockData) = 0; - - // Makes actual allocation based on request. Request must already be checked and valid. - virtual void Alloc( - const VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation) = 0; - - // Frees suballocation assigned to given memory region. - virtual void Free(const VmaAllocation allocation) = 0; - virtual void FreeAtOffset(VkDeviceSize offset) = 0; - -protected: - const VkAllocationCallbacks* GetAllocationCallbacks() const { return m_pAllocationCallbacks; } - -#if VMA_STATS_STRING_ENABLED - void PrintDetailedMap_Begin(class VmaJsonWriter& json, - VkDeviceSize unusedBytes, - size_t allocationCount, - size_t unusedRangeCount) const; - void PrintDetailedMap_Allocation(class VmaJsonWriter& json, - VkDeviceSize offset, - VmaAllocation hAllocation) const; - void PrintDetailedMap_UnusedRange(class VmaJsonWriter& json, - VkDeviceSize offset, - VkDeviceSize size) const; - void PrintDetailedMap_End(class VmaJsonWriter& json) const; -#endif - -private: - VkDeviceSize m_Size; - const VkAllocationCallbacks* m_pAllocationCallbacks; -}; - -#define VMA_VALIDATE(cond) do { if(!(cond)) { \ - VMA_ASSERT(0 && "Validation failed: " #cond); \ - return false; \ - } } while(false) - -class VmaBlockMetadata_Generic : public VmaBlockMetadata -{ - VMA_CLASS_NO_COPY(VmaBlockMetadata_Generic) -public: - VmaBlockMetadata_Generic(VmaAllocator hAllocator); - virtual ~VmaBlockMetadata_Generic(); - virtual void Init(VkDeviceSize size); - - virtual bool Validate() const; - virtual size_t GetAllocationCount() const { return m_Suballocations.size() - 
m_FreeCount; } - virtual VkDeviceSize GetSumFreeSize() const { return m_SumFreeSize; } - virtual VkDeviceSize GetUnusedRangeSizeMax() const; - virtual bool IsEmpty() const; - - virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const; - virtual void AddPoolStats(VmaPoolStats& inoutStats) const; - -#if VMA_STATS_STRING_ENABLED - virtual void PrintDetailedMap(class VmaJsonWriter& json) const; -#endif - - virtual bool CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest); - - virtual bool MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest); - - virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); - - virtual VkResult CheckCorruption(const void* pBlockData); - - virtual void Alloc( - const VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation); - - virtual void Free(const VmaAllocation allocation); - virtual void FreeAtOffset(VkDeviceSize offset); - - //////////////////////////////////////////////////////////////////////////////// - // For defragmentation - - bool IsBufferImageGranularityConflictPossible( - VkDeviceSize bufferImageGranularity, - VmaSuballocationType& inOutPrevSuballocType) const; - -private: - friend class VmaDefragmentationAlgorithm_Generic; - friend class VmaDefragmentationAlgorithm_Fast; - - uint32_t m_FreeCount; - VkDeviceSize m_SumFreeSize; - VmaSuballocationList m_Suballocations; - // Suballocations that are free and have size greater than certain threshold. - // Sorted by size, ascending. 
- VmaVector< VmaSuballocationList::iterator, VmaStlAllocator< VmaSuballocationList::iterator > > m_FreeSuballocationsBySize; - - bool ValidateFreeSuballocationList() const; - - // Checks if requested suballocation with given parameters can be placed in given pFreeSuballocItem. - // If yes, fills pOffset and returns true. If no, returns false. - bool CheckAllocation( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - VmaSuballocationType allocType, - VmaSuballocationList::const_iterator suballocItem, - bool canMakeOtherLost, - VkDeviceSize* pOffset, - size_t* itemsToMakeLostCount, - VkDeviceSize* pSumFreeSize, - VkDeviceSize* pSumItemSize) const; - // Given free suballocation, it merges it with following one, which must also be free. - void MergeFreeWithNext(VmaSuballocationList::iterator item); - // Releases given suballocation, making it free. - // Merges it with adjacent free suballocations if applicable. - // Returns iterator to new free suballocation at this place. - VmaSuballocationList::iterator FreeSuballocation(VmaSuballocationList::iterator suballocItem); - // Given free suballocation, it inserts it into sorted list of - // m_FreeSuballocationsBySize if it's suitable. - void RegisterFreeSuballocation(VmaSuballocationList::iterator item); - // Given free suballocation, it removes it from sorted list of - // m_FreeSuballocationsBySize if it's suitable. - void UnregisterFreeSuballocation(VmaSuballocationList::iterator item); -}; - -/* -Allocations and their references in internal data structure look like this: - -if(m_2ndVectorMode == SECOND_VECTOR_EMPTY): - - 0 +-------+ - | | - | | - | | - +-------+ - | Alloc | 1st[m_1stNullItemsBeginCount] - +-------+ - | Alloc | 1st[m_1stNullItemsBeginCount + 1] - +-------+ - | ... 
| - +-------+ - | Alloc | 1st[1st.size() - 1] - +-------+ - | | - | | - | | -GetSize() +-------+ - -if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER): - - 0 +-------+ - | Alloc | 2nd[0] - +-------+ - | Alloc | 2nd[1] - +-------+ - | ... | - +-------+ - | Alloc | 2nd[2nd.size() - 1] - +-------+ - | | - | | - | | - +-------+ - | Alloc | 1st[m_1stNullItemsBeginCount] - +-------+ - | Alloc | 1st[m_1stNullItemsBeginCount + 1] - +-------+ - | ... | - +-------+ - | Alloc | 1st[1st.size() - 1] - +-------+ - | | -GetSize() +-------+ - -if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK): - - 0 +-------+ - | | - | | - | | - +-------+ - | Alloc | 1st[m_1stNullItemsBeginCount] - +-------+ - | Alloc | 1st[m_1stNullItemsBeginCount + 1] - +-------+ - | ... | - +-------+ - | Alloc | 1st[1st.size() - 1] - +-------+ - | | - | | - | | - +-------+ - | Alloc | 2nd[2nd.size() - 1] - +-------+ - | ... | - +-------+ - | Alloc | 2nd[1] - +-------+ - | Alloc | 2nd[0] -GetSize() +-------+ - -*/ -class VmaBlockMetadata_Linear : public VmaBlockMetadata -{ - VMA_CLASS_NO_COPY(VmaBlockMetadata_Linear) -public: - VmaBlockMetadata_Linear(VmaAllocator hAllocator); - virtual ~VmaBlockMetadata_Linear(); - virtual void Init(VkDeviceSize size); - - virtual bool Validate() const; - virtual size_t GetAllocationCount() const; - virtual VkDeviceSize GetSumFreeSize() const { return m_SumFreeSize; } - virtual VkDeviceSize GetUnusedRangeSizeMax() const; - virtual bool IsEmpty() const { return GetAllocationCount() == 0; } - - virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const; - virtual void AddPoolStats(VmaPoolStats& inoutStats) const; - -#if VMA_STATS_STRING_ENABLED - virtual void PrintDetailedMap(class VmaJsonWriter& json) const; -#endif - - virtual bool CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool 
canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest); - - virtual bool MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest); - - virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); - - virtual VkResult CheckCorruption(const void* pBlockData); - - virtual void Alloc( - const VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation); - - virtual void Free(const VmaAllocation allocation); - virtual void FreeAtOffset(VkDeviceSize offset); - -private: - /* - There are two suballocation vectors, used in ping-pong way. - The one with index m_1stVectorIndex is called 1st. - The one with index (m_1stVectorIndex ^ 1) is called 2nd. - 2nd can be non-empty only when 1st is not empty. - When 2nd is not empty, m_2ndVectorMode indicates its mode of operation. - */ - typedef VmaVector< VmaSuballocation, VmaStlAllocator > SuballocationVectorType; - - enum SECOND_VECTOR_MODE - { - SECOND_VECTOR_EMPTY, - /* - Suballocations in 2nd vector are created later than the ones in 1st, but they - all have smaller offset. - */ - SECOND_VECTOR_RING_BUFFER, - /* - Suballocations in 2nd vector are upper side of double stack. - They all have offsets higher than those in 1st vector. - Top of this stack means smaller offsets, but higher indices in this vector. - */ - SECOND_VECTOR_DOUBLE_STACK, - }; - - VkDeviceSize m_SumFreeSize; - SuballocationVectorType m_Suballocations0, m_Suballocations1; - uint32_t m_1stVectorIndex; - SECOND_VECTOR_MODE m_2ndVectorMode; - - SuballocationVectorType& AccessSuballocations1st() { return m_1stVectorIndex ? m_Suballocations1 : m_Suballocations0; } - SuballocationVectorType& AccessSuballocations2nd() { return m_1stVectorIndex ? m_Suballocations0 : m_Suballocations1; } - const SuballocationVectorType& AccessSuballocations1st() const { return m_1stVectorIndex ? 
m_Suballocations1 : m_Suballocations0; } - const SuballocationVectorType& AccessSuballocations2nd() const { return m_1stVectorIndex ? m_Suballocations0 : m_Suballocations1; } - - // Number of items in 1st vector with hAllocation = null at the beginning. - size_t m_1stNullItemsBeginCount; - // Number of other items in 1st vector with hAllocation = null somewhere in the middle. - size_t m_1stNullItemsMiddleCount; - // Number of items in 2nd vector with hAllocation = null. - size_t m_2ndNullItemsCount; - - bool ShouldCompact1st() const; - void CleanupAfterFree(); - - bool CreateAllocationRequest_LowerAddress( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest); - bool CreateAllocationRequest_UpperAddress( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest); -}; - -/* -- GetSize() is the original size of allocated memory block. -- m_UsableSize is this size aligned down to a power of two. - All allocations and calculations happen relative to m_UsableSize. -- GetUnusableSize() is the difference between them. - It is repoted as separate, unused range, not available for allocations. - -Node at level 0 has size = m_UsableSize. -Each next level contains nodes with size 2 times smaller than current level. -m_LevelCount is the maximum number of levels to use in the current object. 
-*/ -class VmaBlockMetadata_Buddy : public VmaBlockMetadata -{ - VMA_CLASS_NO_COPY(VmaBlockMetadata_Buddy) -public: - VmaBlockMetadata_Buddy(VmaAllocator hAllocator); - virtual ~VmaBlockMetadata_Buddy(); - virtual void Init(VkDeviceSize size); - - virtual bool Validate() const; - virtual size_t GetAllocationCount() const { return m_AllocationCount; } - virtual VkDeviceSize GetSumFreeSize() const { return m_SumFreeSize + GetUnusableSize(); } - virtual VkDeviceSize GetUnusedRangeSizeMax() const; - virtual bool IsEmpty() const { return m_Root->type == Node::TYPE_FREE; } - - virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const; - virtual void AddPoolStats(VmaPoolStats& inoutStats) const; - -#if VMA_STATS_STRING_ENABLED - virtual void PrintDetailedMap(class VmaJsonWriter& json) const; -#endif - - virtual bool CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest); - - virtual bool MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest); - - virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); - - virtual VkResult CheckCorruption(const void* pBlockData) { return VK_ERROR_FEATURE_NOT_PRESENT; } - - virtual void Alloc( - const VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation); - - virtual void Free(const VmaAllocation allocation) { FreeAtOffset(allocation, allocation->GetOffset()); } - virtual void FreeAtOffset(VkDeviceSize offset) { FreeAtOffset(VMA_NULL, offset); } - -private: - static const VkDeviceSize MIN_NODE_SIZE = 32; - static const size_t MAX_LEVELS = 30; - - struct ValidationContext - { - size_t 
calculatedAllocationCount; - size_t calculatedFreeCount; - VkDeviceSize calculatedSumFreeSize; - - ValidationContext() : - calculatedAllocationCount(0), - calculatedFreeCount(0), - calculatedSumFreeSize(0) { } - }; - - struct Node - { - VkDeviceSize offset; - enum TYPE - { - TYPE_FREE, - TYPE_ALLOCATION, - TYPE_SPLIT, - TYPE_COUNT - } type; - Node* parent; - Node* buddy; - - union - { - struct - { - Node* prev; - Node* next; - } free; - struct - { - VmaAllocation alloc; - } allocation; - struct - { - Node* leftChild; - } split; - }; - }; - - // Size of the memory block aligned down to a power of two. - VkDeviceSize m_UsableSize; - uint32_t m_LevelCount; - - Node* m_Root; - struct { - Node* front; - Node* back; - } m_FreeList[MAX_LEVELS]; - // Number of nodes in the tree with type == TYPE_ALLOCATION. - size_t m_AllocationCount; - // Number of nodes in the tree with type == TYPE_FREE. - size_t m_FreeCount; - // This includes space wasted due to internal fragmentation. Doesn't include unusable size. - VkDeviceSize m_SumFreeSize; - - VkDeviceSize GetUnusableSize() const { return GetSize() - m_UsableSize; } - void DeleteNode(Node* node); - bool ValidateNode(ValidationContext& ctx, const Node* parent, const Node* curr, uint32_t level, VkDeviceSize levelNodeSize) const; - uint32_t AllocSizeToLevel(VkDeviceSize allocSize) const; - inline VkDeviceSize LevelToNodeSize(uint32_t level) const { return m_UsableSize >> level; } - // Alloc passed just for validation. Can be null. - void FreeAtOffset(VmaAllocation alloc, VkDeviceSize offset); - void CalcAllocationStatInfoNode(VmaStatInfo& outInfo, const Node* node, VkDeviceSize levelNodeSize) const; - // Adds node to the front of FreeList at given level. - // node->type must be FREE. - // node->free.prev, next can be undefined. - void AddToFreeListFront(uint32_t level, Node* node); - // Removes node from FreeList at given level. - // node->type must be FREE. - // node->free.prev, next stay untouched. 
- void RemoveFromFreeList(uint32_t level, Node* node); - -#if VMA_STATS_STRING_ENABLED - void PrintDetailedMapNode(class VmaJsonWriter& json, const Node* node, VkDeviceSize levelNodeSize) const; -#endif -}; - -/* -Represents a single block of device memory (`VkDeviceMemory`) with all the -data about its regions (aka suballocations, #VmaAllocation), assigned and free. - -Thread-safety: This class must be externally synchronized. -*/ -class VmaDeviceMemoryBlock -{ - VMA_CLASS_NO_COPY(VmaDeviceMemoryBlock) -public: - VmaBlockMetadata* m_pMetadata; - - VmaDeviceMemoryBlock(VmaAllocator hAllocator); - - ~VmaDeviceMemoryBlock() - { - VMA_ASSERT(m_MapCount == 0 && "VkDeviceMemory block is being destroyed while it is still mapped."); - VMA_ASSERT(m_hMemory == VK_NULL_HANDLE); - } - - // Always call after construction. - void Init( - VmaAllocator hAllocator, - VmaPool hParentPool, - uint32_t newMemoryTypeIndex, - VkDeviceMemory newMemory, - VkDeviceSize newSize, - uint32_t id, - uint32_t algorithm); - // Always call before destruction. - void Destroy(VmaAllocator allocator); - - VmaPool GetParentPool() const { return m_hParentPool; } - VkDeviceMemory GetDeviceMemory() const { return m_hMemory; } - uint32_t GetMemoryTypeIndex() const { return m_MemoryTypeIndex; } - uint32_t GetId() const { return m_Id; } - void* GetMappedData() const { return m_pMappedData; } - - // Validates all data structures inside this object. If not valid, returns false. - bool Validate() const; - - VkResult CheckCorruption(VmaAllocator hAllocator); - - // ppData can be null. 
- VkResult Map(VmaAllocator hAllocator, uint32_t count, void** ppData); - void Unmap(VmaAllocator hAllocator, uint32_t count); - - VkResult WriteMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize); - VkResult ValidateMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize); - - VkResult BindBufferMemory( - const VmaAllocator hAllocator, - const VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkBuffer hBuffer, - const void* pNext); - VkResult BindImageMemory( - const VmaAllocator hAllocator, - const VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkImage hImage, - const void* pNext); - -private: - VmaPool m_hParentPool; // VK_NULL_HANDLE if not belongs to custom pool. - uint32_t m_MemoryTypeIndex; - uint32_t m_Id; - VkDeviceMemory m_hMemory; - - /* - Protects access to m_hMemory so it's not used by multiple threads simultaneously, e.g. vkMapMemory, vkBindBufferMemory. - Also protects m_MapCount, m_pMappedData. - Allocations, deallocations, any change in m_pMetadata is protected by parent's VmaBlockVector::m_Mutex. - */ - VMA_MUTEX m_Mutex; - uint32_t m_MapCount; - void* m_pMappedData; -}; - -struct VmaPointerLess -{ - bool operator()(const void* lhs, const void* rhs) const - { - return lhs < rhs; - } -}; - -struct VmaDefragmentationMove -{ - size_t srcBlockIndex; - size_t dstBlockIndex; - VkDeviceSize srcOffset; - VkDeviceSize dstOffset; - VkDeviceSize size; - VmaAllocation hAllocation; - VmaDeviceMemoryBlock* pSrcBlock; - VmaDeviceMemoryBlock* pDstBlock; -}; - -class VmaDefragmentationAlgorithm; - -/* -Sequence of VmaDeviceMemoryBlock. Represents memory blocks allocated for a specific -Vulkan memory type. - -Synchronized internally with a mutex. 
-*/ -struct VmaBlockVector -{ - VMA_CLASS_NO_COPY(VmaBlockVector) -public: - VmaBlockVector( - VmaAllocator hAllocator, - VmaPool hParentPool, - uint32_t memoryTypeIndex, - VkDeviceSize preferredBlockSize, - size_t minBlockCount, - size_t maxBlockCount, - VkDeviceSize bufferImageGranularity, - uint32_t frameInUseCount, - bool explicitBlockSize, - uint32_t algorithm, - float priority); - ~VmaBlockVector(); - - VkResult CreateMinBlocks(); - - VmaAllocator GetAllocator() const { return m_hAllocator; } - VmaPool GetParentPool() const { return m_hParentPool; } - bool IsCustomPool() const { return m_hParentPool != VMA_NULL; } - uint32_t GetMemoryTypeIndex() const { return m_MemoryTypeIndex; } - VkDeviceSize GetPreferredBlockSize() const { return m_PreferredBlockSize; } - VkDeviceSize GetBufferImageGranularity() const { return m_BufferImageGranularity; } - uint32_t GetFrameInUseCount() const { return m_FrameInUseCount; } - uint32_t GetAlgorithm() const { return m_Algorithm; } - - void GetPoolStats(VmaPoolStats* pStats); - - bool IsEmpty(); - bool IsCorruptionDetectionEnabled() const; - - VkResult Allocate( - uint32_t currentFrameIndex, - VkDeviceSize size, - VkDeviceSize alignment, - const VmaAllocationCreateInfo& createInfo, - VmaSuballocationType suballocType, - size_t allocationCount, - VmaAllocation* pAllocations); - - void Free(const VmaAllocation hAllocation); - - // Adds statistics of this BlockVector to pStats. - void AddStats(VmaStats* pStats); - -#if VMA_STATS_STRING_ENABLED - void PrintDetailedMap(class VmaJsonWriter& json); -#endif - - void MakePoolAllocationsLost( - uint32_t currentFrameIndex, - size_t* pLostAllocationCount); - VkResult CheckCorruption(); - - // Saves results in pCtx->res. 
- void Defragment( - class VmaBlockVectorDefragmentationContext* pCtx, - VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags, - VkDeviceSize& maxCpuBytesToMove, uint32_t& maxCpuAllocationsToMove, - VkDeviceSize& maxGpuBytesToMove, uint32_t& maxGpuAllocationsToMove, - VkCommandBuffer commandBuffer); - void DefragmentationEnd( - class VmaBlockVectorDefragmentationContext* pCtx, - uint32_t flags, - VmaDefragmentationStats* pStats); - - uint32_t ProcessDefragmentations( - class VmaBlockVectorDefragmentationContext *pCtx, - VmaDefragmentationPassMoveInfo* pMove, uint32_t maxMoves); - - void CommitDefragmentations( - class VmaBlockVectorDefragmentationContext *pCtx, - VmaDefragmentationStats* pStats); - - //////////////////////////////////////////////////////////////////////////////// - // To be used only while the m_Mutex is locked. Used during defragmentation. - - size_t GetBlockCount() const { return m_Blocks.size(); } - VmaDeviceMemoryBlock* GetBlock(size_t index) const { return m_Blocks[index]; } - size_t CalcAllocationCount() const; - bool IsBufferImageGranularityConflictPossible() const; - -private: - friend class VmaDefragmentationAlgorithm_Generic; - - const VmaAllocator m_hAllocator; - const VmaPool m_hParentPool; - const uint32_t m_MemoryTypeIndex; - const VkDeviceSize m_PreferredBlockSize; - const size_t m_MinBlockCount; - const size_t m_MaxBlockCount; - const VkDeviceSize m_BufferImageGranularity; - const uint32_t m_FrameInUseCount; - const bool m_ExplicitBlockSize; - const uint32_t m_Algorithm; - const float m_Priority; - VMA_RW_MUTEX m_Mutex; - - /* There can be at most one allocation that is completely empty (except when minBlockCount > 0) - - a hysteresis to avoid pessimistic case of alternating creation and destruction of a VkDeviceMemory. */ - bool m_HasEmptyBlock; - // Incrementally sorted by sumFreeSize, ascending. 
- VmaVector< VmaDeviceMemoryBlock*, VmaStlAllocator > m_Blocks; - uint32_t m_NextBlockId; - - VkDeviceSize CalcMaxBlockSize() const; - - // Finds and removes given block from vector. - void Remove(VmaDeviceMemoryBlock* pBlock); - - // Performs single step in sorting m_Blocks. They may not be fully sorted - // after this call. - void IncrementallySortBlocks(); - - VkResult AllocatePage( - uint32_t currentFrameIndex, - VkDeviceSize size, - VkDeviceSize alignment, - const VmaAllocationCreateInfo& createInfo, - VmaSuballocationType suballocType, - VmaAllocation* pAllocation); - - // To be used only without CAN_MAKE_OTHER_LOST flag. - VkResult AllocateFromBlock( - VmaDeviceMemoryBlock* pBlock, - uint32_t currentFrameIndex, - VkDeviceSize size, - VkDeviceSize alignment, - VmaAllocationCreateFlags allocFlags, - void* pUserData, - VmaSuballocationType suballocType, - uint32_t strategy, - VmaAllocation* pAllocation); - - VkResult CreateBlock(VkDeviceSize blockSize, size_t* pNewBlockIndex); - - // Saves result to pCtx->res. - void ApplyDefragmentationMovesCpu( - class VmaBlockVectorDefragmentationContext* pDefragCtx, - const VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves); - // Saves result to pCtx->res. - void ApplyDefragmentationMovesGpu( - class VmaBlockVectorDefragmentationContext* pDefragCtx, - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkCommandBuffer commandBuffer); - - /* - Used during defragmentation. pDefragmentationStats is optional. It's in/out - - updated with new data. 
- */ - void FreeEmptyBlocks(VmaDefragmentationStats* pDefragmentationStats); - - void UpdateHasEmptyBlock(); -}; - -struct VmaPool_T -{ - VMA_CLASS_NO_COPY(VmaPool_T) -public: - VmaBlockVector m_BlockVector; - - VmaPool_T( - VmaAllocator hAllocator, - const VmaPoolCreateInfo& createInfo, - VkDeviceSize preferredBlockSize); - ~VmaPool_T(); - - uint32_t GetId() const { return m_Id; } - void SetId(uint32_t id) { VMA_ASSERT(m_Id == 0); m_Id = id; } - - const char* GetName() const { return m_Name; } - void SetName(const char* pName); - -#if VMA_STATS_STRING_ENABLED - //void PrintDetailedMap(class VmaStringBuilder& sb); -#endif - -private: - uint32_t m_Id; - char* m_Name; -}; - -/* -Performs defragmentation: - -- Updates `pBlockVector->m_pMetadata`. -- Updates allocations by calling ChangeBlockAllocation() or ChangeOffset(). -- Does not move actual data, only returns requested moves as `moves`. -*/ -class VmaDefragmentationAlgorithm -{ - VMA_CLASS_NO_COPY(VmaDefragmentationAlgorithm) -public: - VmaDefragmentationAlgorithm( - VmaAllocator hAllocator, - VmaBlockVector* pBlockVector, - uint32_t currentFrameIndex) : - m_hAllocator(hAllocator), - m_pBlockVector(pBlockVector), - m_CurrentFrameIndex(currentFrameIndex) - { - } - virtual ~VmaDefragmentationAlgorithm() - { - } - - virtual void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) = 0; - virtual void AddAll() = 0; - - virtual VkResult Defragment( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - VmaDefragmentationFlags flags) = 0; - - virtual VkDeviceSize GetBytesMoved() const = 0; - virtual uint32_t GetAllocationsMoved() const = 0; - -protected: - VmaAllocator const m_hAllocator; - VmaBlockVector* const m_pBlockVector; - const uint32_t m_CurrentFrameIndex; - - struct AllocationInfo - { - VmaAllocation m_hAllocation; - VkBool32* m_pChanged; - - AllocationInfo() : - m_hAllocation(VK_NULL_HANDLE), - m_pChanged(VMA_NULL) - { - } - 
AllocationInfo(VmaAllocation hAlloc, VkBool32* pChanged) : - m_hAllocation(hAlloc), - m_pChanged(pChanged) - { - } - }; -}; - -class VmaDefragmentationAlgorithm_Generic : public VmaDefragmentationAlgorithm -{ - VMA_CLASS_NO_COPY(VmaDefragmentationAlgorithm_Generic) -public: - VmaDefragmentationAlgorithm_Generic( - VmaAllocator hAllocator, - VmaBlockVector* pBlockVector, - uint32_t currentFrameIndex, - bool overlappingMoveSupported); - virtual ~VmaDefragmentationAlgorithm_Generic(); - - virtual void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged); - virtual void AddAll() { m_AllAllocations = true; } - - virtual VkResult Defragment( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - VmaDefragmentationFlags flags); - - virtual VkDeviceSize GetBytesMoved() const { return m_BytesMoved; } - virtual uint32_t GetAllocationsMoved() const { return m_AllocationsMoved; } - -private: - uint32_t m_AllocationCount; - bool m_AllAllocations; - - VkDeviceSize m_BytesMoved; - uint32_t m_AllocationsMoved; - - struct AllocationInfoSizeGreater - { - bool operator()(const AllocationInfo& lhs, const AllocationInfo& rhs) const - { - return lhs.m_hAllocation->GetSize() > rhs.m_hAllocation->GetSize(); - } - }; - - struct AllocationInfoOffsetGreater - { - bool operator()(const AllocationInfo& lhs, const AllocationInfo& rhs) const - { - return lhs.m_hAllocation->GetOffset() > rhs.m_hAllocation->GetOffset(); - } - }; - - struct BlockInfo - { - size_t m_OriginalBlockIndex; - VmaDeviceMemoryBlock* m_pBlock; - bool m_HasNonMovableAllocations; - VmaVector< AllocationInfo, VmaStlAllocator > m_Allocations; - - BlockInfo(const VkAllocationCallbacks* pAllocationCallbacks) : - m_OriginalBlockIndex(SIZE_MAX), - m_pBlock(VMA_NULL), - m_HasNonMovableAllocations(true), - m_Allocations(pAllocationCallbacks) - { - } - - void CalcHasNonMovableAllocations() - { - const size_t blockAllocCount = 
m_pBlock->m_pMetadata->GetAllocationCount(); - const size_t defragmentAllocCount = m_Allocations.size(); - m_HasNonMovableAllocations = blockAllocCount != defragmentAllocCount; - } - - void SortAllocationsBySizeDescending() - { - VMA_SORT(m_Allocations.begin(), m_Allocations.end(), AllocationInfoSizeGreater()); - } - - void SortAllocationsByOffsetDescending() - { - VMA_SORT(m_Allocations.begin(), m_Allocations.end(), AllocationInfoOffsetGreater()); - } - }; - - struct BlockPointerLess - { - bool operator()(const BlockInfo* pLhsBlockInfo, const VmaDeviceMemoryBlock* pRhsBlock) const - { - return pLhsBlockInfo->m_pBlock < pRhsBlock; - } - bool operator()(const BlockInfo* pLhsBlockInfo, const BlockInfo* pRhsBlockInfo) const - { - return pLhsBlockInfo->m_pBlock < pRhsBlockInfo->m_pBlock; - } - }; - - // 1. Blocks with some non-movable allocations go first. - // 2. Blocks with smaller sumFreeSize go first. - struct BlockInfoCompareMoveDestination - { - bool operator()(const BlockInfo* pLhsBlockInfo, const BlockInfo* pRhsBlockInfo) const - { - if(pLhsBlockInfo->m_HasNonMovableAllocations && !pRhsBlockInfo->m_HasNonMovableAllocations) - { - return true; - } - if(!pLhsBlockInfo->m_HasNonMovableAllocations && pRhsBlockInfo->m_HasNonMovableAllocations) - { - return false; - } - if(pLhsBlockInfo->m_pBlock->m_pMetadata->GetSumFreeSize() < pRhsBlockInfo->m_pBlock->m_pMetadata->GetSumFreeSize()) - { - return true; - } - return false; - } - }; - - typedef VmaVector< BlockInfo*, VmaStlAllocator > BlockInfoVector; - BlockInfoVector m_Blocks; - - VkResult DefragmentRound( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - bool freeOldAllocations); - - size_t CalcBlocksWithNonMovableCount() const; - - static bool MoveMakesSense( - size_t dstBlockIndex, VkDeviceSize dstOffset, - size_t srcBlockIndex, VkDeviceSize srcOffset); -}; - -class VmaDefragmentationAlgorithm_Fast : public VmaDefragmentationAlgorithm 
-{ - VMA_CLASS_NO_COPY(VmaDefragmentationAlgorithm_Fast) -public: - VmaDefragmentationAlgorithm_Fast( - VmaAllocator hAllocator, - VmaBlockVector* pBlockVector, - uint32_t currentFrameIndex, - bool overlappingMoveSupported); - virtual ~VmaDefragmentationAlgorithm_Fast(); - - virtual void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) { ++m_AllocationCount; } - virtual void AddAll() { m_AllAllocations = true; } - - virtual VkResult Defragment( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - VmaDefragmentationFlags flags); - - virtual VkDeviceSize GetBytesMoved() const { return m_BytesMoved; } - virtual uint32_t GetAllocationsMoved() const { return m_AllocationsMoved; } - -private: - struct BlockInfo - { - size_t origBlockIndex; - }; - - class FreeSpaceDatabase - { - public: - FreeSpaceDatabase() - { - FreeSpace s = {}; - s.blockInfoIndex = SIZE_MAX; - for(size_t i = 0; i < MAX_COUNT; ++i) - { - m_FreeSpaces[i] = s; - } - } - - void Register(size_t blockInfoIndex, VkDeviceSize offset, VkDeviceSize size) - { - if(size < VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - return; - } - - // Find first invalid or the smallest structure. - size_t bestIndex = SIZE_MAX; - for(size_t i = 0; i < MAX_COUNT; ++i) - { - // Empty structure. 
- if(m_FreeSpaces[i].blockInfoIndex == SIZE_MAX) - { - bestIndex = i; - break; - } - if(m_FreeSpaces[i].size < size && - (bestIndex == SIZE_MAX || m_FreeSpaces[bestIndex].size > m_FreeSpaces[i].size)) - { - bestIndex = i; - } - } - - if(bestIndex != SIZE_MAX) - { - m_FreeSpaces[bestIndex].blockInfoIndex = blockInfoIndex; - m_FreeSpaces[bestIndex].offset = offset; - m_FreeSpaces[bestIndex].size = size; - } - } - - bool Fetch(VkDeviceSize alignment, VkDeviceSize size, - size_t& outBlockInfoIndex, VkDeviceSize& outDstOffset) - { - size_t bestIndex = SIZE_MAX; - VkDeviceSize bestFreeSpaceAfter = 0; - for(size_t i = 0; i < MAX_COUNT; ++i) - { - // Structure is valid. - if(m_FreeSpaces[i].blockInfoIndex != SIZE_MAX) - { - const VkDeviceSize dstOffset = VmaAlignUp(m_FreeSpaces[i].offset, alignment); - // Allocation fits into this structure. - if(dstOffset + size <= m_FreeSpaces[i].offset + m_FreeSpaces[i].size) - { - const VkDeviceSize freeSpaceAfter = (m_FreeSpaces[i].offset + m_FreeSpaces[i].size) - - (dstOffset + size); - if(bestIndex == SIZE_MAX || freeSpaceAfter > bestFreeSpaceAfter) - { - bestIndex = i; - bestFreeSpaceAfter = freeSpaceAfter; - } - } - } - } - - if(bestIndex != SIZE_MAX) - { - outBlockInfoIndex = m_FreeSpaces[bestIndex].blockInfoIndex; - outDstOffset = VmaAlignUp(m_FreeSpaces[bestIndex].offset, alignment); - - if(bestFreeSpaceAfter >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - // Leave this structure for remaining empty space. - const VkDeviceSize alignmentPlusSize = (outDstOffset - m_FreeSpaces[bestIndex].offset) + size; - m_FreeSpaces[bestIndex].offset += alignmentPlusSize; - m_FreeSpaces[bestIndex].size -= alignmentPlusSize; - } - else - { - // This structure becomes invalid. - m_FreeSpaces[bestIndex].blockInfoIndex = SIZE_MAX; - } - - return true; - } - - return false; - } - - private: - static const size_t MAX_COUNT = 4; - - struct FreeSpace - { - size_t blockInfoIndex; // SIZE_MAX means this structure is invalid. 
- VkDeviceSize offset; - VkDeviceSize size; - } m_FreeSpaces[MAX_COUNT]; - }; - - const bool m_OverlappingMoveSupported; - - uint32_t m_AllocationCount; - bool m_AllAllocations; - - VkDeviceSize m_BytesMoved; - uint32_t m_AllocationsMoved; - - VmaVector< BlockInfo, VmaStlAllocator > m_BlockInfos; - - void PreprocessMetadata(); - void PostprocessMetadata(); - void InsertSuballoc(VmaBlockMetadata_Generic* pMetadata, const VmaSuballocation& suballoc); -}; - -struct VmaBlockDefragmentationContext -{ - enum BLOCK_FLAG - { - BLOCK_FLAG_USED = 0x00000001, - }; - uint32_t flags; - VkBuffer hBuffer; -}; - -class VmaBlockVectorDefragmentationContext -{ - VMA_CLASS_NO_COPY(VmaBlockVectorDefragmentationContext) -public: - VkResult res; - bool mutexLocked; - VmaVector< VmaBlockDefragmentationContext, VmaStlAllocator > blockContexts; - VmaVector< VmaDefragmentationMove, VmaStlAllocator > defragmentationMoves; - uint32_t defragmentationMovesProcessed; - uint32_t defragmentationMovesCommitted; - bool hasDefragmentationPlan; - - VmaBlockVectorDefragmentationContext( - VmaAllocator hAllocator, - VmaPool hCustomPool, // Optional. - VmaBlockVector* pBlockVector, - uint32_t currFrameIndex); - ~VmaBlockVectorDefragmentationContext(); - - VmaPool GetCustomPool() const { return m_hCustomPool; } - VmaBlockVector* GetBlockVector() const { return m_pBlockVector; } - VmaDefragmentationAlgorithm* GetAlgorithm() const { return m_pAlgorithm; } - - void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged); - void AddAll() { m_AllAllocations = true; } - - void Begin(bool overlappingMoveSupported, VmaDefragmentationFlags flags); - -private: - const VmaAllocator m_hAllocator; - // Null if not from custom pool. - const VmaPool m_hCustomPool; - // Redundant, for convenience not to fetch from m_hCustomPool->m_BlockVector or m_hAllocator->m_pBlockVectors. - VmaBlockVector* const m_pBlockVector; - const uint32_t m_CurrFrameIndex; - // Owner of this object. 
- VmaDefragmentationAlgorithm* m_pAlgorithm; - - struct AllocInfo - { - VmaAllocation hAlloc; - VkBool32* pChanged; - }; - // Used between constructor and Begin. - VmaVector< AllocInfo, VmaStlAllocator > m_Allocations; - bool m_AllAllocations; -}; - -struct VmaDefragmentationContext_T -{ -private: - VMA_CLASS_NO_COPY(VmaDefragmentationContext_T) -public: - VmaDefragmentationContext_T( - VmaAllocator hAllocator, - uint32_t currFrameIndex, - uint32_t flags, - VmaDefragmentationStats* pStats); - ~VmaDefragmentationContext_T(); - - void AddPools(uint32_t poolCount, const VmaPool* pPools); - void AddAllocations( - uint32_t allocationCount, - const VmaAllocation* pAllocations, - VkBool32* pAllocationsChanged); - - /* - Returns: - - `VK_SUCCESS` if succeeded and object can be destroyed immediately. - - `VK_NOT_READY` if succeeded but the object must remain alive until vmaDefragmentationEnd(). - - Negative value if error occured and object can be destroyed immediately. - */ - VkResult Defragment( - VkDeviceSize maxCpuBytesToMove, uint32_t maxCpuAllocationsToMove, - VkDeviceSize maxGpuBytesToMove, uint32_t maxGpuAllocationsToMove, - VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags); - - VkResult DefragmentPassBegin(VmaDefragmentationPassInfo* pInfo); - VkResult DefragmentPassEnd(); - -private: - const VmaAllocator m_hAllocator; - const uint32_t m_CurrFrameIndex; - const uint32_t m_Flags; - VmaDefragmentationStats* const m_pStats; - - VkDeviceSize m_MaxCpuBytesToMove; - uint32_t m_MaxCpuAllocationsToMove; - VkDeviceSize m_MaxGpuBytesToMove; - uint32_t m_MaxGpuAllocationsToMove; - - // Owner of these objects. - VmaBlockVectorDefragmentationContext* m_DefaultPoolContexts[VK_MAX_MEMORY_TYPES]; - // Owner of these objects. 
- VmaVector< VmaBlockVectorDefragmentationContext*, VmaStlAllocator > m_CustomPoolContexts; -}; - -#if VMA_RECORDING_ENABLED - -class VmaRecorder -{ -public: - VmaRecorder(); - VkResult Init(const VmaRecordSettings& settings, bool useMutex); - void WriteConfiguration( - const VkPhysicalDeviceProperties& devProps, - const VkPhysicalDeviceMemoryProperties& memProps, - uint32_t vulkanApiVersion, - bool dedicatedAllocationExtensionEnabled, - bool bindMemory2ExtensionEnabled, - bool memoryBudgetExtensionEnabled, - bool deviceCoherentMemoryExtensionEnabled); - ~VmaRecorder(); - - void RecordCreateAllocator(uint32_t frameIndex); - void RecordDestroyAllocator(uint32_t frameIndex); - void RecordCreatePool(uint32_t frameIndex, - const VmaPoolCreateInfo& createInfo, - VmaPool pool); - void RecordDestroyPool(uint32_t frameIndex, VmaPool pool); - void RecordAllocateMemory(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - const VmaAllocationCreateInfo& createInfo, - VmaAllocation allocation); - void RecordAllocateMemoryPages(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - const VmaAllocationCreateInfo& createInfo, - uint64_t allocationCount, - const VmaAllocation* pAllocations); - void RecordAllocateMemoryForBuffer(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - bool requiresDedicatedAllocation, - bool prefersDedicatedAllocation, - const VmaAllocationCreateInfo& createInfo, - VmaAllocation allocation); - void RecordAllocateMemoryForImage(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - bool requiresDedicatedAllocation, - bool prefersDedicatedAllocation, - const VmaAllocationCreateInfo& createInfo, - VmaAllocation allocation); - void RecordFreeMemory(uint32_t frameIndex, - VmaAllocation allocation); - void RecordFreeMemoryPages(uint32_t frameIndex, - uint64_t allocationCount, - const VmaAllocation* pAllocations); - void RecordSetAllocationUserData(uint32_t frameIndex, - VmaAllocation allocation, - const void* pUserData); - 
void RecordCreateLostAllocation(uint32_t frameIndex, - VmaAllocation allocation); - void RecordMapMemory(uint32_t frameIndex, - VmaAllocation allocation); - void RecordUnmapMemory(uint32_t frameIndex, - VmaAllocation allocation); - void RecordFlushAllocation(uint32_t frameIndex, - VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size); - void RecordInvalidateAllocation(uint32_t frameIndex, - VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size); - void RecordCreateBuffer(uint32_t frameIndex, - const VkBufferCreateInfo& bufCreateInfo, - const VmaAllocationCreateInfo& allocCreateInfo, - VmaAllocation allocation); - void RecordCreateImage(uint32_t frameIndex, - const VkImageCreateInfo& imageCreateInfo, - const VmaAllocationCreateInfo& allocCreateInfo, - VmaAllocation allocation); - void RecordDestroyBuffer(uint32_t frameIndex, - VmaAllocation allocation); - void RecordDestroyImage(uint32_t frameIndex, - VmaAllocation allocation); - void RecordTouchAllocation(uint32_t frameIndex, - VmaAllocation allocation); - void RecordGetAllocationInfo(uint32_t frameIndex, - VmaAllocation allocation); - void RecordMakePoolAllocationsLost(uint32_t frameIndex, - VmaPool pool); - void RecordDefragmentationBegin(uint32_t frameIndex, - const VmaDefragmentationInfo2& info, - VmaDefragmentationContext ctx); - void RecordDefragmentationEnd(uint32_t frameIndex, - VmaDefragmentationContext ctx); - void RecordSetPoolName(uint32_t frameIndex, - VmaPool pool, - const char* name); - -private: - struct CallParams - { - uint32_t threadId; - double time; - }; - - class UserDataString - { - public: - UserDataString(VmaAllocationCreateFlags allocFlags, const void* pUserData); - const char* GetString() const { return m_Str; } - - private: - char m_PtrStr[17]; - const char* m_Str; - }; - - bool m_UseMutex; - VmaRecordFlags m_Flags; - FILE* m_File; - VMA_MUTEX m_FileMutex; - std::chrono::time_point m_RecordingStartTime; - - void GetBasicParams(CallParams& outParams); - - // T must 
be a pointer type, e.g. VmaAllocation, VmaPool. - template - void PrintPointerList(uint64_t count, const T* pItems) - { - if(count) - { - fprintf(m_File, "%p", pItems[0]); - for(uint64_t i = 1; i < count; ++i) - { - fprintf(m_File, " %p", pItems[i]); - } - } - } - - void PrintPointerList(uint64_t count, const VmaAllocation* pItems); - void Flush(); -}; - -#endif // #if VMA_RECORDING_ENABLED - -/* -Thread-safe wrapper over VmaPoolAllocator free list, for allocation of VmaAllocation_T objects. -*/ -class VmaAllocationObjectAllocator -{ - VMA_CLASS_NO_COPY(VmaAllocationObjectAllocator) -public: - VmaAllocationObjectAllocator(const VkAllocationCallbacks* pAllocationCallbacks); - - template VmaAllocation Allocate(Types... args); - void Free(VmaAllocation hAlloc); - -private: - VMA_MUTEX m_Mutex; - VmaPoolAllocator m_Allocator; -}; - -struct VmaCurrentBudgetData -{ - VMA_ATOMIC_UINT64 m_BlockBytes[VK_MAX_MEMORY_HEAPS]; - VMA_ATOMIC_UINT64 m_AllocationBytes[VK_MAX_MEMORY_HEAPS]; - -#if VMA_MEMORY_BUDGET - VMA_ATOMIC_UINT32 m_OperationsSinceBudgetFetch; - VMA_RW_MUTEX m_BudgetMutex; - uint64_t m_VulkanUsage[VK_MAX_MEMORY_HEAPS]; - uint64_t m_VulkanBudget[VK_MAX_MEMORY_HEAPS]; - uint64_t m_BlockBytesAtBudgetFetch[VK_MAX_MEMORY_HEAPS]; -#endif // #if VMA_MEMORY_BUDGET - - VmaCurrentBudgetData() - { - for(uint32_t heapIndex = 0; heapIndex < VK_MAX_MEMORY_HEAPS; ++heapIndex) - { - m_BlockBytes[heapIndex] = 0; - m_AllocationBytes[heapIndex] = 0; -#if VMA_MEMORY_BUDGET - m_VulkanUsage[heapIndex] = 0; - m_VulkanBudget[heapIndex] = 0; - m_BlockBytesAtBudgetFetch[heapIndex] = 0; -#endif - } - -#if VMA_MEMORY_BUDGET - m_OperationsSinceBudgetFetch = 0; -#endif - } - - void AddAllocation(uint32_t heapIndex, VkDeviceSize allocationSize) - { - m_AllocationBytes[heapIndex] += allocationSize; -#if VMA_MEMORY_BUDGET - ++m_OperationsSinceBudgetFetch; -#endif - } - - void RemoveAllocation(uint32_t heapIndex, VkDeviceSize allocationSize) - { - VMA_ASSERT(m_AllocationBytes[heapIndex] >= 
allocationSize); // DELME - m_AllocationBytes[heapIndex] -= allocationSize; -#if VMA_MEMORY_BUDGET - ++m_OperationsSinceBudgetFetch; -#endif - } -}; - -// Main allocator object. -struct VmaAllocator_T -{ - VMA_CLASS_NO_COPY(VmaAllocator_T) -public: - bool m_UseMutex; - uint32_t m_VulkanApiVersion; - bool m_UseKhrDedicatedAllocation; // Can be set only if m_VulkanApiVersion < VK_MAKE_VERSION(1, 1, 0). - bool m_UseKhrBindMemory2; // Can be set only if m_VulkanApiVersion < VK_MAKE_VERSION(1, 1, 0). - bool m_UseExtMemoryBudget; - bool m_UseAmdDeviceCoherentMemory; - bool m_UseKhrBufferDeviceAddress; - bool m_UseExtMemoryPriority; - VkDevice m_hDevice; - VkInstance m_hInstance; - bool m_AllocationCallbacksSpecified; - VkAllocationCallbacks m_AllocationCallbacks; - VmaDeviceMemoryCallbacks m_DeviceMemoryCallbacks; - VmaAllocationObjectAllocator m_AllocationObjectAllocator; - - // Each bit (1 << i) is set if HeapSizeLimit is enabled for that heap, so cannot allocate more than the heap size. - uint32_t m_HeapSizeLimitMask; - - VkPhysicalDeviceProperties m_PhysicalDeviceProperties; - VkPhysicalDeviceMemoryProperties m_MemProps; - - // Default pools. - VmaBlockVector* m_pBlockVectors[VK_MAX_MEMORY_TYPES]; - - // Each vector is sorted by memory (handle value). - typedef VmaVector< VmaAllocation, VmaStlAllocator > AllocationVectorType; - AllocationVectorType* m_pDedicatedAllocations[VK_MAX_MEMORY_TYPES]; - VMA_RW_MUTEX m_DedicatedAllocationsMutex[VK_MAX_MEMORY_TYPES]; - - VmaCurrentBudgetData m_Budget; - VMA_ATOMIC_UINT32 m_DeviceMemoryCount; // Total number of VkDeviceMemory objects. - - VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo); - VkResult Init(const VmaAllocatorCreateInfo* pCreateInfo); - ~VmaAllocator_T(); - - const VkAllocationCallbacks* GetAllocationCallbacks() const - { - return m_AllocationCallbacksSpecified ? 
&m_AllocationCallbacks : 0; - } - const VmaVulkanFunctions& GetVulkanFunctions() const - { - return m_VulkanFunctions; - } - - VkPhysicalDevice GetPhysicalDevice() const { return m_PhysicalDevice; } - - VkDeviceSize GetBufferImageGranularity() const - { - return VMA_MAX( - static_cast(VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY), - m_PhysicalDeviceProperties.limits.bufferImageGranularity); - } - - uint32_t GetMemoryHeapCount() const { return m_MemProps.memoryHeapCount; } - uint32_t GetMemoryTypeCount() const { return m_MemProps.memoryTypeCount; } - - uint32_t MemoryTypeIndexToHeapIndex(uint32_t memTypeIndex) const - { - VMA_ASSERT(memTypeIndex < m_MemProps.memoryTypeCount); - return m_MemProps.memoryTypes[memTypeIndex].heapIndex; - } - // True when specific memory type is HOST_VISIBLE but not HOST_COHERENT. - bool IsMemoryTypeNonCoherent(uint32_t memTypeIndex) const - { - return (m_MemProps.memoryTypes[memTypeIndex].propertyFlags & (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) == - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - } - // Minimum alignment for all allocations in specific memory type. - VkDeviceSize GetMemoryTypeMinAlignment(uint32_t memTypeIndex) const - { - return IsMemoryTypeNonCoherent(memTypeIndex) ? 
- VMA_MAX((VkDeviceSize)VMA_DEBUG_ALIGNMENT, m_PhysicalDeviceProperties.limits.nonCoherentAtomSize) : - (VkDeviceSize)VMA_DEBUG_ALIGNMENT; - } - - bool IsIntegratedGpu() const - { - return m_PhysicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; - } - - uint32_t GetGlobalMemoryTypeBits() const { return m_GlobalMemoryTypeBits; } - -#if VMA_RECORDING_ENABLED - VmaRecorder* GetRecorder() const { return m_pRecorder; } -#endif - - void GetBufferMemoryRequirements( - VkBuffer hBuffer, - VkMemoryRequirements& memReq, - bool& requiresDedicatedAllocation, - bool& prefersDedicatedAllocation) const; - void GetImageMemoryRequirements( - VkImage hImage, - VkMemoryRequirements& memReq, - bool& requiresDedicatedAllocation, - bool& prefersDedicatedAllocation) const; - - // Main allocation function. - VkResult AllocateMemory( - const VkMemoryRequirements& vkMemReq, - bool requiresDedicatedAllocation, - bool prefersDedicatedAllocation, - VkBuffer dedicatedBuffer, - VkBufferUsageFlags dedicatedBufferUsage, // UINT32_MAX when unknown. - VkImage dedicatedImage, - const VmaAllocationCreateInfo& createInfo, - VmaSuballocationType suballocType, - size_t allocationCount, - VmaAllocation* pAllocations); - - // Main deallocation function. 
- void FreeMemory( - size_t allocationCount, - const VmaAllocation* pAllocations); - - void CalculateStats(VmaStats* pStats); - - void GetBudget( - VmaBudget* outBudget, uint32_t firstHeap, uint32_t heapCount); - -#if VMA_STATS_STRING_ENABLED - void PrintDetailedMap(class VmaJsonWriter& json); -#endif - - VkResult DefragmentationBegin( - const VmaDefragmentationInfo2& info, - VmaDefragmentationStats* pStats, - VmaDefragmentationContext* pContext); - VkResult DefragmentationEnd( - VmaDefragmentationContext context); - - VkResult DefragmentationPassBegin( - VmaDefragmentationPassInfo* pInfo, - VmaDefragmentationContext context); - VkResult DefragmentationPassEnd( - VmaDefragmentationContext context); - - void GetAllocationInfo(VmaAllocation hAllocation, VmaAllocationInfo* pAllocationInfo); - bool TouchAllocation(VmaAllocation hAllocation); - - VkResult CreatePool(const VmaPoolCreateInfo* pCreateInfo, VmaPool* pPool); - void DestroyPool(VmaPool pool); - void GetPoolStats(VmaPool pool, VmaPoolStats* pPoolStats); - - void SetCurrentFrameIndex(uint32_t frameIndex); - uint32_t GetCurrentFrameIndex() const { return m_CurrentFrameIndex.load(); } - - void MakePoolAllocationsLost( - VmaPool hPool, - size_t* pLostAllocationCount); - VkResult CheckPoolCorruption(VmaPool hPool); - VkResult CheckCorruption(uint32_t memoryTypeBits); - - void CreateLostAllocation(VmaAllocation* pAllocation); - - // Call to Vulkan function vkAllocateMemory with accompanying bookkeeping. - VkResult AllocateVulkanMemory(const VkMemoryAllocateInfo* pAllocateInfo, VkDeviceMemory* pMemory); - // Call to Vulkan function vkFreeMemory with accompanying bookkeeping. - void FreeVulkanMemory(uint32_t memoryType, VkDeviceSize size, VkDeviceMemory hMemory); - // Call to Vulkan function vkBindBufferMemory or vkBindBufferMemory2KHR. 
- VkResult BindVulkanBuffer( - VkDeviceMemory memory, - VkDeviceSize memoryOffset, - VkBuffer buffer, - const void* pNext); - // Call to Vulkan function vkBindImageMemory or vkBindImageMemory2KHR. - VkResult BindVulkanImage( - VkDeviceMemory memory, - VkDeviceSize memoryOffset, - VkImage image, - const void* pNext); - - VkResult Map(VmaAllocation hAllocation, void** ppData); - void Unmap(VmaAllocation hAllocation); - - VkResult BindBufferMemory( - VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkBuffer hBuffer, - const void* pNext); - VkResult BindImageMemory( - VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkImage hImage, - const void* pNext); - - VkResult FlushOrInvalidateAllocation( - VmaAllocation hAllocation, - VkDeviceSize offset, VkDeviceSize size, - VMA_CACHE_OPERATION op); - VkResult FlushOrInvalidateAllocations( - uint32_t allocationCount, - const VmaAllocation* allocations, - const VkDeviceSize* offsets, const VkDeviceSize* sizes, - VMA_CACHE_OPERATION op); - - void FillAllocation(const VmaAllocation hAllocation, uint8_t pattern); - - /* - Returns bit mask of memory types that can support defragmentation on GPU as - they support creation of required buffer for copy operations. - */ - uint32_t GetGpuDefragmentationMemoryTypeBits(); - -private: - VkDeviceSize m_PreferredLargeHeapBlockSize; - - VkPhysicalDevice m_PhysicalDevice; - VMA_ATOMIC_UINT32 m_CurrentFrameIndex; - VMA_ATOMIC_UINT32 m_GpuDefragmentationMemoryTypeBits; // UINT32_MAX means uninitialized. - - VMA_RW_MUTEX m_PoolsMutex; - // Protected by m_PoolsMutex. Sorted by pointer value. - VmaVector > m_Pools; - uint32_t m_NextPoolId; - - VmaVulkanFunctions m_VulkanFunctions; - - // Global bit mask AND-ed with any memoryTypeBits to disallow certain memory types. 
- uint32_t m_GlobalMemoryTypeBits; - -#if VMA_RECORDING_ENABLED - VmaRecorder* m_pRecorder; -#endif - - void ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunctions); - -#if VMA_STATIC_VULKAN_FUNCTIONS == 1 - void ImportVulkanFunctions_Static(); -#endif - - void ImportVulkanFunctions_Custom(const VmaVulkanFunctions* pVulkanFunctions); - -#if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 - void ImportVulkanFunctions_Dynamic(); -#endif - - void ValidateVulkanFunctions(); - - VkDeviceSize CalcPreferredBlockSize(uint32_t memTypeIndex); - - VkResult AllocateMemoryOfType( - VkDeviceSize size, - VkDeviceSize alignment, - bool dedicatedAllocation, - VkBuffer dedicatedBuffer, - VkBufferUsageFlags dedicatedBufferUsage, - VkImage dedicatedImage, - const VmaAllocationCreateInfo& createInfo, - uint32_t memTypeIndex, - VmaSuballocationType suballocType, - size_t allocationCount, - VmaAllocation* pAllocations); - - // Helper function only to be used inside AllocateDedicatedMemory. - VkResult AllocateDedicatedMemoryPage( - VkDeviceSize size, - VmaSuballocationType suballocType, - uint32_t memTypeIndex, - const VkMemoryAllocateInfo& allocInfo, - bool map, - bool isUserDataString, - void* pUserData, - VmaAllocation* pAllocation); - - // Allocates and registers new VkDeviceMemory specifically for dedicated allocations. - VkResult AllocateDedicatedMemory( - VkDeviceSize size, - VmaSuballocationType suballocType, - uint32_t memTypeIndex, - bool withinBudget, - bool map, - bool isUserDataString, - void* pUserData, - float priority, - VkBuffer dedicatedBuffer, - VkBufferUsageFlags dedicatedBufferUsage, - VkImage dedicatedImage, - size_t allocationCount, - VmaAllocation* pAllocations); - - void FreeDedicatedMemory(const VmaAllocation allocation); - - /* - Calculates and returns bit mask of memory types that can support defragmentation - on GPU as they support creation of required buffer for copy operations. 
- */ - uint32_t CalculateGpuDefragmentationMemoryTypeBits() const; - - uint32_t CalculateGlobalMemoryTypeBits() const; - - bool GetFlushOrInvalidateRange( - VmaAllocation allocation, - VkDeviceSize offset, VkDeviceSize size, - VkMappedMemoryRange& outRange) const; - -#if VMA_MEMORY_BUDGET - void UpdateVulkanBudget(); -#endif // #if VMA_MEMORY_BUDGET -}; - -//////////////////////////////////////////////////////////////////////////////// -// Memory allocation #2 after VmaAllocator_T definition - -static void* VmaMalloc(VmaAllocator hAllocator, size_t size, size_t alignment) -{ - return VmaMalloc(&hAllocator->m_AllocationCallbacks, size, alignment); -} - -static void VmaFree(VmaAllocator hAllocator, void* ptr) -{ - VmaFree(&hAllocator->m_AllocationCallbacks, ptr); -} - -template -static T* VmaAllocate(VmaAllocator hAllocator) -{ - return (T*)VmaMalloc(hAllocator, sizeof(T), VMA_ALIGN_OF(T)); -} - -template -static T* VmaAllocateArray(VmaAllocator hAllocator, size_t count) -{ - return (T*)VmaMalloc(hAllocator, sizeof(T) * count, VMA_ALIGN_OF(T)); -} - -template -static void vma_delete(VmaAllocator hAllocator, T* ptr) -{ - if(ptr != VMA_NULL) - { - ptr->~T(); - VmaFree(hAllocator, ptr); - } -} - -template -static void vma_delete_array(VmaAllocator hAllocator, T* ptr, size_t count) -{ - if(ptr != VMA_NULL) - { - for(size_t i = count; i--; ) - ptr[i].~T(); - VmaFree(hAllocator, ptr); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaStringBuilder - -#if VMA_STATS_STRING_ENABLED - -class VmaStringBuilder -{ -public: - VmaStringBuilder(VmaAllocator alloc) : m_Data(VmaStlAllocator(alloc->GetAllocationCallbacks())) { } - size_t GetLength() const { return m_Data.size(); } - const char* GetData() const { return m_Data.data(); } - - void Add(char ch) { m_Data.push_back(ch); } - void Add(const char* pStr); - void AddNewLine() { Add('\n'); } - void AddNumber(uint32_t num); - void AddNumber(uint64_t num); - void AddPointer(const void* 
ptr); - -private: - VmaVector< char, VmaStlAllocator > m_Data; -}; - -void VmaStringBuilder::Add(const char* pStr) -{ - const size_t strLen = strlen(pStr); - if(strLen > 0) - { - const size_t oldCount = m_Data.size(); - m_Data.resize(oldCount + strLen); - memcpy(m_Data.data() + oldCount, pStr, strLen); - } -} - -void VmaStringBuilder::AddNumber(uint32_t num) -{ - char buf[11]; - buf[10] = '\0'; - char *p = &buf[10]; - do - { - *--p = '0' + (num % 10); - num /= 10; - } - while(num); - Add(p); -} - -void VmaStringBuilder::AddNumber(uint64_t num) -{ - char buf[21]; - buf[20] = '\0'; - char *p = &buf[20]; - do - { - *--p = '0' + (num % 10); - num /= 10; - } - while(num); - Add(p); -} - -void VmaStringBuilder::AddPointer(const void* ptr) -{ - char buf[21]; - VmaPtrToStr(buf, sizeof(buf), ptr); - Add(buf); -} - -#endif // #if VMA_STATS_STRING_ENABLED - -//////////////////////////////////////////////////////////////////////////////// -// VmaJsonWriter - -#if VMA_STATS_STRING_ENABLED - -class VmaJsonWriter -{ - VMA_CLASS_NO_COPY(VmaJsonWriter) -public: - VmaJsonWriter(const VkAllocationCallbacks* pAllocationCallbacks, VmaStringBuilder& sb); - ~VmaJsonWriter(); - - void BeginObject(bool singleLine = false); - void EndObject(); - - void BeginArray(bool singleLine = false); - void EndArray(); - - void WriteString(const char* pStr); - void BeginString(const char* pStr = VMA_NULL); - void ContinueString(const char* pStr); - void ContinueString(uint32_t n); - void ContinueString(uint64_t n); - void ContinueString_Pointer(const void* ptr); - void EndString(const char* pStr = VMA_NULL); - - void WriteNumber(uint32_t n); - void WriteNumber(uint64_t n); - void WriteBool(bool b); - void WriteNull(); - -private: - static const char* const INDENT; - - enum COLLECTION_TYPE - { - COLLECTION_TYPE_OBJECT, - COLLECTION_TYPE_ARRAY, - }; - struct StackItem - { - COLLECTION_TYPE type; - uint32_t valueCount; - bool singleLineMode; - }; - - VmaStringBuilder& m_SB; - VmaVector< StackItem, 
VmaStlAllocator > m_Stack; - bool m_InsideString; - - void BeginValue(bool isString); - void WriteIndent(bool oneLess = false); -}; - -const char* const VmaJsonWriter::INDENT = " "; - -VmaJsonWriter::VmaJsonWriter(const VkAllocationCallbacks* pAllocationCallbacks, VmaStringBuilder& sb) : - m_SB(sb), - m_Stack(VmaStlAllocator(pAllocationCallbacks)), - m_InsideString(false) -{ -} - -VmaJsonWriter::~VmaJsonWriter() -{ - VMA_ASSERT(!m_InsideString); - VMA_ASSERT(m_Stack.empty()); -} - -void VmaJsonWriter::BeginObject(bool singleLine) -{ - VMA_ASSERT(!m_InsideString); - - BeginValue(false); - m_SB.Add('{'); - - StackItem item; - item.type = COLLECTION_TYPE_OBJECT; - item.valueCount = 0; - item.singleLineMode = singleLine; - m_Stack.push_back(item); -} - -void VmaJsonWriter::EndObject() -{ - VMA_ASSERT(!m_InsideString); - - WriteIndent(true); - m_SB.Add('}'); - - VMA_ASSERT(!m_Stack.empty() && m_Stack.back().type == COLLECTION_TYPE_OBJECT); - m_Stack.pop_back(); -} - -void VmaJsonWriter::BeginArray(bool singleLine) -{ - VMA_ASSERT(!m_InsideString); - - BeginValue(false); - m_SB.Add('['); - - StackItem item; - item.type = COLLECTION_TYPE_ARRAY; - item.valueCount = 0; - item.singleLineMode = singleLine; - m_Stack.push_back(item); -} - -void VmaJsonWriter::EndArray() -{ - VMA_ASSERT(!m_InsideString); - - WriteIndent(true); - m_SB.Add(']'); - - VMA_ASSERT(!m_Stack.empty() && m_Stack.back().type == COLLECTION_TYPE_ARRAY); - m_Stack.pop_back(); -} - -void VmaJsonWriter::WriteString(const char* pStr) -{ - BeginString(pStr); - EndString(); -} - -void VmaJsonWriter::BeginString(const char* pStr) -{ - VMA_ASSERT(!m_InsideString); - - BeginValue(true); - m_SB.Add('"'); - m_InsideString = true; - if(pStr != VMA_NULL && pStr[0] != '\0') - { - ContinueString(pStr); - } -} - -void VmaJsonWriter::ContinueString(const char* pStr) -{ - VMA_ASSERT(m_InsideString); - - const size_t strLen = strlen(pStr); - for(size_t i = 0; i < strLen; ++i) - { - char ch = pStr[i]; - if(ch == '\\') - { - 
m_SB.Add("\\\\"); - } - else if(ch == '"') - { - m_SB.Add("\\\""); - } - else if(ch >= 32) - { - m_SB.Add(ch); - } - else switch(ch) - { - case '\b': - m_SB.Add("\\b"); - break; - case '\f': - m_SB.Add("\\f"); - break; - case '\n': - m_SB.Add("\\n"); - break; - case '\r': - m_SB.Add("\\r"); - break; - case '\t': - m_SB.Add("\\t"); - break; - default: - VMA_ASSERT(0 && "Character not currently supported."); - break; - } - } -} - -void VmaJsonWriter::ContinueString(uint32_t n) -{ - VMA_ASSERT(m_InsideString); - m_SB.AddNumber(n); -} - -void VmaJsonWriter::ContinueString(uint64_t n) -{ - VMA_ASSERT(m_InsideString); - m_SB.AddNumber(n); -} - -void VmaJsonWriter::ContinueString_Pointer(const void* ptr) -{ - VMA_ASSERT(m_InsideString); - m_SB.AddPointer(ptr); -} - -void VmaJsonWriter::EndString(const char* pStr) -{ - VMA_ASSERT(m_InsideString); - if(pStr != VMA_NULL && pStr[0] != '\0') - { - ContinueString(pStr); - } - m_SB.Add('"'); - m_InsideString = false; -} - -void VmaJsonWriter::WriteNumber(uint32_t n) -{ - VMA_ASSERT(!m_InsideString); - BeginValue(false); - m_SB.AddNumber(n); -} - -void VmaJsonWriter::WriteNumber(uint64_t n) -{ - VMA_ASSERT(!m_InsideString); - BeginValue(false); - m_SB.AddNumber(n); -} - -void VmaJsonWriter::WriteBool(bool b) -{ - VMA_ASSERT(!m_InsideString); - BeginValue(false); - m_SB.Add(b ? 
"true" : "false"); -} - -void VmaJsonWriter::WriteNull() -{ - VMA_ASSERT(!m_InsideString); - BeginValue(false); - m_SB.Add("null"); -} - -void VmaJsonWriter::BeginValue(bool isString) -{ - if(!m_Stack.empty()) - { - StackItem& currItem = m_Stack.back(); - if(currItem.type == COLLECTION_TYPE_OBJECT && - currItem.valueCount % 2 == 0) - { - VMA_ASSERT(isString); - } - - if(currItem.type == COLLECTION_TYPE_OBJECT && - currItem.valueCount % 2 != 0) - { - m_SB.Add(": "); - } - else if(currItem.valueCount > 0) - { - m_SB.Add(", "); - WriteIndent(); - } - else - { - WriteIndent(); - } - ++currItem.valueCount; - } -} - -void VmaJsonWriter::WriteIndent(bool oneLess) -{ - if(!m_Stack.empty() && !m_Stack.back().singleLineMode) - { - m_SB.AddNewLine(); - - size_t count = m_Stack.size(); - if(count > 0 && oneLess) - { - --count; - } - for(size_t i = 0; i < count; ++i) - { - m_SB.Add(INDENT); - } - } -} - -#endif // #if VMA_STATS_STRING_ENABLED - -//////////////////////////////////////////////////////////////////////////////// - -void VmaAllocation_T::SetUserData(VmaAllocator hAllocator, void* pUserData) -{ - if(IsUserDataString()) - { - VMA_ASSERT(pUserData == VMA_NULL || pUserData != m_pUserData); - - FreeUserDataString(hAllocator); - - if(pUserData != VMA_NULL) - { - m_pUserData = VmaCreateStringCopy(hAllocator->GetAllocationCallbacks(), (const char*)pUserData); - } - } - else - { - m_pUserData = pUserData; - } -} - -void VmaAllocation_T::ChangeBlockAllocation( - VmaAllocator hAllocator, - VmaDeviceMemoryBlock* block, - VkDeviceSize offset) -{ - VMA_ASSERT(block != VMA_NULL); - VMA_ASSERT(m_Type == ALLOCATION_TYPE_BLOCK); - - // Move mapping reference counter from old block to new block. 
- if(block != m_BlockAllocation.m_Block) - { - uint32_t mapRefCount = m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP; - if(IsPersistentMap()) - ++mapRefCount; - m_BlockAllocation.m_Block->Unmap(hAllocator, mapRefCount); - block->Map(hAllocator, mapRefCount, VMA_NULL); - } - - m_BlockAllocation.m_Block = block; - m_BlockAllocation.m_Offset = offset; -} - -void VmaAllocation_T::ChangeOffset(VkDeviceSize newOffset) -{ - VMA_ASSERT(m_Type == ALLOCATION_TYPE_BLOCK); - m_BlockAllocation.m_Offset = newOffset; -} - -VkDeviceSize VmaAllocation_T::GetOffset() const -{ - switch(m_Type) - { - case ALLOCATION_TYPE_BLOCK: - return m_BlockAllocation.m_Offset; - case ALLOCATION_TYPE_DEDICATED: - return 0; - default: - VMA_ASSERT(0); - return 0; - } -} - -VkDeviceMemory VmaAllocation_T::GetMemory() const -{ - switch(m_Type) - { - case ALLOCATION_TYPE_BLOCK: - return m_BlockAllocation.m_Block->GetDeviceMemory(); - case ALLOCATION_TYPE_DEDICATED: - return m_DedicatedAllocation.m_hMemory; - default: - VMA_ASSERT(0); - return VK_NULL_HANDLE; - } -} - -void* VmaAllocation_T::GetMappedData() const -{ - switch(m_Type) - { - case ALLOCATION_TYPE_BLOCK: - if(m_MapCount != 0) - { - void* pBlockData = m_BlockAllocation.m_Block->GetMappedData(); - VMA_ASSERT(pBlockData != VMA_NULL); - return (char*)pBlockData + m_BlockAllocation.m_Offset; - } - else - { - return VMA_NULL; - } - break; - case ALLOCATION_TYPE_DEDICATED: - VMA_ASSERT((m_DedicatedAllocation.m_pMappedData != VMA_NULL) == (m_MapCount != 0)); - return m_DedicatedAllocation.m_pMappedData; - default: - VMA_ASSERT(0); - return VMA_NULL; - } -} - -bool VmaAllocation_T::CanBecomeLost() const -{ - switch(m_Type) - { - case ALLOCATION_TYPE_BLOCK: - return m_BlockAllocation.m_CanBecomeLost; - case ALLOCATION_TYPE_DEDICATED: - return false; - default: - VMA_ASSERT(0); - return false; - } -} - -bool VmaAllocation_T::MakeLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) -{ - VMA_ASSERT(CanBecomeLost()); - - /* - Warning: This is a 
carefully designed algorithm. - Do not modify unless you really know what you're doing :) - */ - uint32_t localLastUseFrameIndex = GetLastUseFrameIndex(); - for(;;) - { - if(localLastUseFrameIndex == VMA_FRAME_INDEX_LOST) - { - VMA_ASSERT(0); - return false; - } - else if(localLastUseFrameIndex + frameInUseCount >= currentFrameIndex) - { - return false; - } - else // Last use time earlier than current time. - { - if(CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, VMA_FRAME_INDEX_LOST)) - { - // Setting hAllocation.LastUseFrameIndex atomic to VMA_FRAME_INDEX_LOST is enough to mark it as LOST. - // Calling code just needs to unregister this allocation in owning VmaDeviceMemoryBlock. - return true; - } - } - } -} - -#if VMA_STATS_STRING_ENABLED - -// Correspond to values of enum VmaSuballocationType. -static const char* VMA_SUBALLOCATION_TYPE_NAMES[] = { - "FREE", - "UNKNOWN", - "BUFFER", - "IMAGE_UNKNOWN", - "IMAGE_LINEAR", - "IMAGE_OPTIMAL", -}; - -void VmaAllocation_T::PrintParameters(class VmaJsonWriter& json) const -{ - json.WriteString("Type"); - json.WriteString(VMA_SUBALLOCATION_TYPE_NAMES[m_SuballocationType]); - - json.WriteString("Size"); - json.WriteNumber(m_Size); - - if(m_pUserData != VMA_NULL) - { - json.WriteString("UserData"); - if(IsUserDataString()) - { - json.WriteString((const char*)m_pUserData); - } - else - { - json.BeginString(); - json.ContinueString_Pointer(m_pUserData); - json.EndString(); - } - } - - json.WriteString("CreationFrameIndex"); - json.WriteNumber(m_CreationFrameIndex); - - json.WriteString("LastUseFrameIndex"); - json.WriteNumber(GetLastUseFrameIndex()); - - if(m_BufferImageUsage != 0) - { - json.WriteString("Usage"); - json.WriteNumber(m_BufferImageUsage); - } -} - -#endif - -void VmaAllocation_T::FreeUserDataString(VmaAllocator hAllocator) -{ - VMA_ASSERT(IsUserDataString()); - VmaFreeString(hAllocator->GetAllocationCallbacks(), (char*)m_pUserData); - m_pUserData = VMA_NULL; -} - -void VmaAllocation_T::BlockAllocMap() 
-{ - VMA_ASSERT(GetType() == ALLOCATION_TYPE_BLOCK); - - if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) < 0x7F) - { - ++m_MapCount; - } - else - { - VMA_ASSERT(0 && "Allocation mapped too many times simultaneously."); - } -} - -void VmaAllocation_T::BlockAllocUnmap() -{ - VMA_ASSERT(GetType() == ALLOCATION_TYPE_BLOCK); - - if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) != 0) - { - --m_MapCount; - } - else - { - VMA_ASSERT(0 && "Unmapping allocation not previously mapped."); - } -} - -VkResult VmaAllocation_T::DedicatedAllocMap(VmaAllocator hAllocator, void** ppData) -{ - VMA_ASSERT(GetType() == ALLOCATION_TYPE_DEDICATED); - - if(m_MapCount != 0) - { - if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) < 0x7F) - { - VMA_ASSERT(m_DedicatedAllocation.m_pMappedData != VMA_NULL); - *ppData = m_DedicatedAllocation.m_pMappedData; - ++m_MapCount; - return VK_SUCCESS; - } - else - { - VMA_ASSERT(0 && "Dedicated allocation mapped too many times simultaneously."); - return VK_ERROR_MEMORY_MAP_FAILED; - } - } - else - { - VkResult result = (*hAllocator->GetVulkanFunctions().vkMapMemory)( - hAllocator->m_hDevice, - m_DedicatedAllocation.m_hMemory, - 0, // offset - VK_WHOLE_SIZE, - 0, // flags - ppData); - if(result == VK_SUCCESS) - { - m_DedicatedAllocation.m_pMappedData = *ppData; - m_MapCount = 1; - } - return result; - } -} - -void VmaAllocation_T::DedicatedAllocUnmap(VmaAllocator hAllocator) -{ - VMA_ASSERT(GetType() == ALLOCATION_TYPE_DEDICATED); - - if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) != 0) - { - --m_MapCount; - if(m_MapCount == 0) - { - m_DedicatedAllocation.m_pMappedData = VMA_NULL; - (*hAllocator->GetVulkanFunctions().vkUnmapMemory)( - hAllocator->m_hDevice, - m_DedicatedAllocation.m_hMemory); - } - } - else - { - VMA_ASSERT(0 && "Unmapping dedicated allocation not previously mapped."); - } -} - -#if VMA_STATS_STRING_ENABLED - -static void VmaPrintStatInfo(VmaJsonWriter& json, const VmaStatInfo& stat) -{ - json.BeginObject(); - - 
json.WriteString("Blocks"); - json.WriteNumber(stat.blockCount); - - json.WriteString("Allocations"); - json.WriteNumber(stat.allocationCount); - - json.WriteString("UnusedRanges"); - json.WriteNumber(stat.unusedRangeCount); - - json.WriteString("UsedBytes"); - json.WriteNumber(stat.usedBytes); - - json.WriteString("UnusedBytes"); - json.WriteNumber(stat.unusedBytes); - - if(stat.allocationCount > 1) - { - json.WriteString("AllocationSize"); - json.BeginObject(true); - json.WriteString("Min"); - json.WriteNumber(stat.allocationSizeMin); - json.WriteString("Avg"); - json.WriteNumber(stat.allocationSizeAvg); - json.WriteString("Max"); - json.WriteNumber(stat.allocationSizeMax); - json.EndObject(); - } - - if(stat.unusedRangeCount > 1) - { - json.WriteString("UnusedRangeSize"); - json.BeginObject(true); - json.WriteString("Min"); - json.WriteNumber(stat.unusedRangeSizeMin); - json.WriteString("Avg"); - json.WriteNumber(stat.unusedRangeSizeAvg); - json.WriteString("Max"); - json.WriteNumber(stat.unusedRangeSizeMax); - json.EndObject(); - } - - json.EndObject(); -} - -#endif // #if VMA_STATS_STRING_ENABLED - -struct VmaSuballocationItemSizeLess -{ - bool operator()( - const VmaSuballocationList::iterator lhs, - const VmaSuballocationList::iterator rhs) const - { - return lhs->size < rhs->size; - } - bool operator()( - const VmaSuballocationList::iterator lhs, - VkDeviceSize rhsSize) const - { - return lhs->size < rhsSize; - } -}; - - -//////////////////////////////////////////////////////////////////////////////// -// class VmaBlockMetadata - -VmaBlockMetadata::VmaBlockMetadata(VmaAllocator hAllocator) : - m_Size(0), - m_pAllocationCallbacks(hAllocator->GetAllocationCallbacks()) -{ -} - -#if VMA_STATS_STRING_ENABLED - -void VmaBlockMetadata::PrintDetailedMap_Begin(class VmaJsonWriter& json, - VkDeviceSize unusedBytes, - size_t allocationCount, - size_t unusedRangeCount) const -{ - json.BeginObject(); - - json.WriteString("TotalBytes"); - json.WriteNumber(GetSize()); - - 
json.WriteString("UnusedBytes"); - json.WriteNumber(unusedBytes); - - json.WriteString("Allocations"); - json.WriteNumber((uint64_t)allocationCount); - - json.WriteString("UnusedRanges"); - json.WriteNumber((uint64_t)unusedRangeCount); - - json.WriteString("Suballocations"); - json.BeginArray(); -} - -void VmaBlockMetadata::PrintDetailedMap_Allocation(class VmaJsonWriter& json, - VkDeviceSize offset, - VmaAllocation hAllocation) const -{ - json.BeginObject(true); - - json.WriteString("Offset"); - json.WriteNumber(offset); - - hAllocation->PrintParameters(json); - - json.EndObject(); -} - -void VmaBlockMetadata::PrintDetailedMap_UnusedRange(class VmaJsonWriter& json, - VkDeviceSize offset, - VkDeviceSize size) const -{ - json.BeginObject(true); - - json.WriteString("Offset"); - json.WriteNumber(offset); - - json.WriteString("Type"); - json.WriteString(VMA_SUBALLOCATION_TYPE_NAMES[VMA_SUBALLOCATION_TYPE_FREE]); - - json.WriteString("Size"); - json.WriteNumber(size); - - json.EndObject(); -} - -void VmaBlockMetadata::PrintDetailedMap_End(class VmaJsonWriter& json) const -{ - json.EndArray(); - json.EndObject(); -} - -#endif // #if VMA_STATS_STRING_ENABLED - -//////////////////////////////////////////////////////////////////////////////// -// class VmaBlockMetadata_Generic - -VmaBlockMetadata_Generic::VmaBlockMetadata_Generic(VmaAllocator hAllocator) : - VmaBlockMetadata(hAllocator), - m_FreeCount(0), - m_SumFreeSize(0), - m_Suballocations(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - m_FreeSuballocationsBySize(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) -{ -} - -VmaBlockMetadata_Generic::~VmaBlockMetadata_Generic() -{ -} - -void VmaBlockMetadata_Generic::Init(VkDeviceSize size) -{ - VmaBlockMetadata::Init(size); - - m_FreeCount = 1; - m_SumFreeSize = size; - - VmaSuballocation suballoc = {}; - suballoc.offset = 0; - suballoc.size = size; - suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - suballoc.hAllocation = VK_NULL_HANDLE; - - VMA_ASSERT(size > 
VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER); - m_Suballocations.push_back(suballoc); - VmaSuballocationList::iterator suballocItem = m_Suballocations.end(); - --suballocItem; - m_FreeSuballocationsBySize.push_back(suballocItem); -} - -bool VmaBlockMetadata_Generic::Validate() const -{ - VMA_VALIDATE(!m_Suballocations.empty()); - - // Expected offset of new suballocation as calculated from previous ones. - VkDeviceSize calculatedOffset = 0; - // Expected number of free suballocations as calculated from traversing their list. - uint32_t calculatedFreeCount = 0; - // Expected sum size of free suballocations as calculated from traversing their list. - VkDeviceSize calculatedSumFreeSize = 0; - // Expected number of free suballocations that should be registered in - // m_FreeSuballocationsBySize calculated from traversing their list. - size_t freeSuballocationsToRegister = 0; - // True if previous visited suballocation was free. - bool prevFree = false; - - for(VmaSuballocationList::const_iterator suballocItem = m_Suballocations.cbegin(); - suballocItem != m_Suballocations.cend(); - ++suballocItem) - { - const VmaSuballocation& subAlloc = *suballocItem; - - // Actual offset of this suballocation doesn't match expected one. - VMA_VALIDATE(subAlloc.offset == calculatedOffset); - - const bool currFree = (subAlloc.type == VMA_SUBALLOCATION_TYPE_FREE); - // Two adjacent free suballocations are invalid. They should be merged. - VMA_VALIDATE(!prevFree || !currFree); - - VMA_VALIDATE(currFree == (subAlloc.hAllocation == VK_NULL_HANDLE)); - - if(currFree) - { - calculatedSumFreeSize += subAlloc.size; - ++calculatedFreeCount; - if(subAlloc.size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - ++freeSuballocationsToRegister; - } - - // Margin required between allocations - every free space must be at least that large. 
- VMA_VALIDATE(subAlloc.size >= VMA_DEBUG_MARGIN); - } - else - { - VMA_VALIDATE(subAlloc.hAllocation->GetOffset() == subAlloc.offset); - VMA_VALIDATE(subAlloc.hAllocation->GetSize() == subAlloc.size); - - // Margin required between allocations - previous allocation must be free. - VMA_VALIDATE(VMA_DEBUG_MARGIN == 0 || prevFree); - } - - calculatedOffset += subAlloc.size; - prevFree = currFree; - } - - // Number of free suballocations registered in m_FreeSuballocationsBySize doesn't - // match expected one. - VMA_VALIDATE(m_FreeSuballocationsBySize.size() == freeSuballocationsToRegister); - - VkDeviceSize lastSize = 0; - for(size_t i = 0; i < m_FreeSuballocationsBySize.size(); ++i) - { - VmaSuballocationList::iterator suballocItem = m_FreeSuballocationsBySize[i]; - - // Only free suballocations can be registered in m_FreeSuballocationsBySize. - VMA_VALIDATE(suballocItem->type == VMA_SUBALLOCATION_TYPE_FREE); - // They must be sorted by size ascending. - VMA_VALIDATE(suballocItem->size >= lastSize); - - lastSize = suballocItem->size; - } - - // Check if totals match calculacted values. 
- VMA_VALIDATE(ValidateFreeSuballocationList()); - VMA_VALIDATE(calculatedOffset == GetSize()); - VMA_VALIDATE(calculatedSumFreeSize == m_SumFreeSize); - VMA_VALIDATE(calculatedFreeCount == m_FreeCount); - - return true; -} - -VkDeviceSize VmaBlockMetadata_Generic::GetUnusedRangeSizeMax() const -{ - if(!m_FreeSuballocationsBySize.empty()) - { - return m_FreeSuballocationsBySize.back()->size; - } - else - { - return 0; - } -} - -bool VmaBlockMetadata_Generic::IsEmpty() const -{ - return (m_Suballocations.size() == 1) && (m_FreeCount == 1); -} - -void VmaBlockMetadata_Generic::CalcAllocationStatInfo(VmaStatInfo& outInfo) const -{ - outInfo.blockCount = 1; - - const uint32_t rangeCount = (uint32_t)m_Suballocations.size(); - outInfo.allocationCount = rangeCount - m_FreeCount; - outInfo.unusedRangeCount = m_FreeCount; - - outInfo.unusedBytes = m_SumFreeSize; - outInfo.usedBytes = GetSize() - outInfo.unusedBytes; - - outInfo.allocationSizeMin = UINT64_MAX; - outInfo.allocationSizeMax = 0; - outInfo.unusedRangeSizeMin = UINT64_MAX; - outInfo.unusedRangeSizeMax = 0; - - for(VmaSuballocationList::const_iterator suballocItem = m_Suballocations.cbegin(); - suballocItem != m_Suballocations.cend(); - ++suballocItem) - { - const VmaSuballocation& suballoc = *suballocItem; - if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) - { - outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); - outInfo.allocationSizeMax = VMA_MAX(outInfo.allocationSizeMax, suballoc.size); - } - else - { - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, suballoc.size); - outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, suballoc.size); - } - } -} - -void VmaBlockMetadata_Generic::AddPoolStats(VmaPoolStats& inoutStats) const -{ - const uint32_t rangeCount = (uint32_t)m_Suballocations.size(); - - inoutStats.size += GetSize(); - inoutStats.unusedSize += m_SumFreeSize; - inoutStats.allocationCount += rangeCount - m_FreeCount; - inoutStats.unusedRangeCount 
+= m_FreeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, GetUnusedRangeSizeMax()); -} - -#if VMA_STATS_STRING_ENABLED - -void VmaBlockMetadata_Generic::PrintDetailedMap(class VmaJsonWriter& json) const -{ - PrintDetailedMap_Begin(json, - m_SumFreeSize, // unusedBytes - m_Suballocations.size() - (size_t)m_FreeCount, // allocationCount - m_FreeCount); // unusedRangeCount - - size_t i = 0; - for(VmaSuballocationList::const_iterator suballocItem = m_Suballocations.cbegin(); - suballocItem != m_Suballocations.cend(); - ++suballocItem, ++i) - { - if(suballocItem->type == VMA_SUBALLOCATION_TYPE_FREE) - { - PrintDetailedMap_UnusedRange(json, suballocItem->offset, suballocItem->size); - } - else - { - PrintDetailedMap_Allocation(json, suballocItem->offset, suballocItem->hAllocation); - } - } - - PrintDetailedMap_End(json); -} - -#endif // #if VMA_STATS_STRING_ENABLED - -bool VmaBlockMetadata_Generic::CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest) -{ - VMA_ASSERT(allocSize > 0); - VMA_ASSERT(!upperAddress); - VMA_ASSERT(allocType != VMA_SUBALLOCATION_TYPE_FREE); - VMA_ASSERT(pAllocationRequest != VMA_NULL); - VMA_HEAVY_ASSERT(Validate()); - - pAllocationRequest->type = VmaAllocationRequestType::Normal; - - // There is not enough total free space in this block to fullfill the request: Early return. - if(canMakeOtherLost == false && - m_SumFreeSize < allocSize + 2 * VMA_DEBUG_MARGIN) - { - return false; - } - - // New algorithm, efficiently searching freeSuballocationsBySize. 
- const size_t freeSuballocCount = m_FreeSuballocationsBySize.size(); - if(freeSuballocCount > 0) - { - if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT) - { - // Find first free suballocation with size not less than allocSize + 2 * VMA_DEBUG_MARGIN. - VmaSuballocationList::iterator* const it = VmaBinaryFindFirstNotLess( - m_FreeSuballocationsBySize.data(), - m_FreeSuballocationsBySize.data() + freeSuballocCount, - allocSize + 2 * VMA_DEBUG_MARGIN, - VmaSuballocationItemSizeLess()); - size_t index = it - m_FreeSuballocationsBySize.data(); - for(; index < freeSuballocCount; ++index) - { - if(CheckAllocation( - currentFrameIndex, - frameInUseCount, - bufferImageGranularity, - allocSize, - allocAlignment, - allocType, - m_FreeSuballocationsBySize[index], - false, // canMakeOtherLost - &pAllocationRequest->offset, - &pAllocationRequest->itemsToMakeLostCount, - &pAllocationRequest->sumFreeSize, - &pAllocationRequest->sumItemSize)) - { - pAllocationRequest->item = m_FreeSuballocationsBySize[index]; - return true; - } - } - } - else if(strategy == VMA_ALLOCATION_INTERNAL_STRATEGY_MIN_OFFSET) - { - for(VmaSuballocationList::iterator it = m_Suballocations.begin(); - it != m_Suballocations.end(); - ++it) - { - if(it->type == VMA_SUBALLOCATION_TYPE_FREE && CheckAllocation( - currentFrameIndex, - frameInUseCount, - bufferImageGranularity, - allocSize, - allocAlignment, - allocType, - it, - false, // canMakeOtherLost - &pAllocationRequest->offset, - &pAllocationRequest->itemsToMakeLostCount, - &pAllocationRequest->sumFreeSize, - &pAllocationRequest->sumItemSize)) - { - pAllocationRequest->item = it; - return true; - } - } - } - else // WORST_FIT, FIRST_FIT - { - // Search staring from biggest suballocations. 
- for(size_t index = freeSuballocCount; index--; ) - { - if(CheckAllocation( - currentFrameIndex, - frameInUseCount, - bufferImageGranularity, - allocSize, - allocAlignment, - allocType, - m_FreeSuballocationsBySize[index], - false, // canMakeOtherLost - &pAllocationRequest->offset, - &pAllocationRequest->itemsToMakeLostCount, - &pAllocationRequest->sumFreeSize, - &pAllocationRequest->sumItemSize)) - { - pAllocationRequest->item = m_FreeSuballocationsBySize[index]; - return true; - } - } - } - } - - if(canMakeOtherLost) - { - // Brute-force algorithm. TODO: Come up with something better. - - bool found = false; - VmaAllocationRequest tmpAllocRequest = {}; - tmpAllocRequest.type = VmaAllocationRequestType::Normal; - for(VmaSuballocationList::iterator suballocIt = m_Suballocations.begin(); - suballocIt != m_Suballocations.end(); - ++suballocIt) - { - if(suballocIt->type == VMA_SUBALLOCATION_TYPE_FREE || - suballocIt->hAllocation->CanBecomeLost()) - { - if(CheckAllocation( - currentFrameIndex, - frameInUseCount, - bufferImageGranularity, - allocSize, - allocAlignment, - allocType, - suballocIt, - canMakeOtherLost, - &tmpAllocRequest.offset, - &tmpAllocRequest.itemsToMakeLostCount, - &tmpAllocRequest.sumFreeSize, - &tmpAllocRequest.sumItemSize)) - { - if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT) - { - *pAllocationRequest = tmpAllocRequest; - pAllocationRequest->item = suballocIt; - break; - } - if(!found || tmpAllocRequest.CalcCost() < pAllocationRequest->CalcCost()) - { - *pAllocationRequest = tmpAllocRequest; - pAllocationRequest->item = suballocIt; - found = true; - } - } - } - } - - return found; - } - - return false; -} - -bool VmaBlockMetadata_Generic::MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest) -{ - VMA_ASSERT(pAllocationRequest && pAllocationRequest->type == VmaAllocationRequestType::Normal); - - while(pAllocationRequest->itemsToMakeLostCount > 0) - { - 
if(pAllocationRequest->item->type == VMA_SUBALLOCATION_TYPE_FREE) - { - ++pAllocationRequest->item; - } - VMA_ASSERT(pAllocationRequest->item != m_Suballocations.end()); - VMA_ASSERT(pAllocationRequest->item->hAllocation != VK_NULL_HANDLE); - VMA_ASSERT(pAllocationRequest->item->hAllocation->CanBecomeLost()); - if(pAllocationRequest->item->hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) - { - pAllocationRequest->item = FreeSuballocation(pAllocationRequest->item); - --pAllocationRequest->itemsToMakeLostCount; - } - else - { - return false; - } - } - - VMA_HEAVY_ASSERT(Validate()); - VMA_ASSERT(pAllocationRequest->item != m_Suballocations.end()); - VMA_ASSERT(pAllocationRequest->item->type == VMA_SUBALLOCATION_TYPE_FREE); - - return true; -} - -uint32_t VmaBlockMetadata_Generic::MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) -{ - uint32_t lostAllocationCount = 0; - for(VmaSuballocationList::iterator it = m_Suballocations.begin(); - it != m_Suballocations.end(); - ++it) - { - if(it->type != VMA_SUBALLOCATION_TYPE_FREE && - it->hAllocation->CanBecomeLost() && - it->hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) - { - it = FreeSuballocation(it); - ++lostAllocationCount; - } - } - return lostAllocationCount; -} - -VkResult VmaBlockMetadata_Generic::CheckCorruption(const void* pBlockData) -{ - for(VmaSuballocationList::iterator it = m_Suballocations.begin(); - it != m_Suballocations.end(); - ++it) - { - if(it->type != VMA_SUBALLOCATION_TYPE_FREE) - { - if(!VmaValidateMagicValue(pBlockData, it->offset - VMA_DEBUG_MARGIN)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE VALIDATED ALLOCATION!"); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - if(!VmaValidateMagicValue(pBlockData, it->offset + it->size)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER VALIDATED ALLOCATION!"); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - } - } - - return VK_SUCCESS; -} - -void VmaBlockMetadata_Generic::Alloc( - const 
VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation) -{ - VMA_ASSERT(request.type == VmaAllocationRequestType::Normal); - VMA_ASSERT(request.item != m_Suballocations.end()); - VmaSuballocation& suballoc = *request.item; - // Given suballocation is a free block. - VMA_ASSERT(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); - // Given offset is inside this suballocation. - VMA_ASSERT(request.offset >= suballoc.offset); - const VkDeviceSize paddingBegin = request.offset - suballoc.offset; - VMA_ASSERT(suballoc.size >= paddingBegin + allocSize); - const VkDeviceSize paddingEnd = suballoc.size - paddingBegin - allocSize; - - // Unregister this free suballocation from m_FreeSuballocationsBySize and update - // it to become used. - UnregisterFreeSuballocation(request.item); - - suballoc.offset = request.offset; - suballoc.size = allocSize; - suballoc.type = type; - suballoc.hAllocation = hAllocation; - - // If there are any free bytes remaining at the end, insert new free suballocation after current one. - if(paddingEnd) - { - VmaSuballocation paddingSuballoc = {}; - paddingSuballoc.offset = request.offset + allocSize; - paddingSuballoc.size = paddingEnd; - paddingSuballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - VmaSuballocationList::iterator next = request.item; - ++next; - const VmaSuballocationList::iterator paddingEndItem = - m_Suballocations.insert(next, paddingSuballoc); - RegisterFreeSuballocation(paddingEndItem); - } - - // If there are any free bytes remaining at the beginning, insert new free suballocation before current one. 
- if(paddingBegin) - { - VmaSuballocation paddingSuballoc = {}; - paddingSuballoc.offset = request.offset - paddingBegin; - paddingSuballoc.size = paddingBegin; - paddingSuballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - const VmaSuballocationList::iterator paddingBeginItem = - m_Suballocations.insert(request.item, paddingSuballoc); - RegisterFreeSuballocation(paddingBeginItem); - } - - // Update totals. - m_FreeCount = m_FreeCount - 1; - if(paddingBegin > 0) - { - ++m_FreeCount; - } - if(paddingEnd > 0) - { - ++m_FreeCount; - } - m_SumFreeSize -= allocSize; -} - -void VmaBlockMetadata_Generic::Free(const VmaAllocation allocation) -{ - for(VmaSuballocationList::iterator suballocItem = m_Suballocations.begin(); - suballocItem != m_Suballocations.end(); - ++suballocItem) - { - VmaSuballocation& suballoc = *suballocItem; - if(suballoc.hAllocation == allocation) - { - FreeSuballocation(suballocItem); - VMA_HEAVY_ASSERT(Validate()); - return; - } - } - VMA_ASSERT(0 && "Not found!"); -} - -void VmaBlockMetadata_Generic::FreeAtOffset(VkDeviceSize offset) -{ - for(VmaSuballocationList::iterator suballocItem = m_Suballocations.begin(); - suballocItem != m_Suballocations.end(); - ++suballocItem) - { - VmaSuballocation& suballoc = *suballocItem; - if(suballoc.offset == offset) - { - FreeSuballocation(suballocItem); - return; - } - } - VMA_ASSERT(0 && "Not found!"); -} - -bool VmaBlockMetadata_Generic::ValidateFreeSuballocationList() const -{ - VkDeviceSize lastSize = 0; - for(size_t i = 0, count = m_FreeSuballocationsBySize.size(); i < count; ++i) - { - const VmaSuballocationList::iterator it = m_FreeSuballocationsBySize[i]; - - VMA_VALIDATE(it->type == VMA_SUBALLOCATION_TYPE_FREE); - VMA_VALIDATE(it->size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER); - VMA_VALIDATE(it->size >= lastSize); - lastSize = it->size; - } - return true; -} - -bool VmaBlockMetadata_Generic::CheckAllocation( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize 
bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - VmaSuballocationType allocType, - VmaSuballocationList::const_iterator suballocItem, - bool canMakeOtherLost, - VkDeviceSize* pOffset, - size_t* itemsToMakeLostCount, - VkDeviceSize* pSumFreeSize, - VkDeviceSize* pSumItemSize) const -{ - VMA_ASSERT(allocSize > 0); - VMA_ASSERT(allocType != VMA_SUBALLOCATION_TYPE_FREE); - VMA_ASSERT(suballocItem != m_Suballocations.cend()); - VMA_ASSERT(pOffset != VMA_NULL); - - *itemsToMakeLostCount = 0; - *pSumFreeSize = 0; - *pSumItemSize = 0; - - if(canMakeOtherLost) - { - if(suballocItem->type == VMA_SUBALLOCATION_TYPE_FREE) - { - *pSumFreeSize = suballocItem->size; - } - else - { - if(suballocItem->hAllocation->CanBecomeLost() && - suballocItem->hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) - { - ++*itemsToMakeLostCount; - *pSumItemSize = suballocItem->size; - } - else - { - return false; - } - } - - // Remaining size is too small for this request: Early return. - if(GetSize() - suballocItem->offset < allocSize) - { - return false; - } - - // Start from offset equal to beginning of this suballocation. - *pOffset = suballocItem->offset; - - // Apply VMA_DEBUG_MARGIN at the beginning. - if(VMA_DEBUG_MARGIN > 0) - { - *pOffset += VMA_DEBUG_MARGIN; - } - - // Apply alignment. - *pOffset = VmaAlignUp(*pOffset, allocAlignment); - - // Check previous suballocations for BufferImageGranularity conflicts. - // Make bigger alignment if necessary. 
- if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment) - { - bool bufferImageGranularityConflict = false; - VmaSuballocationList::const_iterator prevSuballocItem = suballocItem; - while(prevSuballocItem != m_Suballocations.cbegin()) - { - --prevSuballocItem; - const VmaSuballocation& prevSuballoc = *prevSuballocItem; - if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, *pOffset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) - { - bufferImageGranularityConflict = true; - break; - } - } - else - // Already on previous page. - break; - } - if(bufferImageGranularityConflict) - { - *pOffset = VmaAlignUp(*pOffset, bufferImageGranularity); - } - } - - // Now that we have final *pOffset, check if we are past suballocItem. - // If yes, return false - this function should be called for another suballocItem as starting point. - if(*pOffset >= suballocItem->offset + suballocItem->size) - { - return false; - } - - // Calculate padding at the beginning based on current offset. - const VkDeviceSize paddingBegin = *pOffset - suballocItem->offset; - - // Calculate required margin at the end. - const VkDeviceSize requiredEndMargin = VMA_DEBUG_MARGIN; - - const VkDeviceSize totalSize = paddingBegin + allocSize + requiredEndMargin; - // Another early return check. - if(suballocItem->offset + totalSize > GetSize()) - { - return false; - } - - // Advance lastSuballocItem until desired size is reached. - // Update itemsToMakeLostCount. 
- VmaSuballocationList::const_iterator lastSuballocItem = suballocItem; - if(totalSize > suballocItem->size) - { - VkDeviceSize remainingSize = totalSize - suballocItem->size; - while(remainingSize > 0) - { - ++lastSuballocItem; - if(lastSuballocItem == m_Suballocations.cend()) - { - return false; - } - if(lastSuballocItem->type == VMA_SUBALLOCATION_TYPE_FREE) - { - *pSumFreeSize += lastSuballocItem->size; - } - else - { - VMA_ASSERT(lastSuballocItem->hAllocation != VK_NULL_HANDLE); - if(lastSuballocItem->hAllocation->CanBecomeLost() && - lastSuballocItem->hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) - { - ++*itemsToMakeLostCount; - *pSumItemSize += lastSuballocItem->size; - } - else - { - return false; - } - } - remainingSize = (lastSuballocItem->size < remainingSize) ? - remainingSize - lastSuballocItem->size : 0; - } - } - - // Check next suballocations for BufferImageGranularity conflicts. - // If conflict exists, we must mark more allocations lost or fail. - if(allocSize % bufferImageGranularity || *pOffset % bufferImageGranularity) - { - VmaSuballocationList::const_iterator nextSuballocItem = lastSuballocItem; - ++nextSuballocItem; - while(nextSuballocItem != m_Suballocations.cend()) - { - const VmaSuballocation& nextSuballoc = *nextSuballocItem; - if(VmaBlocksOnSamePage(*pOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) - { - VMA_ASSERT(nextSuballoc.hAllocation != VK_NULL_HANDLE); - if(nextSuballoc.hAllocation->CanBecomeLost() && - nextSuballoc.hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) - { - ++*itemsToMakeLostCount; - } - else - { - return false; - } - } - } - else - { - // Already on next page. 
- break; - } - ++nextSuballocItem; - } - } - } - else - { - const VmaSuballocation& suballoc = *suballocItem; - VMA_ASSERT(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); - - *pSumFreeSize = suballoc.size; - - // Size of this suballocation is too small for this request: Early return. - if(suballoc.size < allocSize) - { - return false; - } - - // Start from offset equal to beginning of this suballocation. - *pOffset = suballoc.offset; - - // Apply VMA_DEBUG_MARGIN at the beginning. - if(VMA_DEBUG_MARGIN > 0) - { - *pOffset += VMA_DEBUG_MARGIN; - } - - // Apply alignment. - *pOffset = VmaAlignUp(*pOffset, allocAlignment); - - // Check previous suballocations for BufferImageGranularity conflicts. - // Make bigger alignment if necessary. - if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment) - { - bool bufferImageGranularityConflict = false; - VmaSuballocationList::const_iterator prevSuballocItem = suballocItem; - while(prevSuballocItem != m_Suballocations.cbegin()) - { - --prevSuballocItem; - const VmaSuballocation& prevSuballoc = *prevSuballocItem; - if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, *pOffset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) - { - bufferImageGranularityConflict = true; - break; - } - } - else - // Already on previous page. - break; - } - if(bufferImageGranularityConflict) - { - *pOffset = VmaAlignUp(*pOffset, bufferImageGranularity); - } - } - - // Calculate padding at the beginning based on current offset. - const VkDeviceSize paddingBegin = *pOffset - suballoc.offset; - - // Calculate required margin at the end. - const VkDeviceSize requiredEndMargin = VMA_DEBUG_MARGIN; - - // Fail if requested size plus margin before and after is bigger than size of this suballocation. - if(paddingBegin + allocSize + requiredEndMargin > suballoc.size) - { - return false; - } - - // Check next suballocations for BufferImageGranularity conflicts. 
- // If conflict exists, allocation cannot be made here. - if(allocSize % bufferImageGranularity || *pOffset % bufferImageGranularity) - { - VmaSuballocationList::const_iterator nextSuballocItem = suballocItem; - ++nextSuballocItem; - while(nextSuballocItem != m_Suballocations.cend()) - { - const VmaSuballocation& nextSuballoc = *nextSuballocItem; - if(VmaBlocksOnSamePage(*pOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) - { - return false; - } - } - else - { - // Already on next page. - break; - } - ++nextSuballocItem; - } - } - } - - // All tests passed: Success. pOffset is already filled. - return true; -} - -void VmaBlockMetadata_Generic::MergeFreeWithNext(VmaSuballocationList::iterator item) -{ - VMA_ASSERT(item != m_Suballocations.end()); - VMA_ASSERT(item->type == VMA_SUBALLOCATION_TYPE_FREE); - - VmaSuballocationList::iterator nextItem = item; - ++nextItem; - VMA_ASSERT(nextItem != m_Suballocations.end()); - VMA_ASSERT(nextItem->type == VMA_SUBALLOCATION_TYPE_FREE); - - item->size += nextItem->size; - --m_FreeCount; - m_Suballocations.erase(nextItem); -} - -VmaSuballocationList::iterator VmaBlockMetadata_Generic::FreeSuballocation(VmaSuballocationList::iterator suballocItem) -{ - // Change this suballocation to be marked as free. - VmaSuballocation& suballoc = *suballocItem; - suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - suballoc.hAllocation = VK_NULL_HANDLE; - - // Update totals. - ++m_FreeCount; - m_SumFreeSize += suballoc.size; - - // Merge with previous and/or next suballocation if it's also free. 
- bool mergeWithNext = false; - bool mergeWithPrev = false; - - VmaSuballocationList::iterator nextItem = suballocItem; - ++nextItem; - if((nextItem != m_Suballocations.end()) && (nextItem->type == VMA_SUBALLOCATION_TYPE_FREE)) - { - mergeWithNext = true; - } - - VmaSuballocationList::iterator prevItem = suballocItem; - if(suballocItem != m_Suballocations.begin()) - { - --prevItem; - if(prevItem->type == VMA_SUBALLOCATION_TYPE_FREE) - { - mergeWithPrev = true; - } - } - - if(mergeWithNext) - { - UnregisterFreeSuballocation(nextItem); - MergeFreeWithNext(suballocItem); - } - - if(mergeWithPrev) - { - UnregisterFreeSuballocation(prevItem); - MergeFreeWithNext(prevItem); - RegisterFreeSuballocation(prevItem); - return prevItem; - } - else - { - RegisterFreeSuballocation(suballocItem); - return suballocItem; - } -} - -void VmaBlockMetadata_Generic::RegisterFreeSuballocation(VmaSuballocationList::iterator item) -{ - VMA_ASSERT(item->type == VMA_SUBALLOCATION_TYPE_FREE); - VMA_ASSERT(item->size > 0); - - // You may want to enable this validation at the beginning or at the end of - // this function, depending on what do you want to check. - VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); - - if(item->size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - if(m_FreeSuballocationsBySize.empty()) - { - m_FreeSuballocationsBySize.push_back(item); - } - else - { - VmaVectorInsertSorted(m_FreeSuballocationsBySize, item); - } - } - - //VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); -} - - -void VmaBlockMetadata_Generic::UnregisterFreeSuballocation(VmaSuballocationList::iterator item) -{ - VMA_ASSERT(item->type == VMA_SUBALLOCATION_TYPE_FREE); - VMA_ASSERT(item->size > 0); - - // You may want to enable this validation at the beginning or at the end of - // this function, depending on what do you want to check. 
- VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); - - if(item->size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - VmaSuballocationList::iterator* const it = VmaBinaryFindFirstNotLess( - m_FreeSuballocationsBySize.data(), - m_FreeSuballocationsBySize.data() + m_FreeSuballocationsBySize.size(), - item, - VmaSuballocationItemSizeLess()); - for(size_t index = it - m_FreeSuballocationsBySize.data(); - index < m_FreeSuballocationsBySize.size(); - ++index) - { - if(m_FreeSuballocationsBySize[index] == item) - { - VmaVectorRemove(m_FreeSuballocationsBySize, index); - return; - } - VMA_ASSERT((m_FreeSuballocationsBySize[index]->size == item->size) && "Not found."); - } - VMA_ASSERT(0 && "Not found."); - } - - //VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); -} - -bool VmaBlockMetadata_Generic::IsBufferImageGranularityConflictPossible( - VkDeviceSize bufferImageGranularity, - VmaSuballocationType& inOutPrevSuballocType) const -{ - if(bufferImageGranularity == 1 || IsEmpty()) - { - return false; - } - - VkDeviceSize minAlignment = VK_WHOLE_SIZE; - bool typeConflictFound = false; - for(VmaSuballocationList::const_iterator it = m_Suballocations.cbegin(); - it != m_Suballocations.cend(); - ++it) - { - const VmaSuballocationType suballocType = it->type; - if(suballocType != VMA_SUBALLOCATION_TYPE_FREE) - { - minAlignment = VMA_MIN(minAlignment, it->hAllocation->GetAlignment()); - if(VmaIsBufferImageGranularityConflict(inOutPrevSuballocType, suballocType)) - { - typeConflictFound = true; - } - inOutPrevSuballocType = suballocType; - } - } - - return typeConflictFound || minAlignment >= bufferImageGranularity; -} - -//////////////////////////////////////////////////////////////////////////////// -// class VmaBlockMetadata_Linear - -VmaBlockMetadata_Linear::VmaBlockMetadata_Linear(VmaAllocator hAllocator) : - VmaBlockMetadata(hAllocator), - m_SumFreeSize(0), - m_Suballocations0(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - 
m_Suballocations1(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - m_1stVectorIndex(0), - m_2ndVectorMode(SECOND_VECTOR_EMPTY), - m_1stNullItemsBeginCount(0), - m_1stNullItemsMiddleCount(0), - m_2ndNullItemsCount(0) -{ -} - -VmaBlockMetadata_Linear::~VmaBlockMetadata_Linear() -{ -} - -void VmaBlockMetadata_Linear::Init(VkDeviceSize size) -{ - VmaBlockMetadata::Init(size); - m_SumFreeSize = size; -} - -bool VmaBlockMetadata_Linear::Validate() const -{ - const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - - VMA_VALIDATE(suballocations2nd.empty() == (m_2ndVectorMode == SECOND_VECTOR_EMPTY)); - VMA_VALIDATE(!suballocations1st.empty() || - suballocations2nd.empty() || - m_2ndVectorMode != SECOND_VECTOR_RING_BUFFER); - - if(!suballocations1st.empty()) - { - // Null item at the beginning should be accounted into m_1stNullItemsBeginCount. - VMA_VALIDATE(suballocations1st[m_1stNullItemsBeginCount].hAllocation != VK_NULL_HANDLE); - // Null item at the end should be just pop_back(). - VMA_VALIDATE(suballocations1st.back().hAllocation != VK_NULL_HANDLE); - } - if(!suballocations2nd.empty()) - { - // Null item at the end should be just pop_back(). 
- VMA_VALIDATE(suballocations2nd.back().hAllocation != VK_NULL_HANDLE); - } - - VMA_VALIDATE(m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount <= suballocations1st.size()); - VMA_VALIDATE(m_2ndNullItemsCount <= suballocations2nd.size()); - - VkDeviceSize sumUsedSize = 0; - const size_t suballoc1stCount = suballocations1st.size(); - VkDeviceSize offset = VMA_DEBUG_MARGIN; - - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - const size_t suballoc2ndCount = suballocations2nd.size(); - size_t nullItem2ndCount = 0; - for(size_t i = 0; i < suballoc2ndCount; ++i) - { - const VmaSuballocation& suballoc = suballocations2nd[i]; - const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); - - VMA_VALIDATE(currFree == (suballoc.hAllocation == VK_NULL_HANDLE)); - VMA_VALIDATE(suballoc.offset >= offset); - - if(!currFree) - { - VMA_VALIDATE(suballoc.hAllocation->GetOffset() == suballoc.offset); - VMA_VALIDATE(suballoc.hAllocation->GetSize() == suballoc.size); - sumUsedSize += suballoc.size; - } - else - { - ++nullItem2ndCount; - } - - offset = suballoc.offset + suballoc.size + VMA_DEBUG_MARGIN; - } - - VMA_VALIDATE(nullItem2ndCount == m_2ndNullItemsCount); - } - - for(size_t i = 0; i < m_1stNullItemsBeginCount; ++i) - { - const VmaSuballocation& suballoc = suballocations1st[i]; - VMA_VALIDATE(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE && - suballoc.hAllocation == VK_NULL_HANDLE); - } - - size_t nullItem1stCount = m_1stNullItemsBeginCount; - - for(size_t i = m_1stNullItemsBeginCount; i < suballoc1stCount; ++i) - { - const VmaSuballocation& suballoc = suballocations1st[i]; - const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); - - VMA_VALIDATE(currFree == (suballoc.hAllocation == VK_NULL_HANDLE)); - VMA_VALIDATE(suballoc.offset >= offset); - VMA_VALIDATE(i >= m_1stNullItemsBeginCount || currFree); - - if(!currFree) - { - VMA_VALIDATE(suballoc.hAllocation->GetOffset() == suballoc.offset); - VMA_VALIDATE(suballoc.hAllocation->GetSize() == 
suballoc.size); - sumUsedSize += suballoc.size; - } - else - { - ++nullItem1stCount; - } - - offset = suballoc.offset + suballoc.size + VMA_DEBUG_MARGIN; - } - VMA_VALIDATE(nullItem1stCount == m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount); - - if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - const size_t suballoc2ndCount = suballocations2nd.size(); - size_t nullItem2ndCount = 0; - for(size_t i = suballoc2ndCount; i--; ) - { - const VmaSuballocation& suballoc = suballocations2nd[i]; - const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); - - VMA_VALIDATE(currFree == (suballoc.hAllocation == VK_NULL_HANDLE)); - VMA_VALIDATE(suballoc.offset >= offset); - - if(!currFree) - { - VMA_VALIDATE(suballoc.hAllocation->GetOffset() == suballoc.offset); - VMA_VALIDATE(suballoc.hAllocation->GetSize() == suballoc.size); - sumUsedSize += suballoc.size; - } - else - { - ++nullItem2ndCount; - } - - offset = suballoc.offset + suballoc.size + VMA_DEBUG_MARGIN; - } - - VMA_VALIDATE(nullItem2ndCount == m_2ndNullItemsCount); - } - - VMA_VALIDATE(offset <= GetSize()); - VMA_VALIDATE(m_SumFreeSize == GetSize() - sumUsedSize); - - return true; -} - -size_t VmaBlockMetadata_Linear::GetAllocationCount() const -{ - return AccessSuballocations1st().size() - (m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount) + - AccessSuballocations2nd().size() - m_2ndNullItemsCount; -} - -VkDeviceSize VmaBlockMetadata_Linear::GetUnusedRangeSizeMax() const -{ - const VkDeviceSize size = GetSize(); - - /* - We don't consider gaps inside allocation vectors with freed allocations because - they are not suitable for reuse in linear allocator. We consider only space that - is available for new allocations. 
- */ - if(IsEmpty()) - { - return size; - } - - const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - - switch(m_2ndVectorMode) - { - case SECOND_VECTOR_EMPTY: - /* - Available space is after end of 1st, as well as before beginning of 1st (which - whould make it a ring buffer). - */ - { - const size_t suballocations1stCount = suballocations1st.size(); - VMA_ASSERT(suballocations1stCount > m_1stNullItemsBeginCount); - const VmaSuballocation& firstSuballoc = suballocations1st[m_1stNullItemsBeginCount]; - const VmaSuballocation& lastSuballoc = suballocations1st[suballocations1stCount - 1]; - return VMA_MAX( - firstSuballoc.offset, - size - (lastSuballoc.offset + lastSuballoc.size)); - } - break; - - case SECOND_VECTOR_RING_BUFFER: - /* - Available space is only between end of 2nd and beginning of 1st. - */ - { - const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - const VmaSuballocation& lastSuballoc2nd = suballocations2nd.back(); - const VmaSuballocation& firstSuballoc1st = suballocations1st[m_1stNullItemsBeginCount]; - return firstSuballoc1st.offset - (lastSuballoc2nd.offset + lastSuballoc2nd.size); - } - break; - - case SECOND_VECTOR_DOUBLE_STACK: - /* - Available space is only between end of 1st and top of 2nd. 
- */ - { - const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - const VmaSuballocation& topSuballoc2nd = suballocations2nd.back(); - const VmaSuballocation& lastSuballoc1st = suballocations1st.back(); - return topSuballoc2nd.offset - (lastSuballoc1st.offset + lastSuballoc1st.size); - } - break; - - default: - VMA_ASSERT(0); - return 0; - } -} - -void VmaBlockMetadata_Linear::CalcAllocationStatInfo(VmaStatInfo& outInfo) const -{ - const VkDeviceSize size = GetSize(); - const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - const size_t suballoc1stCount = suballocations1st.size(); - const size_t suballoc2ndCount = suballocations2nd.size(); - - outInfo.blockCount = 1; - outInfo.allocationCount = (uint32_t)GetAllocationCount(); - outInfo.unusedRangeCount = 0; - outInfo.usedBytes = 0; - outInfo.allocationSizeMin = UINT64_MAX; - outInfo.allocationSizeMax = 0; - outInfo.unusedRangeSizeMin = UINT64_MAX; - outInfo.unusedRangeSizeMax = 0; - - VkDeviceSize lastOffset = 0; - - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; - size_t nextAlloc2ndIndex = 0; - while(lastOffset < freeSpace2ndTo1stEnd) - { - // Find next non-null allocation or move nextAllocIndex to the end. - while(nextAlloc2ndIndex < suballoc2ndCount && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex < suballoc2ndCount) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. 
- const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); - outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - outInfo.usedBytes += suballoc.size; - outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); - outInfo.allocationSizeMax = VMA_MIN(outInfo.allocationSizeMax, suballoc.size); - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc2ndIndex; - } - // We are at the end. - else - { - // There is free space from lastOffset to freeSpace2ndTo1stEnd. - if(lastOffset < freeSpace2ndTo1stEnd) - { - const VkDeviceSize unusedRangeSize = freeSpace2ndTo1stEnd - lastOffset; - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); - outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); - } - - // End of loop. - lastOffset = freeSpace2ndTo1stEnd; - } - } - } - - size_t nextAlloc1stIndex = m_1stNullItemsBeginCount; - const VkDeviceSize freeSpace1stTo2ndEnd = - m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? suballocations2nd.back().offset : size; - while(lastOffset < freeSpace1stTo2ndEnd) - { - // Find next non-null allocation or move nextAllocIndex to the end. - while(nextAlloc1stIndex < suballoc1stCount && - suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc1stIndex; - } - - // Found non-null allocation. - if(nextAlloc1stIndex < suballoc1stCount) - { - const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; - - // 1. Process free space before this allocation. 
- if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); - outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - outInfo.usedBytes += suballoc.size; - outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); - outInfo.allocationSizeMax = VMA_MIN(outInfo.allocationSizeMax, suballoc.size); - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc1stIndex; - } - // We are at the end. - else - { - // There is free space from lastOffset to freeSpace1stTo2ndEnd. - if(lastOffset < freeSpace1stTo2ndEnd) - { - const VkDeviceSize unusedRangeSize = freeSpace1stTo2ndEnd - lastOffset; - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); - outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); - } - - // End of loop. - lastOffset = freeSpace1stTo2ndEnd; - } - } - - if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; - while(lastOffset < size) - { - // Find next non-null allocation or move nextAllocIndex to the end. - while(nextAlloc2ndIndex != SIZE_MAX && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - --nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex != SIZE_MAX) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. 
- if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); - outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - outInfo.usedBytes += suballoc.size; - outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); - outInfo.allocationSizeMax = VMA_MIN(outInfo.allocationSizeMax, suballoc.size); - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - --nextAlloc2ndIndex; - } - // We are at the end. - else - { - // There is free space from lastOffset to size. - if(lastOffset < size) - { - const VkDeviceSize unusedRangeSize = size - lastOffset; - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); - outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); - } - - // End of loop. 
- lastOffset = size; - } - } - } - - outInfo.unusedBytes = size - outInfo.usedBytes; -} - -void VmaBlockMetadata_Linear::AddPoolStats(VmaPoolStats& inoutStats) const -{ - const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - const VkDeviceSize size = GetSize(); - const size_t suballoc1stCount = suballocations1st.size(); - const size_t suballoc2ndCount = suballocations2nd.size(); - - inoutStats.size += size; - - VkDeviceSize lastOffset = 0; - - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; - size_t nextAlloc2ndIndex = m_1stNullItemsBeginCount; - while(lastOffset < freeSpace2ndTo1stEnd) - { - // Find next non-null allocation or move nextAlloc2ndIndex to the end. - while(nextAlloc2ndIndex < suballoc2ndCount && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex < suballoc2ndCount) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - inoutStats.unusedSize += unusedRangeSize; - ++inoutStats.unusedRangeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - ++inoutStats.allocationCount; - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc2ndIndex; - } - // We are at the end. - else - { - if(lastOffset < freeSpace2ndTo1stEnd) - { - // There is free space from lastOffset to freeSpace2ndTo1stEnd. 
- const VkDeviceSize unusedRangeSize = freeSpace2ndTo1stEnd - lastOffset; - inoutStats.unusedSize += unusedRangeSize; - ++inoutStats.unusedRangeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); - } - - // End of loop. - lastOffset = freeSpace2ndTo1stEnd; - } - } - } - - size_t nextAlloc1stIndex = m_1stNullItemsBeginCount; - const VkDeviceSize freeSpace1stTo2ndEnd = - m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? suballocations2nd.back().offset : size; - while(lastOffset < freeSpace1stTo2ndEnd) - { - // Find next non-null allocation or move nextAllocIndex to the end. - while(nextAlloc1stIndex < suballoc1stCount && - suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc1stIndex; - } - - // Found non-null allocation. - if(nextAlloc1stIndex < suballoc1stCount) - { - const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - inoutStats.unusedSize += unusedRangeSize; - ++inoutStats.unusedRangeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - ++inoutStats.allocationCount; - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc1stIndex; - } - // We are at the end. - else - { - if(lastOffset < freeSpace1stTo2ndEnd) - { - // There is free space from lastOffset to freeSpace1stTo2ndEnd. - const VkDeviceSize unusedRangeSize = freeSpace1stTo2ndEnd - lastOffset; - inoutStats.unusedSize += unusedRangeSize; - ++inoutStats.unusedRangeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); - } - - // End of loop. 
- lastOffset = freeSpace1stTo2ndEnd; - } - } - - if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; - while(lastOffset < size) - { - // Find next non-null allocation or move nextAlloc2ndIndex to the end. - while(nextAlloc2ndIndex != SIZE_MAX && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - --nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex != SIZE_MAX) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - inoutStats.unusedSize += unusedRangeSize; - ++inoutStats.unusedRangeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - ++inoutStats.allocationCount; - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - --nextAlloc2ndIndex; - } - // We are at the end. - else - { - if(lastOffset < size) - { - // There is free space from lastOffset to size. - const VkDeviceSize unusedRangeSize = size - lastOffset; - inoutStats.unusedSize += unusedRangeSize; - ++inoutStats.unusedRangeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); - } - - // End of loop. 
- lastOffset = size; - } - } - } -} - -#if VMA_STATS_STRING_ENABLED -void VmaBlockMetadata_Linear::PrintDetailedMap(class VmaJsonWriter& json) const -{ - const VkDeviceSize size = GetSize(); - const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - const size_t suballoc1stCount = suballocations1st.size(); - const size_t suballoc2ndCount = suballocations2nd.size(); - - // FIRST PASS - - size_t unusedRangeCount = 0; - VkDeviceSize usedBytes = 0; - - VkDeviceSize lastOffset = 0; - - size_t alloc2ndCount = 0; - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; - size_t nextAlloc2ndIndex = 0; - while(lastOffset < freeSpace2ndTo1stEnd) - { - // Find next non-null allocation or move nextAlloc2ndIndex to the end. - while(nextAlloc2ndIndex < suballoc2ndCount && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex < suballoc2ndCount) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - ++unusedRangeCount; - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - ++alloc2ndCount; - usedBytes += suballoc.size; - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc2ndIndex; - } - // We are at the end. - else - { - if(lastOffset < freeSpace2ndTo1stEnd) - { - // There is free space from lastOffset to freeSpace2ndTo1stEnd. - ++unusedRangeCount; - } - - // End of loop. 
- lastOffset = freeSpace2ndTo1stEnd; - } - } - } - - size_t nextAlloc1stIndex = m_1stNullItemsBeginCount; - size_t alloc1stCount = 0; - const VkDeviceSize freeSpace1stTo2ndEnd = - m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? suballocations2nd.back().offset : size; - while(lastOffset < freeSpace1stTo2ndEnd) - { - // Find next non-null allocation or move nextAllocIndex to the end. - while(nextAlloc1stIndex < suballoc1stCount && - suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc1stIndex; - } - - // Found non-null allocation. - if(nextAlloc1stIndex < suballoc1stCount) - { - const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - ++unusedRangeCount; - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - ++alloc1stCount; - usedBytes += suballoc.size; - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc1stIndex; - } - // We are at the end. - else - { - if(lastOffset < size) - { - // There is free space from lastOffset to freeSpace1stTo2ndEnd. - ++unusedRangeCount; - } - - // End of loop. - lastOffset = freeSpace1stTo2ndEnd; - } - } - - if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; - while(lastOffset < size) - { - // Find next non-null allocation or move nextAlloc2ndIndex to the end. - while(nextAlloc2ndIndex != SIZE_MAX && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - --nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex != SIZE_MAX) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. 
- if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - ++unusedRangeCount; - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - ++alloc2ndCount; - usedBytes += suballoc.size; - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - --nextAlloc2ndIndex; - } - // We are at the end. - else - { - if(lastOffset < size) - { - // There is free space from lastOffset to size. - ++unusedRangeCount; - } - - // End of loop. - lastOffset = size; - } - } - } - - const VkDeviceSize unusedBytes = size - usedBytes; - PrintDetailedMap_Begin(json, unusedBytes, alloc1stCount + alloc2ndCount, unusedRangeCount); - - // SECOND PASS - lastOffset = 0; - - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; - size_t nextAlloc2ndIndex = 0; - while(lastOffset < freeSpace2ndTo1stEnd) - { - // Find next non-null allocation or move nextAlloc2ndIndex to the end. - while(nextAlloc2ndIndex < suballoc2ndCount && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex < suballoc2ndCount) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.hAllocation); - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc2ndIndex; - } - // We are at the end. 
- else - { - if(lastOffset < freeSpace2ndTo1stEnd) - { - // There is free space from lastOffset to freeSpace2ndTo1stEnd. - const VkDeviceSize unusedRangeSize = freeSpace2ndTo1stEnd - lastOffset; - PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); - } - - // End of loop. - lastOffset = freeSpace2ndTo1stEnd; - } - } - } - - nextAlloc1stIndex = m_1stNullItemsBeginCount; - while(lastOffset < freeSpace1stTo2ndEnd) - { - // Find next non-null allocation or move nextAllocIndex to the end. - while(nextAlloc1stIndex < suballoc1stCount && - suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) - { - ++nextAlloc1stIndex; - } - - // Found non-null allocation. - if(nextAlloc1stIndex < suballoc1stCount) - { - const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.hAllocation); - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - ++nextAlloc1stIndex; - } - // We are at the end. - else - { - if(lastOffset < freeSpace1stTo2ndEnd) - { - // There is free space from lastOffset to freeSpace1stTo2ndEnd. - const VkDeviceSize unusedRangeSize = freeSpace1stTo2ndEnd - lastOffset; - PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); - } - - // End of loop. - lastOffset = freeSpace1stTo2ndEnd; - } - } - - if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; - while(lastOffset < size) - { - // Find next non-null allocation or move nextAlloc2ndIndex to the end. 
- while(nextAlloc2ndIndex != SIZE_MAX && - suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) - { - --nextAlloc2ndIndex; - } - - // Found non-null allocation. - if(nextAlloc2ndIndex != SIZE_MAX) - { - const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; - - // 1. Process free space before this allocation. - if(lastOffset < suballoc.offset) - { - // There is free space from lastOffset to suballoc.offset. - const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; - PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); - } - - // 2. Process this allocation. - // There is allocation with suballoc.offset, suballoc.size. - PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.hAllocation); - - // 3. Prepare for next iteration. - lastOffset = suballoc.offset + suballoc.size; - --nextAlloc2ndIndex; - } - // We are at the end. - else - { - if(lastOffset < size) - { - // There is free space from lastOffset to size. - const VkDeviceSize unusedRangeSize = size - lastOffset; - PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); - } - - // End of loop. - lastOffset = size; - } - } - } - - PrintDetailedMap_End(json); -} -#endif // #if VMA_STATS_STRING_ENABLED - -bool VmaBlockMetadata_Linear::CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest) -{ - VMA_ASSERT(allocSize > 0); - VMA_ASSERT(allocType != VMA_SUBALLOCATION_TYPE_FREE); - VMA_ASSERT(pAllocationRequest != VMA_NULL); - VMA_HEAVY_ASSERT(Validate()); - return upperAddress ? 
- CreateAllocationRequest_UpperAddress( - currentFrameIndex, frameInUseCount, bufferImageGranularity, - allocSize, allocAlignment, allocType, canMakeOtherLost, strategy, pAllocationRequest) : - CreateAllocationRequest_LowerAddress( - currentFrameIndex, frameInUseCount, bufferImageGranularity, - allocSize, allocAlignment, allocType, canMakeOtherLost, strategy, pAllocationRequest); -} - -bool VmaBlockMetadata_Linear::CreateAllocationRequest_UpperAddress( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest) -{ - const VkDeviceSize size = GetSize(); - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - VMA_ASSERT(0 && "Trying to use pool with linear algorithm as double stack, while it is already being used as ring buffer."); - return false; - } - - // Try to allocate before 2nd.back(), or end of block if 2nd.empty(). - if(allocSize > size) - { - return false; - } - VkDeviceSize resultBaseOffset = size - allocSize; - if(!suballocations2nd.empty()) - { - const VmaSuballocation& lastSuballoc = suballocations2nd.back(); - resultBaseOffset = lastSuballoc.offset - allocSize; - if(allocSize > lastSuballoc.offset) - { - return false; - } - } - - // Start from offset equal to end of free space. - VkDeviceSize resultOffset = resultBaseOffset; - - // Apply VMA_DEBUG_MARGIN at the end. - if(VMA_DEBUG_MARGIN > 0) - { - if(resultOffset < VMA_DEBUG_MARGIN) - { - return false; - } - resultOffset -= VMA_DEBUG_MARGIN; - } - - // Apply alignment. - resultOffset = VmaAlignDown(resultOffset, allocAlignment); - - // Check next suballocations from 2nd for BufferImageGranularity conflicts. 
- // Make bigger alignment if necessary. - if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment && !suballocations2nd.empty()) - { - bool bufferImageGranularityConflict = false; - for(size_t nextSuballocIndex = suballocations2nd.size(); nextSuballocIndex--; ) - { - const VmaSuballocation& nextSuballoc = suballocations2nd[nextSuballocIndex]; - if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(nextSuballoc.type, allocType)) - { - bufferImageGranularityConflict = true; - break; - } - } - else - // Already on previous page. - break; - } - if(bufferImageGranularityConflict) - { - resultOffset = VmaAlignDown(resultOffset, bufferImageGranularity); - } - } - - // There is enough free space. - const VkDeviceSize endOf1st = !suballocations1st.empty() ? - suballocations1st.back().offset + suballocations1st.back().size : - 0; - if(endOf1st + VMA_DEBUG_MARGIN <= resultOffset) - { - // Check previous suballocations for BufferImageGranularity conflicts. - // If conflict exists, allocation cannot be made here. - if(bufferImageGranularity > 1) - { - for(size_t prevSuballocIndex = suballocations1st.size(); prevSuballocIndex--; ) - { - const VmaSuballocation& prevSuballoc = suballocations1st[prevSuballocIndex]; - if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, resultOffset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(allocType, prevSuballoc.type)) - { - return false; - } - } - else - { - // Already on next page. - break; - } - } - } - - // All tests passed: Success. - pAllocationRequest->offset = resultOffset; - pAllocationRequest->sumFreeSize = resultBaseOffset + allocSize - endOf1st; - pAllocationRequest->sumItemSize = 0; - // pAllocationRequest->item unused. 
- pAllocationRequest->itemsToMakeLostCount = 0; - pAllocationRequest->type = VmaAllocationRequestType::UpperAddress; - return true; - } - - return false; -} - -bool VmaBlockMetadata_Linear::CreateAllocationRequest_LowerAddress( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest) -{ - const VkDeviceSize size = GetSize(); - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - - if(m_2ndVectorMode == SECOND_VECTOR_EMPTY || m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - // Try to allocate at the end of 1st vector. - - VkDeviceSize resultBaseOffset = 0; - if(!suballocations1st.empty()) - { - const VmaSuballocation& lastSuballoc = suballocations1st.back(); - resultBaseOffset = lastSuballoc.offset + lastSuballoc.size; - } - - // Start from offset equal to beginning of free space. - VkDeviceSize resultOffset = resultBaseOffset; - - // Apply VMA_DEBUG_MARGIN at the beginning. - if(VMA_DEBUG_MARGIN > 0) - { - resultOffset += VMA_DEBUG_MARGIN; - } - - // Apply alignment. - resultOffset = VmaAlignUp(resultOffset, allocAlignment); - - // Check previous suballocations for BufferImageGranularity conflicts. - // Make bigger alignment if necessary. 
- if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment && !suballocations1st.empty()) - { - bool bufferImageGranularityConflict = false; - for(size_t prevSuballocIndex = suballocations1st.size(); prevSuballocIndex--; ) - { - const VmaSuballocation& prevSuballoc = suballocations1st[prevSuballocIndex]; - if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, resultOffset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) - { - bufferImageGranularityConflict = true; - break; - } - } - else - // Already on previous page. - break; - } - if(bufferImageGranularityConflict) - { - resultOffset = VmaAlignUp(resultOffset, bufferImageGranularity); - } - } - - const VkDeviceSize freeSpaceEnd = m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? - suballocations2nd.back().offset : size; - - // There is enough free space at the end after alignment. - if(resultOffset + allocSize + VMA_DEBUG_MARGIN <= freeSpaceEnd) - { - // Check next suballocations for BufferImageGranularity conflicts. - // If conflict exists, allocation cannot be made here. - if((allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity) && m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - for(size_t nextSuballocIndex = suballocations2nd.size(); nextSuballocIndex--; ) - { - const VmaSuballocation& nextSuballoc = suballocations2nd[nextSuballocIndex]; - if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) - { - return false; - } - } - else - { - // Already on previous page. - break; - } - } - } - - // All tests passed: Success. - pAllocationRequest->offset = resultOffset; - pAllocationRequest->sumFreeSize = freeSpaceEnd - resultBaseOffset; - pAllocationRequest->sumItemSize = 0; - // pAllocationRequest->item, customData unused. 
- pAllocationRequest->type = VmaAllocationRequestType::EndOf1st; - pAllocationRequest->itemsToMakeLostCount = 0; - return true; - } - } - - // Wrap-around to end of 2nd vector. Try to allocate there, watching for the - // beginning of 1st vector as the end of free space. - if(m_2ndVectorMode == SECOND_VECTOR_EMPTY || m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - VMA_ASSERT(!suballocations1st.empty()); - - VkDeviceSize resultBaseOffset = 0; - if(!suballocations2nd.empty()) - { - const VmaSuballocation& lastSuballoc = suballocations2nd.back(); - resultBaseOffset = lastSuballoc.offset + lastSuballoc.size; - } - - // Start from offset equal to beginning of free space. - VkDeviceSize resultOffset = resultBaseOffset; - - // Apply VMA_DEBUG_MARGIN at the beginning. - if(VMA_DEBUG_MARGIN > 0) - { - resultOffset += VMA_DEBUG_MARGIN; - } - - // Apply alignment. - resultOffset = VmaAlignUp(resultOffset, allocAlignment); - - // Check previous suballocations for BufferImageGranularity conflicts. - // Make bigger alignment if necessary. - if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment && !suballocations2nd.empty()) - { - bool bufferImageGranularityConflict = false; - for(size_t prevSuballocIndex = suballocations2nd.size(); prevSuballocIndex--; ) - { - const VmaSuballocation& prevSuballoc = suballocations2nd[prevSuballocIndex]; - if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, resultOffset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) - { - bufferImageGranularityConflict = true; - break; - } - } - else - // Already on previous page. 
- break; - } - if(bufferImageGranularityConflict) - { - resultOffset = VmaAlignUp(resultOffset, bufferImageGranularity); - } - } - - pAllocationRequest->itemsToMakeLostCount = 0; - pAllocationRequest->sumItemSize = 0; - size_t index1st = m_1stNullItemsBeginCount; - - if(canMakeOtherLost) - { - while(index1st < suballocations1st.size() && - resultOffset + allocSize + VMA_DEBUG_MARGIN > suballocations1st[index1st].offset) - { - // Next colliding allocation at the beginning of 1st vector found. Try to make it lost. - const VmaSuballocation& suballoc = suballocations1st[index1st]; - if(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE) - { - // No problem. - } - else - { - VMA_ASSERT(suballoc.hAllocation != VK_NULL_HANDLE); - if(suballoc.hAllocation->CanBecomeLost() && - suballoc.hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) - { - ++pAllocationRequest->itemsToMakeLostCount; - pAllocationRequest->sumItemSize += suballoc.size; - } - else - { - return false; - } - } - ++index1st; - } - - // Check next suballocations for BufferImageGranularity conflicts. - // If conflict exists, we must mark more allocations lost or fail. - if(allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity) - { - while(index1st < suballocations1st.size()) - { - const VmaSuballocation& suballoc = suballocations1st[index1st]; - if(VmaBlocksOnSamePage(resultOffset, allocSize, suballoc.offset, bufferImageGranularity)) - { - if(suballoc.hAllocation != VK_NULL_HANDLE) - { - // Not checking actual VmaIsBufferImageGranularityConflict(allocType, suballoc.type). - if(suballoc.hAllocation->CanBecomeLost() && - suballoc.hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) - { - ++pAllocationRequest->itemsToMakeLostCount; - pAllocationRequest->sumItemSize += suballoc.size; - } - else - { - return false; - } - } - } - else - { - // Already on next page. 
- break; - } - ++index1st; - } - } - - // Special case: There is not enough room at the end for this allocation, even after making all from the 1st lost. - if(index1st == suballocations1st.size() && - resultOffset + allocSize + VMA_DEBUG_MARGIN > size) - { - // TODO: This is a known bug that it's not yet implemented and the allocation is failing. - VMA_DEBUG_LOG("Unsupported special case in custom pool with linear allocation algorithm used as ring buffer with allocations that can be lost."); - } - } - - // There is enough free space at the end after alignment. - if((index1st == suballocations1st.size() && resultOffset + allocSize + VMA_DEBUG_MARGIN <= size) || - (index1st < suballocations1st.size() && resultOffset + allocSize + VMA_DEBUG_MARGIN <= suballocations1st[index1st].offset)) - { - // Check next suballocations for BufferImageGranularity conflicts. - // If conflict exists, allocation cannot be made here. - if(allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity) - { - for(size_t nextSuballocIndex = index1st; - nextSuballocIndex < suballocations1st.size(); - nextSuballocIndex++) - { - const VmaSuballocation& nextSuballoc = suballocations1st[nextSuballocIndex]; - if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) - { - if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) - { - return false; - } - } - else - { - // Already on next page. - break; - } - } - } - - // All tests passed: Success. - pAllocationRequest->offset = resultOffset; - pAllocationRequest->sumFreeSize = - (index1st < suballocations1st.size() ? suballocations1st[index1st].offset : size) - - resultBaseOffset - - pAllocationRequest->sumItemSize; - pAllocationRequest->type = VmaAllocationRequestType::EndOf2nd; - // pAllocationRequest->item, customData unused. 
- return true; - } - } - - return false; -} - -bool VmaBlockMetadata_Linear::MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest) -{ - if(pAllocationRequest->itemsToMakeLostCount == 0) - { - return true; - } - - VMA_ASSERT(m_2ndVectorMode == SECOND_VECTOR_EMPTY || m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER); - - // We always start from 1st. - SuballocationVectorType* suballocations = &AccessSuballocations1st(); - size_t index = m_1stNullItemsBeginCount; - size_t madeLostCount = 0; - while(madeLostCount < pAllocationRequest->itemsToMakeLostCount) - { - if(index == suballocations->size()) - { - index = 0; - // If we get to the end of 1st, we wrap around to beginning of 2nd of 1st. - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - suballocations = &AccessSuballocations2nd(); - } - // else: m_2ndVectorMode == SECOND_VECTOR_EMPTY: - // suballocations continues pointing at AccessSuballocations1st(). - VMA_ASSERT(!suballocations->empty()); - } - VmaSuballocation& suballoc = (*suballocations)[index]; - if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) - { - VMA_ASSERT(suballoc.hAllocation != VK_NULL_HANDLE); - VMA_ASSERT(suballoc.hAllocation->CanBecomeLost()); - if(suballoc.hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) - { - suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - suballoc.hAllocation = VK_NULL_HANDLE; - m_SumFreeSize += suballoc.size; - if(suballocations == &AccessSuballocations1st()) - { - ++m_1stNullItemsMiddleCount; - } - else - { - ++m_2ndNullItemsCount; - } - ++madeLostCount; - } - else - { - return false; - } - } - ++index; - } - - CleanupAfterFree(); - //VMA_HEAVY_ASSERT(Validate()); // Already called by ClanupAfterFree(). 
- - return true; -} - -uint32_t VmaBlockMetadata_Linear::MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) -{ - uint32_t lostAllocationCount = 0; - - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - for(size_t i = m_1stNullItemsBeginCount, count = suballocations1st.size(); i < count; ++i) - { - VmaSuballocation& suballoc = suballocations1st[i]; - if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE && - suballoc.hAllocation->CanBecomeLost() && - suballoc.hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) - { - suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - suballoc.hAllocation = VK_NULL_HANDLE; - ++m_1stNullItemsMiddleCount; - m_SumFreeSize += suballoc.size; - ++lostAllocationCount; - } - } - - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - for(size_t i = 0, count = suballocations2nd.size(); i < count; ++i) - { - VmaSuballocation& suballoc = suballocations2nd[i]; - if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE && - suballoc.hAllocation->CanBecomeLost() && - suballoc.hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) - { - suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - suballoc.hAllocation = VK_NULL_HANDLE; - ++m_2ndNullItemsCount; - m_SumFreeSize += suballoc.size; - ++lostAllocationCount; - } - } - - if(lostAllocationCount) - { - CleanupAfterFree(); - } - - return lostAllocationCount; -} - -VkResult VmaBlockMetadata_Linear::CheckCorruption(const void* pBlockData) -{ - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - for(size_t i = m_1stNullItemsBeginCount, count = suballocations1st.size(); i < count; ++i) - { - const VmaSuballocation& suballoc = suballocations1st[i]; - if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) - { - if(!VmaValidateMagicValue(pBlockData, suballoc.offset - VMA_DEBUG_MARGIN)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE VALIDATED ALLOCATION!"); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - 
if(!VmaValidateMagicValue(pBlockData, suballoc.offset + suballoc.size)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER VALIDATED ALLOCATION!"); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - } - } - - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - for(size_t i = 0, count = suballocations2nd.size(); i < count; ++i) - { - const VmaSuballocation& suballoc = suballocations2nd[i]; - if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) - { - if(!VmaValidateMagicValue(pBlockData, suballoc.offset - VMA_DEBUG_MARGIN)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE VALIDATED ALLOCATION!"); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - if(!VmaValidateMagicValue(pBlockData, suballoc.offset + suballoc.size)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER VALIDATED ALLOCATION!"); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - } - } - - return VK_SUCCESS; -} - -void VmaBlockMetadata_Linear::Alloc( - const VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation) -{ - const VmaSuballocation newSuballoc = { request.offset, allocSize, hAllocation, type }; - - switch(request.type) - { - case VmaAllocationRequestType::UpperAddress: - { - VMA_ASSERT(m_2ndVectorMode != SECOND_VECTOR_RING_BUFFER && - "CRITICAL ERROR: Trying to use linear allocator as double stack while it was already used as ring buffer."); - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - suballocations2nd.push_back(newSuballoc); - m_2ndVectorMode = SECOND_VECTOR_DOUBLE_STACK; - } - break; - case VmaAllocationRequestType::EndOf1st: - { - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - - VMA_ASSERT(suballocations1st.empty() || - request.offset >= suballocations1st.back().offset + suballocations1st.back().size); - // Check if it fits before the end of the block. 
- VMA_ASSERT(request.offset + allocSize <= GetSize()); - - suballocations1st.push_back(newSuballoc); - } - break; - case VmaAllocationRequestType::EndOf2nd: - { - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - // New allocation at the end of 2-part ring buffer, so before first allocation from 1st vector. - VMA_ASSERT(!suballocations1st.empty() && - request.offset + allocSize <= suballocations1st[m_1stNullItemsBeginCount].offset); - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - - switch(m_2ndVectorMode) - { - case SECOND_VECTOR_EMPTY: - // First allocation from second part ring buffer. - VMA_ASSERT(suballocations2nd.empty()); - m_2ndVectorMode = SECOND_VECTOR_RING_BUFFER; - break; - case SECOND_VECTOR_RING_BUFFER: - // 2-part ring buffer is already started. - VMA_ASSERT(!suballocations2nd.empty()); - break; - case SECOND_VECTOR_DOUBLE_STACK: - VMA_ASSERT(0 && "CRITICAL ERROR: Trying to use linear allocator as ring buffer while it was already used as double stack."); - break; - default: - VMA_ASSERT(0); - } - - suballocations2nd.push_back(newSuballoc); - } - break; - default: - VMA_ASSERT(0 && "CRITICAL INTERNAL ERROR."); - } - - m_SumFreeSize -= newSuballoc.size; -} - -void VmaBlockMetadata_Linear::Free(const VmaAllocation allocation) -{ - FreeAtOffset(allocation->GetOffset()); -} - -void VmaBlockMetadata_Linear::FreeAtOffset(VkDeviceSize offset) -{ - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - - if(!suballocations1st.empty()) - { - // First allocation: Mark it as next empty at the beginning. 
- VmaSuballocation& firstSuballoc = suballocations1st[m_1stNullItemsBeginCount]; - if(firstSuballoc.offset == offset) - { - firstSuballoc.type = VMA_SUBALLOCATION_TYPE_FREE; - firstSuballoc.hAllocation = VK_NULL_HANDLE; - m_SumFreeSize += firstSuballoc.size; - ++m_1stNullItemsBeginCount; - CleanupAfterFree(); - return; - } - } - - // Last allocation in 2-part ring buffer or top of upper stack (same logic). - if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER || - m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) - { - VmaSuballocation& lastSuballoc = suballocations2nd.back(); - if(lastSuballoc.offset == offset) - { - m_SumFreeSize += lastSuballoc.size; - suballocations2nd.pop_back(); - CleanupAfterFree(); - return; - } - } - // Last allocation in 1st vector. - else if(m_2ndVectorMode == SECOND_VECTOR_EMPTY) - { - VmaSuballocation& lastSuballoc = suballocations1st.back(); - if(lastSuballoc.offset == offset) - { - m_SumFreeSize += lastSuballoc.size; - suballocations1st.pop_back(); - CleanupAfterFree(); - return; - } - } - - // Item from the middle of 1st vector. - { - VmaSuballocation refSuballoc; - refSuballoc.offset = offset; - // Rest of members stays uninitialized intentionally for better performance. - SuballocationVectorType::iterator it = VmaBinaryFindSorted( - suballocations1st.begin() + m_1stNullItemsBeginCount, - suballocations1st.end(), - refSuballoc, - VmaSuballocationOffsetLess()); - if(it != suballocations1st.end()) - { - it->type = VMA_SUBALLOCATION_TYPE_FREE; - it->hAllocation = VK_NULL_HANDLE; - ++m_1stNullItemsMiddleCount; - m_SumFreeSize += it->size; - CleanupAfterFree(); - return; - } - } - - if(m_2ndVectorMode != SECOND_VECTOR_EMPTY) - { - // Item from the middle of 2nd vector. - VmaSuballocation refSuballoc; - refSuballoc.offset = offset; - // Rest of members stays uninitialized intentionally for better performance. - SuballocationVectorType::iterator it = m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER ? 
- VmaBinaryFindSorted(suballocations2nd.begin(), suballocations2nd.end(), refSuballoc, VmaSuballocationOffsetLess()) : - VmaBinaryFindSorted(suballocations2nd.begin(), suballocations2nd.end(), refSuballoc, VmaSuballocationOffsetGreater()); - if(it != suballocations2nd.end()) - { - it->type = VMA_SUBALLOCATION_TYPE_FREE; - it->hAllocation = VK_NULL_HANDLE; - ++m_2ndNullItemsCount; - m_SumFreeSize += it->size; - CleanupAfterFree(); - return; - } - } - - VMA_ASSERT(0 && "Allocation to free not found in linear allocator!"); -} - -bool VmaBlockMetadata_Linear::ShouldCompact1st() const -{ - const size_t nullItemCount = m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount; - const size_t suballocCount = AccessSuballocations1st().size(); - return suballocCount > 32 && nullItemCount * 2 >= (suballocCount - nullItemCount) * 3; -} - -void VmaBlockMetadata_Linear::CleanupAfterFree() -{ - SuballocationVectorType& suballocations1st = AccessSuballocations1st(); - SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); - - if(IsEmpty()) - { - suballocations1st.clear(); - suballocations2nd.clear(); - m_1stNullItemsBeginCount = 0; - m_1stNullItemsMiddleCount = 0; - m_2ndNullItemsCount = 0; - m_2ndVectorMode = SECOND_VECTOR_EMPTY; - } - else - { - const size_t suballoc1stCount = suballocations1st.size(); - const size_t nullItem1stCount = m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount; - VMA_ASSERT(nullItem1stCount <= suballoc1stCount); - - // Find more null items at the beginning of 1st vector. - while(m_1stNullItemsBeginCount < suballoc1stCount && - suballocations1st[m_1stNullItemsBeginCount].hAllocation == VK_NULL_HANDLE) - { - ++m_1stNullItemsBeginCount; - --m_1stNullItemsMiddleCount; - } - - // Find more null items at the end of 1st vector. 
- while(m_1stNullItemsMiddleCount > 0 && - suballocations1st.back().hAllocation == VK_NULL_HANDLE) - { - --m_1stNullItemsMiddleCount; - suballocations1st.pop_back(); - } - - // Find more null items at the end of 2nd vector. - while(m_2ndNullItemsCount > 0 && - suballocations2nd.back().hAllocation == VK_NULL_HANDLE) - { - --m_2ndNullItemsCount; - suballocations2nd.pop_back(); - } - - // Find more null items at the beginning of 2nd vector. - while(m_2ndNullItemsCount > 0 && - suballocations2nd[0].hAllocation == VK_NULL_HANDLE) - { - --m_2ndNullItemsCount; - VmaVectorRemove(suballocations2nd, 0); - } - - if(ShouldCompact1st()) - { - const size_t nonNullItemCount = suballoc1stCount - nullItem1stCount; - size_t srcIndex = m_1stNullItemsBeginCount; - for(size_t dstIndex = 0; dstIndex < nonNullItemCount; ++dstIndex) - { - while(suballocations1st[srcIndex].hAllocation == VK_NULL_HANDLE) - { - ++srcIndex; - } - if(dstIndex != srcIndex) - { - suballocations1st[dstIndex] = suballocations1st[srcIndex]; - } - ++srcIndex; - } - suballocations1st.resize(nonNullItemCount); - m_1stNullItemsBeginCount = 0; - m_1stNullItemsMiddleCount = 0; - } - - // 2nd vector became empty. - if(suballocations2nd.empty()) - { - m_2ndVectorMode = SECOND_VECTOR_EMPTY; - } - - // 1st vector became empty. - if(suballocations1st.size() - m_1stNullItemsBeginCount == 0) - { - suballocations1st.clear(); - m_1stNullItemsBeginCount = 0; - - if(!suballocations2nd.empty() && m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) - { - // Swap 1st with 2nd. Now 2nd is empty. 
- m_2ndVectorMode = SECOND_VECTOR_EMPTY; - m_1stNullItemsMiddleCount = m_2ndNullItemsCount; - while(m_1stNullItemsBeginCount < suballocations2nd.size() && - suballocations2nd[m_1stNullItemsBeginCount].hAllocation == VK_NULL_HANDLE) - { - ++m_1stNullItemsBeginCount; - --m_1stNullItemsMiddleCount; - } - m_2ndNullItemsCount = 0; - m_1stVectorIndex ^= 1; - } - } - } - - VMA_HEAVY_ASSERT(Validate()); -} - - -//////////////////////////////////////////////////////////////////////////////// -// class VmaBlockMetadata_Buddy - -VmaBlockMetadata_Buddy::VmaBlockMetadata_Buddy(VmaAllocator hAllocator) : - VmaBlockMetadata(hAllocator), - m_Root(VMA_NULL), - m_AllocationCount(0), - m_FreeCount(1), - m_SumFreeSize(0) -{ - memset(m_FreeList, 0, sizeof(m_FreeList)); -} - -VmaBlockMetadata_Buddy::~VmaBlockMetadata_Buddy() -{ - DeleteNode(m_Root); -} - -void VmaBlockMetadata_Buddy::Init(VkDeviceSize size) -{ - VmaBlockMetadata::Init(size); - - m_UsableSize = VmaPrevPow2(size); - m_SumFreeSize = m_UsableSize; - - // Calculate m_LevelCount. - m_LevelCount = 1; - while(m_LevelCount < MAX_LEVELS && - LevelToNodeSize(m_LevelCount) >= MIN_NODE_SIZE) - { - ++m_LevelCount; - } - - Node* rootNode = vma_new(GetAllocationCallbacks(), Node)(); - rootNode->offset = 0; - rootNode->type = Node::TYPE_FREE; - rootNode->parent = VMA_NULL; - rootNode->buddy = VMA_NULL; - - m_Root = rootNode; - AddToFreeListFront(0, rootNode); -} - -bool VmaBlockMetadata_Buddy::Validate() const -{ - // Validate tree. - ValidationContext ctx; - if(!ValidateNode(ctx, VMA_NULL, m_Root, 0, LevelToNodeSize(0))) - { - VMA_VALIDATE(false && "ValidateNode failed."); - } - VMA_VALIDATE(m_AllocationCount == ctx.calculatedAllocationCount); - VMA_VALIDATE(m_SumFreeSize == ctx.calculatedSumFreeSize); - - // Validate free node lists. 
- for(uint32_t level = 0; level < m_LevelCount; ++level) - { - VMA_VALIDATE(m_FreeList[level].front == VMA_NULL || - m_FreeList[level].front->free.prev == VMA_NULL); - - for(Node* node = m_FreeList[level].front; - node != VMA_NULL; - node = node->free.next) - { - VMA_VALIDATE(node->type == Node::TYPE_FREE); - - if(node->free.next == VMA_NULL) - { - VMA_VALIDATE(m_FreeList[level].back == node); - } - else - { - VMA_VALIDATE(node->free.next->free.prev == node); - } - } - } - - // Validate that free lists ar higher levels are empty. - for(uint32_t level = m_LevelCount; level < MAX_LEVELS; ++level) - { - VMA_VALIDATE(m_FreeList[level].front == VMA_NULL && m_FreeList[level].back == VMA_NULL); - } - - return true; -} - -VkDeviceSize VmaBlockMetadata_Buddy::GetUnusedRangeSizeMax() const -{ - for(uint32_t level = 0; level < m_LevelCount; ++level) - { - if(m_FreeList[level].front != VMA_NULL) - { - return LevelToNodeSize(level); - } - } - return 0; -} - -void VmaBlockMetadata_Buddy::CalcAllocationStatInfo(VmaStatInfo& outInfo) const -{ - const VkDeviceSize unusableSize = GetUnusableSize(); - - outInfo.blockCount = 1; - - outInfo.allocationCount = outInfo.unusedRangeCount = 0; - outInfo.usedBytes = outInfo.unusedBytes = 0; - - outInfo.allocationSizeMax = outInfo.unusedRangeSizeMax = 0; - outInfo.allocationSizeMin = outInfo.unusedRangeSizeMin = UINT64_MAX; - outInfo.allocationSizeAvg = outInfo.unusedRangeSizeAvg = 0; // Unused. 
- - CalcAllocationStatInfoNode(outInfo, m_Root, LevelToNodeSize(0)); - - if(unusableSize > 0) - { - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusableSize; - outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, unusableSize); - outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusableSize); - } -} - -void VmaBlockMetadata_Buddy::AddPoolStats(VmaPoolStats& inoutStats) const -{ - const VkDeviceSize unusableSize = GetUnusableSize(); - - inoutStats.size += GetSize(); - inoutStats.unusedSize += m_SumFreeSize + unusableSize; - inoutStats.allocationCount += m_AllocationCount; - inoutStats.unusedRangeCount += m_FreeCount; - inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, GetUnusedRangeSizeMax()); - - if(unusableSize > 0) - { - ++inoutStats.unusedRangeCount; - // Not updating inoutStats.unusedRangeSizeMax with unusableSize because this space is not available for allocations. - } -} - -#if VMA_STATS_STRING_ENABLED - -void VmaBlockMetadata_Buddy::PrintDetailedMap(class VmaJsonWriter& json) const -{ - // TODO optimize - VmaStatInfo stat; - CalcAllocationStatInfo(stat); - - PrintDetailedMap_Begin( - json, - stat.unusedBytes, - stat.allocationCount, - stat.unusedRangeCount); - - PrintDetailedMapNode(json, m_Root, LevelToNodeSize(0)); - - const VkDeviceSize unusableSize = GetUnusableSize(); - if(unusableSize > 0) - { - PrintDetailedMap_UnusedRange(json, - m_UsableSize, // offset - unusableSize); // size - } - - PrintDetailedMap_End(json); -} - -#endif // #if VMA_STATS_STRING_ENABLED - -bool VmaBlockMetadata_Buddy::CreateAllocationRequest( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VkDeviceSize bufferImageGranularity, - VkDeviceSize allocSize, - VkDeviceSize allocAlignment, - bool upperAddress, - VmaSuballocationType allocType, - bool canMakeOtherLost, - uint32_t strategy, - VmaAllocationRequest* pAllocationRequest) -{ - VMA_ASSERT(!upperAddress && "VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT can be used 
only with linear algorithm."); - - // Simple way to respect bufferImageGranularity. May be optimized some day. - // Whenever it might be an OPTIMAL image... - if(allocType == VMA_SUBALLOCATION_TYPE_UNKNOWN || - allocType == VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN || - allocType == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL) - { - allocAlignment = VMA_MAX(allocAlignment, bufferImageGranularity); - allocSize = VMA_MAX(allocSize, bufferImageGranularity); - } - - if(allocSize > m_UsableSize) - { - return false; - } - - const uint32_t targetLevel = AllocSizeToLevel(allocSize); - for(uint32_t level = targetLevel + 1; level--; ) - { - for(Node* freeNode = m_FreeList[level].front; - freeNode != VMA_NULL; - freeNode = freeNode->free.next) - { - if(freeNode->offset % allocAlignment == 0) - { - pAllocationRequest->type = VmaAllocationRequestType::Normal; - pAllocationRequest->offset = freeNode->offset; - pAllocationRequest->sumFreeSize = LevelToNodeSize(level); - pAllocationRequest->sumItemSize = 0; - pAllocationRequest->itemsToMakeLostCount = 0; - pAllocationRequest->customData = (void*)(uintptr_t)level; - return true; - } - } - } - - return false; -} - -bool VmaBlockMetadata_Buddy::MakeRequestedAllocationsLost( - uint32_t currentFrameIndex, - uint32_t frameInUseCount, - VmaAllocationRequest* pAllocationRequest) -{ - /* - Lost allocations are not supported in buddy allocator at the moment. - Support might be added in the future. - */ - return pAllocationRequest->itemsToMakeLostCount == 0; -} - -uint32_t VmaBlockMetadata_Buddy::MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) -{ - /* - Lost allocations are not supported in buddy allocator at the moment. - Support might be added in the future. 
- */ - return 0; -} - -void VmaBlockMetadata_Buddy::Alloc( - const VmaAllocationRequest& request, - VmaSuballocationType type, - VkDeviceSize allocSize, - VmaAllocation hAllocation) -{ - VMA_ASSERT(request.type == VmaAllocationRequestType::Normal); - - const uint32_t targetLevel = AllocSizeToLevel(allocSize); - uint32_t currLevel = (uint32_t)(uintptr_t)request.customData; - - Node* currNode = m_FreeList[currLevel].front; - VMA_ASSERT(currNode != VMA_NULL && currNode->type == Node::TYPE_FREE); - while(currNode->offset != request.offset) - { - currNode = currNode->free.next; - VMA_ASSERT(currNode != VMA_NULL && currNode->type == Node::TYPE_FREE); - } - - // Go down, splitting free nodes. - while(currLevel < targetLevel) - { - // currNode is already first free node at currLevel. - // Remove it from list of free nodes at this currLevel. - RemoveFromFreeList(currLevel, currNode); - - const uint32_t childrenLevel = currLevel + 1; - - // Create two free sub-nodes. - Node* leftChild = vma_new(GetAllocationCallbacks(), Node)(); - Node* rightChild = vma_new(GetAllocationCallbacks(), Node)(); - - leftChild->offset = currNode->offset; - leftChild->type = Node::TYPE_FREE; - leftChild->parent = currNode; - leftChild->buddy = rightChild; - - rightChild->offset = currNode->offset + LevelToNodeSize(childrenLevel); - rightChild->type = Node::TYPE_FREE; - rightChild->parent = currNode; - rightChild->buddy = leftChild; - - // Convert current currNode to split type. - currNode->type = Node::TYPE_SPLIT; - currNode->split.leftChild = leftChild; - - // Add child nodes to free list. Order is important! - AddToFreeListFront(childrenLevel, rightChild); - AddToFreeListFront(childrenLevel, leftChild); - - ++m_FreeCount; - //m_SumFreeSize -= LevelToNodeSize(currLevel) % 2; // Useful only when level node sizes can be non power of 2. 
- ++currLevel; - currNode = m_FreeList[currLevel].front; - - /* - We can be sure that currNode, as left child of node previously split, - also fullfills the alignment requirement. - */ - } - - // Remove from free list. - VMA_ASSERT(currLevel == targetLevel && - currNode != VMA_NULL && - currNode->type == Node::TYPE_FREE); - RemoveFromFreeList(currLevel, currNode); - - // Convert to allocation node. - currNode->type = Node::TYPE_ALLOCATION; - currNode->allocation.alloc = hAllocation; - - ++m_AllocationCount; - --m_FreeCount; - m_SumFreeSize -= allocSize; -} - -void VmaBlockMetadata_Buddy::DeleteNode(Node* node) -{ - if(node->type == Node::TYPE_SPLIT) - { - DeleteNode(node->split.leftChild->buddy); - DeleteNode(node->split.leftChild); - } - - vma_delete(GetAllocationCallbacks(), node); -} - -bool VmaBlockMetadata_Buddy::ValidateNode(ValidationContext& ctx, const Node* parent, const Node* curr, uint32_t level, VkDeviceSize levelNodeSize) const -{ - VMA_VALIDATE(level < m_LevelCount); - VMA_VALIDATE(curr->parent == parent); - VMA_VALIDATE((curr->buddy == VMA_NULL) == (parent == VMA_NULL)); - VMA_VALIDATE(curr->buddy == VMA_NULL || curr->buddy->buddy == curr); - switch(curr->type) - { - case Node::TYPE_FREE: - // curr->free.prev, next are validated separately. 
- ctx.calculatedSumFreeSize += levelNodeSize; - ++ctx.calculatedFreeCount; - break; - case Node::TYPE_ALLOCATION: - ++ctx.calculatedAllocationCount; - ctx.calculatedSumFreeSize += levelNodeSize - curr->allocation.alloc->GetSize(); - VMA_VALIDATE(curr->allocation.alloc != VK_NULL_HANDLE); - break; - case Node::TYPE_SPLIT: - { - const uint32_t childrenLevel = level + 1; - const VkDeviceSize childrenLevelNodeSize = levelNodeSize / 2; - const Node* const leftChild = curr->split.leftChild; - VMA_VALIDATE(leftChild != VMA_NULL); - VMA_VALIDATE(leftChild->offset == curr->offset); - if(!ValidateNode(ctx, curr, leftChild, childrenLevel, childrenLevelNodeSize)) - { - VMA_VALIDATE(false && "ValidateNode for left child failed."); - } - const Node* const rightChild = leftChild->buddy; - VMA_VALIDATE(rightChild->offset == curr->offset + childrenLevelNodeSize); - if(!ValidateNode(ctx, curr, rightChild, childrenLevel, childrenLevelNodeSize)) - { - VMA_VALIDATE(false && "ValidateNode for right child failed."); - } - } - break; - default: - return false; - } - - return true; -} - -uint32_t VmaBlockMetadata_Buddy::AllocSizeToLevel(VkDeviceSize allocSize) const -{ - // I know this could be optimized somehow e.g. by using std::log2p1 from C++20. - uint32_t level = 0; - VkDeviceSize currLevelNodeSize = m_UsableSize; - VkDeviceSize nextLevelNodeSize = currLevelNodeSize >> 1; - while(allocSize <= nextLevelNodeSize && level + 1 < m_LevelCount) - { - ++level; - currLevelNodeSize = nextLevelNodeSize; - nextLevelNodeSize = currLevelNodeSize >> 1; - } - return level; -} - -void VmaBlockMetadata_Buddy::FreeAtOffset(VmaAllocation alloc, VkDeviceSize offset) -{ - // Find node and level. 
- Node* node = m_Root; - VkDeviceSize nodeOffset = 0; - uint32_t level = 0; - VkDeviceSize levelNodeSize = LevelToNodeSize(0); - while(node->type == Node::TYPE_SPLIT) - { - const VkDeviceSize nextLevelSize = levelNodeSize >> 1; - if(offset < nodeOffset + nextLevelSize) - { - node = node->split.leftChild; - } - else - { - node = node->split.leftChild->buddy; - nodeOffset += nextLevelSize; - } - ++level; - levelNodeSize = nextLevelSize; - } - - VMA_ASSERT(node != VMA_NULL && node->type == Node::TYPE_ALLOCATION); - VMA_ASSERT(alloc == VK_NULL_HANDLE || node->allocation.alloc == alloc); - - ++m_FreeCount; - --m_AllocationCount; - m_SumFreeSize += alloc->GetSize(); - - node->type = Node::TYPE_FREE; - - // Join free nodes if possible. - while(level > 0 && node->buddy->type == Node::TYPE_FREE) - { - RemoveFromFreeList(level, node->buddy); - Node* const parent = node->parent; - - vma_delete(GetAllocationCallbacks(), node->buddy); - vma_delete(GetAllocationCallbacks(), node); - parent->type = Node::TYPE_FREE; - - node = parent; - --level; - //m_SumFreeSize += LevelToNodeSize(level) % 2; // Useful only when level node sizes can be non power of 2. 
- --m_FreeCount; - } - - AddToFreeListFront(level, node); -} - -void VmaBlockMetadata_Buddy::CalcAllocationStatInfoNode(VmaStatInfo& outInfo, const Node* node, VkDeviceSize levelNodeSize) const -{ - switch(node->type) - { - case Node::TYPE_FREE: - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += levelNodeSize; - outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, levelNodeSize); - outInfo.unusedRangeSizeMin = VMA_MAX(outInfo.unusedRangeSizeMin, levelNodeSize); - break; - case Node::TYPE_ALLOCATION: - { - const VkDeviceSize allocSize = node->allocation.alloc->GetSize(); - ++outInfo.allocationCount; - outInfo.usedBytes += allocSize; - outInfo.allocationSizeMax = VMA_MAX(outInfo.allocationSizeMax, allocSize); - outInfo.allocationSizeMin = VMA_MAX(outInfo.allocationSizeMin, allocSize); - - const VkDeviceSize unusedRangeSize = levelNodeSize - allocSize; - if(unusedRangeSize > 0) - { - ++outInfo.unusedRangeCount; - outInfo.unusedBytes += unusedRangeSize; - outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, unusedRangeSize); - outInfo.unusedRangeSizeMin = VMA_MAX(outInfo.unusedRangeSizeMin, unusedRangeSize); - } - } - break; - case Node::TYPE_SPLIT: - { - const VkDeviceSize childrenNodeSize = levelNodeSize / 2; - const Node* const leftChild = node->split.leftChild; - CalcAllocationStatInfoNode(outInfo, leftChild, childrenNodeSize); - const Node* const rightChild = leftChild->buddy; - CalcAllocationStatInfoNode(outInfo, rightChild, childrenNodeSize); - } - break; - default: - VMA_ASSERT(0); - } -} - -void VmaBlockMetadata_Buddy::AddToFreeListFront(uint32_t level, Node* node) -{ - VMA_ASSERT(node->type == Node::TYPE_FREE); - - // List is empty. 
- Node* const frontNode = m_FreeList[level].front; - if(frontNode == VMA_NULL) - { - VMA_ASSERT(m_FreeList[level].back == VMA_NULL); - node->free.prev = node->free.next = VMA_NULL; - m_FreeList[level].front = m_FreeList[level].back = node; - } - else - { - VMA_ASSERT(frontNode->free.prev == VMA_NULL); - node->free.prev = VMA_NULL; - node->free.next = frontNode; - frontNode->free.prev = node; - m_FreeList[level].front = node; - } -} - -void VmaBlockMetadata_Buddy::RemoveFromFreeList(uint32_t level, Node* node) -{ - VMA_ASSERT(m_FreeList[level].front != VMA_NULL); - - // It is at the front. - if(node->free.prev == VMA_NULL) - { - VMA_ASSERT(m_FreeList[level].front == node); - m_FreeList[level].front = node->free.next; - } - else - { - Node* const prevFreeNode = node->free.prev; - VMA_ASSERT(prevFreeNode->free.next == node); - prevFreeNode->free.next = node->free.next; - } - - // It is at the back. - if(node->free.next == VMA_NULL) - { - VMA_ASSERT(m_FreeList[level].back == node); - m_FreeList[level].back = node->free.prev; - } - else - { - Node* const nextFreeNode = node->free.next; - VMA_ASSERT(nextFreeNode->free.prev == node); - nextFreeNode->free.prev = node->free.prev; - } -} - -#if VMA_STATS_STRING_ENABLED -void VmaBlockMetadata_Buddy::PrintDetailedMapNode(class VmaJsonWriter& json, const Node* node, VkDeviceSize levelNodeSize) const -{ - switch(node->type) - { - case Node::TYPE_FREE: - PrintDetailedMap_UnusedRange(json, node->offset, levelNodeSize); - break; - case Node::TYPE_ALLOCATION: - { - PrintDetailedMap_Allocation(json, node->offset, node->allocation.alloc); - const VkDeviceSize allocSize = node->allocation.alloc->GetSize(); - if(allocSize < levelNodeSize) - { - PrintDetailedMap_UnusedRange(json, node->offset + allocSize, levelNodeSize - allocSize); - } - } - break; - case Node::TYPE_SPLIT: - { - const VkDeviceSize childrenNodeSize = levelNodeSize / 2; - const Node* const leftChild = node->split.leftChild; - PrintDetailedMapNode(json, leftChild, 
childrenNodeSize); - const Node* const rightChild = leftChild->buddy; - PrintDetailedMapNode(json, rightChild, childrenNodeSize); - } - break; - default: - VMA_ASSERT(0); - } -} -#endif // #if VMA_STATS_STRING_ENABLED - - -//////////////////////////////////////////////////////////////////////////////// -// class VmaDeviceMemoryBlock - -VmaDeviceMemoryBlock::VmaDeviceMemoryBlock(VmaAllocator hAllocator) : - m_pMetadata(VMA_NULL), - m_MemoryTypeIndex(UINT32_MAX), - m_Id(0), - m_hMemory(VK_NULL_HANDLE), - m_MapCount(0), - m_pMappedData(VMA_NULL) -{ -} - -void VmaDeviceMemoryBlock::Init( - VmaAllocator hAllocator, - VmaPool hParentPool, - uint32_t newMemoryTypeIndex, - VkDeviceMemory newMemory, - VkDeviceSize newSize, - uint32_t id, - uint32_t algorithm) -{ - VMA_ASSERT(m_hMemory == VK_NULL_HANDLE); - - m_hParentPool = hParentPool; - m_MemoryTypeIndex = newMemoryTypeIndex; - m_Id = id; - m_hMemory = newMemory; - - switch(algorithm) - { - case VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT: - m_pMetadata = vma_new(hAllocator, VmaBlockMetadata_Linear)(hAllocator); - break; - case VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT: - m_pMetadata = vma_new(hAllocator, VmaBlockMetadata_Buddy)(hAllocator); - break; - default: - VMA_ASSERT(0); - // Fall-through. - case 0: - m_pMetadata = vma_new(hAllocator, VmaBlockMetadata_Generic)(hAllocator); - } - m_pMetadata->Init(newSize); -} - -void VmaDeviceMemoryBlock::Destroy(VmaAllocator allocator) -{ - // This is the most important assert in the entire library. - // Hitting it means you have some memory leak - unreleased VmaAllocation objects. 
- VMA_ASSERT(m_pMetadata->IsEmpty() && "Some allocations were not freed before destruction of this memory block!"); - - VMA_ASSERT(m_hMemory != VK_NULL_HANDLE); - allocator->FreeVulkanMemory(m_MemoryTypeIndex, m_pMetadata->GetSize(), m_hMemory); - m_hMemory = VK_NULL_HANDLE; - - vma_delete(allocator, m_pMetadata); - m_pMetadata = VMA_NULL; -} - -bool VmaDeviceMemoryBlock::Validate() const -{ - VMA_VALIDATE((m_hMemory != VK_NULL_HANDLE) && - (m_pMetadata->GetSize() != 0)); - - return m_pMetadata->Validate(); -} - -VkResult VmaDeviceMemoryBlock::CheckCorruption(VmaAllocator hAllocator) -{ - void* pData = nullptr; - VkResult res = Map(hAllocator, 1, &pData); - if(res != VK_SUCCESS) - { - return res; - } - - res = m_pMetadata->CheckCorruption(pData); - - Unmap(hAllocator, 1); - - return res; -} - -VkResult VmaDeviceMemoryBlock::Map(VmaAllocator hAllocator, uint32_t count, void** ppData) -{ - if(count == 0) - { - return VK_SUCCESS; - } - - VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex); - if(m_MapCount != 0) - { - m_MapCount += count; - VMA_ASSERT(m_pMappedData != VMA_NULL); - if(ppData != VMA_NULL) - { - *ppData = m_pMappedData; - } - return VK_SUCCESS; - } - else - { - VkResult result = (*hAllocator->GetVulkanFunctions().vkMapMemory)( - hAllocator->m_hDevice, - m_hMemory, - 0, // offset - VK_WHOLE_SIZE, - 0, // flags - &m_pMappedData); - if(result == VK_SUCCESS) - { - if(ppData != VMA_NULL) - { - *ppData = m_pMappedData; - } - m_MapCount = count; - } - return result; - } -} - -void VmaDeviceMemoryBlock::Unmap(VmaAllocator hAllocator, uint32_t count) -{ - if(count == 0) - { - return; - } - - VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex); - if(m_MapCount >= count) - { - m_MapCount -= count; - if(m_MapCount == 0) - { - m_pMappedData = VMA_NULL; - (*hAllocator->GetVulkanFunctions().vkUnmapMemory)(hAllocator->m_hDevice, m_hMemory); - } - } - else - { - VMA_ASSERT(0 && "VkDeviceMemory block is being unmapped while it was not previously mapped."); - } -} - -VkResult 
VmaDeviceMemoryBlock::WriteMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize) -{ - VMA_ASSERT(VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_MARGIN % 4 == 0 && VMA_DEBUG_DETECT_CORRUPTION); - VMA_ASSERT(allocOffset >= VMA_DEBUG_MARGIN); - - void* pData; - VkResult res = Map(hAllocator, 1, &pData); - if(res != VK_SUCCESS) - { - return res; - } - - VmaWriteMagicValue(pData, allocOffset - VMA_DEBUG_MARGIN); - VmaWriteMagicValue(pData, allocOffset + allocSize); - - Unmap(hAllocator, 1); - - return VK_SUCCESS; -} - -VkResult VmaDeviceMemoryBlock::ValidateMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize) -{ - VMA_ASSERT(VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_MARGIN % 4 == 0 && VMA_DEBUG_DETECT_CORRUPTION); - VMA_ASSERT(allocOffset >= VMA_DEBUG_MARGIN); - - void* pData; - VkResult res = Map(hAllocator, 1, &pData); - if(res != VK_SUCCESS) - { - return res; - } - - if(!VmaValidateMagicValue(pData, allocOffset - VMA_DEBUG_MARGIN)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE FREED ALLOCATION!"); - } - else if(!VmaValidateMagicValue(pData, allocOffset + allocSize)) - { - VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER FREED ALLOCATION!"); - } - - Unmap(hAllocator, 1); - - return VK_SUCCESS; -} - -VkResult VmaDeviceMemoryBlock::BindBufferMemory( - const VmaAllocator hAllocator, - const VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkBuffer hBuffer, - const void* pNext) -{ - VMA_ASSERT(hAllocation->GetType() == VmaAllocation_T::ALLOCATION_TYPE_BLOCK && - hAllocation->GetBlock() == this); - VMA_ASSERT(allocationLocalOffset < hAllocation->GetSize() && - "Invalid allocationLocalOffset. Did you forget that this offset is relative to the beginning of the allocation, not the whole memory block?"); - const VkDeviceSize memoryOffset = hAllocation->GetOffset() + allocationLocalOffset; - // This lock is important so that we don't call vkBind... and/or vkMap... 
simultaneously on the same VkDeviceMemory from multiple threads. - VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex); - return hAllocator->BindVulkanBuffer(m_hMemory, memoryOffset, hBuffer, pNext); -} - -VkResult VmaDeviceMemoryBlock::BindImageMemory( - const VmaAllocator hAllocator, - const VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkImage hImage, - const void* pNext) -{ - VMA_ASSERT(hAllocation->GetType() == VmaAllocation_T::ALLOCATION_TYPE_BLOCK && - hAllocation->GetBlock() == this); - VMA_ASSERT(allocationLocalOffset < hAllocation->GetSize() && - "Invalid allocationLocalOffset. Did you forget that this offset is relative to the beginning of the allocation, not the whole memory block?"); - const VkDeviceSize memoryOffset = hAllocation->GetOffset() + allocationLocalOffset; - // This lock is important so that we don't call vkBind... and/or vkMap... simultaneously on the same VkDeviceMemory from multiple threads. - VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex); - return hAllocator->BindVulkanImage(m_hMemory, memoryOffset, hImage, pNext); -} - -static void InitStatInfo(VmaStatInfo& outInfo) -{ - memset(&outInfo, 0, sizeof(outInfo)); - outInfo.allocationSizeMin = UINT64_MAX; - outInfo.unusedRangeSizeMin = UINT64_MAX; -} - -// Adds statistics srcInfo into inoutInfo, like: inoutInfo += srcInfo. 
-static void VmaAddStatInfo(VmaStatInfo& inoutInfo, const VmaStatInfo& srcInfo) -{ - inoutInfo.blockCount += srcInfo.blockCount; - inoutInfo.allocationCount += srcInfo.allocationCount; - inoutInfo.unusedRangeCount += srcInfo.unusedRangeCount; - inoutInfo.usedBytes += srcInfo.usedBytes; - inoutInfo.unusedBytes += srcInfo.unusedBytes; - inoutInfo.allocationSizeMin = VMA_MIN(inoutInfo.allocationSizeMin, srcInfo.allocationSizeMin); - inoutInfo.allocationSizeMax = VMA_MAX(inoutInfo.allocationSizeMax, srcInfo.allocationSizeMax); - inoutInfo.unusedRangeSizeMin = VMA_MIN(inoutInfo.unusedRangeSizeMin, srcInfo.unusedRangeSizeMin); - inoutInfo.unusedRangeSizeMax = VMA_MAX(inoutInfo.unusedRangeSizeMax, srcInfo.unusedRangeSizeMax); -} - -static void VmaPostprocessCalcStatInfo(VmaStatInfo& inoutInfo) -{ - inoutInfo.allocationSizeAvg = (inoutInfo.allocationCount > 0) ? - VmaRoundDiv(inoutInfo.usedBytes, inoutInfo.allocationCount) : 0; - inoutInfo.unusedRangeSizeAvg = (inoutInfo.unusedRangeCount > 0) ? - VmaRoundDiv(inoutInfo.unusedBytes, inoutInfo.unusedRangeCount) : 0; -} - -VmaPool_T::VmaPool_T( - VmaAllocator hAllocator, - const VmaPoolCreateInfo& createInfo, - VkDeviceSize preferredBlockSize) : - m_BlockVector( - hAllocator, - this, // hParentPool - createInfo.memoryTypeIndex, - createInfo.blockSize != 0 ? createInfo.blockSize : preferredBlockSize, - createInfo.minBlockCount, - createInfo.maxBlockCount, - (createInfo.flags & VMA_POOL_CREATE_IGNORE_BUFFER_IMAGE_GRANULARITY_BIT) != 0 ? 
1 : hAllocator->GetBufferImageGranularity(), - createInfo.frameInUseCount, - createInfo.blockSize != 0, // explicitBlockSize - createInfo.flags & VMA_POOL_CREATE_ALGORITHM_MASK, - createInfo.priority), // algorithm - m_Id(0), - m_Name(VMA_NULL) -{ -} - -VmaPool_T::~VmaPool_T() -{ -} - -void VmaPool_T::SetName(const char* pName) -{ - const VkAllocationCallbacks* allocs = m_BlockVector.GetAllocator()->GetAllocationCallbacks(); - VmaFreeString(allocs, m_Name); - - if(pName != VMA_NULL) - { - m_Name = VmaCreateStringCopy(allocs, pName); - } - else - { - m_Name = VMA_NULL; - } -} - -#if VMA_STATS_STRING_ENABLED - -#endif // #if VMA_STATS_STRING_ENABLED - -VmaBlockVector::VmaBlockVector( - VmaAllocator hAllocator, - VmaPool hParentPool, - uint32_t memoryTypeIndex, - VkDeviceSize preferredBlockSize, - size_t minBlockCount, - size_t maxBlockCount, - VkDeviceSize bufferImageGranularity, - uint32_t frameInUseCount, - bool explicitBlockSize, - uint32_t algorithm, - float priority) : - m_hAllocator(hAllocator), - m_hParentPool(hParentPool), - m_MemoryTypeIndex(memoryTypeIndex), - m_PreferredBlockSize(preferredBlockSize), - m_MinBlockCount(minBlockCount), - m_MaxBlockCount(maxBlockCount), - m_BufferImageGranularity(bufferImageGranularity), - m_FrameInUseCount(frameInUseCount), - m_ExplicitBlockSize(explicitBlockSize), - m_Algorithm(algorithm), - m_Priority(priority), - m_HasEmptyBlock(false), - m_Blocks(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - m_NextBlockId(0) -{ -} - -VmaBlockVector::~VmaBlockVector() -{ - for(size_t i = m_Blocks.size(); i--; ) - { - m_Blocks[i]->Destroy(m_hAllocator); - vma_delete(m_hAllocator, m_Blocks[i]); - } -} - -VkResult VmaBlockVector::CreateMinBlocks() -{ - for(size_t i = 0; i < m_MinBlockCount; ++i) - { - VkResult res = CreateBlock(m_PreferredBlockSize, VMA_NULL); - if(res != VK_SUCCESS) - { - return res; - } - } - return VK_SUCCESS; -} - -void VmaBlockVector::GetPoolStats(VmaPoolStats* pStats) -{ - VmaMutexLockRead lock(m_Mutex, 
m_hAllocator->m_UseMutex); - - const size_t blockCount = m_Blocks.size(); - - pStats->size = 0; - pStats->unusedSize = 0; - pStats->allocationCount = 0; - pStats->unusedRangeCount = 0; - pStats->unusedRangeSizeMax = 0; - pStats->blockCount = blockCount; - - for(uint32_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) - { - const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pBlock); - VMA_HEAVY_ASSERT(pBlock->Validate()); - pBlock->m_pMetadata->AddPoolStats(*pStats); - } -} - -bool VmaBlockVector::IsEmpty() -{ - VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); - return m_Blocks.empty(); -} - -bool VmaBlockVector::IsCorruptionDetectionEnabled() const -{ - const uint32_t requiredMemFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - return (VMA_DEBUG_DETECT_CORRUPTION != 0) && - (VMA_DEBUG_MARGIN > 0) && - (m_Algorithm == 0 || m_Algorithm == VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT) && - (m_hAllocator->m_MemProps.memoryTypes[m_MemoryTypeIndex].propertyFlags & requiredMemFlags) == requiredMemFlags; -} - -static const uint32_t VMA_ALLOCATION_TRY_COUNT = 32; - -VkResult VmaBlockVector::Allocate( - uint32_t currentFrameIndex, - VkDeviceSize size, - VkDeviceSize alignment, - const VmaAllocationCreateInfo& createInfo, - VmaSuballocationType suballocType, - size_t allocationCount, - VmaAllocation* pAllocations) -{ - size_t allocIndex; - VkResult res = VK_SUCCESS; - - if(IsCorruptionDetectionEnabled()) - { - size = VmaAlignUp(size, sizeof(VMA_CORRUPTION_DETECTION_MAGIC_VALUE)); - alignment = VmaAlignUp(alignment, sizeof(VMA_CORRUPTION_DETECTION_MAGIC_VALUE)); - } - - { - VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); - for(allocIndex = 0; allocIndex < allocationCount; ++allocIndex) - { - res = AllocatePage( - currentFrameIndex, - size, - alignment, - createInfo, - suballocType, - pAllocations + allocIndex); - if(res != VK_SUCCESS) - { - break; - } - } - } - - if(res != VK_SUCCESS) - { - // 
Free all already created allocations. - const uint32_t heapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex); - while(allocIndex--) - { - VmaAllocation_T* const alloc = pAllocations[allocIndex]; - const VkDeviceSize allocSize = alloc->GetSize(); - Free(alloc); - m_hAllocator->m_Budget.RemoveAllocation(heapIndex, allocSize); - } - memset(pAllocations, 0, sizeof(VmaAllocation) * allocationCount); - } - - return res; -} - -VkResult VmaBlockVector::AllocatePage( - uint32_t currentFrameIndex, - VkDeviceSize size, - VkDeviceSize alignment, - const VmaAllocationCreateInfo& createInfo, - VmaSuballocationType suballocType, - VmaAllocation* pAllocation) -{ - const bool isUpperAddress = (createInfo.flags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0; - bool canMakeOtherLost = (createInfo.flags & VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT) != 0; - const bool mapped = (createInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0; - const bool isUserDataString = (createInfo.flags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0; - - VkDeviceSize freeMemory; - { - const uint32_t heapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex); - VmaBudget heapBudget = {}; - m_hAllocator->GetBudget(&heapBudget, heapIndex, 1); - freeMemory = (heapBudget.usage < heapBudget.budget) ? (heapBudget.budget - heapBudget.usage) : 0; - } - - const bool canFallbackToDedicated = !IsCustomPool(); - const bool canCreateNewBlock = - ((createInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) == 0) && - (m_Blocks.size() < m_MaxBlockCount) && - (freeMemory >= size || !canFallbackToDedicated); - uint32_t strategy = createInfo.flags & VMA_ALLOCATION_CREATE_STRATEGY_MASK; - - // If linearAlgorithm is used, canMakeOtherLost is available only when used as ring buffer. - // Which in turn is available only when maxBlockCount = 1. 
- if(m_Algorithm == VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT && m_MaxBlockCount > 1) - { - canMakeOtherLost = false; - } - - // Upper address can only be used with linear allocator and within single memory block. - if(isUpperAddress && - (m_Algorithm != VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT || m_MaxBlockCount > 1)) - { - return VK_ERROR_FEATURE_NOT_PRESENT; - } - - // Validate strategy. - switch(strategy) - { - case 0: - strategy = VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT; - break; - case VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT: - case VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT: - case VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT: - break; - default: - return VK_ERROR_FEATURE_NOT_PRESENT; - } - - // Early reject: requested allocation size is larger that maximum block size for this block vector. - if(size + 2 * VMA_DEBUG_MARGIN > m_PreferredBlockSize) - { - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - - /* - Under certain condition, this whole section can be skipped for optimization, so - we move on directly to trying to allocate with canMakeOtherLost. That's the case - e.g. for custom pools with linear algorithm. - */ - if(!canMakeOtherLost || canCreateNewBlock) - { - // 1. Search existing allocations. Try to allocate without making other allocations lost. - VmaAllocationCreateFlags allocFlagsCopy = createInfo.flags; - allocFlagsCopy &= ~VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT; - - if(m_Algorithm == VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT) - { - // Use only last block. 
- if(!m_Blocks.empty()) - { - VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks.back(); - VMA_ASSERT(pCurrBlock); - VkResult res = AllocateFromBlock( - pCurrBlock, - currentFrameIndex, - size, - alignment, - allocFlagsCopy, - createInfo.pUserData, - suballocType, - strategy, - pAllocation); - if(res == VK_SUCCESS) - { - VMA_DEBUG_LOG(" Returned from last block #%u", pCurrBlock->GetId()); - return VK_SUCCESS; - } - } - } - else - { - if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT) - { - // Forward order in m_Blocks - prefer blocks with smallest amount of free space. - for(size_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex ) - { - VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pCurrBlock); - VkResult res = AllocateFromBlock( - pCurrBlock, - currentFrameIndex, - size, - alignment, - allocFlagsCopy, - createInfo.pUserData, - suballocType, - strategy, - pAllocation); - if(res == VK_SUCCESS) - { - VMA_DEBUG_LOG(" Returned from existing block #%u", pCurrBlock->GetId()); - return VK_SUCCESS; - } - } - } - else // WORST_FIT, FIRST_FIT - { - // Backward order in m_Blocks - prefer blocks with largest amount of free space. - for(size_t blockIndex = m_Blocks.size(); blockIndex--; ) - { - VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pCurrBlock); - VkResult res = AllocateFromBlock( - pCurrBlock, - currentFrameIndex, - size, - alignment, - allocFlagsCopy, - createInfo.pUserData, - suballocType, - strategy, - pAllocation); - if(res == VK_SUCCESS) - { - VMA_DEBUG_LOG(" Returned from existing block #%u", pCurrBlock->GetId()); - return VK_SUCCESS; - } - } - } - } - - // 2. Try to create new block. - if(canCreateNewBlock) - { - // Calculate optimal size for new block. - VkDeviceSize newBlockSize = m_PreferredBlockSize; - uint32_t newBlockSizeShift = 0; - const uint32_t NEW_BLOCK_SIZE_SHIFT_MAX = 3; - - if(!m_ExplicitBlockSize) - { - // Allocate 1/8, 1/4, 1/2 as first blocks. 
- const VkDeviceSize maxExistingBlockSize = CalcMaxBlockSize(); - for(uint32_t i = 0; i < NEW_BLOCK_SIZE_SHIFT_MAX; ++i) - { - const VkDeviceSize smallerNewBlockSize = newBlockSize / 2; - if(smallerNewBlockSize > maxExistingBlockSize && smallerNewBlockSize >= size * 2) - { - newBlockSize = smallerNewBlockSize; - ++newBlockSizeShift; - } - else - { - break; - } - } - } - - size_t newBlockIndex = 0; - VkResult res = (newBlockSize <= freeMemory || !canFallbackToDedicated) ? - CreateBlock(newBlockSize, &newBlockIndex) : VK_ERROR_OUT_OF_DEVICE_MEMORY; - // Allocation of this size failed? Try 1/2, 1/4, 1/8 of m_PreferredBlockSize. - if(!m_ExplicitBlockSize) - { - while(res < 0 && newBlockSizeShift < NEW_BLOCK_SIZE_SHIFT_MAX) - { - const VkDeviceSize smallerNewBlockSize = newBlockSize / 2; - if(smallerNewBlockSize >= size) - { - newBlockSize = smallerNewBlockSize; - ++newBlockSizeShift; - res = (newBlockSize <= freeMemory || !canFallbackToDedicated) ? - CreateBlock(newBlockSize, &newBlockIndex) : VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - else - { - break; - } - } - } - - if(res == VK_SUCCESS) - { - VmaDeviceMemoryBlock* const pBlock = m_Blocks[newBlockIndex]; - VMA_ASSERT(pBlock->m_pMetadata->GetSize() >= size); - - res = AllocateFromBlock( - pBlock, - currentFrameIndex, - size, - alignment, - allocFlagsCopy, - createInfo.pUserData, - suballocType, - strategy, - pAllocation); - if(res == VK_SUCCESS) - { - VMA_DEBUG_LOG(" Created new block #%u Size=%llu", pBlock->GetId(), newBlockSize); - return VK_SUCCESS; - } - else - { - // Allocation from new block failed, possibly due to VMA_DEBUG_MARGIN or alignment. - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - } - } - } - - // 3. Try to allocate from existing blocks with making other allocations lost. 
- if(canMakeOtherLost) - { - uint32_t tryIndex = 0; - for(; tryIndex < VMA_ALLOCATION_TRY_COUNT; ++tryIndex) - { - VmaDeviceMemoryBlock* pBestRequestBlock = VMA_NULL; - VmaAllocationRequest bestRequest = {}; - VkDeviceSize bestRequestCost = VK_WHOLE_SIZE; - - // 1. Search existing allocations. - if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT) - { - // Forward order in m_Blocks - prefer blocks with smallest amount of free space. - for(size_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex ) - { - VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pCurrBlock); - VmaAllocationRequest currRequest = {}; - if(pCurrBlock->m_pMetadata->CreateAllocationRequest( - currentFrameIndex, - m_FrameInUseCount, - m_BufferImageGranularity, - size, - alignment, - (createInfo.flags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0, - suballocType, - canMakeOtherLost, - strategy, - &currRequest)) - { - const VkDeviceSize currRequestCost = currRequest.CalcCost(); - if(pBestRequestBlock == VMA_NULL || - currRequestCost < bestRequestCost) - { - pBestRequestBlock = pCurrBlock; - bestRequest = currRequest; - bestRequestCost = currRequestCost; - - if(bestRequestCost == 0) - { - break; - } - } - } - } - } - else // WORST_FIT, FIRST_FIT - { - // Backward order in m_Blocks - prefer blocks with largest amount of free space. 
- for(size_t blockIndex = m_Blocks.size(); blockIndex--; ) - { - VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pCurrBlock); - VmaAllocationRequest currRequest = {}; - if(pCurrBlock->m_pMetadata->CreateAllocationRequest( - currentFrameIndex, - m_FrameInUseCount, - m_BufferImageGranularity, - size, - alignment, - (createInfo.flags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0, - suballocType, - canMakeOtherLost, - strategy, - &currRequest)) - { - const VkDeviceSize currRequestCost = currRequest.CalcCost(); - if(pBestRequestBlock == VMA_NULL || - currRequestCost < bestRequestCost || - strategy == VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT) - { - pBestRequestBlock = pCurrBlock; - bestRequest = currRequest; - bestRequestCost = currRequestCost; - - if(bestRequestCost == 0 || - strategy == VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT) - { - break; - } - } - } - } - } - - if(pBestRequestBlock != VMA_NULL) - { - if(mapped) - { - VkResult res = pBestRequestBlock->Map(m_hAllocator, 1, VMA_NULL); - if(res != VK_SUCCESS) - { - return res; - } - } - - if(pBestRequestBlock->m_pMetadata->MakeRequestedAllocationsLost( - currentFrameIndex, - m_FrameInUseCount, - &bestRequest)) - { - // Allocate from this pBlock. 
- *pAllocation = m_hAllocator->m_AllocationObjectAllocator.Allocate(currentFrameIndex, isUserDataString); - pBestRequestBlock->m_pMetadata->Alloc(bestRequest, suballocType, size, *pAllocation); - UpdateHasEmptyBlock(); - (*pAllocation)->InitBlockAllocation( - pBestRequestBlock, - bestRequest.offset, - alignment, - size, - m_MemoryTypeIndex, - suballocType, - mapped, - (createInfo.flags & VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT) != 0); - VMA_HEAVY_ASSERT(pBestRequestBlock->Validate()); - VMA_DEBUG_LOG(" Returned from existing block"); - (*pAllocation)->SetUserData(m_hAllocator, createInfo.pUserData); - m_hAllocator->m_Budget.AddAllocation(m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex), size); - if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) - { - m_hAllocator->FillAllocation(*pAllocation, VMA_ALLOCATION_FILL_PATTERN_CREATED); - } - if(IsCorruptionDetectionEnabled()) - { - VkResult res = pBestRequestBlock->WriteMagicValueAroundAllocation(m_hAllocator, bestRequest.offset, size); - VMA_ASSERT(res == VK_SUCCESS && "Couldn't map block memory to write magic value."); - } - return VK_SUCCESS; - } - // else: Some allocations must have been touched while we are here. Next try. - } - else - { - // Could not find place in any of the blocks - break outer loop. - break; - } - } - /* Maximum number of tries exceeded - a very unlike event when many other - threads are simultaneously touching allocations making it impossible to make - lost at the same time as we try to allocate. 
*/ - if(tryIndex == VMA_ALLOCATION_TRY_COUNT) - { - return VK_ERROR_TOO_MANY_OBJECTS; - } - } - - return VK_ERROR_OUT_OF_DEVICE_MEMORY; -} - -void VmaBlockVector::Free( - const VmaAllocation hAllocation) -{ - VmaDeviceMemoryBlock* pBlockToDelete = VMA_NULL; - - bool budgetExceeded = false; - { - const uint32_t heapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex); - VmaBudget heapBudget = {}; - m_hAllocator->GetBudget(&heapBudget, heapIndex, 1); - budgetExceeded = heapBudget.usage >= heapBudget.budget; - } - - // Scope for lock. - { - VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); - - VmaDeviceMemoryBlock* pBlock = hAllocation->GetBlock(); - - if(IsCorruptionDetectionEnabled()) - { - VkResult res = pBlock->ValidateMagicValueAroundAllocation(m_hAllocator, hAllocation->GetOffset(), hAllocation->GetSize()); - VMA_ASSERT(res == VK_SUCCESS && "Couldn't map block memory to validate magic value."); - } - - if(hAllocation->IsPersistentMap()) - { - pBlock->Unmap(m_hAllocator, 1); - } - - pBlock->m_pMetadata->Free(hAllocation); - VMA_HEAVY_ASSERT(pBlock->Validate()); - - VMA_DEBUG_LOG(" Freed from MemoryTypeIndex=%u", m_MemoryTypeIndex); - - const bool canDeleteBlock = m_Blocks.size() > m_MinBlockCount; - // pBlock became empty after this deallocation. - if(pBlock->m_pMetadata->IsEmpty()) - { - // Already has empty block. We don't want to have two, so delete this one. - if((m_HasEmptyBlock || budgetExceeded) && canDeleteBlock) - { - pBlockToDelete = pBlock; - Remove(pBlock); - } - // else: We now have an empty block - leave it. - } - // pBlock didn't become empty, but we have another empty block - find and free that one. - // (This is optional, heuristics.) 
- else if(m_HasEmptyBlock && canDeleteBlock) - { - VmaDeviceMemoryBlock* pLastBlock = m_Blocks.back(); - if(pLastBlock->m_pMetadata->IsEmpty()) - { - pBlockToDelete = pLastBlock; - m_Blocks.pop_back(); - } - } - - UpdateHasEmptyBlock(); - IncrementallySortBlocks(); - } - - // Destruction of a free block. Deferred until this point, outside of mutex - // lock, for performance reason. - if(pBlockToDelete != VMA_NULL) - { - VMA_DEBUG_LOG(" Deleted empty block"); - pBlockToDelete->Destroy(m_hAllocator); - vma_delete(m_hAllocator, pBlockToDelete); - } -} - -VkDeviceSize VmaBlockVector::CalcMaxBlockSize() const -{ - VkDeviceSize result = 0; - for(size_t i = m_Blocks.size(); i--; ) - { - result = VMA_MAX(result, m_Blocks[i]->m_pMetadata->GetSize()); - if(result >= m_PreferredBlockSize) - { - break; - } - } - return result; -} - -void VmaBlockVector::Remove(VmaDeviceMemoryBlock* pBlock) -{ - for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex) - { - if(m_Blocks[blockIndex] == pBlock) - { - VmaVectorRemove(m_Blocks, blockIndex); - return; - } - } - VMA_ASSERT(0); -} - -void VmaBlockVector::IncrementallySortBlocks() -{ - if(m_Algorithm != VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT) - { - // Bubble sort only until first swap. 
- for(size_t i = 1; i < m_Blocks.size(); ++i) - { - if(m_Blocks[i - 1]->m_pMetadata->GetSumFreeSize() > m_Blocks[i]->m_pMetadata->GetSumFreeSize()) - { - VMA_SWAP(m_Blocks[i - 1], m_Blocks[i]); - return; - } - } - } -} - -VkResult VmaBlockVector::AllocateFromBlock( - VmaDeviceMemoryBlock* pBlock, - uint32_t currentFrameIndex, - VkDeviceSize size, - VkDeviceSize alignment, - VmaAllocationCreateFlags allocFlags, - void* pUserData, - VmaSuballocationType suballocType, - uint32_t strategy, - VmaAllocation* pAllocation) -{ - VMA_ASSERT((allocFlags & VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT) == 0); - const bool isUpperAddress = (allocFlags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0; - const bool mapped = (allocFlags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0; - const bool isUserDataString = (allocFlags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0; - - VmaAllocationRequest currRequest = {}; - if(pBlock->m_pMetadata->CreateAllocationRequest( - currentFrameIndex, - m_FrameInUseCount, - m_BufferImageGranularity, - size, - alignment, - isUpperAddress, - suballocType, - false, // canMakeOtherLost - strategy, - &currRequest)) - { - // Allocate from pCurrBlock. 
- VMA_ASSERT(currRequest.itemsToMakeLostCount == 0); - - if(mapped) - { - VkResult res = pBlock->Map(m_hAllocator, 1, VMA_NULL); - if(res != VK_SUCCESS) - { - return res; - } - } - - *pAllocation = m_hAllocator->m_AllocationObjectAllocator.Allocate(currentFrameIndex, isUserDataString); - pBlock->m_pMetadata->Alloc(currRequest, suballocType, size, *pAllocation); - UpdateHasEmptyBlock(); - (*pAllocation)->InitBlockAllocation( - pBlock, - currRequest.offset, - alignment, - size, - m_MemoryTypeIndex, - suballocType, - mapped, - (allocFlags & VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT) != 0); - VMA_HEAVY_ASSERT(pBlock->Validate()); - (*pAllocation)->SetUserData(m_hAllocator, pUserData); - m_hAllocator->m_Budget.AddAllocation(m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex), size); - if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) - { - m_hAllocator->FillAllocation(*pAllocation, VMA_ALLOCATION_FILL_PATTERN_CREATED); - } - if(IsCorruptionDetectionEnabled()) - { - VkResult res = pBlock->WriteMagicValueAroundAllocation(m_hAllocator, currRequest.offset, size); - VMA_ASSERT(res == VK_SUCCESS && "Couldn't map block memory to write magic value."); - } - return VK_SUCCESS; - } - return VK_ERROR_OUT_OF_DEVICE_MEMORY; -} - -VkResult VmaBlockVector::CreateBlock(VkDeviceSize blockSize, size_t* pNewBlockIndex) -{ - VkMemoryAllocateInfo allocInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; - allocInfo.memoryTypeIndex = m_MemoryTypeIndex; - allocInfo.allocationSize = blockSize; - -#if VMA_BUFFER_DEVICE_ADDRESS - // Every standalone block can potentially contain a buffer with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT - always enable the feature. 
- VkMemoryAllocateFlagsInfoKHR allocFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR }; - if(m_hAllocator->m_UseKhrBufferDeviceAddress) - { - allocFlagsInfo.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; - VmaPnextChainPushFront(&allocInfo, &allocFlagsInfo); - } -#endif // #if VMA_BUFFER_DEVICE_ADDRESS - -#if VMA_MEMORY_PRIORITY - VkMemoryPriorityAllocateInfoEXT priorityInfo = { VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT }; - if(m_hAllocator->m_UseExtMemoryPriority) - { - priorityInfo.priority = m_Priority; - VmaPnextChainPushFront(&allocInfo, &priorityInfo); - } -#endif // #if VMA_MEMORY_PRIORITY - - VkDeviceMemory mem = VK_NULL_HANDLE; - VkResult res = m_hAllocator->AllocateVulkanMemory(&allocInfo, &mem); - if(res < 0) - { - return res; - } - - // New VkDeviceMemory successfully created. - - // Create new Allocation for it. - VmaDeviceMemoryBlock* const pBlock = vma_new(m_hAllocator, VmaDeviceMemoryBlock)(m_hAllocator); - pBlock->Init( - m_hAllocator, - m_hParentPool, - m_MemoryTypeIndex, - mem, - allocInfo.allocationSize, - m_NextBlockId++, - m_Algorithm); - - m_Blocks.push_back(pBlock); - if(pNewBlockIndex != VMA_NULL) - { - *pNewBlockIndex = m_Blocks.size() - 1; - } - - return VK_SUCCESS; -} - -void VmaBlockVector::ApplyDefragmentationMovesCpu( - class VmaBlockVectorDefragmentationContext* pDefragCtx, - const VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves) -{ - const size_t blockCount = m_Blocks.size(); - const bool isNonCoherent = m_hAllocator->IsMemoryTypeNonCoherent(m_MemoryTypeIndex); - - enum BLOCK_FLAG - { - BLOCK_FLAG_USED = 0x00000001, - BLOCK_FLAG_MAPPED_FOR_DEFRAGMENTATION = 0x00000002, - }; - - struct BlockInfo - { - uint32_t flags; - void* pMappedData; - }; - VmaVector< BlockInfo, VmaStlAllocator > - blockInfo(blockCount, BlockInfo(), VmaStlAllocator(m_hAllocator->GetAllocationCallbacks())); - memset(blockInfo.data(), 0, blockCount * sizeof(BlockInfo)); - - // Go over all moves. 
Mark blocks that are used with BLOCK_FLAG_USED. - const size_t moveCount = moves.size(); - for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex) - { - const VmaDefragmentationMove& move = moves[moveIndex]; - blockInfo[move.srcBlockIndex].flags |= BLOCK_FLAG_USED; - blockInfo[move.dstBlockIndex].flags |= BLOCK_FLAG_USED; - } - - VMA_ASSERT(pDefragCtx->res == VK_SUCCESS); - - // Go over all blocks. Get mapped pointer or map if necessary. - for(size_t blockIndex = 0; pDefragCtx->res == VK_SUCCESS && blockIndex < blockCount; ++blockIndex) - { - BlockInfo& currBlockInfo = blockInfo[blockIndex]; - VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; - if((currBlockInfo.flags & BLOCK_FLAG_USED) != 0) - { - currBlockInfo.pMappedData = pBlock->GetMappedData(); - // It is not originally mapped - map it. - if(currBlockInfo.pMappedData == VMA_NULL) - { - pDefragCtx->res = pBlock->Map(m_hAllocator, 1, &currBlockInfo.pMappedData); - if(pDefragCtx->res == VK_SUCCESS) - { - currBlockInfo.flags |= BLOCK_FLAG_MAPPED_FOR_DEFRAGMENTATION; - } - } - } - } - - // Go over all moves. Do actual data transfer. - if(pDefragCtx->res == VK_SUCCESS) - { - const VkDeviceSize nonCoherentAtomSize = m_hAllocator->m_PhysicalDeviceProperties.limits.nonCoherentAtomSize; - VkMappedMemoryRange memRange = { VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE }; - - for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex) - { - const VmaDefragmentationMove& move = moves[moveIndex]; - - const BlockInfo& srcBlockInfo = blockInfo[move.srcBlockIndex]; - const BlockInfo& dstBlockInfo = blockInfo[move.dstBlockIndex]; - - VMA_ASSERT(srcBlockInfo.pMappedData && dstBlockInfo.pMappedData); - - // Invalidate source. 
- if(isNonCoherent) - { - VmaDeviceMemoryBlock* const pSrcBlock = m_Blocks[move.srcBlockIndex]; - memRange.memory = pSrcBlock->GetDeviceMemory(); - memRange.offset = VmaAlignDown(move.srcOffset, nonCoherentAtomSize); - memRange.size = VMA_MIN( - VmaAlignUp(move.size + (move.srcOffset - memRange.offset), nonCoherentAtomSize), - pSrcBlock->m_pMetadata->GetSize() - memRange.offset); - (*m_hAllocator->GetVulkanFunctions().vkInvalidateMappedMemoryRanges)(m_hAllocator->m_hDevice, 1, &memRange); - } - - // THE PLACE WHERE ACTUAL DATA COPY HAPPENS. - memmove( - reinterpret_cast(dstBlockInfo.pMappedData) + move.dstOffset, - reinterpret_cast(srcBlockInfo.pMappedData) + move.srcOffset, - static_cast(move.size)); - - if(IsCorruptionDetectionEnabled()) - { - VmaWriteMagicValue(dstBlockInfo.pMappedData, move.dstOffset - VMA_DEBUG_MARGIN); - VmaWriteMagicValue(dstBlockInfo.pMappedData, move.dstOffset + move.size); - } - - // Flush destination. - if(isNonCoherent) - { - VmaDeviceMemoryBlock* const pDstBlock = m_Blocks[move.dstBlockIndex]; - memRange.memory = pDstBlock->GetDeviceMemory(); - memRange.offset = VmaAlignDown(move.dstOffset, nonCoherentAtomSize); - memRange.size = VMA_MIN( - VmaAlignUp(move.size + (move.dstOffset - memRange.offset), nonCoherentAtomSize), - pDstBlock->m_pMetadata->GetSize() - memRange.offset); - (*m_hAllocator->GetVulkanFunctions().vkFlushMappedMemoryRanges)(m_hAllocator->m_hDevice, 1, &memRange); - } - } - } - - // Go over all blocks in reverse order. Unmap those that were mapped just for defragmentation. - // Regardless of pCtx->res == VK_SUCCESS. 
- for(size_t blockIndex = blockCount; blockIndex--; ) - { - const BlockInfo& currBlockInfo = blockInfo[blockIndex]; - if((currBlockInfo.flags & BLOCK_FLAG_MAPPED_FOR_DEFRAGMENTATION) != 0) - { - VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; - pBlock->Unmap(m_hAllocator, 1); - } - } -} - -void VmaBlockVector::ApplyDefragmentationMovesGpu( - class VmaBlockVectorDefragmentationContext* pDefragCtx, - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkCommandBuffer commandBuffer) -{ - const size_t blockCount = m_Blocks.size(); - - pDefragCtx->blockContexts.resize(blockCount); - memset(pDefragCtx->blockContexts.data(), 0, blockCount * sizeof(VmaBlockDefragmentationContext)); - - // Go over all moves. Mark blocks that are used with BLOCK_FLAG_USED. - const size_t moveCount = moves.size(); - for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex) - { - const VmaDefragmentationMove& move = moves[moveIndex]; - - //if(move.type == VMA_ALLOCATION_TYPE_UNKNOWN) - { - // Old school move still require us to map the whole block - pDefragCtx->blockContexts[move.srcBlockIndex].flags |= VmaBlockDefragmentationContext::BLOCK_FLAG_USED; - pDefragCtx->blockContexts[move.dstBlockIndex].flags |= VmaBlockDefragmentationContext::BLOCK_FLAG_USED; - } - } - - VMA_ASSERT(pDefragCtx->res == VK_SUCCESS); - - // Go over all blocks. Create and bind buffer for whole block if necessary. 
- { - VkBufferCreateInfo bufCreateInfo; - VmaFillGpuDefragmentationBufferCreateInfo(bufCreateInfo); - - for(size_t blockIndex = 0; pDefragCtx->res == VK_SUCCESS && blockIndex < blockCount; ++blockIndex) - { - VmaBlockDefragmentationContext& currBlockCtx = pDefragCtx->blockContexts[blockIndex]; - VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; - if((currBlockCtx.flags & VmaBlockDefragmentationContext::BLOCK_FLAG_USED) != 0) - { - bufCreateInfo.size = pBlock->m_pMetadata->GetSize(); - pDefragCtx->res = (*m_hAllocator->GetVulkanFunctions().vkCreateBuffer)( - m_hAllocator->m_hDevice, &bufCreateInfo, m_hAllocator->GetAllocationCallbacks(), &currBlockCtx.hBuffer); - if(pDefragCtx->res == VK_SUCCESS) - { - pDefragCtx->res = (*m_hAllocator->GetVulkanFunctions().vkBindBufferMemory)( - m_hAllocator->m_hDevice, currBlockCtx.hBuffer, pBlock->GetDeviceMemory(), 0); - } - } - } - } - - // Go over all moves. Post data transfer commands to command buffer. - if(pDefragCtx->res == VK_SUCCESS) - { - for(size_t moveIndex = 0; moveIndex < moveCount; ++moveIndex) - { - const VmaDefragmentationMove& move = moves[moveIndex]; - - const VmaBlockDefragmentationContext& srcBlockCtx = pDefragCtx->blockContexts[move.srcBlockIndex]; - const VmaBlockDefragmentationContext& dstBlockCtx = pDefragCtx->blockContexts[move.dstBlockIndex]; - - VMA_ASSERT(srcBlockCtx.hBuffer && dstBlockCtx.hBuffer); - - VkBufferCopy region = { - move.srcOffset, - move.dstOffset, - move.size }; - (*m_hAllocator->GetVulkanFunctions().vkCmdCopyBuffer)( - commandBuffer, srcBlockCtx.hBuffer, dstBlockCtx.hBuffer, 1, ®ion); - } - } - - // Save buffers to defrag context for later destruction. 
- if(pDefragCtx->res == VK_SUCCESS && moveCount > 0) - { - pDefragCtx->res = VK_NOT_READY; - } -} - -void VmaBlockVector::FreeEmptyBlocks(VmaDefragmentationStats* pDefragmentationStats) -{ - for(size_t blockIndex = m_Blocks.size(); blockIndex--; ) - { - VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; - if(pBlock->m_pMetadata->IsEmpty()) - { - if(m_Blocks.size() > m_MinBlockCount) - { - if(pDefragmentationStats != VMA_NULL) - { - ++pDefragmentationStats->deviceMemoryBlocksFreed; - pDefragmentationStats->bytesFreed += pBlock->m_pMetadata->GetSize(); - } - - VmaVectorRemove(m_Blocks, blockIndex); - pBlock->Destroy(m_hAllocator); - vma_delete(m_hAllocator, pBlock); - } - else - { - break; - } - } - } - UpdateHasEmptyBlock(); -} - -void VmaBlockVector::UpdateHasEmptyBlock() -{ - m_HasEmptyBlock = false; - for(size_t index = 0, count = m_Blocks.size(); index < count; ++index) - { - VmaDeviceMemoryBlock* const pBlock = m_Blocks[index]; - if(pBlock->m_pMetadata->IsEmpty()) - { - m_HasEmptyBlock = true; - break; - } - } -} - -#if VMA_STATS_STRING_ENABLED - -void VmaBlockVector::PrintDetailedMap(class VmaJsonWriter& json) -{ - VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); - - json.BeginObject(); - - if(IsCustomPool()) - { - const char* poolName = m_hParentPool->GetName(); - if(poolName != VMA_NULL && poolName[0] != '\0') - { - json.WriteString("Name"); - json.WriteString(poolName); - } - - json.WriteString("MemoryTypeIndex"); - json.WriteNumber(m_MemoryTypeIndex); - - json.WriteString("BlockSize"); - json.WriteNumber(m_PreferredBlockSize); - - json.WriteString("BlockCount"); - json.BeginObject(true); - if(m_MinBlockCount > 0) - { - json.WriteString("Min"); - json.WriteNumber((uint64_t)m_MinBlockCount); - } - if(m_MaxBlockCount < SIZE_MAX) - { - json.WriteString("Max"); - json.WriteNumber((uint64_t)m_MaxBlockCount); - } - json.WriteString("Cur"); - json.WriteNumber((uint64_t)m_Blocks.size()); - json.EndObject(); - - if(m_FrameInUseCount > 0) - { - 
json.WriteString("FrameInUseCount"); - json.WriteNumber(m_FrameInUseCount); - } - - if(m_Algorithm != 0) - { - json.WriteString("Algorithm"); - json.WriteString(VmaAlgorithmToStr(m_Algorithm)); - } - } - else - { - json.WriteString("PreferredBlockSize"); - json.WriteNumber(m_PreferredBlockSize); - } - - json.WriteString("Blocks"); - json.BeginObject(); - for(size_t i = 0; i < m_Blocks.size(); ++i) - { - json.BeginString(); - json.ContinueString(m_Blocks[i]->GetId()); - json.EndString(); - - m_Blocks[i]->m_pMetadata->PrintDetailedMap(json); - } - json.EndObject(); - - json.EndObject(); -} - -#endif // #if VMA_STATS_STRING_ENABLED - -void VmaBlockVector::Defragment( - class VmaBlockVectorDefragmentationContext* pCtx, - VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags, - VkDeviceSize& maxCpuBytesToMove, uint32_t& maxCpuAllocationsToMove, - VkDeviceSize& maxGpuBytesToMove, uint32_t& maxGpuAllocationsToMove, - VkCommandBuffer commandBuffer) -{ - pCtx->res = VK_SUCCESS; - - const VkMemoryPropertyFlags memPropFlags = - m_hAllocator->m_MemProps.memoryTypes[m_MemoryTypeIndex].propertyFlags; - const bool isHostVisible = (memPropFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0; - - const bool canDefragmentOnCpu = maxCpuBytesToMove > 0 && maxCpuAllocationsToMove > 0 && - isHostVisible; - const bool canDefragmentOnGpu = maxGpuBytesToMove > 0 && maxGpuAllocationsToMove > 0 && - !IsCorruptionDetectionEnabled() && - ((1u << m_MemoryTypeIndex) & m_hAllocator->GetGpuDefragmentationMemoryTypeBits()) != 0; - - // There are options to defragment this memory type. - if(canDefragmentOnCpu || canDefragmentOnGpu) - { - bool defragmentOnGpu; - // There is only one option to defragment this memory type. - if(canDefragmentOnGpu != canDefragmentOnCpu) - { - defragmentOnGpu = canDefragmentOnGpu; - } - // Both options are available: Heuristics to choose the best one. 
- else - { - defragmentOnGpu = (memPropFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0 || - m_hAllocator->IsIntegratedGpu(); - } - - bool overlappingMoveSupported = !defragmentOnGpu; - - if(m_hAllocator->m_UseMutex) - { - if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL) - { - if(!m_Mutex.TryLockWrite()) - { - pCtx->res = VK_ERROR_INITIALIZATION_FAILED; - return; - } - } - else - { - m_Mutex.LockWrite(); - pCtx->mutexLocked = true; - } - } - - pCtx->Begin(overlappingMoveSupported, flags); - - // Defragment. - - const VkDeviceSize maxBytesToMove = defragmentOnGpu ? maxGpuBytesToMove : maxCpuBytesToMove; - const uint32_t maxAllocationsToMove = defragmentOnGpu ? maxGpuAllocationsToMove : maxCpuAllocationsToMove; - pCtx->res = pCtx->GetAlgorithm()->Defragment(pCtx->defragmentationMoves, maxBytesToMove, maxAllocationsToMove, flags); - - // Accumulate statistics. - if(pStats != VMA_NULL) - { - const VkDeviceSize bytesMoved = pCtx->GetAlgorithm()->GetBytesMoved(); - const uint32_t allocationsMoved = pCtx->GetAlgorithm()->GetAllocationsMoved(); - pStats->bytesMoved += bytesMoved; - pStats->allocationsMoved += allocationsMoved; - VMA_ASSERT(bytesMoved <= maxBytesToMove); - VMA_ASSERT(allocationsMoved <= maxAllocationsToMove); - if(defragmentOnGpu) - { - maxGpuBytesToMove -= bytesMoved; - maxGpuAllocationsToMove -= allocationsMoved; - } - else - { - maxCpuBytesToMove -= bytesMoved; - maxCpuAllocationsToMove -= allocationsMoved; - } - } - - if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL) - { - if(m_hAllocator->m_UseMutex) - m_Mutex.UnlockWrite(); - - if(pCtx->res >= VK_SUCCESS && !pCtx->defragmentationMoves.empty()) - pCtx->res = VK_NOT_READY; - - return; - } - - if(pCtx->res >= VK_SUCCESS) - { - if(defragmentOnGpu) - { - ApplyDefragmentationMovesGpu(pCtx, pCtx->defragmentationMoves, commandBuffer); - } - else - { - ApplyDefragmentationMovesCpu(pCtx, pCtx->defragmentationMoves); - } - } - } -} - -void VmaBlockVector::DefragmentationEnd( - class 
VmaBlockVectorDefragmentationContext* pCtx, - uint32_t flags, - VmaDefragmentationStats* pStats) -{ - if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL && m_hAllocator->m_UseMutex) - { - VMA_ASSERT(pCtx->mutexLocked == false); - - // Incremental defragmentation doesn't hold the lock, so when we enter here we don't actually have any - // lock protecting us. Since we mutate state here, we have to take the lock out now - m_Mutex.LockWrite(); - pCtx->mutexLocked = true; - } - - // If the mutex isn't locked we didn't do any work and there is nothing to delete. - if(pCtx->mutexLocked || !m_hAllocator->m_UseMutex) - { - // Destroy buffers. - for(size_t blockIndex = pCtx->blockContexts.size(); blockIndex--;) - { - VmaBlockDefragmentationContext &blockCtx = pCtx->blockContexts[blockIndex]; - if(blockCtx.hBuffer) - { - (*m_hAllocator->GetVulkanFunctions().vkDestroyBuffer)(m_hAllocator->m_hDevice, blockCtx.hBuffer, m_hAllocator->GetAllocationCallbacks()); - } - } - - if(pCtx->res >= VK_SUCCESS) - { - FreeEmptyBlocks(pStats); - } - } - - if(pCtx->mutexLocked) - { - VMA_ASSERT(m_hAllocator->m_UseMutex); - m_Mutex.UnlockWrite(); - } -} - -uint32_t VmaBlockVector::ProcessDefragmentations( - class VmaBlockVectorDefragmentationContext *pCtx, - VmaDefragmentationPassMoveInfo* pMove, uint32_t maxMoves) -{ - VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); - - const uint32_t moveCount = VMA_MIN(uint32_t(pCtx->defragmentationMoves.size()) - pCtx->defragmentationMovesProcessed, maxMoves); - - for(uint32_t i = 0; i < moveCount; ++ i) - { - VmaDefragmentationMove& move = pCtx->defragmentationMoves[pCtx->defragmentationMovesProcessed + i]; - - pMove->allocation = move.hAllocation; - pMove->memory = move.pDstBlock->GetDeviceMemory(); - pMove->offset = move.dstOffset; - - ++ pMove; - } - - pCtx->defragmentationMovesProcessed += moveCount; - - return moveCount; -} - -void VmaBlockVector::CommitDefragmentations( - class VmaBlockVectorDefragmentationContext *pCtx, - 
VmaDefragmentationStats* pStats) -{ - VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); - - for(uint32_t i = pCtx->defragmentationMovesCommitted; i < pCtx->defragmentationMovesProcessed; ++ i) - { - const VmaDefragmentationMove &move = pCtx->defragmentationMoves[i]; - - move.pSrcBlock->m_pMetadata->FreeAtOffset(move.srcOffset); - move.hAllocation->ChangeBlockAllocation(m_hAllocator, move.pDstBlock, move.dstOffset); - } - - pCtx->defragmentationMovesCommitted = pCtx->defragmentationMovesProcessed; - FreeEmptyBlocks(pStats); -} - -size_t VmaBlockVector::CalcAllocationCount() const -{ - size_t result = 0; - for(size_t i = 0; i < m_Blocks.size(); ++i) - { - result += m_Blocks[i]->m_pMetadata->GetAllocationCount(); - } - return result; -} - -bool VmaBlockVector::IsBufferImageGranularityConflictPossible() const -{ - if(m_BufferImageGranularity == 1) - { - return false; - } - VmaSuballocationType lastSuballocType = VMA_SUBALLOCATION_TYPE_FREE; - for(size_t i = 0, count = m_Blocks.size(); i < count; ++i) - { - VmaDeviceMemoryBlock* const pBlock = m_Blocks[i]; - VMA_ASSERT(m_Algorithm == 0); - VmaBlockMetadata_Generic* const pMetadata = (VmaBlockMetadata_Generic*)pBlock->m_pMetadata; - if(pMetadata->IsBufferImageGranularityConflictPossible(m_BufferImageGranularity, lastSuballocType)) - { - return true; - } - } - return false; -} - -void VmaBlockVector::MakePoolAllocationsLost( - uint32_t currentFrameIndex, - size_t* pLostAllocationCount) -{ - VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); - size_t lostAllocationCount = 0; - for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex) - { - VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pBlock); - lostAllocationCount += pBlock->m_pMetadata->MakeAllocationsLost(currentFrameIndex, m_FrameInUseCount); - } - if(pLostAllocationCount != VMA_NULL) - { - *pLostAllocationCount = lostAllocationCount; - } -} - -VkResult VmaBlockVector::CheckCorruption() -{ - 
if(!IsCorruptionDetectionEnabled()) - { - return VK_ERROR_FEATURE_NOT_PRESENT; - } - - VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); - for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex) - { - VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pBlock); - VkResult res = pBlock->CheckCorruption(m_hAllocator); - if(res != VK_SUCCESS) - { - return res; - } - } - return VK_SUCCESS; -} - -void VmaBlockVector::AddStats(VmaStats* pStats) -{ - const uint32_t memTypeIndex = m_MemoryTypeIndex; - const uint32_t memHeapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(memTypeIndex); - - VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); - - for(uint32_t blockIndex = 0; blockIndex < m_Blocks.size(); ++blockIndex) - { - const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; - VMA_ASSERT(pBlock); - VMA_HEAVY_ASSERT(pBlock->Validate()); - VmaStatInfo allocationStatInfo; - pBlock->m_pMetadata->CalcAllocationStatInfo(allocationStatInfo); - VmaAddStatInfo(pStats->total, allocationStatInfo); - VmaAddStatInfo(pStats->memoryType[memTypeIndex], allocationStatInfo); - VmaAddStatInfo(pStats->memoryHeap[memHeapIndex], allocationStatInfo); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaDefragmentationAlgorithm_Generic members definition - -VmaDefragmentationAlgorithm_Generic::VmaDefragmentationAlgorithm_Generic( - VmaAllocator hAllocator, - VmaBlockVector* pBlockVector, - uint32_t currentFrameIndex, - bool overlappingMoveSupported) : - VmaDefragmentationAlgorithm(hAllocator, pBlockVector, currentFrameIndex), - m_AllocationCount(0), - m_AllAllocations(false), - m_BytesMoved(0), - m_AllocationsMoved(0), - m_Blocks(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) -{ - // Create block info for each block. 
- const size_t blockCount = m_pBlockVector->m_Blocks.size(); - for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) - { - BlockInfo* pBlockInfo = vma_new(m_hAllocator, BlockInfo)(m_hAllocator->GetAllocationCallbacks()); - pBlockInfo->m_OriginalBlockIndex = blockIndex; - pBlockInfo->m_pBlock = m_pBlockVector->m_Blocks[blockIndex]; - m_Blocks.push_back(pBlockInfo); - } - - // Sort them by m_pBlock pointer value. - VMA_SORT(m_Blocks.begin(), m_Blocks.end(), BlockPointerLess()); -} - -VmaDefragmentationAlgorithm_Generic::~VmaDefragmentationAlgorithm_Generic() -{ - for(size_t i = m_Blocks.size(); i--; ) - { - vma_delete(m_hAllocator, m_Blocks[i]); - } -} - -void VmaDefragmentationAlgorithm_Generic::AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) -{ - // Now as we are inside VmaBlockVector::m_Mutex, we can make final check if this allocation was not lost. - if(hAlloc->GetLastUseFrameIndex() != VMA_FRAME_INDEX_LOST) - { - VmaDeviceMemoryBlock* pBlock = hAlloc->GetBlock(); - BlockInfoVector::iterator it = VmaBinaryFindFirstNotLess(m_Blocks.begin(), m_Blocks.end(), pBlock, BlockPointerLess()); - if(it != m_Blocks.end() && (*it)->m_pBlock == pBlock) - { - AllocationInfo allocInfo = AllocationInfo(hAlloc, pChanged); - (*it)->m_Allocations.push_back(allocInfo); - } - else - { - VMA_ASSERT(0); - } - - ++m_AllocationCount; - } -} - -VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - bool freeOldAllocations) -{ - if(m_Blocks.empty()) - { - return VK_SUCCESS; - } - - // This is a choice based on research. 
- // Option 1: - uint32_t strategy = VMA_ALLOCATION_CREATE_STRATEGY_MIN_TIME_BIT; - // Option 2: - //uint32_t strategy = VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT; - // Option 3: - //uint32_t strategy = VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT; - - size_t srcBlockMinIndex = 0; - // When FAST_ALGORITHM, move allocations from only last out of blocks that contain non-movable allocations. - /* - if(m_AlgorithmFlags & VMA_DEFRAGMENTATION_FAST_ALGORITHM_BIT) - { - const size_t blocksWithNonMovableCount = CalcBlocksWithNonMovableCount(); - if(blocksWithNonMovableCount > 0) - { - srcBlockMinIndex = blocksWithNonMovableCount - 1; - } - } - */ - - size_t srcBlockIndex = m_Blocks.size() - 1; - size_t srcAllocIndex = SIZE_MAX; - for(;;) - { - // 1. Find next allocation to move. - // 1.1. Start from last to first m_Blocks - they are sorted from most "destination" to most "source". - // 1.2. Then start from last to first m_Allocations. - while(srcAllocIndex >= m_Blocks[srcBlockIndex]->m_Allocations.size()) - { - if(m_Blocks[srcBlockIndex]->m_Allocations.empty()) - { - // Finished: no more allocations to process. - if(srcBlockIndex == srcBlockMinIndex) - { - return VK_SUCCESS; - } - else - { - --srcBlockIndex; - srcAllocIndex = SIZE_MAX; - } - } - else - { - srcAllocIndex = m_Blocks[srcBlockIndex]->m_Allocations.size() - 1; - } - } - - BlockInfo* pSrcBlockInfo = m_Blocks[srcBlockIndex]; - AllocationInfo& allocInfo = pSrcBlockInfo->m_Allocations[srcAllocIndex]; - - const VkDeviceSize size = allocInfo.m_hAllocation->GetSize(); - const VkDeviceSize srcOffset = allocInfo.m_hAllocation->GetOffset(); - const VkDeviceSize alignment = allocInfo.m_hAllocation->GetAlignment(); - const VmaSuballocationType suballocType = allocInfo.m_hAllocation->GetSuballocationType(); - - // 2. Try to find new place for this allocation in preceding or current block. 
- for(size_t dstBlockIndex = 0; dstBlockIndex <= srcBlockIndex; ++dstBlockIndex) - { - BlockInfo* pDstBlockInfo = m_Blocks[dstBlockIndex]; - VmaAllocationRequest dstAllocRequest; - if(pDstBlockInfo->m_pBlock->m_pMetadata->CreateAllocationRequest( - m_CurrentFrameIndex, - m_pBlockVector->GetFrameInUseCount(), - m_pBlockVector->GetBufferImageGranularity(), - size, - alignment, - false, // upperAddress - suballocType, - false, // canMakeOtherLost - strategy, - &dstAllocRequest) && - MoveMakesSense( - dstBlockIndex, dstAllocRequest.offset, srcBlockIndex, srcOffset)) - { - VMA_ASSERT(dstAllocRequest.itemsToMakeLostCount == 0); - - // Reached limit on number of allocations or bytes to move. - if((m_AllocationsMoved + 1 > maxAllocationsToMove) || - (m_BytesMoved + size > maxBytesToMove)) - { - return VK_SUCCESS; - } - - VmaDefragmentationMove move = {}; - move.srcBlockIndex = pSrcBlockInfo->m_OriginalBlockIndex; - move.dstBlockIndex = pDstBlockInfo->m_OriginalBlockIndex; - move.srcOffset = srcOffset; - move.dstOffset = dstAllocRequest.offset; - move.size = size; - move.hAllocation = allocInfo.m_hAllocation; - move.pSrcBlock = pSrcBlockInfo->m_pBlock; - move.pDstBlock = pDstBlockInfo->m_pBlock; - - moves.push_back(move); - - pDstBlockInfo->m_pBlock->m_pMetadata->Alloc( - dstAllocRequest, - suballocType, - size, - allocInfo.m_hAllocation); - - if(freeOldAllocations) - { - pSrcBlockInfo->m_pBlock->m_pMetadata->FreeAtOffset(srcOffset); - allocInfo.m_hAllocation->ChangeBlockAllocation(m_hAllocator, pDstBlockInfo->m_pBlock, dstAllocRequest.offset); - } - - if(allocInfo.m_pChanged != VMA_NULL) - { - *allocInfo.m_pChanged = VK_TRUE; - } - - ++m_AllocationsMoved; - m_BytesMoved += size; - - VmaVectorRemove(pSrcBlockInfo->m_Allocations, srcAllocIndex); - - break; - } - } - - // If not processed, this allocInfo remains in pBlockInfo->m_Allocations for next round. 
- - if(srcAllocIndex > 0) - { - --srcAllocIndex; - } - else - { - if(srcBlockIndex > 0) - { - --srcBlockIndex; - srcAllocIndex = SIZE_MAX; - } - else - { - return VK_SUCCESS; - } - } - } -} - -size_t VmaDefragmentationAlgorithm_Generic::CalcBlocksWithNonMovableCount() const -{ - size_t result = 0; - for(size_t i = 0; i < m_Blocks.size(); ++i) - { - if(m_Blocks[i]->m_HasNonMovableAllocations) - { - ++result; - } - } - return result; -} - -VkResult VmaDefragmentationAlgorithm_Generic::Defragment( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - VmaDefragmentationFlags flags) -{ - if(!m_AllAllocations && m_AllocationCount == 0) - { - return VK_SUCCESS; - } - - const size_t blockCount = m_Blocks.size(); - for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) - { - BlockInfo* pBlockInfo = m_Blocks[blockIndex]; - - if(m_AllAllocations) - { - VmaBlockMetadata_Generic* pMetadata = (VmaBlockMetadata_Generic*)pBlockInfo->m_pBlock->m_pMetadata; - for(VmaSuballocationList::const_iterator it = pMetadata->m_Suballocations.begin(); - it != pMetadata->m_Suballocations.end(); - ++it) - { - if(it->type != VMA_SUBALLOCATION_TYPE_FREE) - { - AllocationInfo allocInfo = AllocationInfo(it->hAllocation, VMA_NULL); - pBlockInfo->m_Allocations.push_back(allocInfo); - } - } - } - - pBlockInfo->CalcHasNonMovableAllocations(); - - // This is a choice based on research. - // Option 1: - pBlockInfo->SortAllocationsByOffsetDescending(); - // Option 2: - //pBlockInfo->SortAllocationsBySizeDescending(); - } - - // Sort m_Blocks this time by the main criterium, from most "destination" to most "source" blocks. - VMA_SORT(m_Blocks.begin(), m_Blocks.end(), BlockInfoCompareMoveDestination()); - - // This is a choice based on research. - const uint32_t roundCount = 2; - - // Execute defragmentation rounds (the main part). 
- VkResult result = VK_SUCCESS; - for(uint32_t round = 0; (round < roundCount) && (result == VK_SUCCESS); ++round) - { - result = DefragmentRound(moves, maxBytesToMove, maxAllocationsToMove, !(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)); - } - - return result; -} - -bool VmaDefragmentationAlgorithm_Generic::MoveMakesSense( - size_t dstBlockIndex, VkDeviceSize dstOffset, - size_t srcBlockIndex, VkDeviceSize srcOffset) -{ - if(dstBlockIndex < srcBlockIndex) - { - return true; - } - if(dstBlockIndex > srcBlockIndex) - { - return false; - } - if(dstOffset < srcOffset) - { - return true; - } - return false; -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaDefragmentationAlgorithm_Fast - -VmaDefragmentationAlgorithm_Fast::VmaDefragmentationAlgorithm_Fast( - VmaAllocator hAllocator, - VmaBlockVector* pBlockVector, - uint32_t currentFrameIndex, - bool overlappingMoveSupported) : - VmaDefragmentationAlgorithm(hAllocator, pBlockVector, currentFrameIndex), - m_OverlappingMoveSupported(overlappingMoveSupported), - m_AllocationCount(0), - m_AllAllocations(false), - m_BytesMoved(0), - m_AllocationsMoved(0), - m_BlockInfos(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) -{ - VMA_ASSERT(VMA_DEBUG_MARGIN == 0); - -} - -VmaDefragmentationAlgorithm_Fast::~VmaDefragmentationAlgorithm_Fast() -{ -} - -VkResult VmaDefragmentationAlgorithm_Fast::Defragment( - VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, - VkDeviceSize maxBytesToMove, - uint32_t maxAllocationsToMove, - VmaDefragmentationFlags flags) -{ - VMA_ASSERT(m_AllAllocations || m_pBlockVector->CalcAllocationCount() == m_AllocationCount); - - const size_t blockCount = m_pBlockVector->GetBlockCount(); - if(blockCount == 0 || maxBytesToMove == 0 || maxAllocationsToMove == 0) - { - return VK_SUCCESS; - } - - PreprocessMetadata(); - - // Sort blocks in order from most destination. 
- - m_BlockInfos.resize(blockCount); - for(size_t i = 0; i < blockCount; ++i) - { - m_BlockInfos[i].origBlockIndex = i; - } - - VMA_SORT(m_BlockInfos.begin(), m_BlockInfos.end(), [this](const BlockInfo& lhs, const BlockInfo& rhs) -> bool { - return m_pBlockVector->GetBlock(lhs.origBlockIndex)->m_pMetadata->GetSumFreeSize() < - m_pBlockVector->GetBlock(rhs.origBlockIndex)->m_pMetadata->GetSumFreeSize(); - }); - - // THE MAIN ALGORITHM - - FreeSpaceDatabase freeSpaceDb; - - size_t dstBlockInfoIndex = 0; - size_t dstOrigBlockIndex = m_BlockInfos[dstBlockInfoIndex].origBlockIndex; - VmaDeviceMemoryBlock* pDstBlock = m_pBlockVector->GetBlock(dstOrigBlockIndex); - VmaBlockMetadata_Generic* pDstMetadata = (VmaBlockMetadata_Generic*)pDstBlock->m_pMetadata; - VkDeviceSize dstBlockSize = pDstMetadata->GetSize(); - VkDeviceSize dstOffset = 0; - - bool end = false; - for(size_t srcBlockInfoIndex = 0; !end && srcBlockInfoIndex < blockCount; ++srcBlockInfoIndex) - { - const size_t srcOrigBlockIndex = m_BlockInfos[srcBlockInfoIndex].origBlockIndex; - VmaDeviceMemoryBlock* const pSrcBlock = m_pBlockVector->GetBlock(srcOrigBlockIndex); - VmaBlockMetadata_Generic* const pSrcMetadata = (VmaBlockMetadata_Generic*)pSrcBlock->m_pMetadata; - for(VmaSuballocationList::iterator srcSuballocIt = pSrcMetadata->m_Suballocations.begin(); - !end && srcSuballocIt != pSrcMetadata->m_Suballocations.end(); ) - { - VmaAllocation_T* const pAlloc = srcSuballocIt->hAllocation; - const VkDeviceSize srcAllocAlignment = pAlloc->GetAlignment(); - const VkDeviceSize srcAllocSize = srcSuballocIt->size; - if(m_AllocationsMoved == maxAllocationsToMove || - m_BytesMoved + srcAllocSize > maxBytesToMove) - { - end = true; - break; - } - const VkDeviceSize srcAllocOffset = srcSuballocIt->offset; - - VmaDefragmentationMove move = {}; - // Try to place it in one of free spaces from the database. 
- size_t freeSpaceInfoIndex; - VkDeviceSize dstAllocOffset; - if(freeSpaceDb.Fetch(srcAllocAlignment, srcAllocSize, - freeSpaceInfoIndex, dstAllocOffset)) - { - size_t freeSpaceOrigBlockIndex = m_BlockInfos[freeSpaceInfoIndex].origBlockIndex; - VmaDeviceMemoryBlock* pFreeSpaceBlock = m_pBlockVector->GetBlock(freeSpaceOrigBlockIndex); - VmaBlockMetadata_Generic* pFreeSpaceMetadata = (VmaBlockMetadata_Generic*)pFreeSpaceBlock->m_pMetadata; - - // Same block - if(freeSpaceInfoIndex == srcBlockInfoIndex) - { - VMA_ASSERT(dstAllocOffset <= srcAllocOffset); - - // MOVE OPTION 1: Move the allocation inside the same block by decreasing offset. - - VmaSuballocation suballoc = *srcSuballocIt; - suballoc.offset = dstAllocOffset; - suballoc.hAllocation->ChangeOffset(dstAllocOffset); - m_BytesMoved += srcAllocSize; - ++m_AllocationsMoved; - - VmaSuballocationList::iterator nextSuballocIt = srcSuballocIt; - ++nextSuballocIt; - pSrcMetadata->m_Suballocations.erase(srcSuballocIt); - srcSuballocIt = nextSuballocIt; - - InsertSuballoc(pFreeSpaceMetadata, suballoc); - - move.srcBlockIndex = srcOrigBlockIndex; - move.dstBlockIndex = freeSpaceOrigBlockIndex; - move.srcOffset = srcAllocOffset; - move.dstOffset = dstAllocOffset; - move.size = srcAllocSize; - - moves.push_back(move); - } - // Different block - else - { - // MOVE OPTION 2: Move the allocation to a different block. 
- - VMA_ASSERT(freeSpaceInfoIndex < srcBlockInfoIndex); - - VmaSuballocation suballoc = *srcSuballocIt; - suballoc.offset = dstAllocOffset; - suballoc.hAllocation->ChangeBlockAllocation(m_hAllocator, pFreeSpaceBlock, dstAllocOffset); - m_BytesMoved += srcAllocSize; - ++m_AllocationsMoved; - - VmaSuballocationList::iterator nextSuballocIt = srcSuballocIt; - ++nextSuballocIt; - pSrcMetadata->m_Suballocations.erase(srcSuballocIt); - srcSuballocIt = nextSuballocIt; - - InsertSuballoc(pFreeSpaceMetadata, suballoc); - - move.srcBlockIndex = srcOrigBlockIndex; - move.dstBlockIndex = freeSpaceOrigBlockIndex; - move.srcOffset = srcAllocOffset; - move.dstOffset = dstAllocOffset; - move.size = srcAllocSize; - - moves.push_back(move); - } - } - else - { - dstAllocOffset = VmaAlignUp(dstOffset, srcAllocAlignment); - - // If the allocation doesn't fit before the end of dstBlock, forward to next block. - while(dstBlockInfoIndex < srcBlockInfoIndex && - dstAllocOffset + srcAllocSize > dstBlockSize) - { - // But before that, register remaining free space at the end of dst block. - freeSpaceDb.Register(dstBlockInfoIndex, dstOffset, dstBlockSize - dstOffset); - - ++dstBlockInfoIndex; - dstOrigBlockIndex = m_BlockInfos[dstBlockInfoIndex].origBlockIndex; - pDstBlock = m_pBlockVector->GetBlock(dstOrigBlockIndex); - pDstMetadata = (VmaBlockMetadata_Generic*)pDstBlock->m_pMetadata; - dstBlockSize = pDstMetadata->GetSize(); - dstOffset = 0; - dstAllocOffset = 0; - } - - // Same block - if(dstBlockInfoIndex == srcBlockInfoIndex) - { - VMA_ASSERT(dstAllocOffset <= srcAllocOffset); - - const bool overlap = dstAllocOffset + srcAllocSize > srcAllocOffset; - - bool skipOver = overlap; - if(overlap && m_OverlappingMoveSupported && dstAllocOffset < srcAllocOffset) - { - // If destination and source place overlap, skip if it would move it - // by only < 1/64 of its size. 
- skipOver = (srcAllocOffset - dstAllocOffset) * 64 < srcAllocSize; - } - - if(skipOver) - { - freeSpaceDb.Register(dstBlockInfoIndex, dstOffset, srcAllocOffset - dstOffset); - - dstOffset = srcAllocOffset + srcAllocSize; - ++srcSuballocIt; - } - // MOVE OPTION 1: Move the allocation inside the same block by decreasing offset. - else - { - srcSuballocIt->offset = dstAllocOffset; - srcSuballocIt->hAllocation->ChangeOffset(dstAllocOffset); - dstOffset = dstAllocOffset + srcAllocSize; - m_BytesMoved += srcAllocSize; - ++m_AllocationsMoved; - ++srcSuballocIt; - - move.srcBlockIndex = srcOrigBlockIndex; - move.dstBlockIndex = dstOrigBlockIndex; - move.srcOffset = srcAllocOffset; - move.dstOffset = dstAllocOffset; - move.size = srcAllocSize; - - moves.push_back(move); - } - } - // Different block - else - { - // MOVE OPTION 2: Move the allocation to a different block. - - VMA_ASSERT(dstBlockInfoIndex < srcBlockInfoIndex); - VMA_ASSERT(dstAllocOffset + srcAllocSize <= dstBlockSize); - - VmaSuballocation suballoc = *srcSuballocIt; - suballoc.offset = dstAllocOffset; - suballoc.hAllocation->ChangeBlockAllocation(m_hAllocator, pDstBlock, dstAllocOffset); - dstOffset = dstAllocOffset + srcAllocSize; - m_BytesMoved += srcAllocSize; - ++m_AllocationsMoved; - - VmaSuballocationList::iterator nextSuballocIt = srcSuballocIt; - ++nextSuballocIt; - pSrcMetadata->m_Suballocations.erase(srcSuballocIt); - srcSuballocIt = nextSuballocIt; - - pDstMetadata->m_Suballocations.push_back(suballoc); - - move.srcBlockIndex = srcOrigBlockIndex; - move.dstBlockIndex = dstOrigBlockIndex; - move.srcOffset = srcAllocOffset; - move.dstOffset = dstAllocOffset; - move.size = srcAllocSize; - - moves.push_back(move); - } - } - } - } - - m_BlockInfos.clear(); - - PostprocessMetadata(); - - return VK_SUCCESS; -} - -void VmaDefragmentationAlgorithm_Fast::PreprocessMetadata() -{ - const size_t blockCount = m_pBlockVector->GetBlockCount(); - for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) - 
{ - VmaBlockMetadata_Generic* const pMetadata = - (VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata; - pMetadata->m_FreeCount = 0; - pMetadata->m_SumFreeSize = pMetadata->GetSize(); - pMetadata->m_FreeSuballocationsBySize.clear(); - for(VmaSuballocationList::iterator it = pMetadata->m_Suballocations.begin(); - it != pMetadata->m_Suballocations.end(); ) - { - if(it->type == VMA_SUBALLOCATION_TYPE_FREE) - { - VmaSuballocationList::iterator nextIt = it; - ++nextIt; - pMetadata->m_Suballocations.erase(it); - it = nextIt; - } - else - { - ++it; - } - } - } -} - -void VmaDefragmentationAlgorithm_Fast::PostprocessMetadata() -{ - const size_t blockCount = m_pBlockVector->GetBlockCount(); - for(size_t blockIndex = 0; blockIndex < blockCount; ++blockIndex) - { - VmaBlockMetadata_Generic* const pMetadata = - (VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata; - const VkDeviceSize blockSize = pMetadata->GetSize(); - - // No allocations in this block - entire area is free. - if(pMetadata->m_Suballocations.empty()) - { - pMetadata->m_FreeCount = 1; - //pMetadata->m_SumFreeSize is already set to blockSize. - VmaSuballocation suballoc = { - 0, // offset - blockSize, // size - VMA_NULL, // hAllocation - VMA_SUBALLOCATION_TYPE_FREE }; - pMetadata->m_Suballocations.push_back(suballoc); - pMetadata->RegisterFreeSuballocation(pMetadata->m_Suballocations.begin()); - } - // There are some allocations in this block. - else - { - VkDeviceSize offset = 0; - VmaSuballocationList::iterator it; - for(it = pMetadata->m_Suballocations.begin(); - it != pMetadata->m_Suballocations.end(); - ++it) - { - VMA_ASSERT(it->type != VMA_SUBALLOCATION_TYPE_FREE); - VMA_ASSERT(it->offset >= offset); - - // Need to insert preceding free space. 
- if(it->offset > offset) - { - ++pMetadata->m_FreeCount; - const VkDeviceSize freeSize = it->offset - offset; - VmaSuballocation suballoc = { - offset, // offset - freeSize, // size - VMA_NULL, // hAllocation - VMA_SUBALLOCATION_TYPE_FREE }; - VmaSuballocationList::iterator precedingFreeIt = pMetadata->m_Suballocations.insert(it, suballoc); - if(freeSize >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - pMetadata->m_FreeSuballocationsBySize.push_back(precedingFreeIt); - } - } - - pMetadata->m_SumFreeSize -= it->size; - offset = it->offset + it->size; - } - - // Need to insert trailing free space. - if(offset < blockSize) - { - ++pMetadata->m_FreeCount; - const VkDeviceSize freeSize = blockSize - offset; - VmaSuballocation suballoc = { - offset, // offset - freeSize, // size - VMA_NULL, // hAllocation - VMA_SUBALLOCATION_TYPE_FREE }; - VMA_ASSERT(it == pMetadata->m_Suballocations.end()); - VmaSuballocationList::iterator trailingFreeIt = pMetadata->m_Suballocations.insert(it, suballoc); - if(freeSize > VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) - { - pMetadata->m_FreeSuballocationsBySize.push_back(trailingFreeIt); - } - } - - VMA_SORT( - pMetadata->m_FreeSuballocationsBySize.begin(), - pMetadata->m_FreeSuballocationsBySize.end(), - VmaSuballocationItemSizeLess()); - } - - VMA_HEAVY_ASSERT(pMetadata->Validate()); - } -} - -void VmaDefragmentationAlgorithm_Fast::InsertSuballoc(VmaBlockMetadata_Generic* pMetadata, const VmaSuballocation& suballoc) -{ - // TODO: Optimize somehow. Remember iterator instead of searching for it linearly. 
- VmaSuballocationList::iterator it = pMetadata->m_Suballocations.begin(); - while(it != pMetadata->m_Suballocations.end()) - { - if(it->offset < suballoc.offset) - { - ++it; - } - } - pMetadata->m_Suballocations.insert(it, suballoc); -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaBlockVectorDefragmentationContext - -VmaBlockVectorDefragmentationContext::VmaBlockVectorDefragmentationContext( - VmaAllocator hAllocator, - VmaPool hCustomPool, - VmaBlockVector* pBlockVector, - uint32_t currFrameIndex) : - res(VK_SUCCESS), - mutexLocked(false), - blockContexts(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - defragmentationMoves(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - defragmentationMovesProcessed(0), - defragmentationMovesCommitted(0), - hasDefragmentationPlan(0), - m_hAllocator(hAllocator), - m_hCustomPool(hCustomPool), - m_pBlockVector(pBlockVector), - m_CurrFrameIndex(currFrameIndex), - m_pAlgorithm(VMA_NULL), - m_Allocations(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), - m_AllAllocations(false) -{ -} - -VmaBlockVectorDefragmentationContext::~VmaBlockVectorDefragmentationContext() -{ - vma_delete(m_hAllocator, m_pAlgorithm); -} - -void VmaBlockVectorDefragmentationContext::AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) -{ - AllocInfo info = { hAlloc, pChanged }; - m_Allocations.push_back(info); -} - -void VmaBlockVectorDefragmentationContext::Begin(bool overlappingMoveSupported, VmaDefragmentationFlags flags) -{ - const bool allAllocations = m_AllAllocations || - m_Allocations.size() == m_pBlockVector->CalcAllocationCount(); - - /******************************** - HERE IS THE CHOICE OF DEFRAGMENTATION ALGORITHM. - ********************************/ - - /* - Fast algorithm is supported only when certain criteria are met: - - VMA_DEBUG_MARGIN is 0. - - All allocations in this block vector are moveable. - - There is no possibility of image/buffer granularity conflict. 
- - The defragmentation is not incremental - */ - if(VMA_DEBUG_MARGIN == 0 && - allAllocations && - !m_pBlockVector->IsBufferImageGranularityConflictPossible() && - !(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)) - { - m_pAlgorithm = vma_new(m_hAllocator, VmaDefragmentationAlgorithm_Fast)( - m_hAllocator, m_pBlockVector, m_CurrFrameIndex, overlappingMoveSupported); - } - else - { - m_pAlgorithm = vma_new(m_hAllocator, VmaDefragmentationAlgorithm_Generic)( - m_hAllocator, m_pBlockVector, m_CurrFrameIndex, overlappingMoveSupported); - } - - if(allAllocations) - { - m_pAlgorithm->AddAll(); - } - else - { - for(size_t i = 0, count = m_Allocations.size(); i < count; ++i) - { - m_pAlgorithm->AddAllocation(m_Allocations[i].hAlloc, m_Allocations[i].pChanged); - } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaDefragmentationContext - -VmaDefragmentationContext_T::VmaDefragmentationContext_T( - VmaAllocator hAllocator, - uint32_t currFrameIndex, - uint32_t flags, - VmaDefragmentationStats* pStats) : - m_hAllocator(hAllocator), - m_CurrFrameIndex(currFrameIndex), - m_Flags(flags), - m_pStats(pStats), - m_CustomPoolContexts(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) -{ - memset(m_DefaultPoolContexts, 0, sizeof(m_DefaultPoolContexts)); -} - -VmaDefragmentationContext_T::~VmaDefragmentationContext_T() -{ - for(size_t i = m_CustomPoolContexts.size(); i--; ) - { - VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_CustomPoolContexts[i]; - pBlockVectorCtx->GetBlockVector()->DefragmentationEnd(pBlockVectorCtx, m_Flags, m_pStats); - vma_delete(m_hAllocator, pBlockVectorCtx); - } - for(size_t i = m_hAllocator->m_MemProps.memoryTypeCount; i--; ) - { - VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_DefaultPoolContexts[i]; - if(pBlockVectorCtx) - { - pBlockVectorCtx->GetBlockVector()->DefragmentationEnd(pBlockVectorCtx, m_Flags, m_pStats); - vma_delete(m_hAllocator, pBlockVectorCtx); - } - } -} - -void 
VmaDefragmentationContext_T::AddPools(uint32_t poolCount, const VmaPool* pPools) -{ - for(uint32_t poolIndex = 0; poolIndex < poolCount; ++poolIndex) - { - VmaPool pool = pPools[poolIndex]; - VMA_ASSERT(pool); - // Pools with algorithm other than default are not defragmented. - if(pool->m_BlockVector.GetAlgorithm() == 0) - { - VmaBlockVectorDefragmentationContext* pBlockVectorDefragCtx = VMA_NULL; - - for(size_t i = m_CustomPoolContexts.size(); i--; ) - { - if(m_CustomPoolContexts[i]->GetCustomPool() == pool) - { - pBlockVectorDefragCtx = m_CustomPoolContexts[i]; - break; - } - } - - if(!pBlockVectorDefragCtx) - { - pBlockVectorDefragCtx = vma_new(m_hAllocator, VmaBlockVectorDefragmentationContext)( - m_hAllocator, - pool, - &pool->m_BlockVector, - m_CurrFrameIndex); - m_CustomPoolContexts.push_back(pBlockVectorDefragCtx); - } - - pBlockVectorDefragCtx->AddAll(); - } - } -} - -void VmaDefragmentationContext_T::AddAllocations( - uint32_t allocationCount, - const VmaAllocation* pAllocations, - VkBool32* pAllocationsChanged) -{ - // Dispatch pAllocations among defragmentators. Create them when necessary. - for(uint32_t allocIndex = 0; allocIndex < allocationCount; ++allocIndex) - { - const VmaAllocation hAlloc = pAllocations[allocIndex]; - VMA_ASSERT(hAlloc); - // DedicatedAlloc cannot be defragmented. - if((hAlloc->GetType() == VmaAllocation_T::ALLOCATION_TYPE_BLOCK) && - // Lost allocation cannot be defragmented. - (hAlloc->GetLastUseFrameIndex() != VMA_FRAME_INDEX_LOST)) - { - VmaBlockVectorDefragmentationContext* pBlockVectorDefragCtx = VMA_NULL; - - const VmaPool hAllocPool = hAlloc->GetBlock()->GetParentPool(); - // This allocation belongs to custom pool. - if(hAllocPool != VK_NULL_HANDLE) - { - // Pools with algorithm other than default are not defragmented. 
- if(hAllocPool->m_BlockVector.GetAlgorithm() == 0) - { - for(size_t i = m_CustomPoolContexts.size(); i--; ) - { - if(m_CustomPoolContexts[i]->GetCustomPool() == hAllocPool) - { - pBlockVectorDefragCtx = m_CustomPoolContexts[i]; - break; - } - } - if(!pBlockVectorDefragCtx) - { - pBlockVectorDefragCtx = vma_new(m_hAllocator, VmaBlockVectorDefragmentationContext)( - m_hAllocator, - hAllocPool, - &hAllocPool->m_BlockVector, - m_CurrFrameIndex); - m_CustomPoolContexts.push_back(pBlockVectorDefragCtx); - } - } - } - // This allocation belongs to default pool. - else - { - const uint32_t memTypeIndex = hAlloc->GetMemoryTypeIndex(); - pBlockVectorDefragCtx = m_DefaultPoolContexts[memTypeIndex]; - if(!pBlockVectorDefragCtx) - { - pBlockVectorDefragCtx = vma_new(m_hAllocator, VmaBlockVectorDefragmentationContext)( - m_hAllocator, - VMA_NULL, // hCustomPool - m_hAllocator->m_pBlockVectors[memTypeIndex], - m_CurrFrameIndex); - m_DefaultPoolContexts[memTypeIndex] = pBlockVectorDefragCtx; - } - } - - if(pBlockVectorDefragCtx) - { - VkBool32* const pChanged = (pAllocationsChanged != VMA_NULL) ? 
- &pAllocationsChanged[allocIndex] : VMA_NULL; - pBlockVectorDefragCtx->AddAllocation(hAlloc, pChanged); - } - } - } -} - -VkResult VmaDefragmentationContext_T::Defragment( - VkDeviceSize maxCpuBytesToMove, uint32_t maxCpuAllocationsToMove, - VkDeviceSize maxGpuBytesToMove, uint32_t maxGpuAllocationsToMove, - VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags) -{ - if(pStats) - { - memset(pStats, 0, sizeof(VmaDefragmentationStats)); - } - - if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL) - { - // For incremental defragmetnations, we just earmark how much we can move - // The real meat is in the defragmentation steps - m_MaxCpuBytesToMove = maxCpuBytesToMove; - m_MaxCpuAllocationsToMove = maxCpuAllocationsToMove; - - m_MaxGpuBytesToMove = maxGpuBytesToMove; - m_MaxGpuAllocationsToMove = maxGpuAllocationsToMove; - - if(m_MaxCpuBytesToMove == 0 && m_MaxCpuAllocationsToMove == 0 && - m_MaxGpuBytesToMove == 0 && m_MaxGpuAllocationsToMove == 0) - return VK_SUCCESS; - - return VK_NOT_READY; - } - - if(commandBuffer == VK_NULL_HANDLE) - { - maxGpuBytesToMove = 0; - maxGpuAllocationsToMove = 0; - } - - VkResult res = VK_SUCCESS; - - // Process default pools. - for(uint32_t memTypeIndex = 0; - memTypeIndex < m_hAllocator->GetMemoryTypeCount() && res >= VK_SUCCESS; - ++memTypeIndex) - { - VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex]; - if(pBlockVectorCtx) - { - VMA_ASSERT(pBlockVectorCtx->GetBlockVector()); - pBlockVectorCtx->GetBlockVector()->Defragment( - pBlockVectorCtx, - pStats, flags, - maxCpuBytesToMove, maxCpuAllocationsToMove, - maxGpuBytesToMove, maxGpuAllocationsToMove, - commandBuffer); - if(pBlockVectorCtx->res != VK_SUCCESS) - { - res = pBlockVectorCtx->res; - } - } - } - - // Process custom pools. 
- for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size(); - customCtxIndex < customCtxCount && res >= VK_SUCCESS; - ++customCtxIndex) - { - VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex]; - VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector()); - pBlockVectorCtx->GetBlockVector()->Defragment( - pBlockVectorCtx, - pStats, flags, - maxCpuBytesToMove, maxCpuAllocationsToMove, - maxGpuBytesToMove, maxGpuAllocationsToMove, - commandBuffer); - if(pBlockVectorCtx->res != VK_SUCCESS) - { - res = pBlockVectorCtx->res; - } - } - - return res; -} - -VkResult VmaDefragmentationContext_T::DefragmentPassBegin(VmaDefragmentationPassInfo* pInfo) -{ - VmaDefragmentationPassMoveInfo* pCurrentMove = pInfo->pMoves; - uint32_t movesLeft = pInfo->moveCount; - - // Process default pools. - for(uint32_t memTypeIndex = 0; - memTypeIndex < m_hAllocator->GetMemoryTypeCount(); - ++memTypeIndex) - { - VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex]; - if(pBlockVectorCtx) - { - VMA_ASSERT(pBlockVectorCtx->GetBlockVector()); - - if(!pBlockVectorCtx->hasDefragmentationPlan) - { - pBlockVectorCtx->GetBlockVector()->Defragment( - pBlockVectorCtx, - m_pStats, m_Flags, - m_MaxCpuBytesToMove, m_MaxCpuAllocationsToMove, - m_MaxGpuBytesToMove, m_MaxGpuAllocationsToMove, - VK_NULL_HANDLE); - - if(pBlockVectorCtx->res < VK_SUCCESS) - continue; - - pBlockVectorCtx->hasDefragmentationPlan = true; - } - - const uint32_t processed = pBlockVectorCtx->GetBlockVector()->ProcessDefragmentations( - pBlockVectorCtx, - pCurrentMove, movesLeft); - - movesLeft -= processed; - pCurrentMove += processed; - } - } - - // Process custom pools. 
- for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size(); - customCtxIndex < customCtxCount; - ++customCtxIndex) - { - VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex]; - VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector()); - - if(!pBlockVectorCtx->hasDefragmentationPlan) - { - pBlockVectorCtx->GetBlockVector()->Defragment( - pBlockVectorCtx, - m_pStats, m_Flags, - m_MaxCpuBytesToMove, m_MaxCpuAllocationsToMove, - m_MaxGpuBytesToMove, m_MaxGpuAllocationsToMove, - VK_NULL_HANDLE); - - if(pBlockVectorCtx->res < VK_SUCCESS) - continue; - - pBlockVectorCtx->hasDefragmentationPlan = true; - } - - const uint32_t processed = pBlockVectorCtx->GetBlockVector()->ProcessDefragmentations( - pBlockVectorCtx, - pCurrentMove, movesLeft); - - movesLeft -= processed; - pCurrentMove += processed; - } - - pInfo->moveCount = pInfo->moveCount - movesLeft; - - return VK_SUCCESS; -} -VkResult VmaDefragmentationContext_T::DefragmentPassEnd() -{ - VkResult res = VK_SUCCESS; - - // Process default pools. - for(uint32_t memTypeIndex = 0; - memTypeIndex < m_hAllocator->GetMemoryTypeCount(); - ++memTypeIndex) - { - VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex]; - if(pBlockVectorCtx) - { - VMA_ASSERT(pBlockVectorCtx->GetBlockVector()); - - if(!pBlockVectorCtx->hasDefragmentationPlan) - { - res = VK_NOT_READY; - continue; - } - - pBlockVectorCtx->GetBlockVector()->CommitDefragmentations( - pBlockVectorCtx, m_pStats); - - if(pBlockVectorCtx->defragmentationMoves.size() != pBlockVectorCtx->defragmentationMovesCommitted) - res = VK_NOT_READY; - } - } - - // Process custom pools. 
- for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size(); - customCtxIndex < customCtxCount; - ++customCtxIndex) - { - VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex]; - VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector()); - - if(!pBlockVectorCtx->hasDefragmentationPlan) - { - res = VK_NOT_READY; - continue; - } - - pBlockVectorCtx->GetBlockVector()->CommitDefragmentations( - pBlockVectorCtx, m_pStats); - - if(pBlockVectorCtx->defragmentationMoves.size() != pBlockVectorCtx->defragmentationMovesCommitted) - res = VK_NOT_READY; - } - - return res; -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaRecorder - -#if VMA_RECORDING_ENABLED - -VmaRecorder::VmaRecorder() : - m_UseMutex(true), - m_Flags(0), - m_File(VMA_NULL), - m_RecordingStartTime(std::chrono::high_resolution_clock::now()) -{ -} - -VkResult VmaRecorder::Init(const VmaRecordSettings& settings, bool useMutex) -{ - m_UseMutex = useMutex; - m_Flags = settings.flags; - -#if defined(_WIN32) - // Open file for writing. - errno_t err = fopen_s(&m_File, settings.pFilePath, "wb"); - - if(err != 0) - { - return VK_ERROR_INITIALIZATION_FAILED; - } -#else - // Open file for writing. - m_File = fopen(settings.pFilePath, "wb"); - - if(m_File == 0) - { - return VK_ERROR_INITIALIZATION_FAILED; - } -#endif - - // Write header. 
- fprintf(m_File, "%s\n", "Vulkan Memory Allocator,Calls recording"); - fprintf(m_File, "%s\n", "1,8"); - - return VK_SUCCESS; -} - -VmaRecorder::~VmaRecorder() -{ - if(m_File != VMA_NULL) - { - fclose(m_File); - } -} - -void VmaRecorder::RecordCreateAllocator(uint32_t frameIndex) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaCreateAllocator\n", callParams.threadId, callParams.time, frameIndex); - Flush(); -} - -void VmaRecorder::RecordDestroyAllocator(uint32_t frameIndex) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaDestroyAllocator\n", callParams.threadId, callParams.time, frameIndex); - Flush(); -} - -void VmaRecorder::RecordCreatePool(uint32_t frameIndex, const VmaPoolCreateInfo& createInfo, VmaPool pool) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaCreatePool,%u,%u,%llu,%llu,%llu,%u,%p\n", callParams.threadId, callParams.time, frameIndex, - createInfo.memoryTypeIndex, - createInfo.flags, - createInfo.blockSize, - (uint64_t)createInfo.minBlockCount, - (uint64_t)createInfo.maxBlockCount, - createInfo.frameInUseCount, - pool); - Flush(); -} - -void VmaRecorder::RecordDestroyPool(uint32_t frameIndex, VmaPool pool) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaDestroyPool,%p\n", callParams.threadId, callParams.time, frameIndex, - pool); - Flush(); -} - -void VmaRecorder::RecordAllocateMemory(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - const VmaAllocationCreateInfo& createInfo, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr(createInfo.flags, createInfo.pUserData); - 
fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemory,%llu,%llu,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - vkMemReq.size, - vkMemReq.alignment, - vkMemReq.memoryTypeBits, - createInfo.flags, - createInfo.usage, - createInfo.requiredFlags, - createInfo.preferredFlags, - createInfo.memoryTypeBits, - createInfo.pool, - allocation, - userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordAllocateMemoryPages(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - const VmaAllocationCreateInfo& createInfo, - uint64_t allocationCount, - const VmaAllocation* pAllocations) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr(createInfo.flags, createInfo.pUserData); - fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemoryPages,%llu,%llu,%u,%u,%u,%u,%u,%u,%p,", callParams.threadId, callParams.time, frameIndex, - vkMemReq.size, - vkMemReq.alignment, - vkMemReq.memoryTypeBits, - createInfo.flags, - createInfo.usage, - createInfo.requiredFlags, - createInfo.preferredFlags, - createInfo.memoryTypeBits, - createInfo.pool); - PrintPointerList(allocationCount, pAllocations); - fprintf(m_File, ",%s\n", userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordAllocateMemoryForBuffer(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - bool requiresDedicatedAllocation, - bool prefersDedicatedAllocation, - const VmaAllocationCreateInfo& createInfo, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr(createInfo.flags, createInfo.pUserData); - fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemoryForBuffer,%llu,%llu,%u,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - vkMemReq.size, - vkMemReq.alignment, - vkMemReq.memoryTypeBits, - requiresDedicatedAllocation ? 1 : 0, - prefersDedicatedAllocation ? 
1 : 0, - createInfo.flags, - createInfo.usage, - createInfo.requiredFlags, - createInfo.preferredFlags, - createInfo.memoryTypeBits, - createInfo.pool, - allocation, - userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordAllocateMemoryForImage(uint32_t frameIndex, - const VkMemoryRequirements& vkMemReq, - bool requiresDedicatedAllocation, - bool prefersDedicatedAllocation, - const VmaAllocationCreateInfo& createInfo, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr(createInfo.flags, createInfo.pUserData); - fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemoryForImage,%llu,%llu,%u,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - vkMemReq.size, - vkMemReq.alignment, - vkMemReq.memoryTypeBits, - requiresDedicatedAllocation ? 1 : 0, - prefersDedicatedAllocation ? 1 : 0, - createInfo.flags, - createInfo.usage, - createInfo.requiredFlags, - createInfo.preferredFlags, - createInfo.memoryTypeBits, - createInfo.pool, - allocation, - userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordFreeMemory(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaFreeMemory,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordFreeMemoryPages(uint32_t frameIndex, - uint64_t allocationCount, - const VmaAllocation* pAllocations) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaFreeMemoryPages,", callParams.threadId, callParams.time, frameIndex); - PrintPointerList(allocationCount, pAllocations); - fprintf(m_File, "\n"); - Flush(); -} - -void VmaRecorder::RecordSetAllocationUserData(uint32_t frameIndex, - VmaAllocation allocation, - const void* 
pUserData) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr( - allocation->IsUserDataString() ? VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT : 0, - pUserData); - fprintf(m_File, "%u,%.3f,%u,vmaSetAllocationUserData,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - allocation, - userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordCreateLostAllocation(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaCreateLostAllocation,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordMapMemory(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaMapMemory,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordUnmapMemory(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaUnmapMemory,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordFlushAllocation(uint32_t frameIndex, - VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaFlushAllocation,%p,%llu,%llu\n", callParams.threadId, callParams.time, frameIndex, - allocation, - offset, - size); - Flush(); -} - -void VmaRecorder::RecordInvalidateAllocation(uint32_t frameIndex, - VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) -{ - CallParams callParams; - 
GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaInvalidateAllocation,%p,%llu,%llu\n", callParams.threadId, callParams.time, frameIndex, - allocation, - offset, - size); - Flush(); -} - -void VmaRecorder::RecordCreateBuffer(uint32_t frameIndex, - const VkBufferCreateInfo& bufCreateInfo, - const VmaAllocationCreateInfo& allocCreateInfo, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr(allocCreateInfo.flags, allocCreateInfo.pUserData); - fprintf(m_File, "%u,%.3f,%u,vmaCreateBuffer,%u,%llu,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - bufCreateInfo.flags, - bufCreateInfo.size, - bufCreateInfo.usage, - bufCreateInfo.sharingMode, - allocCreateInfo.flags, - allocCreateInfo.usage, - allocCreateInfo.requiredFlags, - allocCreateInfo.preferredFlags, - allocCreateInfo.memoryTypeBits, - allocCreateInfo.pool, - allocation, - userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordCreateImage(uint32_t frameIndex, - const VkImageCreateInfo& imageCreateInfo, - const VmaAllocationCreateInfo& allocCreateInfo, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - UserDataString userDataStr(allocCreateInfo.flags, allocCreateInfo.pUserData); - fprintf(m_File, "%u,%.3f,%u,vmaCreateImage,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - imageCreateInfo.flags, - imageCreateInfo.imageType, - imageCreateInfo.format, - imageCreateInfo.extent.width, - imageCreateInfo.extent.height, - imageCreateInfo.extent.depth, - imageCreateInfo.mipLevels, - imageCreateInfo.arrayLayers, - imageCreateInfo.samples, - imageCreateInfo.tiling, - imageCreateInfo.usage, - imageCreateInfo.sharingMode, - imageCreateInfo.initialLayout, - 
allocCreateInfo.flags, - allocCreateInfo.usage, - allocCreateInfo.requiredFlags, - allocCreateInfo.preferredFlags, - allocCreateInfo.memoryTypeBits, - allocCreateInfo.pool, - allocation, - userDataStr.GetString()); - Flush(); -} - -void VmaRecorder::RecordDestroyBuffer(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaDestroyBuffer,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordDestroyImage(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaDestroyImage,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordTouchAllocation(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaTouchAllocation,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordGetAllocationInfo(uint32_t frameIndex, - VmaAllocation allocation) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaGetAllocationInfo,%p\n", callParams.threadId, callParams.time, frameIndex, - allocation); - Flush(); -} - -void VmaRecorder::RecordMakePoolAllocationsLost(uint32_t frameIndex, - VmaPool pool) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaMakePoolAllocationsLost,%p\n", callParams.threadId, callParams.time, frameIndex, - pool); - Flush(); -} - -void VmaRecorder::RecordDefragmentationBegin(uint32_t frameIndex, - const VmaDefragmentationInfo2& info, - 
VmaDefragmentationContext ctx) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaDefragmentationBegin,%u,", callParams.threadId, callParams.time, frameIndex, - info.flags); - PrintPointerList(info.allocationCount, info.pAllocations); - fprintf(m_File, ","); - PrintPointerList(info.poolCount, info.pPools); - fprintf(m_File, ",%llu,%u,%llu,%u,%p,%p\n", - info.maxCpuBytesToMove, - info.maxCpuAllocationsToMove, - info.maxGpuBytesToMove, - info.maxGpuAllocationsToMove, - info.commandBuffer, - ctx); - Flush(); -} - -void VmaRecorder::RecordDefragmentationEnd(uint32_t frameIndex, - VmaDefragmentationContext ctx) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaDefragmentationEnd,%p\n", callParams.threadId, callParams.time, frameIndex, - ctx); - Flush(); -} - -void VmaRecorder::RecordSetPoolName(uint32_t frameIndex, - VmaPool pool, - const char* name) -{ - CallParams callParams; - GetBasicParams(callParams); - - VmaMutexLock lock(m_FileMutex, m_UseMutex); - fprintf(m_File, "%u,%.3f,%u,vmaSetPoolName,%p,%s\n", callParams.threadId, callParams.time, frameIndex, - pool, name != VMA_NULL ? name : ""); - Flush(); -} - -VmaRecorder::UserDataString::UserDataString(VmaAllocationCreateFlags allocFlags, const void* pUserData) -{ - if(pUserData != VMA_NULL) - { - if((allocFlags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0) - { - m_Str = (const char*)pUserData; - } - else - { - // If VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT is not specified, convert the string's memory address to a string and store it. 
- snprintf(m_PtrStr, 17, "%p", pUserData); - m_Str = m_PtrStr; - } - } - else - { - m_Str = ""; - } -} - -void VmaRecorder::WriteConfiguration( - const VkPhysicalDeviceProperties& devProps, - const VkPhysicalDeviceMemoryProperties& memProps, - uint32_t vulkanApiVersion, - bool dedicatedAllocationExtensionEnabled, - bool bindMemory2ExtensionEnabled, - bool memoryBudgetExtensionEnabled, - bool deviceCoherentMemoryExtensionEnabled) -{ - fprintf(m_File, "Config,Begin\n"); - - fprintf(m_File, "VulkanApiVersion,%u,%u\n", VK_VERSION_MAJOR(vulkanApiVersion), VK_VERSION_MINOR(vulkanApiVersion)); - - fprintf(m_File, "PhysicalDevice,apiVersion,%u\n", devProps.apiVersion); - fprintf(m_File, "PhysicalDevice,driverVersion,%u\n", devProps.driverVersion); - fprintf(m_File, "PhysicalDevice,vendorID,%u\n", devProps.vendorID); - fprintf(m_File, "PhysicalDevice,deviceID,%u\n", devProps.deviceID); - fprintf(m_File, "PhysicalDevice,deviceType,%u\n", devProps.deviceType); - fprintf(m_File, "PhysicalDevice,deviceName,%s\n", devProps.deviceName); - - fprintf(m_File, "PhysicalDeviceLimits,maxMemoryAllocationCount,%u\n", devProps.limits.maxMemoryAllocationCount); - fprintf(m_File, "PhysicalDeviceLimits,bufferImageGranularity,%llu\n", devProps.limits.bufferImageGranularity); - fprintf(m_File, "PhysicalDeviceLimits,nonCoherentAtomSize,%llu\n", devProps.limits.nonCoherentAtomSize); - - fprintf(m_File, "PhysicalDeviceMemory,HeapCount,%u\n", memProps.memoryHeapCount); - for(uint32_t i = 0; i < memProps.memoryHeapCount; ++i) - { - fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,size,%llu\n", i, memProps.memoryHeaps[i].size); - fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,flags,%u\n", i, memProps.memoryHeaps[i].flags); - } - fprintf(m_File, "PhysicalDeviceMemory,TypeCount,%u\n", memProps.memoryTypeCount); - for(uint32_t i = 0; i < memProps.memoryTypeCount; ++i) - { - fprintf(m_File, "PhysicalDeviceMemory,Type,%u,heapIndex,%u\n", i, memProps.memoryTypes[i].heapIndex); - fprintf(m_File, 
"PhysicalDeviceMemory,Type,%u,propertyFlags,%u\n", i, memProps.memoryTypes[i].propertyFlags); - } - - fprintf(m_File, "Extension,VK_KHR_dedicated_allocation,%u\n", dedicatedAllocationExtensionEnabled ? 1 : 0); - fprintf(m_File, "Extension,VK_KHR_bind_memory2,%u\n", bindMemory2ExtensionEnabled ? 1 : 0); - fprintf(m_File, "Extension,VK_EXT_memory_budget,%u\n", memoryBudgetExtensionEnabled ? 1 : 0); - fprintf(m_File, "Extension,VK_AMD_device_coherent_memory,%u\n", deviceCoherentMemoryExtensionEnabled ? 1 : 0); - - fprintf(m_File, "Macro,VMA_DEBUG_ALWAYS_DEDICATED_MEMORY,%u\n", VMA_DEBUG_ALWAYS_DEDICATED_MEMORY ? 1 : 0); - fprintf(m_File, "Macro,VMA_DEBUG_ALIGNMENT,%llu\n", (VkDeviceSize)VMA_DEBUG_ALIGNMENT); - fprintf(m_File, "Macro,VMA_DEBUG_MARGIN,%llu\n", (VkDeviceSize)VMA_DEBUG_MARGIN); - fprintf(m_File, "Macro,VMA_DEBUG_INITIALIZE_ALLOCATIONS,%u\n", VMA_DEBUG_INITIALIZE_ALLOCATIONS ? 1 : 0); - fprintf(m_File, "Macro,VMA_DEBUG_DETECT_CORRUPTION,%u\n", VMA_DEBUG_DETECT_CORRUPTION ? 1 : 0); - fprintf(m_File, "Macro,VMA_DEBUG_GLOBAL_MUTEX,%u\n", VMA_DEBUG_GLOBAL_MUTEX ? 1 : 0); - fprintf(m_File, "Macro,VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY,%llu\n", (VkDeviceSize)VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY); - fprintf(m_File, "Macro,VMA_SMALL_HEAP_MAX_SIZE,%llu\n", (VkDeviceSize)VMA_SMALL_HEAP_MAX_SIZE); - fprintf(m_File, "Macro,VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE,%llu\n", (VkDeviceSize)VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE); - - fprintf(m_File, "Config,End\n"); -} - -void VmaRecorder::GetBasicParams(CallParams& outParams) -{ - #if defined(_WIN32) - outParams.threadId = GetCurrentThreadId(); - #else - // Use C++11 features to get thread id and convert it to uint32_t. - // There is room for optimization since sstream is quite slow. - // Is there a better way to convert std::this_thread::get_id() to uint32_t? 
- std::thread::id thread_id = std::this_thread::get_id(); - std::stringstream thread_id_to_string_converter; - thread_id_to_string_converter << thread_id; - std::string thread_id_as_string = thread_id_to_string_converter.str(); - outParams.threadId = static_cast(std::stoi(thread_id_as_string.c_str())); - #endif - - auto current_time = std::chrono::high_resolution_clock::now(); - - outParams.time = std::chrono::duration(current_time - m_RecordingStartTime).count(); -} - -void VmaRecorder::PrintPointerList(uint64_t count, const VmaAllocation* pItems) -{ - if(count) - { - fprintf(m_File, "%p", pItems[0]); - for(uint64_t i = 1; i < count; ++i) - { - fprintf(m_File, " %p", pItems[i]); - } - } -} - -void VmaRecorder::Flush() -{ - if((m_Flags & VMA_RECORD_FLUSH_AFTER_CALL_BIT) != 0) - { - fflush(m_File); - } -} - -#endif // #if VMA_RECORDING_ENABLED - -//////////////////////////////////////////////////////////////////////////////// -// VmaAllocationObjectAllocator - -VmaAllocationObjectAllocator::VmaAllocationObjectAllocator(const VkAllocationCallbacks* pAllocationCallbacks) : - m_Allocator(pAllocationCallbacks, 1024) -{ -} - -template VmaAllocation VmaAllocationObjectAllocator::Allocate(Types... args) -{ - VmaMutexLock mutexLock(m_Mutex); - return m_Allocator.Alloc(std::forward(args)...); -} - -void VmaAllocationObjectAllocator::Free(VmaAllocation hAlloc) -{ - VmaMutexLock mutexLock(m_Mutex); - m_Allocator.Free(hAlloc); -} - -//////////////////////////////////////////////////////////////////////////////// -// VmaAllocator_T - -VmaAllocator_T::VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo) : - m_UseMutex((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT) == 0), - m_VulkanApiVersion(pCreateInfo->vulkanApiVersion != 0 ? 
pCreateInfo->vulkanApiVersion : VK_API_VERSION_1_0), - m_UseKhrDedicatedAllocation((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT) != 0), - m_UseKhrBindMemory2((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT) != 0), - m_UseExtMemoryBudget((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT) != 0), - m_UseAmdDeviceCoherentMemory((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT) != 0), - m_UseKhrBufferDeviceAddress((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT) != 0), - m_UseExtMemoryPriority((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT) != 0), - m_hDevice(pCreateInfo->device), - m_hInstance(pCreateInfo->instance), - m_AllocationCallbacksSpecified(pCreateInfo->pAllocationCallbacks != VMA_NULL), - m_AllocationCallbacks(pCreateInfo->pAllocationCallbacks ? - *pCreateInfo->pAllocationCallbacks : VmaEmptyAllocationCallbacks), - m_AllocationObjectAllocator(&m_AllocationCallbacks), - m_HeapSizeLimitMask(0), - m_DeviceMemoryCount(0), - m_PreferredLargeHeapBlockSize(0), - m_PhysicalDevice(pCreateInfo->physicalDevice), - m_CurrentFrameIndex(0), - m_GpuDefragmentationMemoryTypeBits(UINT32_MAX), - m_Pools(VmaStlAllocator(GetAllocationCallbacks())), - m_NextPoolId(0), - m_GlobalMemoryTypeBits(UINT32_MAX) -#if VMA_RECORDING_ENABLED - ,m_pRecorder(VMA_NULL) -#endif -{ - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - m_UseKhrDedicatedAllocation = false; - m_UseKhrBindMemory2 = false; - } - - if(VMA_DEBUG_DETECT_CORRUPTION) - { - // Needs to be multiply of uint32_t size because we are going to write VMA_CORRUPTION_DETECTION_MAGIC_VALUE to it. 
- VMA_ASSERT(VMA_DEBUG_MARGIN % sizeof(uint32_t) == 0); - } - - VMA_ASSERT(pCreateInfo->physicalDevice && pCreateInfo->device && pCreateInfo->instance); - - if(m_VulkanApiVersion < VK_MAKE_VERSION(1, 1, 0)) - { -#if !(VMA_DEDICATED_ALLOCATION) - if((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT) != 0) - { - VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT set but required extensions are disabled by preprocessor macros."); - } -#endif -#if !(VMA_BIND_MEMORY2) - if((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT) != 0) - { - VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT set but required extension is disabled by preprocessor macros."); - } -#endif - } -#if !(VMA_MEMORY_BUDGET) - if((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT) != 0) - { - VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT set but required extension is disabled by preprocessor macros."); - } -#endif -#if !(VMA_BUFFER_DEVICE_ADDRESS) - if(m_UseKhrBufferDeviceAddress) - { - VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT is set but required extension or Vulkan 1.2 is not available in your Vulkan header or its support in VMA has been disabled by a preprocessor macro."); - } -#endif -#if VMA_VULKAN_VERSION < 1002000 - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 2, 0)) - { - VMA_ASSERT(0 && "vulkanApiVersion >= VK_API_VERSION_1_2 but required Vulkan version is disabled by preprocessor macros."); - } -#endif -#if VMA_VULKAN_VERSION < 1001000 - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - VMA_ASSERT(0 && "vulkanApiVersion >= VK_API_VERSION_1_1 but required Vulkan version is disabled by preprocessor macros."); - } -#endif -#if !(VMA_MEMORY_PRIORITY) - if(m_UseExtMemoryPriority) - { - VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT is set but required extension is not available in your Vulkan header or its support in VMA has been disabled by a preprocessor macro."); - } 
-#endif - - memset(&m_DeviceMemoryCallbacks, 0 ,sizeof(m_DeviceMemoryCallbacks)); - memset(&m_PhysicalDeviceProperties, 0, sizeof(m_PhysicalDeviceProperties)); - memset(&m_MemProps, 0, sizeof(m_MemProps)); - - memset(&m_pBlockVectors, 0, sizeof(m_pBlockVectors)); - memset(&m_pDedicatedAllocations, 0, sizeof(m_pDedicatedAllocations)); - memset(&m_VulkanFunctions, 0, sizeof(m_VulkanFunctions)); - - if(pCreateInfo->pDeviceMemoryCallbacks != VMA_NULL) - { - m_DeviceMemoryCallbacks.pUserData = pCreateInfo->pDeviceMemoryCallbacks->pUserData; - m_DeviceMemoryCallbacks.pfnAllocate = pCreateInfo->pDeviceMemoryCallbacks->pfnAllocate; - m_DeviceMemoryCallbacks.pfnFree = pCreateInfo->pDeviceMemoryCallbacks->pfnFree; - } - - ImportVulkanFunctions(pCreateInfo->pVulkanFunctions); - - (*m_VulkanFunctions.vkGetPhysicalDeviceProperties)(m_PhysicalDevice, &m_PhysicalDeviceProperties); - (*m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties)(m_PhysicalDevice, &m_MemProps); - - VMA_ASSERT(VmaIsPow2(VMA_DEBUG_ALIGNMENT)); - VMA_ASSERT(VmaIsPow2(VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY)); - VMA_ASSERT(VmaIsPow2(m_PhysicalDeviceProperties.limits.bufferImageGranularity)); - VMA_ASSERT(VmaIsPow2(m_PhysicalDeviceProperties.limits.nonCoherentAtomSize)); - - m_PreferredLargeHeapBlockSize = (pCreateInfo->preferredLargeHeapBlockSize != 0) ? 
- pCreateInfo->preferredLargeHeapBlockSize : static_cast(VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE); - - m_GlobalMemoryTypeBits = CalculateGlobalMemoryTypeBits(); - - if(pCreateInfo->pHeapSizeLimit != VMA_NULL) - { - for(uint32_t heapIndex = 0; heapIndex < GetMemoryHeapCount(); ++heapIndex) - { - const VkDeviceSize limit = pCreateInfo->pHeapSizeLimit[heapIndex]; - if(limit != VK_WHOLE_SIZE) - { - m_HeapSizeLimitMask |= 1u << heapIndex; - if(limit < m_MemProps.memoryHeaps[heapIndex].size) - { - m_MemProps.memoryHeaps[heapIndex].size = limit; - } - } - } - } - - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - const VkDeviceSize preferredBlockSize = CalcPreferredBlockSize(memTypeIndex); - - m_pBlockVectors[memTypeIndex] = vma_new(this, VmaBlockVector)( - this, - VK_NULL_HANDLE, // hParentPool - memTypeIndex, - preferredBlockSize, - 0, - SIZE_MAX, - GetBufferImageGranularity(), - pCreateInfo->frameInUseCount, - false, // explicitBlockSize - false, // linearAlgorithm - 0.5f); // priority (0.5 is the default per Vulkan spec) - // No need to call m_pBlockVectors[memTypeIndex][blockVectorTypeIndex]->CreateMinBlocks here, - // becase minBlockCount is 0. 
- m_pDedicatedAllocations[memTypeIndex] = vma_new(this, AllocationVectorType)(VmaStlAllocator(GetAllocationCallbacks())); - - } -} - -VkResult VmaAllocator_T::Init(const VmaAllocatorCreateInfo* pCreateInfo) -{ - VkResult res = VK_SUCCESS; - - if(pCreateInfo->pRecordSettings != VMA_NULL && - !VmaStrIsEmpty(pCreateInfo->pRecordSettings->pFilePath)) - { -#if VMA_RECORDING_ENABLED - m_pRecorder = vma_new(this, VmaRecorder)(); - res = m_pRecorder->Init(*pCreateInfo->pRecordSettings, m_UseMutex); - if(res != VK_SUCCESS) - { - return res; - } - m_pRecorder->WriteConfiguration( - m_PhysicalDeviceProperties, - m_MemProps, - m_VulkanApiVersion, - m_UseKhrDedicatedAllocation, - m_UseKhrBindMemory2, - m_UseExtMemoryBudget, - m_UseAmdDeviceCoherentMemory); - m_pRecorder->RecordCreateAllocator(GetCurrentFrameIndex()); -#else - VMA_ASSERT(0 && "VmaAllocatorCreateInfo::pRecordSettings used, but not supported due to VMA_RECORDING_ENABLED not defined to 1."); - return VK_ERROR_FEATURE_NOT_PRESENT; -#endif - } - -#if VMA_MEMORY_BUDGET - if(m_UseExtMemoryBudget) - { - UpdateVulkanBudget(); - } -#endif // #if VMA_MEMORY_BUDGET - - return res; -} - -VmaAllocator_T::~VmaAllocator_T() -{ -#if VMA_RECORDING_ENABLED - if(m_pRecorder != VMA_NULL) - { - m_pRecorder->RecordDestroyAllocator(GetCurrentFrameIndex()); - vma_delete(this, m_pRecorder); - } -#endif - - VMA_ASSERT(m_Pools.empty()); - - for(size_t i = GetMemoryTypeCount(); i--; ) - { - if(m_pDedicatedAllocations[i] != VMA_NULL && !m_pDedicatedAllocations[i]->empty()) - { - VMA_ASSERT(0 && "Unfreed dedicated allocations found."); - } - - vma_delete(this, m_pDedicatedAllocations[i]); - vma_delete(this, m_pBlockVectors[i]); - } -} - -void VmaAllocator_T::ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunctions) -{ -#if VMA_STATIC_VULKAN_FUNCTIONS == 1 - ImportVulkanFunctions_Static(); -#endif - - if(pVulkanFunctions != VMA_NULL) - { - ImportVulkanFunctions_Custom(pVulkanFunctions); - } - -#if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 - 
ImportVulkanFunctions_Dynamic(); -#endif - - ValidateVulkanFunctions(); -} - -#if VMA_STATIC_VULKAN_FUNCTIONS == 1 - -void VmaAllocator_T::ImportVulkanFunctions_Static() -{ - // Vulkan 1.0 - m_VulkanFunctions.vkGetPhysicalDeviceProperties = (PFN_vkGetPhysicalDeviceProperties)vkGetPhysicalDeviceProperties; - m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties = (PFN_vkGetPhysicalDeviceMemoryProperties)vkGetPhysicalDeviceMemoryProperties; - m_VulkanFunctions.vkAllocateMemory = (PFN_vkAllocateMemory)vkAllocateMemory; - m_VulkanFunctions.vkFreeMemory = (PFN_vkFreeMemory)vkFreeMemory; - m_VulkanFunctions.vkMapMemory = (PFN_vkMapMemory)vkMapMemory; - m_VulkanFunctions.vkUnmapMemory = (PFN_vkUnmapMemory)vkUnmapMemory; - m_VulkanFunctions.vkFlushMappedMemoryRanges = (PFN_vkFlushMappedMemoryRanges)vkFlushMappedMemoryRanges; - m_VulkanFunctions.vkInvalidateMappedMemoryRanges = (PFN_vkInvalidateMappedMemoryRanges)vkInvalidateMappedMemoryRanges; - m_VulkanFunctions.vkBindBufferMemory = (PFN_vkBindBufferMemory)vkBindBufferMemory; - m_VulkanFunctions.vkBindImageMemory = (PFN_vkBindImageMemory)vkBindImageMemory; - m_VulkanFunctions.vkGetBufferMemoryRequirements = (PFN_vkGetBufferMemoryRequirements)vkGetBufferMemoryRequirements; - m_VulkanFunctions.vkGetImageMemoryRequirements = (PFN_vkGetImageMemoryRequirements)vkGetImageMemoryRequirements; - m_VulkanFunctions.vkCreateBuffer = (PFN_vkCreateBuffer)vkCreateBuffer; - m_VulkanFunctions.vkDestroyBuffer = (PFN_vkDestroyBuffer)vkDestroyBuffer; - m_VulkanFunctions.vkCreateImage = (PFN_vkCreateImage)vkCreateImage; - m_VulkanFunctions.vkDestroyImage = (PFN_vkDestroyImage)vkDestroyImage; - m_VulkanFunctions.vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)vkCmdCopyBuffer; - - // Vulkan 1.1 -#if VMA_VULKAN_VERSION >= 1001000 - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - m_VulkanFunctions.vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2)vkGetBufferMemoryRequirements2; - 
m_VulkanFunctions.vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2)vkGetImageMemoryRequirements2; - m_VulkanFunctions.vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2)vkBindBufferMemory2; - m_VulkanFunctions.vkBindImageMemory2KHR = (PFN_vkBindImageMemory2)vkBindImageMemory2; - m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2)vkGetPhysicalDeviceMemoryProperties2; - } -#endif -} - -#endif // #if VMA_STATIC_VULKAN_FUNCTIONS == 1 - -void VmaAllocator_T::ImportVulkanFunctions_Custom(const VmaVulkanFunctions* pVulkanFunctions) -{ - VMA_ASSERT(pVulkanFunctions != VMA_NULL); - -#define VMA_COPY_IF_NOT_NULL(funcName) \ - if(pVulkanFunctions->funcName != VMA_NULL) m_VulkanFunctions.funcName = pVulkanFunctions->funcName; - - VMA_COPY_IF_NOT_NULL(vkGetPhysicalDeviceProperties); - VMA_COPY_IF_NOT_NULL(vkGetPhysicalDeviceMemoryProperties); - VMA_COPY_IF_NOT_NULL(vkAllocateMemory); - VMA_COPY_IF_NOT_NULL(vkFreeMemory); - VMA_COPY_IF_NOT_NULL(vkMapMemory); - VMA_COPY_IF_NOT_NULL(vkUnmapMemory); - VMA_COPY_IF_NOT_NULL(vkFlushMappedMemoryRanges); - VMA_COPY_IF_NOT_NULL(vkInvalidateMappedMemoryRanges); - VMA_COPY_IF_NOT_NULL(vkBindBufferMemory); - VMA_COPY_IF_NOT_NULL(vkBindImageMemory); - VMA_COPY_IF_NOT_NULL(vkGetBufferMemoryRequirements); - VMA_COPY_IF_NOT_NULL(vkGetImageMemoryRequirements); - VMA_COPY_IF_NOT_NULL(vkCreateBuffer); - VMA_COPY_IF_NOT_NULL(vkDestroyBuffer); - VMA_COPY_IF_NOT_NULL(vkCreateImage); - VMA_COPY_IF_NOT_NULL(vkDestroyImage); - VMA_COPY_IF_NOT_NULL(vkCmdCopyBuffer); - -#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - VMA_COPY_IF_NOT_NULL(vkGetBufferMemoryRequirements2KHR); - VMA_COPY_IF_NOT_NULL(vkGetImageMemoryRequirements2KHR); -#endif - -#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000 - VMA_COPY_IF_NOT_NULL(vkBindBufferMemory2KHR); - VMA_COPY_IF_NOT_NULL(vkBindImageMemory2KHR); -#endif - -#if VMA_MEMORY_BUDGET - 
VMA_COPY_IF_NOT_NULL(vkGetPhysicalDeviceMemoryProperties2KHR); -#endif - -#undef VMA_COPY_IF_NOT_NULL -} - -#if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 - -void VmaAllocator_T::ImportVulkanFunctions_Dynamic() -{ -#define VMA_FETCH_INSTANCE_FUNC(memberName, functionPointerType, functionNameString) \ - if(m_VulkanFunctions.memberName == VMA_NULL) \ - m_VulkanFunctions.memberName = \ - (functionPointerType)vkGetInstanceProcAddr(m_hInstance, functionNameString); -#define VMA_FETCH_DEVICE_FUNC(memberName, functionPointerType, functionNameString) \ - if(m_VulkanFunctions.memberName == VMA_NULL) \ - m_VulkanFunctions.memberName = \ - (functionPointerType)vkGetDeviceProcAddr(m_hDevice, functionNameString); - - VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceProperties, PFN_vkGetPhysicalDeviceProperties, "vkGetPhysicalDeviceProperties"); - VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceMemoryProperties, PFN_vkGetPhysicalDeviceMemoryProperties, "vkGetPhysicalDeviceMemoryProperties"); - VMA_FETCH_DEVICE_FUNC(vkAllocateMemory, PFN_vkAllocateMemory, "vkAllocateMemory"); - VMA_FETCH_DEVICE_FUNC(vkFreeMemory, PFN_vkFreeMemory, "vkFreeMemory"); - VMA_FETCH_DEVICE_FUNC(vkMapMemory, PFN_vkMapMemory, "vkMapMemory"); - VMA_FETCH_DEVICE_FUNC(vkUnmapMemory, PFN_vkUnmapMemory, "vkUnmapMemory"); - VMA_FETCH_DEVICE_FUNC(vkFlushMappedMemoryRanges, PFN_vkFlushMappedMemoryRanges, "vkFlushMappedMemoryRanges"); - VMA_FETCH_DEVICE_FUNC(vkInvalidateMappedMemoryRanges, PFN_vkInvalidateMappedMemoryRanges, "vkInvalidateMappedMemoryRanges"); - VMA_FETCH_DEVICE_FUNC(vkBindBufferMemory, PFN_vkBindBufferMemory, "vkBindBufferMemory"); - VMA_FETCH_DEVICE_FUNC(vkBindImageMemory, PFN_vkBindImageMemory, "vkBindImageMemory"); - VMA_FETCH_DEVICE_FUNC(vkGetBufferMemoryRequirements, PFN_vkGetBufferMemoryRequirements, "vkGetBufferMemoryRequirements"); - VMA_FETCH_DEVICE_FUNC(vkGetImageMemoryRequirements, PFN_vkGetImageMemoryRequirements, "vkGetImageMemoryRequirements"); - VMA_FETCH_DEVICE_FUNC(vkCreateBuffer, PFN_vkCreateBuffer, 
"vkCreateBuffer"); - VMA_FETCH_DEVICE_FUNC(vkDestroyBuffer, PFN_vkDestroyBuffer, "vkDestroyBuffer"); - VMA_FETCH_DEVICE_FUNC(vkCreateImage, PFN_vkCreateImage, "vkCreateImage"); - VMA_FETCH_DEVICE_FUNC(vkDestroyImage, PFN_vkDestroyImage, "vkDestroyImage"); - VMA_FETCH_DEVICE_FUNC(vkCmdCopyBuffer, PFN_vkCmdCopyBuffer, "vkCmdCopyBuffer"); - -#if VMA_VULKAN_VERSION >= 1001000 - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - VMA_FETCH_DEVICE_FUNC(vkGetBufferMemoryRequirements2KHR, PFN_vkGetBufferMemoryRequirements2, "vkGetBufferMemoryRequirements2"); - VMA_FETCH_DEVICE_FUNC(vkGetImageMemoryRequirements2KHR, PFN_vkGetImageMemoryRequirements2, "vkGetImageMemoryRequirements2"); - VMA_FETCH_DEVICE_FUNC(vkBindBufferMemory2KHR, PFN_vkBindBufferMemory2, "vkBindBufferMemory2"); - VMA_FETCH_DEVICE_FUNC(vkBindImageMemory2KHR, PFN_vkBindImageMemory2, "vkBindImageMemory2"); - VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceMemoryProperties2KHR, PFN_vkGetPhysicalDeviceMemoryProperties2, "vkGetPhysicalDeviceMemoryProperties2"); - } -#endif - -#if VMA_DEDICATED_ALLOCATION - if(m_UseKhrDedicatedAllocation) - { - VMA_FETCH_DEVICE_FUNC(vkGetBufferMemoryRequirements2KHR, PFN_vkGetBufferMemoryRequirements2KHR, "vkGetBufferMemoryRequirements2KHR"); - VMA_FETCH_DEVICE_FUNC(vkGetImageMemoryRequirements2KHR, PFN_vkGetImageMemoryRequirements2KHR, "vkGetImageMemoryRequirements2KHR"); - } -#endif - -#if VMA_BIND_MEMORY2 - if(m_UseKhrBindMemory2) - { - VMA_FETCH_DEVICE_FUNC(vkBindBufferMemory2KHR, PFN_vkBindBufferMemory2KHR, "vkBindBufferMemory2KHR"); - VMA_FETCH_DEVICE_FUNC(vkBindImageMemory2KHR, PFN_vkBindImageMemory2KHR, "vkBindImageMemory2KHR"); - } -#endif // #if VMA_BIND_MEMORY2 - -#if VMA_MEMORY_BUDGET - if(m_UseExtMemoryBudget) - { - VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceMemoryProperties2KHR, PFN_vkGetPhysicalDeviceMemoryProperties2KHR, "vkGetPhysicalDeviceMemoryProperties2KHR"); - } -#endif // #if VMA_MEMORY_BUDGET - -#undef VMA_FETCH_DEVICE_FUNC -#undef VMA_FETCH_INSTANCE_FUNC 
-} - -#endif // #if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 - -void VmaAllocator_T::ValidateVulkanFunctions() -{ - VMA_ASSERT(m_VulkanFunctions.vkGetPhysicalDeviceProperties != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkAllocateMemory != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkFreeMemory != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkMapMemory != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkUnmapMemory != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkFlushMappedMemoryRanges != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkInvalidateMappedMemoryRanges != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkBindBufferMemory != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkBindImageMemory != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkGetBufferMemoryRequirements != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkGetImageMemoryRequirements != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkCreateBuffer != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkDestroyBuffer != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkCreateImage != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkDestroyImage != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkCmdCopyBuffer != VMA_NULL); - -#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0) || m_UseKhrDedicatedAllocation) - { - VMA_ASSERT(m_VulkanFunctions.vkGetBufferMemoryRequirements2KHR != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkGetImageMemoryRequirements2KHR != VMA_NULL); - } -#endif - -#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000 - if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0) || m_UseKhrBindMemory2) - { - VMA_ASSERT(m_VulkanFunctions.vkBindBufferMemory2KHR != VMA_NULL); - VMA_ASSERT(m_VulkanFunctions.vkBindImageMemory2KHR != VMA_NULL); - } -#endif - -#if VMA_MEMORY_BUDGET || VMA_VULKAN_VERSION >= 1001000 - if(m_UseExtMemoryBudget || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - 
VMA_ASSERT(m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties2KHR != VMA_NULL); - } -#endif -} - -VkDeviceSize VmaAllocator_T::CalcPreferredBlockSize(uint32_t memTypeIndex) -{ - const uint32_t heapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex); - const VkDeviceSize heapSize = m_MemProps.memoryHeaps[heapIndex].size; - const bool isSmallHeap = heapSize <= VMA_SMALL_HEAP_MAX_SIZE; - return VmaAlignUp(isSmallHeap ? (heapSize / 8) : m_PreferredLargeHeapBlockSize, (VkDeviceSize)32); -} - -VkResult VmaAllocator_T::AllocateMemoryOfType( - VkDeviceSize size, - VkDeviceSize alignment, - bool dedicatedAllocation, - VkBuffer dedicatedBuffer, - VkBufferUsageFlags dedicatedBufferUsage, - VkImage dedicatedImage, - const VmaAllocationCreateInfo& createInfo, - uint32_t memTypeIndex, - VmaSuballocationType suballocType, - size_t allocationCount, - VmaAllocation* pAllocations) -{ - VMA_ASSERT(pAllocations != VMA_NULL); - VMA_DEBUG_LOG(" AllocateMemory: MemoryTypeIndex=%u, AllocationCount=%zu, Size=%llu", memTypeIndex, allocationCount, size); - - VmaAllocationCreateInfo finalCreateInfo = createInfo; - - // If memory type is not HOST_VISIBLE, disable MAPPED. - if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0 && - (m_MemProps.memoryTypes[memTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) - { - finalCreateInfo.flags &= ~VMA_ALLOCATION_CREATE_MAPPED_BIT; - } - // If memory is lazily allocated, it should be always dedicated. - if(finalCreateInfo.usage == VMA_MEMORY_USAGE_GPU_LAZILY_ALLOCATED) - { - finalCreateInfo.flags |= VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT; - } - - VmaBlockVector* const blockVector = m_pBlockVectors[memTypeIndex]; - VMA_ASSERT(blockVector); - - const VkDeviceSize preferredBlockSize = blockVector->GetPreferredBlockSize(); - bool preferDedicatedMemory = - VMA_DEBUG_ALWAYS_DEDICATED_MEMORY || - dedicatedAllocation || - // Heuristics: Allocate dedicated memory if requested size if greater than half of preferred block size. 
- size > preferredBlockSize / 2; - - if(preferDedicatedMemory && - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) == 0 && - finalCreateInfo.pool == VK_NULL_HANDLE) - { - finalCreateInfo.flags |= VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT; - } - - if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT) != 0) - { - if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) - { - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - else - { - return AllocateDedicatedMemory( - size, - suballocType, - memTypeIndex, - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT) != 0, - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0, - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0, - finalCreateInfo.pUserData, - finalCreateInfo.priority, - dedicatedBuffer, - dedicatedBufferUsage, - dedicatedImage, - allocationCount, - pAllocations); - } - } - else - { - VkResult res = blockVector->Allocate( - m_CurrentFrameIndex.load(), - size, - alignment, - finalCreateInfo, - suballocType, - allocationCount, - pAllocations); - if(res == VK_SUCCESS) - { - return res; - } - - // 5. Try dedicated memory. - if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) - { - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - - // Protection against creating each allocation as dedicated when we reach or exceed heap size/budget, - // which can quickly deplete maxMemoryAllocationCount: Don't try dedicated allocations when above - // 3/4 of the maximum allocation count. 
- if(m_DeviceMemoryCount.load() > m_PhysicalDeviceProperties.limits.maxMemoryAllocationCount * 3 / 4) - { - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - - res = AllocateDedicatedMemory( - size, - suballocType, - memTypeIndex, - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT) != 0, - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0, - (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0, - finalCreateInfo.pUserData, - finalCreateInfo.priority, - dedicatedBuffer, - dedicatedBufferUsage, - dedicatedImage, - allocationCount, - pAllocations); - if(res == VK_SUCCESS) - { - // Succeeded: AllocateDedicatedMemory function already filld pMemory, nothing more to do here. - VMA_DEBUG_LOG(" Allocated as DedicatedMemory"); - return VK_SUCCESS; - } - else - { - // Everything failed: Return error code. - VMA_DEBUG_LOG(" vkAllocateMemory FAILED"); - return res; - } - } -} - -VkResult VmaAllocator_T::AllocateDedicatedMemory( - VkDeviceSize size, - VmaSuballocationType suballocType, - uint32_t memTypeIndex, - bool withinBudget, - bool map, - bool isUserDataString, - void* pUserData, - float priority, - VkBuffer dedicatedBuffer, - VkBufferUsageFlags dedicatedBufferUsage, - VkImage dedicatedImage, - size_t allocationCount, - VmaAllocation* pAllocations) -{ - VMA_ASSERT(allocationCount > 0 && pAllocations); - - if(withinBudget) - { - const uint32_t heapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex); - VmaBudget heapBudget = {}; - GetBudget(&heapBudget, heapIndex, 1); - if(heapBudget.usage + size * allocationCount > heapBudget.budget) - { - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - } - - VkMemoryAllocateInfo allocInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; - allocInfo.memoryTypeIndex = memTypeIndex; - allocInfo.allocationSize = size; - -#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - VkMemoryDedicatedAllocateInfoKHR dedicatedAllocInfo = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR }; - 
if(m_UseKhrDedicatedAllocation || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - if(dedicatedBuffer != VK_NULL_HANDLE) - { - VMA_ASSERT(dedicatedImage == VK_NULL_HANDLE); - dedicatedAllocInfo.buffer = dedicatedBuffer; - VmaPnextChainPushFront(&allocInfo, &dedicatedAllocInfo); - } - else if(dedicatedImage != VK_NULL_HANDLE) - { - dedicatedAllocInfo.image = dedicatedImage; - VmaPnextChainPushFront(&allocInfo, &dedicatedAllocInfo); - } - } -#endif // #if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - -#if VMA_BUFFER_DEVICE_ADDRESS - VkMemoryAllocateFlagsInfoKHR allocFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR }; - if(m_UseKhrBufferDeviceAddress) - { - bool canContainBufferWithDeviceAddress = true; - if(dedicatedBuffer != VK_NULL_HANDLE) - { - canContainBufferWithDeviceAddress = dedicatedBufferUsage == UINT32_MAX || // Usage flags unknown - (dedicatedBufferUsage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT) != 0; - } - else if(dedicatedImage != VK_NULL_HANDLE) - { - canContainBufferWithDeviceAddress = false; - } - if(canContainBufferWithDeviceAddress) - { - allocFlagsInfo.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; - VmaPnextChainPushFront(&allocInfo, &allocFlagsInfo); - } - } -#endif // #if VMA_BUFFER_DEVICE_ADDRESS - -#if VMA_MEMORY_PRIORITY - VkMemoryPriorityAllocateInfoEXT priorityInfo = { VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT }; - if(m_UseExtMemoryPriority) - { - priorityInfo.priority = priority; - VmaPnextChainPushFront(&allocInfo, &priorityInfo); - } -#endif // #if VMA_MEMORY_PRIORITY - - size_t allocIndex; - VkResult res = VK_SUCCESS; - for(allocIndex = 0; allocIndex < allocationCount; ++allocIndex) - { - res = AllocateDedicatedMemoryPage( - size, - suballocType, - memTypeIndex, - allocInfo, - map, - isUserDataString, - pUserData, - pAllocations + allocIndex); - if(res != VK_SUCCESS) - { - break; - } - } - - if(res == VK_SUCCESS) - { - // Register them in m_pDedicatedAllocations. 
- { - VmaMutexLockWrite lock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); - AllocationVectorType* pDedicatedAllocations = m_pDedicatedAllocations[memTypeIndex]; - VMA_ASSERT(pDedicatedAllocations); - for(allocIndex = 0; allocIndex < allocationCount; ++allocIndex) - { - VmaVectorInsertSorted(*pDedicatedAllocations, pAllocations[allocIndex]); - } - } - - VMA_DEBUG_LOG(" Allocated DedicatedMemory Count=%zu, MemoryTypeIndex=#%u", allocationCount, memTypeIndex); - } - else - { - // Free all already created allocations. - while(allocIndex--) - { - VmaAllocation currAlloc = pAllocations[allocIndex]; - VkDeviceMemory hMemory = currAlloc->GetMemory(); - - /* - There is no need to call this, because Vulkan spec allows to skip vkUnmapMemory - before vkFreeMemory. - - if(currAlloc->GetMappedData() != VMA_NULL) - { - (*m_VulkanFunctions.vkUnmapMemory)(m_hDevice, hMemory); - } - */ - - FreeVulkanMemory(memTypeIndex, currAlloc->GetSize(), hMemory); - m_Budget.RemoveAllocation(MemoryTypeIndexToHeapIndex(memTypeIndex), currAlloc->GetSize()); - currAlloc->SetUserData(this, VMA_NULL); - m_AllocationObjectAllocator.Free(currAlloc); - } - - memset(pAllocations, 0, sizeof(VmaAllocation) * allocationCount); - } - - return res; -} - -VkResult VmaAllocator_T::AllocateDedicatedMemoryPage( - VkDeviceSize size, - VmaSuballocationType suballocType, - uint32_t memTypeIndex, - const VkMemoryAllocateInfo& allocInfo, - bool map, - bool isUserDataString, - void* pUserData, - VmaAllocation* pAllocation) -{ - VkDeviceMemory hMemory = VK_NULL_HANDLE; - VkResult res = AllocateVulkanMemory(&allocInfo, &hMemory); - if(res < 0) - { - VMA_DEBUG_LOG(" vkAllocateMemory FAILED"); - return res; - } - - void* pMappedData = VMA_NULL; - if(map) - { - res = (*m_VulkanFunctions.vkMapMemory)( - m_hDevice, - hMemory, - 0, - VK_WHOLE_SIZE, - 0, - &pMappedData); - if(res < 0) - { - VMA_DEBUG_LOG(" vkMapMemory FAILED"); - FreeVulkanMemory(memTypeIndex, size, hMemory); - return res; - } - } - - *pAllocation = 
m_AllocationObjectAllocator.Allocate(m_CurrentFrameIndex.load(), isUserDataString); - (*pAllocation)->InitDedicatedAllocation(memTypeIndex, hMemory, suballocType, pMappedData, size); - (*pAllocation)->SetUserData(this, pUserData); - m_Budget.AddAllocation(MemoryTypeIndexToHeapIndex(memTypeIndex), size); - if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) - { - FillAllocation(*pAllocation, VMA_ALLOCATION_FILL_PATTERN_CREATED); - } - - return VK_SUCCESS; -} - -void VmaAllocator_T::GetBufferMemoryRequirements( - VkBuffer hBuffer, - VkMemoryRequirements& memReq, - bool& requiresDedicatedAllocation, - bool& prefersDedicatedAllocation) const -{ -#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - if(m_UseKhrDedicatedAllocation || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - VkBufferMemoryRequirementsInfo2KHR memReqInfo = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR }; - memReqInfo.buffer = hBuffer; - - VkMemoryDedicatedRequirementsKHR memDedicatedReq = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR }; - - VkMemoryRequirements2KHR memReq2 = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR }; - VmaPnextChainPushFront(&memReq2, &memDedicatedReq); - - (*m_VulkanFunctions.vkGetBufferMemoryRequirements2KHR)(m_hDevice, &memReqInfo, &memReq2); - - memReq = memReq2.memoryRequirements; - requiresDedicatedAllocation = (memDedicatedReq.requiresDedicatedAllocation != VK_FALSE); - prefersDedicatedAllocation = (memDedicatedReq.prefersDedicatedAllocation != VK_FALSE); - } - else -#endif // #if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - { - (*m_VulkanFunctions.vkGetBufferMemoryRequirements)(m_hDevice, hBuffer, &memReq); - requiresDedicatedAllocation = false; - prefersDedicatedAllocation = false; - } -} - -void VmaAllocator_T::GetImageMemoryRequirements( - VkImage hImage, - VkMemoryRequirements& memReq, - bool& requiresDedicatedAllocation, - bool& prefersDedicatedAllocation) const -{ -#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - 
if(m_UseKhrDedicatedAllocation || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) - { - VkImageMemoryRequirementsInfo2KHR memReqInfo = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR }; - memReqInfo.image = hImage; - - VkMemoryDedicatedRequirementsKHR memDedicatedReq = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR }; - - VkMemoryRequirements2KHR memReq2 = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR }; - VmaPnextChainPushFront(&memReq2, &memDedicatedReq); - - (*m_VulkanFunctions.vkGetImageMemoryRequirements2KHR)(m_hDevice, &memReqInfo, &memReq2); - - memReq = memReq2.memoryRequirements; - requiresDedicatedAllocation = (memDedicatedReq.requiresDedicatedAllocation != VK_FALSE); - prefersDedicatedAllocation = (memDedicatedReq.prefersDedicatedAllocation != VK_FALSE); - } - else -#endif // #if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 - { - (*m_VulkanFunctions.vkGetImageMemoryRequirements)(m_hDevice, hImage, &memReq); - requiresDedicatedAllocation = false; - prefersDedicatedAllocation = false; - } -} - -VkResult VmaAllocator_T::AllocateMemory( - const VkMemoryRequirements& vkMemReq, - bool requiresDedicatedAllocation, - bool prefersDedicatedAllocation, - VkBuffer dedicatedBuffer, - VkBufferUsageFlags dedicatedBufferUsage, - VkImage dedicatedImage, - const VmaAllocationCreateInfo& createInfo, - VmaSuballocationType suballocType, - size_t allocationCount, - VmaAllocation* pAllocations) -{ - memset(pAllocations, 0, sizeof(VmaAllocation) * allocationCount); - - VMA_ASSERT(VmaIsPow2(vkMemReq.alignment)); - - if(vkMemReq.size == 0) - { - return VK_ERROR_VALIDATION_FAILED_EXT; - } - if((createInfo.flags & VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT) != 0 && - (createInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) - { - VMA_ASSERT(0 && "Specifying VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT together with VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT makes no sense."); - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - if((createInfo.flags & 
VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0 && - (createInfo.flags & VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT) != 0) - { - VMA_ASSERT(0 && "Specifying VMA_ALLOCATION_CREATE_MAPPED_BIT together with VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT is invalid."); - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - if(requiresDedicatedAllocation) - { - if((createInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) - { - VMA_ASSERT(0 && "VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT specified while dedicated allocation is required."); - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - if(createInfo.pool != VK_NULL_HANDLE) - { - VMA_ASSERT(0 && "Pool specified while dedicated allocation is required."); - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - } - if((createInfo.pool != VK_NULL_HANDLE) && - ((createInfo.flags & (VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT)) != 0)) - { - VMA_ASSERT(0 && "Specifying VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT when pool != null is invalid."); - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - - if(createInfo.pool != VK_NULL_HANDLE) - { - const VkDeviceSize alignmentForPool = VMA_MAX( - vkMemReq.alignment, - GetMemoryTypeMinAlignment(createInfo.pool->m_BlockVector.GetMemoryTypeIndex())); - - VmaAllocationCreateInfo createInfoForPool = createInfo; - // If memory type is not HOST_VISIBLE, disable MAPPED. - if((createInfoForPool.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0 && - (m_MemProps.memoryTypes[createInfo.pool->m_BlockVector.GetMemoryTypeIndex()].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) - { - createInfoForPool.flags &= ~VMA_ALLOCATION_CREATE_MAPPED_BIT; - } - - return createInfo.pool->m_BlockVector.Allocate( - m_CurrentFrameIndex.load(), - vkMemReq.size, - alignmentForPool, - createInfoForPool, - suballocType, - allocationCount, - pAllocations); - } - else - { - // Bit mask of memory Vulkan types acceptable for this allocation. 
- uint32_t memoryTypeBits = vkMemReq.memoryTypeBits; - uint32_t memTypeIndex = UINT32_MAX; - VkResult res = vmaFindMemoryTypeIndex(this, memoryTypeBits, &createInfo, &memTypeIndex); - if(res == VK_SUCCESS) - { - VkDeviceSize alignmentForMemType = VMA_MAX( - vkMemReq.alignment, - GetMemoryTypeMinAlignment(memTypeIndex)); - - res = AllocateMemoryOfType( - vkMemReq.size, - alignmentForMemType, - requiresDedicatedAllocation || prefersDedicatedAllocation, - dedicatedBuffer, - dedicatedBufferUsage, - dedicatedImage, - createInfo, - memTypeIndex, - suballocType, - allocationCount, - pAllocations); - // Succeeded on first try. - if(res == VK_SUCCESS) - { - return res; - } - // Allocation from this memory type failed. Try other compatible memory types. - else - { - for(;;) - { - // Remove old memTypeIndex from list of possibilities. - memoryTypeBits &= ~(1u << memTypeIndex); - // Find alternative memTypeIndex. - res = vmaFindMemoryTypeIndex(this, memoryTypeBits, &createInfo, &memTypeIndex); - if(res == VK_SUCCESS) - { - alignmentForMemType = VMA_MAX( - vkMemReq.alignment, - GetMemoryTypeMinAlignment(memTypeIndex)); - - res = AllocateMemoryOfType( - vkMemReq.size, - alignmentForMemType, - requiresDedicatedAllocation || prefersDedicatedAllocation, - dedicatedBuffer, - dedicatedBufferUsage, - dedicatedImage, - createInfo, - memTypeIndex, - suballocType, - allocationCount, - pAllocations); - // Allocation from this alternative memory type succeeded. - if(res == VK_SUCCESS) - { - return res; - } - // else: Allocation from this memory type failed. Try next one - next loop iteration. - } - // No other matching memory type index could be found. - else - { - // Not returning res, which is VK_ERROR_FEATURE_NOT_PRESENT, because we already failed to allocate once. - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - } - } - } - // Can't find any single memory type maching requirements. res is VK_ERROR_FEATURE_NOT_PRESENT. 
- else - return res; - } -} - -void VmaAllocator_T::FreeMemory( - size_t allocationCount, - const VmaAllocation* pAllocations) -{ - VMA_ASSERT(pAllocations); - - for(size_t allocIndex = allocationCount; allocIndex--; ) - { - VmaAllocation allocation = pAllocations[allocIndex]; - - if(allocation != VK_NULL_HANDLE) - { - if(TouchAllocation(allocation)) - { - if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) - { - FillAllocation(allocation, VMA_ALLOCATION_FILL_PATTERN_DESTROYED); - } - - switch(allocation->GetType()) - { - case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: - { - VmaBlockVector* pBlockVector = VMA_NULL; - VmaPool hPool = allocation->GetBlock()->GetParentPool(); - if(hPool != VK_NULL_HANDLE) - { - pBlockVector = &hPool->m_BlockVector; - } - else - { - const uint32_t memTypeIndex = allocation->GetMemoryTypeIndex(); - pBlockVector = m_pBlockVectors[memTypeIndex]; - } - pBlockVector->Free(allocation); - } - break; - case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: - FreeDedicatedMemory(allocation); - break; - default: - VMA_ASSERT(0); - } - } - - // Do this regardless of whether the allocation is lost. Lost allocations still account to Budget.AllocationBytes. - m_Budget.RemoveAllocation(MemoryTypeIndexToHeapIndex(allocation->GetMemoryTypeIndex()), allocation->GetSize()); - allocation->SetUserData(this, VMA_NULL); - m_AllocationObjectAllocator.Free(allocation); - } - } -} - -void VmaAllocator_T::CalculateStats(VmaStats* pStats) -{ - // Initialize. - InitStatInfo(pStats->total); - for(size_t i = 0; i < VK_MAX_MEMORY_TYPES; ++i) - InitStatInfo(pStats->memoryType[i]); - for(size_t i = 0; i < VK_MAX_MEMORY_HEAPS; ++i) - InitStatInfo(pStats->memoryHeap[i]); - - // Process default pools. - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex]; - VMA_ASSERT(pBlockVector); - pBlockVector->AddStats(pStats); - } - - // Process custom pools. 
- { - VmaMutexLockRead lock(m_PoolsMutex, m_UseMutex); - for(size_t poolIndex = 0, poolCount = m_Pools.size(); poolIndex < poolCount; ++poolIndex) - { - m_Pools[poolIndex]->m_BlockVector.AddStats(pStats); - } - } - - // Process dedicated allocations. - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - const uint32_t memHeapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex); - VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); - AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex]; - VMA_ASSERT(pDedicatedAllocVector); - for(size_t allocIndex = 0, allocCount = pDedicatedAllocVector->size(); allocIndex < allocCount; ++allocIndex) - { - VmaStatInfo allocationStatInfo; - (*pDedicatedAllocVector)[allocIndex]->DedicatedAllocCalcStatsInfo(allocationStatInfo); - VmaAddStatInfo(pStats->total, allocationStatInfo); - VmaAddStatInfo(pStats->memoryType[memTypeIndex], allocationStatInfo); - VmaAddStatInfo(pStats->memoryHeap[memHeapIndex], allocationStatInfo); - } - } - - // Postprocess. 
- VmaPostprocessCalcStatInfo(pStats->total); - for(size_t i = 0; i < GetMemoryTypeCount(); ++i) - VmaPostprocessCalcStatInfo(pStats->memoryType[i]); - for(size_t i = 0; i < GetMemoryHeapCount(); ++i) - VmaPostprocessCalcStatInfo(pStats->memoryHeap[i]); -} - -void VmaAllocator_T::GetBudget(VmaBudget* outBudget, uint32_t firstHeap, uint32_t heapCount) -{ -#if VMA_MEMORY_BUDGET - if(m_UseExtMemoryBudget) - { - if(m_Budget.m_OperationsSinceBudgetFetch < 30) - { - VmaMutexLockRead lockRead(m_Budget.m_BudgetMutex, m_UseMutex); - for(uint32_t i = 0; i < heapCount; ++i, ++outBudget) - { - const uint32_t heapIndex = firstHeap + i; - - outBudget->blockBytes = m_Budget.m_BlockBytes[heapIndex]; - outBudget->allocationBytes = m_Budget.m_AllocationBytes[heapIndex]; - - if(m_Budget.m_VulkanUsage[heapIndex] + outBudget->blockBytes > m_Budget.m_BlockBytesAtBudgetFetch[heapIndex]) - { - outBudget->usage = m_Budget.m_VulkanUsage[heapIndex] + - outBudget->blockBytes - m_Budget.m_BlockBytesAtBudgetFetch[heapIndex]; - } - else - { - outBudget->usage = 0; - } - - // Have to take MIN with heap size because explicit HeapSizeLimit is included in it. - outBudget->budget = VMA_MIN( - m_Budget.m_VulkanBudget[heapIndex], m_MemProps.memoryHeaps[heapIndex].size); - } - } - else - { - UpdateVulkanBudget(); // Outside of mutex lock - GetBudget(outBudget, firstHeap, heapCount); // Recursion - } - } - else -#endif - { - for(uint32_t i = 0; i < heapCount; ++i, ++outBudget) - { - const uint32_t heapIndex = firstHeap + i; - - outBudget->blockBytes = m_Budget.m_BlockBytes[heapIndex]; - outBudget->allocationBytes = m_Budget.m_AllocationBytes[heapIndex]; - - outBudget->usage = outBudget->blockBytes; - outBudget->budget = m_MemProps.memoryHeaps[heapIndex].size * 8 / 10; // 80% heuristics. 
- } - } -} - -static const uint32_t VMA_VENDOR_ID_AMD = 4098; - -VkResult VmaAllocator_T::DefragmentationBegin( - const VmaDefragmentationInfo2& info, - VmaDefragmentationStats* pStats, - VmaDefragmentationContext* pContext) -{ - if(info.pAllocationsChanged != VMA_NULL) - { - memset(info.pAllocationsChanged, 0, info.allocationCount * sizeof(VkBool32)); - } - - *pContext = vma_new(this, VmaDefragmentationContext_T)( - this, m_CurrentFrameIndex.load(), info.flags, pStats); - - (*pContext)->AddPools(info.poolCount, info.pPools); - (*pContext)->AddAllocations( - info.allocationCount, info.pAllocations, info.pAllocationsChanged); - - VkResult res = (*pContext)->Defragment( - info.maxCpuBytesToMove, info.maxCpuAllocationsToMove, - info.maxGpuBytesToMove, info.maxGpuAllocationsToMove, - info.commandBuffer, pStats, info.flags); - - if(res != VK_NOT_READY) - { - vma_delete(this, *pContext); - *pContext = VMA_NULL; - } - - return res; -} - -VkResult VmaAllocator_T::DefragmentationEnd( - VmaDefragmentationContext context) -{ - vma_delete(this, context); - return VK_SUCCESS; -} - -VkResult VmaAllocator_T::DefragmentationPassBegin( - VmaDefragmentationPassInfo* pInfo, - VmaDefragmentationContext context) -{ - return context->DefragmentPassBegin(pInfo); -} -VkResult VmaAllocator_T::DefragmentationPassEnd( - VmaDefragmentationContext context) -{ - return context->DefragmentPassEnd(); - -} - -void VmaAllocator_T::GetAllocationInfo(VmaAllocation hAllocation, VmaAllocationInfo* pAllocationInfo) -{ - if(hAllocation->CanBecomeLost()) - { - /* - Warning: This is a carefully designed algorithm. 
- Do not modify unless you really know what you're doing :) - */ - const uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); - uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); - for(;;) - { - if(localLastUseFrameIndex == VMA_FRAME_INDEX_LOST) - { - pAllocationInfo->memoryType = UINT32_MAX; - pAllocationInfo->deviceMemory = VK_NULL_HANDLE; - pAllocationInfo->offset = 0; - pAllocationInfo->size = hAllocation->GetSize(); - pAllocationInfo->pMappedData = VMA_NULL; - pAllocationInfo->pUserData = hAllocation->GetUserData(); - return; - } - else if(localLastUseFrameIndex == localCurrFrameIndex) - { - pAllocationInfo->memoryType = hAllocation->GetMemoryTypeIndex(); - pAllocationInfo->deviceMemory = hAllocation->GetMemory(); - pAllocationInfo->offset = hAllocation->GetOffset(); - pAllocationInfo->size = hAllocation->GetSize(); - pAllocationInfo->pMappedData = VMA_NULL; - pAllocationInfo->pUserData = hAllocation->GetUserData(); - return; - } - else // Last use time earlier than current time. - { - if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) - { - localLastUseFrameIndex = localCurrFrameIndex; - } - } - } - } - else - { -#if VMA_STATS_STRING_ENABLED - uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); - uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); - for(;;) - { - VMA_ASSERT(localLastUseFrameIndex != VMA_FRAME_INDEX_LOST); - if(localLastUseFrameIndex == localCurrFrameIndex) - { - break; - } - else // Last use time earlier than current time. 
- { - if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) - { - localLastUseFrameIndex = localCurrFrameIndex; - } - } - } -#endif - - pAllocationInfo->memoryType = hAllocation->GetMemoryTypeIndex(); - pAllocationInfo->deviceMemory = hAllocation->GetMemory(); - pAllocationInfo->offset = hAllocation->GetOffset(); - pAllocationInfo->size = hAllocation->GetSize(); - pAllocationInfo->pMappedData = hAllocation->GetMappedData(); - pAllocationInfo->pUserData = hAllocation->GetUserData(); - } -} - -bool VmaAllocator_T::TouchAllocation(VmaAllocation hAllocation) -{ - // This is a stripped-down version of VmaAllocator_T::GetAllocationInfo. - if(hAllocation->CanBecomeLost()) - { - uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); - uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); - for(;;) - { - if(localLastUseFrameIndex == VMA_FRAME_INDEX_LOST) - { - return false; - } - else if(localLastUseFrameIndex == localCurrFrameIndex) - { - return true; - } - else // Last use time earlier than current time. - { - if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) - { - localLastUseFrameIndex = localCurrFrameIndex; - } - } - } - } - else - { -#if VMA_STATS_STRING_ENABLED - uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); - uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); - for(;;) - { - VMA_ASSERT(localLastUseFrameIndex != VMA_FRAME_INDEX_LOST); - if(localLastUseFrameIndex == localCurrFrameIndex) - { - break; - } - else // Last use time earlier than current time. 
- { - if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) - { - localLastUseFrameIndex = localCurrFrameIndex; - } - } - } -#endif - - return true; - } -} - -VkResult VmaAllocator_T::CreatePool(const VmaPoolCreateInfo* pCreateInfo, VmaPool* pPool) -{ - VMA_DEBUG_LOG(" CreatePool: MemoryTypeIndex=%u, flags=%u", pCreateInfo->memoryTypeIndex, pCreateInfo->flags); - - VmaPoolCreateInfo newCreateInfo = *pCreateInfo; - - if(newCreateInfo.maxBlockCount == 0) - { - newCreateInfo.maxBlockCount = SIZE_MAX; - } - if(newCreateInfo.minBlockCount > newCreateInfo.maxBlockCount) - { - return VK_ERROR_INITIALIZATION_FAILED; - } - // Memory type index out of range or forbidden. - if(pCreateInfo->memoryTypeIndex >= GetMemoryTypeCount() || - ((1u << pCreateInfo->memoryTypeIndex) & m_GlobalMemoryTypeBits) == 0) - { - return VK_ERROR_FEATURE_NOT_PRESENT; - } - - const VkDeviceSize preferredBlockSize = CalcPreferredBlockSize(newCreateInfo.memoryTypeIndex); - - *pPool = vma_new(this, VmaPool_T)(this, newCreateInfo, preferredBlockSize); - - VkResult res = (*pPool)->m_BlockVector.CreateMinBlocks(); - if(res != VK_SUCCESS) - { - vma_delete(this, *pPool); - *pPool = VMA_NULL; - return res; - } - - // Add to m_Pools. - { - VmaMutexLockWrite lock(m_PoolsMutex, m_UseMutex); - (*pPool)->SetId(m_NextPoolId++); - VmaVectorInsertSorted(m_Pools, *pPool); - } - - return VK_SUCCESS; -} - -void VmaAllocator_T::DestroyPool(VmaPool pool) -{ - // Remove from m_Pools. 
- { - VmaMutexLockWrite lock(m_PoolsMutex, m_UseMutex); - bool success = VmaVectorRemoveSorted(m_Pools, pool); - VMA_ASSERT(success && "Pool not found in Allocator."); - } - - vma_delete(this, pool); -} - -void VmaAllocator_T::GetPoolStats(VmaPool pool, VmaPoolStats* pPoolStats) -{ - pool->m_BlockVector.GetPoolStats(pPoolStats); -} - -void VmaAllocator_T::SetCurrentFrameIndex(uint32_t frameIndex) -{ - m_CurrentFrameIndex.store(frameIndex); - -#if VMA_MEMORY_BUDGET - if(m_UseExtMemoryBudget) - { - UpdateVulkanBudget(); - } -#endif // #if VMA_MEMORY_BUDGET -} - -void VmaAllocator_T::MakePoolAllocationsLost( - VmaPool hPool, - size_t* pLostAllocationCount) -{ - hPool->m_BlockVector.MakePoolAllocationsLost( - m_CurrentFrameIndex.load(), - pLostAllocationCount); -} - -VkResult VmaAllocator_T::CheckPoolCorruption(VmaPool hPool) -{ - return hPool->m_BlockVector.CheckCorruption(); -} - -VkResult VmaAllocator_T::CheckCorruption(uint32_t memoryTypeBits) -{ - VkResult finalRes = VK_ERROR_FEATURE_NOT_PRESENT; - - // Process default pools. - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - if(((1u << memTypeIndex) & memoryTypeBits) != 0) - { - VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex]; - VMA_ASSERT(pBlockVector); - VkResult localRes = pBlockVector->CheckCorruption(); - switch(localRes) - { - case VK_ERROR_FEATURE_NOT_PRESENT: - break; - case VK_SUCCESS: - finalRes = VK_SUCCESS; - break; - default: - return localRes; - } - } - } - - // Process custom pools. 
- { - VmaMutexLockRead lock(m_PoolsMutex, m_UseMutex); - for(size_t poolIndex = 0, poolCount = m_Pools.size(); poolIndex < poolCount; ++poolIndex) - { - if(((1u << m_Pools[poolIndex]->m_BlockVector.GetMemoryTypeIndex()) & memoryTypeBits) != 0) - { - VkResult localRes = m_Pools[poolIndex]->m_BlockVector.CheckCorruption(); - switch(localRes) - { - case VK_ERROR_FEATURE_NOT_PRESENT: - break; - case VK_SUCCESS: - finalRes = VK_SUCCESS; - break; - default: - return localRes; - } - } - } - } - - return finalRes; -} - -void VmaAllocator_T::CreateLostAllocation(VmaAllocation* pAllocation) -{ - *pAllocation = m_AllocationObjectAllocator.Allocate(VMA_FRAME_INDEX_LOST, false); - (*pAllocation)->InitLost(); -} - -// An object that increments given atomic but decrements it back in the destructor unless Commit() is called. -template -struct AtomicTransactionalIncrement -{ -public: - typedef std::atomic AtomicT; - ~AtomicTransactionalIncrement() - { - if(m_Atomic) - --(*m_Atomic); - } - T Increment(AtomicT* atomic) - { - m_Atomic = atomic; - return m_Atomic->fetch_add(1); - } - void Commit() - { - m_Atomic = nullptr; - } - -private: - AtomicT* m_Atomic = nullptr; -}; - -VkResult VmaAllocator_T::AllocateVulkanMemory(const VkMemoryAllocateInfo* pAllocateInfo, VkDeviceMemory* pMemory) -{ - AtomicTransactionalIncrement deviceMemoryCountIncrement; - const uint64_t prevDeviceMemoryCount = deviceMemoryCountIncrement.Increment(&m_DeviceMemoryCount); -#if VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT - if(prevDeviceMemoryCount >= m_PhysicalDeviceProperties.limits.maxMemoryAllocationCount) - { - return VK_ERROR_TOO_MANY_OBJECTS; - } -#endif - - const uint32_t heapIndex = MemoryTypeIndexToHeapIndex(pAllocateInfo->memoryTypeIndex); - - // HeapSizeLimit is in effect for this heap. 
- if((m_HeapSizeLimitMask & (1u << heapIndex)) != 0) - { - const VkDeviceSize heapSize = m_MemProps.memoryHeaps[heapIndex].size; - VkDeviceSize blockBytes = m_Budget.m_BlockBytes[heapIndex]; - for(;;) - { - const VkDeviceSize blockBytesAfterAllocation = blockBytes + pAllocateInfo->allocationSize; - if(blockBytesAfterAllocation > heapSize) - { - return VK_ERROR_OUT_OF_DEVICE_MEMORY; - } - if(m_Budget.m_BlockBytes[heapIndex].compare_exchange_strong(blockBytes, blockBytesAfterAllocation)) - { - break; - } - } - } - else - { - m_Budget.m_BlockBytes[heapIndex] += pAllocateInfo->allocationSize; - } - - // VULKAN CALL vkAllocateMemory. - VkResult res = (*m_VulkanFunctions.vkAllocateMemory)(m_hDevice, pAllocateInfo, GetAllocationCallbacks(), pMemory); - - if(res == VK_SUCCESS) - { -#if VMA_MEMORY_BUDGET - ++m_Budget.m_OperationsSinceBudgetFetch; -#endif - - // Informative callback. - if(m_DeviceMemoryCallbacks.pfnAllocate != VMA_NULL) - { - (*m_DeviceMemoryCallbacks.pfnAllocate)(this, pAllocateInfo->memoryTypeIndex, *pMemory, pAllocateInfo->allocationSize, m_DeviceMemoryCallbacks.pUserData); - } - - deviceMemoryCountIncrement.Commit(); - } - else - { - m_Budget.m_BlockBytes[heapIndex] -= pAllocateInfo->allocationSize; - } - - return res; -} - -void VmaAllocator_T::FreeVulkanMemory(uint32_t memoryType, VkDeviceSize size, VkDeviceMemory hMemory) -{ - // Informative callback. - if(m_DeviceMemoryCallbacks.pfnFree != VMA_NULL) - { - (*m_DeviceMemoryCallbacks.pfnFree)(this, memoryType, hMemory, size, m_DeviceMemoryCallbacks.pUserData); - } - - // VULKAN CALL vkFreeMemory. 
- (*m_VulkanFunctions.vkFreeMemory)(m_hDevice, hMemory, GetAllocationCallbacks()); - - m_Budget.m_BlockBytes[MemoryTypeIndexToHeapIndex(memoryType)] -= size; - - --m_DeviceMemoryCount; -} - -VkResult VmaAllocator_T::BindVulkanBuffer( - VkDeviceMemory memory, - VkDeviceSize memoryOffset, - VkBuffer buffer, - const void* pNext) -{ - if(pNext != VMA_NULL) - { -#if VMA_VULKAN_VERSION >= 1001000 || VMA_BIND_MEMORY2 - if((m_UseKhrBindMemory2 || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) && - m_VulkanFunctions.vkBindBufferMemory2KHR != VMA_NULL) - { - VkBindBufferMemoryInfoKHR bindBufferMemoryInfo = { VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO_KHR }; - bindBufferMemoryInfo.pNext = pNext; - bindBufferMemoryInfo.buffer = buffer; - bindBufferMemoryInfo.memory = memory; - bindBufferMemoryInfo.memoryOffset = memoryOffset; - return (*m_VulkanFunctions.vkBindBufferMemory2KHR)(m_hDevice, 1, &bindBufferMemoryInfo); - } - else -#endif // #if VMA_VULKAN_VERSION >= 1001000 || VMA_BIND_MEMORY2 - { - return VK_ERROR_EXTENSION_NOT_PRESENT; - } - } - else - { - return (*m_VulkanFunctions.vkBindBufferMemory)(m_hDevice, buffer, memory, memoryOffset); - } -} - -VkResult VmaAllocator_T::BindVulkanImage( - VkDeviceMemory memory, - VkDeviceSize memoryOffset, - VkImage image, - const void* pNext) -{ - if(pNext != VMA_NULL) - { -#if VMA_VULKAN_VERSION >= 1001000 || VMA_BIND_MEMORY2 - if((m_UseKhrBindMemory2 || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) && - m_VulkanFunctions.vkBindImageMemory2KHR != VMA_NULL) - { - VkBindImageMemoryInfoKHR bindBufferMemoryInfo = { VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO_KHR }; - bindBufferMemoryInfo.pNext = pNext; - bindBufferMemoryInfo.image = image; - bindBufferMemoryInfo.memory = memory; - bindBufferMemoryInfo.memoryOffset = memoryOffset; - return (*m_VulkanFunctions.vkBindImageMemory2KHR)(m_hDevice, 1, &bindBufferMemoryInfo); - } - else -#endif // #if VMA_BIND_MEMORY2 - { - return VK_ERROR_EXTENSION_NOT_PRESENT; - } - } - else - { - return 
(*m_VulkanFunctions.vkBindImageMemory)(m_hDevice, image, memory, memoryOffset); - } -} - -VkResult VmaAllocator_T::Map(VmaAllocation hAllocation, void** ppData) -{ - if(hAllocation->CanBecomeLost()) - { - return VK_ERROR_MEMORY_MAP_FAILED; - } - - switch(hAllocation->GetType()) - { - case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: - { - VmaDeviceMemoryBlock* const pBlock = hAllocation->GetBlock(); - char *pBytes = VMA_NULL; - VkResult res = pBlock->Map(this, 1, (void**)&pBytes); - if(res == VK_SUCCESS) - { - *ppData = pBytes + (ptrdiff_t)hAllocation->GetOffset(); - hAllocation->BlockAllocMap(); - } - return res; - } - case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: - return hAllocation->DedicatedAllocMap(this, ppData); - default: - VMA_ASSERT(0); - return VK_ERROR_MEMORY_MAP_FAILED; - } -} - -void VmaAllocator_T::Unmap(VmaAllocation hAllocation) -{ - switch(hAllocation->GetType()) - { - case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: - { - VmaDeviceMemoryBlock* const pBlock = hAllocation->GetBlock(); - hAllocation->BlockAllocUnmap(); - pBlock->Unmap(this, 1); - } - break; - case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: - hAllocation->DedicatedAllocUnmap(this); - break; - default: - VMA_ASSERT(0); - } -} - -VkResult VmaAllocator_T::BindBufferMemory( - VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkBuffer hBuffer, - const void* pNext) -{ - VkResult res = VK_SUCCESS; - switch(hAllocation->GetType()) - { - case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: - res = BindVulkanBuffer(hAllocation->GetMemory(), allocationLocalOffset, hBuffer, pNext); - break; - case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: - { - VmaDeviceMemoryBlock* const pBlock = hAllocation->GetBlock(); - VMA_ASSERT(pBlock && "Binding buffer to allocation that doesn't belong to any block. 
Is the allocation lost?"); - res = pBlock->BindBufferMemory(this, hAllocation, allocationLocalOffset, hBuffer, pNext); - break; - } - default: - VMA_ASSERT(0); - } - return res; -} - -VkResult VmaAllocator_T::BindImageMemory( - VmaAllocation hAllocation, - VkDeviceSize allocationLocalOffset, - VkImage hImage, - const void* pNext) -{ - VkResult res = VK_SUCCESS; - switch(hAllocation->GetType()) - { - case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: - res = BindVulkanImage(hAllocation->GetMemory(), allocationLocalOffset, hImage, pNext); - break; - case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: - { - VmaDeviceMemoryBlock* pBlock = hAllocation->GetBlock(); - VMA_ASSERT(pBlock && "Binding image to allocation that doesn't belong to any block. Is the allocation lost?"); - res = pBlock->BindImageMemory(this, hAllocation, allocationLocalOffset, hImage, pNext); - break; - } - default: - VMA_ASSERT(0); - } - return res; -} - -VkResult VmaAllocator_T::FlushOrInvalidateAllocation( - VmaAllocation hAllocation, - VkDeviceSize offset, VkDeviceSize size, - VMA_CACHE_OPERATION op) -{ - VkResult res = VK_SUCCESS; - - VkMappedMemoryRange memRange = {}; - if(GetFlushOrInvalidateRange(hAllocation, offset, size, memRange)) - { - switch(op) - { - case VMA_CACHE_FLUSH: - res = (*GetVulkanFunctions().vkFlushMappedMemoryRanges)(m_hDevice, 1, &memRange); - break; - case VMA_CACHE_INVALIDATE: - res = (*GetVulkanFunctions().vkInvalidateMappedMemoryRanges)(m_hDevice, 1, &memRange); - break; - default: - VMA_ASSERT(0); - } - } - // else: Just ignore this call. 
- return res; -} - -VkResult VmaAllocator_T::FlushOrInvalidateAllocations( - uint32_t allocationCount, - const VmaAllocation* allocations, - const VkDeviceSize* offsets, const VkDeviceSize* sizes, - VMA_CACHE_OPERATION op) -{ - typedef VmaStlAllocator RangeAllocator; - typedef VmaSmallVector RangeVector; - RangeVector ranges = RangeVector(RangeAllocator(GetAllocationCallbacks())); - - for(uint32_t allocIndex = 0; allocIndex < allocationCount; ++allocIndex) - { - const VmaAllocation alloc = allocations[allocIndex]; - const VkDeviceSize offset = offsets != VMA_NULL ? offsets[allocIndex] : 0; - const VkDeviceSize size = sizes != VMA_NULL ? sizes[allocIndex] : VK_WHOLE_SIZE; - VkMappedMemoryRange newRange; - if(GetFlushOrInvalidateRange(alloc, offset, size, newRange)) - { - ranges.push_back(newRange); - } - } - - VkResult res = VK_SUCCESS; - if(!ranges.empty()) - { - switch(op) - { - case VMA_CACHE_FLUSH: - res = (*GetVulkanFunctions().vkFlushMappedMemoryRanges)(m_hDevice, (uint32_t)ranges.size(), ranges.data()); - break; - case VMA_CACHE_INVALIDATE: - res = (*GetVulkanFunctions().vkInvalidateMappedMemoryRanges)(m_hDevice, (uint32_t)ranges.size(), ranges.data()); - break; - default: - VMA_ASSERT(0); - } - } - // else: Just ignore this call. 
- return res; -} - -void VmaAllocator_T::FreeDedicatedMemory(const VmaAllocation allocation) -{ - VMA_ASSERT(allocation && allocation->GetType() == VmaAllocation_T::ALLOCATION_TYPE_DEDICATED); - - const uint32_t memTypeIndex = allocation->GetMemoryTypeIndex(); - { - VmaMutexLockWrite lock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); - AllocationVectorType* const pDedicatedAllocations = m_pDedicatedAllocations[memTypeIndex]; - VMA_ASSERT(pDedicatedAllocations); - bool success = VmaVectorRemoveSorted(*pDedicatedAllocations, allocation); - VMA_ASSERT(success); - } - - VkDeviceMemory hMemory = allocation->GetMemory(); - - /* - There is no need to call this, because Vulkan spec allows to skip vkUnmapMemory - before vkFreeMemory. - - if(allocation->GetMappedData() != VMA_NULL) - { - (*m_VulkanFunctions.vkUnmapMemory)(m_hDevice, hMemory); - } - */ - - FreeVulkanMemory(memTypeIndex, allocation->GetSize(), hMemory); - - VMA_DEBUG_LOG(" Freed DedicatedMemory MemoryTypeIndex=%u", memTypeIndex); -} - -uint32_t VmaAllocator_T::CalculateGpuDefragmentationMemoryTypeBits() const -{ - VkBufferCreateInfo dummyBufCreateInfo; - VmaFillGpuDefragmentationBufferCreateInfo(dummyBufCreateInfo); - - uint32_t memoryTypeBits = 0; - - // Create buffer. - VkBuffer buf = VK_NULL_HANDLE; - VkResult res = (*GetVulkanFunctions().vkCreateBuffer)( - m_hDevice, &dummyBufCreateInfo, GetAllocationCallbacks(), &buf); - if(res == VK_SUCCESS) - { - // Query for supported memory types. - VkMemoryRequirements memReq; - (*GetVulkanFunctions().vkGetBufferMemoryRequirements)(m_hDevice, buf, &memReq); - memoryTypeBits = memReq.memoryTypeBits; - - // Destroy buffer. - (*GetVulkanFunctions().vkDestroyBuffer)(m_hDevice, buf, GetAllocationCallbacks()); - } - - return memoryTypeBits; -} - -uint32_t VmaAllocator_T::CalculateGlobalMemoryTypeBits() const -{ - // Make sure memory information is already fetched. 
- VMA_ASSERT(GetMemoryTypeCount() > 0); - - uint32_t memoryTypeBits = UINT32_MAX; - - if(!m_UseAmdDeviceCoherentMemory) - { - // Exclude memory types that have VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD. - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - if((m_MemProps.memoryTypes[memTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY) != 0) - { - memoryTypeBits &= ~(1u << memTypeIndex); - } - } - } - - return memoryTypeBits; -} - -bool VmaAllocator_T::GetFlushOrInvalidateRange( - VmaAllocation allocation, - VkDeviceSize offset, VkDeviceSize size, - VkMappedMemoryRange& outRange) const -{ - const uint32_t memTypeIndex = allocation->GetMemoryTypeIndex(); - if(size > 0 && IsMemoryTypeNonCoherent(memTypeIndex)) - { - const VkDeviceSize nonCoherentAtomSize = m_PhysicalDeviceProperties.limits.nonCoherentAtomSize; - const VkDeviceSize allocationSize = allocation->GetSize(); - VMA_ASSERT(offset <= allocationSize); - - outRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; - outRange.pNext = VMA_NULL; - outRange.memory = allocation->GetMemory(); - - switch(allocation->GetType()) - { - case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: - outRange.offset = VmaAlignDown(offset, nonCoherentAtomSize); - if(size == VK_WHOLE_SIZE) - { - outRange.size = allocationSize - outRange.offset; - } - else - { - VMA_ASSERT(offset + size <= allocationSize); - outRange.size = VMA_MIN( - VmaAlignUp(size + (offset - outRange.offset), nonCoherentAtomSize), - allocationSize - outRange.offset); - } - break; - case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: - { - // 1. Still within this allocation. - outRange.offset = VmaAlignDown(offset, nonCoherentAtomSize); - if(size == VK_WHOLE_SIZE) - { - size = allocationSize - offset; - } - else - { - VMA_ASSERT(offset + size <= allocationSize); - } - outRange.size = VmaAlignUp(size + (offset - outRange.offset), nonCoherentAtomSize); - - // 2. Adjust to whole block. 
- const VkDeviceSize allocationOffset = allocation->GetOffset(); - VMA_ASSERT(allocationOffset % nonCoherentAtomSize == 0); - const VkDeviceSize blockSize = allocation->GetBlock()->m_pMetadata->GetSize(); - outRange.offset += allocationOffset; - outRange.size = VMA_MIN(outRange.size, blockSize - outRange.offset); - - break; - } - default: - VMA_ASSERT(0); - } - return true; - } - return false; -} - -#if VMA_MEMORY_BUDGET - -void VmaAllocator_T::UpdateVulkanBudget() -{ - VMA_ASSERT(m_UseExtMemoryBudget); - - VkPhysicalDeviceMemoryProperties2KHR memProps = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR }; - - VkPhysicalDeviceMemoryBudgetPropertiesEXT budgetProps = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT }; - VmaPnextChainPushFront(&memProps, &budgetProps); - - GetVulkanFunctions().vkGetPhysicalDeviceMemoryProperties2KHR(m_PhysicalDevice, &memProps); - - { - VmaMutexLockWrite lockWrite(m_Budget.m_BudgetMutex, m_UseMutex); - - for(uint32_t heapIndex = 0; heapIndex < GetMemoryHeapCount(); ++heapIndex) - { - m_Budget.m_VulkanUsage[heapIndex] = budgetProps.heapUsage[heapIndex]; - m_Budget.m_VulkanBudget[heapIndex] = budgetProps.heapBudget[heapIndex]; - m_Budget.m_BlockBytesAtBudgetFetch[heapIndex] = m_Budget.m_BlockBytes[heapIndex].load(); - - // Some bugged drivers return the budget incorrectly, e.g. 0 or much bigger than heap size. - if(m_Budget.m_VulkanBudget[heapIndex] == 0) - { - m_Budget.m_VulkanBudget[heapIndex] = m_MemProps.memoryHeaps[heapIndex].size * 8 / 10; // 80% heuristics. 
- } - else if(m_Budget.m_VulkanBudget[heapIndex] > m_MemProps.memoryHeaps[heapIndex].size) - { - m_Budget.m_VulkanBudget[heapIndex] = m_MemProps.memoryHeaps[heapIndex].size; - } - if(m_Budget.m_VulkanUsage[heapIndex] == 0 && m_Budget.m_BlockBytesAtBudgetFetch[heapIndex] > 0) - { - m_Budget.m_VulkanUsage[heapIndex] = m_Budget.m_BlockBytesAtBudgetFetch[heapIndex]; - } - } - m_Budget.m_OperationsSinceBudgetFetch = 0; - } -} - -#endif // #if VMA_MEMORY_BUDGET - -void VmaAllocator_T::FillAllocation(const VmaAllocation hAllocation, uint8_t pattern) -{ - if(VMA_DEBUG_INITIALIZE_ALLOCATIONS && - !hAllocation->CanBecomeLost() && - (m_MemProps.memoryTypes[hAllocation->GetMemoryTypeIndex()].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) - { - void* pData = VMA_NULL; - VkResult res = Map(hAllocation, &pData); - if(res == VK_SUCCESS) - { - memset(pData, (int)pattern, (size_t)hAllocation->GetSize()); - FlushOrInvalidateAllocation(hAllocation, 0, VK_WHOLE_SIZE, VMA_CACHE_FLUSH); - Unmap(hAllocation); - } - else - { - VMA_ASSERT(0 && "VMA_DEBUG_INITIALIZE_ALLOCATIONS is enabled, but couldn't map memory to fill allocation."); - } - } -} - -uint32_t VmaAllocator_T::GetGpuDefragmentationMemoryTypeBits() -{ - uint32_t memoryTypeBits = m_GpuDefragmentationMemoryTypeBits.load(); - if(memoryTypeBits == UINT32_MAX) - { - memoryTypeBits = CalculateGpuDefragmentationMemoryTypeBits(); - m_GpuDefragmentationMemoryTypeBits.store(memoryTypeBits); - } - return memoryTypeBits; -} - -#if VMA_STATS_STRING_ENABLED - -void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json) -{ - bool dedicatedAllocationsStarted = false; - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); - AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex]; - VMA_ASSERT(pDedicatedAllocVector); - if(pDedicatedAllocVector->empty() == false) - { - 
if(dedicatedAllocationsStarted == false) - { - dedicatedAllocationsStarted = true; - json.WriteString("DedicatedAllocations"); - json.BeginObject(); - } - - json.BeginString("Type "); - json.ContinueString(memTypeIndex); - json.EndString(); - - json.BeginArray(); - - for(size_t i = 0; i < pDedicatedAllocVector->size(); ++i) - { - json.BeginObject(true); - const VmaAllocation hAlloc = (*pDedicatedAllocVector)[i]; - hAlloc->PrintParameters(json); - json.EndObject(); - } - - json.EndArray(); - } - } - if(dedicatedAllocationsStarted) - { - json.EndObject(); - } - - { - bool allocationsStarted = false; - for(uint32_t memTypeIndex = 0; memTypeIndex < GetMemoryTypeCount(); ++memTypeIndex) - { - if(m_pBlockVectors[memTypeIndex]->IsEmpty() == false) - { - if(allocationsStarted == false) - { - allocationsStarted = true; - json.WriteString("DefaultPools"); - json.BeginObject(); - } - - json.BeginString("Type "); - json.ContinueString(memTypeIndex); - json.EndString(); - - m_pBlockVectors[memTypeIndex]->PrintDetailedMap(json); - } - } - if(allocationsStarted) - { - json.EndObject(); - } - } - - // Custom pools - { - VmaMutexLockRead lock(m_PoolsMutex, m_UseMutex); - const size_t poolCount = m_Pools.size(); - if(poolCount > 0) - { - json.WriteString("Pools"); - json.BeginObject(); - for(size_t poolIndex = 0; poolIndex < poolCount; ++poolIndex) - { - json.BeginString(); - json.ContinueString(m_Pools[poolIndex]->GetId()); - json.EndString(); - - m_Pools[poolIndex]->m_BlockVector.PrintDetailedMap(json); - } - json.EndObject(); - } - } -} - -#endif // #if VMA_STATS_STRING_ENABLED - -//////////////////////////////////////////////////////////////////////////////// -// Public interface - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateAllocator( - const VmaAllocatorCreateInfo* pCreateInfo, - VmaAllocator* pAllocator) -{ - VMA_ASSERT(pCreateInfo && pAllocator); - VMA_ASSERT(pCreateInfo->vulkanApiVersion == 0 || - (VK_VERSION_MAJOR(pCreateInfo->vulkanApiVersion) == 1 && 
VK_VERSION_MINOR(pCreateInfo->vulkanApiVersion) <= 2)); - VMA_DEBUG_LOG("vmaCreateAllocator"); - *pAllocator = vma_new(pCreateInfo->pAllocationCallbacks, VmaAllocator_T)(pCreateInfo); - return (*pAllocator)->Init(pCreateInfo); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyAllocator( - VmaAllocator allocator) -{ - if(allocator != VK_NULL_HANDLE) - { - VMA_DEBUG_LOG("vmaDestroyAllocator"); - VkAllocationCallbacks allocationCallbacks = allocator->m_AllocationCallbacks; - vma_delete(&allocationCallbacks, allocator); - } -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocatorInfo(VmaAllocator allocator, VmaAllocatorInfo* pAllocatorInfo) -{ - VMA_ASSERT(allocator && pAllocatorInfo); - pAllocatorInfo->instance = allocator->m_hInstance; - pAllocatorInfo->physicalDevice = allocator->GetPhysicalDevice(); - pAllocatorInfo->device = allocator->m_hDevice; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetPhysicalDeviceProperties( - VmaAllocator allocator, - const VkPhysicalDeviceProperties **ppPhysicalDeviceProperties) -{ - VMA_ASSERT(allocator && ppPhysicalDeviceProperties); - *ppPhysicalDeviceProperties = &allocator->m_PhysicalDeviceProperties; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryProperties( - VmaAllocator allocator, - const VkPhysicalDeviceMemoryProperties** ppPhysicalDeviceMemoryProperties) -{ - VMA_ASSERT(allocator && ppPhysicalDeviceMemoryProperties); - *ppPhysicalDeviceMemoryProperties = &allocator->m_MemProps; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryTypeProperties( - VmaAllocator allocator, - uint32_t memoryTypeIndex, - VkMemoryPropertyFlags* pFlags) -{ - VMA_ASSERT(allocator && pFlags); - VMA_ASSERT(memoryTypeIndex < allocator->GetMemoryTypeCount()); - *pFlags = allocator->m_MemProps.memoryTypes[memoryTypeIndex].propertyFlags; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaSetCurrentFrameIndex( - VmaAllocator allocator, - uint32_t frameIndex) -{ - VMA_ASSERT(allocator); - VMA_ASSERT(frameIndex != VMA_FRAME_INDEX_LOST); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - 
allocator->SetCurrentFrameIndex(frameIndex); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaCalculateStats( - VmaAllocator allocator, - VmaStats* pStats) -{ - VMA_ASSERT(allocator && pStats); - VMA_DEBUG_GLOBAL_MUTEX_LOCK - allocator->CalculateStats(pStats); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetBudget( - VmaAllocator allocator, - VmaBudget* pBudget) -{ - VMA_ASSERT(allocator && pBudget); - VMA_DEBUG_GLOBAL_MUTEX_LOCK - allocator->GetBudget(pBudget, 0, allocator->GetMemoryHeapCount()); -} - -#if VMA_STATS_STRING_ENABLED - -VMA_CALL_PRE void VMA_CALL_POST vmaBuildStatsString( - VmaAllocator allocator, - char** ppStatsString, - VkBool32 detailedMap) -{ - VMA_ASSERT(allocator && ppStatsString); - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VmaStringBuilder sb(allocator); - { - VmaJsonWriter json(allocator->GetAllocationCallbacks(), sb); - json.BeginObject(); - - VmaBudget budget[VK_MAX_MEMORY_HEAPS]; - allocator->GetBudget(budget, 0, allocator->GetMemoryHeapCount()); - - VmaStats stats; - allocator->CalculateStats(&stats); - - json.WriteString("Total"); - VmaPrintStatInfo(json, stats.total); - - for(uint32_t heapIndex = 0; heapIndex < allocator->GetMemoryHeapCount(); ++heapIndex) - { - json.BeginString("Heap "); - json.ContinueString(heapIndex); - json.EndString(); - json.BeginObject(); - - json.WriteString("Size"); - json.WriteNumber(allocator->m_MemProps.memoryHeaps[heapIndex].size); - - json.WriteString("Flags"); - json.BeginArray(true); - if((allocator->m_MemProps.memoryHeaps[heapIndex].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) - { - json.WriteString("DEVICE_LOCAL"); - } - json.EndArray(); - - json.WriteString("Budget"); - json.BeginObject(); - { - json.WriteString("BlockBytes"); - json.WriteNumber(budget[heapIndex].blockBytes); - json.WriteString("AllocationBytes"); - json.WriteNumber(budget[heapIndex].allocationBytes); - json.WriteString("Usage"); - json.WriteNumber(budget[heapIndex].usage); - json.WriteString("Budget"); - json.WriteNumber(budget[heapIndex].budget); - 
} - json.EndObject(); - - if(stats.memoryHeap[heapIndex].blockCount > 0) - { - json.WriteString("Stats"); - VmaPrintStatInfo(json, stats.memoryHeap[heapIndex]); - } - - for(uint32_t typeIndex = 0; typeIndex < allocator->GetMemoryTypeCount(); ++typeIndex) - { - if(allocator->MemoryTypeIndexToHeapIndex(typeIndex) == heapIndex) - { - json.BeginString("Type "); - json.ContinueString(typeIndex); - json.EndString(); - - json.BeginObject(); - - json.WriteString("Flags"); - json.BeginArray(true); - VkMemoryPropertyFlags flags = allocator->m_MemProps.memoryTypes[typeIndex].propertyFlags; - if((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) - { - json.WriteString("DEVICE_LOCAL"); - } - if((flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) - { - json.WriteString("HOST_VISIBLE"); - } - if((flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0) - { - json.WriteString("HOST_COHERENT"); - } - if((flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) != 0) - { - json.WriteString("HOST_CACHED"); - } - if((flags & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) != 0) - { - json.WriteString("LAZILY_ALLOCATED"); - } -#if VMA_VULKAN_VERSION >= 1001000 - if((flags & VK_MEMORY_PROPERTY_PROTECTED_BIT) != 0) - { - json.WriteString("PROTECTED"); - } -#endif // #if VMA_VULKAN_VERSION >= 1001000 -#if VK_AMD_device_coherent_memory - if((flags & VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY) != 0) - { - json.WriteString("DEVICE_COHERENT"); - } - if((flags & VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD_COPY) != 0) - { - json.WriteString("DEVICE_UNCACHED"); - } -#endif // #if VK_AMD_device_coherent_memory - json.EndArray(); - - if(stats.memoryType[typeIndex].blockCount > 0) - { - json.WriteString("Stats"); - VmaPrintStatInfo(json, stats.memoryType[typeIndex]); - } - - json.EndObject(); - } - } - - json.EndObject(); - } - if(detailedMap == VK_TRUE) - { - allocator->PrintDetailedMap(json); - } - - json.EndObject(); - } - - const size_t len = sb.GetLength(); - char* const pChars = vma_new_array(allocator, char, 
len + 1); - if(len > 0) - { - memcpy(pChars, sb.GetData(), len); - } - pChars[len] = '\0'; - *ppStatsString = pChars; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaFreeStatsString( - VmaAllocator allocator, - char* pStatsString) -{ - if(pStatsString != VMA_NULL) - { - VMA_ASSERT(allocator); - size_t len = strlen(pStatsString); - vma_delete_array(allocator, pStatsString, len + 1); - } -} - -#endif // #if VMA_STATS_STRING_ENABLED - -/* -This function is not protected by any mutex because it just reads immutable data. -*/ -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndex( - VmaAllocator allocator, - uint32_t memoryTypeBits, - const VmaAllocationCreateInfo* pAllocationCreateInfo, - uint32_t* pMemoryTypeIndex) -{ - VMA_ASSERT(allocator != VK_NULL_HANDLE); - VMA_ASSERT(pAllocationCreateInfo != VMA_NULL); - VMA_ASSERT(pMemoryTypeIndex != VMA_NULL); - - memoryTypeBits &= allocator->GetGlobalMemoryTypeBits(); - - if(pAllocationCreateInfo->memoryTypeBits != 0) - { - memoryTypeBits &= pAllocationCreateInfo->memoryTypeBits; - } - - uint32_t requiredFlags = pAllocationCreateInfo->requiredFlags; - uint32_t preferredFlags = pAllocationCreateInfo->preferredFlags; - uint32_t notPreferredFlags = 0; - - // Convert usage to requiredFlags and preferredFlags. 
- switch(pAllocationCreateInfo->usage) - { - case VMA_MEMORY_USAGE_UNKNOWN: - break; - case VMA_MEMORY_USAGE_GPU_ONLY: - if(!allocator->IsIntegratedGpu() || (preferredFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) - { - preferredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - } - break; - case VMA_MEMORY_USAGE_CPU_ONLY: - requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; - break; - case VMA_MEMORY_USAGE_CPU_TO_GPU: - requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - if(!allocator->IsIntegratedGpu() || (preferredFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) - { - preferredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - } - break; - case VMA_MEMORY_USAGE_GPU_TO_CPU: - requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; - preferredFlags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; - break; - case VMA_MEMORY_USAGE_CPU_COPY: - notPreferredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; - break; - case VMA_MEMORY_USAGE_GPU_LAZILY_ALLOCATED: - requiredFlags |= VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT; - break; - default: - VMA_ASSERT(0); - break; - } - - // Avoid DEVICE_COHERENT unless explicitly requested. - if(((pAllocationCreateInfo->requiredFlags | pAllocationCreateInfo->preferredFlags) & - (VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY | VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD_COPY)) == 0) - { - notPreferredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY; - } - - *pMemoryTypeIndex = UINT32_MAX; - uint32_t minCost = UINT32_MAX; - for(uint32_t memTypeIndex = 0, memTypeBit = 1; - memTypeIndex < allocator->GetMemoryTypeCount(); - ++memTypeIndex, memTypeBit <<= 1) - { - // This memory type is acceptable according to memoryTypeBits bitmask. - if((memTypeBit & memoryTypeBits) != 0) - { - const VkMemoryPropertyFlags currFlags = - allocator->m_MemProps.memoryTypes[memTypeIndex].propertyFlags; - // This memory type contains requiredFlags. 
- if((requiredFlags & ~currFlags) == 0) - { - // Calculate cost as number of bits from preferredFlags not present in this memory type. - uint32_t currCost = VmaCountBitsSet(preferredFlags & ~currFlags) + - VmaCountBitsSet(currFlags & notPreferredFlags); - // Remember memory type with lowest cost. - if(currCost < minCost) - { - *pMemoryTypeIndex = memTypeIndex; - if(currCost == 0) - { - return VK_SUCCESS; - } - minCost = currCost; - } - } - } - } - return (*pMemoryTypeIndex != UINT32_MAX) ? VK_SUCCESS : VK_ERROR_FEATURE_NOT_PRESENT; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForBufferInfo( - VmaAllocator allocator, - const VkBufferCreateInfo* pBufferCreateInfo, - const VmaAllocationCreateInfo* pAllocationCreateInfo, - uint32_t* pMemoryTypeIndex) -{ - VMA_ASSERT(allocator != VK_NULL_HANDLE); - VMA_ASSERT(pBufferCreateInfo != VMA_NULL); - VMA_ASSERT(pAllocationCreateInfo != VMA_NULL); - VMA_ASSERT(pMemoryTypeIndex != VMA_NULL); - - const VkDevice hDev = allocator->m_hDevice; - VkBuffer hBuffer = VK_NULL_HANDLE; - VkResult res = allocator->GetVulkanFunctions().vkCreateBuffer( - hDev, pBufferCreateInfo, allocator->GetAllocationCallbacks(), &hBuffer); - if(res == VK_SUCCESS) - { - VkMemoryRequirements memReq = {}; - allocator->GetVulkanFunctions().vkGetBufferMemoryRequirements( - hDev, hBuffer, &memReq); - - res = vmaFindMemoryTypeIndex( - allocator, - memReq.memoryTypeBits, - pAllocationCreateInfo, - pMemoryTypeIndex); - - allocator->GetVulkanFunctions().vkDestroyBuffer( - hDev, hBuffer, allocator->GetAllocationCallbacks()); - } - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForImageInfo( - VmaAllocator allocator, - const VkImageCreateInfo* pImageCreateInfo, - const VmaAllocationCreateInfo* pAllocationCreateInfo, - uint32_t* pMemoryTypeIndex) -{ - VMA_ASSERT(allocator != VK_NULL_HANDLE); - VMA_ASSERT(pImageCreateInfo != VMA_NULL); - VMA_ASSERT(pAllocationCreateInfo != VMA_NULL); - VMA_ASSERT(pMemoryTypeIndex != 
VMA_NULL); - - const VkDevice hDev = allocator->m_hDevice; - VkImage hImage = VK_NULL_HANDLE; - VkResult res = allocator->GetVulkanFunctions().vkCreateImage( - hDev, pImageCreateInfo, allocator->GetAllocationCallbacks(), &hImage); - if(res == VK_SUCCESS) - { - VkMemoryRequirements memReq = {}; - allocator->GetVulkanFunctions().vkGetImageMemoryRequirements( - hDev, hImage, &memReq); - - res = vmaFindMemoryTypeIndex( - allocator, - memReq.memoryTypeBits, - pAllocationCreateInfo, - pMemoryTypeIndex); - - allocator->GetVulkanFunctions().vkDestroyImage( - hDev, hImage, allocator->GetAllocationCallbacks()); - } - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreatePool( - VmaAllocator allocator, - const VmaPoolCreateInfo* pCreateInfo, - VmaPool* pPool) -{ - VMA_ASSERT(allocator && pCreateInfo && pPool); - - VMA_DEBUG_LOG("vmaCreatePool"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkResult res = allocator->CreatePool(pCreateInfo, pPool); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordCreatePool(allocator->GetCurrentFrameIndex(), *pCreateInfo, *pPool); - } -#endif - - return res; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyPool( - VmaAllocator allocator, - VmaPool pool) -{ - VMA_ASSERT(allocator); - - if(pool == VK_NULL_HANDLE) - { - return; - } - - VMA_DEBUG_LOG("vmaDestroyPool"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordDestroyPool(allocator->GetCurrentFrameIndex(), pool); - } -#endif - - allocator->DestroyPool(pool); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolStats( - VmaAllocator allocator, - VmaPool pool, - VmaPoolStats* pPoolStats) -{ - VMA_ASSERT(allocator && pool && pPoolStats); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - allocator->GetPoolStats(pool, pPoolStats); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaMakePoolAllocationsLost( - VmaAllocator allocator, - VmaPool pool, - size_t* 
pLostAllocationCount) -{ - VMA_ASSERT(allocator && pool); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordMakePoolAllocationsLost(allocator->GetCurrentFrameIndex(), pool); - } -#endif - - allocator->MakePoolAllocationsLost(pool, pLostAllocationCount); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckPoolCorruption(VmaAllocator allocator, VmaPool pool) -{ - VMA_ASSERT(allocator && pool); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VMA_DEBUG_LOG("vmaCheckPoolCorruption"); - - return allocator->CheckPoolCorruption(pool); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolName( - VmaAllocator allocator, - VmaPool pool, - const char** ppName) -{ - VMA_ASSERT(allocator && pool && ppName); - - VMA_DEBUG_LOG("vmaGetPoolName"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - *ppName = pool->GetName(); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaSetPoolName( - VmaAllocator allocator, - VmaPool pool, - const char* pName) -{ - VMA_ASSERT(allocator && pool); - - VMA_DEBUG_LOG("vmaSetPoolName"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - pool->SetName(pName); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordSetPoolName(allocator->GetCurrentFrameIndex(), pool, pName); - } -#endif -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemory( - VmaAllocator allocator, - const VkMemoryRequirements* pVkMemoryRequirements, - const VmaAllocationCreateInfo* pCreateInfo, - VmaAllocation* pAllocation, - VmaAllocationInfo* pAllocationInfo) -{ - VMA_ASSERT(allocator && pVkMemoryRequirements && pCreateInfo && pAllocation); - - VMA_DEBUG_LOG("vmaAllocateMemory"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkResult result = allocator->AllocateMemory( - *pVkMemoryRequirements, - false, // requiresDedicatedAllocation - false, // prefersDedicatedAllocation - VK_NULL_HANDLE, // dedicatedBuffer - UINT32_MAX, // dedicatedBufferUsage - VK_NULL_HANDLE, // dedicatedImage - *pCreateInfo, - 
VMA_SUBALLOCATION_TYPE_UNKNOWN, - 1, // allocationCount - pAllocation); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordAllocateMemory( - allocator->GetCurrentFrameIndex(), - *pVkMemoryRequirements, - *pCreateInfo, - *pAllocation); - } -#endif - - if(pAllocationInfo != VMA_NULL && result == VK_SUCCESS) - { - allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); - } - - return result; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryPages( - VmaAllocator allocator, - const VkMemoryRequirements* pVkMemoryRequirements, - const VmaAllocationCreateInfo* pCreateInfo, - size_t allocationCount, - VmaAllocation* pAllocations, - VmaAllocationInfo* pAllocationInfo) -{ - if(allocationCount == 0) - { - return VK_SUCCESS; - } - - VMA_ASSERT(allocator && pVkMemoryRequirements && pCreateInfo && pAllocations); - - VMA_DEBUG_LOG("vmaAllocateMemoryPages"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkResult result = allocator->AllocateMemory( - *pVkMemoryRequirements, - false, // requiresDedicatedAllocation - false, // prefersDedicatedAllocation - VK_NULL_HANDLE, // dedicatedBuffer - UINT32_MAX, // dedicatedBufferUsage - VK_NULL_HANDLE, // dedicatedImage - *pCreateInfo, - VMA_SUBALLOCATION_TYPE_UNKNOWN, - allocationCount, - pAllocations); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordAllocateMemoryPages( - allocator->GetCurrentFrameIndex(), - *pVkMemoryRequirements, - *pCreateInfo, - (uint64_t)allocationCount, - pAllocations); - } -#endif - - if(pAllocationInfo != VMA_NULL && result == VK_SUCCESS) - { - for(size_t i = 0; i < allocationCount; ++i) - { - allocator->GetAllocationInfo(pAllocations[i], pAllocationInfo + i); - } - } - - return result; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForBuffer( - VmaAllocator allocator, - VkBuffer buffer, - const VmaAllocationCreateInfo* pCreateInfo, - VmaAllocation* pAllocation, - 
VmaAllocationInfo* pAllocationInfo) -{ - VMA_ASSERT(allocator && buffer != VK_NULL_HANDLE && pCreateInfo && pAllocation); - - VMA_DEBUG_LOG("vmaAllocateMemoryForBuffer"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkMemoryRequirements vkMemReq = {}; - bool requiresDedicatedAllocation = false; - bool prefersDedicatedAllocation = false; - allocator->GetBufferMemoryRequirements(buffer, vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation); - - VkResult result = allocator->AllocateMemory( - vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation, - buffer, // dedicatedBuffer - UINT32_MAX, // dedicatedBufferUsage - VK_NULL_HANDLE, // dedicatedImage - *pCreateInfo, - VMA_SUBALLOCATION_TYPE_BUFFER, - 1, // allocationCount - pAllocation); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordAllocateMemoryForBuffer( - allocator->GetCurrentFrameIndex(), - vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation, - *pCreateInfo, - *pAllocation); - } -#endif - - if(pAllocationInfo && result == VK_SUCCESS) - { - allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); - } - - return result; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForImage( - VmaAllocator allocator, - VkImage image, - const VmaAllocationCreateInfo* pCreateInfo, - VmaAllocation* pAllocation, - VmaAllocationInfo* pAllocationInfo) -{ - VMA_ASSERT(allocator && image != VK_NULL_HANDLE && pCreateInfo && pAllocation); - - VMA_DEBUG_LOG("vmaAllocateMemoryForImage"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkMemoryRequirements vkMemReq = {}; - bool requiresDedicatedAllocation = false; - bool prefersDedicatedAllocation = false; - allocator->GetImageMemoryRequirements(image, vkMemReq, - requiresDedicatedAllocation, prefersDedicatedAllocation); - - VkResult result = allocator->AllocateMemory( - vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation, - VK_NULL_HANDLE, // dedicatedBuffer - 
UINT32_MAX, // dedicatedBufferUsage - image, // dedicatedImage - *pCreateInfo, - VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN, - 1, // allocationCount - pAllocation); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordAllocateMemoryForImage( - allocator->GetCurrentFrameIndex(), - vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation, - *pCreateInfo, - *pAllocation); - } -#endif - - if(pAllocationInfo && result == VK_SUCCESS) - { - allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); - } - - return result; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemory( - VmaAllocator allocator, - VmaAllocation allocation) -{ - VMA_ASSERT(allocator); - - if(allocation == VK_NULL_HANDLE) - { - return; - } - - VMA_DEBUG_LOG("vmaFreeMemory"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordFreeMemory( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - allocator->FreeMemory( - 1, // allocationCount - &allocation); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemoryPages( - VmaAllocator allocator, - size_t allocationCount, - const VmaAllocation* pAllocations) -{ - if(allocationCount == 0) - { - return; - } - - VMA_ASSERT(allocator); - - VMA_DEBUG_LOG("vmaFreeMemoryPages"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordFreeMemoryPages( - allocator->GetCurrentFrameIndex(), - (uint64_t)allocationCount, - pAllocations); - } -#endif - - allocator->FreeMemory(allocationCount, pAllocations); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocationInfo( - VmaAllocator allocator, - VmaAllocation allocation, - VmaAllocationInfo* pAllocationInfo) -{ - VMA_ASSERT(allocator && allocation && pAllocationInfo); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - 
allocator->GetRecorder()->RecordGetAllocationInfo( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - allocator->GetAllocationInfo(allocation, pAllocationInfo); -} - -VMA_CALL_PRE VkBool32 VMA_CALL_POST vmaTouchAllocation( - VmaAllocator allocator, - VmaAllocation allocation) -{ - VMA_ASSERT(allocator && allocation); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordTouchAllocation( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - return allocator->TouchAllocation(allocation); -} - -VMA_CALL_PRE void VMA_CALL_POST vmaSetAllocationUserData( - VmaAllocator allocator, - VmaAllocation allocation, - void* pUserData) -{ - VMA_ASSERT(allocator && allocation); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - allocation->SetUserData(allocator, pUserData); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordSetAllocationUserData( - allocator->GetCurrentFrameIndex(), - allocation, - pUserData); - } -#endif -} - -VMA_CALL_PRE void VMA_CALL_POST vmaCreateLostAllocation( - VmaAllocator allocator, - VmaAllocation* pAllocation) -{ - VMA_ASSERT(allocator && pAllocation); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK; - - allocator->CreateLostAllocation(pAllocation); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordCreateLostAllocation( - allocator->GetCurrentFrameIndex(), - *pAllocation); - } -#endif -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaMapMemory( - VmaAllocator allocator, - VmaAllocation allocation, - void** ppData) -{ - VMA_ASSERT(allocator && allocation && ppData); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkResult res = allocator->Map(allocation, ppData); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordMapMemory( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - return res; -} - 
-VMA_CALL_PRE void VMA_CALL_POST vmaUnmapMemory( - VmaAllocator allocator, - VmaAllocation allocation) -{ - VMA_ASSERT(allocator && allocation); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordUnmapMemory( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - allocator->Unmap(allocation); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocation(VmaAllocator allocator, VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) -{ - VMA_ASSERT(allocator && allocation); - - VMA_DEBUG_LOG("vmaFlushAllocation"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - const VkResult res = allocator->FlushOrInvalidateAllocation(allocation, offset, size, VMA_CACHE_FLUSH); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordFlushAllocation( - allocator->GetCurrentFrameIndex(), - allocation, offset, size); - } -#endif - - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocation(VmaAllocator allocator, VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) -{ - VMA_ASSERT(allocator && allocation); - - VMA_DEBUG_LOG("vmaInvalidateAllocation"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - const VkResult res = allocator->FlushOrInvalidateAllocation(allocation, offset, size, VMA_CACHE_INVALIDATE); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordInvalidateAllocation( - allocator->GetCurrentFrameIndex(), - allocation, offset, size); - } -#endif - - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocations( - VmaAllocator allocator, - uint32_t allocationCount, - const VmaAllocation* allocations, - const VkDeviceSize* offsets, - const VkDeviceSize* sizes) -{ - VMA_ASSERT(allocator); - - if(allocationCount == 0) - { - return VK_SUCCESS; - } - - VMA_ASSERT(allocations); - - VMA_DEBUG_LOG("vmaFlushAllocations"); - - 
VMA_DEBUG_GLOBAL_MUTEX_LOCK - - const VkResult res = allocator->FlushOrInvalidateAllocations(allocationCount, allocations, offsets, sizes, VMA_CACHE_FLUSH); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - //TODO - } -#endif - - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocations( - VmaAllocator allocator, - uint32_t allocationCount, - const VmaAllocation* allocations, - const VkDeviceSize* offsets, - const VkDeviceSize* sizes) -{ - VMA_ASSERT(allocator); - - if(allocationCount == 0) - { - return VK_SUCCESS; - } - - VMA_ASSERT(allocations); - - VMA_DEBUG_LOG("vmaInvalidateAllocations"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - const VkResult res = allocator->FlushOrInvalidateAllocations(allocationCount, allocations, offsets, sizes, VMA_CACHE_INVALIDATE); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - //TODO - } -#endif - - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckCorruption(VmaAllocator allocator, uint32_t memoryTypeBits) -{ - VMA_ASSERT(allocator); - - VMA_DEBUG_LOG("vmaCheckCorruption"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - return allocator->CheckCorruption(memoryTypeBits); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragment( - VmaAllocator allocator, - const VmaAllocation* pAllocations, - size_t allocationCount, - VkBool32* pAllocationsChanged, - const VmaDefragmentationInfo *pDefragmentationInfo, - VmaDefragmentationStats* pDefragmentationStats) -{ - // Deprecated interface, reimplemented using new one. 
- - VmaDefragmentationInfo2 info2 = {}; - info2.allocationCount = (uint32_t)allocationCount; - info2.pAllocations = pAllocations; - info2.pAllocationsChanged = pAllocationsChanged; - if(pDefragmentationInfo != VMA_NULL) - { - info2.maxCpuAllocationsToMove = pDefragmentationInfo->maxAllocationsToMove; - info2.maxCpuBytesToMove = pDefragmentationInfo->maxBytesToMove; - } - else - { - info2.maxCpuAllocationsToMove = UINT32_MAX; - info2.maxCpuBytesToMove = VK_WHOLE_SIZE; - } - // info2.flags, maxGpuAllocationsToMove, maxGpuBytesToMove, commandBuffer deliberately left zero. - - VmaDefragmentationContext ctx; - VkResult res = vmaDefragmentationBegin(allocator, &info2, pDefragmentationStats, &ctx); - if(res == VK_NOT_READY) - { - res = vmaDefragmentationEnd( allocator, ctx); - } - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationBegin( - VmaAllocator allocator, - const VmaDefragmentationInfo2* pInfo, - VmaDefragmentationStats* pStats, - VmaDefragmentationContext *pContext) -{ - VMA_ASSERT(allocator && pInfo && pContext); - - // Degenerate case: Nothing to defragment. 
- if(pInfo->allocationCount == 0 && pInfo->poolCount == 0) - { - return VK_SUCCESS; - } - - VMA_ASSERT(pInfo->allocationCount == 0 || pInfo->pAllocations != VMA_NULL); - VMA_ASSERT(pInfo->poolCount == 0 || pInfo->pPools != VMA_NULL); - VMA_HEAVY_ASSERT(VmaValidatePointerArray(pInfo->allocationCount, pInfo->pAllocations)); - VMA_HEAVY_ASSERT(VmaValidatePointerArray(pInfo->poolCount, pInfo->pPools)); - - VMA_DEBUG_LOG("vmaDefragmentationBegin"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - VkResult res = allocator->DefragmentationBegin(*pInfo, pStats, pContext); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordDefragmentationBegin( - allocator->GetCurrentFrameIndex(), *pInfo, *pContext); - } -#endif - - return res; -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationEnd( - VmaAllocator allocator, - VmaDefragmentationContext context) -{ - VMA_ASSERT(allocator); - - VMA_DEBUG_LOG("vmaDefragmentationEnd"); - - if(context != VK_NULL_HANDLE) - { - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordDefragmentationEnd( - allocator->GetCurrentFrameIndex(), context); - } -#endif - - return allocator->DefragmentationEnd(context); - } - else - { - return VK_SUCCESS; - } -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBeginDefragmentationPass( - VmaAllocator allocator, - VmaDefragmentationContext context, - VmaDefragmentationPassInfo* pInfo - ) -{ - VMA_ASSERT(allocator); - VMA_ASSERT(pInfo); - - VMA_DEBUG_LOG("vmaBeginDefragmentationPass"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - if(context == VK_NULL_HANDLE) - { - pInfo->moveCount = 0; - return VK_SUCCESS; - } - - return allocator->DefragmentationPassBegin(pInfo, context); -} -VMA_CALL_PRE VkResult VMA_CALL_POST vmaEndDefragmentationPass( - VmaAllocator allocator, - VmaDefragmentationContext context) -{ - VMA_ASSERT(allocator); - - VMA_DEBUG_LOG("vmaEndDefragmentationPass"); - 
VMA_DEBUG_GLOBAL_MUTEX_LOCK - - if(context == VK_NULL_HANDLE) - return VK_SUCCESS; - - return allocator->DefragmentationPassEnd(context); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory( - VmaAllocator allocator, - VmaAllocation allocation, - VkBuffer buffer) -{ - VMA_ASSERT(allocator && allocation && buffer); - - VMA_DEBUG_LOG("vmaBindBufferMemory"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - return allocator->BindBufferMemory(allocation, 0, buffer, VMA_NULL); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory2( - VmaAllocator allocator, - VmaAllocation allocation, - VkDeviceSize allocationLocalOffset, - VkBuffer buffer, - const void* pNext) -{ - VMA_ASSERT(allocator && allocation && buffer); - - VMA_DEBUG_LOG("vmaBindBufferMemory2"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - return allocator->BindBufferMemory(allocation, allocationLocalOffset, buffer, pNext); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory( - VmaAllocator allocator, - VmaAllocation allocation, - VkImage image) -{ - VMA_ASSERT(allocator && allocation && image); - - VMA_DEBUG_LOG("vmaBindImageMemory"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - return allocator->BindImageMemory(allocation, 0, image, VMA_NULL); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory2( - VmaAllocator allocator, - VmaAllocation allocation, - VkDeviceSize allocationLocalOffset, - VkImage image, - const void* pNext) -{ - VMA_ASSERT(allocator && allocation && image); - - VMA_DEBUG_LOG("vmaBindImageMemory2"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - return allocator->BindImageMemory(allocation, allocationLocalOffset, image, pNext); -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateBuffer( - VmaAllocator allocator, - const VkBufferCreateInfo* pBufferCreateInfo, - const VmaAllocationCreateInfo* pAllocationCreateInfo, - VkBuffer* pBuffer, - VmaAllocation* pAllocation, - VmaAllocationInfo* pAllocationInfo) -{ - VMA_ASSERT(allocator && pBufferCreateInfo && pAllocationCreateInfo && pBuffer && pAllocation); - 
- if(pBufferCreateInfo->size == 0) - { - return VK_ERROR_VALIDATION_FAILED_EXT; - } - if((pBufferCreateInfo->usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_COPY) != 0 && - !allocator->m_UseKhrBufferDeviceAddress) - { - VMA_ASSERT(0 && "Creating a buffer with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT is not valid if VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT was not used."); - return VK_ERROR_VALIDATION_FAILED_EXT; - } - - VMA_DEBUG_LOG("vmaCreateBuffer"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - *pBuffer = VK_NULL_HANDLE; - *pAllocation = VK_NULL_HANDLE; - - // 1. Create VkBuffer. - VkResult res = (*allocator->GetVulkanFunctions().vkCreateBuffer)( - allocator->m_hDevice, - pBufferCreateInfo, - allocator->GetAllocationCallbacks(), - pBuffer); - if(res >= 0) - { - // 2. vkGetBufferMemoryRequirements. - VkMemoryRequirements vkMemReq = {}; - bool requiresDedicatedAllocation = false; - bool prefersDedicatedAllocation = false; - allocator->GetBufferMemoryRequirements(*pBuffer, vkMemReq, - requiresDedicatedAllocation, prefersDedicatedAllocation); - - // 3. Allocate memory using allocator. - res = allocator->AllocateMemory( - vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation, - *pBuffer, // dedicatedBuffer - pBufferCreateInfo->usage, // dedicatedBufferUsage - VK_NULL_HANDLE, // dedicatedImage - *pAllocationCreateInfo, - VMA_SUBALLOCATION_TYPE_BUFFER, - 1, // allocationCount - pAllocation); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordCreateBuffer( - allocator->GetCurrentFrameIndex(), - *pBufferCreateInfo, - *pAllocationCreateInfo, - *pAllocation); - } -#endif - - if(res >= 0) - { - // 3. Bind buffer with memory. - if((pAllocationCreateInfo->flags & VMA_ALLOCATION_CREATE_DONT_BIND_BIT) == 0) - { - res = allocator->BindBufferMemory(*pAllocation, 0, *pBuffer, VMA_NULL); - } - if(res >= 0) - { - // All steps succeeded. 
- #if VMA_STATS_STRING_ENABLED - (*pAllocation)->InitBufferImageUsage(pBufferCreateInfo->usage); - #endif - if(pAllocationInfo != VMA_NULL) - { - allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); - } - - return VK_SUCCESS; - } - allocator->FreeMemory( - 1, // allocationCount - pAllocation); - *pAllocation = VK_NULL_HANDLE; - (*allocator->GetVulkanFunctions().vkDestroyBuffer)(allocator->m_hDevice, *pBuffer, allocator->GetAllocationCallbacks()); - *pBuffer = VK_NULL_HANDLE; - return res; - } - (*allocator->GetVulkanFunctions().vkDestroyBuffer)(allocator->m_hDevice, *pBuffer, allocator->GetAllocationCallbacks()); - *pBuffer = VK_NULL_HANDLE; - return res; - } - return res; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyBuffer( - VmaAllocator allocator, - VkBuffer buffer, - VmaAllocation allocation) -{ - VMA_ASSERT(allocator); - - if(buffer == VK_NULL_HANDLE && allocation == VK_NULL_HANDLE) - { - return; - } - - VMA_DEBUG_LOG("vmaDestroyBuffer"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordDestroyBuffer( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - if(buffer != VK_NULL_HANDLE) - { - (*allocator->GetVulkanFunctions().vkDestroyBuffer)(allocator->m_hDevice, buffer, allocator->GetAllocationCallbacks()); - } - - if(allocation != VK_NULL_HANDLE) - { - allocator->FreeMemory( - 1, // allocationCount - &allocation); - } -} - -VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateImage( - VmaAllocator allocator, - const VkImageCreateInfo* pImageCreateInfo, - const VmaAllocationCreateInfo* pAllocationCreateInfo, - VkImage* pImage, - VmaAllocation* pAllocation, - VmaAllocationInfo* pAllocationInfo) -{ - VMA_ASSERT(allocator && pImageCreateInfo && pAllocationCreateInfo && pImage && pAllocation); - - if(pImageCreateInfo->extent.width == 0 || - pImageCreateInfo->extent.height == 0 || - pImageCreateInfo->extent.depth == 0 || - pImageCreateInfo->mipLevels == 0 || 
- pImageCreateInfo->arrayLayers == 0) - { - return VK_ERROR_VALIDATION_FAILED_EXT; - } - - VMA_DEBUG_LOG("vmaCreateImage"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - - *pImage = VK_NULL_HANDLE; - *pAllocation = VK_NULL_HANDLE; - - // 1. Create VkImage. - VkResult res = (*allocator->GetVulkanFunctions().vkCreateImage)( - allocator->m_hDevice, - pImageCreateInfo, - allocator->GetAllocationCallbacks(), - pImage); - if(res >= 0) - { - VmaSuballocationType suballocType = pImageCreateInfo->tiling == VK_IMAGE_TILING_OPTIMAL ? - VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL : - VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR; - - // 2. Allocate memory using allocator. - VkMemoryRequirements vkMemReq = {}; - bool requiresDedicatedAllocation = false; - bool prefersDedicatedAllocation = false; - allocator->GetImageMemoryRequirements(*pImage, vkMemReq, - requiresDedicatedAllocation, prefersDedicatedAllocation); - - res = allocator->AllocateMemory( - vkMemReq, - requiresDedicatedAllocation, - prefersDedicatedAllocation, - VK_NULL_HANDLE, // dedicatedBuffer - UINT32_MAX, // dedicatedBufferUsage - *pImage, // dedicatedImage - *pAllocationCreateInfo, - suballocType, - 1, // allocationCount - pAllocation); - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordCreateImage( - allocator->GetCurrentFrameIndex(), - *pImageCreateInfo, - *pAllocationCreateInfo, - *pAllocation); - } -#endif - - if(res >= 0) - { - // 3. Bind image with memory. - if((pAllocationCreateInfo->flags & VMA_ALLOCATION_CREATE_DONT_BIND_BIT) == 0) - { - res = allocator->BindImageMemory(*pAllocation, 0, *pImage, VMA_NULL); - } - if(res >= 0) - { - // All steps succeeded. 
- #if VMA_STATS_STRING_ENABLED - (*pAllocation)->InitBufferImageUsage(pImageCreateInfo->usage); - #endif - if(pAllocationInfo != VMA_NULL) - { - allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); - } - - return VK_SUCCESS; - } - allocator->FreeMemory( - 1, // allocationCount - pAllocation); - *pAllocation = VK_NULL_HANDLE; - (*allocator->GetVulkanFunctions().vkDestroyImage)(allocator->m_hDevice, *pImage, allocator->GetAllocationCallbacks()); - *pImage = VK_NULL_HANDLE; - return res; - } - (*allocator->GetVulkanFunctions().vkDestroyImage)(allocator->m_hDevice, *pImage, allocator->GetAllocationCallbacks()); - *pImage = VK_NULL_HANDLE; - return res; - } - return res; -} - -VMA_CALL_PRE void VMA_CALL_POST vmaDestroyImage( - VmaAllocator allocator, - VkImage image, - VmaAllocation allocation) -{ - VMA_ASSERT(allocator); - - if(image == VK_NULL_HANDLE && allocation == VK_NULL_HANDLE) - { - return; - } - - VMA_DEBUG_LOG("vmaDestroyImage"); - - VMA_DEBUG_GLOBAL_MUTEX_LOCK - -#if VMA_RECORDING_ENABLED - if(allocator->GetRecorder() != VMA_NULL) - { - allocator->GetRecorder()->RecordDestroyImage( - allocator->GetCurrentFrameIndex(), - allocation); - } -#endif - - if(image != VK_NULL_HANDLE) - { - (*allocator->GetVulkanFunctions().vkDestroyImage)(allocator->m_hDevice, image, allocator->GetAllocationCallbacks()); - } - if(allocation != VK_NULL_HANDLE) - { - allocator->FreeMemory( - 1, // allocationCount - &allocation); - } -} - -#endif // #ifdef VMA_IMPLEMENTATION +// +// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. +// + +#ifndef AMD_VULKAN_MEMORY_ALLOCATOR_H +#define AMD_VULKAN_MEMORY_ALLOCATOR_H + +/** \mainpage Vulkan Memory Allocator + +Version 3.0.0-development (2021-02-16) + +Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. 
\n +License: MIT + +Documentation of all members: vk_mem_alloc.h + +\section main_table_of_contents Table of contents + +- User guide + - \subpage quick_start + - [Project setup](@ref quick_start_project_setup) + - [Initialization](@ref quick_start_initialization) + - [Resource allocation](@ref quick_start_resource_allocation) + - \subpage choosing_memory_type + - [Usage](@ref choosing_memory_type_usage) + - [Required and preferred flags](@ref choosing_memory_type_required_preferred_flags) + - [Explicit memory types](@ref choosing_memory_type_explicit_memory_types) + - [Custom memory pools](@ref choosing_memory_type_custom_memory_pools) + - [Dedicated allocations](@ref choosing_memory_type_dedicated_allocations) + - \subpage memory_mapping + - [Mapping functions](@ref memory_mapping_mapping_functions) + - [Persistently mapped memory](@ref memory_mapping_persistently_mapped_memory) + - [Cache flush and invalidate](@ref memory_mapping_cache_control) + - [Finding out if memory is mappable](@ref memory_mapping_finding_if_memory_mappable) + - \subpage staying_within_budget + - [Querying for budget](@ref staying_within_budget_querying_for_budget) + - [Controlling memory usage](@ref staying_within_budget_controlling_memory_usage) + - \subpage resource_aliasing + - \subpage custom_memory_pools + - [Choosing memory type index](@ref custom_memory_pools_MemTypeIndex) + - [Linear allocation algorithm](@ref linear_algorithm) + - [Free-at-once](@ref linear_algorithm_free_at_once) + - [Stack](@ref linear_algorithm_stack) + - [Double stack](@ref linear_algorithm_double_stack) + - [Ring buffer](@ref linear_algorithm_ring_buffer) + - [Buddy allocation algorithm](@ref buddy_algorithm) + - \subpage defragmentation + - [Defragmenting CPU memory](@ref defragmentation_cpu) + - [Defragmenting GPU memory](@ref defragmentation_gpu) + - [Additional notes](@ref defragmentation_additional_notes) + - [Writing custom allocation algorithm](@ref defragmentation_custom_algorithm) + - \subpage 
lost_allocations + - \subpage statistics + - [Numeric statistics](@ref statistics_numeric_statistics) + - [JSON dump](@ref statistics_json_dump) + - \subpage allocation_annotation + - [Allocation user data](@ref allocation_user_data) + - [Allocation names](@ref allocation_names) + - \subpage debugging_memory_usage + - [Memory initialization](@ref debugging_memory_usage_initialization) + - [Margins](@ref debugging_memory_usage_margins) + - [Corruption detection](@ref debugging_memory_usage_corruption_detection) + - \subpage record_and_replay +- \subpage usage_patterns + - [Common mistakes](@ref usage_patterns_common_mistakes) + - [Simple patterns](@ref usage_patterns_simple) + - [Advanced patterns](@ref usage_patterns_advanced) +- \subpage configuration + - [Pointers to Vulkan functions](@ref config_Vulkan_functions) + - [Custom host memory allocator](@ref custom_memory_allocator) + - [Device memory allocation callbacks](@ref allocation_callbacks) + - [Device heap memory limit](@ref heap_memory_limit) + - \subpage vk_khr_dedicated_allocation + - \subpage enabling_buffer_device_address + - \subpage vk_amd_device_coherent_memory +- \subpage general_considerations + - [Thread safety](@ref general_considerations_thread_safety) + - [Validation layer warnings](@ref general_considerations_validation_layer_warnings) + - [Allocation algorithm](@ref general_considerations_allocation_algorithm) + - [Features not supported](@ref general_considerations_features_not_supported) + +\section main_see_also See also + +- [Product page on GPUOpen](https://gpuopen.com/gaming-product/vulkan-memory-allocator/) +- [Source repository on GitHub](https://github.com/GPUOpen-LibrariesAndSDKs/VulkanMemoryAllocator) + + + + +\page quick_start Quick start + +\section quick_start_project_setup Project setup + +Vulkan Memory Allocator comes in form of a "stb-style" single header file. +You don't need to build it as a separate library project. 
+You can add this file directly to your project and submit it to code repository next to your other source files. + +"Single header" doesn't mean that everything is contained in C/C++ declarations, +like it tends to be in case of inline functions or C++ templates. +It means that implementation is bundled with interface in a single file and needs to be extracted using preprocessor macro. +If you don't do it properly, you will get linker errors. + +To do it properly: + +-# Include "vk_mem_alloc.h" file in each CPP file where you want to use the library. + This includes declarations of all members of the library. +-# In exactly one CPP file define following macro before this include. + It enables also internal definitions. + +\code +#define VMA_IMPLEMENTATION +#include "vk_mem_alloc.h" +\endcode + +It may be a good idea to create dedicated CPP file just for this purpose. + +Note on language: This library is written in C++, but has C-compatible interface. +Thus you can include and use vk_mem_alloc.h in C or C++ code, but full +implementation with `VMA_IMPLEMENTATION` macro must be compiled as C++, NOT as C. + +Please note that this library includes header `<vulkan/vulkan.h>`, which in turn +includes `<windows.h>` on Windows. If you need some specific macros defined +before including these headers (like `WIN32_LEAN_AND_MEAN` or +`WINVER` for Windows, `VK_USE_PLATFORM_WIN32_KHR` for Vulkan), you must define +them before every `#include` of this library. + +You may need to configure the way you import Vulkan functions. + +- By default, VMA assumes you link statically with Vulkan API. If this is not the case, + `#define VMA_STATIC_VULKAN_FUNCTIONS 0` before `#include` of the VMA implementation and use another way. +- You can `#define VMA_DYNAMIC_VULKAN_FUNCTIONS 1` and make sure `vkGetInstanceProcAddr` and `vkGetDeviceProcAddr` globals are defined. + All the remaining Vulkan functions will be fetched automatically. 
+- Finally, you can provide your own pointers to all Vulkan functions needed by VMA using structure member + VmaAllocatorCreateInfo::pVulkanFunctions, if you fetched them in some custom way e.g. using some loader like [Volk](https://github.com/zeux/volk). + + +\section quick_start_initialization Initialization + +At program startup: + +-# Initialize Vulkan to have `VkPhysicalDevice`, `VkDevice` and `VkInstance` object. +-# Fill VmaAllocatorCreateInfo structure and create #VmaAllocator object by + calling vmaCreateAllocator(). + +\code +VmaAllocatorCreateInfo allocatorInfo = {}; +allocatorInfo.vulkanApiVersion = VK_API_VERSION_1_2; +allocatorInfo.physicalDevice = physicalDevice; +allocatorInfo.device = device; +allocatorInfo.instance = instance; + +VmaAllocator allocator; +vmaCreateAllocator(&allocatorInfo, &allocator); +\endcode + +Only members `physicalDevice`, `device`, `instance` are required. +However, you should inform the library which Vulkan version do you use by setting +VmaAllocatorCreateInfo::vulkanApiVersion and which extensions did you enable +by setting VmaAllocatorCreateInfo::flags (like #VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT for VK_KHR_buffer_device_address). +Otherwise, VMA would use only features of Vulkan 1.0 core with no extensions. + + +\section quick_start_resource_allocation Resource allocation + +When you want to create a buffer or image: + +-# Fill `VkBufferCreateInfo` / `VkImageCreateInfo` structure. +-# Fill VmaAllocationCreateInfo structure. +-# Call vmaCreateBuffer() / vmaCreateImage() to get `VkBuffer`/`VkImage` with memory + already allocated and bound to it. 
+ +\code +VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +bufferInfo.size = 65536; +bufferInfo.usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + +VmaAllocationCreateInfo allocInfo = {}; +allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; + +VkBuffer buffer; +VmaAllocation allocation; +vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); +\endcode + +Don't forget to destroy your objects when no longer needed: + +\code +vmaDestroyBuffer(allocator, buffer, allocation); +vmaDestroyAllocator(allocator); +\endcode + + +\page choosing_memory_type Choosing memory type + +Physical devices in Vulkan support various combinations of memory heaps and +types. Help with choosing correct and optimal memory type for your specific +resource is one of the key features of this library. You can use it by filling +appropriate members of VmaAllocationCreateInfo structure, as described below. +You can also combine multiple methods. + +-# If you just want to find memory type index that meets your requirements, you + can use function: vmaFindMemoryTypeIndex(), vmaFindMemoryTypeIndexForBufferInfo(), + vmaFindMemoryTypeIndexForImageInfo(). +-# If you want to allocate a region of device memory without association with any + specific image or buffer, you can use function vmaAllocateMemory(). Usage of + this function is not recommended and usually not needed. + vmaAllocateMemoryPages() function is also provided for creating multiple allocations at once, + which may be useful for sparse binding. +-# If you already have a buffer or an image created, you want to allocate memory + for it and then you will bind it yourself, you can use function + vmaAllocateMemoryForBuffer(), vmaAllocateMemoryForImage(). + For binding you should use functions: vmaBindBufferMemory(), vmaBindImageMemory() + or their extended versions: vmaBindBufferMemory2(), vmaBindImageMemory2(). 
+-# If you want to create a buffer or an image, allocate memory for it and bind + them together, all in one call, you can use function vmaCreateBuffer(), + vmaCreateImage(). This is the easiest and recommended way to use this library. + +When using 3. or 4., the library internally queries Vulkan for memory types +supported for that buffer or image (function `vkGetBufferMemoryRequirements()`) +and uses only one of these types. + +If no memory type can be found that meets all the requirements, these functions +return `VK_ERROR_FEATURE_NOT_PRESENT`. + +You can leave VmaAllocationCreateInfo structure completely filled with zeros. +It means no requirements are specified for memory type. +It is valid, although not very useful. + +\section choosing_memory_type_usage Usage + +The easiest way to specify memory requirements is to fill member +VmaAllocationCreateInfo::usage using one of the values of enum #VmaMemoryUsage. +It defines high level, common usage types. +For more details, see description of this enum. + +For example, if you want to create a uniform buffer that will be filled using +transfer only once or infrequently and used for rendering every frame, you can +do it using following code: + +\code +VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +bufferInfo.size = 65536; +bufferInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + +VmaAllocationCreateInfo allocInfo = {}; +allocInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; + +VkBuffer buffer; +VmaAllocation allocation; +vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); +\endcode + +\section choosing_memory_type_required_preferred_flags Required and preferred flags + +You can specify more detailed requirements by filling members +VmaAllocationCreateInfo::requiredFlags and VmaAllocationCreateInfo::preferredFlags +with a combination of bits from enum `VkMemoryPropertyFlags`. 
For example, +if you want to create a buffer that will be persistently mapped on host (so it +must be `HOST_VISIBLE`) and preferably will also be `HOST_COHERENT` and `HOST_CACHED`, +use following code: + +\code +VmaAllocationCreateInfo allocInfo = {}; +allocInfo.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; +allocInfo.preferredFlags = VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT; +allocInfo.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + +VkBuffer buffer; +VmaAllocation allocation; +vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); +\endcode + +A memory type is chosen that has all the required flags and as many preferred +flags set as possible. + +If you use VmaAllocationCreateInfo::usage, it is just internally converted to +a set of required and preferred flags. + +\section choosing_memory_type_explicit_memory_types Explicit memory types + +If you inspected memory types available on the physical device and you have +a preference for memory types that you want to use, you can fill member +VmaAllocationCreateInfo::memoryTypeBits. It is a bit mask, where each bit set +means that a memory type with that index is allowed to be used for the +allocation. Special value 0, just like `UINT32_MAX`, means there are no +restrictions to memory type index. + +Please note that this member is NOT just a memory type index. +Still you can use it to choose just one, specific memory type. 
+For example, if you already determined that your buffer should be created in +memory type 2, use following code: + +\code +uint32_t memoryTypeIndex = 2; + +VmaAllocationCreateInfo allocInfo = {}; +allocInfo.memoryTypeBits = 1u << memoryTypeIndex; + +VkBuffer buffer; +VmaAllocation allocation; +vmaCreateBuffer(allocator, &bufferInfo, &allocInfo, &buffer, &allocation, nullptr); +\endcode + + +\section choosing_memory_type_custom_memory_pools Custom memory pools + +If you allocate from custom memory pool, all the ways of specifying memory +requirements described above are not applicable and the aforementioned members +of VmaAllocationCreateInfo structure are ignored. Memory type is selected +explicitly when creating the pool and then used to make all the allocations from +that pool. For further details, see \ref custom_memory_pools. + +\section choosing_memory_type_dedicated_allocations Dedicated allocations + +Memory for allocations is reserved out of larger block of `VkDeviceMemory` +allocated from Vulkan internally. That's the main feature of this whole library. +You can still request a separate memory block to be created for an allocation, +just like you would do in a trivial solution without using any allocator. +In that case, a buffer or image is always bound to that memory at offset 0. +This is called a "dedicated allocation". +You can explicitly request it by using flag #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. +The library can also internally decide to use dedicated allocation in some cases, e.g.: + +- When the size of the allocation is large. +- When [VK_KHR_dedicated_allocation](@ref vk_khr_dedicated_allocation) extension is enabled + and it reports that dedicated allocation is required or recommended for the resource. +- When allocation of next big memory block fails due to not enough device memory, + but allocation with the exact requested size succeeds. 
+ + +\page memory_mapping Memory mapping + +To "map memory" in Vulkan means to obtain a CPU pointer to `VkDeviceMemory`, +to be able to read from it or write to it in CPU code. +Mapping is possible only of memory allocated from a memory type that has +`VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` flag. +Functions `vkMapMemory()`, `vkUnmapMemory()` are designed for this purpose. +You can use them directly with memory allocated by this library, +but it is not recommended because of following issue: +Mapping the same `VkDeviceMemory` block multiple times is illegal - only one mapping at a time is allowed. +This includes mapping disjoint regions. Mapping is not reference-counted internally by Vulkan. +Because of this, Vulkan Memory Allocator provides following facilities: + +\section memory_mapping_mapping_functions Mapping functions + +The library provides following functions for mapping of a specific #VmaAllocation: vmaMapMemory(), vmaUnmapMemory(). +They are safer and more convenient to use than standard Vulkan functions. +You can map an allocation multiple times simultaneously - mapping is reference-counted internally. +You can also map different allocations simultaneously regardless of whether they use the same `VkDeviceMemory` block. +The way it's implemented is that the library always maps entire memory block, not just region of the allocation. +For further details, see description of vmaMapMemory() function. +Example: + +\code +// Having these objects initialized: + +struct ConstantBuffer +{ + ... 
+}; +ConstantBuffer constantBufferData; + +VmaAllocator allocator; +VkBuffer constantBuffer; +VmaAllocation constantBufferAllocation; + +// You can map and fill your buffer using following code: + +void* mappedData; +vmaMapMemory(allocator, constantBufferAllocation, &mappedData); +memcpy(mappedData, &constantBufferData, sizeof(constantBufferData)); +vmaUnmapMemory(allocator, constantBufferAllocation); +\endcode + +When mapping, you may see a warning from Vulkan validation layer similar to this one: + +Mapping an image with layout VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL can result in undefined behavior if this memory is used by the device. Only GENERAL or PREINITIALIZED should be used. + +It happens because the library maps entire `VkDeviceMemory` block, where different +types of images and buffers may end up together, especially on GPUs with unified memory like Intel. +You can safely ignore it if you are sure you access only memory of the intended +object that you wanted to map. + + +\section memory_mapping_persistently_mapped_memory Persistently mapped memory + +Keeping your memory persistently mapped is generally OK in Vulkan. +You don't need to unmap it before using its data on the GPU. +The library provides a special feature designed for that: +Allocations made with #VMA_ALLOCATION_CREATE_MAPPED_BIT flag set in +VmaAllocationCreateInfo::flags stay mapped all the time, +so you can just access CPU pointer to it any time +without a need to call any "map" or "unmap" function. 
+Example: + +\code +VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +bufCreateInfo.size = sizeof(ConstantBuffer); +bufCreateInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_CPU_ONLY; +allocCreateInfo.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + +VkBuffer buf; +VmaAllocation alloc; +VmaAllocationInfo allocInfo; +vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); + +// Buffer is already mapped. You can access its memory. +memcpy(allocInfo.pMappedData, &constantBufferData, sizeof(constantBufferData)); +\endcode + +There are some exceptions though, when you should consider mapping memory only for a short period of time: + +- When operating system is Windows 7 or 8.x (Windows 10 is not affected because it uses WDDM2), + device is discrete AMD GPU, + and memory type is the special 256 MiB pool of `DEVICE_LOCAL + HOST_VISIBLE` memory + (selected when you use #VMA_MEMORY_USAGE_CPU_TO_GPU), + then whenever a memory block allocated from this memory type stays mapped + for the time of any call to `vkQueueSubmit()` or `vkQueuePresentKHR()`, this + block is migrated by WDDM to system RAM, which degrades performance. It doesn't + matter if that particular memory block is actually used by the command buffer + being submitted. +- On Mac/MoltenVK there is a known bug - [Issue #175](https://github.com/KhronosGroup/MoltenVK/issues/175) + which requires unmapping before GPU can see updated texture. +- Keeping many large memory blocks mapped may impact performance or stability of some debugging tools. 
+ +\section memory_mapping_cache_control Cache flush and invalidate + +Memory in Vulkan doesn't need to be unmapped before using it on GPU, +but unless a memory type has `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` flag set, +you need to manually **invalidate** cache before reading of mapped pointer +and **flush** cache after writing to mapped pointer. +Map/unmap operations don't do that automatically. +Vulkan provides following functions for this purpose `vkFlushMappedMemoryRanges()`, +`vkInvalidateMappedMemoryRanges()`, but this library provides more convenient +functions that refer to given allocation object: vmaFlushAllocation(), +vmaInvalidateAllocation(), +or multiple objects at once: vmaFlushAllocations(), vmaInvalidateAllocations(). + +Regions of memory specified for flush/invalidate must be aligned to +`VkPhysicalDeviceLimits::nonCoherentAtomSize`. This is automatically ensured by the library. +In any memory type that is `HOST_VISIBLE` but not `HOST_COHERENT`, all allocations +within blocks are aligned to this value, so their offsets are always a multiple of +`nonCoherentAtomSize` and two different allocations never share same "line" of this size. + +Please note that memory allocated with #VMA_MEMORY_USAGE_CPU_ONLY is guaranteed to be `HOST_COHERENT`. + +Also, Windows drivers from all 3 **PC** GPU vendors (AMD, Intel, NVIDIA) +currently provide `HOST_COHERENT` flag on all memory types that are +`HOST_VISIBLE`, so on this platform you may not need to bother. + +\section memory_mapping_finding_if_memory_mappable Finding out if memory is mappable + +It may happen that your allocation ends up in memory that is `HOST_VISIBLE` (available for mapping) +despite it wasn't explicitly requested. +For example, application may work on integrated graphics with unified memory (like Intel) or +allocation from video memory might have failed, so the library chose system memory as fallback. 
+ +You can detect this case and map such allocation to access its memory on CPU directly, +instead of launching a transfer operation. +In order to do that: inspect `allocInfo.memoryType`, call vmaGetMemoryTypeProperties(), +and look for `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` flag in properties of that memory type. + +\code +VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +bufCreateInfo.size = sizeof(ConstantBuffer); +bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; +allocCreateInfo.preferredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + +VkBuffer buf; +VmaAllocation alloc; +VmaAllocationInfo allocInfo; +vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); + +VkMemoryPropertyFlags memFlags; +vmaGetMemoryTypeProperties(allocator, allocInfo.memoryType, &memFlags); +if((memFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) +{ + // Allocation ended up in mappable memory. You can map it and access it directly. + void* mappedData; + vmaMapMemory(allocator, alloc, &mappedData); + memcpy(mappedData, &constantBufferData, sizeof(constantBufferData)); + vmaUnmapMemory(allocator, alloc); +} +else +{ + // Allocation ended up in non-mappable memory. + // You need to create CPU-side buffer in VMA_MEMORY_USAGE_CPU_ONLY and make a transfer. +} +\endcode + +You can even use #VMA_ALLOCATION_CREATE_MAPPED_BIT flag while creating allocations +that are not necessarily `HOST_VISIBLE` (e.g. using #VMA_MEMORY_USAGE_GPU_ONLY). +If the allocation ends up in memory type that is `HOST_VISIBLE`, it will be persistently mapped and you can use it directly. +If not, the flag is just ignored. 
+Example: + +\code +VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +bufCreateInfo.size = sizeof(ConstantBuffer); +bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; +allocCreateInfo.flags = VMA_ALLOCATION_CREATE_MAPPED_BIT; + +VkBuffer buf; +VmaAllocation alloc; +VmaAllocationInfo allocInfo; +vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); + +if(allocInfo.pMappedData != nullptr) +{ + // Allocation ended up in mappable memory. + // It's persistently mapped. You can access it directly. + memcpy(allocInfo.pMappedData, &constantBufferData, sizeof(constantBufferData)); +} +else +{ + // Allocation ended up in non-mappable memory. + // You need to create CPU-side buffer in VMA_MEMORY_USAGE_CPU_ONLY and make a transfer. +} +\endcode + + +\page staying_within_budget Staying within budget + +When developing a graphics-intensive game or program, it is important to avoid allocating +more GPU memory than it's physically available. When the memory is over-committed, +various bad things can happen, depending on the specific GPU, graphics driver, and +operating system: + +- It may just work without any problems. +- The application may slow down because some memory blocks are moved to system RAM + and the GPU has to access them through PCI Express bus. +- A new allocation may take very long time to complete, even few seconds, and possibly + freeze entire system. +- The new allocation may fail with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. +- It may even result in GPU crash (TDR), observed as `VK_ERROR_DEVICE_LOST` + returned somewhere later. + +\section staying_within_budget_querying_for_budget Querying for budget + +To query for current memory usage and available budget, use function vmaGetBudget(). 
+Returned structure #VmaBudget contains quantities expressed in bytes, per Vulkan memory heap. + +Please note that this function returns different information and works faster than +vmaCalculateStats(). vmaGetBudget() can be called every frame or even before every +allocation, while vmaCalculateStats() is intended to be used rarely, +only to obtain statistical information, e.g. for debugging purposes. + +It is recommended to use VK_EXT_memory_budget device extension to obtain information +about the budget from Vulkan device. VMA is able to use this extension automatically. +When not enabled, the allocator behaves same way, but then it estimates current usage +and available budget based on its internal information and Vulkan memory heap sizes, +which may be less precise. In order to use this extension: + +1. Make sure extensions VK_EXT_memory_budget and VK_KHR_get_physical_device_properties2 + required by it are available and enable them. Please note that the first is a device + extension and the second is instance extension! +2. Use flag #VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT when creating #VmaAllocator object. +3. Make sure to call vmaSetCurrentFrameIndex() every frame. Budget is queried from + Vulkan inside of it to avoid overhead of querying it with every allocation. + +\section staying_within_budget_controlling_memory_usage Controlling memory usage + +There are many ways in which you can try to stay within the budget. + +First, when making new allocation requires allocating a new memory block, the library +tries not to exceed the budget automatically. If a block with default recommended size +(e.g. 256 MB) would go over budget, a smaller block is allocated, possibly even +dedicated memory for just this resource. + +If the size of the requested resource plus current memory usage is more than the +budget, by default the library still tries to create it, leaving it to the Vulkan +implementation whether the allocation succeeds or fails. 
 You can change this behavior +by using #VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT flag. With it, the allocation is +not made if it would exceed the budget or if the budget is already exceeded. +Some other allocations become lost instead to make room for it, if the mechanism of +[lost allocations](@ref lost_allocations) is used. +If that is not possible, the allocation fails with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. +Example usage pattern may be to pass the #VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT flag +when creating resources that are not essential for the application (e.g. the texture +of a specific object) and not to pass it when creating critically important resources +(e.g. render targets). + +Finally, you can also use #VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT flag to make sure +a new allocation is created only when it fits inside one of the existing memory blocks. +If it would require to allocate a new block, it fails instead with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. +This also ensures that the function call is very fast because it never goes to Vulkan +to obtain a new block. + +Please note that creating \ref custom_memory_pools with VmaPoolCreateInfo::minBlockCount +set to more than 0 will try to allocate memory blocks without checking whether they +fit within budget. + + +\page resource_aliasing Resource aliasing (overlap) + +New explicit graphics APIs (Vulkan and Direct3D 12), thanks to manual memory +management, give an opportunity to alias (overlap) multiple resources in the +same region of memory - a feature not available in the old APIs (Direct3D 11, OpenGL). +It can be useful to save video memory, but it must be used with caution. 
+ +For example, if you know the flow of your whole render frame in advance, you +are going to use some intermediate textures or buffers only during a small range of render passes, +and you know these ranges don't overlap in time, you can bind these resources to +the same place in memory, even if they have completely different parameters (width, height, format etc.). + +![Resource aliasing (overlap)](../gfx/Aliasing.png) + +Such scenario is possible using VMA, but you need to create your images manually. +Then you need to calculate parameters of an allocation to be made using formula: + +- allocation size = max(size of each image) +- allocation alignment = max(alignment of each image) +- allocation memoryTypeBits = bitwise AND(memoryTypeBits of each image) + +Following example shows two different images bound to the same place in memory, +allocated to fit largest of them. + +\code +// A 512x512 texture to be sampled. +VkImageCreateInfo img1CreateInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; +img1CreateInfo.imageType = VK_IMAGE_TYPE_2D; +img1CreateInfo.extent.width = 512; +img1CreateInfo.extent.height = 512; +img1CreateInfo.extent.depth = 1; +img1CreateInfo.mipLevels = 10; +img1CreateInfo.arrayLayers = 1; +img1CreateInfo.format = VK_FORMAT_R8G8B8A8_SRGB; +img1CreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; +img1CreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; +img1CreateInfo.usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT; +img1CreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + +// A full screen texture to be used as color attachment. 
+VkImageCreateInfo img2CreateInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; +img2CreateInfo.imageType = VK_IMAGE_TYPE_2D; +img2CreateInfo.extent.width = 1920; +img2CreateInfo.extent.height = 1080; +img2CreateInfo.extent.depth = 1; +img2CreateInfo.mipLevels = 1; +img2CreateInfo.arrayLayers = 1; +img2CreateInfo.format = VK_FORMAT_R8G8B8A8_UNORM; +img2CreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL; +img2CreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; +img2CreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; +img2CreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + +VkImage img1; +res = vkCreateImage(device, &img1CreateInfo, nullptr, &img1); +VkImage img2; +res = vkCreateImage(device, &img2CreateInfo, nullptr, &img2); + +VkMemoryRequirements img1MemReq; +vkGetImageMemoryRequirements(device, img1, &img1MemReq); +VkMemoryRequirements img2MemReq; +vkGetImageMemoryRequirements(device, img2, &img2MemReq); + +VkMemoryRequirements finalMemReq = {}; +finalMemReq.size = std::max(img1MemReq.size, img2MemReq.size); +finalMemReq.alignment = std::max(img1MemReq.alignment, img2MemReq.alignment); +finalMemReq.memoryTypeBits = img1MemReq.memoryTypeBits & img2MemReq.memoryTypeBits; +// Validate if(finalMemReq.memoryTypeBits != 0) + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; + +VmaAllocation alloc; +res = vmaAllocateMemory(allocator, &finalMemReq, &allocCreateInfo, &alloc, nullptr); + +res = vmaBindImageMemory(allocator, alloc, img1); +res = vmaBindImageMemory(allocator, alloc, img2); + +// You can use img1, img2 here, but not at the same time! + +vmaFreeMemory(allocator, alloc); +vkDestroyImage(allocator, img2, nullptr); +vkDestroyImage(allocator, img1, nullptr); +\endcode + +Remember that using resources that alias in memory requires proper synchronization. +You need to issue a memory barrier to make sure commands that use `img1` and `img2` +don't overlap on GPU timeline. 
+You also need to treat a resource after aliasing as uninitialized - containing garbage data. +For example, if you use `img1` and then want to use `img2`, you need to issue +an image memory barrier for `img2` with `oldLayout` = `VK_IMAGE_LAYOUT_UNDEFINED`. + +Additional considerations: + +- Vulkan also allows to interpret contents of memory between aliasing resources consistently in some cases. +See chapter 11.8. "Memory Aliasing" of Vulkan specification or `VK_IMAGE_CREATE_ALIAS_BIT` flag. +- You can create more complex layout where different images and buffers are bound +at different offsets inside one large allocation. For example, one can imagine +a big texture used in some render passes, aliasing with a set of many small buffers +used between in some further passes. To bind a resource at non-zero offset of an allocation, +use vmaBindBufferMemory2() / vmaBindImageMemory2(). +- Before allocating memory for the resources you want to alias, check `memoryTypeBits` +returned in memory requirements of each resource to make sure the bits overlap. +Some GPUs may expose multiple memory types suitable e.g. only for buffers or +images with `COLOR_ATTACHMENT` usage, so the sets of memory types supported by your +resources may be disjoint. Aliasing them is not possible in that case. + + +\page custom_memory_pools Custom memory pools + +A memory pool contains a number of `VkDeviceMemory` blocks. +The library automatically creates and manages default pool for each memory type available on the device. +Default memory pool automatically grows in size. +Size of allocated blocks is also variable and managed automatically. + +You can create custom pool and allocate memory out of it. +It can be useful if you want to: + +- Keep certain kind of allocations separate from others. +- Enforce particular, fixed size of Vulkan memory blocks. +- Limit maximum amount of Vulkan memory allocated for that pool. 
+- Reserve minimum or fixed amount of Vulkan memory always preallocated for that pool. + +To use custom memory pools: + +-# Fill VmaPoolCreateInfo structure. +-# Call vmaCreatePool() to obtain #VmaPool handle. +-# When making an allocation, set VmaAllocationCreateInfo::pool to this handle. + You don't need to specify any other parameters of this structure, like `usage`. + +Example: + +\code +// Create a pool that can have at most 2 blocks, 128 MiB each. +VmaPoolCreateInfo poolCreateInfo = {}; +poolCreateInfo.memoryTypeIndex = ... +poolCreateInfo.blockSize = 128ull * 1024 * 1024; +poolCreateInfo.maxBlockCount = 2; + +VmaPool pool; +vmaCreatePool(allocator, &poolCreateInfo, &pool); + +// Allocate a buffer out of it. +VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +bufCreateInfo.size = 1024; +bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.pool = pool; + +VkBuffer buf; +VmaAllocation alloc; +VmaAllocationInfo allocInfo; +vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &buf, &alloc, &allocInfo); +\endcode + +You have to free all allocations made from this pool before destroying it. + +\code +vmaDestroyBuffer(allocator, buf, alloc); +vmaDestroyPool(allocator, pool); +\endcode + +\section custom_memory_pools_MemTypeIndex Choosing memory type index + +When creating a pool, you must explicitly specify memory type index. +To find the one suitable for your buffers or images, you can use helper functions +vmaFindMemoryTypeIndexForBufferInfo(), vmaFindMemoryTypeIndexForImageInfo(). +You need to provide structures with example parameters of buffers or images +that you are going to create in that pool. + +\code +VkBufferCreateInfo exampleBufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +exampleBufCreateInfo.size = 1024; // Whatever. 
+exampleBufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; // Change if needed. + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; // Change if needed. + +uint32_t memTypeIndex; +vmaFindMemoryTypeIndexForBufferInfo(allocator, &exampleBufCreateInfo, &allocCreateInfo, &memTypeIndex); + +VmaPoolCreateInfo poolCreateInfo = {}; +poolCreateInfo.memoryTypeIndex = memTypeIndex; +// ... +\endcode + +When creating buffers/images allocated in that pool, provide following parameters: + +- `VkBufferCreateInfo`: Prefer to pass same parameters as above. + Otherwise you risk creating resources in a memory type that is not suitable for them, which may result in undefined behavior. + Using different `VK_BUFFER_USAGE_` flags may work, but you shouldn't create images in a pool intended for buffers + or the other way around. +- VmaAllocationCreateInfo: You don't need to pass same parameters. Fill only `pool` member. + Other members are ignored anyway. + +\section linear_algorithm Linear allocation algorithm + +Each Vulkan memory block managed by this library has accompanying metadata that +keeps track of used and unused regions. By default, the metadata structure and +algorithm tries to find best place for new allocations among free regions to +optimize memory usage. This way you can allocate and free objects in any order. + +![Default allocation algorithm](../gfx/Linear_allocator_1_algo_default.png) + +Sometimes there is a need to use simpler, linear allocation algorithm. You can +create custom pool that uses such algorithm by adding flag +#VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT to VmaPoolCreateInfo::flags while creating +#VmaPool object. Then an alternative metadata management is used. It always +creates new allocations after last one and doesn't reuse free regions after +allocations freed in the middle. It results in better allocation performance and +less memory consumed by metadata. 
+ +![Linear allocation algorithm](../gfx/Linear_allocator_2_algo_linear.png) + +With this one flag, you can create a custom pool that can be used in many ways: +free-at-once, stack, double stack, and ring buffer. See below for details. + +\subsection linear_algorithm_free_at_once Free-at-once + +In a pool that uses linear algorithm, you still need to free all the allocations +individually, e.g. by using vmaFreeMemory() or vmaDestroyBuffer(). You can free +them in any order. New allocations are always made after last one - free space +in the middle is not reused. However, when you release all the allocation and +the pool becomes empty, allocation starts from the beginning again. This way you +can use linear algorithm to speed up creation of allocations that you are going +to release all at once. + +![Free-at-once](../gfx/Linear_allocator_3_free_at_once.png) + +This mode is also available for pools created with VmaPoolCreateInfo::maxBlockCount +value that allows multiple memory blocks. + +\subsection linear_algorithm_stack Stack + +When you free an allocation that was created last, its space can be reused. +Thanks to this, if you always release allocations in the order opposite to their +creation (LIFO - Last In First Out), you can achieve behavior of a stack. + +![Stack](../gfx/Linear_allocator_4_stack.png) + +This mode is also available for pools created with VmaPoolCreateInfo::maxBlockCount +value that allows multiple memory blocks. + +\subsection linear_algorithm_double_stack Double stack + +The space reserved by a custom pool with linear algorithm may be used by two +stacks: + +- First, default one, growing up from offset 0. +- Second, "upper" one, growing down from the end towards lower offsets. + +To make allocation from upper stack, add flag #VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT +to VmaAllocationCreateInfo::flags. 
+ +![Double stack](../gfx/Linear_allocator_7_double_stack.png) + +Double stack is available only in pools with one memory block - +VmaPoolCreateInfo::maxBlockCount must be 1. Otherwise behavior is undefined. + +When the two stacks' ends meet so there is not enough space between them for a +new allocation, such allocation fails with usual +`VK_ERROR_OUT_OF_DEVICE_MEMORY` error. + +\subsection linear_algorithm_ring_buffer Ring buffer + +When you free some allocations from the beginning and there is not enough free space +for a new one at the end of a pool, allocator's "cursor" wraps around to the +beginning and starts allocation there. Thanks to this, if you always release +allocations in the same order as you created them (FIFO - First In First Out), +you can achieve behavior of a ring buffer / queue. + +![Ring buffer](../gfx/Linear_allocator_5_ring_buffer.png) + +Pools with linear algorithm support [lost allocations](@ref lost_allocations) when used as ring buffer. +If there is not enough free space for a new allocation, but existing allocations +from the front of the queue can become lost, they become lost and the allocation +succeeds. + +![Ring buffer with lost allocations](../gfx/Linear_allocator_6_ring_buffer_lost.png) + +Ring buffer is available only in pools with one memory block - +VmaPoolCreateInfo::maxBlockCount must be 1. Otherwise behavior is undefined. + +\section buddy_algorithm Buddy allocation algorithm + +There is another allocation algorithm that can be used with custom pools, called +"buddy". Its internal data structure is based on a tree of blocks, each having +size that is a power of two and a half of its parent's size. When you want to +allocate memory of certain size, a free node in the tree is located. If it's too +large, it is recursively split into two halves (called "buddies"). However, if +requested allocation size is not a power of two, the size of a tree node is +aligned up to the nearest power of two and the remaining space is wasted. 
When +two buddy nodes become free, they are merged back into one larger node. + +![Buddy allocator](../gfx/Buddy_allocator.png) + +The advantage of buddy allocation algorithm over default algorithm is faster +allocation and deallocation, as well as smaller external fragmentation. The +disadvantage is more wasted space (internal fragmentation). + +For more information, please read ["Buddy memory allocation" on Wikipedia](https://en.wikipedia.org/wiki/Buddy_memory_allocation) +or other sources that describe this concept in general. + +To use buddy allocation algorithm with a custom pool, add flag +#VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT to VmaPoolCreateInfo::flags while creating +#VmaPool object. + +Several limitations apply to pools that use buddy algorithm: + +- It is recommended to use VmaPoolCreateInfo::blockSize that is a power of two. + Otherwise, only largest power of two smaller than the size is used for + allocations. The remaining space always stays unused. +- [Margins](@ref debugging_memory_usage_margins) and + [corruption detection](@ref debugging_memory_usage_corruption_detection) + don't work in such pools. +- [Lost allocations](@ref lost_allocations) don't work in such pools. You can + use them, but they never become lost. Support may be added in the future. +- [Defragmentation](@ref defragmentation) doesn't work with allocations made from + such pool. + +\page defragmentation Defragmentation + +Interleaved allocations and deallocations of many objects of varying size can +cause fragmentation over time, which can lead to a situation where the library is unable +to find a continuous range of free memory for a new allocation despite there is +enough free space, just scattered across many small free ranges between existing +allocations. + +To mitigate this problem, you can use defragmentation feature: +structure #VmaDefragmentationInfo2, function vmaDefragmentationBegin(), vmaDefragmentationEnd(). 
+Given set of allocations, +this function can move them to compact used memory, ensure more continuous free +space and possibly also free some `VkDeviceMemory` blocks. + +What the defragmentation does is: + +- Updates #VmaAllocation objects to point to new `VkDeviceMemory` and offset. + After allocation has been moved, its VmaAllocationInfo::deviceMemory and/or + VmaAllocationInfo::offset changes. You must query them again using + vmaGetAllocationInfo() if you need them. +- Moves actual data in memory. + +What it doesn't do, so you need to do it yourself: + +- Recreate buffers and images that were bound to allocations that were defragmented and + bind them with their new places in memory. + You must use `vkDestroyBuffer()`, `vkDestroyImage()`, + `vkCreateBuffer()`, `vkCreateImage()`, vmaBindBufferMemory(), vmaBindImageMemory() + for that purpose and NOT vmaDestroyBuffer(), + vmaDestroyImage(), vmaCreateBuffer(), vmaCreateImage(), because you don't need to + destroy or create allocation objects! +- Recreate views and update descriptors that point to these buffers and images. + +\section defragmentation_cpu Defragmenting CPU memory + +Following example demonstrates how you can run defragmentation on CPU. +Only allocations created in memory types that are `HOST_VISIBLE` can be defragmented. +Others are ignored. + +The way it works is: + +- It temporarily maps entire memory blocks when necessary. +- It moves data using `memmove()` function. + +\code +// Given following variables already initialized: +VkDevice device; +VmaAllocator allocator; +std::vector buffers; +std::vector allocations; + + +const uint32_t allocCount = (uint32_t)allocations.size(); +std::vector allocationsChanged(allocCount); + +VmaDefragmentationInfo2 defragInfo = {}; +defragInfo.allocationCount = allocCount; +defragInfo.pAllocations = allocations.data(); +defragInfo.pAllocationsChanged = allocationsChanged.data(); +defragInfo.maxCpuBytesToMove = VK_WHOLE_SIZE; // No limit. 
+defragInfo.maxCpuAllocationsToMove = UINT32_MAX; // No limit. + +VmaDefragmentationContext defragCtx; +vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &defragCtx); +vmaDefragmentationEnd(allocator, defragCtx); + +for (const auto i : c10::irange(allocCount)) { + if(allocationsChanged[i]) + { + // Destroy buffer that is immutably bound to memory region which is no longer valid. + vkDestroyBuffer(device, buffers[i], nullptr); + + // Create new buffer with same parameters. + VkBufferCreateInfo bufferInfo = ...; + vkCreateBuffer(device, &bufferInfo, nullptr, &buffers[i]); + + // You can make dummy call to vkGetBufferMemoryRequirements here to silence validation layer warning. + + // Bind new buffer to new memory region. Data contained in it is already moved. + VmaAllocationInfo allocInfo; + vmaGetAllocationInfo(allocator, allocations[i], &allocInfo); + vmaBindBufferMemory(allocator, allocations[i], buffers[i]); + } +} +\endcode + +Setting VmaDefragmentationInfo2::pAllocationsChanged is optional. +This output array tells whether particular allocation in VmaDefragmentationInfo2::pAllocations at the same index +has been modified during defragmentation. +You can pass null, but you then need to query every allocation passed to defragmentation +for new parameters using vmaGetAllocationInfo() if you might need to recreate and rebind a buffer or image associated with it. + +If you use [Custom memory pools](@ref choosing_memory_type_custom_memory_pools), +you can fill VmaDefragmentationInfo2::poolCount and VmaDefragmentationInfo2::pPools +instead of VmaDefragmentationInfo2::allocationCount and VmaDefragmentationInfo2::pAllocations +to defragment all allocations in given pools. +You cannot use VmaDefragmentationInfo2::pAllocationsChanged in that case. +You can also combine both methods. + +\section defragmentation_gpu Defragmenting GPU memory + +It is also possible to defragment allocations created in memory types that are not `HOST_VISIBLE`. 
+To do that, you need to pass a command buffer that meets requirements as described in +VmaDefragmentationInfo2::commandBuffer. The way it works is: + +- It creates temporary buffers and binds them to entire memory blocks when necessary. +- It issues `vkCmdCopyBuffer()` to passed command buffer. + +Example: + +\code +// Given following variables already initialized: +VkDevice device; +VmaAllocator allocator; +VkCommandBuffer commandBuffer; +std::vector<VkBuffer> buffers; +std::vector<VmaAllocation> allocations; + + +const uint32_t allocCount = (uint32_t)allocations.size(); +std::vector<VkBool32> allocationsChanged(allocCount); + +VkCommandBufferBeginInfo cmdBufBeginInfo = ...; +vkBeginCommandBuffer(commandBuffer, &cmdBufBeginInfo); + +VmaDefragmentationInfo2 defragInfo = {}; +defragInfo.allocationCount = allocCount; +defragInfo.pAllocations = allocations.data(); +defragInfo.pAllocationsChanged = allocationsChanged.data(); +defragInfo.maxGpuBytesToMove = VK_WHOLE_SIZE; // Notice it's "GPU" this time. +defragInfo.maxGpuAllocationsToMove = UINT32_MAX; // Notice it's "GPU" this time. +defragInfo.commandBuffer = commandBuffer; + +VmaDefragmentationContext defragCtx; +vmaDefragmentationBegin(allocator, &defragInfo, nullptr, &defragCtx); + +vkEndCommandBuffer(commandBuffer); + +// Submit commandBuffer. +// Wait for a fence that ensures commandBuffer execution finished. + +vmaDefragmentationEnd(allocator, defragCtx); + +for (const auto i : c10::irange(allocCount)) { + if(allocationsChanged[i]) + { + // Destroy buffer that is immutably bound to memory region which is no longer valid. + vkDestroyBuffer(device, buffers[i], nullptr); + + // Create new buffer with same parameters. + VkBufferCreateInfo bufferInfo = ...; + vkCreateBuffer(device, &bufferInfo, nullptr, &buffers[i]); + + // You can make dummy call to vkGetBufferMemoryRequirements here to silence validation layer warning. + + // Bind new buffer to new memory region. Data contained in it is already moved.
+ VmaAllocationInfo allocInfo; + vmaGetAllocationInfo(allocator, allocations[i], &allocInfo); + vmaBindBufferMemory(allocator, allocations[i], buffers[i]); + } +} +\endcode + +You can combine these two methods by specifying non-zero `maxGpu*` as well as `maxCpu*` parameters. +The library automatically chooses best method to defragment each memory pool. + +You may try not to block your entire program to wait until defragmentation finishes, +but do it in the background, as long as you carefully fulfill requirements described +in function vmaDefragmentationBegin(). + +\section defragmentation_additional_notes Additional notes + +It is only legal to defragment allocations bound to: + +- buffers +- images created with `VK_IMAGE_CREATE_ALIAS_BIT`, `VK_IMAGE_TILING_LINEAR`, and + being currently in `VK_IMAGE_LAYOUT_GENERAL` or `VK_IMAGE_LAYOUT_PREINITIALIZED`. + +Defragmentation of images created with `VK_IMAGE_TILING_OPTIMAL` or in any other +layout may give undefined results. + +If you defragment allocations bound to images, new images to be bound to new +memory region after defragmentation should be created with `VK_IMAGE_LAYOUT_PREINITIALIZED` +and then transitioned to their original layout from before defragmentation if +needed using an image memory barrier. + +While using defragmentation, you may experience validation layer warnings, which you just need to ignore. +See [Validation layer warnings](@ref general_considerations_validation_layer_warnings). + +Please don't expect memory to be fully compacted after defragmentation. +Algorithms inside are based on some heuristics that try to maximize number of Vulkan +memory blocks to make totally empty to release them, as well as to maximize continuous +empty space inside remaining blocks, while minimizing the number and size of allocations that +need to be moved. Some fragmentation may still remain - this is normal.
+ +\section defragmentation_custom_algorithm Writing custom defragmentation algorithm + +If you want to implement your own, custom defragmentation algorithm, +there is infrastructure prepared for that, +but it is not exposed through the library API - you need to hack its source code. +Here are steps needed to do this: + +-# Main thing you need to do is to define your own class derived from base abstract + class `VmaDefragmentationAlgorithm` and implement your version of its pure virtual methods. + See definition and comments of this class for details. +-# Your code needs to interact with device memory block metadata. + If you need more access to its data than it's provided by its public interface, + declare your new class as a friend class e.g. in class `VmaBlockMetadata_Generic`. +-# If you want to create a flag that would enable your algorithm or pass some additional + flags to configure it, add them to `VmaDefragmentationFlagBits` and use them in + VmaDefragmentationInfo2::flags. +-# Modify function `VmaBlockVectorDefragmentationContext::Begin` to create object + of your new class whenever needed. + + +\page lost_allocations Lost allocations + +If your game oversubscribes video memory, it may work OK in previous-generation +graphics APIs (DirectX 9, 10, 11, OpenGL) because resources are automatically +paged to system RAM. In Vulkan you can't do it because when you run out of +memory, an allocation just fails. If you have more data (e.g. textures) than can +fit into VRAM and you don't need it all at once, you may want to upload them to +GPU on demand and "push out" ones that are not used for a long time to make room +for the new ones, effectively using VRAM (or a certain memory pool) as a form of +cache. Vulkan Memory Allocator can help you with that by supporting a concept of +"lost allocations". + +To create an allocation that can become lost, include #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT +flag in VmaAllocationCreateInfo::flags.
Before using a buffer or image bound to +such allocation in every new frame, you need to query it if it's not lost. +To check it, call vmaTouchAllocation(). +If the allocation is lost, you should not use it or buffer/image bound to it. +You mustn't forget to destroy this allocation and this buffer/image. +vmaGetAllocationInfo() can also be used for checking status of the allocation. +Allocation is lost when returned VmaAllocationInfo::deviceMemory == `VK_NULL_HANDLE`. + +To create an allocation that can make some other allocations lost to make room +for it, use #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flag. You will +usually use both flags #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT and +#VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT at the same time. + +Warning! Current implementation uses quite naive, brute force algorithm, +which can make allocation calls that use #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT +flag quite slow. A new, more optimal algorithm and data structure to speed this +up is planned for the future. + +Q: When interleaving creation of new allocations with usage of existing ones, +how do you make sure that an allocation won't become lost while it's used in the +current frame? + +It is ensured because vmaTouchAllocation() / vmaGetAllocationInfo() not only returns allocation +status/parameters and checks whether it's not lost, but when it's not, it also +atomically marks it as used in the current frame, which makes it impossible to +become lost in that frame. It uses lockless algorithm, so it works fast and +doesn't involve locking any internal mutex. + +Q: What if my allocation may still be in use by the GPU when it's rendering a +previous frame while I already submit new frame on the CPU? 
+ +You can make sure that allocations "touched" by vmaTouchAllocation() / vmaGetAllocationInfo() will not +become lost for a number of additional frames back from the current one by +specifying this number as VmaAllocatorCreateInfo::frameInUseCount (for default +memory pool) and VmaPoolCreateInfo::frameInUseCount (for custom pool). + +Q: How do you inform the library when new frame starts? + +You need to call function vmaSetCurrentFrameIndex(). + +Example code: + +\code +struct MyBuffer +{ + VkBuffer m_Buf = nullptr; + VmaAllocation m_Alloc = nullptr; + + // Called when the buffer is really needed in the current frame. + void EnsureBuffer(); +}; + +void MyBuffer::EnsureBuffer() +{ + // Buffer has been created. + if(m_Buf != VK_NULL_HANDLE) + { + // Check if its allocation is not lost + mark it as used in current frame. + if(vmaTouchAllocation(allocator, m_Alloc)) + { + // It's all OK - safe to use m_Buf. + return; + } + } + + // Buffer not yet exists or lost - destroy and recreate it. + + vmaDestroyBuffer(allocator, m_Buf, m_Alloc); + + VkBufferCreateInfo bufCreateInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; + bufCreateInfo.size = 1024; + bufCreateInfo.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + + VmaAllocationCreateInfo allocCreateInfo = {}; + allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; + allocCreateInfo.flags = VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT | + VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT; + + vmaCreateBuffer(allocator, &bufCreateInfo, &allocCreateInfo, &m_Buf, &m_Alloc, nullptr); +} +\endcode + +When using lost allocations, you may see some Vulkan validation layer warnings +about overlapping regions of memory bound to different kinds of buffers and +images. This is still valid as long as you implement proper handling of lost +allocations (like in the example above) and don't use them. 
+ +You can create an allocation that is already in lost state from the beginning using function +vmaCreateLostAllocation(). It may be useful if you need a "dummy" allocation that is not null. + +You can call function vmaMakePoolAllocationsLost() to set all eligible allocations +in a specified custom pool to lost state. +Allocations that have been "touched" in current frame or VmaPoolCreateInfo::frameInUseCount frames back +cannot become lost. + +Q: Can I touch allocation that cannot become lost? + +Yes, although it has no visible effect. +Calls to vmaGetAllocationInfo() and vmaTouchAllocation() update last use frame index +also for allocations that cannot become lost, but the only way to observe it is to dump +internal allocator state using vmaBuildStatsString(). +You can use this feature for debugging purposes to explicitly mark allocations that you use +in current frame and then analyze JSON dump to see for how long each allocation stays unused. + + +\page statistics Statistics + +This library contains functions that return information about its internal state, +especially the amount of memory allocated from Vulkan. +Please keep in mind that these functions need to traverse all internal data structures +to gather this information, so they may be quite time-consuming. +Don't call them too often. + +\section statistics_numeric_statistics Numeric statistics + +You can query for overall statistics of the allocator using function vmaCalculateStats(). +Information is returned using structure #VmaStats. +It contains #VmaStatInfo - number of allocated blocks, number of allocations +(occupied ranges in these blocks), number of unused (free) ranges in these blocks, +number of bytes used and unused (but still allocated from Vulkan) and other information. +They are summed across memory heaps, memory types and total for whole allocator. + +You can query for statistics of a custom pool using function vmaGetPoolStats(). +Information is returned using structure #VmaPoolStats.
+ +You can query for information about specific allocation using function vmaGetAllocationInfo(). +It fills structure #VmaAllocationInfo. + +\section statistics_json_dump JSON dump + +You can dump internal state of the allocator to a string in JSON format using function vmaBuildStatsString(). +The result is guaranteed to be correct JSON. +It uses ANSI encoding. +Any strings provided by user (see [Allocation names](@ref allocation_names)) +are copied as-is and properly escaped for JSON, so if they use UTF-8, ISO-8859-2 or any other encoding, +this JSON string can be treated as using this encoding. +It must be freed using function vmaFreeStatsString(). + +The format of this JSON string is not part of official documentation of the library, +but it will not change in backward-incompatible way without increasing library major version number +and appropriate mention in changelog. + +The JSON string contains all the data that can be obtained using vmaCalculateStats(). +It can also contain detailed map of allocated memory blocks and their regions - +free and occupied by allocations. +This allows e.g. to visualize the memory or assess fragmentation. + + +\page allocation_annotation Allocation names and user data + +\section allocation_user_data Allocation user data + +You can annotate allocations with your own information, e.g. for debugging purposes. +To do that, fill VmaAllocationCreateInfo::pUserData field when creating +an allocation. It's an opaque `void*` pointer. You can use it e.g. as a pointer, +some handle, index, key, ordinal number or any other value that would associate +the allocation with your custom metadata. + +\code +VkBufferCreateInfo bufferInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; +// Fill bufferInfo...
+ +MyBufferMetadata* pMetadata = CreateBufferMetadata(); + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; +allocCreateInfo.pUserData = pMetadata; + +VkBuffer buffer; +VmaAllocation allocation; +vmaCreateBuffer(allocator, &bufferInfo, &allocCreateInfo, &buffer, &allocation, nullptr); +\endcode + +The pointer may be later retrieved as VmaAllocationInfo::pUserData: + +\code +VmaAllocationInfo allocInfo; +vmaGetAllocationInfo(allocator, allocation, &allocInfo); +MyBufferMetadata* pMetadata = (MyBufferMetadata*)allocInfo.pUserData; +\endcode + +It can also be changed using function vmaSetAllocationUserData(). + +Values of (non-zero) allocations' `pUserData` are printed in JSON report created by +vmaBuildStatsString(), in hexadecimal form. + +\section allocation_names Allocation names + +There is alternative mode available where `pUserData` pointer is used to point to +a null-terminated string, giving a name to the allocation. To use this mode, +set #VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT flag in VmaAllocationCreateInfo::flags. +Then `pUserData` passed as VmaAllocationCreateInfo::pUserData or argument to +vmaSetAllocationUserData() must be either null or pointer to a null-terminated string. +The library creates internal copy of the string, so the pointer you pass doesn't need +to be valid for whole lifetime of the allocation. You can free it after the call. + +\code +VkImageCreateInfo imageInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO }; +// Fill imageInfo... 
+ +std::string imageName = "Texture: "; +imageName += fileName; + +VmaAllocationCreateInfo allocCreateInfo = {}; +allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_ONLY; +allocCreateInfo.flags = VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT; +allocCreateInfo.pUserData = imageName.c_str(); + +VkImage image; +VmaAllocation allocation; +vmaCreateImage(allocator, &imageInfo, &allocCreateInfo, &image, &allocation, nullptr); +\endcode + +The value of `pUserData` pointer of the allocation will be different than the one +you passed when setting allocation's name - pointing to a buffer managed +internally that holds copy of the string. + +\code +VmaAllocationInfo allocInfo; +vmaGetAllocationInfo(allocator, allocation, &allocInfo); +const char* imageName = (const char*)allocInfo.pUserData; +printf("Image name: %s\n", imageName); +\endcode + +That string is also printed in JSON report created by vmaBuildStatsString(). + +\note Passing string name to VMA allocation doesn't automatically set it to the Vulkan buffer or image created with it. +You must do it manually using an extension like VK_EXT_debug_utils, which is independent of this library. + + +\page debugging_memory_usage Debugging incorrect memory usage + +If you suspect a bug with memory usage, like usage of uninitialized memory or +memory being overwritten out of bounds of an allocation, +you can use debug features of this library to verify this. + +\section debugging_memory_usage_initialization Memory initialization + +If you experience a bug with incorrect and nondeterministic data in your program and you suspect uninitialized memory to be used, +you can enable automatic memory initialization to verify this. +To do it, define macro `VMA_DEBUG_INITIALIZE_ALLOCATIONS` to 1. + +\code +#define VMA_DEBUG_INITIALIZE_ALLOCATIONS 1 +#include +\endcode + +It makes memory of all new allocations initialized to bit pattern `0xDCDCDCDC`. +Before an allocation is destroyed, its memory is filled with bit pattern `0xEFEFEFEF`. 
+Memory is automatically mapped and unmapped if necessary. + +If you find these values while debugging your program, good chances are that you incorrectly +read Vulkan memory that is allocated but not initialized, or already freed, respectively. + +Memory initialization works only with memory types that are `HOST_VISIBLE`. +It works also with dedicated allocations. +It doesn't work with allocations created with #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag, +as they cannot be mapped. + +\section debugging_memory_usage_margins Margins + +By default, allocations are laid out in memory blocks next to each other if possible +(considering required alignment, `bufferImageGranularity`, and `nonCoherentAtomSize`). + +![Allocations without margin](../gfx/Margins_1.png) + +Define macro `VMA_DEBUG_MARGIN` to some non-zero value (e.g. 16) to enforce specified +number of bytes as a margin before and after every allocation. + +\code +#define VMA_DEBUG_MARGIN 16 +#include +\endcode + +![Allocations with margin](../gfx/Margins_2.png) + +If your bug goes away after enabling margins, it means it may be caused by memory +being overwritten outside of allocation boundaries. It is not 100% certain though. +Change in application behavior may also be caused by different order and distribution +of allocations across memory blocks after margins are applied. + +The margin is applied also before first and after last allocation in a block. +It may occur only once between two adjacent allocations. + +Margins work with all types of memory. + +Margin is applied only to allocations made out of memory blocks and not to dedicated +allocations, which have their own memory block of specific size. +It is thus not applied to allocations made using #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT flag +or those automatically decided to put into dedicated allocations, e.g. due to its +large size or recommended by VK_KHR_dedicated_allocation extension. 
+Margins are also not active in custom pools created with #VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT flag. + +Margins appear in [JSON dump](@ref statistics_json_dump) as part of free space. + +Note that enabling margins increases memory usage and fragmentation. + +\section debugging_memory_usage_corruption_detection Corruption detection + +You can additionally define macro `VMA_DEBUG_DETECT_CORRUPTION` to 1 to enable validation +of contents of the margins. + +\code +#define VMA_DEBUG_MARGIN 16 +#define VMA_DEBUG_DETECT_CORRUPTION 1 +#include +\endcode + +When this feature is enabled, number of bytes specified as `VMA_DEBUG_MARGIN` +(it must be multiple of 4) before and after every allocation is filled with a magic number. +This idea is also known as "canary". +Memory is automatically mapped and unmapped if necessary. + +This number is validated automatically when the allocation is destroyed. +If it's not equal to the expected value, `VMA_ASSERT()` is executed. +It clearly means that either CPU or GPU has overwritten the memory outside of boundaries of the allocation, +which indicates a serious bug. + +You can also explicitly request checking margins of all allocations in all memory blocks +that belong to specified memory types by using function vmaCheckCorruption(), +or in memory blocks that belong to specified custom pool, by using function +vmaCheckPoolCorruption(). + +Margin validation (corruption detection) works only for memory types that are +`HOST_VISIBLE` and `HOST_COHERENT`. + + +\page record_and_replay Record and replay + +\section record_and_replay_introduction Introduction + +While using the library, sequence of calls to its functions together with their +parameters can be recorded to a file and later replayed using standalone player +application. It can be useful to: + +- Test correctness - check if same sequence of calls will not cause crash or + failures on a target platform.
+- Gather statistics - see number of allocations, peak memory usage, number of + calls etc. +- Benchmark performance - see how much time it takes to replay the whole + sequence. + +\section record_and_replay_usage Usage + +Recording functionality is disabled by default. +To enable it, define following macro before every include of this library: + +\code +#define VMA_RECORDING_ENABLED 1 +\endcode + +To record sequence of calls to a file: Fill in +VmaAllocatorCreateInfo::pRecordSettings member while creating #VmaAllocator +object. File is opened and written during whole lifetime of the allocator. + +To replay file: Use VmaReplay - standalone command-line program. +Precompiled binary can be found in "bin" directory. +Its source can be found in "src/VmaReplay" directory. +Its project is generated by Premake. +Command line syntax is printed when the program is launched without parameters. +Basic usage: + + VmaReplay.exe MyRecording.csv + +Documentation of file format can be found in file: "docs/Recording file format.md". +It's a human-readable, text file in CSV format (Comma Separated Values). + +\section record_and_replay_additional_considerations Additional considerations + +- Replaying file that was recorded on a different GPU (with different parameters + like `bufferImageGranularity`, `nonCoherentAtomSize`, and especially different + set of memory heaps and types) may give different performance and memory usage + results, as well as issue some warnings and errors. +- Current implementation of recording in VMA, as well as VmaReplay application, is + coded and tested only on Windows. Inclusion of recording code is driven by + `VMA_RECORDING_ENABLED` macro. Support for other platforms should be easy to + add. Contributions are welcomed. + + +\page usage_patterns Recommended usage patterns + +See also slides from talk: +[Sawicki, Adam. Advanced Graphics Techniques Tutorial: Memory management in Vulkan and DX12. 
Game Developers Conference, 2018](https://www.gdcvault.com/play/1025458/Advanced-Graphics-Techniques-Tutorial-New) + + +\section usage_patterns_common_mistakes Common mistakes + +Use of CPU_TO_GPU instead of CPU_ONLY memory + +#VMA_MEMORY_USAGE_CPU_TO_GPU is recommended only for resources that will be +mapped and written by the CPU, as well as read directly by the GPU - like some +buffers or textures updated every frame (dynamic). If you create a staging copy +of a resource to be written by CPU and then used as a source of transfer to +another resource placed in the GPU memory, that staging resource should be +created with #VMA_MEMORY_USAGE_CPU_ONLY. Please read the descriptions of these +enums carefully for details. + +Unnecessary use of custom pools + +\ref custom_memory_pools may be useful for special purposes - when you want to +keep certain type of resources separate e.g. to reserve minimum amount of memory +for them, limit maximum amount of memory they can occupy, or make some of them +push out the other through the mechanism of \ref lost_allocations. For most +resources this is not needed and so it is not recommended to create #VmaPool +objects and allocations out of them. Allocating from the default pool is sufficient. + +\section usage_patterns_simple Simple patterns + +\subsection usage_patterns_simple_render_targets Render targets + +When: +Any resources that you frequently write and read on GPU, +e.g. images used as color attachments (aka "render targets"), depth-stencil attachments, +images/buffers used as storage image/buffer (aka "Unordered Access View (UAV)"). + +What to do: +Create them in video memory that is fastest to access from GPU using +#VMA_MEMORY_USAGE_GPU_ONLY. + +Consider using [VK_KHR_dedicated_allocation](@ref vk_khr_dedicated_allocation) extension +and/or manually creating them as dedicated allocations using #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT, +especially if they are large or if you plan to destroy and recreate them e.g. 
when +display resolution changes. +Prefer to create such resources first and all other GPU resources (like textures and vertex buffers) later. + +\subsection usage_patterns_simple_immutable_resources Immutable resources + +When: +Any resources that you fill on CPU only once (aka "immutable") or infrequently +and then read frequently on GPU, +e.g. textures, vertex and index buffers, constant buffers that don't change often. + +What to do: +Create them in video memory that is fastest to access from GPU using +#VMA_MEMORY_USAGE_GPU_ONLY. + +To initialize content of such resource, create a CPU-side (aka "staging") copy of it +in system memory - #VMA_MEMORY_USAGE_CPU_ONLY, map it, fill it, +and submit a transfer from it to the GPU resource. +You can keep the staging copy if you need it for another upload transfer in the future. +If you don't, you can destroy it or reuse this buffer for uploading different resource +after the transfer finishes. + +Prefer to create just buffers in system memory rather than images, even for uploading textures. +Use `vkCmdCopyBufferToImage()`. +Don't use images with `VK_IMAGE_TILING_LINEAR`. + +\subsection usage_patterns_dynamic_resources Dynamic resources + +When: +Any resources that change frequently (aka "dynamic"), e.g. every frame or every draw call, +written on CPU, read on GPU. + +What to do: +Create them using #VMA_MEMORY_USAGE_CPU_TO_GPU. +You can map it and write to it directly on CPU, as well as read from it on GPU. + +This is a more complex situation. Different solutions are possible, +and the best one depends on specific GPU type, but you can use this simple approach for the start. +Prefer to write to such resource sequentially (e.g. using `memcpy`). +Don't perform random access or any reads from it on CPU, as it may be very slow. +Also note that textures written directly from the host through a mapped pointer need to be in LINEAR not OPTIMAL layout.
+ +\subsection usage_patterns_readback Readback + +When: +Resources that contain data written by GPU that you want to read back on CPU, +e.g. results of some computations. + +What to do: +Create them using #VMA_MEMORY_USAGE_GPU_TO_CPU. +You can write to them directly on GPU, as well as map and read them on CPU. + +\section usage_patterns_advanced Advanced patterns + +\subsection usage_patterns_integrated_graphics Detecting integrated graphics + +You can support integrated graphics (like Intel HD Graphics, AMD APU) better +by detecting it in Vulkan. +To do it, call `vkGetPhysicalDeviceProperties()`, inspect +`VkPhysicalDeviceProperties::deviceType` and look for `VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU`. +When you find it, you can assume that memory is unified and all memory types are comparably fast +to access from GPU, regardless of `VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT`. + +You can then sum up sizes of all available memory heaps and treat them as useful for +your GPU resources, instead of only `DEVICE_LOCAL` ones. +You can also prefer to create your resources in memory types that are `HOST_VISIBLE` to map them +directly instead of submitting explicit transfer (see below). + +\subsection usage_patterns_direct_vs_transfer Direct access versus transfer + +For resources that you frequently write on CPU and read on GPU, many solutions are possible: + +-# Create one copy in video memory using #VMA_MEMORY_USAGE_GPU_ONLY, + second copy in system memory using #VMA_MEMORY_USAGE_CPU_ONLY and submit explicit transfer each time. +-# Create just a single copy using #VMA_MEMORY_USAGE_CPU_TO_GPU, map it and fill it on CPU, + read it directly on GPU. +-# Create just a single copy using #VMA_MEMORY_USAGE_CPU_ONLY, map it and fill it on CPU, + read it directly on GPU. + +Which solution is the most efficient depends on your resource and especially on the GPU. +It is best to measure it and then make the decision. 
+Some general recommendations: + +- On integrated graphics use (2) or (3) to avoid unnecessary time and memory overhead + related to using a second copy and making transfer. +- For small resources (e.g. constant buffers) use (2). + Discrete AMD cards have special 256 MiB pool of video memory that is directly mappable. + Even if the resource ends up in system memory, its data may be cached on GPU after first + fetch over PCIe bus. +- For larger resources (e.g. textures), decide between (1) and (2). + You may want to differentiate NVIDIA and AMD, e.g. by looking for memory type that is + both `DEVICE_LOCAL` and `HOST_VISIBLE`. When you find it, use (2), otherwise use (1). + +Similarly, for resources that you frequently write on GPU and read on CPU, multiple +solutions are possible: + +-# Create one copy in video memory using #VMA_MEMORY_USAGE_GPU_ONLY, + second copy in system memory using #VMA_MEMORY_USAGE_GPU_TO_CPU and submit explicit transfer each time. +-# Create just single copy using #VMA_MEMORY_USAGE_GPU_TO_CPU, write to it directly on GPU, + map it and read it on CPU. + +You should take some measurements to decide which option is faster in case of your specific +resource. + +Note that textures accessed directly from the host through a mapped pointer need to be in LINEAR layout, +which may slow down their usage on the device. +Textures accessed only by the device and transfer operations can use OPTIMAL layout. + +If you don't want to specialize your code for specific types of GPUs, you can still make +a simple optimization for cases when your resource ends up in mappable memory to use it +directly in this case instead of creating CPU-side staging copy. +For details see [Finding out if memory is mappable](@ref memory_mapping_finding_if_memory_mappable).
+ + +\page configuration Configuration + +Please check "CONFIGURATION SECTION" in the code to find macros that you can define +before each include of this file or change directly in this file to provide +your own implementation of basic facilities like assert, `min()` and `max()` functions, +mutex, atomic etc. +The library uses its own implementation of containers by default, but you can switch to using +STL containers instead. + +For example, define `VMA_ASSERT(expr)` before including the library to provide +custom implementation of the assertion, compatible with your project. +By default it is defined to standard C `assert(expr)` in `_DEBUG` configuration +and empty otherwise. + +\section config_Vulkan_functions Pointers to Vulkan functions + +There are multiple ways to import pointers to Vulkan functions in the library. +In the simplest case you don't need to do anything. +If the compilation or linking of your program or the initialization of the #VmaAllocator +doesn't work for you, you can try to reconfigure it. + +First, the allocator tries to fetch pointers to Vulkan functions linked statically, +like this: + +\code +m_VulkanFunctions.vkAllocateMemory = (PFN_vkAllocateMemory)vkAllocateMemory; +\endcode + +If you want to disable this feature, set configuration macro: `#define VMA_STATIC_VULKAN_FUNCTIONS 0`. + +Second, you can provide the pointers yourself by setting member VmaAllocatorCreateInfo::pVulkanFunctions. +You can fetch them e.g. using functions `vkGetInstanceProcAddr` and `vkGetDeviceProcAddr` or +by using a helper library like [volk](https://github.com/zeux/volk). + +Third, VMA tries to fetch remaining pointers that are still null by calling +`vkGetInstanceProcAddr` and `vkGetDeviceProcAddr` on its own. +If you want to disable this feature, set configuration macro: `#define VMA_DYNAMIC_VULKAN_FUNCTIONS 0`. 
+ +Finally, all the function pointers required by the library (considering selected +Vulkan version and enabled extensions) are checked with `VMA_ASSERT` to ensure they are not null. + + +\section custom_memory_allocator Custom host memory allocator + +If you use custom allocator for CPU memory rather than default operator `new` +and `delete` from C++, you can make this library use your allocator as well +by filling optional member VmaAllocatorCreateInfo::pAllocationCallbacks. These +functions will be passed to Vulkan, as well as used by the library itself to +make any CPU-side allocations. + +\section allocation_callbacks Device memory allocation callbacks + +The library makes calls to `vkAllocateMemory()` and `vkFreeMemory()` internally. +You can set up callbacks to be informed about these calls, e.g. for the purpose +of gathering some statistics. To do it, fill optional member +VmaAllocatorCreateInfo::pDeviceMemoryCallbacks. + +\section heap_memory_limit Device heap memory limit + +When device memory of certain heap runs out of free space, new allocations may +fail (returning error code) or they may succeed, silently pushing some existing +memory blocks from GPU VRAM to system RAM (which degrades performance). This +behavior is implementation-dependent - it depends on GPU vendor and graphics +driver. + +On AMD cards it can be controlled while creating Vulkan device object by using +VK_AMD_memory_overallocation_behavior extension, if available. + +Alternatively, if you want to test how your program behaves with limited amount of Vulkan device +memory available without switching your graphics card to one that really has +smaller VRAM, you can use a feature of this library intended for this purpose. +To do it, fill optional member VmaAllocatorCreateInfo::pHeapSizeLimit. + + + +\page vk_khr_dedicated_allocation VK_KHR_dedicated_allocation + +VK_KHR_dedicated_allocation is a Vulkan extension which can be used to improve +performance on some GPUs.
It augments Vulkan API with possibility to query +driver whether it prefers particular buffer or image to have its own, dedicated +allocation (separate `VkDeviceMemory` block) for better efficiency - to be able +to do some internal optimizations. + +The extension is supported by this library. It will be used automatically when +enabled. To enable it: + +1 . When creating Vulkan device, check if following 2 device extensions are +supported (call `vkEnumerateDeviceExtensionProperties()`). +If yes, enable them (fill `VkDeviceCreateInfo::ppEnabledExtensionNames`). + +- VK_KHR_get_memory_requirements2 +- VK_KHR_dedicated_allocation + +If you enabled these extensions: + +2 . Use #VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT flag when creating +your #VmaAllocator to inform the library that you enabled required extensions +and you want the library to use them. + +\code +allocatorInfo.flags |= VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT; + +vmaCreateAllocator(&allocatorInfo, &allocator); +\endcode + +That's all. The extension will be automatically used whenever you create a +buffer using vmaCreateBuffer() or image using vmaCreateImage(). + +When using the extension together with Vulkan Validation Layer, you will receive +warnings like this: + + vkBindBufferMemory(): Binding memory to buffer 0x33 but vkGetBufferMemoryRequirements() has not been called on that buffer. + +It is OK, you should just ignore it. It happens because you use function +`vkGetBufferMemoryRequirements2KHR()` instead of standard +`vkGetBufferMemoryRequirements()`, while the validation layer seems to be +unaware of it.
+ +To learn more about this extension, see: + +- [VK_KHR_dedicated_allocation in Vulkan specification](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap44.html#VK_KHR_dedicated_allocation) +- [VK_KHR_dedicated_allocation unofficial manual](http://asawicki.info/articles/VK_KHR_dedicated_allocation.php5) + + + +\page vk_amd_device_coherent_memory VK_AMD_device_coherent_memory + +VK_AMD_device_coherent_memory is a device extension that enables access to +additional memory types with `VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD` and +`VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD` flag. It is useful mostly for +allocation of buffers intended for writing "breadcrumb markers" in between passes +or draw calls, which in turn are useful for debugging GPU crash/hang/TDR cases. + +When the extension is available but has not been enabled, Vulkan physical device +still exposes those memory types, but their usage is forbidden. VMA automatically +takes care of that - it returns `VK_ERROR_FEATURE_NOT_PRESENT` when an attempt +to allocate memory of such type is made. + +If you want to use this extension in connection with VMA, follow these steps: + +\section vk_amd_device_coherent_memory_initialization Initialization + +1) Call `vkEnumerateDeviceExtensionProperties` for the physical device. +Check if the extension is supported - if returned array of `VkExtensionProperties` contains "VK_AMD_device_coherent_memory". + +2) Call `vkGetPhysicalDeviceFeatures2` for the physical device instead of old `vkGetPhysicalDeviceFeatures`. +Attach additional structure `VkPhysicalDeviceCoherentMemoryFeaturesAMD` to `VkPhysicalDeviceFeatures2::pNext` to be returned. +Check if the device feature is really supported - check if `VkPhysicalDeviceCoherentMemoryFeaturesAMD::deviceCoherentMemory` is true. + +3) While creating device with `vkCreateDevice`, enable this extension - add "VK_AMD_device_coherent_memory" +to the list passed as `VkDeviceCreateInfo::ppEnabledExtensionNames`. 
+ +4) While creating the device, also don't set `VkDeviceCreateInfo::pEnabledFeatures`. +Fill in `VkPhysicalDeviceFeatures2` structure instead and pass it as `VkDeviceCreateInfo::pNext`. +Enable this device feature - attach additional structure `VkPhysicalDeviceCoherentMemoryFeaturesAMD` to +`VkPhysicalDeviceFeatures2::pNext` and set its member `deviceCoherentMemory` to `VK_TRUE`. + +5) While creating #VmaAllocator with vmaCreateAllocator() inform VMA that you +have enabled this extension and feature - add #VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT +to VmaAllocatorCreateInfo::flags. + +\section vk_amd_device_coherent_memory_usage Usage + +After following steps described above, you can create VMA allocations and custom pools +out of the special `DEVICE_COHERENT` and `DEVICE_UNCACHED` memory types on eligible +devices. There are multiple ways to do it, for example: + +- You can request or prefer to allocate out of such memory types by adding + `VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD` to VmaAllocationCreateInfo::requiredFlags + or VmaAllocationCreateInfo::preferredFlags. Those flags can be freely mixed with + other ways of \ref choosing_memory_type, like setting VmaAllocationCreateInfo::usage. +- If you manually found memory type index to use for this purpose, force allocation + from this specific index by setting VmaAllocationCreateInfo::memoryTypeBits `= 1u << index`. + +\section vk_amd_device_coherent_memory_more_information More information + +To learn more about this extension, see [VK_AMD_device_coherent_memory in Vulkan specification](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap44.html#VK_AMD_device_coherent_memory) + +Example use of this extension can be found in the code of the sample and test suite +accompanying this library. 
+ + +\page enabling_buffer_device_address Enabling buffer device address + +Device extension VK_KHR_buffer_device_address +allows you to fetch a raw GPU pointer to a buffer and pass it for usage in shader code. +It is promoted to core Vulkan 1.2. + +If you want to use this feature in connection with VMA, follow these steps: + +\section enabling_buffer_device_address_initialization Initialization + +1) (For Vulkan version < 1.2) Call `vkEnumerateDeviceExtensionProperties` for the physical device. +Check if the extension is supported - if returned array of `VkExtensionProperties` contains +"VK_KHR_buffer_device_address". + +2) Call `vkGetPhysicalDeviceFeatures2` for the physical device instead of old `vkGetPhysicalDeviceFeatures`. +Attach additional structure `VkPhysicalDeviceBufferDeviceAddressFeatures*` to `VkPhysicalDeviceFeatures2::pNext` to be returned. +Check if the device feature is really supported - check if `VkPhysicalDeviceBufferDeviceAddressFeatures*::bufferDeviceAddress` is true. + +3) (For Vulkan version < 1.2) While creating device with `vkCreateDevice`, enable this extension - add +"VK_KHR_buffer_device_address" to the list passed as `VkDeviceCreateInfo::ppEnabledExtensionNames`. + +4) While creating the device, also don't set `VkDeviceCreateInfo::pEnabledFeatures`. +Fill in `VkPhysicalDeviceFeatures2` structure instead and pass it as `VkDeviceCreateInfo::pNext`. +Enable this device feature - attach additional structure `VkPhysicalDeviceBufferDeviceAddressFeatures*` to +`VkPhysicalDeviceFeatures2::pNext` and set its member `bufferDeviceAddress` to `VK_TRUE`. + +5) While creating #VmaAllocator with vmaCreateAllocator() inform VMA that you +have enabled this feature - add #VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT +to VmaAllocatorCreateInfo::flags. + +\section enabling_buffer_device_address_usage Usage + +After following steps described above, you can create buffers with `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT*` using VMA.
+The library automatically adds `VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT*` to +allocated memory blocks wherever it might be needed. + +Please note that the library supports only `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT*`. +The second part of this functionality related to "capture and replay" is not supported, +as it is intended for usage in debugging tools like RenderDoc, not in everyday Vulkan usage. + +\section enabling_buffer_device_address_more_information More information + +To learn more about this extension, see [VK_KHR_buffer_device_address in Vulkan specification](https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/chap46.html#VK_KHR_buffer_device_address) + +Example use of this extension can be found in the code of the sample and test suite +accompanying this library. + +\page general_considerations General considerations + +\section general_considerations_thread_safety Thread safety + +- The library has no global state, so separate #VmaAllocator objects can be used + independently. + There should be no need to create multiple such objects though - one per `VkDevice` is enough. +- By default, all calls to functions that take #VmaAllocator as first parameter + are safe to call from multiple threads simultaneously because they are + synchronized internally when needed. +- When the allocator is created with #VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT + flag, calls to functions that take such #VmaAllocator object must be + synchronized externally. +- Access to a #VmaAllocation object must be externally synchronized. For example, + you must not call vmaGetAllocationInfo() and vmaMapMemory() from different + threads at the same time if you pass the same #VmaAllocation object to these + functions. + +\section general_considerations_validation_layer_warnings Validation layer warnings + +When using this library, you can meet following types of warnings issued by +Vulkan validation layer. 
They don't necessarily indicate a bug, so you may need +to just ignore them. + +- *vkBindBufferMemory(): Binding memory to buffer 0xeb8e4 but vkGetBufferMemoryRequirements() has not been called on that buffer.* + - It happens when VK_KHR_dedicated_allocation extension is enabled. + `vkGetBufferMemoryRequirements2KHR` function is used instead, while validation layer seems to be unaware of it. +- *Mapping an image with layout VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL can result in undefined behavior if this memory is used by the device. Only GENERAL or PREINITIALIZED should be used.* + - It happens when you map a buffer or image, because the library maps entire + `VkDeviceMemory` block, where different types of images and buffers may end + up together, especially on GPUs with unified memory like Intel. +- *Non-linear image 0xebc91 is aliased with linear buffer 0xeb8e4 which may indicate a bug.* + - It happens when you use lost allocations, and a new image or buffer is + created in place of an existing object that became lost. + - It may happen also when you use [defragmentation](@ref defragmentation). + +\section general_considerations_allocation_algorithm Allocation algorithm + +The library uses following algorithm for allocation, in order: + +-# Try to find free range of memory in existing blocks. +-# If failed, try to create a new block of `VkDeviceMemory`, with preferred block size. +-# If failed, try to create such block with size/2, size/4, size/8. +-# If failed and #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flag was + specified, try to find space in existing blocks, possibly making some other + allocations lost. +-# If failed, try to allocate separate `VkDeviceMemory` for this allocation, + just like when you use #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. +-# If failed, choose other memory type that meets the requirements specified in + VmaAllocationCreateInfo and go to point 1. +-# If failed, return `VK_ERROR_OUT_OF_DEVICE_MEMORY`.
+ +\section general_considerations_features_not_supported Features not supported + +Features deliberately excluded from the scope of this library: + +- Data transfer. Uploading (streaming) and downloading data of buffers and images + between CPU and GPU memory and related synchronization is responsibility of the user. + Defining some "texture" object that would automatically stream its data from a + staging copy in CPU memory to GPU memory would rather be a feature of another, + higher-level library implemented on top of VMA. +- Allocations for imported/exported external memory. They tend to require + explicit memory type index and dedicated allocation anyway, so they don't + interact with main features of this library. Such special purpose allocations + should be made manually, using `vkCreateBuffer()` and `vkAllocateMemory()`. +- Sub-allocation of parts of one large buffer. Although recommended as a good practice, + it is the user's responsibility to implement such logic on top of VMA. +- Recreation of buffers and images. Although the library has functions for + buffer and image creation (vmaCreateBuffer(), vmaCreateImage()), you need to + recreate these objects yourself after defragmentation. That's because the big + structures `VkBufferCreateInfo`, `VkImageCreateInfo` are not stored in + #VmaAllocation object. +- Handling CPU memory allocation failures. When dynamically creating small C++ + objects in CPU memory (not Vulkan memory), allocation failures are not checked + and handled gracefully, because that would complicate code significantly and + is usually not needed in desktop PC applications anyway. + Success of an allocation is just checked with an assert. +- Code free of any compiler warnings. Maintaining the library to compile and + work correctly on so many different platforms is hard enough. Being free of + any warnings, on any version of any compiler, is simply not feasible. +- This is a C++ library with C interface.
+ Bindings or ports to any other programming languages are welcomed as external projects and + are not going to be included into this repository. + +*/ + +#ifdef __cplusplus +extern "C" { +#endif + +/* +Define this macro to 0/1 to disable/enable support for recording functionality, +available through VmaAllocatorCreateInfo::pRecordSettings. +*/ +#ifndef VMA_RECORDING_ENABLED + #define VMA_RECORDING_ENABLED 0 +#endif + +#if !defined(NOMINMAX) && defined(VMA_IMPLEMENTATION) + #define NOMINMAX // For windows.h +#endif + +#if defined(__ANDROID__) && defined(VK_NO_PROTOTYPES) && VMA_STATIC_VULKAN_FUNCTIONS + extern PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr; + extern PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr; + extern PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties; + extern PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties; + extern PFN_vkAllocateMemory vkAllocateMemory; + extern PFN_vkFreeMemory vkFreeMemory; + extern PFN_vkMapMemory vkMapMemory; + extern PFN_vkUnmapMemory vkUnmapMemory; + extern PFN_vkFlushMappedMemoryRanges vkFlushMappedMemoryRanges; + extern PFN_vkInvalidateMappedMemoryRanges vkInvalidateMappedMemoryRanges; + extern PFN_vkBindBufferMemory vkBindBufferMemory; + extern PFN_vkBindImageMemory vkBindImageMemory; + extern PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements; + extern PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements; + extern PFN_vkCreateBuffer vkCreateBuffer; + extern PFN_vkDestroyBuffer vkDestroyBuffer; + extern PFN_vkCreateImage vkCreateImage; + extern PFN_vkDestroyImage vkDestroyImage; + extern PFN_vkCmdCopyBuffer vkCmdCopyBuffer; + #if VMA_VULKAN_VERSION >= 1001000 + extern PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2; + extern PFN_vkGetImageMemoryRequirements2 vkGetImageMemoryRequirements2; + extern PFN_vkBindBufferMemory2 vkBindBufferMemory2; + extern PFN_vkBindImageMemory2 vkBindImageMemory2; + extern
PFN_vkGetPhysicalDeviceMemoryProperties2 vkGetPhysicalDeviceMemoryProperties2; + #endif // #if VMA_VULKAN_VERSION >= 1001000 +#endif // #if defined(__ANDROID__) && VMA_STATIC_VULKAN_FUNCTIONS && VK_NO_PROTOTYPES + +#ifndef VULKAN_H_ + #include <vulkan/vulkan.h> +#endif + +// Define this macro to declare maximum supported Vulkan version in format AAABBBCCC, +// where AAA = major, BBB = minor, CCC = patch. +// If you want to use version > 1.0, it still needs to be enabled via VmaAllocatorCreateInfo::vulkanApiVersion. +#if !defined(VMA_VULKAN_VERSION) + #if defined(VK_VERSION_1_2) + #define VMA_VULKAN_VERSION 1002000 + #elif defined(VK_VERSION_1_1) + #define VMA_VULKAN_VERSION 1001000 + #else + #define VMA_VULKAN_VERSION 1000000 + #endif +#endif + +#if !defined(VMA_DEDICATED_ALLOCATION) + #if VK_KHR_get_memory_requirements2 && VK_KHR_dedicated_allocation + #define VMA_DEDICATED_ALLOCATION 1 + #else + #define VMA_DEDICATED_ALLOCATION 0 + #endif +#endif + +#if !defined(VMA_BIND_MEMORY2) + #if VK_KHR_bind_memory2 + #define VMA_BIND_MEMORY2 1 + #else + #define VMA_BIND_MEMORY2 0 + #endif +#endif + +#if !defined(VMA_MEMORY_BUDGET) + #if VK_EXT_memory_budget && (VK_KHR_get_physical_device_properties2 || VMA_VULKAN_VERSION >= 1001000) + #define VMA_MEMORY_BUDGET 1 + #else + #define VMA_MEMORY_BUDGET 0 + #endif +#endif + +// Defined to 1 when VK_KHR_buffer_device_address device extension or equivalent core Vulkan 1.2 feature is defined in its headers. +#if !defined(VMA_BUFFER_DEVICE_ADDRESS) + #if VK_KHR_buffer_device_address || VMA_VULKAN_VERSION >= 1002000 + #define VMA_BUFFER_DEVICE_ADDRESS 1 + #else + #define VMA_BUFFER_DEVICE_ADDRESS 0 + #endif +#endif + +// Defined to 1 when VK_EXT_memory_priority device extension is defined in Vulkan headers.
+#if !defined(VMA_MEMORY_PRIORITY) + #if VK_EXT_memory_priority + #define VMA_MEMORY_PRIORITY 1 + #else + #define VMA_MEMORY_PRIORITY 0 + #endif +#endif + +// Define these macros to decorate all public functions with additional code, +// before and after returned type, appropriately. This may be useful for +// exporting the functions when compiling VMA as a separate library. Example: +// #define VMA_CALL_PRE __declspec(dllexport) +// #define VMA_CALL_POST __cdecl +#ifndef VMA_CALL_PRE + #define VMA_CALL_PRE +#endif +#ifndef VMA_CALL_POST + #define VMA_CALL_POST +#endif + +// Define this macro to decorate pointers with an attribute specifying the +// length of the array they point to if they are not null. +// +// The length may be one of +// - The name of another parameter in the argument list where the pointer is declared +// - The name of another member in the struct where the pointer is declared +// - The name of a member of a struct type, meaning the value of that member in +// the context of the call. For example +// VMA_LEN_IF_NOT_NULL("VkPhysicalDeviceMemoryProperties::memoryHeapCount"), +// this means the number of memory heaps available in the device associated +// with the VmaAllocator being dealt with. +#ifndef VMA_LEN_IF_NOT_NULL + #define VMA_LEN_IF_NOT_NULL(len) +#endif + +// The VMA_NULLABLE macro is defined to be _Nullable when compiling with Clang. +// see: https://clang.llvm.org/docs/AttributeReference.html#nullable +#ifndef VMA_NULLABLE + #ifdef __clang__ + #define VMA_NULLABLE _Nullable + #else + #define VMA_NULLABLE + #endif +#endif + +// The VMA_NOT_NULL macro is defined to be _Nonnull when compiling with Clang.
+// see: https://clang.llvm.org/docs/AttributeReference.html#nonnull +#ifndef VMA_NOT_NULL + #ifdef __clang__ + #define VMA_NOT_NULL _Nonnull + #else + #define VMA_NOT_NULL + #endif +#endif + +// If non-dispatchable handles are represented as pointers then we can give +// them nullability annotations +#ifndef VMA_NOT_NULL_NON_DISPATCHABLE + #if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__) ) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__) + #define VMA_NOT_NULL_NON_DISPATCHABLE VMA_NOT_NULL + #else + #define VMA_NOT_NULL_NON_DISPATCHABLE + #endif +#endif + +#ifndef VMA_NULLABLE_NON_DISPATCHABLE + #if defined(__LP64__) || defined(_WIN64) || (defined(__x86_64__) && !defined(__ILP32__) ) || defined(_M_X64) || defined(__ia64) || defined (_M_IA64) || defined(__aarch64__) || defined(__powerpc64__) + #define VMA_NULLABLE_NON_DISPATCHABLE VMA_NULLABLE + #else + #define VMA_NULLABLE_NON_DISPATCHABLE + #endif +#endif + +/** \struct VmaAllocator +\brief Represents main object of this library initialized. + +Fill structure #VmaAllocatorCreateInfo and call function vmaCreateAllocator() to create it. +Call function vmaDestroyAllocator() to destroy it. + +It is recommended to create just one object of this type per `VkDevice` object, +right after Vulkan is initialized and keep it alive until before Vulkan device is destroyed. +*/ +VK_DEFINE_HANDLE(VmaAllocator) + +/// Callback function called after successful vkAllocateMemory. +typedef void (VKAPI_PTR *PFN_vmaAllocateDeviceMemoryFunction)( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t memoryType, + VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory, + VkDeviceSize size, + void* VMA_NULLABLE pUserData); +/// Callback function called before vkFreeMemory.
+typedef void (VKAPI_PTR *PFN_vmaFreeDeviceMemoryFunction)( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t memoryType, + VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory, + VkDeviceSize size, + void* VMA_NULLABLE pUserData); + +/** \brief Set of callbacks that the library will call for `vkAllocateMemory` and `vkFreeMemory`. + +Provided for informative purpose, e.g. to gather statistics about number of +allocations or total amount of memory allocated in Vulkan. + +Used in VmaAllocatorCreateInfo::pDeviceMemoryCallbacks. +*/ +typedef struct VmaDeviceMemoryCallbacks { + /// Optional, can be null. + PFN_vmaAllocateDeviceMemoryFunction VMA_NULLABLE pfnAllocate; + /// Optional, can be null. + PFN_vmaFreeDeviceMemoryFunction VMA_NULLABLE pfnFree; + /// Optional, can be null. + void* VMA_NULLABLE pUserData; +} VmaDeviceMemoryCallbacks; + +/// Flags for created #VmaAllocator. +typedef enum VmaAllocatorCreateFlagBits { + /** \brief Allocator and all objects created from it will not be synchronized internally, so you must guarantee they are used from only one thread at a time or synchronized externally by you. + + Using this flag may increase performance because internal mutexes are not used. + */ + VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT = 0x00000001, + /** \brief Enables usage of VK_KHR_dedicated_allocation extension. + + The flag works only if VmaAllocatorCreateInfo::vulkanApiVersion `== VK_API_VERSION_1_0`. + When it's `VK_API_VERSION_1_1`, the flag is ignored because the extension has been promoted to Vulkan 1.1. + + Using this extension will automatically allocate dedicated blocks of memory for + some buffers and images instead of suballocating place for them out of bigger + memory blocks (as if you explicitly used #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT + flag) when it is recommended by the driver. It may improve performance on some + GPUs.
+ + You may set this flag only if you found out that following device extensions are + supported, you enabled them while creating Vulkan device passed as + VmaAllocatorCreateInfo::device, and you want them to be used internally by this + library: + + - VK_KHR_get_memory_requirements2 (device extension) + - VK_KHR_dedicated_allocation (device extension) + + When this flag is set, you can experience following warnings reported by Vulkan + validation layer. You can ignore them. + + > vkBindBufferMemory(): Binding memory to buffer 0x2d but vkGetBufferMemoryRequirements() has not been called on that buffer. + */ + VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT = 0x00000002, + /** + Enables usage of VK_KHR_bind_memory2 extension. + + The flag works only if VmaAllocatorCreateInfo::vulkanApiVersion `== VK_API_VERSION_1_0`. + When it's `VK_API_VERSION_1_1`, the flag is ignored because the extension has been promoted to Vulkan 1.1. + + You may set this flag only if you found out that this device extension is supported, + you enabled it while creating Vulkan device passed as VmaAllocatorCreateInfo::device, + and you want it to be used internally by this library. + + The extension provides functions `vkBindBufferMemory2KHR` and `vkBindImageMemory2KHR`, + which allow to pass a chain of `pNext` structures while binding. + This flag is required if you use `pNext` parameter in vmaBindBufferMemory2() or vmaBindImageMemory2(). + */ + VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT = 0x00000004, + /** + Enables usage of VK_EXT_memory_budget extension. + + You may set this flag only if you found out that this device extension is supported, + you enabled it while creating Vulkan device passed as VmaAllocatorCreateInfo::device, + and you want it to be used internally by this library, along with another instance extension + VK_KHR_get_physical_device_properties2, which is required by it (or Vulkan 1.1, where this extension is promoted).
+ + The extension provides query for current memory usage and budget, which will probably + be more accurate than an estimation used by the library otherwise. + */ + VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT = 0x00000008, + /** + Enables usage of VK_AMD_device_coherent_memory extension. + + You may set this flag only if you: + + - found out that this device extension is supported and enabled it while creating Vulkan device passed as VmaAllocatorCreateInfo::device, + - checked that `VkPhysicalDeviceCoherentMemoryFeaturesAMD::deviceCoherentMemory` is true and set it while creating the Vulkan device, + - want it to be used internally by this library. + + The extension and accompanying device feature provide access to memory types with + `VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD` and `VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD` flags. + They are useful mostly for writing breadcrumb markers - a common method for debugging GPU crash/hang/TDR. + + When the extension is not enabled, such memory types are still enumerated, but their usage is illegal. + To protect from this error, if you don't create the allocator with this flag, it will refuse to allocate any memory or create a custom pool in such memory type, + returning `VK_ERROR_FEATURE_NOT_PRESENT`. + */ + VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT = 0x00000010, + /** + Enables usage of "buffer device address" feature, which allows you to use function + `vkGetBufferDeviceAddress*` to get raw GPU pointer to a buffer and pass it for usage inside a shader. + + You may set this flag only if you: + + 1. (For Vulkan version < 1.2) Found as available and enabled device extension + VK_KHR_buffer_device_address. + This extension is promoted to core Vulkan 1.2. + 2. Found as available and enabled device feature `VkPhysicalDeviceBufferDeviceAddressFeatures::bufferDeviceAddress`. + + When this flag is set, you can create buffers with `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT` using VMA.
+ The library automatically adds `VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT` to + allocated memory blocks wherever it might be needed. + + For more information, see documentation chapter \ref enabling_buffer_device_address. + */ + VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT = 0x00000020, + /** + Enables usage of VK_EXT_memory_priority extension in the library. + + You may set this flag only if you found available and enabled this device extension, + along with `VkPhysicalDeviceMemoryPriorityFeaturesEXT::memoryPriority == VK_TRUE`, + while creating Vulkan device passed as VmaAllocatorCreateInfo::device. + + When this flag is used, VmaAllocationCreateInfo::priority and VmaPoolCreateInfo::priority + are used to set priorities of allocated Vulkan memory. Without it, these variables are ignored. + + A priority must be a floating-point value between 0 and 1, indicating the priority of the allocation relative to other memory allocations. + Larger values are higher priority. The granularity of the priorities is implementation-dependent. + It is automatically passed to every call to `vkAllocateMemory` done by the library using structure `VkMemoryPriorityAllocateInfoEXT`. + The value to be used for default priority is 0.5. + For more details, see the documentation of the VK_EXT_memory_priority extension. + */ + VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT = 0x00000040, + + VMA_ALLOCATOR_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VmaAllocatorCreateFlagBits; +typedef VkFlags VmaAllocatorCreateFlags; + +/** \brief Pointers to some Vulkan functions - a subset used by the library. + +Used in VmaAllocatorCreateInfo::pVulkanFunctions.
+*/ +typedef struct VmaVulkanFunctions { + PFN_vkGetPhysicalDeviceProperties VMA_NULLABLE vkGetPhysicalDeviceProperties; + PFN_vkGetPhysicalDeviceMemoryProperties VMA_NULLABLE vkGetPhysicalDeviceMemoryProperties; + PFN_vkAllocateMemory VMA_NULLABLE vkAllocateMemory; + PFN_vkFreeMemory VMA_NULLABLE vkFreeMemory; + PFN_vkMapMemory VMA_NULLABLE vkMapMemory; + PFN_vkUnmapMemory VMA_NULLABLE vkUnmapMemory; + PFN_vkFlushMappedMemoryRanges VMA_NULLABLE vkFlushMappedMemoryRanges; + PFN_vkInvalidateMappedMemoryRanges VMA_NULLABLE vkInvalidateMappedMemoryRanges; + PFN_vkBindBufferMemory VMA_NULLABLE vkBindBufferMemory; + PFN_vkBindImageMemory VMA_NULLABLE vkBindImageMemory; + PFN_vkGetBufferMemoryRequirements VMA_NULLABLE vkGetBufferMemoryRequirements; + PFN_vkGetImageMemoryRequirements VMA_NULLABLE vkGetImageMemoryRequirements; + PFN_vkCreateBuffer VMA_NULLABLE vkCreateBuffer; + PFN_vkDestroyBuffer VMA_NULLABLE vkDestroyBuffer; + PFN_vkCreateImage VMA_NULLABLE vkCreateImage; + PFN_vkDestroyImage VMA_NULLABLE vkDestroyImage; + PFN_vkCmdCopyBuffer VMA_NULLABLE vkCmdCopyBuffer; +#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + PFN_vkGetBufferMemoryRequirements2KHR VMA_NULLABLE vkGetBufferMemoryRequirements2KHR; + PFN_vkGetImageMemoryRequirements2KHR VMA_NULLABLE vkGetImageMemoryRequirements2KHR; +#endif +#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000 + PFN_vkBindBufferMemory2KHR VMA_NULLABLE vkBindBufferMemory2KHR; + PFN_vkBindImageMemory2KHR VMA_NULLABLE vkBindImageMemory2KHR; +#endif +#if VMA_MEMORY_BUDGET || VMA_VULKAN_VERSION >= 1001000 + PFN_vkGetPhysicalDeviceMemoryProperties2KHR VMA_NULLABLE vkGetPhysicalDeviceMemoryProperties2KHR; +#endif +} VmaVulkanFunctions; + +/// Flags to be used in VmaRecordSettings::flags. +typedef enum VmaRecordFlagBits { + /** \brief Enables flush after recording every function call. + + Enable it if you expect your application to crash, which may leave recording file truncated. + It may degrade performance though. 
+ */ + VMA_RECORD_FLUSH_AFTER_CALL_BIT = 0x00000001, + + VMA_RECORD_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VmaRecordFlagBits; +typedef VkFlags VmaRecordFlags; + +/// Parameters for recording calls to VMA functions. To be used in VmaAllocatorCreateInfo::pRecordSettings. +typedef struct VmaRecordSettings +{ + /// Flags for recording. Use #VmaRecordFlagBits enum. + VmaRecordFlags flags; + /** \brief Path to the file that should be written by the recording. + + Suggested extension: "csv". + If the file already exists, it will be overwritten. + It will be opened for the whole time #VmaAllocator object is alive. + If opening this file fails, creation of the whole allocator object fails. + */ + const char* VMA_NOT_NULL pFilePath; +} VmaRecordSettings; + +/// Description of a Allocator to be created. +typedef struct VmaAllocatorCreateInfo +{ + /// Flags for created allocator. Use #VmaAllocatorCreateFlagBits enum. + VmaAllocatorCreateFlags flags; + /// Vulkan physical device. + /** It must be valid throughout whole lifetime of created allocator. */ + VkPhysicalDevice VMA_NOT_NULL physicalDevice; + /// Vulkan device. + /** It must be valid throughout whole lifetime of created allocator. */ + VkDevice VMA_NOT_NULL device; + /// Preferred size of a single `VkDeviceMemory` block to be allocated from large heaps > 1 GiB. Optional. + /** Set to 0 to use default, which is currently 256 MiB. */ + VkDeviceSize preferredLargeHeapBlockSize; + /// Custom CPU memory allocation callbacks. Optional. + /** Optional, can be null. When specified, will also be used for all CPU-side memory allocations. */ + const VkAllocationCallbacks* VMA_NULLABLE pAllocationCallbacks; + /// Informative callbacks for `vkAllocateMemory`, `vkFreeMemory`. Optional. + /** Optional, can be null. */ + const VmaDeviceMemoryCallbacks* VMA_NULLABLE pDeviceMemoryCallbacks; + /** \brief Maximum number of additional frames that are in use at the same time as current frame. 
+ + This value is used only when you make allocations with + VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag. Such allocation cannot become + lost if allocation.lastUseFrameIndex >= allocator.currentFrameIndex - frameInUseCount. + + For example, if you double-buffer your command buffers, so resources used for + rendering in previous frame may still be in use by the GPU at the moment you + allocate resources needed for the current frame, set this value to 1. + + If you want to allow any allocations other than used in the current frame to + become lost, set this value to 0. + */ + uint32_t frameInUseCount; + /** \brief Either null or a pointer to an array of limits on maximum number of bytes that can be allocated out of particular Vulkan memory heap. + + If not NULL, it must be a pointer to an array of + `VkPhysicalDeviceMemoryProperties::memoryHeapCount` elements, defining limit on + maximum number of bytes that can be allocated out of particular Vulkan memory + heap. + + Any of the elements may be equal to `VK_WHOLE_SIZE`, which means no limit on that + heap. This is also the default in case of `pHeapSizeLimit` = NULL. + + If there is a limit defined for a heap: + + - If user tries to allocate more memory from that heap using this allocator, + the allocation fails with `VK_ERROR_OUT_OF_DEVICE_MEMORY`. + - If the limit is smaller than heap size reported in `VkMemoryHeap::size`, the + value of this limit will be reported instead when using vmaGetMemoryProperties(). + + Warning! Using this feature may not be equivalent to installing a GPU with + smaller amount of memory, because graphics driver doesn't necessarily fail new + allocations with `VK_ERROR_OUT_OF_DEVICE_MEMORY` result when memory capacity is + exceeded. It may return success and just silently migrate some device memory + blocks to system RAM. This driver behavior can also be controlled using + VK_AMD_memory_overallocation_behavior extension.
+ */ + const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL("VkPhysicalDeviceMemoryProperties::memoryHeapCount") pHeapSizeLimit; + + /** \brief Pointers to Vulkan functions. Can be null. + + For details see [Pointers to Vulkan functions](@ref config_Vulkan_functions). + */ + const VmaVulkanFunctions* VMA_NULLABLE pVulkanFunctions; + /** \brief Parameters for recording of VMA calls. Can be null. + + If not null, it enables recording of calls to VMA functions to a file. + If support for recording is not enabled using `VMA_RECORDING_ENABLED` macro, + creation of the allocator object fails with `VK_ERROR_FEATURE_NOT_PRESENT`. + */ + const VmaRecordSettings* VMA_NULLABLE pRecordSettings; + /** \brief Handle to Vulkan instance object. + + Starting from version 3.0.0 this member is no longer optional, it must be set! + */ + VkInstance VMA_NOT_NULL instance; + /** \brief Optional. The highest version of Vulkan that the application is designed to use. + + It must be a value in the format as created by macro `VK_MAKE_VERSION` or a constant like: `VK_API_VERSION_1_1`, `VK_API_VERSION_1_0`. + The patch version number specified is ignored. Only the major and minor versions are considered. + It must be less than or equal (preferably equal) to the value passed to `vkCreateInstance` as `VkApplicationInfo::apiVersion`. + Only versions 1.0, 1.1, 1.2 are supported by the current implementation. + Leaving it initialized to zero is equivalent to `VK_API_VERSION_1_0`. + */ + uint32_t vulkanApiVersion; +} VmaAllocatorCreateInfo; + +/// Creates Allocator object. +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateAllocator( + const VmaAllocatorCreateInfo* VMA_NOT_NULL pCreateInfo, + VmaAllocator VMA_NULLABLE * VMA_NOT_NULL pAllocator); + +/// Destroys allocator object. +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyAllocator( + VmaAllocator VMA_NULLABLE allocator); + +/** \brief Information about existing #VmaAllocator object.
+*/ +typedef struct VmaAllocatorInfo +{ + /** \brief Handle to Vulkan instance object. + + This is the same value as has been passed through VmaAllocatorCreateInfo::instance. + */ + VkInstance VMA_NOT_NULL instance; + /** \brief Handle to Vulkan physical device object. + + This is the same value as has been passed through VmaAllocatorCreateInfo::physicalDevice. + */ + VkPhysicalDevice VMA_NOT_NULL physicalDevice; + /** \brief Handle to Vulkan device object. + + This is the same value as has been passed through VmaAllocatorCreateInfo::device. + */ + VkDevice VMA_NOT_NULL device; +} VmaAllocatorInfo; + +/** \brief Returns information about existing #VmaAllocator object - handle to Vulkan device etc. + +It might be useful if you want to keep just the #VmaAllocator handle and fetch other required handles to +`VkPhysicalDevice`, `VkDevice` etc. every time using this function. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocatorInfo(VmaAllocator VMA_NOT_NULL allocator, VmaAllocatorInfo* VMA_NOT_NULL pAllocatorInfo); + +/** +PhysicalDeviceProperties are fetched from physicalDevice by the allocator. +You can access it here, without fetching it again on your own. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetPhysicalDeviceProperties( + VmaAllocator VMA_NOT_NULL allocator, + const VkPhysicalDeviceProperties* VMA_NULLABLE * VMA_NOT_NULL ppPhysicalDeviceProperties); + +/** +PhysicalDeviceMemoryProperties are fetched from physicalDevice by the allocator. +You can access it here, without fetching it again on your own. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryProperties( + VmaAllocator VMA_NOT_NULL allocator, + const VkPhysicalDeviceMemoryProperties* VMA_NULLABLE * VMA_NOT_NULL ppPhysicalDeviceMemoryProperties); + +/** +\brief Given Memory Type Index, returns Property Flags of this memory type. + +This is just a convenience function. Same information can be obtained using +vmaGetMemoryProperties(). 
+*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryTypeProperties( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t memoryTypeIndex, + VkMemoryPropertyFlags* VMA_NOT_NULL pFlags); + +/** \brief Sets index of the current frame. + +This function must be used if you make allocations with +#VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT and +#VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flags to inform the allocator +when a new frame begins. Allocations queried using vmaGetAllocationInfo() cannot +become lost in the current frame. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaSetCurrentFrameIndex( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t frameIndex); + +/** \brief Calculated statistics of memory usage in entire allocator. +*/ +typedef struct VmaStatInfo +{ + /// Number of `VkDeviceMemory` Vulkan memory blocks allocated. + uint32_t blockCount; + /// Number of #VmaAllocation allocation objects allocated. + uint32_t allocationCount; + /// Number of free ranges of memory between allocations. + uint32_t unusedRangeCount; + /// Total number of bytes occupied by all allocations. + VkDeviceSize usedBytes; + /// Total number of bytes occupied by unused ranges. + VkDeviceSize unusedBytes; + VkDeviceSize allocationSizeMin, allocationSizeAvg, allocationSizeMax; + VkDeviceSize unusedRangeSizeMin, unusedRangeSizeAvg, unusedRangeSizeMax; +} VmaStatInfo; + +/// General statistics from current state of Allocator. +typedef struct VmaStats +{ + VmaStatInfo memoryType[VK_MAX_MEMORY_TYPES]; + VmaStatInfo memoryHeap[VK_MAX_MEMORY_HEAPS]; + VmaStatInfo total; +} VmaStats; + +/** \brief Retrieves statistics from current state of the Allocator. + +This function is called "calculate" not "get" because it has to traverse all +internal data structures, so it may be quite slow. For faster but more brief statistics +suitable to be called every frame or every allocation, use vmaGetBudget(). + +Note that when using allocator from multiple threads, returned information may immediately +become outdated. 
+*/ +VMA_CALL_PRE void VMA_CALL_POST vmaCalculateStats( + VmaAllocator VMA_NOT_NULL allocator, + VmaStats* VMA_NOT_NULL pStats); + +/** \brief Statistics of current memory usage and available budget, in bytes, for specific memory heap. +*/ +typedef struct VmaBudget +{ + /** \brief Sum size of all `VkDeviceMemory` blocks allocated from particular heap, in bytes. + */ + VkDeviceSize blockBytes; + + /** \brief Sum size of all allocations created in particular heap, in bytes. + + Usually less than or equal to `blockBytes`. + Difference `blockBytes - allocationBytes` is the amount of memory allocated but unused - + available for new allocations or wasted due to fragmentation. + + It might be greater than `blockBytes` if there are some allocations in lost state, as they account + to this value as well. + */ + VkDeviceSize allocationBytes; + + /** \brief Estimated current memory usage of the program, in bytes. + + Fetched from system using `VK_EXT_memory_budget` extension if enabled. + + It might be different than `blockBytes` (usually higher) due to additional implicit objects + also occupying the memory, like swapchain, pipelines, descriptor heaps, command buffers, or + `VkDeviceMemory` blocks allocated outside of this library, if any. + */ + VkDeviceSize usage; + + /** \brief Estimated amount of memory available to the program, in bytes. + + Fetched from system using `VK_EXT_memory_budget` extension if enabled. + + It might be different (most probably smaller) than `VkMemoryHeap::size[heapIndex]` due to factors + external to the program, like other programs also consuming system resources. + Difference `budget - usage` is the amount of additional memory that can probably + be allocated without problems. Exceeding the budget may result in various problems. + */ + VkDeviceSize budget; +} VmaBudget; + +/** \brief Retrieves information about current memory budget for all memory heaps.
+ +\param[out] pBudget Must point to array with number of elements at least equal to number of memory heaps in physical device used. + +This function is called "get" not "calculate" because it is very fast, suitable to be called +every frame or every allocation. For more detailed statistics use vmaCalculateStats(). + +Note that when using allocator from multiple threads, returned information may immediately +become outdated. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetBudget( + VmaAllocator VMA_NOT_NULL allocator, + VmaBudget* VMA_NOT_NULL pBudget); + +#ifndef VMA_STATS_STRING_ENABLED +#define VMA_STATS_STRING_ENABLED 1 +#endif + +#if VMA_STATS_STRING_ENABLED + +/// Builds and returns statistics as string in JSON format. +/** @param[out] ppStatsString Must be freed using vmaFreeStatsString() function. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaBuildStatsString( + VmaAllocator VMA_NOT_NULL allocator, + char* VMA_NULLABLE * VMA_NOT_NULL ppStatsString, + VkBool32 detailedMap); + +VMA_CALL_PRE void VMA_CALL_POST vmaFreeStatsString( + VmaAllocator VMA_NOT_NULL allocator, + char* VMA_NULLABLE pStatsString); + +#endif // #if VMA_STATS_STRING_ENABLED + +/** \struct VmaPool +\brief Represents custom memory pool + +Fill structure VmaPoolCreateInfo and call function vmaCreatePool() to create it. +Call function vmaDestroyPool() to destroy it. + +For more information see [Custom memory pools](@ref choosing_memory_type_custom_memory_pools). +*/ +VK_DEFINE_HANDLE(VmaPool) + +typedef enum VmaMemoryUsage +{ + /** No intended memory usage specified. + Use other members of VmaAllocationCreateInfo to specify your requirements. + */ + VMA_MEMORY_USAGE_UNKNOWN = 0, + /** Memory will be used on device only, so fast access from the device is preferred. + It usually means device-local GPU (video) memory. + No need to be mappable on host. + It is roughly equivalent of `D3D12_HEAP_TYPE_DEFAULT`. + + Usage: + + - Resources written and read by device, e.g. images used as attachments. 
+ - Resources transferred from host once (immutable) or infrequently and read by + device multiple times, e.g. textures to be sampled, vertex buffers, uniform + (constant) buffers, and majority of other types of resources used on GPU. + + Allocation may still end up in `HOST_VISIBLE` memory on some implementations. + In such case, you are free to map it. + You can use #VMA_ALLOCATION_CREATE_MAPPED_BIT with this usage type. + */ + VMA_MEMORY_USAGE_GPU_ONLY = 1, + /** Memory will be mappable on host. + It usually means CPU (system) memory. + Guarantees to be `HOST_VISIBLE` and `HOST_COHERENT`. + CPU access is typically uncached. Writes may be write-combined. + Resources created in this pool may still be accessible to the device, but access to them can be slow. + It is roughly equivalent of `D3D12_HEAP_TYPE_UPLOAD`. + + Usage: Staging copy of resources used as transfer source. + */ + VMA_MEMORY_USAGE_CPU_ONLY = 2, + /** + Memory that is both mappable on host (guarantees to be `HOST_VISIBLE`) and preferably fast to access by GPU. + CPU access is typically uncached. Writes may be write-combined. + + Usage: Resources written frequently by host (dynamic), read by device. E.g. textures (with LINEAR layout), vertex buffers, uniform buffers updated every frame or every draw call. + */ + VMA_MEMORY_USAGE_CPU_TO_GPU = 3, + /** Memory mappable on host (guarantees to be `HOST_VISIBLE`) and cached. + It is roughly equivalent of `D3D12_HEAP_TYPE_READBACK`. + + Usage: + + - Resources written by device, read by host - results of some computations, e.g. screen capture, average scene luminance for HDR tone mapping. + - Any resources read or accessed randomly on host, e.g. CPU-side copy of vertex buffer used as source of transfer, but also used for collision detection. + */ + VMA_MEMORY_USAGE_GPU_TO_CPU = 4, + /** CPU memory - memory that is preferably not `DEVICE_LOCAL`, but also not guaranteed to be `HOST_VISIBLE`. 
+ + Usage: Staging copy of resources moved from GPU memory to CPU memory as part + of custom paging/residency mechanism, to be moved back to GPU memory when needed. + */ + VMA_MEMORY_USAGE_CPU_COPY = 5, + /** Lazily allocated GPU memory having `VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT`. + Exists mostly on mobile platforms. Using it on desktop PC or other GPUs with no such memory type present will fail the allocation. + + Usage: Memory for transient attachment images (color attachments, depth attachments etc.), created with `VK_IMAGE_USAGE_TRANSIENT_ATTACHMENT_BIT`. + + Allocations with this usage are always created as dedicated - it implies #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. + */ + VMA_MEMORY_USAGE_GPU_LAZILY_ALLOCATED = 6, + + VMA_MEMORY_USAGE_MAX_ENUM = 0x7FFFFFFF +} VmaMemoryUsage; + +/// Flags to be passed as VmaAllocationCreateInfo::flags. +typedef enum VmaAllocationCreateFlagBits { + /** \brief Set this flag if the allocation should have its own memory block. + + Use it for special, big resources, like fullscreen images used as attachments. + + You should not use this flag if VmaAllocationCreateInfo::pool is not null. + */ + VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT = 0x00000001, + + /** \brief Set this flag to only try to allocate from existing `VkDeviceMemory` blocks and never create new such block. + + If new allocation cannot be placed in any of the existing blocks, allocation + fails with `VK_ERROR_OUT_OF_DEVICE_MEMORY` error. + + You should not use #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT and + #VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT at the same time. It makes no sense. + + If VmaAllocationCreateInfo::pool is not null, this flag is implied and ignored. */ + VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT = 0x00000002, + /** \brief Set this flag to use a memory that will be persistently mapped and retrieve pointer to it. + + Pointer to mapped memory will be returned through VmaAllocationInfo::pMappedData. 
+ + It is valid to use this flag for allocation made from memory type that is not + `HOST_VISIBLE`. This flag is then ignored and memory is not mapped. This is + useful if you need an allocation that is efficient to use on GPU + (`DEVICE_LOCAL`) and still want to map it directly if possible on platforms that + support it (e.g. Intel GPU). + + You should not use this flag together with #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT. + */ + VMA_ALLOCATION_CREATE_MAPPED_BIT = 0x00000004, + /** Allocation created with this flag can become lost as a result of another + allocation with #VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT flag, so you + must check it before use. + + To check if allocation is not lost, call vmaGetAllocationInfo() and check if + VmaAllocationInfo::deviceMemory is not `VK_NULL_HANDLE`. + + For details about supporting lost allocations, see Lost Allocations + chapter of User Guide on Main Page. + + You should not use this flag together with #VMA_ALLOCATION_CREATE_MAPPED_BIT. + */ + VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT = 0x00000008, + /** While creating allocation using this flag, other allocations that were + created with flag #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT can become lost. + + For details about supporting lost allocations, see Lost Allocations + chapter of User Guide on Main Page. + */ + VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT = 0x00000010, + /** Set this flag to treat VmaAllocationCreateInfo::pUserData as pointer to a + null-terminated string. Instead of copying pointer value, a local copy of the + string is made and stored in allocation's `pUserData`. The string is automatically + freed together with the allocation. It is also used in vmaBuildStatsString(). + */ + VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT = 0x00000020, + /** Allocation will be created from upper stack in a double stack pool. + + This flag is only allowed for custom pools created with #VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT flag. 
+ */ + VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT = 0x00000040, + /** Create both buffer/image and allocation, but don't bind them together. + It is useful when you want to bind yourself to do some more advanced binding, e.g. using some extensions. + The flag is meaningful only with functions that bind by default: vmaCreateBuffer(), vmaCreateImage(). + Otherwise it is ignored. + */ + VMA_ALLOCATION_CREATE_DONT_BIND_BIT = 0x00000080, + /** Create allocation only if additional device memory required for it, if any, won't exceed + memory budget. Otherwise return `VK_ERROR_OUT_OF_DEVICE_MEMORY`. + */ + VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT = 0x00000100, + + /** Allocation strategy that chooses smallest possible free range for the + allocation. + */ + VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT = 0x00010000, + /** Allocation strategy that chooses biggest possible free range for the + allocation. + */ + VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT = 0x00020000, + /** Allocation strategy that chooses first suitable free range for the + allocation. + + "First" doesn't necessarily mean the one with smallest offset in memory, + but rather the one that is easiest and fastest to find. + */ + VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT = 0x00040000, + + /** Allocation strategy that tries to minimize memory usage. + */ + VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT = VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT, + /** Allocation strategy that tries to minimize allocation time. + */ + VMA_ALLOCATION_CREATE_STRATEGY_MIN_TIME_BIT = VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT, + /** Allocation strategy that tries to minimize memory fragmentation. + */ + VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT = VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT, + + /** A bit mask to extract only `STRATEGY` bits from entire set of flags.
+ */ + VMA_ALLOCATION_CREATE_STRATEGY_MASK = + VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT | + VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT | + VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT, + + VMA_ALLOCATION_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VmaAllocationCreateFlagBits; +typedef VkFlags VmaAllocationCreateFlags; + +typedef struct VmaAllocationCreateInfo +{ + /// Use #VmaAllocationCreateFlagBits enum. + VmaAllocationCreateFlags flags; + /** \brief Intended usage of memory. + + You can leave #VMA_MEMORY_USAGE_UNKNOWN if you specify memory requirements in other way. \n + If `pool` is not null, this member is ignored. + */ + VmaMemoryUsage usage; + /** \brief Flags that must be set in a Memory Type chosen for an allocation. + + Leave 0 if you specify memory requirements in other way. \n + If `pool` is not null, this member is ignored.*/ + VkMemoryPropertyFlags requiredFlags; + /** \brief Flags that preferably should be set in a memory type chosen for an allocation. + + Set to 0 if no additional flags are preferred. \n + If `pool` is not null, this member is ignored. */ + VkMemoryPropertyFlags preferredFlags; + /** \brief Bitmask containing one bit set for every memory type acceptable for this allocation. + + Value 0 is equivalent to `UINT32_MAX` - it means any memory type is accepted if + it meets other requirements specified by this structure, with no further + restrictions on memory type index. \n + If `pool` is not null, this member is ignored. + */ + uint32_t memoryTypeBits; + /** \brief Pool that this allocation should be created in. + + Leave `VK_NULL_HANDLE` to allocate from default pool. If not null, members: + `usage`, `requiredFlags`, `preferredFlags`, `memoryTypeBits` are ignored. + */ + VmaPool VMA_NULLABLE pool; + /** \brief Custom general-purpose pointer that will be stored in #VmaAllocation, can be read as VmaAllocationInfo::pUserData and changed using vmaSetAllocationUserData(). 
+ + If #VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT is used, it must be either + null or pointer to a null-terminated string. The string will be then copied to + internal buffer, so it doesn't need to be valid after allocation call. + */ + void* VMA_NULLABLE pUserData; + /** \brief A floating-point value between 0 and 1, indicating the priority of the allocation relative to other memory allocations. + + It is used only when #VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT flag was used during creation of the #VmaAllocator object + and this allocation ends up as dedicated or is explicitly forced as dedicated using #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. + Otherwise, it has the priority of a memory block where it is placed and this variable is ignored. + */ + float priority; +} VmaAllocationCreateInfo; + +/** +\brief Helps to find memoryTypeIndex, given memoryTypeBits and VmaAllocationCreateInfo. + +This algorithm tries to find a memory type that: + +- Is allowed by memoryTypeBits. +- Contains all the flags from pAllocationCreateInfo->requiredFlags. +- Matches intended usage. +- Has as many flags from pAllocationCreateInfo->preferredFlags as possible. + +\return Returns VK_ERROR_FEATURE_NOT_PRESENT if not found. Receiving such result +from this function or any other allocating function probably means that your +device doesn't support any memory type with requested features for the specific +type of resource you want to use it for. Please check parameters of your +resource, like image layout (OPTIMAL versus LINEAR) or mip level count. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndex( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t memoryTypeBits, + const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, + uint32_t* VMA_NOT_NULL pMemoryTypeIndex); + +/** +\brief Helps to find memoryTypeIndex, given VkBufferCreateInfo and VmaAllocationCreateInfo. + +It can be useful e.g. 
to determine value to be used as VmaPoolCreateInfo::memoryTypeIndex. +It internally creates a temporary, dummy buffer that never has memory bound. +It is just a convenience function, equivalent to calling: + +- `vkCreateBuffer` +- `vkGetBufferMemoryRequirements` +- `vmaFindMemoryTypeIndex` +- `vkDestroyBuffer` +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForBufferInfo( + VmaAllocator VMA_NOT_NULL allocator, + const VkBufferCreateInfo* VMA_NOT_NULL pBufferCreateInfo, + const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, + uint32_t* VMA_NOT_NULL pMemoryTypeIndex); + +/** +\brief Helps to find memoryTypeIndex, given VkImageCreateInfo and VmaAllocationCreateInfo. + +It can be useful e.g. to determine value to be used as VmaPoolCreateInfo::memoryTypeIndex. +It internally creates a temporary, dummy image that never has memory bound. +It is just a convenience function, equivalent to calling: + +- `vkCreateImage` +- `vkGetImageMemoryRequirements` +- `vmaFindMemoryTypeIndex` +- `vkDestroyImage` +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForImageInfo( + VmaAllocator VMA_NOT_NULL allocator, + const VkImageCreateInfo* VMA_NOT_NULL pImageCreateInfo, + const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, + uint32_t* VMA_NOT_NULL pMemoryTypeIndex); + +/// Flags to be passed as VmaPoolCreateInfo::flags. +typedef enum VmaPoolCreateFlagBits { + /** \brief Use this flag if you always allocate only buffers and linear images or only optimal images out of this pool and so Buffer-Image Granularity can be ignored. + + This is an optional optimization flag. + + If you always allocate using vmaCreateBuffer(), vmaCreateImage(), + vmaAllocateMemoryForBuffer(), then you don't need to use it because allocator + knows exact type of your allocations so it can handle Buffer-Image Granularity + in the optimal way. 
+ + If you also allocate using vmaAllocateMemoryForImage() or vmaAllocateMemory(), + exact type of such allocations is not known, so allocator must be conservative + in handling Buffer-Image Granularity, which can lead to suboptimal allocation + (wasted memory). In that case, if you can make sure you always allocate only + buffers and linear images or only optimal images out of this pool, use this flag + to make allocator disregard Buffer-Image Granularity and so make allocations + faster and more optimal. + */ + VMA_POOL_CREATE_IGNORE_BUFFER_IMAGE_GRANULARITY_BIT = 0x00000002, + + /** \brief Enables alternative, linear allocation algorithm in this pool. + + Specify this flag to enable linear allocation algorithm, which always creates + new allocations after last one and doesn't reuse space from allocations freed in + between. It trades memory consumption for simplified algorithm and data + structure, which has better performance and uses less memory for metadata. + + By using this flag, you can achieve behavior of free-at-once, stack, + ring buffer, and double stack. For details, see documentation chapter + \ref linear_algorithm. + + When using this flag, you must specify VmaPoolCreateInfo::maxBlockCount == 1 (or 0 for default). + + For more details, see [Linear allocation algorithm](@ref linear_algorithm). + */ + VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT = 0x00000004, + + /** \brief Enables alternative, buddy allocation algorithm in this pool. + + It operates on a tree of blocks, each having size that is a power of two and + a half of its parent's size. Comparing to default algorithm, this one provides + faster allocation and deallocation and decreased external fragmentation, + at the expense of more memory wasted (internal fragmentation). + + For more details, see [Buddy allocation algorithm](@ref buddy_algorithm). + */ + VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT = 0x00000008, + + /** Bit mask to extract only `ALGORITHM` bits from entire set of flags. 
+ */ + VMA_POOL_CREATE_ALGORITHM_MASK = + VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT | + VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT, + + VMA_POOL_CREATE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VmaPoolCreateFlagBits; +typedef VkFlags VmaPoolCreateFlags; + +/** \brief Describes parameter of created #VmaPool. +*/ +typedef struct VmaPoolCreateInfo { + /** \brief Vulkan memory type index to allocate this pool from. + */ + uint32_t memoryTypeIndex; + /** \brief Use combination of #VmaPoolCreateFlagBits. + */ + VmaPoolCreateFlags flags; + /** \brief Size of a single `VkDeviceMemory` block to be allocated as part of this pool, in bytes. Optional. + + Specify nonzero to set explicit, constant size of memory blocks used by this + pool. + + Leave 0 to use default and let the library manage block sizes automatically. + Sizes of particular blocks may vary. + */ + VkDeviceSize blockSize; + /** \brief Minimum number of blocks to be always allocated in this pool, even if they stay empty. + + Set to 0 to have no preallocated blocks and allow the pool be completely empty. + */ + size_t minBlockCount; + /** \brief Maximum number of blocks that can be allocated in this pool. Optional. + + Set to 0 to use default, which is `SIZE_MAX`, which means no limit. + + Set to same value as VmaPoolCreateInfo::minBlockCount to have fixed amount of memory allocated + throughout whole lifetime of this pool. + */ + size_t maxBlockCount; + /** \brief Maximum number of additional frames that are in use at the same time as current frame. + + This value is used only when you make allocations with + #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag. Such allocation cannot become + lost if allocation.lastUseFrameIndex >= allocator.currentFrameIndex - frameInUseCount. + + For example, if you double-buffer your command buffers, so resources used for + rendering in previous frame may still be in use by the GPU at the moment you + allocate resources needed for the current frame, set this value to 1. 
+ + If you want to allow any allocations other than used in the current frame to + become lost, set this value to 0. + */ + uint32_t frameInUseCount; + /** \brief A floating-point value between 0 and 1, indicating the priority of the allocations in this pool relative to other memory allocations. + + It is used only when #VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT flag was used during creation of the #VmaAllocator object. + Otherwise, this variable is ignored. + */ + float priority; +} VmaPoolCreateInfo; + +/** \brief Describes parameter of existing #VmaPool. +*/ +typedef struct VmaPoolStats { + /** \brief Total amount of `VkDeviceMemory` allocated from Vulkan for this pool, in bytes. + */ + VkDeviceSize size; + /** \brief Total number of bytes in the pool not used by any #VmaAllocation. + */ + VkDeviceSize unusedSize; + /** \brief Number of #VmaAllocation objects created from this pool that were not destroyed or lost. + */ + size_t allocationCount; + /** \brief Number of continuous memory ranges in the pool not used by any #VmaAllocation. + */ + size_t unusedRangeCount; + /** \brief Size of the largest continuous free memory region available for new allocation. + + Making a new allocation of that size is not guaranteed to succeed because of + possible additional margin required to respect alignment and buffer/image + granularity. + */ + VkDeviceSize unusedRangeSizeMax; + /** \brief Number of `VkDeviceMemory` blocks allocated for this pool. + */ + size_t blockCount; +} VmaPoolStats; + +/** \brief Allocates Vulkan device memory and creates #VmaPool object. + +@param allocator Allocator object. +@param pCreateInfo Parameters of pool to create. +@param[out] pPool Handle to created pool. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreatePool( + VmaAllocator VMA_NOT_NULL allocator, + const VmaPoolCreateInfo* VMA_NOT_NULL pCreateInfo, + VmaPool VMA_NULLABLE * VMA_NOT_NULL pPool); + +/** \brief Destroys #VmaPool object and frees Vulkan device memory. 
+*/ +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyPool( + VmaAllocator VMA_NOT_NULL allocator, + VmaPool VMA_NULLABLE pool); + +/** \brief Retrieves statistics of existing #VmaPool object. + +@param allocator Allocator object. +@param pool Pool object. +@param[out] pPoolStats Statistics of specified pool. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolStats( + VmaAllocator VMA_NOT_NULL allocator, + VmaPool VMA_NOT_NULL pool, + VmaPoolStats* VMA_NOT_NULL pPoolStats); + +/** \brief Marks all allocations in given pool as lost if they are not used in current frame or VmaPoolCreateInfo::frameInUseCount back from now. + +@param allocator Allocator object. +@param pool Pool. +@param[out] pLostAllocationCount Number of allocations marked as lost. Optional - pass null if you don't need this information. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaMakePoolAllocationsLost( + VmaAllocator VMA_NOT_NULL allocator, + VmaPool VMA_NOT_NULL pool, + size_t* VMA_NULLABLE pLostAllocationCount); + +/** \brief Checks magic number in margins around all allocations in given memory pool in search for corruptions. + +Corruption detection is enabled only when `VMA_DEBUG_DETECT_CORRUPTION` macro is defined to nonzero, +`VMA_DEBUG_MARGIN` is defined to nonzero and the pool is created in memory type that is +`HOST_VISIBLE` and `HOST_COHERENT`. For more information, see [Corruption detection](@ref debugging_memory_usage_corruption_detection). + +Possible return values: + +- `VK_ERROR_FEATURE_NOT_PRESENT` - corruption detection is not enabled for specified pool. +- `VK_SUCCESS` - corruption detection has been performed and succeeded. +- `VK_ERROR_VALIDATION_FAILED_EXT` - corruption detection has been performed and found memory corruptions around one of the allocations. + `VMA_ASSERT` is also fired in that case. +- Other value: Error returned by Vulkan, e.g. memory mapping failure. 
+*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckPoolCorruption(VmaAllocator VMA_NOT_NULL allocator, VmaPool VMA_NOT_NULL pool); + +/** \brief Retrieves name of a custom pool. + +After the call `ppName` is either null or points to an internally-owned null-terminated string +containing name of the pool that was previously set. The pointer becomes invalid when the pool is +destroyed or its name is changed using vmaSetPoolName(). +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolName( + VmaAllocator VMA_NOT_NULL allocator, + VmaPool VMA_NOT_NULL pool, + const char* VMA_NULLABLE * VMA_NOT_NULL ppName); + +/** \brief Sets name of a custom pool. + +`pName` can be either null or pointer to a null-terminated string with new name for the pool. +Function makes internal copy of the string, so it can be changed or freed immediately after this call. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaSetPoolName( + VmaAllocator VMA_NOT_NULL allocator, + VmaPool VMA_NOT_NULL pool, + const char* VMA_NULLABLE pName); + +/** \struct VmaAllocation +\brief Represents single memory allocation. + +It may be either dedicated block of `VkDeviceMemory` or a specific region of a bigger block of this type +plus unique offset. + +There are multiple ways to create such object. +You need to fill structure VmaAllocationCreateInfo. +For more information see [Choosing memory type](@ref choosing_memory_type). + +Although the library provides convenience functions that create Vulkan buffer or image, +allocate memory for it and bind them together, +binding of the allocation to a buffer or an image is out of scope of the allocation itself. +Allocation object can exist without buffer/image bound, +binding can be done manually by the user, and destruction of it can be done +independently of destruction of the allocation. + +The object also remembers its size and some other information. +To retrieve this information, use function vmaGetAllocationInfo() and inspect +returned structure VmaAllocationInfo. 
+ +Some kinds allocations can be in lost state. +For more information, see [Lost allocations](@ref lost_allocations). +*/ +VK_DEFINE_HANDLE(VmaAllocation) + +/** \brief Parameters of #VmaAllocation objects, that can be retrieved using function vmaGetAllocationInfo(). +*/ +typedef struct VmaAllocationInfo { + /** \brief Memory type index that this allocation was allocated from. + + It never changes. + */ + uint32_t memoryType; + /** \brief Handle to Vulkan memory object. + + Same memory object can be shared by multiple allocations. + + It can change after call to vmaDefragment() if this allocation is passed to the function, or if allocation is lost. + + If the allocation is lost, it is equal to `VK_NULL_HANDLE`. + */ + VkDeviceMemory VMA_NULLABLE_NON_DISPATCHABLE deviceMemory; + /** \brief Offset in `VkDeviceMemory` object to the beginning of this allocation, in bytes. `(deviceMemory, offset)` pair is unique to this allocation. + + You usually don't need to use this offset. If you create a buffer or an image together with the allocation using e.g. function + vmaCreateBuffer(), vmaCreateImage(), functions that operate on these resources refer to the beginning of the buffer or image, + not entire device memory block. Functions like vmaMapMemory(), vmaBindBufferMemory() also refer to the beginning of the allocation + and apply this offset automatically. + + It can change after call to vmaDefragment() if this allocation is passed to the function, or if allocation is lost. + */ + VkDeviceSize offset; + /** \brief Size of this allocation, in bytes. + + It never changes, unless allocation is lost. + + \note Allocation size returned in this variable may be greater than the size + requested for the resource e.g. as `VkBufferCreateInfo::size`. Whole size of the + allocation is accessible for operations on memory e.g. using a pointer after + mapping with vmaMapMemory(), but operations on the resource e.g. using + `vkCmdCopyBuffer` must be limited to the size of the resource. 
+ */ + VkDeviceSize size; + /** \brief Pointer to the beginning of this allocation as mapped data. + + If the allocation hasn't been mapped using vmaMapMemory() and hasn't been + created with #VMA_ALLOCATION_CREATE_MAPPED_BIT flag, this value is null. + + It can change after call to vmaMapMemory(), vmaUnmapMemory(). + It can also change after call to vmaDefragment() if this allocation is passed to the function. + */ + void* VMA_NULLABLE pMappedData; + /** \brief Custom general-purpose pointer that was passed as VmaAllocationCreateInfo::pUserData or set using vmaSetAllocationUserData(). + + It can change after call to vmaSetAllocationUserData() for this allocation. + */ + void* VMA_NULLABLE pUserData; +} VmaAllocationInfo; + +/** \brief General purpose memory allocation. + +@param[out] pAllocation Handle to allocated memory. +@param[out] pAllocationInfo Optional. Information about allocated memory. It can be later fetched using function vmaGetAllocationInfo(). + +You should free the memory using vmaFreeMemory() or vmaFreeMemoryPages(). + +It is recommended to use vmaAllocateMemoryForBuffer(), vmaAllocateMemoryForImage(), +vmaCreateBuffer(), vmaCreateImage() instead whenever possible. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemory( + VmaAllocator VMA_NOT_NULL allocator, + const VkMemoryRequirements* VMA_NOT_NULL pVkMemoryRequirements, + const VmaAllocationCreateInfo* VMA_NOT_NULL pCreateInfo, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, + VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); + +/** \brief General purpose memory allocation for multiple allocation objects at once. + +@param allocator Allocator object. +@param pVkMemoryRequirements Memory requirements for each allocation. +@param pCreateInfo Creation parameters for each alloction. +@param allocationCount Number of allocations to make. +@param[out] pAllocations Pointer to array that will be filled with handles to created allocations. +@param[out] pAllocationInfo Optional. 
Pointer to array that will be filled with parameters of created allocations. + +You should free the memory using vmaFreeMemory() or vmaFreeMemoryPages(). + +Word "pages" is just a suggestion to use this function to allocate pieces of memory needed for sparse binding. +It is just a general purpose allocation function able to make multiple allocations at once. +It may be internally optimized to be more efficient than calling vmaAllocateMemory() `allocationCount` times. + +All allocations are made using same parameters. All of them are created out of the same memory pool and type. +If any allocation fails, all allocations already made within this function call are also freed, so that when +returned result is not `VK_SUCCESS`, `pAllocation` array is always entirely filled with `VK_NULL_HANDLE`. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryPages( + VmaAllocator VMA_NOT_NULL allocator, + const VkMemoryRequirements* VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pVkMemoryRequirements, + const VmaAllocationCreateInfo* VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pCreateInfo, + size_t allocationCount, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations, + VmaAllocationInfo* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocationInfo); + +/** +@param[out] pAllocation Handle to allocated memory. +@param[out] pAllocationInfo Optional. Information about allocated memory. It can be later fetched using function vmaGetAllocationInfo(). + +You should free the memory using vmaFreeMemory(). +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForBuffer( + VmaAllocator VMA_NOT_NULL allocator, + VkBuffer VMA_NOT_NULL_NON_DISPATCHABLE buffer, + const VmaAllocationCreateInfo* VMA_NOT_NULL pCreateInfo, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, + VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); + +/// Function similar to vmaAllocateMemoryForBuffer(). 
+VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForImage( + VmaAllocator VMA_NOT_NULL allocator, + VkImage VMA_NOT_NULL_NON_DISPATCHABLE image, + const VmaAllocationCreateInfo* VMA_NOT_NULL pCreateInfo, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, + VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); + +/** \brief Frees memory previously allocated using vmaAllocateMemory(), vmaAllocateMemoryForBuffer(), or vmaAllocateMemoryForImage(). + +Passing `VK_NULL_HANDLE` as `allocation` is valid. Such function call is just skipped. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemory( + VmaAllocator VMA_NOT_NULL allocator, + const VmaAllocation VMA_NULLABLE allocation); + +/** \brief Frees memory and destroys multiple allocations. + +Word "pages" is just a suggestion to use this function to free pieces of memory used for sparse binding. +It is just a general purpose function to free memory and destroy allocations made using e.g. vmaAllocateMemory(), +vmaAllocateMemoryPages() and other functions. +It may be internally optimized to be more efficient than calling vmaFreeMemory() `allocationCount` times. + +Allocations in `pAllocations` array can come from any memory pools and types. +Passing `VK_NULL_HANDLE` as elements of `pAllocations` array is valid. Such entries are just skipped. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemoryPages( + VmaAllocator VMA_NOT_NULL allocator, + size_t allocationCount, + const VmaAllocation VMA_NULLABLE * VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations); + +/** \brief Returns current information about specified allocation and atomically marks it as used in current frame. + +Current paramteres of given allocation are returned in `pAllocationInfo`. + +This function also atomically "touches" allocation - marks it as used in current frame, +just like vmaTouchAllocation(). +If the allocation is in lost state, `pAllocationInfo->deviceMemory == VK_NULL_HANDLE`. 
+ +Although this function uses atomics and doesn't lock any mutex, so it should be quite efficient, +you can avoid calling it too often. + +- You can retrieve same VmaAllocationInfo structure while creating your resource, from function + vmaCreateBuffer(), vmaCreateImage(). You can remember it if you are sure parameters don't change + (e.g. due to defragmentation or allocation becoming lost). +- If you just want to check if allocation is not lost, vmaTouchAllocation() will work faster. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocationInfo( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VmaAllocationInfo* VMA_NOT_NULL pAllocationInfo); + +/** \brief Returns `VK_TRUE` if allocation is not lost and atomically marks it as used in current frame. + +If the allocation has been created with #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag, +this function returns `VK_TRUE` if it's not in lost state, so it can still be used. +It then also atomically "touches" the allocation - marks it as used in current frame, +so that you can be sure it won't become lost in current frame or next `frameInUseCount` frames. + +If the allocation is in lost state, the function returns `VK_FALSE`. +Memory of such allocation, as well as buffer or image bound to it, should not be used. +Lost allocation and the buffer/image still need to be destroyed. + +If the allocation has been created without #VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag, +this function always returns `VK_TRUE`. +*/ +VMA_CALL_PRE VkBool32 VMA_CALL_POST vmaTouchAllocation( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation); + +/** \brief Sets pUserData in given allocation to new value. + +If the allocation was created with VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT, +pUserData must be either null, or pointer to a null-terminated string. The function +makes local copy of the string and sets it as allocation's `pUserData`. 
String +passed as pUserData doesn't need to be valid for whole lifetime of the allocation - +you can free it after this call. String previously pointed by allocation's +pUserData is freed from memory. + +If the flag was not used, the value of pointer `pUserData` is just copied to +allocation's `pUserData`. It is opaque, so you can use it however you want - e.g. +as a pointer, ordinal number or some handle to you own data. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaSetAllocationUserData( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + void* VMA_NULLABLE pUserData); + +/** \brief Creates new allocation that is in lost state from the beginning. + +It can be useful if you need a dummy, non-null allocation. + +You still need to destroy created object using vmaFreeMemory(). + +Returned allocation is not tied to any specific memory pool or memory type and +not bound to any image or buffer. It has size = 0. It cannot be turned into +a real, non-empty allocation. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaCreateLostAllocation( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation); + +/** \brief Maps memory represented by given allocation and returns pointer to it. + +Maps memory represented by given allocation to make it accessible to CPU code. +When succeeded, `*ppData` contains pointer to first byte of this memory. +If the allocation is part of bigger `VkDeviceMemory` block, the pointer is +correctly offseted to the beginning of region assigned to this particular +allocation. + +Mapping is internally reference-counted and synchronized, so despite raw Vulkan +function `vkMapMemory()` cannot be used to map same block of `VkDeviceMemory` +multiple times simultaneously, it is safe to call this function on allocations +assigned to the same memory block. Actual Vulkan memory will be mapped on first +mapping and unmapped on last unmapping. 
+ +If the function succeeded, you must call vmaUnmapMemory() to unmap the +allocation when mapping is no longer needed or before freeing the allocation, at +the latest. + +It also safe to call this function multiple times on the same allocation. You +must call vmaUnmapMemory() same number of times as you called vmaMapMemory(). + +It is also safe to call this function on allocation created with +#VMA_ALLOCATION_CREATE_MAPPED_BIT flag. Its memory stays mapped all the time. +You must still call vmaUnmapMemory() same number of times as you called +vmaMapMemory(). You must not call vmaUnmapMemory() additional time to free the +"0-th" mapping made automatically due to #VMA_ALLOCATION_CREATE_MAPPED_BIT flag. + +This function fails when used on allocation made in memory type that is not +`HOST_VISIBLE`. + +This function always fails when called for allocation that was created with +#VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT flag. Such allocations cannot be +mapped. + +This function doesn't automatically flush or invalidate caches. +If the allocation is made from a memory types that is not `HOST_COHERENT`, +you also need to use vmaInvalidateAllocation() / vmaFlushAllocation(), as required by Vulkan specification. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaMapMemory( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + void* VMA_NULLABLE * VMA_NOT_NULL ppData); + +/** \brief Unmaps memory represented by given allocation, mapped previously using vmaMapMemory(). + +For details, see description of vmaMapMemory(). + +This function doesn't automatically flush or invalidate caches. +If the allocation is made from a memory type that is not `HOST_COHERENT`, +you also need to use vmaInvalidateAllocation() / vmaFlushAllocation(), as required by Vulkan specification. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaUnmapMemory( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation); + +/** \brief Flushes memory of given allocation.
+ +Calls `vkFlushMappedMemoryRanges()` for memory associated with given range of given allocation. +It needs to be called after writing to a mapped memory for memory types that are not `HOST_COHERENT`. +Unmap operation doesn't do that automatically. + +- `offset` must be relative to the beginning of allocation. +- `size` can be `VK_WHOLE_SIZE`. It means all memory from `offset` the the end of given allocation. +- `offset` and `size` don't have to be aligned. + They are internally rounded down/up to multiply of `nonCoherentAtomSize`. +- If `size` is 0, this call is ignored. +- If memory type that the `allocation` belongs to is not `HOST_VISIBLE` or it is `HOST_COHERENT`, + this call is ignored. + +Warning! `offset` and `size` are relative to the contents of given `allocation`. +If you mean whole allocation, you can pass 0 and `VK_WHOLE_SIZE`, respectively. +Do not pass allocation's offset as `offset`!!! + +This function returns the `VkResult` from `vkFlushMappedMemoryRanges` if it is +called, otherwise `VK_SUCCESS`. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocation( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VkDeviceSize offset, + VkDeviceSize size); + +/** \brief Invalidates memory of given allocation. + +Calls `vkInvalidateMappedMemoryRanges()` for memory associated with given range of given allocation. +It needs to be called before reading from a mapped memory for memory types that are not `HOST_COHERENT`. +Map operation doesn't do that automatically. + +- `offset` must be relative to the beginning of allocation. +- `size` can be `VK_WHOLE_SIZE`. It means all memory from `offset` the the end of given allocation. +- `offset` and `size` don't have to be aligned. + They are internally rounded down/up to multiply of `nonCoherentAtomSize`. +- If `size` is 0, this call is ignored. +- If memory type that the `allocation` belongs to is not `HOST_VISIBLE` or it is `HOST_COHERENT`, + this call is ignored. + +Warning! 
`offset` and `size` are relative to the contents of given `allocation`. +If you mean whole allocation, you can pass 0 and `VK_WHOLE_SIZE`, respectively. +Do not pass allocation's offset as `offset`!!! + +This function returns the `VkResult` from `vkInvalidateMappedMemoryRanges` if +it is called, otherwise `VK_SUCCESS`. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocation( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VkDeviceSize offset, + VkDeviceSize size); + +/** \brief Flushes memory of given set of allocations. + +Calls `vkFlushMappedMemoryRanges()` for memory associated with given ranges of given allocations. +For more information, see documentation of vmaFlushAllocation(). + +\param allocator Allocator object. +\param allocationCount Number of elements in the `allocations`, `offsets` and `sizes` arrays. +\param allocations Array of allocations whose memory ranges are to be flushed. +\param offsets If not null, it must point to an array of offsets of regions to flush, relative to the beginning of respective allocations. Null means all offsets are zero. +\param sizes If not null, it must point to an array of sizes of regions to flush in respective allocations. Null means `VK_WHOLE_SIZE` for all allocations. + +This function returns the `VkResult` from `vkFlushMappedMemoryRanges` if it is +called, otherwise `VK_SUCCESS`. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocations( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t allocationCount, + const VmaAllocation VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) allocations, + const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) offsets, + const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) sizes); + +/** \brief Invalidates memory of given set of allocations. + +Calls `vkInvalidateMappedMemoryRanges()` for memory associated with given ranges of given allocations. +For more information, see documentation of vmaInvalidateAllocation().
+ +\param allocator +\param allocationCount +\param allocations +\param offsets If not null, it must point to an array of offsets of regions to flush, relative to the beginning of respective allocations. Null means all ofsets are zero. +\param sizes If not null, it must point to an array of sizes of regions to flush in respective allocations. Null means `VK_WHOLE_SIZE` for all allocations. + +This function returns the `VkResult` from `vkInvalidateMappedMemoryRanges` if it is +called, otherwise `VK_SUCCESS`. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocations( + VmaAllocator VMA_NOT_NULL allocator, + uint32_t allocationCount, + const VmaAllocation VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) allocations, + const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) offsets, + const VkDeviceSize* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) sizes); + +/** \brief Checks magic number in margins around all allocations in given memory types (in both default and custom pools) in search for corruptions. + +@param memoryTypeBits Bit mask, where each bit set means that a memory type with that index should be checked. + +Corruption detection is enabled only when `VMA_DEBUG_DETECT_CORRUPTION` macro is defined to nonzero, +`VMA_DEBUG_MARGIN` is defined to nonzero and only for memory types that are +`HOST_VISIBLE` and `HOST_COHERENT`. For more information, see [Corruption detection](@ref debugging_memory_usage_corruption_detection). + +Possible return values: + +- `VK_ERROR_FEATURE_NOT_PRESENT` - corruption detection is not enabled for any of specified memory types. +- `VK_SUCCESS` - corruption detection has been performed and succeeded. +- `VK_ERROR_VALIDATION_FAILED_EXT` - corruption detection has been performed and found memory corruptions around one of the allocations. + `VMA_ASSERT` is also fired in that case. +- Other value: Error returned by Vulkan, e.g. memory mapping failure. 
+*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckCorruption(VmaAllocator VMA_NOT_NULL allocator, uint32_t memoryTypeBits); + +/** \struct VmaDefragmentationContext +\brief An opaque object that represents a started defragmentation process. + +Fill structure #VmaDefragmentationInfo2 and call function vmaDefragmentationBegin() to create it. +Call function vmaDefragmentationEnd() to destroy it. +*/ +VK_DEFINE_HANDLE(VmaDefragmentationContext) + +/// Flags to be used in vmaDefragmentationBegin(). None at the moment. Reserved for future use. +typedef enum VmaDefragmentationFlagBits { + VMA_DEFRAGMENTATION_FLAG_INCREMENTAL = 0x1, + VMA_DEFRAGMENTATION_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF +} VmaDefragmentationFlagBits; +typedef VkFlags VmaDefragmentationFlags; + +/** \brief Parameters for defragmentation. + +To be used with function vmaDefragmentationBegin(). +*/ +typedef struct VmaDefragmentationInfo2 { + /** \brief Reserved for future use. Should be 0. + */ + VmaDefragmentationFlags flags; + /** \brief Number of allocations in `pAllocations` array. + */ + uint32_t allocationCount; + /** \brief Pointer to array of allocations that can be defragmented. + + The array should have `allocationCount` elements. + The array should not contain nulls. + Elements in the array should be unique - same allocation cannot occur twice. + It is safe to pass allocations that are in the lost state - they are ignored. + All allocations not present in this array are considered non-moveable during this defragmentation. + */ + const VmaAllocation VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations; + /** \brief Optional, output. Pointer to array that will be filled with information whether the allocation at certain index has been changed during defragmentation. + + The array should have `allocationCount` elements. + You can pass null if you are not interested in this information.
+ */ + VkBool32* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocationsChanged; + /** \brief Number of pools in `pPools` array. + */ + uint32_t poolCount; + /** \brief Either null or pointer to array of pools to be defragmented. + + All the allocations in the specified pools can be moved during defragmentation + and there is no way to check if they were really moved as in `pAllocationsChanged`, + so you must query all the allocations in all these pools for new `VkDeviceMemory` + and offset using vmaGetAllocationInfo() if you might need to recreate buffers + and images bound to them. + + The array should have `poolCount` elements. + The array should not contain nulls. + Elements in the array should be unique - same pool cannot occur twice. + + Using this array is equivalent to specifying all allocations from the pools in `pAllocations`. + It might be more efficient. + */ + const VmaPool VMA_NOT_NULL * VMA_NULLABLE VMA_LEN_IF_NOT_NULL(poolCount) pPools; + /** \brief Maximum total number of bytes that can be copied while moving allocations to different places using transfers on CPU side, like `memcpy()`, `memmove()`. + + `VK_WHOLE_SIZE` means no limit. + */ + VkDeviceSize maxCpuBytesToMove; + /** \brief Maximum number of allocations that can be moved to a different place using transfers on CPU side, like `memcpy()`, `memmove()`. + + `UINT32_MAX` means no limit. + */ + uint32_t maxCpuAllocationsToMove; + /** \brief Maximum total number of bytes that can be copied while moving allocations to different places using transfers on GPU side, posted to `commandBuffer`. + + `VK_WHOLE_SIZE` means no limit. + */ + VkDeviceSize maxGpuBytesToMove; + /** \brief Maximum number of allocations that can be moved to a different place using transfers on GPU side, posted to `commandBuffer`. + + `UINT32_MAX` means no limit. + */ + uint32_t maxGpuAllocationsToMove; + /** \brief Optional. Command buffer where GPU copy commands will be posted.
+ + If not null, it must be a valid command buffer handle that supports Transfer queue type. + It must be in the recording state and outside of a render pass instance. + You need to submit it and make sure it finished execution before calling vmaDefragmentationEnd(). + + Passing null means that only CPU defragmentation will be performed. + */ + VkCommandBuffer VMA_NULLABLE commandBuffer; +} VmaDefragmentationInfo2; + +typedef struct VmaDefragmentationPassMoveInfo { + VmaAllocation VMA_NOT_NULL allocation; + VkDeviceMemory VMA_NOT_NULL_NON_DISPATCHABLE memory; + VkDeviceSize offset; +} VmaDefragmentationPassMoveInfo; + +/** \brief Parameters for incremental defragmentation steps. + +To be used with function vmaBeginDefragmentationPass(). +*/ +typedef struct VmaDefragmentationPassInfo { + uint32_t moveCount; + VmaDefragmentationPassMoveInfo* VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(moveCount) pMoves; +} VmaDefragmentationPassInfo; + +/** \brief Deprecated. Optional configuration parameters to be passed to function vmaDefragment(). + +\deprecated This is a part of the old interface. It is recommended to use structure #VmaDefragmentationInfo2 and function vmaDefragmentationBegin() instead. +*/ +typedef struct VmaDefragmentationInfo { + /** \brief Maximum total numbers of bytes that can be copied while moving allocations to different places. + + Default is `VK_WHOLE_SIZE`, which means no limit. + */ + VkDeviceSize maxBytesToMove; + /** \brief Maximum number of allocations that can be moved to different place. + + Default is `UINT32_MAX`, which means no limit. + */ + uint32_t maxAllocationsToMove; +} VmaDefragmentationInfo; + +/** \brief Statistics returned by function vmaDefragment(). */ +typedef struct VmaDefragmentationStats { + /// Total number of bytes that have been copied while moving allocations to different places. + VkDeviceSize bytesMoved; + /// Total number of bytes that have been released to the system by freeing empty `VkDeviceMemory` objects. 
+ VkDeviceSize bytesFreed; + /// Number of allocations that have been moved to different places. + uint32_t allocationsMoved; + /// Number of empty `VkDeviceMemory` objects that have been released to the system. + uint32_t deviceMemoryBlocksFreed; +} VmaDefragmentationStats; + +/** \brief Begins defragmentation process. + +@param allocator Allocator object. +@param pInfo Structure filled with parameters of defragmentation. +@param[out] pStats Optional. Statistics of defragmentation. You can pass null if you are not interested in this information. +@param[out] pContext Context object that must be passed to vmaDefragmentationEnd() to finish defragmentation. +@return `VK_SUCCESS` and `*pContext == null` if defragmentation finished within this function call. `VK_NOT_READY` and `*pContext != null` if defragmentation has been started and you need to call vmaDefragmentationEnd() to finish it. Negative value in case of error. + +Use this function instead of old, deprecated vmaDefragment(). + +Warning! Between the call to vmaDefragmentationBegin() and vmaDefragmentationEnd(): + +- You should not use any of allocations passed as `pInfo->pAllocations` or + any allocations that belong to pools passed as `pInfo->pPools`, + including calling vmaGetAllocationInfo(), vmaTouchAllocation(), or access + their data. +- Some mutexes protecting internal data structures may be locked, so trying to + make or free any allocations, bind buffers or images, map memory, or launch + another simultaneous defragmentation in between may cause stall (when done on + another thread) or deadlock (when done on the same thread), unless you are + 100% sure that defragmented allocations are in different pools. +- Information returned via `pStats` and `pInfo->pAllocationsChanged` are undefined. + They become valid after call to vmaDefragmentationEnd(). +- If `pInfo->commandBuffer` is not null, you must submit that command buffer + and make sure it finished execution before calling vmaDefragmentationEnd(). 
+ +For more information and important limitations regarding defragmentation, see documentation chapter: +[Defragmentation](@ref defragmentation). +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationBegin( + VmaAllocator VMA_NOT_NULL allocator, + const VmaDefragmentationInfo2* VMA_NOT_NULL pInfo, + VmaDefragmentationStats* VMA_NULLABLE pStats, + VmaDefragmentationContext VMA_NULLABLE * VMA_NOT_NULL pContext); + +/** \brief Ends defragmentation process. + +Use this function to finish defragmentation started by vmaDefragmentationBegin(). +It is safe to pass `context == null`. The function then does nothing. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationEnd( + VmaAllocator VMA_NOT_NULL allocator, + VmaDefragmentationContext VMA_NULLABLE context); + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBeginDefragmentationPass( + VmaAllocator VMA_NOT_NULL allocator, + VmaDefragmentationContext VMA_NULLABLE context, + VmaDefragmentationPassInfo* VMA_NOT_NULL pInfo +); +VMA_CALL_PRE VkResult VMA_CALL_POST vmaEndDefragmentationPass( + VmaAllocator VMA_NOT_NULL allocator, + VmaDefragmentationContext VMA_NULLABLE context +); + +/** \brief Deprecated. Compacts memory by moving allocations. + +@param pAllocations Array of allocations that can be moved during this compation. +@param allocationCount Number of elements in pAllocations and pAllocationsChanged arrays. +@param[out] pAllocationsChanged Array of boolean values that will indicate whether matching allocation in pAllocations array has been moved. This parameter is optional. Pass null if you don't need this information. +@param pDefragmentationInfo Configuration parameters. Optional - pass null to use default values. +@param[out] pDefragmentationStats Statistics returned by the function. Optional - pass null if you don't need this information. +@return `VK_SUCCESS` if completed, negative error code in case of error. + +\deprecated This is a part of the old interface. 
It is recommended to use structure #VmaDefragmentationInfo2 and function vmaDefragmentationBegin() instead. + +This function works by moving allocations to different places (different +`VkDeviceMemory` objects and/or different offsets) in order to optimize memory +usage. Only allocations that are in `pAllocations` array can be moved. All other +allocations are considered nonmovable in this call. Basic rules: + +- Only allocations made in memory types that have + `VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT` and `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT` + flags can be compacted. You may pass other allocations but it makes no sense - + these will never be moved. +- Custom pools created with #VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT or + #VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT flag are not defragmented. Allocations + passed to this function that come from such pools are ignored. +- Allocations created with #VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT or + created as dedicated allocations for any other reason are also ignored. +- Both allocations made with or without #VMA_ALLOCATION_CREATE_MAPPED_BIT + flag can be compacted. If not persistently mapped, memory will be mapped + temporarily inside this function if needed. +- You must not pass same #VmaAllocation object multiple times in `pAllocations` array. + +The function also frees empty `VkDeviceMemory` blocks. + +Warning: This function may be time-consuming, so you shouldn't call it too often +(like after every resource creation/destruction). +You can call it on special occasions (like when reloading a game level or +when you just destroyed a lot of objects). Calling it every frame may be OK, but +you should measure that on your platform. + +For more information, see [Defragmentation](@ref defragmentation) chapter. 
+*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragment( + VmaAllocator VMA_NOT_NULL allocator, + const VmaAllocation VMA_NOT_NULL * VMA_NOT_NULL VMA_LEN_IF_NOT_NULL(allocationCount) pAllocations, + size_t allocationCount, + VkBool32* VMA_NULLABLE VMA_LEN_IF_NOT_NULL(allocationCount) pAllocationsChanged, + const VmaDefragmentationInfo* VMA_NULLABLE pDefragmentationInfo, + VmaDefragmentationStats* VMA_NULLABLE pDefragmentationStats); + +/** \brief Binds buffer to allocation. + +Binds specified buffer to region of memory represented by specified allocation. +Gets `VkDeviceMemory` handle and offset from the allocation. +If you want to create a buffer, allocate memory for it and bind them together separately, +you should use this function for binding instead of standard `vkBindBufferMemory()`, +because it ensures proper synchronization so that when a `VkDeviceMemory` object is used by multiple +allocations, calls to `vkBind*Memory()` or `vkMapMemory()` won't happen from multiple threads simultaneously +(which is illegal in Vulkan). + +It is recommended to use function vmaCreateBuffer() instead of this one. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VkBuffer VMA_NOT_NULL_NON_DISPATCHABLE buffer); + +/** \brief Binds buffer to allocation with additional parameters. + +@param allocationLocalOffset Additional offset to be added while binding, relative to the beginning of the `allocation`. Normally it should be 0. +@param pNext A chain of structures to be attached to `VkBindBufferMemoryInfoKHR` structure used internally. Normally it should be null. + +This function is similar to vmaBindBufferMemory(), but it provides additional parameters. + +If `pNext` is not null, #VmaAllocator object must have been created with #VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT flag +or with VmaAllocatorCreateInfo::vulkanApiVersion `>= VK_API_VERSION_1_1`. Otherwise the call fails.
+*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory2( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VkDeviceSize allocationLocalOffset, + VkBuffer VMA_NOT_NULL_NON_DISPATCHABLE buffer, + const void* VMA_NULLABLE pNext); + +/** \brief Binds image to allocation. + +Binds specified image to region of memory represented by specified allocation. +Gets `VkDeviceMemory` handle and offset from the allocation. +If you want to create an image, allocate memory for it and bind them together separately, +you should use this function for binding instead of standard `vkBindImageMemory()`, +because it ensures proper synchronization so that when a `VkDeviceMemory` object is used by multiple +allocations, calls to `vkBind*Memory()` or `vkMapMemory()` won't happen from multiple threads simultaneously +(which is illegal in Vulkan). + +It is recommended to use function vmaCreateImage() instead of this one. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VkImage VMA_NOT_NULL_NON_DISPATCHABLE image); + +/** \brief Binds image to allocation with additional parameters. + +@param allocationLocalOffset Additional offset to be added while binding, relative to the beginning of the `allocation`. Normally it should be 0. +@param pNext A chain of structures to be attached to `VkBindImageMemoryInfoKHR` structure used internally. Normally it should be null. + +This function is similar to vmaBindImageMemory(), but it provides additional parameters. + +If `pNext` is not null, #VmaAllocator object must have been created with #VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT flag +or with VmaAllocatorCreateInfo::vulkanApiVersion `>= VK_API_VERSION_1_1`. Otherwise the call fails.
+*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory2( + VmaAllocator VMA_NOT_NULL allocator, + VmaAllocation VMA_NOT_NULL allocation, + VkDeviceSize allocationLocalOffset, + VkImage VMA_NOT_NULL_NON_DISPATCHABLE image, + const void* VMA_NULLABLE pNext); + +/** +@param[out] pBuffer Buffer that was created. +@param[out] pAllocation Allocation that was created. +@param[out] pAllocationInfo Optional. Information about allocated memory. It can be later fetched using function vmaGetAllocationInfo(). + +This function automatically: + +-# Creates buffer. +-# Allocates appropriate memory for it. +-# Binds the buffer with the memory. + +If any of these operations fail, buffer and allocation are not created, +returned value is negative error code, *pBuffer and *pAllocation are null. + +If the function succeeded, you must destroy both buffer and allocation when you +no longer need them using either convenience function vmaDestroyBuffer() or +separately, using `vkDestroyBuffer()` and vmaFreeMemory(). + +If #VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT flag was used, +VK_KHR_dedicated_allocation extension is used internally to query driver whether +it requires or prefers the new buffer to have dedicated allocation. If yes, +and if dedicated allocation is possible (VmaAllocationCreateInfo::pool is null +and #VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT is not used), it creates dedicated +allocation for this buffer, just like when using +#VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT. + +\note This function creates a new `VkBuffer`. Sub-allocation of parts of one large buffer, +although recommended as a good practice, is out of scope of this library and could be implemented +by the user as a higher-level logic on top of VMA. 
+*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateBuffer( + VmaAllocator VMA_NOT_NULL allocator, + const VkBufferCreateInfo* VMA_NOT_NULL pBufferCreateInfo, + const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, + VkBuffer VMA_NULLABLE_NON_DISPATCHABLE * VMA_NOT_NULL pBuffer, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, + VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); + +/** \brief Destroys Vulkan buffer and frees allocated memory. + +This is just a convenience function equivalent to: + +\code +vkDestroyBuffer(device, buffer, allocationCallbacks); +vmaFreeMemory(allocator, allocation); +\endcode + +It is safe to pass null as buffer and/or allocation. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyBuffer( + VmaAllocator VMA_NOT_NULL allocator, + VkBuffer VMA_NULLABLE_NON_DISPATCHABLE buffer, + VmaAllocation VMA_NULLABLE allocation); + +/// Function similar to vmaCreateBuffer(). +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateImage( + VmaAllocator VMA_NOT_NULL allocator, + const VkImageCreateInfo* VMA_NOT_NULL pImageCreateInfo, + const VmaAllocationCreateInfo* VMA_NOT_NULL pAllocationCreateInfo, + VkImage VMA_NULLABLE_NON_DISPATCHABLE * VMA_NOT_NULL pImage, + VmaAllocation VMA_NULLABLE * VMA_NOT_NULL pAllocation, + VmaAllocationInfo* VMA_NULLABLE pAllocationInfo); + +/** \brief Destroys Vulkan image and frees allocated memory. + +This is just a convenience function equivalent to: + +\code +vkDestroyImage(device, image, allocationCallbacks); +vmaFreeMemory(allocator, allocation); +\endcode + +It is safe to pass null as image and/or allocation. +*/ +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyImage( + VmaAllocator VMA_NOT_NULL allocator, + VkImage VMA_NULLABLE_NON_DISPATCHABLE image, + VmaAllocation VMA_NULLABLE allocation); + +#ifdef __cplusplus +} +#endif + +#endif // AMD_VULKAN_MEMORY_ALLOCATOR_H + +// For Visual Studio IntelliSense.
+#if defined(__cplusplus) && defined(__INTELLISENSE__) +#define VMA_IMPLEMENTATION +#endif + +#ifdef VMA_IMPLEMENTATION +#undef VMA_IMPLEMENTATION + +#include +#include +#include +#include + +#if VMA_RECORDING_ENABLED + #include + #if defined(_WIN32) + #include + #else + #include + #include + #endif +#endif + +/******************************************************************************* +CONFIGURATION SECTION + +Define some of these macros before each #include of this header or change them +here if you need other than the default behavior depending on your environment. +*/ + +/* +Define this macro to 1 to make the library fetch pointers to Vulkan functions +internally, like: + + vulkanFunctions.vkAllocateMemory = &vkAllocateMemory; +*/ +#if !defined(VMA_STATIC_VULKAN_FUNCTIONS) && !defined(VK_NO_PROTOTYPES) + #define VMA_STATIC_VULKAN_FUNCTIONS 1 +#endif + +/* +Define this macro to 1 to make the library fetch pointers to Vulkan functions +internally, like: + + vulkanFunctions.vkAllocateMemory = (PFN_vkAllocateMemory)vkGetDeviceProcAddr(m_hDevice, vkAllocateMemory); +*/ +#if !defined(VMA_DYNAMIC_VULKAN_FUNCTIONS) + #define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 + #if defined(VK_NO_PROTOTYPES) + extern PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr; + extern PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr; + #endif +#endif + +// Define this macro to 1 to make the library use STL containers instead of its own implementation. +//#define VMA_USE_STL_CONTAINERS 1 + +/* Set this macro to 1 to make the library include and use STL containers: +std::pair, std::vector, std::list, std::unordered_map. + +Set it to 0 or leave it undefined to make the library use its own implementation of +the containers. +*/ +#if VMA_USE_STL_CONTAINERS + #define VMA_USE_STL_VECTOR 1 + #define VMA_USE_STL_UNORDERED_MAP 1 + #define VMA_USE_STL_LIST 1 +#endif + +#ifndef VMA_USE_STL_SHARED_MUTEX + // Compiler conforms to C++17.
+ #if __cplusplus >= 201703L + #define VMA_USE_STL_SHARED_MUTEX 1 + // Visual studio defines __cplusplus properly only when passed additional parameter: /Zc:__cplusplus + // Otherwise it's always 199711L, despite shared_mutex works since Visual Studio 2015 Update 2. + // See: https://blogs.msdn.microsoft.com/vcblog/2018/04/09/msvc-now-correctly-reports-__cplusplus/ + #elif defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 190023918 && __cplusplus == 199711L && _MSVC_LANG >= 201703L + #define VMA_USE_STL_SHARED_MUTEX 1 + #else + #define VMA_USE_STL_SHARED_MUTEX 0 + #endif +#endif + +/* +THESE INCLUDES ARE NOT ENABLED BY DEFAULT. +Library has its own container implementation. +*/ +#if VMA_USE_STL_VECTOR + #include +#endif + +#if VMA_USE_STL_UNORDERED_MAP + #include +#endif + +#if VMA_USE_STL_LIST + #include +#endif + +/* +Following headers are used in this CONFIGURATION section only, so feel free to +remove them if not needed. +*/ +#include // for assert +#include // for min, max +#include + +#ifndef VMA_NULL + // Value used as null pointer. Define it to e.g.: nullptr, NULL, 0, (void*)0. 
+ #define VMA_NULL nullptr +#endif + +#if defined(__ANDROID_API__) && (__ANDROID_API__ < 16) +#include +static void* vma_aligned_alloc(size_t alignment, size_t size) +{ + // alignment must be >= sizeof(void*) + if(alignment < sizeof(void*)) + { + alignment = sizeof(void*); + } + + return memalign(alignment, size); +} +#elif defined(__APPLE__) || defined(__ANDROID__) || (defined(__linux__) && defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)) +#include + +#if defined(__APPLE__) +#include +#endif + +static void* vma_aligned_alloc(size_t alignment, size_t size) +{ +#if defined(__APPLE__) && (defined(MAC_OS_X_VERSION_10_16) || defined(__IPHONE_14_0)) +#if MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_16 || __IPHONE_OS_VERSION_MAX_ALLOWED >= __IPHONE_14_0 + // For C++14, usr/include/malloc/_malloc.h declares aligned_alloc() only + // with the MacOSX11.0 SDK in Xcode 12 (which is what adds + // MAC_OS_X_VERSION_10_16), even though the function is marked + // available for 10.15. That's why the preprocessor checks for 10.16 but + // the __builtin_available checks for 10.15. + // People who use C++17 could call aligned_alloc with the 10.15 SDK already.
+ if (__builtin_available(macOS 10.15, iOS 13, *)) + return aligned_alloc(alignment, size); +#endif +#endif + // alignment must be >= sizeof(void*) + if(alignment < sizeof(void*)) + { + alignment = sizeof(void*); + } + + void *pointer; + if(posix_memalign(&pointer, alignment, size) == 0) + return pointer; + return VMA_NULL; +} +#elif defined(_WIN32) +static void* vma_aligned_alloc(size_t alignment, size_t size) +{ + return _aligned_malloc(size, alignment); +} +#else +static void* vma_aligned_alloc(size_t alignment, size_t size) +{ + return aligned_alloc(alignment, size); +} +#endif + +#if defined(_WIN32) +static void vma_aligned_free(void* ptr) +{ + _aligned_free(ptr); +} +#else +static void vma_aligned_free(void* ptr) +{ + free(ptr); +} +#endif + +// If your compiler is not compatible with C++11 and definition of +// aligned_alloc() function is missing, uncommenting the following line may help: + +//#include + +// Normal assert to check for programmer's errors, especially in Debug configuration. +#ifndef VMA_ASSERT + #ifdef NDEBUG + #define VMA_ASSERT(expr) + #else + #define VMA_ASSERT(expr) assert(expr) + #endif +#endif + +// Assert that will be called very often, like inside data structures e.g. operator[]. +// Making it non-empty can make program slow.
+#ifndef VMA_HEAVY_ASSERT + #ifdef NDEBUG + #define VMA_HEAVY_ASSERT(expr) + #else + #define VMA_HEAVY_ASSERT(expr) //VMA_ASSERT(expr) + #endif +#endif + +#ifndef VMA_ALIGN_OF + #define VMA_ALIGN_OF(type) (__alignof(type)) +#endif + +#ifndef VMA_SYSTEM_ALIGNED_MALLOC + #define VMA_SYSTEM_ALIGNED_MALLOC(size, alignment) vma_aligned_alloc((alignment), (size)) +#endif + +#ifndef VMA_SYSTEM_ALIGNED_FREE + // VMA_SYSTEM_FREE is the old name, but might have been defined by the user + #if defined(VMA_SYSTEM_FREE) + #define VMA_SYSTEM_ALIGNED_FREE(ptr) VMA_SYSTEM_FREE(ptr) + #else + #define VMA_SYSTEM_ALIGNED_FREE(ptr) vma_aligned_free(ptr) + #endif +#endif + +#ifndef VMA_MIN + #define VMA_MIN(v1, v2) (std::min((v1), (v2))) +#endif + +#ifndef VMA_MAX + #define VMA_MAX(v1, v2) (std::max((v1), (v2))) +#endif + +#ifndef VMA_SWAP + #define VMA_SWAP(v1, v2) std::swap((v1), (v2)) +#endif + +#ifndef VMA_SORT + #define VMA_SORT(beg, end, cmp) std::sort(beg, end, cmp) +#endif + +#ifndef VMA_DEBUG_LOG + #define VMA_DEBUG_LOG(format, ...) + /* + #define VMA_DEBUG_LOG(format, ...) do { \ + printf(format, __VA_ARGS__); \ + printf("\n"); \ + } while(false) + */ +#endif + +// Define this macro to 1 to enable functions: vmaBuildStatsString, vmaFreeStatsString. 
+#if VMA_STATS_STRING_ENABLED + static inline void VmaUint32ToStr(char* outStr, size_t strLen, uint32_t num) + { + snprintf(outStr, strLen, "%u", static_cast(num)); + } + static inline void VmaUint64ToStr(char* outStr, size_t strLen, uint64_t num) + { + snprintf(outStr, strLen, "%llu", static_cast(num)); + } + static inline void VmaPtrToStr(char* outStr, size_t strLen, const void* ptr) + { + snprintf(outStr, strLen, "%p", ptr); + } +#endif + +#ifndef VMA_MUTEX + class VmaMutex + { + public: + void Lock() { m_Mutex.lock(); } + void Unlock() { m_Mutex.unlock(); } + bool TryLock() { return m_Mutex.try_lock(); } + private: + std::mutex m_Mutex; + }; + #define VMA_MUTEX VmaMutex +#endif + +// Read-write mutex, where "read" is shared access, "write" is exclusive access. +#ifndef VMA_RW_MUTEX + #if VMA_USE_STL_SHARED_MUTEX + // Use std::shared_mutex from C++17. + #include + class VmaRWMutex + { + public: + void LockRead() { m_Mutex.lock_shared(); } + void UnlockRead() { m_Mutex.unlock_shared(); } + bool TryLockRead() { return m_Mutex.try_lock_shared(); } + void LockWrite() { m_Mutex.lock(); } + void UnlockWrite() { m_Mutex.unlock(); } + bool TryLockWrite() { return m_Mutex.try_lock(); } + private: + std::shared_mutex m_Mutex; + }; + #define VMA_RW_MUTEX VmaRWMutex + #elif defined(_WIN32) && defined(WINVER) && WINVER >= 0x0600 + // Use SRWLOCK from WinAPI. + // Minimum supported client = Windows Vista, server = Windows Server 2008. 
+ class VmaRWMutex + { + public: + VmaRWMutex() { InitializeSRWLock(&m_Lock); } + void LockRead() { AcquireSRWLockShared(&m_Lock); } + void UnlockRead() { ReleaseSRWLockShared(&m_Lock); } + bool TryLockRead() { return TryAcquireSRWLockShared(&m_Lock) != FALSE; } + void LockWrite() { AcquireSRWLockExclusive(&m_Lock); } + void UnlockWrite() { ReleaseSRWLockExclusive(&m_Lock); } + bool TryLockWrite() { return TryAcquireSRWLockExclusive(&m_Lock) != FALSE; } + private: + SRWLOCK m_Lock; + }; + #define VMA_RW_MUTEX VmaRWMutex + #else + // Less efficient fallback: Use normal mutex. + class VmaRWMutex + { + public: + void LockRead() { m_Mutex.Lock(); } + void UnlockRead() { m_Mutex.Unlock(); } + bool TryLockRead() { return m_Mutex.TryLock(); } + void LockWrite() { m_Mutex.Lock(); } + void UnlockWrite() { m_Mutex.Unlock(); } + bool TryLockWrite() { return m_Mutex.TryLock(); } + private: + VMA_MUTEX m_Mutex; + }; + #define VMA_RW_MUTEX VmaRWMutex + #endif // #if VMA_USE_STL_SHARED_MUTEX +#endif // #ifndef VMA_RW_MUTEX + +/* +If providing your own implementation, you need to implement a subset of std::atomic. +*/ +#ifndef VMA_ATOMIC_UINT32 + #include + #define VMA_ATOMIC_UINT32 std::atomic +#endif + +#ifndef VMA_ATOMIC_UINT64 + #include + #define VMA_ATOMIC_UINT64 std::atomic +#endif + +#ifndef VMA_DEBUG_ALWAYS_DEDICATED_MEMORY + /** + Every allocation will have its own memory block. + Define to 1 for debugging purposes only. + */ + #define VMA_DEBUG_ALWAYS_DEDICATED_MEMORY (0) +#endif + +#ifndef VMA_DEBUG_ALIGNMENT + /** + Minimum alignment of all allocations, in bytes. + Set to more than 1 for debugging purposes only. Must be power of two. + */ + #define VMA_DEBUG_ALIGNMENT (1) +#endif + +#ifndef VMA_DEBUG_MARGIN + /** + Minimum margin before and after every allocation, in bytes. + Set nonzero for debugging purposes only. 
+ */ + #define VMA_DEBUG_MARGIN (0) +#endif + +#ifndef VMA_DEBUG_INITIALIZE_ALLOCATIONS + /** + Define this macro to 1 to automatically fill new allocations and destroyed + allocations with some bit pattern. + */ + #define VMA_DEBUG_INITIALIZE_ALLOCATIONS (0) +#endif + +#ifndef VMA_DEBUG_DETECT_CORRUPTION + /** + Define this macro to 1 together with non-zero value of VMA_DEBUG_MARGIN to + enable writing magic value to the margin before and after every allocation and + validating it, so that memory corruptions (out-of-bounds writes) are detected. + */ + #define VMA_DEBUG_DETECT_CORRUPTION (0) +#endif + +#ifndef VMA_DEBUG_GLOBAL_MUTEX + /** + Set this to 1 for debugging purposes only, to enable single mutex protecting all + entry calls to the library. Can be useful for debugging multithreading issues. + */ + #define VMA_DEBUG_GLOBAL_MUTEX (0) +#endif + +#ifndef VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY + /** + Minimum value for VkPhysicalDeviceLimits::bufferImageGranularity. + Set to more than 1 for debugging purposes only. Must be power of two. + */ + #define VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY (1) +#endif + +#ifndef VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT + /* + Set this to 1 to make VMA never exceed VkPhysicalDeviceLimits::maxMemoryAllocationCount + and return error instead of leaving up to Vulkan implementation what to do in such cases. + */ + #define VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT (0) +#endif + +#ifndef VMA_SMALL_HEAP_MAX_SIZE + /// Maximum size of a memory heap in Vulkan to consider it "small". + #define VMA_SMALL_HEAP_MAX_SIZE (1024ull * 1024 * 1024) +#endif + +#ifndef VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE + /// Default size of a block allocated as single VkDeviceMemory from a "large" heap. 
+ #define VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE (256ull * 1024 * 1024) +#endif + +#ifndef VMA_CLASS_NO_COPY + #define VMA_CLASS_NO_COPY(className) \ + private: \ + className(const className&) = delete; \ + className& operator=(const className&) = delete; +#endif + +static const uint32_t VMA_FRAME_INDEX_LOST = UINT32_MAX; + +// Decimal 2139416166, float NaN, little-endian binary 66 E6 84 7F. +static const uint32_t VMA_CORRUPTION_DETECTION_MAGIC_VALUE = 0x7F84E666; + +static const uint8_t VMA_ALLOCATION_FILL_PATTERN_CREATED = 0xDC; +static const uint8_t VMA_ALLOCATION_FILL_PATTERN_DESTROYED = 0xEF; + +/******************************************************************************* +END OF CONFIGURATION +*/ + +// # Copy of some Vulkan definitions so we don't need to check their existence just to handle few constants. + +static const uint32_t VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY = 0x00000040; +static const uint32_t VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD_COPY = 0x00000080; +static const uint32_t VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_COPY = 0x00020000; + +static const uint32_t VMA_ALLOCATION_INTERNAL_STRATEGY_MIN_OFFSET = 0x10000000u; + +static VkAllocationCallbacks VmaEmptyAllocationCallbacks = { + VMA_NULL, VMA_NULL, VMA_NULL, VMA_NULL, VMA_NULL, VMA_NULL }; + +// Returns number of bits set to 1 in (v). +static inline uint32_t VmaCountBitsSet(uint32_t v) +{ + uint32_t c = v - ((v >> 1) & 0x55555555); + c = ((c >> 2) & 0x33333333) + (c & 0x33333333); + c = ((c >> 4) + c) & 0x0F0F0F0F; + c = ((c >> 8) + c) & 0x00FF00FF; + c = ((c >> 16) + c) & 0x0000FFFF; + return c; +} + +/* +Returns true if given number is a power of two. +T must be unsigned integer number or signed integer but always nonnegative. +For 0 returns true. +*/ +template +inline bool VmaIsPow2(T x) +{ + return (x & (x-1)) == 0; +} + +// Aligns given value up to nearest multiple of align value. For example: VmaAlignUp(11, 8) = 16. +// Use types like uint32_t, uint64_t as T.
template<typename T>
static inline T VmaAlignUp(T val, T alignment)
{
    // The mask arithmetic below is only correct for a power-of-two alignment.
    VMA_HEAVY_ASSERT(VmaIsPow2(alignment));
    return (val + alignment - 1) & ~(alignment - 1);
}

// Aligns given value down to the nearest multiple of alignment. For example: VmaAlignDown(11, 8) = 8.
// alignment must be a power of two. Use types like uint32_t, uint64_t as T.
template<typename T>
static inline T VmaAlignDown(T val, T alignment)
{
    VMA_HEAVY_ASSERT(VmaIsPow2(alignment));
    return val & ~(alignment - 1);
}

// Division with mathematical rounding to nearest number.
// Half of the divisor is added before dividing, so for non-negative integers an
// exact half rounds up: VmaRoundDiv(7, 2) = 4.
template<typename T>
static inline T VmaRoundDiv(T x, T y)
{
    return (x + (y / static_cast<T>(2))) / y;
}

// Returns smallest power of 2 greater or equal to v.
// Because of unsigned wrap-around the result is 0 for v == 0 and for any v
// larger than the greatest power of 2 representable in the type.
static inline uint32_t VmaNextPow2(uint32_t v)
{
    v--;
    // Smear the highest set bit into every lower position, then add 1.
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v++;
    return v;
}
static inline uint64_t VmaNextPow2(uint64_t v)
{
    v--;
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;
    v |= v >> 32;
    v++;
    return v;
}

// Returns largest power of 2 less or equal to v.
+static inline uint32_t VmaPrevPow2(uint32_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v = v ^ (v >> 1); + return v; +} +static inline uint64_t VmaPrevPow2(uint64_t v) +{ + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + v = v ^ (v >> 1); + return v; +} + +static inline bool VmaStrIsEmpty(const char* pStr) +{ + return pStr == VMA_NULL || *pStr == '\0'; +} + +#if VMA_STATS_STRING_ENABLED + +static const char* VmaAlgorithmToStr(uint32_t algorithm) +{ + switch(algorithm) + { + case VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT: + return "Linear"; + case VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT: + return "Buddy"; + case 0: + return "Default"; + default: + VMA_ASSERT(0); + return ""; + } +} + +#endif // #if VMA_STATS_STRING_ENABLED + +#ifndef VMA_SORT + +template +Iterator VmaQuickSortPartition(Iterator beg, Iterator end, Compare cmp) +{ + Iterator centerValue = end; --centerValue; + Iterator insertIndex = beg; + for(Iterator memTypeIndex = beg; memTypeIndex < centerValue; ++memTypeIndex) + { + if(cmp(*memTypeIndex, *centerValue)) + { + if(insertIndex != memTypeIndex) + { + VMA_SWAP(*memTypeIndex, *insertIndex); + } + ++insertIndex; + } + } + if(insertIndex != centerValue) + { + VMA_SWAP(*insertIndex, *centerValue); + } + return insertIndex; +} + +template +void VmaQuickSort(Iterator beg, Iterator end, Compare cmp) +{ + if(beg < end) + { + Iterator it = VmaQuickSortPartition(beg, end, cmp); + VmaQuickSort(beg, it, cmp); + VmaQuickSort(it + 1, end, cmp); + } +} + +#define VMA_SORT(beg, end, cmp) VmaQuickSort(beg, end, cmp) + +#endif // #ifndef VMA_SORT + +/* +Returns true if two memory blocks occupy overlapping pages. +ResourceA must be in less memory offset than ResourceB. + +Algorithm is based on "Vulkan 1.0.39 - A Specification (with all registered Vulkan extensions)" +chapter 11.6 "Resource Memory Association", paragraph "Buffer-Image Granularity". 
+*/ +static inline bool VmaBlocksOnSamePage( + VkDeviceSize resourceAOffset, + VkDeviceSize resourceASize, + VkDeviceSize resourceBOffset, + VkDeviceSize pageSize) +{ + VMA_ASSERT(resourceAOffset + resourceASize <= resourceBOffset && resourceASize > 0 && pageSize > 0); + VkDeviceSize resourceAEnd = resourceAOffset + resourceASize - 1; + VkDeviceSize resourceAEndPage = resourceAEnd & ~(pageSize - 1); + VkDeviceSize resourceBStart = resourceBOffset; + VkDeviceSize resourceBStartPage = resourceBStart & ~(pageSize - 1); + return resourceAEndPage == resourceBStartPage; +} + +enum VmaSuballocationType +{ + VMA_SUBALLOCATION_TYPE_FREE = 0, + VMA_SUBALLOCATION_TYPE_UNKNOWN = 1, + VMA_SUBALLOCATION_TYPE_BUFFER = 2, + VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN = 3, + VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR = 4, + VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL = 5, + VMA_SUBALLOCATION_TYPE_MAX_ENUM = 0x7FFFFFFF +}; + +/* +Returns true if given suballocation types could conflict and must respect +VkPhysicalDeviceLimits::bufferImageGranularity. They conflict if one is buffer +or linear image and another one is optimal image. If type is unknown, behave +conservatively. 
+*/ +static inline bool VmaIsBufferImageGranularityConflict( + VmaSuballocationType suballocType1, + VmaSuballocationType suballocType2) +{ + if(suballocType1 > suballocType2) + { + VMA_SWAP(suballocType1, suballocType2); + } + + switch(suballocType1) + { + case VMA_SUBALLOCATION_TYPE_FREE: + return false; + case VMA_SUBALLOCATION_TYPE_UNKNOWN: + return true; + case VMA_SUBALLOCATION_TYPE_BUFFER: + return + suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN || + suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL; + case VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN: + return + suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN || + suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR || + suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL; + case VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR: + return + suballocType2 == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL; + case VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL: + return false; + default: + VMA_ASSERT(0); + return true; + } +} + +static void VmaWriteMagicValue(void* pData, VkDeviceSize offset) +{ +#if VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_DETECT_CORRUPTION + uint32_t* pDst = (uint32_t*)((char*)pData + offset); + const size_t numberCount = VMA_DEBUG_MARGIN / sizeof(uint32_t); + for(size_t i = 0; i < numberCount; ++i, ++pDst) + { + *pDst = VMA_CORRUPTION_DETECTION_MAGIC_VALUE; + } +#else + // no-op +#endif +} + +static bool VmaValidateMagicValue(const void* pData, VkDeviceSize offset) +{ +#if VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_DETECT_CORRUPTION + const uint32_t* pSrc = (const uint32_t*)((const char*)pData + offset); + const size_t numberCount = VMA_DEBUG_MARGIN / sizeof(uint32_t); + for(size_t i = 0; i < numberCount; ++i, ++pSrc) + { + if(*pSrc != VMA_CORRUPTION_DETECTION_MAGIC_VALUE) + { + return false; + } + } +#endif + return true; +} + +/* +Fills structure with parameters of an example buffer to be used for transfers +during GPU memory defragmentation. 
+*/ +static void VmaFillGpuDefragmentationBufferCreateInfo(VkBufferCreateInfo& outBufCreateInfo) +{ + memset(&outBufCreateInfo, 0, sizeof(outBufCreateInfo)); + outBufCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + outBufCreateInfo.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT; + outBufCreateInfo.size = (VkDeviceSize)VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE; // Example size. +} + +// Helper RAII class to lock a mutex in constructor and unlock it in destructor (at the end of scope). +struct VmaMutexLock +{ + VMA_CLASS_NO_COPY(VmaMutexLock) +public: + VmaMutexLock(VMA_MUTEX& mutex, bool useMutex = true) : + m_pMutex(useMutex ? &mutex : VMA_NULL) + { if(m_pMutex) { m_pMutex->Lock(); } } + ~VmaMutexLock() + { if(m_pMutex) { m_pMutex->Unlock(); } } +private: + VMA_MUTEX* m_pMutex; +}; + +// Helper RAII class to lock a RW mutex in constructor and unlock it in destructor (at the end of scope), for reading. +struct VmaMutexLockRead +{ + VMA_CLASS_NO_COPY(VmaMutexLockRead) +public: + VmaMutexLockRead(VMA_RW_MUTEX& mutex, bool useMutex) : + m_pMutex(useMutex ? &mutex : VMA_NULL) + { if(m_pMutex) { m_pMutex->LockRead(); } } + ~VmaMutexLockRead() { if(m_pMutex) { m_pMutex->UnlockRead(); } } +private: + VMA_RW_MUTEX* m_pMutex; +}; + +// Helper RAII class to lock a RW mutex in constructor and unlock it in destructor (at the end of scope), for writing. +struct VmaMutexLockWrite +{ + VMA_CLASS_NO_COPY(VmaMutexLockWrite) +public: + VmaMutexLockWrite(VMA_RW_MUTEX& mutex, bool useMutex) : + m_pMutex(useMutex ? 
&mutex : VMA_NULL) + { if(m_pMutex) { m_pMutex->LockWrite(); } } + ~VmaMutexLockWrite() { if(m_pMutex) { m_pMutex->UnlockWrite(); } } +private: + VMA_RW_MUTEX* m_pMutex; +}; + +#if VMA_DEBUG_GLOBAL_MUTEX + static VMA_MUTEX gDebugGlobalMutex; + #define VMA_DEBUG_GLOBAL_MUTEX_LOCK VmaMutexLock debugGlobalMutexLock(gDebugGlobalMutex, true); +#else + #define VMA_DEBUG_GLOBAL_MUTEX_LOCK +#endif + +// Minimum size of a free suballocation to register it in the free suballocation collection. +static const VkDeviceSize VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER = 16; + +/* +Performs binary search and returns iterator to first element that is greater or +equal to (key), according to comparison (cmp). + +Cmp should return true if first argument is less than second argument. + +Returned value is the found element, if present in the collection or place where +new element with value (key) should be inserted. +*/ +template +static IterT VmaBinaryFindFirstNotLess(IterT beg, IterT end, const KeyT &key, const CmpLess& cmp) +{ + size_t down = 0, up = (end - beg); + while(down < up) + { + const size_t mid = down + (up - down) / 2; // Overflow-safe midpoint calculation + if(cmp(*(beg+mid), key)) + { + down = mid + 1; + } + else + { + up = mid; + } + } + return beg + down; +} + +template +IterT VmaBinaryFindSorted(const IterT& beg, const IterT& end, const KeyT& value, const CmpLess& cmp) +{ + IterT it = VmaBinaryFindFirstNotLess( + beg, end, value, cmp); + if(it == end || + (!cmp(*it, value) && !cmp(value, *it))) + { + return it; + } + return end; +} + +/* +Returns true if all pointers in the array are not-null and unique. +Warning! O(n^2) complexity. Use only inside VMA_HEAVY_ASSERT. +T must be pointer type, e.g. VmaAllocation, VmaPool. 
+*/ +template +static bool VmaValidatePointerArray(uint32_t count, const T* arr) +{ + for (const auto i : c10::irange(count)) { + const T iPtr = arr[i]; + if(iPtr == VMA_NULL) + { + return false; + } + for(uint32_t j = i + 1; j < count; ++j) + { + if(iPtr == arr[j]) + { + return false; + } + } + } + return true; +} + +template +static inline void VmaPnextChainPushFront(MainT* mainStruct, NewT* newStruct) +{ + newStruct->pNext = mainStruct->pNext; + mainStruct->pNext = newStruct; +} + +//////////////////////////////////////////////////////////////////////////////// +// Memory allocation + +static void* VmaMalloc(const VkAllocationCallbacks* pAllocationCallbacks, size_t size, size_t alignment) +{ + void* result = VMA_NULL; + if((pAllocationCallbacks != VMA_NULL) && + (pAllocationCallbacks->pfnAllocation != VMA_NULL)) + { + result = (*pAllocationCallbacks->pfnAllocation)( + pAllocationCallbacks->pUserData, + size, + alignment, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + } + else + { + result = VMA_SYSTEM_ALIGNED_MALLOC(size, alignment); + } + VMA_ASSERT(result != VMA_NULL && "CPU memory allocation failed."); + return result; +} + +static void VmaFree(const VkAllocationCallbacks* pAllocationCallbacks, void* ptr) +{ + if((pAllocationCallbacks != VMA_NULL) && + (pAllocationCallbacks->pfnFree != VMA_NULL)) + { + (*pAllocationCallbacks->pfnFree)(pAllocationCallbacks->pUserData, ptr); + } + else + { + VMA_SYSTEM_ALIGNED_FREE(ptr); + } +} + +template +static T* VmaAllocate(const VkAllocationCallbacks* pAllocationCallbacks) +{ + return (T*)VmaMalloc(pAllocationCallbacks, sizeof(T), VMA_ALIGN_OF(T)); +} + +template +static T* VmaAllocateArray(const VkAllocationCallbacks* pAllocationCallbacks, size_t count) +{ + return (T*)VmaMalloc(pAllocationCallbacks, sizeof(T) * count, VMA_ALIGN_OF(T)); +} + +#define vma_new(allocator, type) new(VmaAllocate(allocator))(type) + +#define vma_new_array(allocator, type, count) new(VmaAllocateArray((allocator), (count)))(type) + +template +static 
void vma_delete(const VkAllocationCallbacks* pAllocationCallbacks, T* ptr) +{ + ptr->~T(); + VmaFree(pAllocationCallbacks, ptr); +} + +template +static void vma_delete_array(const VkAllocationCallbacks* pAllocationCallbacks, T* ptr, size_t count) +{ + if(ptr != VMA_NULL) + { + for(size_t i = count; i--; ) + { + ptr[i].~T(); + } + VmaFree(pAllocationCallbacks, ptr); + } +} + +static char* VmaCreateStringCopy(const VkAllocationCallbacks* allocs, const char* srcStr) +{ + if(srcStr != VMA_NULL) + { + const size_t len = strlen(srcStr); + char* const result = vma_new_array(allocs, char, len + 1); + memcpy(result, srcStr, len + 1); + return result; + } + else + { + return VMA_NULL; + } +} + +static void VmaFreeString(const VkAllocationCallbacks* allocs, char* str) +{ + if(str != VMA_NULL) + { + const size_t len = strlen(str); + vma_delete_array(allocs, str, len + 1); + } +} + +// STL-compatible allocator. +template +class VmaStlAllocator +{ +public: + const VkAllocationCallbacks* const m_pCallbacks; + typedef T value_type; + + VmaStlAllocator(const VkAllocationCallbacks* pCallbacks) : m_pCallbacks(pCallbacks) { } + template VmaStlAllocator(const VmaStlAllocator& src) : m_pCallbacks(src.m_pCallbacks) { } + + T* allocate(size_t n) { return VmaAllocateArray(m_pCallbacks, n); } + void deallocate(T* p, size_t n) { VmaFree(m_pCallbacks, p); } + + template + bool operator==(const VmaStlAllocator& rhs) const + { + return m_pCallbacks == rhs.m_pCallbacks; + } + template + bool operator!=(const VmaStlAllocator& rhs) const + { + return m_pCallbacks != rhs.m_pCallbacks; + } + + VmaStlAllocator& operator=(const VmaStlAllocator& x) = delete; +}; + +#if VMA_USE_STL_VECTOR + +#define VmaVector std::vector + +template +static void VmaVectorInsert(std::vector& vec, size_t index, const T& item) +{ + vec.insert(vec.begin() + index, item); +} + +template +static void VmaVectorRemove(std::vector& vec, size_t index) +{ + vec.erase(vec.begin() + index); +} + +#else // #if VMA_USE_STL_VECTOR + 
+/* Class with interface compatible with subset of std::vector. +T must be POD because constructors and destructors are not called and memcpy is +used for these objects. */ +template +class VmaVector +{ +public: + typedef T value_type; + + VmaVector(const AllocatorT& allocator) : + m_Allocator(allocator), + m_pArray(VMA_NULL), + m_Count(0), + m_Capacity(0) + { + } + + VmaVector(size_t count, const AllocatorT& allocator) : + m_Allocator(allocator), + m_pArray(count ? (T*)VmaAllocateArray(allocator.m_pCallbacks, count) : VMA_NULL), + m_Count(count), + m_Capacity(count) + { + } + + // This version of the constructor is here for compatibility with pre-C++14 std::vector. + // value is unused. + VmaVector(size_t count, const T& value, const AllocatorT& allocator) + : VmaVector(count, allocator) {} + + VmaVector(const VmaVector& src) : + m_Allocator(src.m_Allocator), + m_pArray(src.m_Count ? (T*)VmaAllocateArray(src.m_Allocator.m_pCallbacks, src.m_Count) : VMA_NULL), + m_Count(src.m_Count), + m_Capacity(src.m_Count) + { + if(m_Count != 0) + { + memcpy(m_pArray, src.m_pArray, m_Count * sizeof(T)); + } + } + + ~VmaVector() + { + VmaFree(m_Allocator.m_pCallbacks, m_pArray); + } + + VmaVector& operator=(const VmaVector& rhs) + { + if(&rhs != this) + { + resize(rhs.m_Count); + if(m_Count != 0) + { + memcpy(m_pArray, rhs.m_pArray, m_Count * sizeof(T)); + } + } + return *this; + } + + bool empty() const { return m_Count == 0; } + size_t size() const { return m_Count; } + T* data() { return m_pArray; } + const T* data() const { return m_pArray; } + + T& operator[](size_t index) + { + VMA_HEAVY_ASSERT(index < m_Count); + return m_pArray[index]; + } + const T& operator[](size_t index) const + { + VMA_HEAVY_ASSERT(index < m_Count); + return m_pArray[index]; + } + + T& front() + { + VMA_HEAVY_ASSERT(m_Count > 0); + return m_pArray[0]; + } + const T& front() const + { + VMA_HEAVY_ASSERT(m_Count > 0); + return m_pArray[0]; + } + T& back() + { + VMA_HEAVY_ASSERT(m_Count > 0); + return 
m_pArray[m_Count - 1]; + } + const T& back() const + { + VMA_HEAVY_ASSERT(m_Count > 0); + return m_pArray[m_Count - 1]; + } + + void reserve(size_t newCapacity, bool freeMemory = false) + { + newCapacity = VMA_MAX(newCapacity, m_Count); + + if((newCapacity < m_Capacity) && !freeMemory) + { + newCapacity = m_Capacity; + } + + if(newCapacity != m_Capacity) + { + T* const newArray = newCapacity ? VmaAllocateArray(m_Allocator, newCapacity) : VMA_NULL; + if(m_Count != 0) + { + memcpy(newArray, m_pArray, m_Count * sizeof(T)); + } + VmaFree(m_Allocator.m_pCallbacks, m_pArray); + m_Capacity = newCapacity; + m_pArray = newArray; + } + } + + void resize(size_t newCount, bool freeMemory = false) + { + size_t newCapacity = m_Capacity; + if(newCount > m_Capacity) + { + newCapacity = VMA_MAX(newCount, VMA_MAX(m_Capacity * 3 / 2, (size_t)8)); + } + else if(freeMemory) + { + newCapacity = newCount; + } + + if(newCapacity != m_Capacity) + { + T* const newArray = newCapacity ? VmaAllocateArray(m_Allocator.m_pCallbacks, newCapacity) : VMA_NULL; + const size_t elementsToCopy = VMA_MIN(m_Count, newCount); + if(elementsToCopy != 0) + { + memcpy(newArray, m_pArray, elementsToCopy * sizeof(T)); + } + VmaFree(m_Allocator.m_pCallbacks, m_pArray); + m_Capacity = newCapacity; + m_pArray = newArray; + } + + m_Count = newCount; + } + + void clear(bool freeMemory = false) + { + resize(0, freeMemory); + } + + void insert(size_t index, const T& src) + { + VMA_HEAVY_ASSERT(index <= m_Count); + const size_t oldCount = size(); + resize(oldCount + 1); + if(index < oldCount) + { + memmove(m_pArray + (index + 1), m_pArray + index, (oldCount - index) * sizeof(T)); + } + m_pArray[index] = src; + } + + void remove(size_t index) + { + VMA_HEAVY_ASSERT(index < m_Count); + const size_t oldCount = size(); + if(index < oldCount - 1) + { + memmove(m_pArray + index, m_pArray + (index + 1), (oldCount - index - 1) * sizeof(T)); + } + resize(oldCount - 1); + } + + void push_back(const T& src) + { + const size_t 
newIndex = size(); + resize(newIndex + 1); + m_pArray[newIndex] = src; + } + + void pop_back() + { + VMA_HEAVY_ASSERT(m_Count > 0); + resize(size() - 1); + } + + void push_front(const T& src) + { + insert(0, src); + } + + void pop_front() + { + VMA_HEAVY_ASSERT(m_Count > 0); + remove(0); + } + + typedef T* iterator; + + iterator begin() { return m_pArray; } + iterator end() { return m_pArray + m_Count; } + +private: + AllocatorT m_Allocator; + T* m_pArray; + size_t m_Count; + size_t m_Capacity; +}; + +template +static void VmaVectorInsert(VmaVector& vec, size_t index, const T& item) +{ + vec.insert(index, item); +} + +template +static void VmaVectorRemove(VmaVector& vec, size_t index) +{ + vec.remove(index); +} + +#endif // #if VMA_USE_STL_VECTOR + +template +size_t VmaVectorInsertSorted(VectorT& vector, const typename VectorT::value_type& value) +{ + const size_t indexToInsert = VmaBinaryFindFirstNotLess( + vector.data(), + vector.data() + vector.size(), + value, + CmpLess()) - vector.data(); + VmaVectorInsert(vector, indexToInsert, value); + return indexToInsert; +} + +template +bool VmaVectorRemoveSorted(VectorT& vector, const typename VectorT::value_type& value) +{ + CmpLess comparator; + typename VectorT::iterator it = VmaBinaryFindFirstNotLess( + vector.begin(), + vector.end(), + value, + comparator); + if((it != vector.end()) && !comparator(*it, value) && !comparator(value, *it)) + { + size_t indexToRemove = it - vector.begin(); + VmaVectorRemove(vector, indexToRemove); + return true; + } + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// class VmaSmallVector + +/* +This is a vector (a variable-sized array), optimized for the case when the array is small. + +It contains some number of elements in-place, which allows it to avoid heap allocation +when the actual number of elements is below that threshold. This allows normal "small" +cases to be fast without losing generality for large inputs. 
+*/ + +template +class VmaSmallVector +{ +public: + typedef T value_type; + + VmaSmallVector(const AllocatorT& allocator) : + m_Count(0), + m_DynamicArray(allocator) + { + } + VmaSmallVector(size_t count, const AllocatorT& allocator) : + m_Count(count), + m_DynamicArray(count > N ? count : 0, allocator) + { + } + template + VmaSmallVector(const VmaSmallVector& src) = delete; + template + VmaSmallVector& operator=(const VmaSmallVector& rhs) = delete; + + bool empty() const { return m_Count == 0; } + size_t size() const { return m_Count; } + T* data() { return m_Count > N ? m_DynamicArray.data() : m_StaticArray; } + const T* data() const { return m_Count > N ? m_DynamicArray.data() : m_StaticArray; } + + T& operator[](size_t index) + { + VMA_HEAVY_ASSERT(index < m_Count); + return data()[index]; + } + const T& operator[](size_t index) const + { + VMA_HEAVY_ASSERT(index < m_Count); + return data()[index]; + } + + T& front() + { + VMA_HEAVY_ASSERT(m_Count > 0); + return data()[0]; + } + const T& front() const + { + VMA_HEAVY_ASSERT(m_Count > 0); + return data()[0]; + } + T& back() + { + VMA_HEAVY_ASSERT(m_Count > 0); + return data()[m_Count - 1]; + } + const T& back() const + { + VMA_HEAVY_ASSERT(m_Count > 0); + return data()[m_Count - 1]; + } + + void resize(size_t newCount, bool freeMemory = false) + { + if(newCount > N && m_Count > N) + { + // Any direction, staying in m_DynamicArray + m_DynamicArray.resize(newCount, freeMemory); + } + else if(newCount > N && m_Count <= N) + { + // Growing, moving from m_StaticArray to m_DynamicArray + m_DynamicArray.resize(newCount, freeMemory); + if(m_Count > 0) + { + memcpy(m_DynamicArray.data(), m_StaticArray, m_Count * sizeof(T)); + } + } + else if(newCount <= N && m_Count > N) + { + // Shrinking, moving from m_DynamicArray to m_StaticArray + if(newCount > 0) + { + memcpy(m_StaticArray, m_DynamicArray.data(), newCount * sizeof(T)); + } + m_DynamicArray.resize(0, freeMemory); + } + else + { + // Any direction, staying in 
m_StaticArray - nothing to do here + } + m_Count = newCount; + } + + void clear(bool freeMemory = false) + { + m_DynamicArray.clear(freeMemory); + m_Count = 0; + } + + void insert(size_t index, const T& src) + { + VMA_HEAVY_ASSERT(index <= m_Count); + const size_t oldCount = size(); + resize(oldCount + 1); + T* const dataPtr = data(); + if(index < oldCount) + { + // I know, this could be more optimal for case where memmove can be memcpy directly from m_StaticArray to m_DynamicArray. + memmove(dataPtr + (index + 1), dataPtr + index, (oldCount - index) * sizeof(T)); + } + dataPtr[index] = src; + } + + void remove(size_t index) + { + VMA_HEAVY_ASSERT(index < m_Count); + const size_t oldCount = size(); + if(index < oldCount - 1) + { + // I know, this could be more optimal for case where memmove can be memcpy directly from m_DynamicArray to m_StaticArray. + T* const dataPtr = data(); + memmove(dataPtr + index, dataPtr + (index + 1), (oldCount - index - 1) * sizeof(T)); + } + resize(oldCount - 1); + } + + void push_back(const T& src) + { + const size_t newIndex = size(); + resize(newIndex + 1); + data()[newIndex] = src; + } + + void pop_back() + { + VMA_HEAVY_ASSERT(m_Count > 0); + resize(size() - 1); + } + + void push_front(const T& src) + { + insert(0, src); + } + + void pop_front() + { + VMA_HEAVY_ASSERT(m_Count > 0); + remove(0); + } + + typedef T* iterator; + + iterator begin() { return data(); } + iterator end() { return data() + m_Count; } + +private: + size_t m_Count; + T m_StaticArray[N]; // Used when m_Size <= N + VmaVector m_DynamicArray; // Used when m_Size > N +}; + +//////////////////////////////////////////////////////////////////////////////// +// class VmaPoolAllocator + +/* +Allocator for objects of type T using a list of arrays (pools) to speed up +allocation. Number of elements that can be allocated is not bounded because +allocator can create multiple blocks. 
+*/ +template +class VmaPoolAllocator +{ + VMA_CLASS_NO_COPY(VmaPoolAllocator) +public: + VmaPoolAllocator(const VkAllocationCallbacks* pAllocationCallbacks, uint32_t firstBlockCapacity); + ~VmaPoolAllocator(); + template T* Alloc(Types... args); + void Free(T* ptr); + +private: + union Item + { + uint32_t NextFreeIndex; + alignas(T) char Value[sizeof(T)]; + }; + + struct ItemBlock + { + Item* pItems; + uint32_t Capacity; + uint32_t FirstFreeIndex; + }; + + const VkAllocationCallbacks* m_pAllocationCallbacks; + const uint32_t m_FirstBlockCapacity; + VmaVector< ItemBlock, VmaStlAllocator > m_ItemBlocks; + + ItemBlock& CreateNewBlock(); +}; + +template +VmaPoolAllocator::VmaPoolAllocator(const VkAllocationCallbacks* pAllocationCallbacks, uint32_t firstBlockCapacity) : + m_pAllocationCallbacks(pAllocationCallbacks), + m_FirstBlockCapacity(firstBlockCapacity), + m_ItemBlocks(VmaStlAllocator(pAllocationCallbacks)) +{ + VMA_ASSERT(m_FirstBlockCapacity > 1); +} + +template +VmaPoolAllocator::~VmaPoolAllocator() +{ + for(size_t i = m_ItemBlocks.size(); i--; ) + vma_delete_array(m_pAllocationCallbacks, m_ItemBlocks[i].pItems, m_ItemBlocks[i].Capacity); + m_ItemBlocks.clear(); +} + +template +template T* VmaPoolAllocator::Alloc(Types... args) +{ + for(size_t i = m_ItemBlocks.size(); i--; ) + { + ItemBlock& block = m_ItemBlocks[i]; + // This block has some free items: Use first one. + if(block.FirstFreeIndex != UINT32_MAX) + { + Item* const pItem = &block.pItems[block.FirstFreeIndex]; + block.FirstFreeIndex = pItem->NextFreeIndex; + T* result = (T*)&pItem->Value; + new(result)T(std::forward(args)...); // Explicit constructor call. + return result; + } + } + + // No block has free item: Create new one and use it. + ItemBlock& newBlock = CreateNewBlock(); + Item* const pItem = &newBlock.pItems[0]; + newBlock.FirstFreeIndex = pItem->NextFreeIndex; + T* result = (T*)&pItem->Value; + new(result)T(std::forward(args)...); // Explicit constructor call. 
+ return result; +} + +template +void VmaPoolAllocator::Free(T* ptr) +{ + // Search all memory blocks to find ptr. + for(size_t i = m_ItemBlocks.size(); i--; ) + { + ItemBlock& block = m_ItemBlocks[i]; + + // Casting to union. + Item* pItemPtr; + memcpy(&pItemPtr, &ptr, sizeof(pItemPtr)); + + // Check if pItemPtr is in address range of this block. + if((pItemPtr >= block.pItems) && (pItemPtr < block.pItems + block.Capacity)) + { + ptr->~T(); // Explicit destructor call. + const uint32_t index = static_cast(pItemPtr - block.pItems); + pItemPtr->NextFreeIndex = block.FirstFreeIndex; + block.FirstFreeIndex = index; + return; + } + } + VMA_ASSERT(0 && "Pointer doesn't belong to this memory pool."); +} + +template +typename VmaPoolAllocator::ItemBlock& VmaPoolAllocator::CreateNewBlock() +{ + const uint32_t newBlockCapacity = m_ItemBlocks.empty() ? + m_FirstBlockCapacity : m_ItemBlocks.back().Capacity * 3 / 2; + + const ItemBlock newBlock = { + vma_new_array(m_pAllocationCallbacks, Item, newBlockCapacity), + newBlockCapacity, + 0 }; + + m_ItemBlocks.push_back(newBlock); + + // Setup singly-linked list of all free items in this block. + for(uint32_t i = 0; i < newBlockCapacity - 1; ++i) + newBlock.pItems[i].NextFreeIndex = i + 1; + newBlock.pItems[newBlockCapacity - 1].NextFreeIndex = UINT32_MAX; + return m_ItemBlocks.back(); +} + +//////////////////////////////////////////////////////////////////////////////// +// class VmaRawList, VmaList + +#if VMA_USE_STL_LIST + +#define VmaList std::list + +#else // #if VMA_USE_STL_LIST + +template +struct VmaListItem +{ + VmaListItem* pPrev; + VmaListItem* pNext; + T Value; +}; + +// Doubly linked list. 
+template +class VmaRawList +{ + VMA_CLASS_NO_COPY(VmaRawList) +public: + typedef VmaListItem ItemType; + + VmaRawList(const VkAllocationCallbacks* pAllocationCallbacks); + ~VmaRawList(); + void Clear(); + + size_t GetCount() const { return m_Count; } + bool IsEmpty() const { return m_Count == 0; } + + ItemType* Front() { return m_pFront; } + const ItemType* Front() const { return m_pFront; } + ItemType* Back() { return m_pBack; } + const ItemType* Back() const { return m_pBack; } + + ItemType* PushBack(); + ItemType* PushFront(); + ItemType* PushBack(const T& value); + ItemType* PushFront(const T& value); + void PopBack(); + void PopFront(); + + // Item can be null - it means PushBack. + ItemType* InsertBefore(ItemType* pItem); + // Item can be null - it means PushFront. + ItemType* InsertAfter(ItemType* pItem); + + ItemType* InsertBefore(ItemType* pItem, const T& value); + ItemType* InsertAfter(ItemType* pItem, const T& value); + + void Remove(ItemType* pItem); + +private: + const VkAllocationCallbacks* const m_pAllocationCallbacks; + VmaPoolAllocator m_ItemAllocator; + ItemType* m_pFront; + ItemType* m_pBack; + size_t m_Count; +}; + +template +VmaRawList::VmaRawList(const VkAllocationCallbacks* pAllocationCallbacks) : + m_pAllocationCallbacks(pAllocationCallbacks), + m_ItemAllocator(pAllocationCallbacks, 128), + m_pFront(VMA_NULL), + m_pBack(VMA_NULL), + m_Count(0) +{ +} + +template +VmaRawList::~VmaRawList() +{ + // Intentionally not calling Clear, because that would be unnecessary + // computations to return all items to m_ItemAllocator as free. 
+} + +template +void VmaRawList::Clear() +{ + if(IsEmpty() == false) + { + ItemType* pItem = m_pBack; + while(pItem != VMA_NULL) + { + ItemType* const pPrevItem = pItem->pPrev; + m_ItemAllocator.Free(pItem); + pItem = pPrevItem; + } + m_pFront = VMA_NULL; + m_pBack = VMA_NULL; + m_Count = 0; + } +} + +template +VmaListItem* VmaRawList::PushBack() +{ + ItemType* const pNewItem = m_ItemAllocator.Alloc(); + pNewItem->pNext = VMA_NULL; + if(IsEmpty()) + { + pNewItem->pPrev = VMA_NULL; + m_pFront = pNewItem; + m_pBack = pNewItem; + m_Count = 1; + } + else + { + pNewItem->pPrev = m_pBack; + m_pBack->pNext = pNewItem; + m_pBack = pNewItem; + ++m_Count; + } + return pNewItem; +} + +template +VmaListItem* VmaRawList::PushFront() +{ + ItemType* const pNewItem = m_ItemAllocator.Alloc(); + pNewItem->pPrev = VMA_NULL; + if(IsEmpty()) + { + pNewItem->pNext = VMA_NULL; + m_pFront = pNewItem; + m_pBack = pNewItem; + m_Count = 1; + } + else + { + pNewItem->pNext = m_pFront; + m_pFront->pPrev = pNewItem; + m_pFront = pNewItem; + ++m_Count; + } + return pNewItem; +} + +template +VmaListItem* VmaRawList::PushBack(const T& value) +{ + ItemType* const pNewItem = PushBack(); + pNewItem->Value = value; + return pNewItem; +} + +template +VmaListItem* VmaRawList::PushFront(const T& value) +{ + ItemType* const pNewItem = PushFront(); + pNewItem->Value = value; + return pNewItem; +} + +template +void VmaRawList::PopBack() +{ + VMA_HEAVY_ASSERT(m_Count > 0); + ItemType* const pBackItem = m_pBack; + ItemType* const pPrevItem = pBackItem->pPrev; + if(pPrevItem != VMA_NULL) + { + pPrevItem->pNext = VMA_NULL; + } + m_pBack = pPrevItem; + m_ItemAllocator.Free(pBackItem); + --m_Count; +} + +template +void VmaRawList::PopFront() +{ + VMA_HEAVY_ASSERT(m_Count > 0); + ItemType* const pFrontItem = m_pFront; + ItemType* const pNextItem = pFrontItem->pNext; + if(pNextItem != VMA_NULL) + { + pNextItem->pPrev = VMA_NULL; + } + m_pFront = pNextItem; + m_ItemAllocator.Free(pFrontItem); + --m_Count; +} + 
+template +void VmaRawList::Remove(ItemType* pItem) +{ + VMA_HEAVY_ASSERT(pItem != VMA_NULL); + VMA_HEAVY_ASSERT(m_Count > 0); + + if(pItem->pPrev != VMA_NULL) + { + pItem->pPrev->pNext = pItem->pNext; + } + else + { + VMA_HEAVY_ASSERT(m_pFront == pItem); + m_pFront = pItem->pNext; + } + + if(pItem->pNext != VMA_NULL) + { + pItem->pNext->pPrev = pItem->pPrev; + } + else + { + VMA_HEAVY_ASSERT(m_pBack == pItem); + m_pBack = pItem->pPrev; + } + + m_ItemAllocator.Free(pItem); + --m_Count; +} + +template +VmaListItem* VmaRawList::InsertBefore(ItemType* pItem) +{ + if(pItem != VMA_NULL) + { + ItemType* const prevItem = pItem->pPrev; + ItemType* const newItem = m_ItemAllocator.Alloc(); + newItem->pPrev = prevItem; + newItem->pNext = pItem; + pItem->pPrev = newItem; + if(prevItem != VMA_NULL) + { + prevItem->pNext = newItem; + } + else + { + VMA_HEAVY_ASSERT(m_pFront == pItem); + m_pFront = newItem; + } + ++m_Count; + return newItem; + } + else + return PushBack(); +} + +template +VmaListItem* VmaRawList::InsertAfter(ItemType* pItem) +{ + if(pItem != VMA_NULL) + { + ItemType* const nextItem = pItem->pNext; + ItemType* const newItem = m_ItemAllocator.Alloc(); + newItem->pNext = nextItem; + newItem->pPrev = pItem; + pItem->pNext = newItem; + if(nextItem != VMA_NULL) + { + nextItem->pPrev = newItem; + } + else + { + VMA_HEAVY_ASSERT(m_pBack == pItem); + m_pBack = newItem; + } + ++m_Count; + return newItem; + } + else + return PushFront(); +} + +template +VmaListItem* VmaRawList::InsertBefore(ItemType* pItem, const T& value) +{ + ItemType* const newItem = InsertBefore(pItem); + newItem->Value = value; + return newItem; +} + +template +VmaListItem* VmaRawList::InsertAfter(ItemType* pItem, const T& value) +{ + ItemType* const newItem = InsertAfter(pItem); + newItem->Value = value; + return newItem; +} + +template +class VmaList +{ + VMA_CLASS_NO_COPY(VmaList) +public: + class iterator + { + public: + iterator() : + m_pList(VMA_NULL), + m_pItem(VMA_NULL) + { + } + + T& 
operator*() const + { + VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); + return m_pItem->Value; + } + T* operator->() const + { + VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); + return &m_pItem->Value; + } + + iterator& operator++() + { + VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); + m_pItem = m_pItem->pNext; + return *this; + } + iterator& operator--() + { + if(m_pItem != VMA_NULL) + { + m_pItem = m_pItem->pPrev; + } + else + { + VMA_HEAVY_ASSERT(!m_pList->IsEmpty()); + m_pItem = m_pList->Back(); + } + return *this; + } + + iterator operator++(int) + { + iterator result = *this; + ++*this; + return result; + } + iterator operator--(int) + { + iterator result = *this; + --*this; + return result; + } + + bool operator==(const iterator& rhs) const + { + VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); + return m_pItem == rhs.m_pItem; + } + bool operator!=(const iterator& rhs) const + { + VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); + return m_pItem != rhs.m_pItem; + } + + private: + VmaRawList* m_pList; + VmaListItem* m_pItem; + + iterator(VmaRawList* pList, VmaListItem* pItem) : + m_pList(pList), + m_pItem(pItem) + { + } + + friend class VmaList; + }; + + class const_iterator + { + public: + const_iterator() : + m_pList(VMA_NULL), + m_pItem(VMA_NULL) + { + } + + const_iterator(const iterator& src) : + m_pList(src.m_pList), + m_pItem(src.m_pItem) + { + } + + const T& operator*() const + { + VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); + return m_pItem->Value; + } + const T* operator->() const + { + VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); + return &m_pItem->Value; + } + + const_iterator& operator++() + { + VMA_HEAVY_ASSERT(m_pItem != VMA_NULL); + m_pItem = m_pItem->pNext; + return *this; + } + const_iterator& operator--() + { + if(m_pItem != VMA_NULL) + { + m_pItem = m_pItem->pPrev; + } + else + { + VMA_HEAVY_ASSERT(!m_pList->IsEmpty()); + m_pItem = m_pList->Back(); + } + return *this; + } + + const_iterator operator++(int) + { + const_iterator result = *this; + ++*this; + return result; + } + 
const_iterator operator--(int) + { + const_iterator result = *this; + --*this; + return result; + } + + bool operator==(const const_iterator& rhs) const + { + VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); + return m_pItem == rhs.m_pItem; + } + bool operator!=(const const_iterator& rhs) const + { + VMA_HEAVY_ASSERT(m_pList == rhs.m_pList); + return m_pItem != rhs.m_pItem; + } + + private: + const_iterator(const VmaRawList* pList, const VmaListItem* pItem) : + m_pList(pList), + m_pItem(pItem) + { + } + + const VmaRawList* m_pList; + const VmaListItem* m_pItem; + + friend class VmaList; + }; + + VmaList(const AllocatorT& allocator) : m_RawList(allocator.m_pCallbacks) { } + + bool empty() const { return m_RawList.IsEmpty(); } + size_t size() const { return m_RawList.GetCount(); } + + iterator begin() { return iterator(&m_RawList, m_RawList.Front()); } + iterator end() { return iterator(&m_RawList, VMA_NULL); } + + const_iterator cbegin() const { return const_iterator(&m_RawList, m_RawList.Front()); } + const_iterator cend() const { return const_iterator(&m_RawList, VMA_NULL); } + + void clear() { m_RawList.Clear(); } + void push_back(const T& value) { m_RawList.PushBack(value); } + void erase(iterator it) { m_RawList.Remove(it.m_pItem); } + iterator insert(iterator it, const T& value) { return iterator(&m_RawList, m_RawList.InsertBefore(it.m_pItem, value)); } + +private: + VmaRawList m_RawList; +}; + +#endif // #if VMA_USE_STL_LIST + +//////////////////////////////////////////////////////////////////////////////// +// class VmaMap + +// Unused in this version. 
+#if 0 + +#if VMA_USE_STL_UNORDERED_MAP + +#define VmaPair std::pair + +#define VMA_MAP_TYPE(KeyT, ValueT) \ + std::unordered_map< KeyT, ValueT, std::hash, std::equal_to, VmaStlAllocator< std::pair > > + +#else // #if VMA_USE_STL_UNORDERED_MAP + +template +struct VmaPair +{ + T1 first; + T2 second; + + VmaPair() : first(), second() { } + VmaPair(const T1& firstSrc, const T2& secondSrc) : first(firstSrc), second(secondSrc) { } +}; + +/* Class compatible with subset of interface of std::unordered_map. +KeyT, ValueT must be POD because they will be stored in VmaVector. +*/ +template +class VmaMap +{ +public: + typedef VmaPair PairType; + typedef PairType* iterator; + + VmaMap(const VmaStlAllocator& allocator) : m_Vector(allocator) { } + + iterator begin() { return m_Vector.begin(); } + iterator end() { return m_Vector.end(); } + + void insert(const PairType& pair); + iterator find(const KeyT& key); + void erase(iterator it); + +private: + VmaVector< PairType, VmaStlAllocator > m_Vector; +}; + +#define VMA_MAP_TYPE(KeyT, ValueT) VmaMap + +template +struct VmaPairFirstLess +{ + bool operator()(const VmaPair& lhs, const VmaPair& rhs) const + { + return lhs.first < rhs.first; + } + bool operator()(const VmaPair& lhs, const FirstT& rhsFirst) const + { + return lhs.first < rhsFirst; + } +}; + +template +void VmaMap::insert(const PairType& pair) +{ + const size_t indexToInsert = VmaBinaryFindFirstNotLess( + m_Vector.data(), + m_Vector.data() + m_Vector.size(), + pair, + VmaPairFirstLess()) - m_Vector.data(); + VmaVectorInsert(m_Vector, indexToInsert, pair); +} + +template +VmaPair* VmaMap::find(const KeyT& key) +{ + PairType* it = VmaBinaryFindFirstNotLess( + m_Vector.data(), + m_Vector.data() + m_Vector.size(), + key, + VmaPairFirstLess()); + if((it != m_Vector.end()) && (it->first == key)) + { + return it; + } + else + { + return m_Vector.end(); + } +} + +template +void VmaMap::erase(iterator it) +{ + VmaVectorRemove(m_Vector, it - m_Vector.begin()); +} + +#endif // #if 
VMA_USE_STL_UNORDERED_MAP + +#endif // #if 0 + +//////////////////////////////////////////////////////////////////////////////// + +class VmaDeviceMemoryBlock; + +enum VMA_CACHE_OPERATION { VMA_CACHE_FLUSH, VMA_CACHE_INVALIDATE }; + +struct VmaAllocation_T +{ +private: + static const uint8_t MAP_COUNT_FLAG_PERSISTENT_MAP = 0x80; + + enum FLAGS + { + FLAG_USER_DATA_STRING = 0x01, + }; + +public: + enum ALLOCATION_TYPE + { + ALLOCATION_TYPE_NONE, + ALLOCATION_TYPE_BLOCK, + ALLOCATION_TYPE_DEDICATED, + }; + + /* + This struct is allocated using VmaPoolAllocator. + */ + + VmaAllocation_T(uint32_t currentFrameIndex, bool userDataString) : + m_Alignment{1}, + m_Size{0}, + m_pUserData{VMA_NULL}, + m_LastUseFrameIndex{currentFrameIndex}, + m_MemoryTypeIndex{0}, + m_Type{(uint8_t)ALLOCATION_TYPE_NONE}, + m_SuballocationType{(uint8_t)VMA_SUBALLOCATION_TYPE_UNKNOWN}, + m_MapCount{0}, + m_Flags{userDataString ? (uint8_t)FLAG_USER_DATA_STRING : (uint8_t)0} + { +#if VMA_STATS_STRING_ENABLED + m_CreationFrameIndex = currentFrameIndex; + m_BufferImageUsage = 0; +#endif + } + + ~VmaAllocation_T() + { + VMA_ASSERT((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) == 0 && "Allocation was not unmapped before destruction."); + + // Check if owned string was freed. + VMA_ASSERT(m_pUserData == VMA_NULL); + } + + void InitBlockAllocation( + VmaDeviceMemoryBlock* block, + VkDeviceSize offset, + VkDeviceSize alignment, + VkDeviceSize size, + uint32_t memoryTypeIndex, + VmaSuballocationType suballocationType, + bool mapped, + bool canBecomeLost) + { + VMA_ASSERT(m_Type == ALLOCATION_TYPE_NONE); + VMA_ASSERT(block != VMA_NULL); + m_Type = (uint8_t)ALLOCATION_TYPE_BLOCK; + m_Alignment = alignment; + m_Size = size; + m_MemoryTypeIndex = memoryTypeIndex; + m_MapCount = mapped ? 
MAP_COUNT_FLAG_PERSISTENT_MAP : 0; + m_SuballocationType = (uint8_t)suballocationType; + m_BlockAllocation.m_Block = block; + m_BlockAllocation.m_Offset = offset; + m_BlockAllocation.m_CanBecomeLost = canBecomeLost; + } + + void InitLost() + { + VMA_ASSERT(m_Type == ALLOCATION_TYPE_NONE); + VMA_ASSERT(m_LastUseFrameIndex.load() == VMA_FRAME_INDEX_LOST); + m_Type = (uint8_t)ALLOCATION_TYPE_BLOCK; + m_MemoryTypeIndex = 0; + m_BlockAllocation.m_Block = VMA_NULL; + m_BlockAllocation.m_Offset = 0; + m_BlockAllocation.m_CanBecomeLost = true; + } + + void ChangeBlockAllocation( + VmaAllocator hAllocator, + VmaDeviceMemoryBlock* block, + VkDeviceSize offset); + + void ChangeOffset(VkDeviceSize newOffset); + + // pMappedData not null means allocation is created with MAPPED flag. + void InitDedicatedAllocation( + uint32_t memoryTypeIndex, + VkDeviceMemory hMemory, + VmaSuballocationType suballocationType, + void* pMappedData, + VkDeviceSize size) + { + VMA_ASSERT(m_Type == ALLOCATION_TYPE_NONE); + VMA_ASSERT(hMemory != VK_NULL_HANDLE); + m_Type = (uint8_t)ALLOCATION_TYPE_DEDICATED; + m_Alignment = 0; + m_Size = size; + m_MemoryTypeIndex = memoryTypeIndex; + m_SuballocationType = (uint8_t)suballocationType; + m_MapCount = (pMappedData != VMA_NULL) ? 
MAP_COUNT_FLAG_PERSISTENT_MAP : 0; + m_DedicatedAllocation.m_hMemory = hMemory; + m_DedicatedAllocation.m_pMappedData = pMappedData; + } + + ALLOCATION_TYPE GetType() const { return (ALLOCATION_TYPE)m_Type; } + VkDeviceSize GetAlignment() const { return m_Alignment; } + VkDeviceSize GetSize() const { return m_Size; } + bool IsUserDataString() const { return (m_Flags & FLAG_USER_DATA_STRING) != 0; } + void* GetUserData() const { return m_pUserData; } + void SetUserData(VmaAllocator hAllocator, void* pUserData); + VmaSuballocationType GetSuballocationType() const { return (VmaSuballocationType)m_SuballocationType; } + + VmaDeviceMemoryBlock* GetBlock() const + { + VMA_ASSERT(m_Type == ALLOCATION_TYPE_BLOCK); + return m_BlockAllocation.m_Block; + } + VkDeviceSize GetOffset() const; + VkDeviceMemory GetMemory() const; + uint32_t GetMemoryTypeIndex() const { return m_MemoryTypeIndex; } + bool IsPersistentMap() const { return (m_MapCount & MAP_COUNT_FLAG_PERSISTENT_MAP) != 0; } + void* GetMappedData() const; + bool CanBecomeLost() const; + + uint32_t GetLastUseFrameIndex() const + { + return m_LastUseFrameIndex.load(); + } + bool CompareExchangeLastUseFrameIndex(uint32_t& expected, uint32_t desired) + { + return m_LastUseFrameIndex.compare_exchange_weak(expected, desired); + } + /* + - If hAllocation.LastUseFrameIndex + frameInUseCount < allocator.CurrentFrameIndex, + makes it lost by setting LastUseFrameIndex = VMA_FRAME_INDEX_LOST and returns true. + - Else, returns false. + + If hAllocation is already lost, assert - you should not call it then. + If hAllocation was not created with CAN_BECOME_LOST_BIT, assert. 
+ */ + bool MakeLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); + + void DedicatedAllocCalcStatsInfo(VmaStatInfo& outInfo) + { + VMA_ASSERT(m_Type == ALLOCATION_TYPE_DEDICATED); + outInfo.blockCount = 1; + outInfo.allocationCount = 1; + outInfo.unusedRangeCount = 0; + outInfo.usedBytes = m_Size; + outInfo.unusedBytes = 0; + outInfo.allocationSizeMin = outInfo.allocationSizeMax = m_Size; + outInfo.unusedRangeSizeMin = UINT64_MAX; + outInfo.unusedRangeSizeMax = 0; + } + + void BlockAllocMap(); + void BlockAllocUnmap(); + VkResult DedicatedAllocMap(VmaAllocator hAllocator, void** ppData); + void DedicatedAllocUnmap(VmaAllocator hAllocator); + +#if VMA_STATS_STRING_ENABLED + uint32_t GetCreationFrameIndex() const { return m_CreationFrameIndex; } + uint32_t GetBufferImageUsage() const { return m_BufferImageUsage; } + + void InitBufferImageUsage(uint32_t bufferImageUsage) + { + VMA_ASSERT(m_BufferImageUsage == 0); + m_BufferImageUsage = bufferImageUsage; + } + + void PrintParameters(class VmaJsonWriter& json) const; +#endif + +private: + VkDeviceSize m_Alignment; + VkDeviceSize m_Size; + void* m_pUserData; + VMA_ATOMIC_UINT32 m_LastUseFrameIndex; + uint32_t m_MemoryTypeIndex; + uint8_t m_Type; // ALLOCATION_TYPE + uint8_t m_SuballocationType; // VmaSuballocationType + // Bit 0x80 is set when allocation was created with VMA_ALLOCATION_CREATE_MAPPED_BIT. + // Bits with mask 0x7F are reference counter for vmaMapMemory()/vmaUnmapMemory(). + uint8_t m_MapCount; + uint8_t m_Flags; // enum FLAGS + + // Allocation out of VmaDeviceMemoryBlock. + struct BlockAllocation + { + VmaDeviceMemoryBlock* m_Block; + VkDeviceSize m_Offset; + bool m_CanBecomeLost; + }; + + // Allocation for an object that has its own private VkDeviceMemory. + struct DedicatedAllocation + { + VkDeviceMemory m_hMemory; + void* m_pMappedData; // Not null means memory is mapped. + }; + + union + { + // Allocation out of VmaDeviceMemoryBlock. 
+ BlockAllocation m_BlockAllocation; + // Allocation for an object that has its own private VkDeviceMemory. + DedicatedAllocation m_DedicatedAllocation; + }; + +#if VMA_STATS_STRING_ENABLED + uint32_t m_CreationFrameIndex; + uint32_t m_BufferImageUsage; // 0 if unknown. +#endif + + void FreeUserDataString(VmaAllocator hAllocator); +}; + +/* +Represents a region of VmaDeviceMemoryBlock that is either assigned and returned as +allocated memory block or free. +*/ +struct VmaSuballocation +{ + VkDeviceSize offset; + VkDeviceSize size; + VmaAllocation hAllocation; + VmaSuballocationType type; +}; + +// Comparator for offsets. +struct VmaSuballocationOffsetLess +{ + bool operator()(const VmaSuballocation& lhs, const VmaSuballocation& rhs) const + { + return lhs.offset < rhs.offset; + } +}; +struct VmaSuballocationOffsetGreater +{ + bool operator()(const VmaSuballocation& lhs, const VmaSuballocation& rhs) const + { + return lhs.offset > rhs.offset; + } +}; + +typedef VmaList< VmaSuballocation, VmaStlAllocator > VmaSuballocationList; + +// Cost of one additional allocation lost, as equivalent in bytes. +static const VkDeviceSize VMA_LOST_ALLOCATION_COST = 1048576; + +enum class VmaAllocationRequestType +{ + Normal, + // Used by "Linear" algorithm. + UpperAddress, + EndOf1st, + EndOf2nd, +}; + +/* +Parameters of planned allocation inside a VmaDeviceMemoryBlock. + +If canMakeOtherLost was false: +- item points to a FREE suballocation. +- itemsToMakeLostCount is 0. + +If canMakeOtherLost was true: +- item points to first of sequence of suballocations, which are either FREE, + or point to VmaAllocations that can become lost. +- itemsToMakeLostCount is the number of VmaAllocations that need to be made lost for + the requested allocation to succeed. +*/ +struct VmaAllocationRequest +{ + VkDeviceSize offset; + VkDeviceSize sumFreeSize; // Sum size of free items that overlap with proposed allocation. 
+ VkDeviceSize sumItemSize; // Sum size of items to make lost that overlap with proposed allocation. + VmaSuballocationList::iterator item; + size_t itemsToMakeLostCount; + void* customData; + VmaAllocationRequestType type; + + VkDeviceSize CalcCost() const + { + return sumItemSize + itemsToMakeLostCount * VMA_LOST_ALLOCATION_COST; + } +}; + +/* +Data structure used for bookkeeping of allocations and unused ranges of memory +in a single VkDeviceMemory block. +*/ +class VmaBlockMetadata +{ +public: + VmaBlockMetadata(VmaAllocator hAllocator); + virtual ~VmaBlockMetadata() { } + virtual void Init(VkDeviceSize size) { m_Size = size; } + + // Validates all data structures inside this object. If not valid, returns false. + virtual bool Validate() const = 0; + VkDeviceSize GetSize() const { return m_Size; } + virtual size_t GetAllocationCount() const = 0; + virtual VkDeviceSize GetSumFreeSize() const = 0; + virtual VkDeviceSize GetUnusedRangeSizeMax() const = 0; + // Returns true if this block is empty - contains only single free suballocation. + virtual bool IsEmpty() const = 0; + + virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const = 0; + // Shouldn't modify blockCount. + virtual void AddPoolStats(VmaPoolStats& inoutStats) const = 0; + +#if VMA_STATS_STRING_ENABLED + virtual void PrintDetailedMap(class VmaJsonWriter& json) const = 0; +#endif + + // Tries to find a place for suballocation with given parameters inside this block. + // If succeeded, fills pAllocationRequest and returns true. + // If failed, returns false. + virtual bool CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool canMakeOtherLost, + // Always one of VMA_ALLOCATION_CREATE_STRATEGY_* or VMA_ALLOCATION_INTERNAL_STRATEGY_* flags. 
+ uint32_t strategy, + VmaAllocationRequest* pAllocationRequest) = 0; + + virtual bool MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest) = 0; + + virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) = 0; + + virtual VkResult CheckCorruption(const void* pBlockData) = 0; + + // Makes actual allocation based on request. Request must already be checked and valid. + virtual void Alloc( + const VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation) = 0; + + // Frees suballocation assigned to given memory region. + virtual void Free(const VmaAllocation allocation) = 0; + virtual void FreeAtOffset(VkDeviceSize offset) = 0; + +protected: + const VkAllocationCallbacks* GetAllocationCallbacks() const { return m_pAllocationCallbacks; } + +#if VMA_STATS_STRING_ENABLED + void PrintDetailedMap_Begin(class VmaJsonWriter& json, + VkDeviceSize unusedBytes, + size_t allocationCount, + size_t unusedRangeCount) const; + void PrintDetailedMap_Allocation(class VmaJsonWriter& json, + VkDeviceSize offset, + VmaAllocation hAllocation) const; + void PrintDetailedMap_UnusedRange(class VmaJsonWriter& json, + VkDeviceSize offset, + VkDeviceSize size) const; + void PrintDetailedMap_End(class VmaJsonWriter& json) const; +#endif + +private: + VkDeviceSize m_Size; + const VkAllocationCallbacks* m_pAllocationCallbacks; +}; + +#define VMA_VALIDATE(cond) do { if(!(cond)) { \ + VMA_ASSERT(0 && "Validation failed: " #cond); \ + return false; \ + } } while(false) + +class VmaBlockMetadata_Generic : public VmaBlockMetadata +{ + VMA_CLASS_NO_COPY(VmaBlockMetadata_Generic) +public: + VmaBlockMetadata_Generic(VmaAllocator hAllocator); + virtual ~VmaBlockMetadata_Generic(); + virtual void Init(VkDeviceSize size); + + virtual bool Validate() const; + virtual size_t GetAllocationCount() const { return m_Suballocations.size() - 
m_FreeCount; } + virtual VkDeviceSize GetSumFreeSize() const { return m_SumFreeSize; } + virtual VkDeviceSize GetUnusedRangeSizeMax() const; + virtual bool IsEmpty() const; + + virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const; + virtual void AddPoolStats(VmaPoolStats& inoutStats) const; + +#if VMA_STATS_STRING_ENABLED + virtual void PrintDetailedMap(class VmaJsonWriter& json) const; +#endif + + virtual bool CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest); + + virtual bool MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest); + + virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); + + virtual VkResult CheckCorruption(const void* pBlockData); + + virtual void Alloc( + const VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation); + + virtual void Free(const VmaAllocation allocation); + virtual void FreeAtOffset(VkDeviceSize offset); + + //////////////////////////////////////////////////////////////////////////////// + // For defragmentation + + bool IsBufferImageGranularityConflictPossible( + VkDeviceSize bufferImageGranularity, + VmaSuballocationType& inOutPrevSuballocType) const; + +private: + friend class VmaDefragmentationAlgorithm_Generic; + friend class VmaDefragmentationAlgorithm_Fast; + + uint32_t m_FreeCount; + VkDeviceSize m_SumFreeSize; + VmaSuballocationList m_Suballocations; + // Suballocations that are free and have size greater than certain threshold. + // Sorted by size, ascending. 
+ VmaVector< VmaSuballocationList::iterator, VmaStlAllocator< VmaSuballocationList::iterator > > m_FreeSuballocationsBySize; + + bool ValidateFreeSuballocationList() const; + + // Checks if requested suballocation with given parameters can be placed in given pFreeSuballocItem. + // If yes, fills pOffset and returns true. If no, returns false. + bool CheckAllocation( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + VmaSuballocationType allocType, + VmaSuballocationList::const_iterator suballocItem, + bool canMakeOtherLost, + VkDeviceSize* pOffset, + size_t* itemsToMakeLostCount, + VkDeviceSize* pSumFreeSize, + VkDeviceSize* pSumItemSize) const; + // Given free suballocation, it merges it with following one, which must also be free. + void MergeFreeWithNext(VmaSuballocationList::iterator item); + // Releases given suballocation, making it free. + // Merges it with adjacent free suballocations if applicable. + // Returns iterator to new free suballocation at this place. + VmaSuballocationList::iterator FreeSuballocation(VmaSuballocationList::iterator suballocItem); + // Given free suballocation, it inserts it into sorted list of + // m_FreeSuballocationsBySize if it's suitable. + void RegisterFreeSuballocation(VmaSuballocationList::iterator item); + // Given free suballocation, it removes it from sorted list of + // m_FreeSuballocationsBySize if it's suitable. + void UnregisterFreeSuballocation(VmaSuballocationList::iterator item); +}; + +/* +Allocations and their references in internal data structure look like this: + +if(m_2ndVectorMode == SECOND_VECTOR_EMPTY): + + 0 +-------+ + | | + | | + | | + +-------+ + | Alloc | 1st[m_1stNullItemsBeginCount] + +-------+ + | Alloc | 1st[m_1stNullItemsBeginCount + 1] + +-------+ + | ... 
| + +-------+ + | Alloc | 1st[1st.size() - 1] + +-------+ + | | + | | + | | +GetSize() +-------+ + +if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER): + + 0 +-------+ + | Alloc | 2nd[0] + +-------+ + | Alloc | 2nd[1] + +-------+ + | ... | + +-------+ + | Alloc | 2nd[2nd.size() - 1] + +-------+ + | | + | | + | | + +-------+ + | Alloc | 1st[m_1stNullItemsBeginCount] + +-------+ + | Alloc | 1st[m_1stNullItemsBeginCount + 1] + +-------+ + | ... | + +-------+ + | Alloc | 1st[1st.size() - 1] + +-------+ + | | +GetSize() +-------+ + +if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK): + + 0 +-------+ + | | + | | + | | + +-------+ + | Alloc | 1st[m_1stNullItemsBeginCount] + +-------+ + | Alloc | 1st[m_1stNullItemsBeginCount + 1] + +-------+ + | ... | + +-------+ + | Alloc | 1st[1st.size() - 1] + +-------+ + | | + | | + | | + +-------+ + | Alloc | 2nd[2nd.size() - 1] + +-------+ + | ... | + +-------+ + | Alloc | 2nd[1] + +-------+ + | Alloc | 2nd[0] +GetSize() +-------+ + +*/ +class VmaBlockMetadata_Linear : public VmaBlockMetadata +{ + VMA_CLASS_NO_COPY(VmaBlockMetadata_Linear) +public: + VmaBlockMetadata_Linear(VmaAllocator hAllocator); + virtual ~VmaBlockMetadata_Linear(); + virtual void Init(VkDeviceSize size); + + virtual bool Validate() const; + virtual size_t GetAllocationCount() const; + virtual VkDeviceSize GetSumFreeSize() const { return m_SumFreeSize; } + virtual VkDeviceSize GetUnusedRangeSizeMax() const; + virtual bool IsEmpty() const { return GetAllocationCount() == 0; } + + virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const; + virtual void AddPoolStats(VmaPoolStats& inoutStats) const; + +#if VMA_STATS_STRING_ENABLED + virtual void PrintDetailedMap(class VmaJsonWriter& json) const; +#endif + + virtual bool CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool 
canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest); + + virtual bool MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest); + + virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); + + virtual VkResult CheckCorruption(const void* pBlockData); + + virtual void Alloc( + const VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation); + + virtual void Free(const VmaAllocation allocation); + virtual void FreeAtOffset(VkDeviceSize offset); + +private: + /* + There are two suballocation vectors, used in ping-pong way. + The one with index m_1stVectorIndex is called 1st. + The one with index (m_1stVectorIndex ^ 1) is called 2nd. + 2nd can be non-empty only when 1st is not empty. + When 2nd is not empty, m_2ndVectorMode indicates its mode of operation. + */ + typedef VmaVector< VmaSuballocation, VmaStlAllocator > SuballocationVectorType; + + enum SECOND_VECTOR_MODE + { + SECOND_VECTOR_EMPTY, + /* + Suballocations in 2nd vector are created later than the ones in 1st, but they + all have smaller offset. + */ + SECOND_VECTOR_RING_BUFFER, + /* + Suballocations in 2nd vector are upper side of double stack. + They all have offsets higher than those in 1st vector. + Top of this stack means smaller offsets, but higher indices in this vector. + */ + SECOND_VECTOR_DOUBLE_STACK, + }; + + VkDeviceSize m_SumFreeSize; + SuballocationVectorType m_Suballocations0, m_Suballocations1; + uint32_t m_1stVectorIndex; + SECOND_VECTOR_MODE m_2ndVectorMode; + + SuballocationVectorType& AccessSuballocations1st() { return m_1stVectorIndex ? m_Suballocations1 : m_Suballocations0; } + SuballocationVectorType& AccessSuballocations2nd() { return m_1stVectorIndex ? m_Suballocations0 : m_Suballocations1; } + const SuballocationVectorType& AccessSuballocations1st() const { return m_1stVectorIndex ? 
m_Suballocations1 : m_Suballocations0; } + const SuballocationVectorType& AccessSuballocations2nd() const { return m_1stVectorIndex ? m_Suballocations0 : m_Suballocations1; } + + // Number of items in 1st vector with hAllocation = null at the beginning. + size_t m_1stNullItemsBeginCount; + // Number of other items in 1st vector with hAllocation = null somewhere in the middle. + size_t m_1stNullItemsMiddleCount; + // Number of items in 2nd vector with hAllocation = null. + size_t m_2ndNullItemsCount; + + bool ShouldCompact1st() const; + void CleanupAfterFree(); + + bool CreateAllocationRequest_LowerAddress( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest); + bool CreateAllocationRequest_UpperAddress( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest); +}; + +/* +- GetSize() is the original size of allocated memory block. +- m_UsableSize is this size aligned down to a power of two. + All allocations and calculations happen relative to m_UsableSize. +- GetUnusableSize() is the difference between them. + It is repoted as separate, unused range, not available for allocations. + +Node at level 0 has size = m_UsableSize. +Each next level contains nodes with size 2 times smaller than current level. +m_LevelCount is the maximum number of levels to use in the current object. 
+*/ +class VmaBlockMetadata_Buddy : public VmaBlockMetadata +{ + VMA_CLASS_NO_COPY(VmaBlockMetadata_Buddy) +public: + VmaBlockMetadata_Buddy(VmaAllocator hAllocator); + virtual ~VmaBlockMetadata_Buddy(); + virtual void Init(VkDeviceSize size); + + virtual bool Validate() const; + virtual size_t GetAllocationCount() const { return m_AllocationCount; } + virtual VkDeviceSize GetSumFreeSize() const { return m_SumFreeSize + GetUnusableSize(); } + virtual VkDeviceSize GetUnusedRangeSizeMax() const; + virtual bool IsEmpty() const { return m_Root->type == Node::TYPE_FREE; } + + virtual void CalcAllocationStatInfo(VmaStatInfo& outInfo) const; + virtual void AddPoolStats(VmaPoolStats& inoutStats) const; + +#if VMA_STATS_STRING_ENABLED + virtual void PrintDetailedMap(class VmaJsonWriter& json) const; +#endif + + virtual bool CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest); + + virtual bool MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest); + + virtual uint32_t MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount); + + virtual VkResult CheckCorruption(const void* pBlockData) { return VK_ERROR_FEATURE_NOT_PRESENT; } + + virtual void Alloc( + const VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation); + + virtual void Free(const VmaAllocation allocation) { FreeAtOffset(allocation, allocation->GetOffset()); } + virtual void FreeAtOffset(VkDeviceSize offset) { FreeAtOffset(VMA_NULL, offset); } + +private: + static const VkDeviceSize MIN_NODE_SIZE = 32; + static const size_t MAX_LEVELS = 30; + + struct ValidationContext + { + size_t 
calculatedAllocationCount; + size_t calculatedFreeCount; + VkDeviceSize calculatedSumFreeSize; + + ValidationContext() : + calculatedAllocationCount(0), + calculatedFreeCount(0), + calculatedSumFreeSize(0) { } + }; + + struct Node + { + VkDeviceSize offset; + enum TYPE + { + TYPE_FREE, + TYPE_ALLOCATION, + TYPE_SPLIT, + TYPE_COUNT + } type; + Node* parent; + Node* buddy; + + union + { + struct + { + Node* prev; + Node* next; + } free; + struct + { + VmaAllocation alloc; + } allocation; + struct + { + Node* leftChild; + } split; + }; + }; + + // Size of the memory block aligned down to a power of two. + VkDeviceSize m_UsableSize; + uint32_t m_LevelCount; + + Node* m_Root; + struct { + Node* front; + Node* back; + } m_FreeList[MAX_LEVELS]; + // Number of nodes in the tree with type == TYPE_ALLOCATION. + size_t m_AllocationCount; + // Number of nodes in the tree with type == TYPE_FREE. + size_t m_FreeCount; + // This includes space wasted due to internal fragmentation. Doesn't include unusable size. + VkDeviceSize m_SumFreeSize; + + VkDeviceSize GetUnusableSize() const { return GetSize() - m_UsableSize; } + void DeleteNode(Node* node); + bool ValidateNode(ValidationContext& ctx, const Node* parent, const Node* curr, uint32_t level, VkDeviceSize levelNodeSize) const; + uint32_t AllocSizeToLevel(VkDeviceSize allocSize) const; + inline VkDeviceSize LevelToNodeSize(uint32_t level) const { return m_UsableSize >> level; } + // Alloc passed just for validation. Can be null. + void FreeAtOffset(VmaAllocation alloc, VkDeviceSize offset); + void CalcAllocationStatInfoNode(VmaStatInfo& outInfo, const Node* node, VkDeviceSize levelNodeSize) const; + // Adds node to the front of FreeList at given level. + // node->type must be FREE. + // node->free.prev, next can be undefined. + void AddToFreeListFront(uint32_t level, Node* node); + // Removes node from FreeList at given level. + // node->type must be FREE. + // node->free.prev, next stay untouched. 
+ void RemoveFromFreeList(uint32_t level, Node* node); + +#if VMA_STATS_STRING_ENABLED + void PrintDetailedMapNode(class VmaJsonWriter& json, const Node* node, VkDeviceSize levelNodeSize) const; +#endif +}; + +/* +Represents a single block of device memory (`VkDeviceMemory`) with all the +data about its regions (aka suballocations, #VmaAllocation), assigned and free. + +Thread-safety: This class must be externally synchronized. +*/ +class VmaDeviceMemoryBlock +{ + VMA_CLASS_NO_COPY(VmaDeviceMemoryBlock) +public: + VmaBlockMetadata* m_pMetadata; + + VmaDeviceMemoryBlock(VmaAllocator hAllocator); + + ~VmaDeviceMemoryBlock() + { + VMA_ASSERT(m_MapCount == 0 && "VkDeviceMemory block is being destroyed while it is still mapped."); + VMA_ASSERT(m_hMemory == VK_NULL_HANDLE); + } + + // Always call after construction. + void Init( + VmaAllocator hAllocator, + VmaPool hParentPool, + uint32_t newMemoryTypeIndex, + VkDeviceMemory newMemory, + VkDeviceSize newSize, + uint32_t id, + uint32_t algorithm); + // Always call before destruction. + void Destroy(VmaAllocator allocator); + + VmaPool GetParentPool() const { return m_hParentPool; } + VkDeviceMemory GetDeviceMemory() const { return m_hMemory; } + uint32_t GetMemoryTypeIndex() const { return m_MemoryTypeIndex; } + uint32_t GetId() const { return m_Id; } + void* GetMappedData() const { return m_pMappedData; } + + // Validates all data structures inside this object. If not valid, returns false. + bool Validate() const; + + VkResult CheckCorruption(VmaAllocator hAllocator); + + // ppData can be null. 
+ VkResult Map(VmaAllocator hAllocator, uint32_t count, void** ppData); + void Unmap(VmaAllocator hAllocator, uint32_t count); + + VkResult WriteMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize); + VkResult ValidateMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize); + + VkResult BindBufferMemory( + const VmaAllocator hAllocator, + const VmaAllocation hAllocation, + VkDeviceSize allocationLocalOffset, + VkBuffer hBuffer, + const void* pNext); + VkResult BindImageMemory( + const VmaAllocator hAllocator, + const VmaAllocation hAllocation, + VkDeviceSize allocationLocalOffset, + VkImage hImage, + const void* pNext); + +private: + VmaPool m_hParentPool; // VK_NULL_HANDLE if not belongs to custom pool. + uint32_t m_MemoryTypeIndex; + uint32_t m_Id; + VkDeviceMemory m_hMemory; + + /* + Protects access to m_hMemory so it's not used by multiple threads simultaneously, e.g. vkMapMemory, vkBindBufferMemory. + Also protects m_MapCount, m_pMappedData. + Allocations, deallocations, any change in m_pMetadata is protected by parent's VmaBlockVector::m_Mutex. + */ + VMA_MUTEX m_Mutex; + uint32_t m_MapCount; + void* m_pMappedData; +}; + +struct VmaPointerLess +{ + bool operator()(const void* lhs, const void* rhs) const + { + return lhs < rhs; + } +}; + +struct VmaDefragmentationMove +{ + size_t srcBlockIndex; + size_t dstBlockIndex; + VkDeviceSize srcOffset; + VkDeviceSize dstOffset; + VkDeviceSize size; + VmaAllocation hAllocation; + VmaDeviceMemoryBlock* pSrcBlock; + VmaDeviceMemoryBlock* pDstBlock; +}; + +class VmaDefragmentationAlgorithm; + +/* +Sequence of VmaDeviceMemoryBlock. Represents memory blocks allocated for a specific +Vulkan memory type. + +Synchronized internally with a mutex. 
+*/ +struct VmaBlockVector +{ + VMA_CLASS_NO_COPY(VmaBlockVector) +public: + VmaBlockVector( + VmaAllocator hAllocator, + VmaPool hParentPool, + uint32_t memoryTypeIndex, + VkDeviceSize preferredBlockSize, + size_t minBlockCount, + size_t maxBlockCount, + VkDeviceSize bufferImageGranularity, + uint32_t frameInUseCount, + bool explicitBlockSize, + uint32_t algorithm, + float priority); + ~VmaBlockVector(); + + VkResult CreateMinBlocks(); + + VmaAllocator GetAllocator() const { return m_hAllocator; } + VmaPool GetParentPool() const { return m_hParentPool; } + bool IsCustomPool() const { return m_hParentPool != VMA_NULL; } + uint32_t GetMemoryTypeIndex() const { return m_MemoryTypeIndex; } + VkDeviceSize GetPreferredBlockSize() const { return m_PreferredBlockSize; } + VkDeviceSize GetBufferImageGranularity() const { return m_BufferImageGranularity; } + uint32_t GetFrameInUseCount() const { return m_FrameInUseCount; } + uint32_t GetAlgorithm() const { return m_Algorithm; } + + void GetPoolStats(VmaPoolStats* pStats); + + bool IsEmpty(); + bool IsCorruptionDetectionEnabled() const; + + VkResult Allocate( + uint32_t currentFrameIndex, + VkDeviceSize size, + VkDeviceSize alignment, + const VmaAllocationCreateInfo& createInfo, + VmaSuballocationType suballocType, + size_t allocationCount, + VmaAllocation* pAllocations); + + void Free(const VmaAllocation hAllocation); + + // Adds statistics of this BlockVector to pStats. + void AddStats(VmaStats* pStats); + +#if VMA_STATS_STRING_ENABLED + void PrintDetailedMap(class VmaJsonWriter& json); +#endif + + void MakePoolAllocationsLost( + uint32_t currentFrameIndex, + size_t* pLostAllocationCount); + VkResult CheckCorruption(); + + // Saves results in pCtx->res. 
+ void Defragment( + class VmaBlockVectorDefragmentationContext* pCtx, + VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags, + VkDeviceSize& maxCpuBytesToMove, uint32_t& maxCpuAllocationsToMove, + VkDeviceSize& maxGpuBytesToMove, uint32_t& maxGpuAllocationsToMove, + VkCommandBuffer commandBuffer); + void DefragmentationEnd( + class VmaBlockVectorDefragmentationContext* pCtx, + uint32_t flags, + VmaDefragmentationStats* pStats); + + uint32_t ProcessDefragmentations( + class VmaBlockVectorDefragmentationContext *pCtx, + VmaDefragmentationPassMoveInfo* pMove, uint32_t maxMoves); + + void CommitDefragmentations( + class VmaBlockVectorDefragmentationContext *pCtx, + VmaDefragmentationStats* pStats); + + //////////////////////////////////////////////////////////////////////////////// + // To be used only while the m_Mutex is locked. Used during defragmentation. + + size_t GetBlockCount() const { return m_Blocks.size(); } + VmaDeviceMemoryBlock* GetBlock(size_t index) const { return m_Blocks[index]; } + size_t CalcAllocationCount() const; + bool IsBufferImageGranularityConflictPossible() const; + +private: + friend class VmaDefragmentationAlgorithm_Generic; + + const VmaAllocator m_hAllocator; + const VmaPool m_hParentPool; + const uint32_t m_MemoryTypeIndex; + const VkDeviceSize m_PreferredBlockSize; + const size_t m_MinBlockCount; + const size_t m_MaxBlockCount; + const VkDeviceSize m_BufferImageGranularity; + const uint32_t m_FrameInUseCount; + const bool m_ExplicitBlockSize; + const uint32_t m_Algorithm; + const float m_Priority; + VMA_RW_MUTEX m_Mutex; + + /* There can be at most one allocation that is completely empty (except when minBlockCount > 0) - + a hysteresis to avoid pessimistic case of alternating creation and destruction of a VkDeviceMemory. */ + bool m_HasEmptyBlock; + // Incrementally sorted by sumFreeSize, ascending. 
+ VmaVector< VmaDeviceMemoryBlock*, VmaStlAllocator > m_Blocks; + uint32_t m_NextBlockId; + + VkDeviceSize CalcMaxBlockSize() const; + + // Finds and removes given block from vector. + void Remove(VmaDeviceMemoryBlock* pBlock); + + // Performs single step in sorting m_Blocks. They may not be fully sorted + // after this call. + void IncrementallySortBlocks(); + + VkResult AllocatePage( + uint32_t currentFrameIndex, + VkDeviceSize size, + VkDeviceSize alignment, + const VmaAllocationCreateInfo& createInfo, + VmaSuballocationType suballocType, + VmaAllocation* pAllocation); + + // To be used only without CAN_MAKE_OTHER_LOST flag. + VkResult AllocateFromBlock( + VmaDeviceMemoryBlock* pBlock, + uint32_t currentFrameIndex, + VkDeviceSize size, + VkDeviceSize alignment, + VmaAllocationCreateFlags allocFlags, + void* pUserData, + VmaSuballocationType suballocType, + uint32_t strategy, + VmaAllocation* pAllocation); + + VkResult CreateBlock(VkDeviceSize blockSize, size_t* pNewBlockIndex); + + // Saves result to pCtx->res. + void ApplyDefragmentationMovesCpu( + class VmaBlockVectorDefragmentationContext* pDefragCtx, + const VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves); + // Saves result to pCtx->res. + void ApplyDefragmentationMovesGpu( + class VmaBlockVectorDefragmentationContext* pDefragCtx, + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkCommandBuffer commandBuffer); + + /* + Used during defragmentation. pDefragmentationStats is optional. It's in/out + - updated with new data. 
+ */ + void FreeEmptyBlocks(VmaDefragmentationStats* pDefragmentationStats); + + void UpdateHasEmptyBlock(); +}; + +struct VmaPool_T +{ + VMA_CLASS_NO_COPY(VmaPool_T) +public: + VmaBlockVector m_BlockVector; + + VmaPool_T( + VmaAllocator hAllocator, + const VmaPoolCreateInfo& createInfo, + VkDeviceSize preferredBlockSize); + ~VmaPool_T(); + + uint32_t GetId() const { return m_Id; } + void SetId(uint32_t id) { VMA_ASSERT(m_Id == 0); m_Id = id; } + + const char* GetName() const { return m_Name; } + void SetName(const char* pName); + +#if VMA_STATS_STRING_ENABLED + //void PrintDetailedMap(class VmaStringBuilder& sb); +#endif + +private: + uint32_t m_Id; + char* m_Name; +}; + +/* +Performs defragmentation: + +- Updates `pBlockVector->m_pMetadata`. +- Updates allocations by calling ChangeBlockAllocation() or ChangeOffset(). +- Does not move actual data, only returns requested moves as `moves`. +*/ +class VmaDefragmentationAlgorithm +{ + VMA_CLASS_NO_COPY(VmaDefragmentationAlgorithm) +public: + VmaDefragmentationAlgorithm( + VmaAllocator hAllocator, + VmaBlockVector* pBlockVector, + uint32_t currentFrameIndex) : + m_hAllocator(hAllocator), + m_pBlockVector(pBlockVector), + m_CurrentFrameIndex(currentFrameIndex) + { + } + virtual ~VmaDefragmentationAlgorithm() + { + } + + virtual void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) = 0; + virtual void AddAll() = 0; + + virtual VkResult Defragment( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + VmaDefragmentationFlags flags) = 0; + + virtual VkDeviceSize GetBytesMoved() const = 0; + virtual uint32_t GetAllocationsMoved() const = 0; + +protected: + VmaAllocator const m_hAllocator; + VmaBlockVector* const m_pBlockVector; + const uint32_t m_CurrentFrameIndex; + + struct AllocationInfo + { + VmaAllocation m_hAllocation; + VkBool32* m_pChanged; + + AllocationInfo() : + m_hAllocation(VK_NULL_HANDLE), + m_pChanged(VMA_NULL) + { + } + 
AllocationInfo(VmaAllocation hAlloc, VkBool32* pChanged) : + m_hAllocation(hAlloc), + m_pChanged(pChanged) + { + } + }; +}; + +class VmaDefragmentationAlgorithm_Generic : public VmaDefragmentationAlgorithm +{ + VMA_CLASS_NO_COPY(VmaDefragmentationAlgorithm_Generic) +public: + VmaDefragmentationAlgorithm_Generic( + VmaAllocator hAllocator, + VmaBlockVector* pBlockVector, + uint32_t currentFrameIndex, + bool overlappingMoveSupported); + virtual ~VmaDefragmentationAlgorithm_Generic(); + + virtual void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged); + virtual void AddAll() { m_AllAllocations = true; } + + virtual VkResult Defragment( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + VmaDefragmentationFlags flags); + + virtual VkDeviceSize GetBytesMoved() const { return m_BytesMoved; } + virtual uint32_t GetAllocationsMoved() const { return m_AllocationsMoved; } + +private: + uint32_t m_AllocationCount; + bool m_AllAllocations; + + VkDeviceSize m_BytesMoved; + uint32_t m_AllocationsMoved; + + struct AllocationInfoSizeGreater + { + bool operator()(const AllocationInfo& lhs, const AllocationInfo& rhs) const + { + return lhs.m_hAllocation->GetSize() > rhs.m_hAllocation->GetSize(); + } + }; + + struct AllocationInfoOffsetGreater + { + bool operator()(const AllocationInfo& lhs, const AllocationInfo& rhs) const + { + return lhs.m_hAllocation->GetOffset() > rhs.m_hAllocation->GetOffset(); + } + }; + + struct BlockInfo + { + size_t m_OriginalBlockIndex; + VmaDeviceMemoryBlock* m_pBlock; + bool m_HasNonMovableAllocations; + VmaVector< AllocationInfo, VmaStlAllocator > m_Allocations; + + BlockInfo(const VkAllocationCallbacks* pAllocationCallbacks) : + m_OriginalBlockIndex(SIZE_MAX), + m_pBlock(VMA_NULL), + m_HasNonMovableAllocations(true), + m_Allocations(pAllocationCallbacks) + { + } + + void CalcHasNonMovableAllocations() + { + const size_t blockAllocCount = 
m_pBlock->m_pMetadata->GetAllocationCount(); + const size_t defragmentAllocCount = m_Allocations.size(); + m_HasNonMovableAllocations = blockAllocCount != defragmentAllocCount; + } + + void SortAllocationsBySizeDescending() + { + VMA_SORT(m_Allocations.begin(), m_Allocations.end(), AllocationInfoSizeGreater()); + } + + void SortAllocationsByOffsetDescending() + { + VMA_SORT(m_Allocations.begin(), m_Allocations.end(), AllocationInfoOffsetGreater()); + } + }; + + struct BlockPointerLess + { + bool operator()(const BlockInfo* pLhsBlockInfo, const VmaDeviceMemoryBlock* pRhsBlock) const + { + return pLhsBlockInfo->m_pBlock < pRhsBlock; + } + bool operator()(const BlockInfo* pLhsBlockInfo, const BlockInfo* pRhsBlockInfo) const + { + return pLhsBlockInfo->m_pBlock < pRhsBlockInfo->m_pBlock; + } + }; + + // 1. Blocks with some non-movable allocations go first. + // 2. Blocks with smaller sumFreeSize go first. + struct BlockInfoCompareMoveDestination + { + bool operator()(const BlockInfo* pLhsBlockInfo, const BlockInfo* pRhsBlockInfo) const + { + if(pLhsBlockInfo->m_HasNonMovableAllocations && !pRhsBlockInfo->m_HasNonMovableAllocations) + { + return true; + } + if(!pLhsBlockInfo->m_HasNonMovableAllocations && pRhsBlockInfo->m_HasNonMovableAllocations) + { + return false; + } + if(pLhsBlockInfo->m_pBlock->m_pMetadata->GetSumFreeSize() < pRhsBlockInfo->m_pBlock->m_pMetadata->GetSumFreeSize()) + { + return true; + } + return false; + } + }; + + typedef VmaVector< BlockInfo*, VmaStlAllocator > BlockInfoVector; + BlockInfoVector m_Blocks; + + VkResult DefragmentRound( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + bool freeOldAllocations); + + size_t CalcBlocksWithNonMovableCount() const; + + static bool MoveMakesSense( + size_t dstBlockIndex, VkDeviceSize dstOffset, + size_t srcBlockIndex, VkDeviceSize srcOffset); +}; + +class VmaDefragmentationAlgorithm_Fast : public VmaDefragmentationAlgorithm 
+{ + VMA_CLASS_NO_COPY(VmaDefragmentationAlgorithm_Fast) +public: + VmaDefragmentationAlgorithm_Fast( + VmaAllocator hAllocator, + VmaBlockVector* pBlockVector, + uint32_t currentFrameIndex, + bool overlappingMoveSupported); + virtual ~VmaDefragmentationAlgorithm_Fast(); + + virtual void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) { ++m_AllocationCount; } + virtual void AddAll() { m_AllAllocations = true; } + + virtual VkResult Defragment( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + VmaDefragmentationFlags flags); + + virtual VkDeviceSize GetBytesMoved() const { return m_BytesMoved; } + virtual uint32_t GetAllocationsMoved() const { return m_AllocationsMoved; } + +private: + struct BlockInfo + { + size_t origBlockIndex; + }; + + class FreeSpaceDatabase + { + public: + FreeSpaceDatabase() + { + FreeSpace s = {}; + s.blockInfoIndex = SIZE_MAX; + for (const auto i : c10::irange(MAX_COUNT)) { + m_FreeSpaces[i] = s; + } + } + + void Register(size_t blockInfoIndex, VkDeviceSize offset, VkDeviceSize size) + { + if(size < VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + return; + } + + // Find first invalid or the smallest structure. + size_t bestIndex = SIZE_MAX; + for (const auto i : c10::irange(MAX_COUNT)) { + // Empty structure. 
+ if(m_FreeSpaces[i].blockInfoIndex == SIZE_MAX) + { + bestIndex = i; + break; + } + if(m_FreeSpaces[i].size < size && + (bestIndex == SIZE_MAX || m_FreeSpaces[bestIndex].size > m_FreeSpaces[i].size)) + { + bestIndex = i; + } + } + + if(bestIndex != SIZE_MAX) + { + m_FreeSpaces[bestIndex].blockInfoIndex = blockInfoIndex; + m_FreeSpaces[bestIndex].offset = offset; + m_FreeSpaces[bestIndex].size = size; + } + } + + bool Fetch(VkDeviceSize alignment, VkDeviceSize size, + size_t& outBlockInfoIndex, VkDeviceSize& outDstOffset) + { + size_t bestIndex = SIZE_MAX; + VkDeviceSize bestFreeSpaceAfter = 0; + for (const auto i : c10::irange(MAX_COUNT)) { + // Structure is valid. + if(m_FreeSpaces[i].blockInfoIndex != SIZE_MAX) + { + const VkDeviceSize dstOffset = VmaAlignUp(m_FreeSpaces[i].offset, alignment); + // Allocation fits into this structure. + if(dstOffset + size <= m_FreeSpaces[i].offset + m_FreeSpaces[i].size) + { + const VkDeviceSize freeSpaceAfter = (m_FreeSpaces[i].offset + m_FreeSpaces[i].size) - + (dstOffset + size); + if(bestIndex == SIZE_MAX || freeSpaceAfter > bestFreeSpaceAfter) + { + bestIndex = i; + bestFreeSpaceAfter = freeSpaceAfter; + } + } + } + } + + if(bestIndex != SIZE_MAX) + { + outBlockInfoIndex = m_FreeSpaces[bestIndex].blockInfoIndex; + outDstOffset = VmaAlignUp(m_FreeSpaces[bestIndex].offset, alignment); + + if(bestFreeSpaceAfter >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + // Leave this structure for remaining empty space. + const VkDeviceSize alignmentPlusSize = (outDstOffset - m_FreeSpaces[bestIndex].offset) + size; + m_FreeSpaces[bestIndex].offset += alignmentPlusSize; + m_FreeSpaces[bestIndex].size -= alignmentPlusSize; + } + else + { + // This structure becomes invalid. + m_FreeSpaces[bestIndex].blockInfoIndex = SIZE_MAX; + } + + return true; + } + + return false; + } + + private: + static const size_t MAX_COUNT = 4; + + struct FreeSpace + { + size_t blockInfoIndex; // SIZE_MAX means this structure is invalid. 
+ VkDeviceSize offset; + VkDeviceSize size; + } m_FreeSpaces[MAX_COUNT]; + }; + + const bool m_OverlappingMoveSupported; + + uint32_t m_AllocationCount; + bool m_AllAllocations; + + VkDeviceSize m_BytesMoved; + uint32_t m_AllocationsMoved; + + VmaVector< BlockInfo, VmaStlAllocator > m_BlockInfos; + + void PreprocessMetadata(); + void PostprocessMetadata(); + void InsertSuballoc(VmaBlockMetadata_Generic* pMetadata, const VmaSuballocation& suballoc); +}; + +struct VmaBlockDefragmentationContext +{ + enum BLOCK_FLAG + { + BLOCK_FLAG_USED = 0x00000001, + }; + uint32_t flags; + VkBuffer hBuffer; +}; + +class VmaBlockVectorDefragmentationContext +{ + VMA_CLASS_NO_COPY(VmaBlockVectorDefragmentationContext) +public: + VkResult res; + bool mutexLocked; + VmaVector< VmaBlockDefragmentationContext, VmaStlAllocator > blockContexts; + VmaVector< VmaDefragmentationMove, VmaStlAllocator > defragmentationMoves; + uint32_t defragmentationMovesProcessed; + uint32_t defragmentationMovesCommitted; + bool hasDefragmentationPlan; + + VmaBlockVectorDefragmentationContext( + VmaAllocator hAllocator, + VmaPool hCustomPool, // Optional. + VmaBlockVector* pBlockVector, + uint32_t currFrameIndex); + ~VmaBlockVectorDefragmentationContext(); + + VmaPool GetCustomPool() const { return m_hCustomPool; } + VmaBlockVector* GetBlockVector() const { return m_pBlockVector; } + VmaDefragmentationAlgorithm* GetAlgorithm() const { return m_pAlgorithm; } + + void AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged); + void AddAll() { m_AllAllocations = true; } + + void Begin(bool overlappingMoveSupported, VmaDefragmentationFlags flags); + +private: + const VmaAllocator m_hAllocator; + // Null if not from custom pool. + const VmaPool m_hCustomPool; + // Redundant, for convenience not to fetch from m_hCustomPool->m_BlockVector or m_hAllocator->m_pBlockVectors. + VmaBlockVector* const m_pBlockVector; + const uint32_t m_CurrFrameIndex; + // Owner of this object. 
+ VmaDefragmentationAlgorithm* m_pAlgorithm; + + struct AllocInfo + { + VmaAllocation hAlloc; + VkBool32* pChanged; + }; + // Used between constructor and Begin. + VmaVector< AllocInfo, VmaStlAllocator > m_Allocations; + bool m_AllAllocations; +}; + +struct VmaDefragmentationContext_T +{ +private: + VMA_CLASS_NO_COPY(VmaDefragmentationContext_T) +public: + VmaDefragmentationContext_T( + VmaAllocator hAllocator, + uint32_t currFrameIndex, + uint32_t flags, + VmaDefragmentationStats* pStats); + ~VmaDefragmentationContext_T(); + + void AddPools(uint32_t poolCount, const VmaPool* pPools); + void AddAllocations( + uint32_t allocationCount, + const VmaAllocation* pAllocations, + VkBool32* pAllocationsChanged); + + /* + Returns: + - `VK_SUCCESS` if succeeded and object can be destroyed immediately. + - `VK_NOT_READY` if succeeded but the object must remain alive until vmaDefragmentationEnd(). + - Negative value if error occured and object can be destroyed immediately. + */ + VkResult Defragment( + VkDeviceSize maxCpuBytesToMove, uint32_t maxCpuAllocationsToMove, + VkDeviceSize maxGpuBytesToMove, uint32_t maxGpuAllocationsToMove, + VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags); + + VkResult DefragmentPassBegin(VmaDefragmentationPassInfo* pInfo); + VkResult DefragmentPassEnd(); + +private: + const VmaAllocator m_hAllocator; + const uint32_t m_CurrFrameIndex; + const uint32_t m_Flags; + VmaDefragmentationStats* const m_pStats; + + VkDeviceSize m_MaxCpuBytesToMove; + uint32_t m_MaxCpuAllocationsToMove; + VkDeviceSize m_MaxGpuBytesToMove; + uint32_t m_MaxGpuAllocationsToMove; + + // Owner of these objects. + VmaBlockVectorDefragmentationContext* m_DefaultPoolContexts[VK_MAX_MEMORY_TYPES]; + // Owner of these objects. 
+ VmaVector< VmaBlockVectorDefragmentationContext*, VmaStlAllocator > m_CustomPoolContexts; +}; + +#if VMA_RECORDING_ENABLED + +class VmaRecorder +{ +public: + VmaRecorder(); + VkResult Init(const VmaRecordSettings& settings, bool useMutex); + void WriteConfiguration( + const VkPhysicalDeviceProperties& devProps, + const VkPhysicalDeviceMemoryProperties& memProps, + uint32_t vulkanApiVersion, + bool dedicatedAllocationExtensionEnabled, + bool bindMemory2ExtensionEnabled, + bool memoryBudgetExtensionEnabled, + bool deviceCoherentMemoryExtensionEnabled); + ~VmaRecorder(); + + void RecordCreateAllocator(uint32_t frameIndex); + void RecordDestroyAllocator(uint32_t frameIndex); + void RecordCreatePool(uint32_t frameIndex, + const VmaPoolCreateInfo& createInfo, + VmaPool pool); + void RecordDestroyPool(uint32_t frameIndex, VmaPool pool); + void RecordAllocateMemory(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + const VmaAllocationCreateInfo& createInfo, + VmaAllocation allocation); + void RecordAllocateMemoryPages(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + const VmaAllocationCreateInfo& createInfo, + uint64_t allocationCount, + const VmaAllocation* pAllocations); + void RecordAllocateMemoryForBuffer(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + bool requiresDedicatedAllocation, + bool prefersDedicatedAllocation, + const VmaAllocationCreateInfo& createInfo, + VmaAllocation allocation); + void RecordAllocateMemoryForImage(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + bool requiresDedicatedAllocation, + bool prefersDedicatedAllocation, + const VmaAllocationCreateInfo& createInfo, + VmaAllocation allocation); + void RecordFreeMemory(uint32_t frameIndex, + VmaAllocation allocation); + void RecordFreeMemoryPages(uint32_t frameIndex, + uint64_t allocationCount, + const VmaAllocation* pAllocations); + void RecordSetAllocationUserData(uint32_t frameIndex, + VmaAllocation allocation, + const void* pUserData); + 
void RecordCreateLostAllocation(uint32_t frameIndex, + VmaAllocation allocation); + void RecordMapMemory(uint32_t frameIndex, + VmaAllocation allocation); + void RecordUnmapMemory(uint32_t frameIndex, + VmaAllocation allocation); + void RecordFlushAllocation(uint32_t frameIndex, + VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size); + void RecordInvalidateAllocation(uint32_t frameIndex, + VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size); + void RecordCreateBuffer(uint32_t frameIndex, + const VkBufferCreateInfo& bufCreateInfo, + const VmaAllocationCreateInfo& allocCreateInfo, + VmaAllocation allocation); + void RecordCreateImage(uint32_t frameIndex, + const VkImageCreateInfo& imageCreateInfo, + const VmaAllocationCreateInfo& allocCreateInfo, + VmaAllocation allocation); + void RecordDestroyBuffer(uint32_t frameIndex, + VmaAllocation allocation); + void RecordDestroyImage(uint32_t frameIndex, + VmaAllocation allocation); + void RecordTouchAllocation(uint32_t frameIndex, + VmaAllocation allocation); + void RecordGetAllocationInfo(uint32_t frameIndex, + VmaAllocation allocation); + void RecordMakePoolAllocationsLost(uint32_t frameIndex, + VmaPool pool); + void RecordDefragmentationBegin(uint32_t frameIndex, + const VmaDefragmentationInfo2& info, + VmaDefragmentationContext ctx); + void RecordDefragmentationEnd(uint32_t frameIndex, + VmaDefragmentationContext ctx); + void RecordSetPoolName(uint32_t frameIndex, + VmaPool pool, + const char* name); + +private: + struct CallParams + { + uint32_t threadId; + double time; + }; + + class UserDataString + { + public: + UserDataString(VmaAllocationCreateFlags allocFlags, const void* pUserData); + const char* GetString() const { return m_Str; } + + private: + char m_PtrStr[17]; + const char* m_Str; + }; + + bool m_UseMutex; + VmaRecordFlags m_Flags; + FILE* m_File; + VMA_MUTEX m_FileMutex; + std::chrono::time_point m_RecordingStartTime; + + void GetBasicParams(CallParams& outParams); + + // T must 
be a pointer type, e.g. VmaAllocation, VmaPool. + template + void PrintPointerList(uint64_t count, const T* pItems) + { + if(count) + { + fprintf(m_File, "%p", pItems[0]); + for(uint64_t i = 1; i < count; ++i) + { + fprintf(m_File, " %p", pItems[i]); + } + } + } + + void PrintPointerList(uint64_t count, const VmaAllocation* pItems); + void Flush(); +}; + +#endif // #if VMA_RECORDING_ENABLED + +/* +Thread-safe wrapper over VmaPoolAllocator free list, for allocation of VmaAllocation_T objects. +*/ +class VmaAllocationObjectAllocator +{ + VMA_CLASS_NO_COPY(VmaAllocationObjectAllocator) +public: + VmaAllocationObjectAllocator(const VkAllocationCallbacks* pAllocationCallbacks); + + template VmaAllocation Allocate(Types... args); + void Free(VmaAllocation hAlloc); + +private: + VMA_MUTEX m_Mutex; + VmaPoolAllocator m_Allocator; +}; + +struct VmaCurrentBudgetData +{ + VMA_ATOMIC_UINT64 m_BlockBytes[VK_MAX_MEMORY_HEAPS]; + VMA_ATOMIC_UINT64 m_AllocationBytes[VK_MAX_MEMORY_HEAPS]; + +#if VMA_MEMORY_BUDGET + VMA_ATOMIC_UINT32 m_OperationsSinceBudgetFetch; + VMA_RW_MUTEX m_BudgetMutex; + uint64_t m_VulkanUsage[VK_MAX_MEMORY_HEAPS]; + uint64_t m_VulkanBudget[VK_MAX_MEMORY_HEAPS]; + uint64_t m_BlockBytesAtBudgetFetch[VK_MAX_MEMORY_HEAPS]; +#endif // #if VMA_MEMORY_BUDGET + + VmaCurrentBudgetData() + { + for (const auto heapIndex : c10::irange(VK_MAX_MEMORY_HEAPS)) { + m_BlockBytes[heapIndex] = 0; + m_AllocationBytes[heapIndex] = 0; +#if VMA_MEMORY_BUDGET + m_VulkanUsage[heapIndex] = 0; + m_VulkanBudget[heapIndex] = 0; + m_BlockBytesAtBudgetFetch[heapIndex] = 0; +#endif + } + +#if VMA_MEMORY_BUDGET + m_OperationsSinceBudgetFetch = 0; +#endif + } + + void AddAllocation(uint32_t heapIndex, VkDeviceSize allocationSize) + { + m_AllocationBytes[heapIndex] += allocationSize; +#if VMA_MEMORY_BUDGET + ++m_OperationsSinceBudgetFetch; +#endif + } + + void RemoveAllocation(uint32_t heapIndex, VkDeviceSize allocationSize) + { + VMA_ASSERT(m_AllocationBytes[heapIndex] >= allocationSize); // 
DELME + m_AllocationBytes[heapIndex] -= allocationSize; +#if VMA_MEMORY_BUDGET + ++m_OperationsSinceBudgetFetch; +#endif + } +}; + +// Main allocator object. +struct VmaAllocator_T +{ + VMA_CLASS_NO_COPY(VmaAllocator_T) +public: + bool m_UseMutex; + uint32_t m_VulkanApiVersion; + bool m_UseKhrDedicatedAllocation; // Can be set only if m_VulkanApiVersion < VK_MAKE_VERSION(1, 1, 0). + bool m_UseKhrBindMemory2; // Can be set only if m_VulkanApiVersion < VK_MAKE_VERSION(1, 1, 0). + bool m_UseExtMemoryBudget; + bool m_UseAmdDeviceCoherentMemory; + bool m_UseKhrBufferDeviceAddress; + bool m_UseExtMemoryPriority; + VkDevice m_hDevice; + VkInstance m_hInstance; + bool m_AllocationCallbacksSpecified; + VkAllocationCallbacks m_AllocationCallbacks; + VmaDeviceMemoryCallbacks m_DeviceMemoryCallbacks; + VmaAllocationObjectAllocator m_AllocationObjectAllocator; + + // Each bit (1 << i) is set if HeapSizeLimit is enabled for that heap, so cannot allocate more than the heap size. + uint32_t m_HeapSizeLimitMask; + + VkPhysicalDeviceProperties m_PhysicalDeviceProperties; + VkPhysicalDeviceMemoryProperties m_MemProps; + + // Default pools. + VmaBlockVector* m_pBlockVectors[VK_MAX_MEMORY_TYPES]; + + // Each vector is sorted by memory (handle value). + typedef VmaVector< VmaAllocation, VmaStlAllocator > AllocationVectorType; + AllocationVectorType* m_pDedicatedAllocations[VK_MAX_MEMORY_TYPES]; + VMA_RW_MUTEX m_DedicatedAllocationsMutex[VK_MAX_MEMORY_TYPES]; + + VmaCurrentBudgetData m_Budget; + VMA_ATOMIC_UINT32 m_DeviceMemoryCount; // Total number of VkDeviceMemory objects. + + VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo); + VkResult Init(const VmaAllocatorCreateInfo* pCreateInfo); + ~VmaAllocator_T(); + + const VkAllocationCallbacks* GetAllocationCallbacks() const + { + return m_AllocationCallbacksSpecified ? 
&m_AllocationCallbacks : 0; + } + const VmaVulkanFunctions& GetVulkanFunctions() const + { + return m_VulkanFunctions; + } + + VkPhysicalDevice GetPhysicalDevice() const { return m_PhysicalDevice; } + + VkDeviceSize GetBufferImageGranularity() const + { + return VMA_MAX( + static_cast(VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY), + m_PhysicalDeviceProperties.limits.bufferImageGranularity); + } + + uint32_t GetMemoryHeapCount() const { return m_MemProps.memoryHeapCount; } + uint32_t GetMemoryTypeCount() const { return m_MemProps.memoryTypeCount; } + + uint32_t MemoryTypeIndexToHeapIndex(uint32_t memTypeIndex) const + { + VMA_ASSERT(memTypeIndex < m_MemProps.memoryTypeCount); + return m_MemProps.memoryTypes[memTypeIndex].heapIndex; + } + // True when specific memory type is HOST_VISIBLE but not HOST_COHERENT. + bool IsMemoryTypeNonCoherent(uint32_t memTypeIndex) const + { + return (m_MemProps.memoryTypes[memTypeIndex].propertyFlags & (VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) == + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + } + // Minimum alignment for all allocations in specific memory type. + VkDeviceSize GetMemoryTypeMinAlignment(uint32_t memTypeIndex) const + { + return IsMemoryTypeNonCoherent(memTypeIndex) ? 
+ VMA_MAX((VkDeviceSize)VMA_DEBUG_ALIGNMENT, m_PhysicalDeviceProperties.limits.nonCoherentAtomSize) : + (VkDeviceSize)VMA_DEBUG_ALIGNMENT; + } + + bool IsIntegratedGpu() const + { + return m_PhysicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU; + } + + uint32_t GetGlobalMemoryTypeBits() const { return m_GlobalMemoryTypeBits; } + +#if VMA_RECORDING_ENABLED + VmaRecorder* GetRecorder() const { return m_pRecorder; } +#endif + + void GetBufferMemoryRequirements( + VkBuffer hBuffer, + VkMemoryRequirements& memReq, + bool& requiresDedicatedAllocation, + bool& prefersDedicatedAllocation) const; + void GetImageMemoryRequirements( + VkImage hImage, + VkMemoryRequirements& memReq, + bool& requiresDedicatedAllocation, + bool& prefersDedicatedAllocation) const; + + // Main allocation function. + VkResult AllocateMemory( + const VkMemoryRequirements& vkMemReq, + bool requiresDedicatedAllocation, + bool prefersDedicatedAllocation, + VkBuffer dedicatedBuffer, + VkBufferUsageFlags dedicatedBufferUsage, // UINT32_MAX when unknown. + VkImage dedicatedImage, + const VmaAllocationCreateInfo& createInfo, + VmaSuballocationType suballocType, + size_t allocationCount, + VmaAllocation* pAllocations); + + // Main deallocation function. 
+ void FreeMemory( + size_t allocationCount, + const VmaAllocation* pAllocations); + + void CalculateStats(VmaStats* pStats); + + void GetBudget( + VmaBudget* outBudget, uint32_t firstHeap, uint32_t heapCount); + +#if VMA_STATS_STRING_ENABLED + void PrintDetailedMap(class VmaJsonWriter& json); +#endif + + VkResult DefragmentationBegin( + const VmaDefragmentationInfo2& info, + VmaDefragmentationStats* pStats, + VmaDefragmentationContext* pContext); + VkResult DefragmentationEnd( + VmaDefragmentationContext context); + + VkResult DefragmentationPassBegin( + VmaDefragmentationPassInfo* pInfo, + VmaDefragmentationContext context); + VkResult DefragmentationPassEnd( + VmaDefragmentationContext context); + + void GetAllocationInfo(VmaAllocation hAllocation, VmaAllocationInfo* pAllocationInfo); + bool TouchAllocation(VmaAllocation hAllocation); + + VkResult CreatePool(const VmaPoolCreateInfo* pCreateInfo, VmaPool* pPool); + void DestroyPool(VmaPool pool); + void GetPoolStats(VmaPool pool, VmaPoolStats* pPoolStats); + + void SetCurrentFrameIndex(uint32_t frameIndex); + uint32_t GetCurrentFrameIndex() const { return m_CurrentFrameIndex.load(); } + + void MakePoolAllocationsLost( + VmaPool hPool, + size_t* pLostAllocationCount); + VkResult CheckPoolCorruption(VmaPool hPool); + VkResult CheckCorruption(uint32_t memoryTypeBits); + + void CreateLostAllocation(VmaAllocation* pAllocation); + + // Call to Vulkan function vkAllocateMemory with accompanying bookkeeping. + VkResult AllocateVulkanMemory(const VkMemoryAllocateInfo* pAllocateInfo, VkDeviceMemory* pMemory); + // Call to Vulkan function vkFreeMemory with accompanying bookkeeping. + void FreeVulkanMemory(uint32_t memoryType, VkDeviceSize size, VkDeviceMemory hMemory); + // Call to Vulkan function vkBindBufferMemory or vkBindBufferMemory2KHR. 
+ VkResult BindVulkanBuffer( + VkDeviceMemory memory, + VkDeviceSize memoryOffset, + VkBuffer buffer, + const void* pNext); + // Call to Vulkan function vkBindImageMemory or vkBindImageMemory2KHR. + VkResult BindVulkanImage( + VkDeviceMemory memory, + VkDeviceSize memoryOffset, + VkImage image, + const void* pNext); + + VkResult Map(VmaAllocation hAllocation, void** ppData); + void Unmap(VmaAllocation hAllocation); + + VkResult BindBufferMemory( + VmaAllocation hAllocation, + VkDeviceSize allocationLocalOffset, + VkBuffer hBuffer, + const void* pNext); + VkResult BindImageMemory( + VmaAllocation hAllocation, + VkDeviceSize allocationLocalOffset, + VkImage hImage, + const void* pNext); + + VkResult FlushOrInvalidateAllocation( + VmaAllocation hAllocation, + VkDeviceSize offset, VkDeviceSize size, + VMA_CACHE_OPERATION op); + VkResult FlushOrInvalidateAllocations( + uint32_t allocationCount, + const VmaAllocation* allocations, + const VkDeviceSize* offsets, const VkDeviceSize* sizes, + VMA_CACHE_OPERATION op); + + void FillAllocation(const VmaAllocation hAllocation, uint8_t pattern); + + /* + Returns bit mask of memory types that can support defragmentation on GPU as + they support creation of required buffer for copy operations. + */ + uint32_t GetGpuDefragmentationMemoryTypeBits(); + +private: + VkDeviceSize m_PreferredLargeHeapBlockSize; + + VkPhysicalDevice m_PhysicalDevice; + VMA_ATOMIC_UINT32 m_CurrentFrameIndex; + VMA_ATOMIC_UINT32 m_GpuDefragmentationMemoryTypeBits; // UINT32_MAX means uninitialized. + + VMA_RW_MUTEX m_PoolsMutex; + // Protected by m_PoolsMutex. Sorted by pointer value. + VmaVector > m_Pools; + uint32_t m_NextPoolId; + + VmaVulkanFunctions m_VulkanFunctions; + + // Global bit mask AND-ed with any memoryTypeBits to disallow certain memory types. 
+ uint32_t m_GlobalMemoryTypeBits; + +#if VMA_RECORDING_ENABLED + VmaRecorder* m_pRecorder; +#endif + + void ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunctions); + +#if VMA_STATIC_VULKAN_FUNCTIONS == 1 + void ImportVulkanFunctions_Static(); +#endif + + void ImportVulkanFunctions_Custom(const VmaVulkanFunctions* pVulkanFunctions); + +#if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 + void ImportVulkanFunctions_Dynamic(); +#endif + + void ValidateVulkanFunctions(); + + VkDeviceSize CalcPreferredBlockSize(uint32_t memTypeIndex); + + VkResult AllocateMemoryOfType( + VkDeviceSize size, + VkDeviceSize alignment, + bool dedicatedAllocation, + VkBuffer dedicatedBuffer, + VkBufferUsageFlags dedicatedBufferUsage, + VkImage dedicatedImage, + const VmaAllocationCreateInfo& createInfo, + uint32_t memTypeIndex, + VmaSuballocationType suballocType, + size_t allocationCount, + VmaAllocation* pAllocations); + + // Helper function only to be used inside AllocateDedicatedMemory. + VkResult AllocateDedicatedMemoryPage( + VkDeviceSize size, + VmaSuballocationType suballocType, + uint32_t memTypeIndex, + const VkMemoryAllocateInfo& allocInfo, + bool map, + bool isUserDataString, + void* pUserData, + VmaAllocation* pAllocation); + + // Allocates and registers new VkDeviceMemory specifically for dedicated allocations. + VkResult AllocateDedicatedMemory( + VkDeviceSize size, + VmaSuballocationType suballocType, + uint32_t memTypeIndex, + bool withinBudget, + bool map, + bool isUserDataString, + void* pUserData, + float priority, + VkBuffer dedicatedBuffer, + VkBufferUsageFlags dedicatedBufferUsage, + VkImage dedicatedImage, + size_t allocationCount, + VmaAllocation* pAllocations); + + void FreeDedicatedMemory(const VmaAllocation allocation); + + /* + Calculates and returns bit mask of memory types that can support defragmentation + on GPU as they support creation of required buffer for copy operations. 
+ */ + uint32_t CalculateGpuDefragmentationMemoryTypeBits() const; + + uint32_t CalculateGlobalMemoryTypeBits() const; + + bool GetFlushOrInvalidateRange( + VmaAllocation allocation, + VkDeviceSize offset, VkDeviceSize size, + VkMappedMemoryRange& outRange) const; + +#if VMA_MEMORY_BUDGET + void UpdateVulkanBudget(); +#endif // #if VMA_MEMORY_BUDGET +}; + +//////////////////////////////////////////////////////////////////////////////// +// Memory allocation #2 after VmaAllocator_T definition + +static void* VmaMalloc(VmaAllocator hAllocator, size_t size, size_t alignment) +{ + return VmaMalloc(&hAllocator->m_AllocationCallbacks, size, alignment); +} + +static void VmaFree(VmaAllocator hAllocator, void* ptr) +{ + VmaFree(&hAllocator->m_AllocationCallbacks, ptr); +} + +template +static T* VmaAllocate(VmaAllocator hAllocator) +{ + return (T*)VmaMalloc(hAllocator, sizeof(T), VMA_ALIGN_OF(T)); +} + +template +static T* VmaAllocateArray(VmaAllocator hAllocator, size_t count) +{ + return (T*)VmaMalloc(hAllocator, sizeof(T) * count, VMA_ALIGN_OF(T)); +} + +template +static void vma_delete(VmaAllocator hAllocator, T* ptr) +{ + if(ptr != VMA_NULL) + { + ptr->~T(); + VmaFree(hAllocator, ptr); + } +} + +template +static void vma_delete_array(VmaAllocator hAllocator, T* ptr, size_t count) +{ + if(ptr != VMA_NULL) + { + for(size_t i = count; i--; ) + ptr[i].~T(); + VmaFree(hAllocator, ptr); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaStringBuilder + +#if VMA_STATS_STRING_ENABLED + +class VmaStringBuilder +{ +public: + VmaStringBuilder(VmaAllocator alloc) : m_Data(VmaStlAllocator(alloc->GetAllocationCallbacks())) { } + size_t GetLength() const { return m_Data.size(); } + const char* GetData() const { return m_Data.data(); } + + void Add(char ch) { m_Data.push_back(ch); } + void Add(const char* pStr); + void AddNewLine() { Add('\n'); } + void AddNumber(uint32_t num); + void AddNumber(uint64_t num); + void AddPointer(const void* 
ptr); + +private: + VmaVector< char, VmaStlAllocator > m_Data; +}; + +void VmaStringBuilder::Add(const char* pStr) +{ + const size_t strLen = strlen(pStr); + if(strLen > 0) + { + const size_t oldCount = m_Data.size(); + m_Data.resize(oldCount + strLen); + memcpy(m_Data.data() + oldCount, pStr, strLen); + } +} + +void VmaStringBuilder::AddNumber(uint32_t num) +{ + char buf[11]; + buf[10] = '\0'; + char *p = &buf[10]; + do + { + *--p = '0' + (num % 10); + num /= 10; + } + while(num); + Add(p); +} + +void VmaStringBuilder::AddNumber(uint64_t num) +{ + char buf[21]; + buf[20] = '\0'; + char *p = &buf[20]; + do + { + *--p = '0' + (num % 10); + num /= 10; + } + while(num); + Add(p); +} + +void VmaStringBuilder::AddPointer(const void* ptr) +{ + char buf[21]; + VmaPtrToStr(buf, sizeof(buf), ptr); + Add(buf); +} + +#endif // #if VMA_STATS_STRING_ENABLED + +//////////////////////////////////////////////////////////////////////////////// +// VmaJsonWriter + +#if VMA_STATS_STRING_ENABLED + +class VmaJsonWriter +{ + VMA_CLASS_NO_COPY(VmaJsonWriter) +public: + VmaJsonWriter(const VkAllocationCallbacks* pAllocationCallbacks, VmaStringBuilder& sb); + ~VmaJsonWriter(); + + void BeginObject(bool singleLine = false); + void EndObject(); + + void BeginArray(bool singleLine = false); + void EndArray(); + + void WriteString(const char* pStr); + void BeginString(const char* pStr = VMA_NULL); + void ContinueString(const char* pStr); + void ContinueString(uint32_t n); + void ContinueString(uint64_t n); + void ContinueString_Pointer(const void* ptr); + void EndString(const char* pStr = VMA_NULL); + + void WriteNumber(uint32_t n); + void WriteNumber(uint64_t n); + void WriteBool(bool b); + void WriteNull(); + +private: + static const char* const INDENT; + + enum COLLECTION_TYPE + { + COLLECTION_TYPE_OBJECT, + COLLECTION_TYPE_ARRAY, + }; + struct StackItem + { + COLLECTION_TYPE type; + uint32_t valueCount; + bool singleLineMode; + }; + + VmaStringBuilder& m_SB; + VmaVector< StackItem, 
VmaStlAllocator > m_Stack; + bool m_InsideString; + + void BeginValue(bool isString); + void WriteIndent(bool oneLess = false); +}; + +const char* const VmaJsonWriter::INDENT = " "; + +VmaJsonWriter::VmaJsonWriter(const VkAllocationCallbacks* pAllocationCallbacks, VmaStringBuilder& sb) : + m_SB(sb), + m_Stack(VmaStlAllocator(pAllocationCallbacks)), + m_InsideString(false) +{ +} + +VmaJsonWriter::~VmaJsonWriter() +{ + VMA_ASSERT(!m_InsideString); + VMA_ASSERT(m_Stack.empty()); +} + +void VmaJsonWriter::BeginObject(bool singleLine) +{ + VMA_ASSERT(!m_InsideString); + + BeginValue(false); + m_SB.Add('{'); + + StackItem item; + item.type = COLLECTION_TYPE_OBJECT; + item.valueCount = 0; + item.singleLineMode = singleLine; + m_Stack.push_back(item); +} + +void VmaJsonWriter::EndObject() +{ + VMA_ASSERT(!m_InsideString); + + WriteIndent(true); + m_SB.Add('}'); + + VMA_ASSERT(!m_Stack.empty() && m_Stack.back().type == COLLECTION_TYPE_OBJECT); + m_Stack.pop_back(); +} + +void VmaJsonWriter::BeginArray(bool singleLine) +{ + VMA_ASSERT(!m_InsideString); + + BeginValue(false); + m_SB.Add('['); + + StackItem item; + item.type = COLLECTION_TYPE_ARRAY; + item.valueCount = 0; + item.singleLineMode = singleLine; + m_Stack.push_back(item); +} + +void VmaJsonWriter::EndArray() +{ + VMA_ASSERT(!m_InsideString); + + WriteIndent(true); + m_SB.Add(']'); + + VMA_ASSERT(!m_Stack.empty() && m_Stack.back().type == COLLECTION_TYPE_ARRAY); + m_Stack.pop_back(); +} + +void VmaJsonWriter::WriteString(const char* pStr) +{ + BeginString(pStr); + EndString(); +} + +void VmaJsonWriter::BeginString(const char* pStr) +{ + VMA_ASSERT(!m_InsideString); + + BeginValue(true); + m_SB.Add('"'); + m_InsideString = true; + if(pStr != VMA_NULL && pStr[0] != '\0') + { + ContinueString(pStr); + } +} + +void VmaJsonWriter::ContinueString(const char* pStr) +{ + VMA_ASSERT(m_InsideString); + + const size_t strLen = strlen(pStr); + for (const auto i : c10::irange(strLen)) { + char ch = pStr[i]; + if(ch == '\\') + { 
+ m_SB.Add("\\\\"); + } + else if(ch == '"') + { + m_SB.Add("\\\""); + } + else if(ch >= 32) + { + m_SB.Add(ch); + } + else switch(ch) + { + case '\b': + m_SB.Add("\\b"); + break; + case '\f': + m_SB.Add("\\f"); + break; + case '\n': + m_SB.Add("\\n"); + break; + case '\r': + m_SB.Add("\\r"); + break; + case '\t': + m_SB.Add("\\t"); + break; + default: + VMA_ASSERT(0 && "Character not currently supported."); + break; + } + } +} + +void VmaJsonWriter::ContinueString(uint32_t n) +{ + VMA_ASSERT(m_InsideString); + m_SB.AddNumber(n); +} + +void VmaJsonWriter::ContinueString(uint64_t n) +{ + VMA_ASSERT(m_InsideString); + m_SB.AddNumber(n); +} + +void VmaJsonWriter::ContinueString_Pointer(const void* ptr) +{ + VMA_ASSERT(m_InsideString); + m_SB.AddPointer(ptr); +} + +void VmaJsonWriter::EndString(const char* pStr) +{ + VMA_ASSERT(m_InsideString); + if(pStr != VMA_NULL && pStr[0] != '\0') + { + ContinueString(pStr); + } + m_SB.Add('"'); + m_InsideString = false; +} + +void VmaJsonWriter::WriteNumber(uint32_t n) +{ + VMA_ASSERT(!m_InsideString); + BeginValue(false); + m_SB.AddNumber(n); +} + +void VmaJsonWriter::WriteNumber(uint64_t n) +{ + VMA_ASSERT(!m_InsideString); + BeginValue(false); + m_SB.AddNumber(n); +} + +void VmaJsonWriter::WriteBool(bool b) +{ + VMA_ASSERT(!m_InsideString); + BeginValue(false); + m_SB.Add(b ? 
"true" : "false"); +} + +void VmaJsonWriter::WriteNull() +{ + VMA_ASSERT(!m_InsideString); + BeginValue(false); + m_SB.Add("null"); +} + +void VmaJsonWriter::BeginValue(bool isString) +{ + if(!m_Stack.empty()) + { + StackItem& currItem = m_Stack.back(); + if(currItem.type == COLLECTION_TYPE_OBJECT && + currItem.valueCount % 2 == 0) + { + VMA_ASSERT(isString); + } + + if(currItem.type == COLLECTION_TYPE_OBJECT && + currItem.valueCount % 2 != 0) + { + m_SB.Add(": "); + } + else if(currItem.valueCount > 0) + { + m_SB.Add(", "); + WriteIndent(); + } + else + { + WriteIndent(); + } + ++currItem.valueCount; + } +} + +void VmaJsonWriter::WriteIndent(bool oneLess) +{ + if(!m_Stack.empty() && !m_Stack.back().singleLineMode) + { + m_SB.AddNewLine(); + + size_t count = m_Stack.size(); + if(count > 0 && oneLess) + { + --count; + } + for (const auto i : c10::irange(count)) { + m_SB.Add(INDENT); + } + } +} + +#endif // #if VMA_STATS_STRING_ENABLED + +//////////////////////////////////////////////////////////////////////////////// + +void VmaAllocation_T::SetUserData(VmaAllocator hAllocator, void* pUserData) +{ + if(IsUserDataString()) + { + VMA_ASSERT(pUserData == VMA_NULL || pUserData != m_pUserData); + + FreeUserDataString(hAllocator); + + if(pUserData != VMA_NULL) + { + m_pUserData = VmaCreateStringCopy(hAllocator->GetAllocationCallbacks(), (const char*)pUserData); + } + } + else + { + m_pUserData = pUserData; + } +} + +void VmaAllocation_T::ChangeBlockAllocation( + VmaAllocator hAllocator, + VmaDeviceMemoryBlock* block, + VkDeviceSize offset) +{ + VMA_ASSERT(block != VMA_NULL); + VMA_ASSERT(m_Type == ALLOCATION_TYPE_BLOCK); + + // Move mapping reference counter from old block to new block. 
+ if(block != m_BlockAllocation.m_Block) + { + uint32_t mapRefCount = m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP; + if(IsPersistentMap()) + ++mapRefCount; + m_BlockAllocation.m_Block->Unmap(hAllocator, mapRefCount); + block->Map(hAllocator, mapRefCount, VMA_NULL); + } + + m_BlockAllocation.m_Block = block; + m_BlockAllocation.m_Offset = offset; +} + +void VmaAllocation_T::ChangeOffset(VkDeviceSize newOffset) +{ + VMA_ASSERT(m_Type == ALLOCATION_TYPE_BLOCK); + m_BlockAllocation.m_Offset = newOffset; +} + +VkDeviceSize VmaAllocation_T::GetOffset() const +{ + switch(m_Type) + { + case ALLOCATION_TYPE_BLOCK: + return m_BlockAllocation.m_Offset; + case ALLOCATION_TYPE_DEDICATED: + return 0; + default: + VMA_ASSERT(0); + return 0; + } +} + +VkDeviceMemory VmaAllocation_T::GetMemory() const +{ + switch(m_Type) + { + case ALLOCATION_TYPE_BLOCK: + return m_BlockAllocation.m_Block->GetDeviceMemory(); + case ALLOCATION_TYPE_DEDICATED: + return m_DedicatedAllocation.m_hMemory; + default: + VMA_ASSERT(0); + return VK_NULL_HANDLE; + } +} + +void* VmaAllocation_T::GetMappedData() const +{ + switch(m_Type) + { + case ALLOCATION_TYPE_BLOCK: + if(m_MapCount != 0) + { + void* pBlockData = m_BlockAllocation.m_Block->GetMappedData(); + VMA_ASSERT(pBlockData != VMA_NULL); + return (char*)pBlockData + m_BlockAllocation.m_Offset; + } + else + { + return VMA_NULL; + } + break; + case ALLOCATION_TYPE_DEDICATED: + VMA_ASSERT((m_DedicatedAllocation.m_pMappedData != VMA_NULL) == (m_MapCount != 0)); + return m_DedicatedAllocation.m_pMappedData; + default: + VMA_ASSERT(0); + return VMA_NULL; + } +} + +bool VmaAllocation_T::CanBecomeLost() const +{ + switch(m_Type) + { + case ALLOCATION_TYPE_BLOCK: + return m_BlockAllocation.m_CanBecomeLost; + case ALLOCATION_TYPE_DEDICATED: + return false; + default: + VMA_ASSERT(0); + return false; + } +} + +bool VmaAllocation_T::MakeLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) +{ + VMA_ASSERT(CanBecomeLost()); + + /* + Warning: This is a 
carefully designed algorithm. + Do not modify unless you really know what you're doing :) + */ + uint32_t localLastUseFrameIndex = GetLastUseFrameIndex(); + for(;;) + { + if(localLastUseFrameIndex == VMA_FRAME_INDEX_LOST) + { + VMA_ASSERT(0); + return false; + } + else if(localLastUseFrameIndex + frameInUseCount >= currentFrameIndex) + { + return false; + } + else // Last use time earlier than current time. + { + if(CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, VMA_FRAME_INDEX_LOST)) + { + // Setting hAllocation.LastUseFrameIndex atomic to VMA_FRAME_INDEX_LOST is enough to mark it as LOST. + // Calling code just needs to unregister this allocation in owning VmaDeviceMemoryBlock. + return true; + } + } + } +} + +#if VMA_STATS_STRING_ENABLED + +// Correspond to values of enum VmaSuballocationType. +static const char* VMA_SUBALLOCATION_TYPE_NAMES[] = { + "FREE", + "UNKNOWN", + "BUFFER", + "IMAGE_UNKNOWN", + "IMAGE_LINEAR", + "IMAGE_OPTIMAL", +}; + +void VmaAllocation_T::PrintParameters(class VmaJsonWriter& json) const +{ + json.WriteString("Type"); + json.WriteString(VMA_SUBALLOCATION_TYPE_NAMES[m_SuballocationType]); + + json.WriteString("Size"); + json.WriteNumber(m_Size); + + if(m_pUserData != VMA_NULL) + { + json.WriteString("UserData"); + if(IsUserDataString()) + { + json.WriteString((const char*)m_pUserData); + } + else + { + json.BeginString(); + json.ContinueString_Pointer(m_pUserData); + json.EndString(); + } + } + + json.WriteString("CreationFrameIndex"); + json.WriteNumber(m_CreationFrameIndex); + + json.WriteString("LastUseFrameIndex"); + json.WriteNumber(GetLastUseFrameIndex()); + + if(m_BufferImageUsage != 0) + { + json.WriteString("Usage"); + json.WriteNumber(m_BufferImageUsage); + } +} + +#endif + +void VmaAllocation_T::FreeUserDataString(VmaAllocator hAllocator) +{ + VMA_ASSERT(IsUserDataString()); + VmaFreeString(hAllocator->GetAllocationCallbacks(), (char*)m_pUserData); + m_pUserData = VMA_NULL; +} + +void VmaAllocation_T::BlockAllocMap() 
+{ + VMA_ASSERT(GetType() == ALLOCATION_TYPE_BLOCK); + + if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) < 0x7F) + { + ++m_MapCount; + } + else + { + VMA_ASSERT(0 && "Allocation mapped too many times simultaneously."); + } +} + +void VmaAllocation_T::BlockAllocUnmap() +{ + VMA_ASSERT(GetType() == ALLOCATION_TYPE_BLOCK); + + if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) != 0) + { + --m_MapCount; + } + else + { + VMA_ASSERT(0 && "Unmapping allocation not previously mapped."); + } +} + +VkResult VmaAllocation_T::DedicatedAllocMap(VmaAllocator hAllocator, void** ppData) +{ + VMA_ASSERT(GetType() == ALLOCATION_TYPE_DEDICATED); + + if(m_MapCount != 0) + { + if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) < 0x7F) + { + VMA_ASSERT(m_DedicatedAllocation.m_pMappedData != VMA_NULL); + *ppData = m_DedicatedAllocation.m_pMappedData; + ++m_MapCount; + return VK_SUCCESS; + } + else + { + VMA_ASSERT(0 && "Dedicated allocation mapped too many times simultaneously."); + return VK_ERROR_MEMORY_MAP_FAILED; + } + } + else + { + VkResult result = (*hAllocator->GetVulkanFunctions().vkMapMemory)( + hAllocator->m_hDevice, + m_DedicatedAllocation.m_hMemory, + 0, // offset + VK_WHOLE_SIZE, + 0, // flags + ppData); + if(result == VK_SUCCESS) + { + m_DedicatedAllocation.m_pMappedData = *ppData; + m_MapCount = 1; + } + return result; + } +} + +void VmaAllocation_T::DedicatedAllocUnmap(VmaAllocator hAllocator) +{ + VMA_ASSERT(GetType() == ALLOCATION_TYPE_DEDICATED); + + if((m_MapCount & ~MAP_COUNT_FLAG_PERSISTENT_MAP) != 0) + { + --m_MapCount; + if(m_MapCount == 0) + { + m_DedicatedAllocation.m_pMappedData = VMA_NULL; + (*hAllocator->GetVulkanFunctions().vkUnmapMemory)( + hAllocator->m_hDevice, + m_DedicatedAllocation.m_hMemory); + } + } + else + { + VMA_ASSERT(0 && "Unmapping dedicated allocation not previously mapped."); + } +} + +#if VMA_STATS_STRING_ENABLED + +static void VmaPrintStatInfo(VmaJsonWriter& json, const VmaStatInfo& stat) +{ + json.BeginObject(); + + 
json.WriteString("Blocks"); + json.WriteNumber(stat.blockCount); + + json.WriteString("Allocations"); + json.WriteNumber(stat.allocationCount); + + json.WriteString("UnusedRanges"); + json.WriteNumber(stat.unusedRangeCount); + + json.WriteString("UsedBytes"); + json.WriteNumber(stat.usedBytes); + + json.WriteString("UnusedBytes"); + json.WriteNumber(stat.unusedBytes); + + if(stat.allocationCount > 1) + { + json.WriteString("AllocationSize"); + json.BeginObject(true); + json.WriteString("Min"); + json.WriteNumber(stat.allocationSizeMin); + json.WriteString("Avg"); + json.WriteNumber(stat.allocationSizeAvg); + json.WriteString("Max"); + json.WriteNumber(stat.allocationSizeMax); + json.EndObject(); + } + + if(stat.unusedRangeCount > 1) + { + json.WriteString("UnusedRangeSize"); + json.BeginObject(true); + json.WriteString("Min"); + json.WriteNumber(stat.unusedRangeSizeMin); + json.WriteString("Avg"); + json.WriteNumber(stat.unusedRangeSizeAvg); + json.WriteString("Max"); + json.WriteNumber(stat.unusedRangeSizeMax); + json.EndObject(); + } + + json.EndObject(); +} + +#endif // #if VMA_STATS_STRING_ENABLED + +struct VmaSuballocationItemSizeLess +{ + bool operator()( + const VmaSuballocationList::iterator lhs, + const VmaSuballocationList::iterator rhs) const + { + return lhs->size < rhs->size; + } + bool operator()( + const VmaSuballocationList::iterator lhs, + VkDeviceSize rhsSize) const + { + return lhs->size < rhsSize; + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +// class VmaBlockMetadata + +VmaBlockMetadata::VmaBlockMetadata(VmaAllocator hAllocator) : + m_Size(0), + m_pAllocationCallbacks(hAllocator->GetAllocationCallbacks()) +{ +} + +#if VMA_STATS_STRING_ENABLED + +void VmaBlockMetadata::PrintDetailedMap_Begin(class VmaJsonWriter& json, + VkDeviceSize unusedBytes, + size_t allocationCount, + size_t unusedRangeCount) const +{ + json.BeginObject(); + + json.WriteString("TotalBytes"); + json.WriteNumber(GetSize()); + + 
json.WriteString("UnusedBytes"); + json.WriteNumber(unusedBytes); + + json.WriteString("Allocations"); + json.WriteNumber((uint64_t)allocationCount); + + json.WriteString("UnusedRanges"); + json.WriteNumber((uint64_t)unusedRangeCount); + + json.WriteString("Suballocations"); + json.BeginArray(); +} + +void VmaBlockMetadata::PrintDetailedMap_Allocation(class VmaJsonWriter& json, + VkDeviceSize offset, + VmaAllocation hAllocation) const +{ + json.BeginObject(true); + + json.WriteString("Offset"); + json.WriteNumber(offset); + + hAllocation->PrintParameters(json); + + json.EndObject(); +} + +void VmaBlockMetadata::PrintDetailedMap_UnusedRange(class VmaJsonWriter& json, + VkDeviceSize offset, + VkDeviceSize size) const +{ + json.BeginObject(true); + + json.WriteString("Offset"); + json.WriteNumber(offset); + + json.WriteString("Type"); + json.WriteString(VMA_SUBALLOCATION_TYPE_NAMES[VMA_SUBALLOCATION_TYPE_FREE]); + + json.WriteString("Size"); + json.WriteNumber(size); + + json.EndObject(); +} + +void VmaBlockMetadata::PrintDetailedMap_End(class VmaJsonWriter& json) const +{ + json.EndArray(); + json.EndObject(); +} + +#endif // #if VMA_STATS_STRING_ENABLED + +//////////////////////////////////////////////////////////////////////////////// +// class VmaBlockMetadata_Generic + +VmaBlockMetadata_Generic::VmaBlockMetadata_Generic(VmaAllocator hAllocator) : + VmaBlockMetadata(hAllocator), + m_FreeCount(0), + m_SumFreeSize(0), + m_Suballocations(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), + m_FreeSuballocationsBySize(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) +{ +} + +VmaBlockMetadata_Generic::~VmaBlockMetadata_Generic() +{ +} + +void VmaBlockMetadata_Generic::Init(VkDeviceSize size) +{ + VmaBlockMetadata::Init(size); + + m_FreeCount = 1; + m_SumFreeSize = size; + + VmaSuballocation suballoc = {}; + suballoc.offset = 0; + suballoc.size = size; + suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + suballoc.hAllocation = VK_NULL_HANDLE; + + VMA_ASSERT(size > 
VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER); + m_Suballocations.push_back(suballoc); + VmaSuballocationList::iterator suballocItem = m_Suballocations.end(); + --suballocItem; + m_FreeSuballocationsBySize.push_back(suballocItem); +} + +bool VmaBlockMetadata_Generic::Validate() const +{ + VMA_VALIDATE(!m_Suballocations.empty()); + + // Expected offset of new suballocation as calculated from previous ones. + VkDeviceSize calculatedOffset = 0; + // Expected number of free suballocations as calculated from traversing their list. + uint32_t calculatedFreeCount = 0; + // Expected sum size of free suballocations as calculated from traversing their list. + VkDeviceSize calculatedSumFreeSize = 0; + // Expected number of free suballocations that should be registered in + // m_FreeSuballocationsBySize calculated from traversing their list. + size_t freeSuballocationsToRegister = 0; + // True if previous visited suballocation was free. + bool prevFree = false; + + for(VmaSuballocationList::const_iterator suballocItem = m_Suballocations.cbegin(); + suballocItem != m_Suballocations.cend(); + ++suballocItem) + { + const VmaSuballocation& subAlloc = *suballocItem; + + // Actual offset of this suballocation doesn't match expected one. + VMA_VALIDATE(subAlloc.offset == calculatedOffset); + + const bool currFree = (subAlloc.type == VMA_SUBALLOCATION_TYPE_FREE); + // Two adjacent free suballocations are invalid. They should be merged. + VMA_VALIDATE(!prevFree || !currFree); + + VMA_VALIDATE(currFree == (subAlloc.hAllocation == VK_NULL_HANDLE)); + + if(currFree) + { + calculatedSumFreeSize += subAlloc.size; + ++calculatedFreeCount; + if(subAlloc.size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + ++freeSuballocationsToRegister; + } + + // Margin required between allocations - every free space must be at least that large. 
+ VMA_VALIDATE(subAlloc.size >= VMA_DEBUG_MARGIN); + } + else + { + VMA_VALIDATE(subAlloc.hAllocation->GetOffset() == subAlloc.offset); + VMA_VALIDATE(subAlloc.hAllocation->GetSize() == subAlloc.size); + + // Margin required between allocations - previous allocation must be free. + VMA_VALIDATE(VMA_DEBUG_MARGIN == 0 || prevFree); + } + + calculatedOffset += subAlloc.size; + prevFree = currFree; + } + + // Number of free suballocations registered in m_FreeSuballocationsBySize doesn't + // match expected one. + VMA_VALIDATE(m_FreeSuballocationsBySize.size() == freeSuballocationsToRegister); + + VkDeviceSize lastSize = 0; + for (const auto i : c10::irange(m_FreeSuballocationsBySize.size())) { + VmaSuballocationList::iterator suballocItem = m_FreeSuballocationsBySize[i]; + + // Only free suballocations can be registered in m_FreeSuballocationsBySize. + VMA_VALIDATE(suballocItem->type == VMA_SUBALLOCATION_TYPE_FREE); + // They must be sorted by size ascending. + VMA_VALIDATE(suballocItem->size >= lastSize); + + lastSize = suballocItem->size; + } + + // Check if totals match calculacted values. 
+ VMA_VALIDATE(ValidateFreeSuballocationList()); + VMA_VALIDATE(calculatedOffset == GetSize()); + VMA_VALIDATE(calculatedSumFreeSize == m_SumFreeSize); + VMA_VALIDATE(calculatedFreeCount == m_FreeCount); + + return true; +} + +VkDeviceSize VmaBlockMetadata_Generic::GetUnusedRangeSizeMax() const +{ + if(!m_FreeSuballocationsBySize.empty()) + { + return m_FreeSuballocationsBySize.back()->size; + } + else + { + return 0; + } +} + +bool VmaBlockMetadata_Generic::IsEmpty() const +{ + return (m_Suballocations.size() == 1) && (m_FreeCount == 1); +} + +void VmaBlockMetadata_Generic::CalcAllocationStatInfo(VmaStatInfo& outInfo) const +{ + outInfo.blockCount = 1; + + const uint32_t rangeCount = (uint32_t)m_Suballocations.size(); + outInfo.allocationCount = rangeCount - m_FreeCount; + outInfo.unusedRangeCount = m_FreeCount; + + outInfo.unusedBytes = m_SumFreeSize; + outInfo.usedBytes = GetSize() - outInfo.unusedBytes; + + outInfo.allocationSizeMin = UINT64_MAX; + outInfo.allocationSizeMax = 0; + outInfo.unusedRangeSizeMin = UINT64_MAX; + outInfo.unusedRangeSizeMax = 0; + + for(VmaSuballocationList::const_iterator suballocItem = m_Suballocations.cbegin(); + suballocItem != m_Suballocations.cend(); + ++suballocItem) + { + const VmaSuballocation& suballoc = *suballocItem; + if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) + { + outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); + outInfo.allocationSizeMax = VMA_MAX(outInfo.allocationSizeMax, suballoc.size); + } + else + { + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, suballoc.size); + outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, suballoc.size); + } + } +} + +void VmaBlockMetadata_Generic::AddPoolStats(VmaPoolStats& inoutStats) const +{ + const uint32_t rangeCount = (uint32_t)m_Suballocations.size(); + + inoutStats.size += GetSize(); + inoutStats.unusedSize += m_SumFreeSize; + inoutStats.allocationCount += rangeCount - m_FreeCount; + inoutStats.unusedRangeCount 
+= m_FreeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, GetUnusedRangeSizeMax()); +} + +#if VMA_STATS_STRING_ENABLED + +void VmaBlockMetadata_Generic::PrintDetailedMap(class VmaJsonWriter& json) const +{ + PrintDetailedMap_Begin(json, + m_SumFreeSize, // unusedBytes + m_Suballocations.size() - (size_t)m_FreeCount, // allocationCount + m_FreeCount); // unusedRangeCount + + size_t i = 0; + for(VmaSuballocationList::const_iterator suballocItem = m_Suballocations.cbegin(); + suballocItem != m_Suballocations.cend(); + ++suballocItem, ++i) + { + if(suballocItem->type == VMA_SUBALLOCATION_TYPE_FREE) + { + PrintDetailedMap_UnusedRange(json, suballocItem->offset, suballocItem->size); + } + else + { + PrintDetailedMap_Allocation(json, suballocItem->offset, suballocItem->hAllocation); + } + } + + PrintDetailedMap_End(json); +} + +#endif // #if VMA_STATS_STRING_ENABLED + +bool VmaBlockMetadata_Generic::CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest) +{ + VMA_ASSERT(allocSize > 0); + VMA_ASSERT(!upperAddress); + VMA_ASSERT(allocType != VMA_SUBALLOCATION_TYPE_FREE); + VMA_ASSERT(pAllocationRequest != VMA_NULL); + VMA_HEAVY_ASSERT(Validate()); + + pAllocationRequest->type = VmaAllocationRequestType::Normal; + + // There is not enough total free space in this block to fullfill the request: Early return. + if(canMakeOtherLost == false && + m_SumFreeSize < allocSize + 2 * VMA_DEBUG_MARGIN) + { + return false; + } + + // New algorithm, efficiently searching freeSuballocationsBySize. 
+ const size_t freeSuballocCount = m_FreeSuballocationsBySize.size(); + if(freeSuballocCount > 0) + { + if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT) + { + // Find first free suballocation with size not less than allocSize + 2 * VMA_DEBUG_MARGIN. + VmaSuballocationList::iterator* const it = VmaBinaryFindFirstNotLess( + m_FreeSuballocationsBySize.data(), + m_FreeSuballocationsBySize.data() + freeSuballocCount, + allocSize + 2 * VMA_DEBUG_MARGIN, + VmaSuballocationItemSizeLess()); + size_t index = it - m_FreeSuballocationsBySize.data(); + for(; index < freeSuballocCount; ++index) + { + if(CheckAllocation( + currentFrameIndex, + frameInUseCount, + bufferImageGranularity, + allocSize, + allocAlignment, + allocType, + m_FreeSuballocationsBySize[index], + false, // canMakeOtherLost + &pAllocationRequest->offset, + &pAllocationRequest->itemsToMakeLostCount, + &pAllocationRequest->sumFreeSize, + &pAllocationRequest->sumItemSize)) + { + pAllocationRequest->item = m_FreeSuballocationsBySize[index]; + return true; + } + } + } + else if(strategy == VMA_ALLOCATION_INTERNAL_STRATEGY_MIN_OFFSET) + { + for(VmaSuballocationList::iterator it = m_Suballocations.begin(); + it != m_Suballocations.end(); + ++it) + { + if(it->type == VMA_SUBALLOCATION_TYPE_FREE && CheckAllocation( + currentFrameIndex, + frameInUseCount, + bufferImageGranularity, + allocSize, + allocAlignment, + allocType, + it, + false, // canMakeOtherLost + &pAllocationRequest->offset, + &pAllocationRequest->itemsToMakeLostCount, + &pAllocationRequest->sumFreeSize, + &pAllocationRequest->sumItemSize)) + { + pAllocationRequest->item = it; + return true; + } + } + } + else // WORST_FIT, FIRST_FIT + { + // Search staring from biggest suballocations. 
+ for(size_t index = freeSuballocCount; index--; ) + { + if(CheckAllocation( + currentFrameIndex, + frameInUseCount, + bufferImageGranularity, + allocSize, + allocAlignment, + allocType, + m_FreeSuballocationsBySize[index], + false, // canMakeOtherLost + &pAllocationRequest->offset, + &pAllocationRequest->itemsToMakeLostCount, + &pAllocationRequest->sumFreeSize, + &pAllocationRequest->sumItemSize)) + { + pAllocationRequest->item = m_FreeSuballocationsBySize[index]; + return true; + } + } + } + } + + if(canMakeOtherLost) + { + // Brute-force algorithm. TODO: Come up with something better. + + bool found = false; + VmaAllocationRequest tmpAllocRequest = {}; + tmpAllocRequest.type = VmaAllocationRequestType::Normal; + for(VmaSuballocationList::iterator suballocIt = m_Suballocations.begin(); + suballocIt != m_Suballocations.end(); + ++suballocIt) + { + if(suballocIt->type == VMA_SUBALLOCATION_TYPE_FREE || + suballocIt->hAllocation->CanBecomeLost()) + { + if(CheckAllocation( + currentFrameIndex, + frameInUseCount, + bufferImageGranularity, + allocSize, + allocAlignment, + allocType, + suballocIt, + canMakeOtherLost, + &tmpAllocRequest.offset, + &tmpAllocRequest.itemsToMakeLostCount, + &tmpAllocRequest.sumFreeSize, + &tmpAllocRequest.sumItemSize)) + { + if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT) + { + *pAllocationRequest = tmpAllocRequest; + pAllocationRequest->item = suballocIt; + break; + } + if(!found || tmpAllocRequest.CalcCost() < pAllocationRequest->CalcCost()) + { + *pAllocationRequest = tmpAllocRequest; + pAllocationRequest->item = suballocIt; + found = true; + } + } + } + } + + return found; + } + + return false; +} + +bool VmaBlockMetadata_Generic::MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest) +{ + VMA_ASSERT(pAllocationRequest && pAllocationRequest->type == VmaAllocationRequestType::Normal); + + while(pAllocationRequest->itemsToMakeLostCount > 0) + { + 
if(pAllocationRequest->item->type == VMA_SUBALLOCATION_TYPE_FREE) + { + ++pAllocationRequest->item; + } + VMA_ASSERT(pAllocationRequest->item != m_Suballocations.end()); + VMA_ASSERT(pAllocationRequest->item->hAllocation != VK_NULL_HANDLE); + VMA_ASSERT(pAllocationRequest->item->hAllocation->CanBecomeLost()); + if(pAllocationRequest->item->hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) + { + pAllocationRequest->item = FreeSuballocation(pAllocationRequest->item); + --pAllocationRequest->itemsToMakeLostCount; + } + else + { + return false; + } + } + + VMA_HEAVY_ASSERT(Validate()); + VMA_ASSERT(pAllocationRequest->item != m_Suballocations.end()); + VMA_ASSERT(pAllocationRequest->item->type == VMA_SUBALLOCATION_TYPE_FREE); + + return true; +} + +uint32_t VmaBlockMetadata_Generic::MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) +{ + uint32_t lostAllocationCount = 0; + for(VmaSuballocationList::iterator it = m_Suballocations.begin(); + it != m_Suballocations.end(); + ++it) + { + if(it->type != VMA_SUBALLOCATION_TYPE_FREE && + it->hAllocation->CanBecomeLost() && + it->hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) + { + it = FreeSuballocation(it); + ++lostAllocationCount; + } + } + return lostAllocationCount; +} + +VkResult VmaBlockMetadata_Generic::CheckCorruption(const void* pBlockData) +{ + for(VmaSuballocationList::iterator it = m_Suballocations.begin(); + it != m_Suballocations.end(); + ++it) + { + if(it->type != VMA_SUBALLOCATION_TYPE_FREE) + { + if(!VmaValidateMagicValue(pBlockData, it->offset - VMA_DEBUG_MARGIN)) + { + VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE VALIDATED ALLOCATION!"); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + if(!VmaValidateMagicValue(pBlockData, it->offset + it->size)) + { + VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER VALIDATED ALLOCATION!"); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + } + } + + return VK_SUCCESS; +} + +void VmaBlockMetadata_Generic::Alloc( + const 
VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation) +{ + VMA_ASSERT(request.type == VmaAllocationRequestType::Normal); + VMA_ASSERT(request.item != m_Suballocations.end()); + VmaSuballocation& suballoc = *request.item; + // Given suballocation is a free block. + VMA_ASSERT(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); + // Given offset is inside this suballocation. + VMA_ASSERT(request.offset >= suballoc.offset); + const VkDeviceSize paddingBegin = request.offset - suballoc.offset; + VMA_ASSERT(suballoc.size >= paddingBegin + allocSize); + const VkDeviceSize paddingEnd = suballoc.size - paddingBegin - allocSize; + + // Unregister this free suballocation from m_FreeSuballocationsBySize and update + // it to become used. + UnregisterFreeSuballocation(request.item); + + suballoc.offset = request.offset; + suballoc.size = allocSize; + suballoc.type = type; + suballoc.hAllocation = hAllocation; + + // If there are any free bytes remaining at the end, insert new free suballocation after current one. + if(paddingEnd) + { + VmaSuballocation paddingSuballoc = {}; + paddingSuballoc.offset = request.offset + allocSize; + paddingSuballoc.size = paddingEnd; + paddingSuballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + VmaSuballocationList::iterator next = request.item; + ++next; + const VmaSuballocationList::iterator paddingEndItem = + m_Suballocations.insert(next, paddingSuballoc); + RegisterFreeSuballocation(paddingEndItem); + } + + // If there are any free bytes remaining at the beginning, insert new free suballocation before current one. 
+ if(paddingBegin) + { + VmaSuballocation paddingSuballoc = {}; + paddingSuballoc.offset = request.offset - paddingBegin; + paddingSuballoc.size = paddingBegin; + paddingSuballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + const VmaSuballocationList::iterator paddingBeginItem = + m_Suballocations.insert(request.item, paddingSuballoc); + RegisterFreeSuballocation(paddingBeginItem); + } + + // Update totals. + m_FreeCount = m_FreeCount - 1; + if(paddingBegin > 0) + { + ++m_FreeCount; + } + if(paddingEnd > 0) + { + ++m_FreeCount; + } + m_SumFreeSize -= allocSize; +} + +void VmaBlockMetadata_Generic::Free(const VmaAllocation allocation) +{ + for(VmaSuballocationList::iterator suballocItem = m_Suballocations.begin(); + suballocItem != m_Suballocations.end(); + ++suballocItem) + { + VmaSuballocation& suballoc = *suballocItem; + if(suballoc.hAllocation == allocation) + { + FreeSuballocation(suballocItem); + VMA_HEAVY_ASSERT(Validate()); + return; + } + } + VMA_ASSERT(0 && "Not found!"); +} + +void VmaBlockMetadata_Generic::FreeAtOffset(VkDeviceSize offset) +{ + for(VmaSuballocationList::iterator suballocItem = m_Suballocations.begin(); + suballocItem != m_Suballocations.end(); + ++suballocItem) + { + VmaSuballocation& suballoc = *suballocItem; + if(suballoc.offset == offset) + { + FreeSuballocation(suballocItem); + return; + } + } + VMA_ASSERT(0 && "Not found!"); +} + +bool VmaBlockMetadata_Generic::ValidateFreeSuballocationList() const +{ + VkDeviceSize lastSize = 0; + for(size_t i = 0, count = m_FreeSuballocationsBySize.size(); i < count; ++i) + { + const VmaSuballocationList::iterator it = m_FreeSuballocationsBySize[i]; + + VMA_VALIDATE(it->type == VMA_SUBALLOCATION_TYPE_FREE); + VMA_VALIDATE(it->size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER); + VMA_VALIDATE(it->size >= lastSize); + lastSize = it->size; + } + return true; +} + +bool VmaBlockMetadata_Generic::CheckAllocation( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize 
bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + VmaSuballocationType allocType, + VmaSuballocationList::const_iterator suballocItem, + bool canMakeOtherLost, + VkDeviceSize* pOffset, + size_t* itemsToMakeLostCount, + VkDeviceSize* pSumFreeSize, + VkDeviceSize* pSumItemSize) const +{ + VMA_ASSERT(allocSize > 0); + VMA_ASSERT(allocType != VMA_SUBALLOCATION_TYPE_FREE); + VMA_ASSERT(suballocItem != m_Suballocations.cend()); + VMA_ASSERT(pOffset != VMA_NULL); + + *itemsToMakeLostCount = 0; + *pSumFreeSize = 0; + *pSumItemSize = 0; + + if(canMakeOtherLost) + { + if(suballocItem->type == VMA_SUBALLOCATION_TYPE_FREE) + { + *pSumFreeSize = suballocItem->size; + } + else + { + if(suballocItem->hAllocation->CanBecomeLost() && + suballocItem->hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) + { + ++*itemsToMakeLostCount; + *pSumItemSize = suballocItem->size; + } + else + { + return false; + } + } + + // Remaining size is too small for this request: Early return. + if(GetSize() - suballocItem->offset < allocSize) + { + return false; + } + + // Start from offset equal to beginning of this suballocation. + *pOffset = suballocItem->offset; + + // Apply VMA_DEBUG_MARGIN at the beginning. + if(VMA_DEBUG_MARGIN > 0) + { + *pOffset += VMA_DEBUG_MARGIN; + } + + // Apply alignment. + *pOffset = VmaAlignUp(*pOffset, allocAlignment); + + // Check previous suballocations for BufferImageGranularity conflicts. + // Make bigger alignment if necessary. 
+ if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment) + { + bool bufferImageGranularityConflict = false; + VmaSuballocationList::const_iterator prevSuballocItem = suballocItem; + while(prevSuballocItem != m_Suballocations.cbegin()) + { + --prevSuballocItem; + const VmaSuballocation& prevSuballoc = *prevSuballocItem; + if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, *pOffset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) + { + bufferImageGranularityConflict = true; + break; + } + } + else + // Already on previous page. + break; + } + if(bufferImageGranularityConflict) + { + *pOffset = VmaAlignUp(*pOffset, bufferImageGranularity); + } + } + + // Now that we have final *pOffset, check if we are past suballocItem. + // If yes, return false - this function should be called for another suballocItem as starting point. + if(*pOffset >= suballocItem->offset + suballocItem->size) + { + return false; + } + + // Calculate padding at the beginning based on current offset. + const VkDeviceSize paddingBegin = *pOffset - suballocItem->offset; + + // Calculate required margin at the end. + const VkDeviceSize requiredEndMargin = VMA_DEBUG_MARGIN; + + const VkDeviceSize totalSize = paddingBegin + allocSize + requiredEndMargin; + // Another early return check. + if(suballocItem->offset + totalSize > GetSize()) + { + return false; + } + + // Advance lastSuballocItem until desired size is reached. + // Update itemsToMakeLostCount. 
+ VmaSuballocationList::const_iterator lastSuballocItem = suballocItem; + if(totalSize > suballocItem->size) + { + VkDeviceSize remainingSize = totalSize - suballocItem->size; + while(remainingSize > 0) + { + ++lastSuballocItem; + if(lastSuballocItem == m_Suballocations.cend()) + { + return false; + } + if(lastSuballocItem->type == VMA_SUBALLOCATION_TYPE_FREE) + { + *pSumFreeSize += lastSuballocItem->size; + } + else + { + VMA_ASSERT(lastSuballocItem->hAllocation != VK_NULL_HANDLE); + if(lastSuballocItem->hAllocation->CanBecomeLost() && + lastSuballocItem->hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) + { + ++*itemsToMakeLostCount; + *pSumItemSize += lastSuballocItem->size; + } + else + { + return false; + } + } + remainingSize = (lastSuballocItem->size < remainingSize) ? + remainingSize - lastSuballocItem->size : 0; + } + } + + // Check next suballocations for BufferImageGranularity conflicts. + // If conflict exists, we must mark more allocations lost or fail. + if(allocSize % bufferImageGranularity || *pOffset % bufferImageGranularity) + { + VmaSuballocationList::const_iterator nextSuballocItem = lastSuballocItem; + ++nextSuballocItem; + while(nextSuballocItem != m_Suballocations.cend()) + { + const VmaSuballocation& nextSuballoc = *nextSuballocItem; + if(VmaBlocksOnSamePage(*pOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) + { + VMA_ASSERT(nextSuballoc.hAllocation != VK_NULL_HANDLE); + if(nextSuballoc.hAllocation->CanBecomeLost() && + nextSuballoc.hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) + { + ++*itemsToMakeLostCount; + } + else + { + return false; + } + } + } + else + { + // Already on next page. 
+ break; + } + ++nextSuballocItem; + } + } + } + else + { + const VmaSuballocation& suballoc = *suballocItem; + VMA_ASSERT(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); + + *pSumFreeSize = suballoc.size; + + // Size of this suballocation is too small for this request: Early return. + if(suballoc.size < allocSize) + { + return false; + } + + // Start from offset equal to beginning of this suballocation. + *pOffset = suballoc.offset; + + // Apply VMA_DEBUG_MARGIN at the beginning. + if(VMA_DEBUG_MARGIN > 0) + { + *pOffset += VMA_DEBUG_MARGIN; + } + + // Apply alignment. + *pOffset = VmaAlignUp(*pOffset, allocAlignment); + + // Check previous suballocations for BufferImageGranularity conflicts. + // Make bigger alignment if necessary. + if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment) + { + bool bufferImageGranularityConflict = false; + VmaSuballocationList::const_iterator prevSuballocItem = suballocItem; + while(prevSuballocItem != m_Suballocations.cbegin()) + { + --prevSuballocItem; + const VmaSuballocation& prevSuballoc = *prevSuballocItem; + if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, *pOffset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) + { + bufferImageGranularityConflict = true; + break; + } + } + else + // Already on previous page. + break; + } + if(bufferImageGranularityConflict) + { + *pOffset = VmaAlignUp(*pOffset, bufferImageGranularity); + } + } + + // Calculate padding at the beginning based on current offset. + const VkDeviceSize paddingBegin = *pOffset - suballoc.offset; + + // Calculate required margin at the end. + const VkDeviceSize requiredEndMargin = VMA_DEBUG_MARGIN; + + // Fail if requested size plus margin before and after is bigger than size of this suballocation. + if(paddingBegin + allocSize + requiredEndMargin > suballoc.size) + { + return false; + } + + // Check next suballocations for BufferImageGranularity conflicts. 
+ // If conflict exists, allocation cannot be made here. + if(allocSize % bufferImageGranularity || *pOffset % bufferImageGranularity) + { + VmaSuballocationList::const_iterator nextSuballocItem = suballocItem; + ++nextSuballocItem; + while(nextSuballocItem != m_Suballocations.cend()) + { + const VmaSuballocation& nextSuballoc = *nextSuballocItem; + if(VmaBlocksOnSamePage(*pOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) + { + return false; + } + } + else + { + // Already on next page. + break; + } + ++nextSuballocItem; + } + } + } + + // All tests passed: Success. pOffset is already filled. + return true; +} + +void VmaBlockMetadata_Generic::MergeFreeWithNext(VmaSuballocationList::iterator item) +{ + VMA_ASSERT(item != m_Suballocations.end()); + VMA_ASSERT(item->type == VMA_SUBALLOCATION_TYPE_FREE); + + VmaSuballocationList::iterator nextItem = item; + ++nextItem; + VMA_ASSERT(nextItem != m_Suballocations.end()); + VMA_ASSERT(nextItem->type == VMA_SUBALLOCATION_TYPE_FREE); + + item->size += nextItem->size; + --m_FreeCount; + m_Suballocations.erase(nextItem); +} + +VmaSuballocationList::iterator VmaBlockMetadata_Generic::FreeSuballocation(VmaSuballocationList::iterator suballocItem) +{ + // Change this suballocation to be marked as free. + VmaSuballocation& suballoc = *suballocItem; + suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + suballoc.hAllocation = VK_NULL_HANDLE; + + // Update totals. + ++m_FreeCount; + m_SumFreeSize += suballoc.size; + + // Merge with previous and/or next suballocation if it's also free. 
+ bool mergeWithNext = false; + bool mergeWithPrev = false; + + VmaSuballocationList::iterator nextItem = suballocItem; + ++nextItem; + if((nextItem != m_Suballocations.end()) && (nextItem->type == VMA_SUBALLOCATION_TYPE_FREE)) + { + mergeWithNext = true; + } + + VmaSuballocationList::iterator prevItem = suballocItem; + if(suballocItem != m_Suballocations.begin()) + { + --prevItem; + if(prevItem->type == VMA_SUBALLOCATION_TYPE_FREE) + { + mergeWithPrev = true; + } + } + + if(mergeWithNext) + { + UnregisterFreeSuballocation(nextItem); + MergeFreeWithNext(suballocItem); + } + + if(mergeWithPrev) + { + UnregisterFreeSuballocation(prevItem); + MergeFreeWithNext(prevItem); + RegisterFreeSuballocation(prevItem); + return prevItem; + } + else + { + RegisterFreeSuballocation(suballocItem); + return suballocItem; + } +} + +void VmaBlockMetadata_Generic::RegisterFreeSuballocation(VmaSuballocationList::iterator item) +{ + VMA_ASSERT(item->type == VMA_SUBALLOCATION_TYPE_FREE); + VMA_ASSERT(item->size > 0); + + // You may want to enable this validation at the beginning or at the end of + // this function, depending on what do you want to check. + VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); + + if(item->size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + if(m_FreeSuballocationsBySize.empty()) + { + m_FreeSuballocationsBySize.push_back(item); + } + else + { + VmaVectorInsertSorted(m_FreeSuballocationsBySize, item); + } + } + + //VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); +} + + +void VmaBlockMetadata_Generic::UnregisterFreeSuballocation(VmaSuballocationList::iterator item) +{ + VMA_ASSERT(item->type == VMA_SUBALLOCATION_TYPE_FREE); + VMA_ASSERT(item->size > 0); + + // You may want to enable this validation at the beginning or at the end of + // this function, depending on what do you want to check. 
+ VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); + + if(item->size >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + VmaSuballocationList::iterator* const it = VmaBinaryFindFirstNotLess( + m_FreeSuballocationsBySize.data(), + m_FreeSuballocationsBySize.data() + m_FreeSuballocationsBySize.size(), + item, + VmaSuballocationItemSizeLess()); + for(size_t index = it - m_FreeSuballocationsBySize.data(); + index < m_FreeSuballocationsBySize.size(); + ++index) + { + if(m_FreeSuballocationsBySize[index] == item) + { + VmaVectorRemove(m_FreeSuballocationsBySize, index); + return; + } + VMA_ASSERT((m_FreeSuballocationsBySize[index]->size == item->size) && "Not found."); + } + VMA_ASSERT(0 && "Not found."); + } + + //VMA_HEAVY_ASSERT(ValidateFreeSuballocationList()); +} + +bool VmaBlockMetadata_Generic::IsBufferImageGranularityConflictPossible( + VkDeviceSize bufferImageGranularity, + VmaSuballocationType& inOutPrevSuballocType) const +{ + if(bufferImageGranularity == 1 || IsEmpty()) + { + return false; + } + + VkDeviceSize minAlignment = VK_WHOLE_SIZE; + bool typeConflictFound = false; + for(VmaSuballocationList::const_iterator it = m_Suballocations.cbegin(); + it != m_Suballocations.cend(); + ++it) + { + const VmaSuballocationType suballocType = it->type; + if(suballocType != VMA_SUBALLOCATION_TYPE_FREE) + { + minAlignment = VMA_MIN(minAlignment, it->hAllocation->GetAlignment()); + if(VmaIsBufferImageGranularityConflict(inOutPrevSuballocType, suballocType)) + { + typeConflictFound = true; + } + inOutPrevSuballocType = suballocType; + } + } + + return typeConflictFound || minAlignment >= bufferImageGranularity; +} + +//////////////////////////////////////////////////////////////////////////////// +// class VmaBlockMetadata_Linear + +VmaBlockMetadata_Linear::VmaBlockMetadata_Linear(VmaAllocator hAllocator) : + VmaBlockMetadata(hAllocator), + m_SumFreeSize(0), + m_Suballocations0(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), + 
m_Suballocations1(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), + m_1stVectorIndex(0), + m_2ndVectorMode(SECOND_VECTOR_EMPTY), + m_1stNullItemsBeginCount(0), + m_1stNullItemsMiddleCount(0), + m_2ndNullItemsCount(0) +{ +} + +VmaBlockMetadata_Linear::~VmaBlockMetadata_Linear() +{ +} + +void VmaBlockMetadata_Linear::Init(VkDeviceSize size) +{ + VmaBlockMetadata::Init(size); + m_SumFreeSize = size; +} + +bool VmaBlockMetadata_Linear::Validate() const +{ + const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + + VMA_VALIDATE(suballocations2nd.empty() == (m_2ndVectorMode == SECOND_VECTOR_EMPTY)); + VMA_VALIDATE(!suballocations1st.empty() || + suballocations2nd.empty() || + m_2ndVectorMode != SECOND_VECTOR_RING_BUFFER); + + if(!suballocations1st.empty()) + { + // Null item at the beginning should be accounted into m_1stNullItemsBeginCount. + VMA_VALIDATE(suballocations1st[m_1stNullItemsBeginCount].hAllocation != VK_NULL_HANDLE); + // Null item at the end should be just pop_back(). + VMA_VALIDATE(suballocations1st.back().hAllocation != VK_NULL_HANDLE); + } + if(!suballocations2nd.empty()) + { + // Null item at the end should be just pop_back(). 
+ VMA_VALIDATE(suballocations2nd.back().hAllocation != VK_NULL_HANDLE); + } + + VMA_VALIDATE(m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount <= suballocations1st.size()); + VMA_VALIDATE(m_2ndNullItemsCount <= suballocations2nd.size()); + + VkDeviceSize sumUsedSize = 0; + const size_t suballoc1stCount = suballocations1st.size(); + VkDeviceSize offset = VMA_DEBUG_MARGIN; + + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + const size_t suballoc2ndCount = suballocations2nd.size(); + size_t nullItem2ndCount = 0; + for (const auto i : c10::irange(suballoc2ndCount)) { + const VmaSuballocation& suballoc = suballocations2nd[i]; + const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); + + VMA_VALIDATE(currFree == (suballoc.hAllocation == VK_NULL_HANDLE)); + VMA_VALIDATE(suballoc.offset >= offset); + + if(!currFree) + { + VMA_VALIDATE(suballoc.hAllocation->GetOffset() == suballoc.offset); + VMA_VALIDATE(suballoc.hAllocation->GetSize() == suballoc.size); + sumUsedSize += suballoc.size; + } + else + { + ++nullItem2ndCount; + } + + offset = suballoc.offset + suballoc.size + VMA_DEBUG_MARGIN; + } + + VMA_VALIDATE(nullItem2ndCount == m_2ndNullItemsCount); + } + + for (const auto i : c10::irange(m_1stNullItemsBeginCount)) { + const VmaSuballocation& suballoc = suballocations1st[i]; + VMA_VALIDATE(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE && + suballoc.hAllocation == VK_NULL_HANDLE); + } + + size_t nullItem1stCount = m_1stNullItemsBeginCount; + + for (const auto i : c10::irange(m_1stNullItemsBeginCount, suballoc1stCount)) { + const VmaSuballocation& suballoc = suballocations1st[i]; + const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); + + VMA_VALIDATE(currFree == (suballoc.hAllocation == VK_NULL_HANDLE)); + VMA_VALIDATE(suballoc.offset >= offset); + VMA_VALIDATE(i >= m_1stNullItemsBeginCount || currFree); + + if(!currFree) + { + VMA_VALIDATE(suballoc.hAllocation->GetOffset() == suballoc.offset); + 
VMA_VALIDATE(suballoc.hAllocation->GetSize() == suballoc.size); + sumUsedSize += suballoc.size; + } + else + { + ++nullItem1stCount; + } + + offset = suballoc.offset + suballoc.size + VMA_DEBUG_MARGIN; + } + VMA_VALIDATE(nullItem1stCount == m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount); + + if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + const size_t suballoc2ndCount = suballocations2nd.size(); + size_t nullItem2ndCount = 0; + for(size_t i = suballoc2ndCount; i--; ) + { + const VmaSuballocation& suballoc = suballocations2nd[i]; + const bool currFree = (suballoc.type == VMA_SUBALLOCATION_TYPE_FREE); + + VMA_VALIDATE(currFree == (suballoc.hAllocation == VK_NULL_HANDLE)); + VMA_VALIDATE(suballoc.offset >= offset); + + if(!currFree) + { + VMA_VALIDATE(suballoc.hAllocation->GetOffset() == suballoc.offset); + VMA_VALIDATE(suballoc.hAllocation->GetSize() == suballoc.size); + sumUsedSize += suballoc.size; + } + else + { + ++nullItem2ndCount; + } + + offset = suballoc.offset + suballoc.size + VMA_DEBUG_MARGIN; + } + + VMA_VALIDATE(nullItem2ndCount == m_2ndNullItemsCount); + } + + VMA_VALIDATE(offset <= GetSize()); + VMA_VALIDATE(m_SumFreeSize == GetSize() - sumUsedSize); + + return true; +} + +size_t VmaBlockMetadata_Linear::GetAllocationCount() const +{ + return AccessSuballocations1st().size() - (m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount) + + AccessSuballocations2nd().size() - m_2ndNullItemsCount; +} + +VkDeviceSize VmaBlockMetadata_Linear::GetUnusedRangeSizeMax() const +{ + const VkDeviceSize size = GetSize(); + + /* + We don't consider gaps inside allocation vectors with freed allocations because + they are not suitable for reuse in linear allocator. We consider only space that + is available for new allocations. 
+ */ + if(IsEmpty()) + { + return size; + } + + const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + + switch(m_2ndVectorMode) + { + case SECOND_VECTOR_EMPTY: + /* + Available space is after end of 1st, as well as before beginning of 1st (which + whould make it a ring buffer). + */ + { + const size_t suballocations1stCount = suballocations1st.size(); + VMA_ASSERT(suballocations1stCount > m_1stNullItemsBeginCount); + const VmaSuballocation& firstSuballoc = suballocations1st[m_1stNullItemsBeginCount]; + const VmaSuballocation& lastSuballoc = suballocations1st[suballocations1stCount - 1]; + return VMA_MAX( + firstSuballoc.offset, + size - (lastSuballoc.offset + lastSuballoc.size)); + } + break; + + case SECOND_VECTOR_RING_BUFFER: + /* + Available space is only between end of 2nd and beginning of 1st. + */ + { + const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + const VmaSuballocation& lastSuballoc2nd = suballocations2nd.back(); + const VmaSuballocation& firstSuballoc1st = suballocations1st[m_1stNullItemsBeginCount]; + return firstSuballoc1st.offset - (lastSuballoc2nd.offset + lastSuballoc2nd.size); + } + break; + + case SECOND_VECTOR_DOUBLE_STACK: + /* + Available space is only between end of 1st and top of 2nd. 
+ */ + { + const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + const VmaSuballocation& topSuballoc2nd = suballocations2nd.back(); + const VmaSuballocation& lastSuballoc1st = suballocations1st.back(); + return topSuballoc2nd.offset - (lastSuballoc1st.offset + lastSuballoc1st.size); + } + break; + + default: + VMA_ASSERT(0); + return 0; + } +} + +void VmaBlockMetadata_Linear::CalcAllocationStatInfo(VmaStatInfo& outInfo) const +{ + const VkDeviceSize size = GetSize(); + const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + const size_t suballoc1stCount = suballocations1st.size(); + const size_t suballoc2ndCount = suballocations2nd.size(); + + outInfo.blockCount = 1; + outInfo.allocationCount = (uint32_t)GetAllocationCount(); + outInfo.unusedRangeCount = 0; + outInfo.usedBytes = 0; + outInfo.allocationSizeMin = UINT64_MAX; + outInfo.allocationSizeMax = 0; + outInfo.unusedRangeSizeMin = UINT64_MAX; + outInfo.unusedRangeSizeMax = 0; + + VkDeviceSize lastOffset = 0; + + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; + size_t nextAlloc2ndIndex = 0; + while(lastOffset < freeSpace2ndTo1stEnd) + { + // Find next non-null allocation or move nextAllocIndex to the end. + while(nextAlloc2ndIndex < suballoc2ndCount && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex < suballoc2ndCount) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. 
+ const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); + outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + outInfo.usedBytes += suballoc.size; + outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); + outInfo.allocationSizeMax = VMA_MIN(outInfo.allocationSizeMax, suballoc.size); + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc2ndIndex; + } + // We are at the end. + else + { + // There is free space from lastOffset to freeSpace2ndTo1stEnd. + if(lastOffset < freeSpace2ndTo1stEnd) + { + const VkDeviceSize unusedRangeSize = freeSpace2ndTo1stEnd - lastOffset; + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); + outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); + } + + // End of loop. + lastOffset = freeSpace2ndTo1stEnd; + } + } + } + + size_t nextAlloc1stIndex = m_1stNullItemsBeginCount; + const VkDeviceSize freeSpace1stTo2ndEnd = + m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? suballocations2nd.back().offset : size; + while(lastOffset < freeSpace1stTo2ndEnd) + { + // Find next non-null allocation or move nextAllocIndex to the end. + while(nextAlloc1stIndex < suballoc1stCount && + suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc1stIndex; + } + + // Found non-null allocation. + if(nextAlloc1stIndex < suballoc1stCount) + { + const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; + + // 1. Process free space before this allocation. 
+ if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); + outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + outInfo.usedBytes += suballoc.size; + outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); + outInfo.allocationSizeMax = VMA_MIN(outInfo.allocationSizeMax, suballoc.size); + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc1stIndex; + } + // We are at the end. + else + { + // There is free space from lastOffset to freeSpace1stTo2ndEnd. + if(lastOffset < freeSpace1stTo2ndEnd) + { + const VkDeviceSize unusedRangeSize = freeSpace1stTo2ndEnd - lastOffset; + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); + outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); + } + + // End of loop. + lastOffset = freeSpace1stTo2ndEnd; + } + } + + if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; + while(lastOffset < size) + { + // Find next non-null allocation or move nextAllocIndex to the end. + while(nextAlloc2ndIndex != SIZE_MAX && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + --nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex != SIZE_MAX) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. 
+ if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); + outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + outInfo.usedBytes += suballoc.size; + outInfo.allocationSizeMin = VMA_MIN(outInfo.allocationSizeMin, suballoc.size); + outInfo.allocationSizeMax = VMA_MIN(outInfo.allocationSizeMax, suballoc.size); + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + --nextAlloc2ndIndex; + } + // We are at the end. + else + { + // There is free space from lastOffset to size. + if(lastOffset < size) + { + const VkDeviceSize unusedRangeSize = size - lastOffset; + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusedRangeSize); + outInfo.unusedRangeSizeMax = VMA_MIN(outInfo.unusedRangeSizeMax, unusedRangeSize); + } + + // End of loop. 
+ lastOffset = size; + } + } + } + + outInfo.unusedBytes = size - outInfo.usedBytes; +} + +void VmaBlockMetadata_Linear::AddPoolStats(VmaPoolStats& inoutStats) const +{ + const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + const VkDeviceSize size = GetSize(); + const size_t suballoc1stCount = suballocations1st.size(); + const size_t suballoc2ndCount = suballocations2nd.size(); + + inoutStats.size += size; + + VkDeviceSize lastOffset = 0; + + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; + size_t nextAlloc2ndIndex = m_1stNullItemsBeginCount; + while(lastOffset < freeSpace2ndTo1stEnd) + { + // Find next non-null allocation or move nextAlloc2ndIndex to the end. + while(nextAlloc2ndIndex < suballoc2ndCount && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex < suballoc2ndCount) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + inoutStats.unusedSize += unusedRangeSize; + ++inoutStats.unusedRangeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + ++inoutStats.allocationCount; + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc2ndIndex; + } + // We are at the end. + else + { + if(lastOffset < freeSpace2ndTo1stEnd) + { + // There is free space from lastOffset to freeSpace2ndTo1stEnd. 
+ const VkDeviceSize unusedRangeSize = freeSpace2ndTo1stEnd - lastOffset; + inoutStats.unusedSize += unusedRangeSize; + ++inoutStats.unusedRangeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); + } + + // End of loop. + lastOffset = freeSpace2ndTo1stEnd; + } + } + } + + size_t nextAlloc1stIndex = m_1stNullItemsBeginCount; + const VkDeviceSize freeSpace1stTo2ndEnd = + m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? suballocations2nd.back().offset : size; + while(lastOffset < freeSpace1stTo2ndEnd) + { + // Find next non-null allocation or move nextAllocIndex to the end. + while(nextAlloc1stIndex < suballoc1stCount && + suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc1stIndex; + } + + // Found non-null allocation. + if(nextAlloc1stIndex < suballoc1stCount) + { + const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + inoutStats.unusedSize += unusedRangeSize; + ++inoutStats.unusedRangeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + ++inoutStats.allocationCount; + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc1stIndex; + } + // We are at the end. + else + { + if(lastOffset < freeSpace1stTo2ndEnd) + { + // There is free space from lastOffset to freeSpace1stTo2ndEnd. + const VkDeviceSize unusedRangeSize = freeSpace1stTo2ndEnd - lastOffset; + inoutStats.unusedSize += unusedRangeSize; + ++inoutStats.unusedRangeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); + } + + // End of loop. 
+ lastOffset = freeSpace1stTo2ndEnd; + } + } + + if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; + while(lastOffset < size) + { + // Find next non-null allocation or move nextAlloc2ndIndex to the end. + while(nextAlloc2ndIndex != SIZE_MAX && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + --nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex != SIZE_MAX) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + inoutStats.unusedSize += unusedRangeSize; + ++inoutStats.unusedRangeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + ++inoutStats.allocationCount; + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + --nextAlloc2ndIndex; + } + // We are at the end. + else + { + if(lastOffset < size) + { + // There is free space from lastOffset to size. + const VkDeviceSize unusedRangeSize = size - lastOffset; + inoutStats.unusedSize += unusedRangeSize; + ++inoutStats.unusedRangeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, unusedRangeSize); + } + + // End of loop. 
+ lastOffset = size; + } + } + } +} + +#if VMA_STATS_STRING_ENABLED +void VmaBlockMetadata_Linear::PrintDetailedMap(class VmaJsonWriter& json) const +{ + const VkDeviceSize size = GetSize(); + const SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + const SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + const size_t suballoc1stCount = suballocations1st.size(); + const size_t suballoc2ndCount = suballocations2nd.size(); + + // FIRST PASS + + size_t unusedRangeCount = 0; + VkDeviceSize usedBytes = 0; + + VkDeviceSize lastOffset = 0; + + size_t alloc2ndCount = 0; + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; + size_t nextAlloc2ndIndex = 0; + while(lastOffset < freeSpace2ndTo1stEnd) + { + // Find next non-null allocation or move nextAlloc2ndIndex to the end. + while(nextAlloc2ndIndex < suballoc2ndCount && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex < suballoc2ndCount) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + ++unusedRangeCount; + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + ++alloc2ndCount; + usedBytes += suballoc.size; + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc2ndIndex; + } + // We are at the end. + else + { + if(lastOffset < freeSpace2ndTo1stEnd) + { + // There is free space from lastOffset to freeSpace2ndTo1stEnd. + ++unusedRangeCount; + } + + // End of loop. 
+ lastOffset = freeSpace2ndTo1stEnd; + } + } + } + + size_t nextAlloc1stIndex = m_1stNullItemsBeginCount; + size_t alloc1stCount = 0; + const VkDeviceSize freeSpace1stTo2ndEnd = + m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? suballocations2nd.back().offset : size; + while(lastOffset < freeSpace1stTo2ndEnd) + { + // Find next non-null allocation or move nextAllocIndex to the end. + while(nextAlloc1stIndex < suballoc1stCount && + suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc1stIndex; + } + + // Found non-null allocation. + if(nextAlloc1stIndex < suballoc1stCount) + { + const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + ++unusedRangeCount; + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + ++alloc1stCount; + usedBytes += suballoc.size; + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc1stIndex; + } + // We are at the end. + else + { + if(lastOffset < size) + { + // There is free space from lastOffset to freeSpace1stTo2ndEnd. + ++unusedRangeCount; + } + + // End of loop. + lastOffset = freeSpace1stTo2ndEnd; + } + } + + if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; + while(lastOffset < size) + { + // Find next non-null allocation or move nextAlloc2ndIndex to the end. + while(nextAlloc2ndIndex != SIZE_MAX && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + --nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex != SIZE_MAX) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. 
+ if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + ++unusedRangeCount; + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + ++alloc2ndCount; + usedBytes += suballoc.size; + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + --nextAlloc2ndIndex; + } + // We are at the end. + else + { + if(lastOffset < size) + { + // There is free space from lastOffset to size. + ++unusedRangeCount; + } + + // End of loop. + lastOffset = size; + } + } + } + + const VkDeviceSize unusedBytes = size - usedBytes; + PrintDetailedMap_Begin(json, unusedBytes, alloc1stCount + alloc2ndCount, unusedRangeCount); + + // SECOND PASS + lastOffset = 0; + + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + const VkDeviceSize freeSpace2ndTo1stEnd = suballocations1st[m_1stNullItemsBeginCount].offset; + size_t nextAlloc2ndIndex = 0; + while(lastOffset < freeSpace2ndTo1stEnd) + { + // Find next non-null allocation or move nextAlloc2ndIndex to the end. + while(nextAlloc2ndIndex < suballoc2ndCount && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex < suballoc2ndCount) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.hAllocation); + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc2ndIndex; + } + // We are at the end. 
+ else + { + if(lastOffset < freeSpace2ndTo1stEnd) + { + // There is free space from lastOffset to freeSpace2ndTo1stEnd. + const VkDeviceSize unusedRangeSize = freeSpace2ndTo1stEnd - lastOffset; + PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); + } + + // End of loop. + lastOffset = freeSpace2ndTo1stEnd; + } + } + } + + nextAlloc1stIndex = m_1stNullItemsBeginCount; + while(lastOffset < freeSpace1stTo2ndEnd) + { + // Find next non-null allocation or move nextAllocIndex to the end. + while(nextAlloc1stIndex < suballoc1stCount && + suballocations1st[nextAlloc1stIndex].hAllocation == VK_NULL_HANDLE) + { + ++nextAlloc1stIndex; + } + + // Found non-null allocation. + if(nextAlloc1stIndex < suballoc1stCount) + { + const VmaSuballocation& suballoc = suballocations1st[nextAlloc1stIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.hAllocation); + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + ++nextAlloc1stIndex; + } + // We are at the end. + else + { + if(lastOffset < freeSpace1stTo2ndEnd) + { + // There is free space from lastOffset to freeSpace1stTo2ndEnd. + const VkDeviceSize unusedRangeSize = freeSpace1stTo2ndEnd - lastOffset; + PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); + } + + // End of loop. + lastOffset = freeSpace1stTo2ndEnd; + } + } + + if(m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + size_t nextAlloc2ndIndex = suballocations2nd.size() - 1; + while(lastOffset < size) + { + // Find next non-null allocation or move nextAlloc2ndIndex to the end. 
+ while(nextAlloc2ndIndex != SIZE_MAX && + suballocations2nd[nextAlloc2ndIndex].hAllocation == VK_NULL_HANDLE) + { + --nextAlloc2ndIndex; + } + + // Found non-null allocation. + if(nextAlloc2ndIndex != SIZE_MAX) + { + const VmaSuballocation& suballoc = suballocations2nd[nextAlloc2ndIndex]; + + // 1. Process free space before this allocation. + if(lastOffset < suballoc.offset) + { + // There is free space from lastOffset to suballoc.offset. + const VkDeviceSize unusedRangeSize = suballoc.offset - lastOffset; + PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); + } + + // 2. Process this allocation. + // There is allocation with suballoc.offset, suballoc.size. + PrintDetailedMap_Allocation(json, suballoc.offset, suballoc.hAllocation); + + // 3. Prepare for next iteration. + lastOffset = suballoc.offset + suballoc.size; + --nextAlloc2ndIndex; + } + // We are at the end. + else + { + if(lastOffset < size) + { + // There is free space from lastOffset to size. + const VkDeviceSize unusedRangeSize = size - lastOffset; + PrintDetailedMap_UnusedRange(json, lastOffset, unusedRangeSize); + } + + // End of loop. + lastOffset = size; + } + } + } + + PrintDetailedMap_End(json); +} +#endif // #if VMA_STATS_STRING_ENABLED + +bool VmaBlockMetadata_Linear::CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest) +{ + VMA_ASSERT(allocSize > 0); + VMA_ASSERT(allocType != VMA_SUBALLOCATION_TYPE_FREE); + VMA_ASSERT(pAllocationRequest != VMA_NULL); + VMA_HEAVY_ASSERT(Validate()); + return upperAddress ? 
+ CreateAllocationRequest_UpperAddress( + currentFrameIndex, frameInUseCount, bufferImageGranularity, + allocSize, allocAlignment, allocType, canMakeOtherLost, strategy, pAllocationRequest) : + CreateAllocationRequest_LowerAddress( + currentFrameIndex, frameInUseCount, bufferImageGranularity, + allocSize, allocAlignment, allocType, canMakeOtherLost, strategy, pAllocationRequest); +} + +bool VmaBlockMetadata_Linear::CreateAllocationRequest_UpperAddress( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest) +{ + const VkDeviceSize size = GetSize(); + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + VMA_ASSERT(0 && "Trying to use pool with linear algorithm as double stack, while it is already being used as ring buffer."); + return false; + } + + // Try to allocate before 2nd.back(), or end of block if 2nd.empty(). + if(allocSize > size) + { + return false; + } + VkDeviceSize resultBaseOffset = size - allocSize; + if(!suballocations2nd.empty()) + { + const VmaSuballocation& lastSuballoc = suballocations2nd.back(); + resultBaseOffset = lastSuballoc.offset - allocSize; + if(allocSize > lastSuballoc.offset) + { + return false; + } + } + + // Start from offset equal to end of free space. + VkDeviceSize resultOffset = resultBaseOffset; + + // Apply VMA_DEBUG_MARGIN at the end. + if(VMA_DEBUG_MARGIN > 0) + { + if(resultOffset < VMA_DEBUG_MARGIN) + { + return false; + } + resultOffset -= VMA_DEBUG_MARGIN; + } + + // Apply alignment. + resultOffset = VmaAlignDown(resultOffset, allocAlignment); + + // Check next suballocations from 2nd for BufferImageGranularity conflicts. 
+ // Make bigger alignment if necessary. + if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment && !suballocations2nd.empty()) + { + bool bufferImageGranularityConflict = false; + for(size_t nextSuballocIndex = suballocations2nd.size(); nextSuballocIndex--; ) + { + const VmaSuballocation& nextSuballoc = suballocations2nd[nextSuballocIndex]; + if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(nextSuballoc.type, allocType)) + { + bufferImageGranularityConflict = true; + break; + } + } + else + // Already on previous page. + break; + } + if(bufferImageGranularityConflict) + { + resultOffset = VmaAlignDown(resultOffset, bufferImageGranularity); + } + } + + // There is enough free space. + const VkDeviceSize endOf1st = !suballocations1st.empty() ? + suballocations1st.back().offset + suballocations1st.back().size : + 0; + if(endOf1st + VMA_DEBUG_MARGIN <= resultOffset) + { + // Check previous suballocations for BufferImageGranularity conflicts. + // If conflict exists, allocation cannot be made here. + if(bufferImageGranularity > 1) + { + for(size_t prevSuballocIndex = suballocations1st.size(); prevSuballocIndex--; ) + { + const VmaSuballocation& prevSuballoc = suballocations1st[prevSuballocIndex]; + if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, resultOffset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(allocType, prevSuballoc.type)) + { + return false; + } + } + else + { + // Already on next page. + break; + } + } + } + + // All tests passed: Success. + pAllocationRequest->offset = resultOffset; + pAllocationRequest->sumFreeSize = resultBaseOffset + allocSize - endOf1st; + pAllocationRequest->sumItemSize = 0; + // pAllocationRequest->item unused. 
+ pAllocationRequest->itemsToMakeLostCount = 0; + pAllocationRequest->type = VmaAllocationRequestType::UpperAddress; + return true; + } + + return false; +} + +bool VmaBlockMetadata_Linear::CreateAllocationRequest_LowerAddress( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest) +{ + const VkDeviceSize size = GetSize(); + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + + if(m_2ndVectorMode == SECOND_VECTOR_EMPTY || m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + // Try to allocate at the end of 1st vector. + + VkDeviceSize resultBaseOffset = 0; + if(!suballocations1st.empty()) + { + const VmaSuballocation& lastSuballoc = suballocations1st.back(); + resultBaseOffset = lastSuballoc.offset + lastSuballoc.size; + } + + // Start from offset equal to beginning of free space. + VkDeviceSize resultOffset = resultBaseOffset; + + // Apply VMA_DEBUG_MARGIN at the beginning. + if(VMA_DEBUG_MARGIN > 0) + { + resultOffset += VMA_DEBUG_MARGIN; + } + + // Apply alignment. + resultOffset = VmaAlignUp(resultOffset, allocAlignment); + + // Check previous suballocations for BufferImageGranularity conflicts. + // Make bigger alignment if necessary. 
+ if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment && !suballocations1st.empty()) + { + bool bufferImageGranularityConflict = false; + for(size_t prevSuballocIndex = suballocations1st.size(); prevSuballocIndex--; ) + { + const VmaSuballocation& prevSuballoc = suballocations1st[prevSuballocIndex]; + if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, resultOffset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) + { + bufferImageGranularityConflict = true; + break; + } + } + else + // Already on previous page. + break; + } + if(bufferImageGranularityConflict) + { + resultOffset = VmaAlignUp(resultOffset, bufferImageGranularity); + } + } + + const VkDeviceSize freeSpaceEnd = m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK ? + suballocations2nd.back().offset : size; + + // There is enough free space at the end after alignment. + if(resultOffset + allocSize + VMA_DEBUG_MARGIN <= freeSpaceEnd) + { + // Check next suballocations for BufferImageGranularity conflicts. + // If conflict exists, allocation cannot be made here. + if((allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity) && m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + for(size_t nextSuballocIndex = suballocations2nd.size(); nextSuballocIndex--; ) + { + const VmaSuballocation& nextSuballoc = suballocations2nd[nextSuballocIndex]; + if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) + { + return false; + } + } + else + { + // Already on previous page. + break; + } + } + } + + // All tests passed: Success. + pAllocationRequest->offset = resultOffset; + pAllocationRequest->sumFreeSize = freeSpaceEnd - resultBaseOffset; + pAllocationRequest->sumItemSize = 0; + // pAllocationRequest->item, customData unused. 
+ pAllocationRequest->type = VmaAllocationRequestType::EndOf1st; + pAllocationRequest->itemsToMakeLostCount = 0; + return true; + } + } + + // Wrap-around to end of 2nd vector. Try to allocate there, watching for the + // beginning of 1st vector as the end of free space. + if(m_2ndVectorMode == SECOND_VECTOR_EMPTY || m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + VMA_ASSERT(!suballocations1st.empty()); + + VkDeviceSize resultBaseOffset = 0; + if(!suballocations2nd.empty()) + { + const VmaSuballocation& lastSuballoc = suballocations2nd.back(); + resultBaseOffset = lastSuballoc.offset + lastSuballoc.size; + } + + // Start from offset equal to beginning of free space. + VkDeviceSize resultOffset = resultBaseOffset; + + // Apply VMA_DEBUG_MARGIN at the beginning. + if(VMA_DEBUG_MARGIN > 0) + { + resultOffset += VMA_DEBUG_MARGIN; + } + + // Apply alignment. + resultOffset = VmaAlignUp(resultOffset, allocAlignment); + + // Check previous suballocations for BufferImageGranularity conflicts. + // Make bigger alignment if necessary. + if(bufferImageGranularity > 1 && bufferImageGranularity != allocAlignment && !suballocations2nd.empty()) + { + bool bufferImageGranularityConflict = false; + for(size_t prevSuballocIndex = suballocations2nd.size(); prevSuballocIndex--; ) + { + const VmaSuballocation& prevSuballoc = suballocations2nd[prevSuballocIndex]; + if(VmaBlocksOnSamePage(prevSuballoc.offset, prevSuballoc.size, resultOffset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(prevSuballoc.type, allocType)) + { + bufferImageGranularityConflict = true; + break; + } + } + else + // Already on previous page. 
+ break; + } + if(bufferImageGranularityConflict) + { + resultOffset = VmaAlignUp(resultOffset, bufferImageGranularity); + } + } + + pAllocationRequest->itemsToMakeLostCount = 0; + pAllocationRequest->sumItemSize = 0; + size_t index1st = m_1stNullItemsBeginCount; + + if(canMakeOtherLost) + { + while(index1st < suballocations1st.size() && + resultOffset + allocSize + VMA_DEBUG_MARGIN > suballocations1st[index1st].offset) + { + // Next colliding allocation at the beginning of 1st vector found. Try to make it lost. + const VmaSuballocation& suballoc = suballocations1st[index1st]; + if(suballoc.type == VMA_SUBALLOCATION_TYPE_FREE) + { + // No problem. + } + else + { + VMA_ASSERT(suballoc.hAllocation != VK_NULL_HANDLE); + if(suballoc.hAllocation->CanBecomeLost() && + suballoc.hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) + { + ++pAllocationRequest->itemsToMakeLostCount; + pAllocationRequest->sumItemSize += suballoc.size; + } + else + { + return false; + } + } + ++index1st; + } + + // Check next suballocations for BufferImageGranularity conflicts. + // If conflict exists, we must mark more allocations lost or fail. + if(allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity) + { + while(index1st < suballocations1st.size()) + { + const VmaSuballocation& suballoc = suballocations1st[index1st]; + if(VmaBlocksOnSamePage(resultOffset, allocSize, suballoc.offset, bufferImageGranularity)) + { + if(suballoc.hAllocation != VK_NULL_HANDLE) + { + // Not checking actual VmaIsBufferImageGranularityConflict(allocType, suballoc.type). + if(suballoc.hAllocation->CanBecomeLost() && + suballoc.hAllocation->GetLastUseFrameIndex() + frameInUseCount < currentFrameIndex) + { + ++pAllocationRequest->itemsToMakeLostCount; + pAllocationRequest->sumItemSize += suballoc.size; + } + else + { + return false; + } + } + } + else + { + // Already on next page. 
+ break; + } + ++index1st; + } + } + + // Special case: There is not enough room at the end for this allocation, even after making all from the 1st lost. + if(index1st == suballocations1st.size() && + resultOffset + allocSize + VMA_DEBUG_MARGIN > size) + { + // TODO: This is a known bug that it's not yet implemented and the allocation is failing. + VMA_DEBUG_LOG("Unsupported special case in custom pool with linear allocation algorithm used as ring buffer with allocations that can be lost."); + } + } + + // There is enough free space at the end after alignment. + if((index1st == suballocations1st.size() && resultOffset + allocSize + VMA_DEBUG_MARGIN <= size) || + (index1st < suballocations1st.size() && resultOffset + allocSize + VMA_DEBUG_MARGIN <= suballocations1st[index1st].offset)) + { + // Check next suballocations for BufferImageGranularity conflicts. + // If conflict exists, allocation cannot be made here. + if(allocSize % bufferImageGranularity || resultOffset % bufferImageGranularity) + { + for (const auto nextSuballocIndex : c10::irange(index1st, suballocations1st.size())) { + const VmaSuballocation& nextSuballoc = suballocations1st[nextSuballocIndex]; + if(VmaBlocksOnSamePage(resultOffset, allocSize, nextSuballoc.offset, bufferImageGranularity)) + { + if(VmaIsBufferImageGranularityConflict(allocType, nextSuballoc.type)) + { + return false; + } + } + else + { + // Already on next page. + break; + } + } + } + + // All tests passed: Success. + pAllocationRequest->offset = resultOffset; + pAllocationRequest->sumFreeSize = + (index1st < suballocations1st.size() ? suballocations1st[index1st].offset : size) + - resultBaseOffset + - pAllocationRequest->sumItemSize; + pAllocationRequest->type = VmaAllocationRequestType::EndOf2nd; + // pAllocationRequest->item, customData unused. 
+ return true; + } + } + + return false; +} + +bool VmaBlockMetadata_Linear::MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest) +{ + if(pAllocationRequest->itemsToMakeLostCount == 0) + { + return true; + } + + VMA_ASSERT(m_2ndVectorMode == SECOND_VECTOR_EMPTY || m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER); + + // We always start from 1st. + SuballocationVectorType* suballocations = &AccessSuballocations1st(); + size_t index = m_1stNullItemsBeginCount; + size_t madeLostCount = 0; + while(madeLostCount < pAllocationRequest->itemsToMakeLostCount) + { + if(index == suballocations->size()) + { + index = 0; + // If we get to the end of 1st, we wrap around to beginning of 2nd of 1st. + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + suballocations = &AccessSuballocations2nd(); + } + // else: m_2ndVectorMode == SECOND_VECTOR_EMPTY: + // suballocations continues pointing at AccessSuballocations1st(). + VMA_ASSERT(!suballocations->empty()); + } + VmaSuballocation& suballoc = (*suballocations)[index]; + if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) + { + VMA_ASSERT(suballoc.hAllocation != VK_NULL_HANDLE); + VMA_ASSERT(suballoc.hAllocation->CanBecomeLost()); + if(suballoc.hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) + { + suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + suballoc.hAllocation = VK_NULL_HANDLE; + m_SumFreeSize += suballoc.size; + if(suballocations == &AccessSuballocations1st()) + { + ++m_1stNullItemsMiddleCount; + } + else + { + ++m_2ndNullItemsCount; + } + ++madeLostCount; + } + else + { + return false; + } + } + ++index; + } + + CleanupAfterFree(); + //VMA_HEAVY_ASSERT(Validate()); // Already called by ClanupAfterFree(). 
+ + return true; +} + +uint32_t VmaBlockMetadata_Linear::MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) +{ + uint32_t lostAllocationCount = 0; + + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + for(size_t i = m_1stNullItemsBeginCount, count = suballocations1st.size(); i < count; ++i) + { + VmaSuballocation& suballoc = suballocations1st[i]; + if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE && + suballoc.hAllocation->CanBecomeLost() && + suballoc.hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) + { + suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + suballoc.hAllocation = VK_NULL_HANDLE; + ++m_1stNullItemsMiddleCount; + m_SumFreeSize += suballoc.size; + ++lostAllocationCount; + } + } + + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + for(size_t i = 0, count = suballocations2nd.size(); i < count; ++i) + { + VmaSuballocation& suballoc = suballocations2nd[i]; + if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE && + suballoc.hAllocation->CanBecomeLost() && + suballoc.hAllocation->MakeLost(currentFrameIndex, frameInUseCount)) + { + suballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + suballoc.hAllocation = VK_NULL_HANDLE; + ++m_2ndNullItemsCount; + m_SumFreeSize += suballoc.size; + ++lostAllocationCount; + } + } + + if(lostAllocationCount) + { + CleanupAfterFree(); + } + + return lostAllocationCount; +} + +VkResult VmaBlockMetadata_Linear::CheckCorruption(const void* pBlockData) +{ + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + for(size_t i = m_1stNullItemsBeginCount, count = suballocations1st.size(); i < count; ++i) + { + const VmaSuballocation& suballoc = suballocations1st[i]; + if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) + { + if(!VmaValidateMagicValue(pBlockData, suballoc.offset - VMA_DEBUG_MARGIN)) + { + VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE VALIDATED ALLOCATION!"); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + 
if(!VmaValidateMagicValue(pBlockData, suballoc.offset + suballoc.size)) + { + VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER VALIDATED ALLOCATION!"); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + } + } + + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + for(size_t i = 0, count = suballocations2nd.size(); i < count; ++i) + { + const VmaSuballocation& suballoc = suballocations2nd[i]; + if(suballoc.type != VMA_SUBALLOCATION_TYPE_FREE) + { + if(!VmaValidateMagicValue(pBlockData, suballoc.offset - VMA_DEBUG_MARGIN)) + { + VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE VALIDATED ALLOCATION!"); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + if(!VmaValidateMagicValue(pBlockData, suballoc.offset + suballoc.size)) + { + VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER VALIDATED ALLOCATION!"); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + } + } + + return VK_SUCCESS; +} + +void VmaBlockMetadata_Linear::Alloc( + const VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation) +{ + const VmaSuballocation newSuballoc = { request.offset, allocSize, hAllocation, type }; + + switch(request.type) + { + case VmaAllocationRequestType::UpperAddress: + { + VMA_ASSERT(m_2ndVectorMode != SECOND_VECTOR_RING_BUFFER && + "CRITICAL ERROR: Trying to use linear allocator as double stack while it was already used as ring buffer."); + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + suballocations2nd.push_back(newSuballoc); + m_2ndVectorMode = SECOND_VECTOR_DOUBLE_STACK; + } + break; + case VmaAllocationRequestType::EndOf1st: + { + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + + VMA_ASSERT(suballocations1st.empty() || + request.offset >= suballocations1st.back().offset + suballocations1st.back().size); + // Check if it fits before the end of the block. 
+ VMA_ASSERT(request.offset + allocSize <= GetSize()); + + suballocations1st.push_back(newSuballoc); + } + break; + case VmaAllocationRequestType::EndOf2nd: + { + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + // New allocation at the end of 2-part ring buffer, so before first allocation from 1st vector. + VMA_ASSERT(!suballocations1st.empty() && + request.offset + allocSize <= suballocations1st[m_1stNullItemsBeginCount].offset); + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + + switch(m_2ndVectorMode) + { + case SECOND_VECTOR_EMPTY: + // First allocation from second part ring buffer. + VMA_ASSERT(suballocations2nd.empty()); + m_2ndVectorMode = SECOND_VECTOR_RING_BUFFER; + break; + case SECOND_VECTOR_RING_BUFFER: + // 2-part ring buffer is already started. + VMA_ASSERT(!suballocations2nd.empty()); + break; + case SECOND_VECTOR_DOUBLE_STACK: + VMA_ASSERT(0 && "CRITICAL ERROR: Trying to use linear allocator as ring buffer while it was already used as double stack."); + break; + default: + VMA_ASSERT(0); + } + + suballocations2nd.push_back(newSuballoc); + } + break; + default: + VMA_ASSERT(0 && "CRITICAL INTERNAL ERROR."); + } + + m_SumFreeSize -= newSuballoc.size; +} + +void VmaBlockMetadata_Linear::Free(const VmaAllocation allocation) +{ + FreeAtOffset(allocation->GetOffset()); +} + +void VmaBlockMetadata_Linear::FreeAtOffset(VkDeviceSize offset) +{ + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + + if(!suballocations1st.empty()) + { + // First allocation: Mark it as next empty at the beginning. 
+ VmaSuballocation& firstSuballoc = suballocations1st[m_1stNullItemsBeginCount]; + if(firstSuballoc.offset == offset) + { + firstSuballoc.type = VMA_SUBALLOCATION_TYPE_FREE; + firstSuballoc.hAllocation = VK_NULL_HANDLE; + m_SumFreeSize += firstSuballoc.size; + ++m_1stNullItemsBeginCount; + CleanupAfterFree(); + return; + } + } + + // Last allocation in 2-part ring buffer or top of upper stack (same logic). + if(m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER || + m_2ndVectorMode == SECOND_VECTOR_DOUBLE_STACK) + { + VmaSuballocation& lastSuballoc = suballocations2nd.back(); + if(lastSuballoc.offset == offset) + { + m_SumFreeSize += lastSuballoc.size; + suballocations2nd.pop_back(); + CleanupAfterFree(); + return; + } + } + // Last allocation in 1st vector. + else if(m_2ndVectorMode == SECOND_VECTOR_EMPTY) + { + VmaSuballocation& lastSuballoc = suballocations1st.back(); + if(lastSuballoc.offset == offset) + { + m_SumFreeSize += lastSuballoc.size; + suballocations1st.pop_back(); + CleanupAfterFree(); + return; + } + } + + // Item from the middle of 1st vector. + { + VmaSuballocation refSuballoc; + refSuballoc.offset = offset; + // Rest of members stays uninitialized intentionally for better performance. + SuballocationVectorType::iterator it = VmaBinaryFindSorted( + suballocations1st.begin() + m_1stNullItemsBeginCount, + suballocations1st.end(), + refSuballoc, + VmaSuballocationOffsetLess()); + if(it != suballocations1st.end()) + { + it->type = VMA_SUBALLOCATION_TYPE_FREE; + it->hAllocation = VK_NULL_HANDLE; + ++m_1stNullItemsMiddleCount; + m_SumFreeSize += it->size; + CleanupAfterFree(); + return; + } + } + + if(m_2ndVectorMode != SECOND_VECTOR_EMPTY) + { + // Item from the middle of 2nd vector. + VmaSuballocation refSuballoc; + refSuballoc.offset = offset; + // Rest of members stays uninitialized intentionally for better performance. + SuballocationVectorType::iterator it = m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER ? 
+ VmaBinaryFindSorted(suballocations2nd.begin(), suballocations2nd.end(), refSuballoc, VmaSuballocationOffsetLess()) : + VmaBinaryFindSorted(suballocations2nd.begin(), suballocations2nd.end(), refSuballoc, VmaSuballocationOffsetGreater()); + if(it != suballocations2nd.end()) + { + it->type = VMA_SUBALLOCATION_TYPE_FREE; + it->hAllocation = VK_NULL_HANDLE; + ++m_2ndNullItemsCount; + m_SumFreeSize += it->size; + CleanupAfterFree(); + return; + } + } + + VMA_ASSERT(0 && "Allocation to free not found in linear allocator!"); +} + +bool VmaBlockMetadata_Linear::ShouldCompact1st() const +{ + const size_t nullItemCount = m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount; + const size_t suballocCount = AccessSuballocations1st().size(); + return suballocCount > 32 && nullItemCount * 2 >= (suballocCount - nullItemCount) * 3; +} + +void VmaBlockMetadata_Linear::CleanupAfterFree() +{ + SuballocationVectorType& suballocations1st = AccessSuballocations1st(); + SuballocationVectorType& suballocations2nd = AccessSuballocations2nd(); + + if(IsEmpty()) + { + suballocations1st.clear(); + suballocations2nd.clear(); + m_1stNullItemsBeginCount = 0; + m_1stNullItemsMiddleCount = 0; + m_2ndNullItemsCount = 0; + m_2ndVectorMode = SECOND_VECTOR_EMPTY; + } + else + { + const size_t suballoc1stCount = suballocations1st.size(); + const size_t nullItem1stCount = m_1stNullItemsBeginCount + m_1stNullItemsMiddleCount; + VMA_ASSERT(nullItem1stCount <= suballoc1stCount); + + // Find more null items at the beginning of 1st vector. + while(m_1stNullItemsBeginCount < suballoc1stCount && + suballocations1st[m_1stNullItemsBeginCount].hAllocation == VK_NULL_HANDLE) + { + ++m_1stNullItemsBeginCount; + --m_1stNullItemsMiddleCount; + } + + // Find more null items at the end of 1st vector. 
+ while(m_1stNullItemsMiddleCount > 0 && + suballocations1st.back().hAllocation == VK_NULL_HANDLE) + { + --m_1stNullItemsMiddleCount; + suballocations1st.pop_back(); + } + + // Find more null items at the end of 2nd vector. + while(m_2ndNullItemsCount > 0 && + suballocations2nd.back().hAllocation == VK_NULL_HANDLE) + { + --m_2ndNullItemsCount; + suballocations2nd.pop_back(); + } + + // Find more null items at the beginning of 2nd vector. + while(m_2ndNullItemsCount > 0 && + suballocations2nd[0].hAllocation == VK_NULL_HANDLE) + { + --m_2ndNullItemsCount; + VmaVectorRemove(suballocations2nd, 0); + } + + if(ShouldCompact1st()) + { + const size_t nonNullItemCount = suballoc1stCount - nullItem1stCount; + size_t srcIndex = m_1stNullItemsBeginCount; + for (const auto dstIndex : c10::irange(nonNullItemCount)) { + while(suballocations1st[srcIndex].hAllocation == VK_NULL_HANDLE) + { + ++srcIndex; + } + if(dstIndex != srcIndex) + { + suballocations1st[dstIndex] = suballocations1st[srcIndex]; + } + ++srcIndex; + } + suballocations1st.resize(nonNullItemCount); + m_1stNullItemsBeginCount = 0; + m_1stNullItemsMiddleCount = 0; + } + + // 2nd vector became empty. + if(suballocations2nd.empty()) + { + m_2ndVectorMode = SECOND_VECTOR_EMPTY; + } + + // 1st vector became empty. + if(suballocations1st.size() - m_1stNullItemsBeginCount == 0) + { + suballocations1st.clear(); + m_1stNullItemsBeginCount = 0; + + if(!suballocations2nd.empty() && m_2ndVectorMode == SECOND_VECTOR_RING_BUFFER) + { + // Swap 1st with 2nd. Now 2nd is empty. 
+ m_2ndVectorMode = SECOND_VECTOR_EMPTY; + m_1stNullItemsMiddleCount = m_2ndNullItemsCount; + while(m_1stNullItemsBeginCount < suballocations2nd.size() && + suballocations2nd[m_1stNullItemsBeginCount].hAllocation == VK_NULL_HANDLE) + { + ++m_1stNullItemsBeginCount; + --m_1stNullItemsMiddleCount; + } + m_2ndNullItemsCount = 0; + m_1stVectorIndex ^= 1; + } + } + } + + VMA_HEAVY_ASSERT(Validate()); +} + + +//////////////////////////////////////////////////////////////////////////////// +// class VmaBlockMetadata_Buddy + +VmaBlockMetadata_Buddy::VmaBlockMetadata_Buddy(VmaAllocator hAllocator) : + VmaBlockMetadata(hAllocator), + m_Root(VMA_NULL), + m_AllocationCount(0), + m_FreeCount(1), + m_SumFreeSize(0) +{ + memset(m_FreeList, 0, sizeof(m_FreeList)); +} + +VmaBlockMetadata_Buddy::~VmaBlockMetadata_Buddy() +{ + DeleteNode(m_Root); +} + +void VmaBlockMetadata_Buddy::Init(VkDeviceSize size) +{ + VmaBlockMetadata::Init(size); + + m_UsableSize = VmaPrevPow2(size); + m_SumFreeSize = m_UsableSize; + + // Calculate m_LevelCount. + m_LevelCount = 1; + while(m_LevelCount < MAX_LEVELS && + LevelToNodeSize(m_LevelCount) >= MIN_NODE_SIZE) + { + ++m_LevelCount; + } + + Node* rootNode = vma_new(GetAllocationCallbacks(), Node)(); + rootNode->offset = 0; + rootNode->type = Node::TYPE_FREE; + rootNode->parent = VMA_NULL; + rootNode->buddy = VMA_NULL; + + m_Root = rootNode; + AddToFreeListFront(0, rootNode); +} + +bool VmaBlockMetadata_Buddy::Validate() const +{ + // Validate tree. + ValidationContext ctx; + if(!ValidateNode(ctx, VMA_NULL, m_Root, 0, LevelToNodeSize(0))) + { + VMA_VALIDATE(false && "ValidateNode failed."); + } + VMA_VALIDATE(m_AllocationCount == ctx.calculatedAllocationCount); + VMA_VALIDATE(m_SumFreeSize == ctx.calculatedSumFreeSize); + + // Validate free node lists. 
+ for (const auto level : c10::irange(m_LevelCount)) { + VMA_VALIDATE(m_FreeList[level].front == VMA_NULL || + m_FreeList[level].front->free.prev == VMA_NULL); + + for(Node* node = m_FreeList[level].front; + node != VMA_NULL; + node = node->free.next) + { + VMA_VALIDATE(node->type == Node::TYPE_FREE); + + if(node->free.next == VMA_NULL) + { + VMA_VALIDATE(m_FreeList[level].back == node); + } + else + { + VMA_VALIDATE(node->free.next->free.prev == node); + } + } + } + + // Validate that free lists ar higher levels are empty. + for (const auto level : c10::irange(m_LevelCount, MAX_LEVELS)) { + VMA_VALIDATE(m_FreeList[level].front == VMA_NULL && m_FreeList[level].back == VMA_NULL); + } + + return true; +} + +VkDeviceSize VmaBlockMetadata_Buddy::GetUnusedRangeSizeMax() const +{ + for (const auto level : c10::irange(m_LevelCount)) { + if(m_FreeList[level].front != VMA_NULL) + { + return LevelToNodeSize(level); + } + } + return 0; +} + +void VmaBlockMetadata_Buddy::CalcAllocationStatInfo(VmaStatInfo& outInfo) const +{ + const VkDeviceSize unusableSize = GetUnusableSize(); + + outInfo.blockCount = 1; + + outInfo.allocationCount = outInfo.unusedRangeCount = 0; + outInfo.usedBytes = outInfo.unusedBytes = 0; + + outInfo.allocationSizeMax = outInfo.unusedRangeSizeMax = 0; + outInfo.allocationSizeMin = outInfo.unusedRangeSizeMin = UINT64_MAX; + outInfo.allocationSizeAvg = outInfo.unusedRangeSizeAvg = 0; // Unused. 
+ + CalcAllocationStatInfoNode(outInfo, m_Root, LevelToNodeSize(0)); + + if(unusableSize > 0) + { + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusableSize; + outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, unusableSize); + outInfo.unusedRangeSizeMin = VMA_MIN(outInfo.unusedRangeSizeMin, unusableSize); + } +} + +void VmaBlockMetadata_Buddy::AddPoolStats(VmaPoolStats& inoutStats) const +{ + const VkDeviceSize unusableSize = GetUnusableSize(); + + inoutStats.size += GetSize(); + inoutStats.unusedSize += m_SumFreeSize + unusableSize; + inoutStats.allocationCount += m_AllocationCount; + inoutStats.unusedRangeCount += m_FreeCount; + inoutStats.unusedRangeSizeMax = VMA_MAX(inoutStats.unusedRangeSizeMax, GetUnusedRangeSizeMax()); + + if(unusableSize > 0) + { + ++inoutStats.unusedRangeCount; + // Not updating inoutStats.unusedRangeSizeMax with unusableSize because this space is not available for allocations. + } +} + +#if VMA_STATS_STRING_ENABLED + +void VmaBlockMetadata_Buddy::PrintDetailedMap(class VmaJsonWriter& json) const +{ + // TODO optimize + VmaStatInfo stat; + CalcAllocationStatInfo(stat); + + PrintDetailedMap_Begin( + json, + stat.unusedBytes, + stat.allocationCount, + stat.unusedRangeCount); + + PrintDetailedMapNode(json, m_Root, LevelToNodeSize(0)); + + const VkDeviceSize unusableSize = GetUnusableSize(); + if(unusableSize > 0) + { + PrintDetailedMap_UnusedRange(json, + m_UsableSize, // offset + unusableSize); // size + } + + PrintDetailedMap_End(json); +} + +#endif // #if VMA_STATS_STRING_ENABLED + +bool VmaBlockMetadata_Buddy::CreateAllocationRequest( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VkDeviceSize bufferImageGranularity, + VkDeviceSize allocSize, + VkDeviceSize allocAlignment, + bool upperAddress, + VmaSuballocationType allocType, + bool canMakeOtherLost, + uint32_t strategy, + VmaAllocationRequest* pAllocationRequest) +{ + VMA_ASSERT(!upperAddress && "VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT can be used 
only with linear algorithm."); + + // Simple way to respect bufferImageGranularity. May be optimized some day. + // Whenever it might be an OPTIMAL image... + if(allocType == VMA_SUBALLOCATION_TYPE_UNKNOWN || + allocType == VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN || + allocType == VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL) + { + allocAlignment = VMA_MAX(allocAlignment, bufferImageGranularity); + allocSize = VMA_MAX(allocSize, bufferImageGranularity); + } + + if(allocSize > m_UsableSize) + { + return false; + } + + const uint32_t targetLevel = AllocSizeToLevel(allocSize); + for(uint32_t level = targetLevel + 1; level--; ) + { + for(Node* freeNode = m_FreeList[level].front; + freeNode != VMA_NULL; + freeNode = freeNode->free.next) + { + if(freeNode->offset % allocAlignment == 0) + { + pAllocationRequest->type = VmaAllocationRequestType::Normal; + pAllocationRequest->offset = freeNode->offset; + pAllocationRequest->sumFreeSize = LevelToNodeSize(level); + pAllocationRequest->sumItemSize = 0; + pAllocationRequest->itemsToMakeLostCount = 0; + pAllocationRequest->customData = (void*)(uintptr_t)level; + return true; + } + } + } + + return false; +} + +bool VmaBlockMetadata_Buddy::MakeRequestedAllocationsLost( + uint32_t currentFrameIndex, + uint32_t frameInUseCount, + VmaAllocationRequest* pAllocationRequest) +{ + /* + Lost allocations are not supported in buddy allocator at the moment. + Support might be added in the future. + */ + return pAllocationRequest->itemsToMakeLostCount == 0; +} + +uint32_t VmaBlockMetadata_Buddy::MakeAllocationsLost(uint32_t currentFrameIndex, uint32_t frameInUseCount) +{ + /* + Lost allocations are not supported in buddy allocator at the moment. + Support might be added in the future. 
+ */ + return 0; +} + +void VmaBlockMetadata_Buddy::Alloc( + const VmaAllocationRequest& request, + VmaSuballocationType type, + VkDeviceSize allocSize, + VmaAllocation hAllocation) +{ + VMA_ASSERT(request.type == VmaAllocationRequestType::Normal); + + const uint32_t targetLevel = AllocSizeToLevel(allocSize); + uint32_t currLevel = (uint32_t)(uintptr_t)request.customData; + + Node* currNode = m_FreeList[currLevel].front; + VMA_ASSERT(currNode != VMA_NULL && currNode->type == Node::TYPE_FREE); + while(currNode->offset != request.offset) + { + currNode = currNode->free.next; + VMA_ASSERT(currNode != VMA_NULL && currNode->type == Node::TYPE_FREE); + } + + // Go down, splitting free nodes. + while(currLevel < targetLevel) + { + // currNode is already first free node at currLevel. + // Remove it from list of free nodes at this currLevel. + RemoveFromFreeList(currLevel, currNode); + + const uint32_t childrenLevel = currLevel + 1; + + // Create two free sub-nodes. + Node* leftChild = vma_new(GetAllocationCallbacks(), Node)(); + Node* rightChild = vma_new(GetAllocationCallbacks(), Node)(); + + leftChild->offset = currNode->offset; + leftChild->type = Node::TYPE_FREE; + leftChild->parent = currNode; + leftChild->buddy = rightChild; + + rightChild->offset = currNode->offset + LevelToNodeSize(childrenLevel); + rightChild->type = Node::TYPE_FREE; + rightChild->parent = currNode; + rightChild->buddy = leftChild; + + // Convert current currNode to split type. + currNode->type = Node::TYPE_SPLIT; + currNode->split.leftChild = leftChild; + + // Add child nodes to free list. Order is important! + AddToFreeListFront(childrenLevel, rightChild); + AddToFreeListFront(childrenLevel, leftChild); + + ++m_FreeCount; + //m_SumFreeSize -= LevelToNodeSize(currLevel) % 2; // Useful only when level node sizes can be non power of 2. 
+ ++currLevel; + currNode = m_FreeList[currLevel].front; + + /* + We can be sure that currNode, as left child of node previously split, + also fullfills the alignment requirement. + */ + } + + // Remove from free list. + VMA_ASSERT(currLevel == targetLevel && + currNode != VMA_NULL && + currNode->type == Node::TYPE_FREE); + RemoveFromFreeList(currLevel, currNode); + + // Convert to allocation node. + currNode->type = Node::TYPE_ALLOCATION; + currNode->allocation.alloc = hAllocation; + + ++m_AllocationCount; + --m_FreeCount; + m_SumFreeSize -= allocSize; +} + +void VmaBlockMetadata_Buddy::DeleteNode(Node* node) +{ + if(node->type == Node::TYPE_SPLIT) + { + DeleteNode(node->split.leftChild->buddy); + DeleteNode(node->split.leftChild); + } + + vma_delete(GetAllocationCallbacks(), node); +} + +bool VmaBlockMetadata_Buddy::ValidateNode(ValidationContext& ctx, const Node* parent, const Node* curr, uint32_t level, VkDeviceSize levelNodeSize) const +{ + VMA_VALIDATE(level < m_LevelCount); + VMA_VALIDATE(curr->parent == parent); + VMA_VALIDATE((curr->buddy == VMA_NULL) == (parent == VMA_NULL)); + VMA_VALIDATE(curr->buddy == VMA_NULL || curr->buddy->buddy == curr); + switch(curr->type) + { + case Node::TYPE_FREE: + // curr->free.prev, next are validated separately. 
+ ctx.calculatedSumFreeSize += levelNodeSize; + ++ctx.calculatedFreeCount; + break; + case Node::TYPE_ALLOCATION: + ++ctx.calculatedAllocationCount; + ctx.calculatedSumFreeSize += levelNodeSize - curr->allocation.alloc->GetSize(); + VMA_VALIDATE(curr->allocation.alloc != VK_NULL_HANDLE); + break; + case Node::TYPE_SPLIT: + { + const uint32_t childrenLevel = level + 1; + const VkDeviceSize childrenLevelNodeSize = levelNodeSize / 2; + const Node* const leftChild = curr->split.leftChild; + VMA_VALIDATE(leftChild != VMA_NULL); + VMA_VALIDATE(leftChild->offset == curr->offset); + if(!ValidateNode(ctx, curr, leftChild, childrenLevel, childrenLevelNodeSize)) + { + VMA_VALIDATE(false && "ValidateNode for left child failed."); + } + const Node* const rightChild = leftChild->buddy; + VMA_VALIDATE(rightChild->offset == curr->offset + childrenLevelNodeSize); + if(!ValidateNode(ctx, curr, rightChild, childrenLevel, childrenLevelNodeSize)) + { + VMA_VALIDATE(false && "ValidateNode for right child failed."); + } + } + break; + default: + return false; + } + + return true; +} + +uint32_t VmaBlockMetadata_Buddy::AllocSizeToLevel(VkDeviceSize allocSize) const +{ + // I know this could be optimized somehow e.g. by using std::log2p1 from C++20. + uint32_t level = 0; + VkDeviceSize currLevelNodeSize = m_UsableSize; + VkDeviceSize nextLevelNodeSize = currLevelNodeSize >> 1; + while(allocSize <= nextLevelNodeSize && level + 1 < m_LevelCount) + { + ++level; + currLevelNodeSize = nextLevelNodeSize; + nextLevelNodeSize = currLevelNodeSize >> 1; + } + return level; +} + +void VmaBlockMetadata_Buddy::FreeAtOffset(VmaAllocation alloc, VkDeviceSize offset) +{ + // Find node and level. 
+ Node* node = m_Root; + VkDeviceSize nodeOffset = 0; + uint32_t level = 0; + VkDeviceSize levelNodeSize = LevelToNodeSize(0); + while(node->type == Node::TYPE_SPLIT) + { + const VkDeviceSize nextLevelSize = levelNodeSize >> 1; + if(offset < nodeOffset + nextLevelSize) + { + node = node->split.leftChild; + } + else + { + node = node->split.leftChild->buddy; + nodeOffset += nextLevelSize; + } + ++level; + levelNodeSize = nextLevelSize; + } + + VMA_ASSERT(node != VMA_NULL && node->type == Node::TYPE_ALLOCATION); + VMA_ASSERT(alloc == VK_NULL_HANDLE || node->allocation.alloc == alloc); + + ++m_FreeCount; + --m_AllocationCount; + m_SumFreeSize += alloc->GetSize(); + + node->type = Node::TYPE_FREE; + + // Join free nodes if possible. + while(level > 0 && node->buddy->type == Node::TYPE_FREE) + { + RemoveFromFreeList(level, node->buddy); + Node* const parent = node->parent; + + vma_delete(GetAllocationCallbacks(), node->buddy); + vma_delete(GetAllocationCallbacks(), node); + parent->type = Node::TYPE_FREE; + + node = parent; + --level; + //m_SumFreeSize += LevelToNodeSize(level) % 2; // Useful only when level node sizes can be non power of 2. 
+ --m_FreeCount; + } + + AddToFreeListFront(level, node); +} + +void VmaBlockMetadata_Buddy::CalcAllocationStatInfoNode(VmaStatInfo& outInfo, const Node* node, VkDeviceSize levelNodeSize) const +{ + switch(node->type) + { + case Node::TYPE_FREE: + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += levelNodeSize; + outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, levelNodeSize); + outInfo.unusedRangeSizeMin = VMA_MAX(outInfo.unusedRangeSizeMin, levelNodeSize); + break; + case Node::TYPE_ALLOCATION: + { + const VkDeviceSize allocSize = node->allocation.alloc->GetSize(); + ++outInfo.allocationCount; + outInfo.usedBytes += allocSize; + outInfo.allocationSizeMax = VMA_MAX(outInfo.allocationSizeMax, allocSize); + outInfo.allocationSizeMin = VMA_MAX(outInfo.allocationSizeMin, allocSize); + + const VkDeviceSize unusedRangeSize = levelNodeSize - allocSize; + if(unusedRangeSize > 0) + { + ++outInfo.unusedRangeCount; + outInfo.unusedBytes += unusedRangeSize; + outInfo.unusedRangeSizeMax = VMA_MAX(outInfo.unusedRangeSizeMax, unusedRangeSize); + outInfo.unusedRangeSizeMin = VMA_MAX(outInfo.unusedRangeSizeMin, unusedRangeSize); + } + } + break; + case Node::TYPE_SPLIT: + { + const VkDeviceSize childrenNodeSize = levelNodeSize / 2; + const Node* const leftChild = node->split.leftChild; + CalcAllocationStatInfoNode(outInfo, leftChild, childrenNodeSize); + const Node* const rightChild = leftChild->buddy; + CalcAllocationStatInfoNode(outInfo, rightChild, childrenNodeSize); + } + break; + default: + VMA_ASSERT(0); + } +} + +void VmaBlockMetadata_Buddy::AddToFreeListFront(uint32_t level, Node* node) +{ + VMA_ASSERT(node->type == Node::TYPE_FREE); + + // List is empty. 
Node* const frontNode = m_FreeList[level].front;
    if(frontNode == VMA_NULL)
    {
        // Empty list: node becomes both front and back.
        VMA_ASSERT(m_FreeList[level].back == VMA_NULL);
        node->free.prev = node->free.next = VMA_NULL;
        m_FreeList[level].front = m_FreeList[level].back = node;
    }
    else
    {
        // Non-empty list: link node in before the current front.
        VMA_ASSERT(frontNode->free.prev == VMA_NULL);
        node->free.prev = VMA_NULL;
        node->free.next = frontNode;
        frontNode->free.prev = node;
        m_FreeList[level].front = node;
    }
}

// Unlinks `node` from the doubly linked free list of `level`, fixing up the
// list's front/back pointers when the node is at either end.
// node->free.prev/next themselves are left unchanged.
void VmaBlockMetadata_Buddy::RemoveFromFreeList(uint32_t level, Node* node)
{
    VMA_ASSERT(m_FreeList[level].front != VMA_NULL);

    // It is at the front.
    if(node->free.prev == VMA_NULL)
    {
        VMA_ASSERT(m_FreeList[level].front == node);
        m_FreeList[level].front = node->free.next;
    }
    else
    {
        Node* const prevFreeNode = node->free.prev;
        VMA_ASSERT(prevFreeNode->free.next == node);
        prevFreeNode->free.next = node->free.next;
    }

    // It is at the back.
    if(node->free.next == VMA_NULL)
    {
        VMA_ASSERT(m_FreeList[level].back == node);
        m_FreeList[level].back = node->free.prev;
    }
    else
    {
        Node* const nextFreeNode = node->free.next;
        VMA_ASSERT(nextFreeNode->free.prev == node);
        nextFreeNode->free.prev = node->free.prev;
    }
}

#if VMA_STATS_STRING_ENABLED
// Recursively writes the subtree rooted at `node` to `json`: free nodes as
// unused ranges, allocation nodes as an allocation followed by an unused range
// for any remainder of the node, split nodes by visiting both children with
// half the node size.
void VmaBlockMetadata_Buddy::PrintDetailedMapNode(class VmaJsonWriter& json, const Node* node, VkDeviceSize levelNodeSize) const
{
    switch(node->type)
    {
    case Node::TYPE_FREE:
        PrintDetailedMap_UnusedRange(json, node->offset, levelNodeSize);
        break;
    case Node::TYPE_ALLOCATION:
        {
            PrintDetailedMap_Allocation(json, node->offset, node->allocation.alloc);
            const VkDeviceSize allocSize = node->allocation.alloc->GetSize();
            if(allocSize < levelNodeSize)
            {
                PrintDetailedMap_UnusedRange(json, node->offset + allocSize, levelNodeSize - allocSize);
            }
        }
        break;
    case Node::TYPE_SPLIT:
        {
            const VkDeviceSize childrenNodeSize = levelNodeSize / 2;
            const Node* const leftChild = node->split.leftChild;
            PrintDetailedMapNode(json, leftChild, childrenNodeSize);
            const Node* const rightChild = leftChild->buddy;
            PrintDetailedMapNode(json, rightChild, childrenNodeSize);
        }
        break;
    default:
        VMA_ASSERT(0);
    }
}
#endif // #if VMA_STATS_STRING_ENABLED


////////////////////////////////////////////////////////////////////////////////
// class VmaDeviceMemoryBlock

// Creates an uninitialized block: no metadata, no VkDeviceMemory, not mapped.
// NOTE(review): hAllocator is not used here and m_hParentPool is not
// initialized until Init() runs.
VmaDeviceMemoryBlock::VmaDeviceMemoryBlock(VmaAllocator hAllocator) :
    m_pMetadata(VMA_NULL),
    m_MemoryTypeIndex(UINT32_MAX),
    m_Id(0),
    m_hMemory(VK_NULL_HANDLE),
    m_MapCount(0),
    m_pMappedData(VMA_NULL)
{
}

// Binds this block to an already allocated VkDeviceMemory and creates the
// metadata object matching `algorithm` (linear, buddy, or generic for 0),
// then initializes the metadata with newSize.
// Must be called on a block that does not own memory yet (asserted).
void VmaDeviceMemoryBlock::Init(
    VmaAllocator hAllocator,
    VmaPool hParentPool,
    uint32_t newMemoryTypeIndex,
    VkDeviceMemory newMemory,
    VkDeviceSize newSize,
    uint32_t id,
    uint32_t algorithm)
{
    VMA_ASSERT(m_hMemory == VK_NULL_HANDLE);

    m_hParentPool = hParentPool;
    m_MemoryTypeIndex = newMemoryTypeIndex;
    m_Id = id;
    m_hMemory = newMemory;

    switch(algorithm)
    {
    case VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT:
        m_pMetadata = vma_new(hAllocator, VmaBlockMetadata_Linear)(hAllocator);
        break;
    case VMA_POOL_CREATE_BUDDY_ALGORITHM_BIT:
        m_pMetadata = vma_new(hAllocator, VmaBlockMetadata_Buddy)(hAllocator);
        break;
    default:
        // Unknown algorithm value: assert, then use the generic metadata.
        VMA_ASSERT(0);
        // Fall-through.
    case 0:
        m_pMetadata = vma_new(hAllocator, VmaBlockMetadata_Generic)(hAllocator);
    }
    m_pMetadata->Init(newSize);
}

void VmaDeviceMemoryBlock::Destroy(VmaAllocator allocator)
{
    // This is the most important assert in the entire library.
    // Hitting it means you have some memory leak - unreleased VmaAllocation objects.
VMA_ASSERT(m_pMetadata->IsEmpty() && "Some allocations were not freed before destruction of this memory block!");

    // Return the device memory to the allocator, then destroy the metadata.
    VMA_ASSERT(m_hMemory != VK_NULL_HANDLE);
    allocator->FreeVulkanMemory(m_MemoryTypeIndex, m_pMetadata->GetSize(), m_hMemory);
    m_hMemory = VK_NULL_HANDLE;

    vma_delete(allocator, m_pMetadata);
    m_pMetadata = VMA_NULL;
}

// Returns false if the block has no memory handle or a zero-sized metadata;
// otherwise returns the result of the metadata's own validation.
bool VmaDeviceMemoryBlock::Validate() const
{
    VMA_VALIDATE((m_hMemory != VK_NULL_HANDLE) &&
        (m_pMetadata->GetSize() != 0));

    return m_pMetadata->Validate();
}

// Temporarily maps the block and asks the metadata to check it for corruption.
// Returns the mapping error if Map() fails, otherwise the metadata's result.
VkResult VmaDeviceMemoryBlock::CheckCorruption(VmaAllocator hAllocator)
{
    void* pData = nullptr;
    VkResult res = Map(hAllocator, 1, &pData);
    if(res != VK_SUCCESS)
    {
        return res;
    }

    res = m_pMetadata->CheckCorruption(pData);

    Unmap(hAllocator, 1);

    return res;
}

// Reference-counted mapping of the whole block.
//
// count  - number of map references to add; 0 is a no-op returning VK_SUCCESS.
// ppData - optional; receives the mapped pointer on success.
//
// If the block is already mapped, only the counter is increased and the cached
// pointer is returned. Otherwise vkMapMemory is called for the whole memory
// (offset 0, VK_WHOLE_SIZE) and, on success, the counter is set to `count`.
VkResult VmaDeviceMemoryBlock::Map(VmaAllocator hAllocator, uint32_t count, void** ppData)
{
    if(count == 0)
    {
        return VK_SUCCESS;
    }

    VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex);
    if(m_MapCount != 0)
    {
        m_MapCount += count;
        VMA_ASSERT(m_pMappedData != VMA_NULL);
        if(ppData != VMA_NULL)
        {
            *ppData = m_pMappedData;
        }
        return VK_SUCCESS;
    }
    else
    {
        VkResult result = (*hAllocator->GetVulkanFunctions().vkMapMemory)(
            hAllocator->m_hDevice,
            m_hMemory,
            0, // offset
            VK_WHOLE_SIZE,
            0, // flags
            &m_pMappedData);
        if(result == VK_SUCCESS)
        {
            if(ppData != VMA_NULL)
            {
                *ppData = m_pMappedData;
            }
            m_MapCount = count;
        }
        return result;
    }
}

// Drops `count` map references; 0 is a no-op. When the counter reaches zero
// the memory is unmapped with vkUnmapMemory. Asserts (and changes nothing) if
// `count` exceeds the current map count.
void VmaDeviceMemoryBlock::Unmap(VmaAllocator hAllocator, uint32_t count)
{
    if(count == 0)
    {
        return;
    }

    VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex);
    if(m_MapCount >= count)
    {
        m_MapCount -= count;
        if(m_MapCount == 0)
        {
            m_pMappedData = VMA_NULL;
            (*hAllocator->GetVulkanFunctions().vkUnmapMemory)(hAllocator->m_hDevice, m_hMemory);
        }
    }
    else
    {
        VMA_ASSERT(0 && "VkDeviceMemory block is being unmapped while it was not previously mapped.");
    }
}

VkResult
VmaDeviceMemoryBlock::WriteMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize)
{
    // Writes the magic value into the margin just before the allocation
    // (allocOffset - VMA_DEBUG_MARGIN) and directly after it
    // (allocOffset + allocSize). Maps the block for the duration of the write;
    // returns the mapping error if Map() fails.
    VMA_ASSERT(VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_MARGIN % 4 == 0 && VMA_DEBUG_DETECT_CORRUPTION);
    VMA_ASSERT(allocOffset >= VMA_DEBUG_MARGIN);

    void* pData;
    VkResult res = Map(hAllocator, 1, &pData);
    if(res != VK_SUCCESS)
    {
        return res;
    }

    VmaWriteMagicValue(pData, allocOffset - VMA_DEBUG_MARGIN);
    VmaWriteMagicValue(pData, allocOffset + allocSize);

    Unmap(hAllocator, 1);

    return VK_SUCCESS;
}

// Checks the magic values before and after the allocation. A mismatch only
// triggers an assert; the function still returns VK_SUCCESS. The only error
// returned is a failure to map the block.
VkResult VmaDeviceMemoryBlock::ValidateMagicValueAroundAllocation(VmaAllocator hAllocator, VkDeviceSize allocOffset, VkDeviceSize allocSize)
{
    VMA_ASSERT(VMA_DEBUG_MARGIN > 0 && VMA_DEBUG_MARGIN % 4 == 0 && VMA_DEBUG_DETECT_CORRUPTION);
    VMA_ASSERT(allocOffset >= VMA_DEBUG_MARGIN);

    void* pData;
    VkResult res = Map(hAllocator, 1, &pData);
    if(res != VK_SUCCESS)
    {
        return res;
    }

    if(!VmaValidateMagicValue(pData, allocOffset - VMA_DEBUG_MARGIN))
    {
        VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED BEFORE FREED ALLOCATION!");
    }
    else if(!VmaValidateMagicValue(pData, allocOffset + allocSize))
    {
        VMA_ASSERT(0 && "MEMORY CORRUPTION DETECTED AFTER FREED ALLOCATION!");
    }

    Unmap(hAllocator, 1);

    return VK_SUCCESS;
}

// Binds hBuffer to this block's memory at the allocation's offset plus
// allocationLocalOffset (which is relative to the allocation, not the block).
// The allocation must be a block allocation that lives in this block.
VkResult VmaDeviceMemoryBlock::BindBufferMemory(
    const VmaAllocator hAllocator,
    const VmaAllocation hAllocation,
    VkDeviceSize allocationLocalOffset,
    VkBuffer hBuffer,
    const void* pNext)
{
    VMA_ASSERT(hAllocation->GetType() == VmaAllocation_T::ALLOCATION_TYPE_BLOCK &&
        hAllocation->GetBlock() == this);
    VMA_ASSERT(allocationLocalOffset < hAllocation->GetSize() &&
        "Invalid allocationLocalOffset. Did you forget that this offset is relative to the beginning of the allocation, not the whole memory block?");
    const VkDeviceSize memoryOffset = hAllocation->GetOffset() + allocationLocalOffset;
    // This lock is important so that we don't call vkBind... and/or vkMap... simultaneously on the same VkDeviceMemory from multiple threads.
    VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex);
    return hAllocator->BindVulkanBuffer(m_hMemory, memoryOffset, hBuffer, pNext);
}

// Image counterpart of BindBufferMemory: same checks, offset computation and
// locking, but binds hImage.
VkResult VmaDeviceMemoryBlock::BindImageMemory(
    const VmaAllocator hAllocator,
    const VmaAllocation hAllocation,
    VkDeviceSize allocationLocalOffset,
    VkImage hImage,
    const void* pNext)
{
    VMA_ASSERT(hAllocation->GetType() == VmaAllocation_T::ALLOCATION_TYPE_BLOCK &&
        hAllocation->GetBlock() == this);
    VMA_ASSERT(allocationLocalOffset < hAllocation->GetSize() &&
        "Invalid allocationLocalOffset. Did you forget that this offset is relative to the beginning of the allocation, not the whole memory block?");
    const VkDeviceSize memoryOffset = hAllocation->GetOffset() + allocationLocalOffset;
    // This lock is important so that we don't call vkBind... and/or vkMap... simultaneously on the same VkDeviceMemory from multiple threads.
    VmaMutexLock lock(m_Mutex, hAllocator->m_UseMutex);
    return hAllocator->BindVulkanImage(m_hMemory, memoryOffset, hImage, pNext);
}

// Zeroes outInfo and seeds both "min" fields with UINT64_MAX so that they can
// be folded with a minimum operation afterwards.
static void InitStatInfo(VmaStatInfo& outInfo)
{
    memset(&outInfo, 0, sizeof(outInfo));
    outInfo.allocationSizeMin = UINT64_MAX;
    outInfo.unusedRangeSizeMin = UINT64_MAX;
}

// Adds statistics srcInfo into inoutInfo, like: inoutInfo += srcInfo.
// Counters and byte totals are summed; "min" fields take the smaller value and
// "max" fields the larger one. The Avg fields are not touched here.
static void VmaAddStatInfo(VmaStatInfo& inoutInfo, const VmaStatInfo& srcInfo)
{
    inoutInfo.blockCount += srcInfo.blockCount;
    inoutInfo.allocationCount += srcInfo.allocationCount;
    inoutInfo.unusedRangeCount += srcInfo.unusedRangeCount;
    inoutInfo.usedBytes += srcInfo.usedBytes;
    inoutInfo.unusedBytes += srcInfo.unusedBytes;
    inoutInfo.allocationSizeMin = VMA_MIN(inoutInfo.allocationSizeMin, srcInfo.allocationSizeMin);
    inoutInfo.allocationSizeMax = VMA_MAX(inoutInfo.allocationSizeMax, srcInfo.allocationSizeMax);
    inoutInfo.unusedRangeSizeMin = VMA_MIN(inoutInfo.unusedRangeSizeMin, srcInfo.unusedRangeSizeMin);
    inoutInfo.unusedRangeSizeMax = VMA_MAX(inoutInfo.unusedRangeSizeMax, srcInfo.unusedRangeSizeMax);
}

// Fills in the derived average fields from the accumulated totals; an average
// is 0 when its count is 0 (avoids division by zero).
static void VmaPostprocessCalcStatInfo(VmaStatInfo& inoutInfo)
{
    inoutInfo.allocationSizeAvg = (inoutInfo.allocationCount > 0) ?
        VmaRoundDiv(inoutInfo.usedBytes, inoutInfo.allocationCount) : 0;
    inoutInfo.unusedRangeSizeAvg = (inoutInfo.unusedRangeCount > 0) ?
        VmaRoundDiv(inoutInfo.unusedBytes, inoutInfo.unusedRangeCount) : 0;
}

// Builds the pool's block vector from createInfo. An explicit
// createInfo.blockSize (non-zero) overrides preferredBlockSize and marks the
// block size as explicit; the IGNORE_BUFFER_IMAGE_GRANULARITY flag forces a
// granularity of 1.
VmaPool_T::VmaPool_T(
    VmaAllocator hAllocator,
    const VmaPoolCreateInfo& createInfo,
    VkDeviceSize preferredBlockSize) :
    m_BlockVector(
        hAllocator,
        this, // hParentPool
        createInfo.memoryTypeIndex,
        createInfo.blockSize != 0 ? createInfo.blockSize : preferredBlockSize,
        createInfo.minBlockCount,
        createInfo.maxBlockCount,
        (createInfo.flags & VMA_POOL_CREATE_IGNORE_BUFFER_IMAGE_GRANULARITY_BIT) != 0 ? 1 : hAllocator->GetBufferImageGranularity(),
        createInfo.frameInUseCount,
        createInfo.blockSize != 0, // explicitBlockSize
        createInfo.flags & VMA_POOL_CREATE_ALGORITHM_MASK, // algorithm
        createInfo.priority), // priority
    m_Id(0),
    m_Name(VMA_NULL)
{
}

VmaPool_T::~VmaPool_T()
{
}

// Replaces the pool name: frees the previous string and stores a copy of
// pName, or VMA_NULL when pName is null.
void VmaPool_T::SetName(const char* pName)
{
    const VkAllocationCallbacks* allocs = m_BlockVector.GetAllocator()->GetAllocationCallbacks();
    VmaFreeString(allocs, m_Name);

    if(pName != VMA_NULL)
    {
        m_Name = VmaCreateStringCopy(allocs, pName);
    }
    else
    {
        m_Name = VMA_NULL;
    }
}

#if VMA_STATS_STRING_ENABLED

#endif // #if VMA_STATS_STRING_ENABLED

// Stores the configuration; no blocks are created here (see CreateMinBlocks).
VmaBlockVector::VmaBlockVector(
    VmaAllocator hAllocator,
    VmaPool hParentPool,
    uint32_t memoryTypeIndex,
    VkDeviceSize preferredBlockSize,
    size_t minBlockCount,
    size_t maxBlockCount,
    VkDeviceSize bufferImageGranularity,
    uint32_t frameInUseCount,
    bool explicitBlockSize,
    uint32_t algorithm,
    float priority) :
    m_hAllocator(hAllocator),
    m_hParentPool(hParentPool),
    m_MemoryTypeIndex(memoryTypeIndex),
    m_PreferredBlockSize(preferredBlockSize),
    m_MinBlockCount(minBlockCount),
    m_MaxBlockCount(maxBlockCount),
    m_BufferImageGranularity(bufferImageGranularity),
    m_FrameInUseCount(frameInUseCount),
    m_ExplicitBlockSize(explicitBlockSize),
    m_Algorithm(algorithm),
    m_Priority(priority),
    m_HasEmptyBlock(false),
    m_Blocks(VmaStlAllocator(hAllocator->GetAllocationCallbacks())),
    m_NextBlockId(0)
{
}

// Destroys and deletes every remaining block, iterating from the last one.
VmaBlockVector::~VmaBlockVector()
{
    for(size_t i = m_Blocks.size(); i--; )
    {
        m_Blocks[i]->Destroy(m_hAllocator);
        vma_delete(m_hAllocator, m_Blocks[i]);
    }
}

// Creates m_MinBlockCount blocks of the preferred size, stopping at and
// returning the first failure.
VkResult VmaBlockVector::CreateMinBlocks()
{
    for (const auto i : c10::irange(m_MinBlockCount)) {
        VkResult res = CreateBlock(m_PreferredBlockSize, VMA_NULL);
        if(res != VK_SUCCESS)
        {
            return res;
        }
    }
    return VK_SUCCESS;
}

void VmaBlockVector::GetPoolStats(VmaPoolStats* pStats)
{
    VmaMutexLockRead lock(m_Mutex,
m_hAllocator->m_UseMutex); + + const size_t blockCount = m_Blocks.size(); + + pStats->size = 0; + pStats->unusedSize = 0; + pStats->allocationCount = 0; + pStats->unusedRangeCount = 0; + pStats->unusedRangeSizeMax = 0; + pStats->blockCount = blockCount; + + for (const auto blockIndex : c10::irange(blockCount)) { + const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pBlock); + VMA_HEAVY_ASSERT(pBlock->Validate()); + pBlock->m_pMetadata->AddPoolStats(*pStats); + } +} + +bool VmaBlockVector::IsEmpty() +{ + VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); + return m_Blocks.empty(); +} + +bool VmaBlockVector::IsCorruptionDetectionEnabled() const +{ + const uint32_t requiredMemFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + return (VMA_DEBUG_DETECT_CORRUPTION != 0) && + (VMA_DEBUG_MARGIN > 0) && + (m_Algorithm == 0 || m_Algorithm == VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT) && + (m_hAllocator->m_MemProps.memoryTypes[m_MemoryTypeIndex].propertyFlags & requiredMemFlags) == requiredMemFlags; +} + +static const uint32_t VMA_ALLOCATION_TRY_COUNT = 32; + +VkResult VmaBlockVector::Allocate( + uint32_t currentFrameIndex, + VkDeviceSize size, + VkDeviceSize alignment, + const VmaAllocationCreateInfo& createInfo, + VmaSuballocationType suballocType, + size_t allocationCount, + VmaAllocation* pAllocations) +{ + size_t allocIndex; + VkResult res = VK_SUCCESS; + + if(IsCorruptionDetectionEnabled()) + { + size = VmaAlignUp(size, sizeof(VMA_CORRUPTION_DETECTION_MAGIC_VALUE)); + alignment = VmaAlignUp(alignment, sizeof(VMA_CORRUPTION_DETECTION_MAGIC_VALUE)); + } + + { + VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); + for(allocIndex = 0; allocIndex < allocationCount; ++allocIndex) + { + res = AllocatePage( + currentFrameIndex, + size, + alignment, + createInfo, + suballocType, + pAllocations + allocIndex); + if(res != VK_SUCCESS) + { + break; + } + } + } + + if(res != VK_SUCCESS) + { + // Free all already 
created allocations. + const uint32_t heapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex); + while(allocIndex--) + { + VmaAllocation_T* const alloc = pAllocations[allocIndex]; + const VkDeviceSize allocSize = alloc->GetSize(); + Free(alloc); + m_hAllocator->m_Budget.RemoveAllocation(heapIndex, allocSize); + } + memset(pAllocations, 0, sizeof(VmaAllocation) * allocationCount); + } + + return res; +} + +VkResult VmaBlockVector::AllocatePage( + uint32_t currentFrameIndex, + VkDeviceSize size, + VkDeviceSize alignment, + const VmaAllocationCreateInfo& createInfo, + VmaSuballocationType suballocType, + VmaAllocation* pAllocation) +{ + const bool isUpperAddress = (createInfo.flags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0; + bool canMakeOtherLost = (createInfo.flags & VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT) != 0; + const bool mapped = (createInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0; + const bool isUserDataString = (createInfo.flags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0; + + VkDeviceSize freeMemory; + { + const uint32_t heapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex); + VmaBudget heapBudget = {}; + m_hAllocator->GetBudget(&heapBudget, heapIndex, 1); + freeMemory = (heapBudget.usage < heapBudget.budget) ? (heapBudget.budget - heapBudget.usage) : 0; + } + + const bool canFallbackToDedicated = !IsCustomPool(); + const bool canCreateNewBlock = + ((createInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) == 0) && + (m_Blocks.size() < m_MaxBlockCount) && + (freeMemory >= size || !canFallbackToDedicated); + uint32_t strategy = createInfo.flags & VMA_ALLOCATION_CREATE_STRATEGY_MASK; + + // If linearAlgorithm is used, canMakeOtherLost is available only when used as ring buffer. + // Which in turn is available only when maxBlockCount = 1. 
+ if(m_Algorithm == VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT && m_MaxBlockCount > 1) + { + canMakeOtherLost = false; + } + + // Upper address can only be used with linear allocator and within single memory block. + if(isUpperAddress && + (m_Algorithm != VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT || m_MaxBlockCount > 1)) + { + return VK_ERROR_FEATURE_NOT_PRESENT; + } + + // Validate strategy. + switch(strategy) + { + case 0: + strategy = VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT; + break; + case VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT: + case VMA_ALLOCATION_CREATE_STRATEGY_WORST_FIT_BIT: + case VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT: + break; + default: + return VK_ERROR_FEATURE_NOT_PRESENT; + } + + // Early reject: requested allocation size is larger that maximum block size for this block vector. + if(size + 2 * VMA_DEBUG_MARGIN > m_PreferredBlockSize) + { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + /* + Under certain condition, this whole section can be skipped for optimization, so + we move on directly to trying to allocate with canMakeOtherLost. That's the case + e.g. for custom pools with linear algorithm. + */ + if(!canMakeOtherLost || canCreateNewBlock) + { + // 1. Search existing allocations. Try to allocate without making other allocations lost. + VmaAllocationCreateFlags allocFlagsCopy = createInfo.flags; + allocFlagsCopy &= ~VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT; + + if(m_Algorithm == VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT) + { + // Use only last block. 
+ if(!m_Blocks.empty()) + { + VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks.back(); + VMA_ASSERT(pCurrBlock); + VkResult res = AllocateFromBlock( + pCurrBlock, + currentFrameIndex, + size, + alignment, + allocFlagsCopy, + createInfo.pUserData, + suballocType, + strategy, + pAllocation); + if(res == VK_SUCCESS) + { + VMA_DEBUG_LOG(" Returned from last block #%u", pCurrBlock->GetId()); + return VK_SUCCESS; + } + } + } + else + { + if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT) + { + // Forward order in m_Blocks - prefer blocks with smallest amount of free space. + for (const auto blockIndex : c10::irange(m_Blocks.size())) { + VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pCurrBlock); + VkResult res = AllocateFromBlock( + pCurrBlock, + currentFrameIndex, + size, + alignment, + allocFlagsCopy, + createInfo.pUserData, + suballocType, + strategy, + pAllocation); + if(res == VK_SUCCESS) + { + VMA_DEBUG_LOG(" Returned from existing block #%u", pCurrBlock->GetId()); + return VK_SUCCESS; + } + } + } + else // WORST_FIT, FIRST_FIT + { + // Backward order in m_Blocks - prefer blocks with largest amount of free space. + for(size_t blockIndex = m_Blocks.size(); blockIndex--; ) + { + VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pCurrBlock); + VkResult res = AllocateFromBlock( + pCurrBlock, + currentFrameIndex, + size, + alignment, + allocFlagsCopy, + createInfo.pUserData, + suballocType, + strategy, + pAllocation); + if(res == VK_SUCCESS) + { + VMA_DEBUG_LOG(" Returned from existing block #%u", pCurrBlock->GetId()); + return VK_SUCCESS; + } + } + } + } + + // 2. Try to create new block. + if(canCreateNewBlock) + { + // Calculate optimal size for new block. + VkDeviceSize newBlockSize = m_PreferredBlockSize; + uint32_t newBlockSizeShift = 0; + const uint32_t NEW_BLOCK_SIZE_SHIFT_MAX = 3; + + if(!m_ExplicitBlockSize) + { + // Allocate 1/8, 1/4, 1/2 as first blocks. 
+ const VkDeviceSize maxExistingBlockSize = CalcMaxBlockSize(); + for (const auto i : c10::irange(NEW_BLOCK_SIZE_SHIFT_MAX)) { + const VkDeviceSize smallerNewBlockSize = newBlockSize / 2; + if(smallerNewBlockSize > maxExistingBlockSize && smallerNewBlockSize >= size * 2) + { + newBlockSize = smallerNewBlockSize; + ++newBlockSizeShift; + } + else + { + break; + } + } + } + + size_t newBlockIndex = 0; + VkResult res = (newBlockSize <= freeMemory || !canFallbackToDedicated) ? + CreateBlock(newBlockSize, &newBlockIndex) : VK_ERROR_OUT_OF_DEVICE_MEMORY; + // Allocation of this size failed? Try 1/2, 1/4, 1/8 of m_PreferredBlockSize. + if(!m_ExplicitBlockSize) + { + while(res < 0 && newBlockSizeShift < NEW_BLOCK_SIZE_SHIFT_MAX) + { + const VkDeviceSize smallerNewBlockSize = newBlockSize / 2; + if(smallerNewBlockSize >= size) + { + newBlockSize = smallerNewBlockSize; + ++newBlockSizeShift; + res = (newBlockSize <= freeMemory || !canFallbackToDedicated) ? + CreateBlock(newBlockSize, &newBlockIndex) : VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + else + { + break; + } + } + } + + if(res == VK_SUCCESS) + { + VmaDeviceMemoryBlock* const pBlock = m_Blocks[newBlockIndex]; + VMA_ASSERT(pBlock->m_pMetadata->GetSize() >= size); + + res = AllocateFromBlock( + pBlock, + currentFrameIndex, + size, + alignment, + allocFlagsCopy, + createInfo.pUserData, + suballocType, + strategy, + pAllocation); + if(res == VK_SUCCESS) + { + VMA_DEBUG_LOG(" Created new block #%u Size=%llu", pBlock->GetId(), newBlockSize); + return VK_SUCCESS; + } + else + { + // Allocation from new block failed, possibly due to VMA_DEBUG_MARGIN or alignment. + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + } + } + } + + // 3. Try to allocate from existing blocks with making other allocations lost. 
+ if(canMakeOtherLost) + { + uint32_t tryIndex = 0; + for(; tryIndex < VMA_ALLOCATION_TRY_COUNT; ++tryIndex) + { + VmaDeviceMemoryBlock* pBestRequestBlock = VMA_NULL; + VmaAllocationRequest bestRequest = {}; + VkDeviceSize bestRequestCost = VK_WHOLE_SIZE; + + // 1. Search existing allocations. + if(strategy == VMA_ALLOCATION_CREATE_STRATEGY_BEST_FIT_BIT) + { + // Forward order in m_Blocks - prefer blocks with smallest amount of free space. + for (const auto blockIndex : c10::irange(m_Blocks.size())) { + VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pCurrBlock); + VmaAllocationRequest currRequest = {}; + if(pCurrBlock->m_pMetadata->CreateAllocationRequest( + currentFrameIndex, + m_FrameInUseCount, + m_BufferImageGranularity, + size, + alignment, + (createInfo.flags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0, + suballocType, + canMakeOtherLost, + strategy, + &currRequest)) + { + const VkDeviceSize currRequestCost = currRequest.CalcCost(); + if(pBestRequestBlock == VMA_NULL || + currRequestCost < bestRequestCost) + { + pBestRequestBlock = pCurrBlock; + bestRequest = currRequest; + bestRequestCost = currRequestCost; + + if(bestRequestCost == 0) + { + break; + } + } + } + } + } + else // WORST_FIT, FIRST_FIT + { + // Backward order in m_Blocks - prefer blocks with largest amount of free space. 
+ for(size_t blockIndex = m_Blocks.size(); blockIndex--; ) + { + VmaDeviceMemoryBlock* const pCurrBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pCurrBlock); + VmaAllocationRequest currRequest = {}; + if(pCurrBlock->m_pMetadata->CreateAllocationRequest( + currentFrameIndex, + m_FrameInUseCount, + m_BufferImageGranularity, + size, + alignment, + (createInfo.flags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0, + suballocType, + canMakeOtherLost, + strategy, + &currRequest)) + { + const VkDeviceSize currRequestCost = currRequest.CalcCost(); + if(pBestRequestBlock == VMA_NULL || + currRequestCost < bestRequestCost || + strategy == VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT) + { + pBestRequestBlock = pCurrBlock; + bestRequest = currRequest; + bestRequestCost = currRequestCost; + + if(bestRequestCost == 0 || + strategy == VMA_ALLOCATION_CREATE_STRATEGY_FIRST_FIT_BIT) + { + break; + } + } + } + } + } + + if(pBestRequestBlock != VMA_NULL) + { + if(mapped) + { + VkResult res = pBestRequestBlock->Map(m_hAllocator, 1, VMA_NULL); + if(res != VK_SUCCESS) + { + return res; + } + } + + if(pBestRequestBlock->m_pMetadata->MakeRequestedAllocationsLost( + currentFrameIndex, + m_FrameInUseCount, + &bestRequest)) + { + // Allocate from this pBlock. 
+ *pAllocation = m_hAllocator->m_AllocationObjectAllocator.Allocate(currentFrameIndex, isUserDataString); + pBestRequestBlock->m_pMetadata->Alloc(bestRequest, suballocType, size, *pAllocation); + UpdateHasEmptyBlock(); + (*pAllocation)->InitBlockAllocation( + pBestRequestBlock, + bestRequest.offset, + alignment, + size, + m_MemoryTypeIndex, + suballocType, + mapped, + (createInfo.flags & VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT) != 0); + VMA_HEAVY_ASSERT(pBestRequestBlock->Validate()); + VMA_DEBUG_LOG(" Returned from existing block"); + (*pAllocation)->SetUserData(m_hAllocator, createInfo.pUserData); + m_hAllocator->m_Budget.AddAllocation(m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex), size); + if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) + { + m_hAllocator->FillAllocation(*pAllocation, VMA_ALLOCATION_FILL_PATTERN_CREATED); + } + if(IsCorruptionDetectionEnabled()) + { + VkResult res = pBestRequestBlock->WriteMagicValueAroundAllocation(m_hAllocator, bestRequest.offset, size); + VMA_ASSERT(res == VK_SUCCESS && "Couldn't map block memory to write magic value."); + } + return VK_SUCCESS; + } + // else: Some allocations must have been touched while we are here. Next try. + } + else + { + // Could not find place in any of the blocks - break outer loop. + break; + } + } + /* Maximum number of tries exceeded - a very unlike event when many other + threads are simultaneously touching allocations making it impossible to make + lost at the same time as we try to allocate. 
*/ + if(tryIndex == VMA_ALLOCATION_TRY_COUNT) + { + return VK_ERROR_TOO_MANY_OBJECTS; + } + } + + return VK_ERROR_OUT_OF_DEVICE_MEMORY; +} + +void VmaBlockVector::Free( + const VmaAllocation hAllocation) +{ + VmaDeviceMemoryBlock* pBlockToDelete = VMA_NULL; + + bool budgetExceeded = false; + { + const uint32_t heapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex); + VmaBudget heapBudget = {}; + m_hAllocator->GetBudget(&heapBudget, heapIndex, 1); + budgetExceeded = heapBudget.usage >= heapBudget.budget; + } + + // Scope for lock. + { + VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); + + VmaDeviceMemoryBlock* pBlock = hAllocation->GetBlock(); + + if(IsCorruptionDetectionEnabled()) + { + VkResult res = pBlock->ValidateMagicValueAroundAllocation(m_hAllocator, hAllocation->GetOffset(), hAllocation->GetSize()); + VMA_ASSERT(res == VK_SUCCESS && "Couldn't map block memory to validate magic value."); + } + + if(hAllocation->IsPersistentMap()) + { + pBlock->Unmap(m_hAllocator, 1); + } + + pBlock->m_pMetadata->Free(hAllocation); + VMA_HEAVY_ASSERT(pBlock->Validate()); + + VMA_DEBUG_LOG(" Freed from MemoryTypeIndex=%u", m_MemoryTypeIndex); + + const bool canDeleteBlock = m_Blocks.size() > m_MinBlockCount; + // pBlock became empty after this deallocation. + if(pBlock->m_pMetadata->IsEmpty()) + { + // Already has empty block. We don't want to have two, so delete this one. + if((m_HasEmptyBlock || budgetExceeded) && canDeleteBlock) + { + pBlockToDelete = pBlock; + Remove(pBlock); + } + // else: We now have an empty block - leave it. + } + // pBlock didn't become empty, but we have another empty block - find and free that one. + // (This is optional, heuristics.) 
+ else if(m_HasEmptyBlock && canDeleteBlock) + { + VmaDeviceMemoryBlock* pLastBlock = m_Blocks.back(); + if(pLastBlock->m_pMetadata->IsEmpty()) + { + pBlockToDelete = pLastBlock; + m_Blocks.pop_back(); + } + } + + UpdateHasEmptyBlock(); + IncrementallySortBlocks(); + } + + // Destruction of a free block. Deferred until this point, outside of mutex + // lock, for performance reason. + if(pBlockToDelete != VMA_NULL) + { + VMA_DEBUG_LOG(" Deleted empty block"); + pBlockToDelete->Destroy(m_hAllocator); + vma_delete(m_hAllocator, pBlockToDelete); + } +} + +VkDeviceSize VmaBlockVector::CalcMaxBlockSize() const +{ + VkDeviceSize result = 0; + for(size_t i = m_Blocks.size(); i--; ) + { + result = VMA_MAX(result, m_Blocks[i]->m_pMetadata->GetSize()); + if(result >= m_PreferredBlockSize) + { + break; + } + } + return result; +} + +void VmaBlockVector::Remove(VmaDeviceMemoryBlock* pBlock) +{ + for (const auto blockIndex : c10::irange(m_Blocks.size())) { + if(m_Blocks[blockIndex] == pBlock) + { + VmaVectorRemove(m_Blocks, blockIndex); + return; + } + } + VMA_ASSERT(0); +} + +void VmaBlockVector::IncrementallySortBlocks() +{ + if(m_Algorithm != VMA_POOL_CREATE_LINEAR_ALGORITHM_BIT) + { + // Bubble sort only until first swap. 
+ for (const auto i : c10::irange(1, m_Blocks.size())) { + if(m_Blocks[i - 1]->m_pMetadata->GetSumFreeSize() > m_Blocks[i]->m_pMetadata->GetSumFreeSize()) + { + VMA_SWAP(m_Blocks[i - 1], m_Blocks[i]); + return; + } + } + } +} + +VkResult VmaBlockVector::AllocateFromBlock( + VmaDeviceMemoryBlock* pBlock, + uint32_t currentFrameIndex, + VkDeviceSize size, + VkDeviceSize alignment, + VmaAllocationCreateFlags allocFlags, + void* pUserData, + VmaSuballocationType suballocType, + uint32_t strategy, + VmaAllocation* pAllocation) +{ + VMA_ASSERT((allocFlags & VMA_ALLOCATION_CREATE_CAN_MAKE_OTHER_LOST_BIT) == 0); + const bool isUpperAddress = (allocFlags & VMA_ALLOCATION_CREATE_UPPER_ADDRESS_BIT) != 0; + const bool mapped = (allocFlags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0; + const bool isUserDataString = (allocFlags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0; + + VmaAllocationRequest currRequest = {}; + if(pBlock->m_pMetadata->CreateAllocationRequest( + currentFrameIndex, + m_FrameInUseCount, + m_BufferImageGranularity, + size, + alignment, + isUpperAddress, + suballocType, + false, // canMakeOtherLost + strategy, + &currRequest)) + { + // Allocate from pCurrBlock. 
+ VMA_ASSERT(currRequest.itemsToMakeLostCount == 0); + + if(mapped) + { + VkResult res = pBlock->Map(m_hAllocator, 1, VMA_NULL); + if(res != VK_SUCCESS) + { + return res; + } + } + + *pAllocation = m_hAllocator->m_AllocationObjectAllocator.Allocate(currentFrameIndex, isUserDataString); + pBlock->m_pMetadata->Alloc(currRequest, suballocType, size, *pAllocation); + UpdateHasEmptyBlock(); + (*pAllocation)->InitBlockAllocation( + pBlock, + currRequest.offset, + alignment, + size, + m_MemoryTypeIndex, + suballocType, + mapped, + (allocFlags & VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT) != 0); + VMA_HEAVY_ASSERT(pBlock->Validate()); + (*pAllocation)->SetUserData(m_hAllocator, pUserData); + m_hAllocator->m_Budget.AddAllocation(m_hAllocator->MemoryTypeIndexToHeapIndex(m_MemoryTypeIndex), size); + if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) + { + m_hAllocator->FillAllocation(*pAllocation, VMA_ALLOCATION_FILL_PATTERN_CREATED); + } + if(IsCorruptionDetectionEnabled()) + { + VkResult res = pBlock->WriteMagicValueAroundAllocation(m_hAllocator, currRequest.offset, size); + VMA_ASSERT(res == VK_SUCCESS && "Couldn't map block memory to write magic value."); + } + return VK_SUCCESS; + } + return VK_ERROR_OUT_OF_DEVICE_MEMORY; +} + +VkResult VmaBlockVector::CreateBlock(VkDeviceSize blockSize, size_t* pNewBlockIndex) +{ + VkMemoryAllocateInfo allocInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; + allocInfo.memoryTypeIndex = m_MemoryTypeIndex; + allocInfo.allocationSize = blockSize; + +#if VMA_BUFFER_DEVICE_ADDRESS + // Every standalone block can potentially contain a buffer with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT - always enable the feature. 
+ VkMemoryAllocateFlagsInfoKHR allocFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR }; + if(m_hAllocator->m_UseKhrBufferDeviceAddress) + { + allocFlagsInfo.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; + VmaPnextChainPushFront(&allocInfo, &allocFlagsInfo); + } +#endif // #if VMA_BUFFER_DEVICE_ADDRESS + +#if VMA_MEMORY_PRIORITY + VkMemoryPriorityAllocateInfoEXT priorityInfo = { VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT }; + if(m_hAllocator->m_UseExtMemoryPriority) + { + priorityInfo.priority = m_Priority; + VmaPnextChainPushFront(&allocInfo, &priorityInfo); + } +#endif // #if VMA_MEMORY_PRIORITY + + VkDeviceMemory mem = VK_NULL_HANDLE; + VkResult res = m_hAllocator->AllocateVulkanMemory(&allocInfo, &mem); + if(res < 0) + { + return res; + } + + // New VkDeviceMemory successfully created. + + // Create new Allocation for it. + VmaDeviceMemoryBlock* const pBlock = vma_new(m_hAllocator, VmaDeviceMemoryBlock)(m_hAllocator); + pBlock->Init( + m_hAllocator, + m_hParentPool, + m_MemoryTypeIndex, + mem, + allocInfo.allocationSize, + m_NextBlockId++, + m_Algorithm); + + m_Blocks.push_back(pBlock); + if(pNewBlockIndex != VMA_NULL) + { + *pNewBlockIndex = m_Blocks.size() - 1; + } + + return VK_SUCCESS; +} + +void VmaBlockVector::ApplyDefragmentationMovesCpu( + class VmaBlockVectorDefragmentationContext* pDefragCtx, + const VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves) +{ + const size_t blockCount = m_Blocks.size(); + const bool isNonCoherent = m_hAllocator->IsMemoryTypeNonCoherent(m_MemoryTypeIndex); + + enum BLOCK_FLAG + { + BLOCK_FLAG_USED = 0x00000001, + BLOCK_FLAG_MAPPED_FOR_DEFRAGMENTATION = 0x00000002, + }; + + struct BlockInfo + { + uint32_t flags; + void* pMappedData; + }; + VmaVector< BlockInfo, VmaStlAllocator > + blockInfo(blockCount, BlockInfo(), VmaStlAllocator(m_hAllocator->GetAllocationCallbacks())); + memset(blockInfo.data(), 0, blockCount * sizeof(BlockInfo)); + + // Go over all moves. 
Mark blocks that are used with BLOCK_FLAG_USED. + const size_t moveCount = moves.size(); + for (const auto moveIndex : c10::irange(moveCount)) { + const VmaDefragmentationMove& move = moves[moveIndex]; + blockInfo[move.srcBlockIndex].flags |= BLOCK_FLAG_USED; + blockInfo[move.dstBlockIndex].flags |= BLOCK_FLAG_USED; + } + + VMA_ASSERT(pDefragCtx->res == VK_SUCCESS); + + // Go over all blocks. Get mapped pointer or map if necessary. + for(size_t blockIndex = 0; pDefragCtx->res == VK_SUCCESS && blockIndex < blockCount; ++blockIndex) + { + BlockInfo& currBlockInfo = blockInfo[blockIndex]; + VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; + if((currBlockInfo.flags & BLOCK_FLAG_USED) != 0) + { + currBlockInfo.pMappedData = pBlock->GetMappedData(); + // It is not originally mapped - map it. + if(currBlockInfo.pMappedData == VMA_NULL) + { + pDefragCtx->res = pBlock->Map(m_hAllocator, 1, &currBlockInfo.pMappedData); + if(pDefragCtx->res == VK_SUCCESS) + { + currBlockInfo.flags |= BLOCK_FLAG_MAPPED_FOR_DEFRAGMENTATION; + } + } + } + } + + // Go over all moves. Do actual data transfer. + if(pDefragCtx->res == VK_SUCCESS) + { + const VkDeviceSize nonCoherentAtomSize = m_hAllocator->m_PhysicalDeviceProperties.limits.nonCoherentAtomSize; + VkMappedMemoryRange memRange = { VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE }; + + for (const auto moveIndex : c10::irange(moveCount)) { + const VmaDefragmentationMove& move = moves[moveIndex]; + + const BlockInfo& srcBlockInfo = blockInfo[move.srcBlockIndex]; + const BlockInfo& dstBlockInfo = blockInfo[move.dstBlockIndex]; + + VMA_ASSERT(srcBlockInfo.pMappedData && dstBlockInfo.pMappedData); + + // Invalidate source. 
+ if(isNonCoherent) + { + VmaDeviceMemoryBlock* const pSrcBlock = m_Blocks[move.srcBlockIndex]; + memRange.memory = pSrcBlock->GetDeviceMemory(); + memRange.offset = VmaAlignDown(move.srcOffset, nonCoherentAtomSize); + memRange.size = VMA_MIN( + VmaAlignUp(move.size + (move.srcOffset - memRange.offset), nonCoherentAtomSize), + pSrcBlock->m_pMetadata->GetSize() - memRange.offset); + (*m_hAllocator->GetVulkanFunctions().vkInvalidateMappedMemoryRanges)(m_hAllocator->m_hDevice, 1, &memRange); + } + + // THE PLACE WHERE ACTUAL DATA COPY HAPPENS. + memmove( + reinterpret_cast(dstBlockInfo.pMappedData) + move.dstOffset, + reinterpret_cast(srcBlockInfo.pMappedData) + move.srcOffset, + static_cast(move.size)); + + if(IsCorruptionDetectionEnabled()) + { + VmaWriteMagicValue(dstBlockInfo.pMappedData, move.dstOffset - VMA_DEBUG_MARGIN); + VmaWriteMagicValue(dstBlockInfo.pMappedData, move.dstOffset + move.size); + } + + // Flush destination. + if(isNonCoherent) + { + VmaDeviceMemoryBlock* const pDstBlock = m_Blocks[move.dstBlockIndex]; + memRange.memory = pDstBlock->GetDeviceMemory(); + memRange.offset = VmaAlignDown(move.dstOffset, nonCoherentAtomSize); + memRange.size = VMA_MIN( + VmaAlignUp(move.size + (move.dstOffset - memRange.offset), nonCoherentAtomSize), + pDstBlock->m_pMetadata->GetSize() - memRange.offset); + (*m_hAllocator->GetVulkanFunctions().vkFlushMappedMemoryRanges)(m_hAllocator->m_hDevice, 1, &memRange); + } + } + } + + // Go over all blocks in reverse order. Unmap those that were mapped just for defragmentation. + // Regardless of pCtx->res == VK_SUCCESS. 
+ for(size_t blockIndex = blockCount; blockIndex--; ) + { + const BlockInfo& currBlockInfo = blockInfo[blockIndex]; + if((currBlockInfo.flags & BLOCK_FLAG_MAPPED_FOR_DEFRAGMENTATION) != 0) + { + VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; + pBlock->Unmap(m_hAllocator, 1); + } + } +} + +void VmaBlockVector::ApplyDefragmentationMovesGpu( + class VmaBlockVectorDefragmentationContext* pDefragCtx, + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkCommandBuffer commandBuffer) +{ + const size_t blockCount = m_Blocks.size(); + + pDefragCtx->blockContexts.resize(blockCount); + memset(pDefragCtx->blockContexts.data(), 0, blockCount * sizeof(VmaBlockDefragmentationContext)); + + // Go over all moves. Mark blocks that are used with BLOCK_FLAG_USED. + const size_t moveCount = moves.size(); + for (const auto moveIndex : c10::irange(moveCount)) { + const VmaDefragmentationMove& move = moves[moveIndex]; + + //if(move.type == VMA_ALLOCATION_TYPE_UNKNOWN) + { + // Old school move still require us to map the whole block + pDefragCtx->blockContexts[move.srcBlockIndex].flags |= VmaBlockDefragmentationContext::BLOCK_FLAG_USED; + pDefragCtx->blockContexts[move.dstBlockIndex].flags |= VmaBlockDefragmentationContext::BLOCK_FLAG_USED; + } + } + + VMA_ASSERT(pDefragCtx->res == VK_SUCCESS); + + // Go over all blocks. Create and bind buffer for whole block if necessary. 
+ { + VkBufferCreateInfo bufCreateInfo; + VmaFillGpuDefragmentationBufferCreateInfo(bufCreateInfo); + + for(size_t blockIndex = 0; pDefragCtx->res == VK_SUCCESS && blockIndex < blockCount; ++blockIndex) + { + VmaBlockDefragmentationContext& currBlockCtx = pDefragCtx->blockContexts[blockIndex]; + VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; + if((currBlockCtx.flags & VmaBlockDefragmentationContext::BLOCK_FLAG_USED) != 0) + { + bufCreateInfo.size = pBlock->m_pMetadata->GetSize(); + pDefragCtx->res = (*m_hAllocator->GetVulkanFunctions().vkCreateBuffer)( + m_hAllocator->m_hDevice, &bufCreateInfo, m_hAllocator->GetAllocationCallbacks(), &currBlockCtx.hBuffer); + if(pDefragCtx->res == VK_SUCCESS) + { + pDefragCtx->res = (*m_hAllocator->GetVulkanFunctions().vkBindBufferMemory)( + m_hAllocator->m_hDevice, currBlockCtx.hBuffer, pBlock->GetDeviceMemory(), 0); + } + } + } + } + + // Go over all moves. Post data transfer commands to command buffer. + if(pDefragCtx->res == VK_SUCCESS) + { + for (const auto moveIndex : c10::irange(moveCount)) { + const VmaDefragmentationMove& move = moves[moveIndex]; + + const VmaBlockDefragmentationContext& srcBlockCtx = pDefragCtx->blockContexts[move.srcBlockIndex]; + const VmaBlockDefragmentationContext& dstBlockCtx = pDefragCtx->blockContexts[move.dstBlockIndex]; + + VMA_ASSERT(srcBlockCtx.hBuffer && dstBlockCtx.hBuffer); + + VkBufferCopy region = { + move.srcOffset, + move.dstOffset, + move.size }; + (*m_hAllocator->GetVulkanFunctions().vkCmdCopyBuffer)( + commandBuffer, srcBlockCtx.hBuffer, dstBlockCtx.hBuffer, 1, ®ion); + } + } + + // Save buffers to defrag context for later destruction. 
+ if(pDefragCtx->res == VK_SUCCESS && moveCount > 0) + { + pDefragCtx->res = VK_NOT_READY; + } +} + +void VmaBlockVector::FreeEmptyBlocks(VmaDefragmentationStats* pDefragmentationStats) +{ + for(size_t blockIndex = m_Blocks.size(); blockIndex--; ) + { + VmaDeviceMemoryBlock* pBlock = m_Blocks[blockIndex]; + if(pBlock->m_pMetadata->IsEmpty()) + { + if(m_Blocks.size() > m_MinBlockCount) + { + if(pDefragmentationStats != VMA_NULL) + { + ++pDefragmentationStats->deviceMemoryBlocksFreed; + pDefragmentationStats->bytesFreed += pBlock->m_pMetadata->GetSize(); + } + + VmaVectorRemove(m_Blocks, blockIndex); + pBlock->Destroy(m_hAllocator); + vma_delete(m_hAllocator, pBlock); + } + else + { + break; + } + } + } + UpdateHasEmptyBlock(); +} + +void VmaBlockVector::UpdateHasEmptyBlock() +{ + m_HasEmptyBlock = false; + for(size_t index = 0, count = m_Blocks.size(); index < count; ++index) + { + VmaDeviceMemoryBlock* const pBlock = m_Blocks[index]; + if(pBlock->m_pMetadata->IsEmpty()) + { + m_HasEmptyBlock = true; + break; + } + } +} + +#if VMA_STATS_STRING_ENABLED + +void VmaBlockVector::PrintDetailedMap(class VmaJsonWriter& json) +{ + VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); + + json.BeginObject(); + + if(IsCustomPool()) + { + const char* poolName = m_hParentPool->GetName(); + if(poolName != VMA_NULL && poolName[0] != '\0') + { + json.WriteString("Name"); + json.WriteString(poolName); + } + + json.WriteString("MemoryTypeIndex"); + json.WriteNumber(m_MemoryTypeIndex); + + json.WriteString("BlockSize"); + json.WriteNumber(m_PreferredBlockSize); + + json.WriteString("BlockCount"); + json.BeginObject(true); + if(m_MinBlockCount > 0) + { + json.WriteString("Min"); + json.WriteNumber((uint64_t)m_MinBlockCount); + } + if(m_MaxBlockCount < SIZE_MAX) + { + json.WriteString("Max"); + json.WriteNumber((uint64_t)m_MaxBlockCount); + } + json.WriteString("Cur"); + json.WriteNumber((uint64_t)m_Blocks.size()); + json.EndObject(); + + if(m_FrameInUseCount > 0) + { + 
json.WriteString("FrameInUseCount"); + json.WriteNumber(m_FrameInUseCount); + } + + if(m_Algorithm != 0) + { + json.WriteString("Algorithm"); + json.WriteString(VmaAlgorithmToStr(m_Algorithm)); + } + } + else + { + json.WriteString("PreferredBlockSize"); + json.WriteNumber(m_PreferredBlockSize); + } + + json.WriteString("Blocks"); + json.BeginObject(); + for (const auto i : c10::irange(m_Blocks.size())) { + json.BeginString(); + json.ContinueString(m_Blocks[i]->GetId()); + json.EndString(); + + m_Blocks[i]->m_pMetadata->PrintDetailedMap(json); + } + json.EndObject(); + + json.EndObject(); +} + +#endif // #if VMA_STATS_STRING_ENABLED + +void VmaBlockVector::Defragment( + class VmaBlockVectorDefragmentationContext* pCtx, + VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags, + VkDeviceSize& maxCpuBytesToMove, uint32_t& maxCpuAllocationsToMove, + VkDeviceSize& maxGpuBytesToMove, uint32_t& maxGpuAllocationsToMove, + VkCommandBuffer commandBuffer) +{ + pCtx->res = VK_SUCCESS; + + const VkMemoryPropertyFlags memPropFlags = + m_hAllocator->m_MemProps.memoryTypes[m_MemoryTypeIndex].propertyFlags; + const bool isHostVisible = (memPropFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0; + + const bool canDefragmentOnCpu = maxCpuBytesToMove > 0 && maxCpuAllocationsToMove > 0 && + isHostVisible; + const bool canDefragmentOnGpu = maxGpuBytesToMove > 0 && maxGpuAllocationsToMove > 0 && + !IsCorruptionDetectionEnabled() && + ((1u << m_MemoryTypeIndex) & m_hAllocator->GetGpuDefragmentationMemoryTypeBits()) != 0; + + // There are options to defragment this memory type. + if(canDefragmentOnCpu || canDefragmentOnGpu) + { + bool defragmentOnGpu; + // There is only one option to defragment this memory type. + if(canDefragmentOnGpu != canDefragmentOnCpu) + { + defragmentOnGpu = canDefragmentOnGpu; + } + // Both options are available: Heuristics to choose the best one. 
+ else + { + defragmentOnGpu = (memPropFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0 || + m_hAllocator->IsIntegratedGpu(); + } + + bool overlappingMoveSupported = !defragmentOnGpu; + + if(m_hAllocator->m_UseMutex) + { + if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL) + { + if(!m_Mutex.TryLockWrite()) + { + pCtx->res = VK_ERROR_INITIALIZATION_FAILED; + return; + } + } + else + { + m_Mutex.LockWrite(); + pCtx->mutexLocked = true; + } + } + + pCtx->Begin(overlappingMoveSupported, flags); + + // Defragment. + + const VkDeviceSize maxBytesToMove = defragmentOnGpu ? maxGpuBytesToMove : maxCpuBytesToMove; + const uint32_t maxAllocationsToMove = defragmentOnGpu ? maxGpuAllocationsToMove : maxCpuAllocationsToMove; + pCtx->res = pCtx->GetAlgorithm()->Defragment(pCtx->defragmentationMoves, maxBytesToMove, maxAllocationsToMove, flags); + + // Accumulate statistics. + if(pStats != VMA_NULL) + { + const VkDeviceSize bytesMoved = pCtx->GetAlgorithm()->GetBytesMoved(); + const uint32_t allocationsMoved = pCtx->GetAlgorithm()->GetAllocationsMoved(); + pStats->bytesMoved += bytesMoved; + pStats->allocationsMoved += allocationsMoved; + VMA_ASSERT(bytesMoved <= maxBytesToMove); + VMA_ASSERT(allocationsMoved <= maxAllocationsToMove); + if(defragmentOnGpu) + { + maxGpuBytesToMove -= bytesMoved; + maxGpuAllocationsToMove -= allocationsMoved; + } + else + { + maxCpuBytesToMove -= bytesMoved; + maxCpuAllocationsToMove -= allocationsMoved; + } + } + + if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL) + { + if(m_hAllocator->m_UseMutex) + m_Mutex.UnlockWrite(); + + if(pCtx->res >= VK_SUCCESS && !pCtx->defragmentationMoves.empty()) + pCtx->res = VK_NOT_READY; + + return; + } + + if(pCtx->res >= VK_SUCCESS) + { + if(defragmentOnGpu) + { + ApplyDefragmentationMovesGpu(pCtx, pCtx->defragmentationMoves, commandBuffer); + } + else + { + ApplyDefragmentationMovesCpu(pCtx, pCtx->defragmentationMoves); + } + } + } +} + +void VmaBlockVector::DefragmentationEnd( + class 
VmaBlockVectorDefragmentationContext* pCtx, + uint32_t flags, + VmaDefragmentationStats* pStats) +{ + if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL && m_hAllocator->m_UseMutex) + { + VMA_ASSERT(pCtx->mutexLocked == false); + + // Incremental defragmentation doesn't hold the lock, so when we enter here we don't actually have any + // lock protecting us. Since we mutate state here, we have to take the lock out now + m_Mutex.LockWrite(); + pCtx->mutexLocked = true; + } + + // If the mutex isn't locked we didn't do any work and there is nothing to delete. + if(pCtx->mutexLocked || !m_hAllocator->m_UseMutex) + { + // Destroy buffers. + for(size_t blockIndex = pCtx->blockContexts.size(); blockIndex--;) + { + VmaBlockDefragmentationContext &blockCtx = pCtx->blockContexts[blockIndex]; + if(blockCtx.hBuffer) + { + (*m_hAllocator->GetVulkanFunctions().vkDestroyBuffer)(m_hAllocator->m_hDevice, blockCtx.hBuffer, m_hAllocator->GetAllocationCallbacks()); + } + } + + if(pCtx->res >= VK_SUCCESS) + { + FreeEmptyBlocks(pStats); + } + } + + if(pCtx->mutexLocked) + { + VMA_ASSERT(m_hAllocator->m_UseMutex); + m_Mutex.UnlockWrite(); + } +} + +uint32_t VmaBlockVector::ProcessDefragmentations( + class VmaBlockVectorDefragmentationContext *pCtx, + VmaDefragmentationPassMoveInfo* pMove, uint32_t maxMoves) +{ + VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); + + const uint32_t moveCount = VMA_MIN(uint32_t(pCtx->defragmentationMoves.size()) - pCtx->defragmentationMovesProcessed, maxMoves); + + for(uint32_t i = 0; i < moveCount; ++ i) + { + VmaDefragmentationMove& move = pCtx->defragmentationMoves[pCtx->defragmentationMovesProcessed + i]; + + pMove->allocation = move.hAllocation; + pMove->memory = move.pDstBlock->GetDeviceMemory(); + pMove->offset = move.dstOffset; + + ++ pMove; + } + + pCtx->defragmentationMovesProcessed += moveCount; + + return moveCount; +} + +void VmaBlockVector::CommitDefragmentations( + class VmaBlockVectorDefragmentationContext *pCtx, + 
VmaDefragmentationStats* pStats) +{ + VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); + + for(uint32_t i = pCtx->defragmentationMovesCommitted; i < pCtx->defragmentationMovesProcessed; ++ i) + { + const VmaDefragmentationMove &move = pCtx->defragmentationMoves[i]; + + move.pSrcBlock->m_pMetadata->FreeAtOffset(move.srcOffset); + move.hAllocation->ChangeBlockAllocation(m_hAllocator, move.pDstBlock, move.dstOffset); + } + + pCtx->defragmentationMovesCommitted = pCtx->defragmentationMovesProcessed; + FreeEmptyBlocks(pStats); +} + +size_t VmaBlockVector::CalcAllocationCount() const +{ + size_t result = 0; + for (const auto i : c10::irange(m_Blocks.size())) { + result += m_Blocks[i]->m_pMetadata->GetAllocationCount(); + } + return result; +} + +bool VmaBlockVector::IsBufferImageGranularityConflictPossible() const +{ + if(m_BufferImageGranularity == 1) + { + return false; + } + VmaSuballocationType lastSuballocType = VMA_SUBALLOCATION_TYPE_FREE; + for(size_t i = 0, count = m_Blocks.size(); i < count; ++i) + { + VmaDeviceMemoryBlock* const pBlock = m_Blocks[i]; + VMA_ASSERT(m_Algorithm == 0); + VmaBlockMetadata_Generic* const pMetadata = (VmaBlockMetadata_Generic*)pBlock->m_pMetadata; + if(pMetadata->IsBufferImageGranularityConflictPossible(m_BufferImageGranularity, lastSuballocType)) + { + return true; + } + } + return false; +} + +void VmaBlockVector::MakePoolAllocationsLost( + uint32_t currentFrameIndex, + size_t* pLostAllocationCount) +{ + VmaMutexLockWrite lock(m_Mutex, m_hAllocator->m_UseMutex); + size_t lostAllocationCount = 0; + for (const auto blockIndex : c10::irange(m_Blocks.size())) { + VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pBlock); + lostAllocationCount += pBlock->m_pMetadata->MakeAllocationsLost(currentFrameIndex, m_FrameInUseCount); + } + if(pLostAllocationCount != VMA_NULL) + { + *pLostAllocationCount = lostAllocationCount; + } +} + +VkResult VmaBlockVector::CheckCorruption() +{ + 
if(!IsCorruptionDetectionEnabled()) + { + return VK_ERROR_FEATURE_NOT_PRESENT; + } + + VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); + for (const auto blockIndex : c10::irange(m_Blocks.size())) { + VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pBlock); + VkResult res = pBlock->CheckCorruption(m_hAllocator); + if(res != VK_SUCCESS) + { + return res; + } + } + return VK_SUCCESS; +} + +void VmaBlockVector::AddStats(VmaStats* pStats) +{ + const uint32_t memTypeIndex = m_MemoryTypeIndex; + const uint32_t memHeapIndex = m_hAllocator->MemoryTypeIndexToHeapIndex(memTypeIndex); + + VmaMutexLockRead lock(m_Mutex, m_hAllocator->m_UseMutex); + + for (const auto blockIndex : c10::irange(m_Blocks.size())) { + const VmaDeviceMemoryBlock* const pBlock = m_Blocks[blockIndex]; + VMA_ASSERT(pBlock); + VMA_HEAVY_ASSERT(pBlock->Validate()); + VmaStatInfo allocationStatInfo; + pBlock->m_pMetadata->CalcAllocationStatInfo(allocationStatInfo); + VmaAddStatInfo(pStats->total, allocationStatInfo); + VmaAddStatInfo(pStats->memoryType[memTypeIndex], allocationStatInfo); + VmaAddStatInfo(pStats->memoryHeap[memHeapIndex], allocationStatInfo); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaDefragmentationAlgorithm_Generic members definition + +VmaDefragmentationAlgorithm_Generic::VmaDefragmentationAlgorithm_Generic( + VmaAllocator hAllocator, + VmaBlockVector* pBlockVector, + uint32_t currentFrameIndex, + bool overlappingMoveSupported) : + VmaDefragmentationAlgorithm(hAllocator, pBlockVector, currentFrameIndex), + m_AllocationCount(0), + m_AllAllocations(false), + m_BytesMoved(0), + m_AllocationsMoved(0), + m_Blocks(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) +{ + // Create block info for each block. 
+ const size_t blockCount = m_pBlockVector->m_Blocks.size(); + for (const auto blockIndex : c10::irange(blockCount)) { + BlockInfo* pBlockInfo = vma_new(m_hAllocator, BlockInfo)(m_hAllocator->GetAllocationCallbacks()); + pBlockInfo->m_OriginalBlockIndex = blockIndex; + pBlockInfo->m_pBlock = m_pBlockVector->m_Blocks[blockIndex]; + m_Blocks.push_back(pBlockInfo); + } + + // Sort them by m_pBlock pointer value. + VMA_SORT(m_Blocks.begin(), m_Blocks.end(), BlockPointerLess()); +} + +VmaDefragmentationAlgorithm_Generic::~VmaDefragmentationAlgorithm_Generic() +{ + for(size_t i = m_Blocks.size(); i--; ) + { + vma_delete(m_hAllocator, m_Blocks[i]); + } +} + +void VmaDefragmentationAlgorithm_Generic::AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) +{ + // Now as we are inside VmaBlockVector::m_Mutex, we can make final check if this allocation was not lost. + if(hAlloc->GetLastUseFrameIndex() != VMA_FRAME_INDEX_LOST) + { + VmaDeviceMemoryBlock* pBlock = hAlloc->GetBlock(); + BlockInfoVector::iterator it = VmaBinaryFindFirstNotLess(m_Blocks.begin(), m_Blocks.end(), pBlock, BlockPointerLess()); + if(it != m_Blocks.end() && (*it)->m_pBlock == pBlock) + { + AllocationInfo allocInfo = AllocationInfo(hAlloc, pChanged); + (*it)->m_Allocations.push_back(allocInfo); + } + else + { + VMA_ASSERT(0); + } + + ++m_AllocationCount; + } +} + +VkResult VmaDefragmentationAlgorithm_Generic::DefragmentRound( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + bool freeOldAllocations) +{ + if(m_Blocks.empty()) + { + return VK_SUCCESS; + } + + // This is a choice based on research. 
+ // Option 1: + uint32_t strategy = VMA_ALLOCATION_CREATE_STRATEGY_MIN_TIME_BIT; + // Option 2: + //uint32_t strategy = VMA_ALLOCATION_CREATE_STRATEGY_MIN_MEMORY_BIT; + // Option 3: + //uint32_t strategy = VMA_ALLOCATION_CREATE_STRATEGY_MIN_FRAGMENTATION_BIT; + + size_t srcBlockMinIndex = 0; + // When FAST_ALGORITHM, move allocations from only last out of blocks that contain non-movable allocations. + /* + if(m_AlgorithmFlags & VMA_DEFRAGMENTATION_FAST_ALGORITHM_BIT) + { + const size_t blocksWithNonMovableCount = CalcBlocksWithNonMovableCount(); + if(blocksWithNonMovableCount > 0) + { + srcBlockMinIndex = blocksWithNonMovableCount - 1; + } + } + */ + + size_t srcBlockIndex = m_Blocks.size() - 1; + size_t srcAllocIndex = SIZE_MAX; + for(;;) + { + // 1. Find next allocation to move. + // 1.1. Start from last to first m_Blocks - they are sorted from most "destination" to most "source". + // 1.2. Then start from last to first m_Allocations. + while(srcAllocIndex >= m_Blocks[srcBlockIndex]->m_Allocations.size()) + { + if(m_Blocks[srcBlockIndex]->m_Allocations.empty()) + { + // Finished: no more allocations to process. + if(srcBlockIndex == srcBlockMinIndex) + { + return VK_SUCCESS; + } + else + { + --srcBlockIndex; + srcAllocIndex = SIZE_MAX; + } + } + else + { + srcAllocIndex = m_Blocks[srcBlockIndex]->m_Allocations.size() - 1; + } + } + + BlockInfo* pSrcBlockInfo = m_Blocks[srcBlockIndex]; + AllocationInfo& allocInfo = pSrcBlockInfo->m_Allocations[srcAllocIndex]; + + const VkDeviceSize size = allocInfo.m_hAllocation->GetSize(); + const VkDeviceSize srcOffset = allocInfo.m_hAllocation->GetOffset(); + const VkDeviceSize alignment = allocInfo.m_hAllocation->GetAlignment(); + const VmaSuballocationType suballocType = allocInfo.m_hAllocation->GetSuballocationType(); + + // 2. Try to find new place for this allocation in preceding or current block. 
+ for(size_t dstBlockIndex = 0; dstBlockIndex <= srcBlockIndex; ++dstBlockIndex) + { + BlockInfo* pDstBlockInfo = m_Blocks[dstBlockIndex]; + VmaAllocationRequest dstAllocRequest; + if(pDstBlockInfo->m_pBlock->m_pMetadata->CreateAllocationRequest( + m_CurrentFrameIndex, + m_pBlockVector->GetFrameInUseCount(), + m_pBlockVector->GetBufferImageGranularity(), + size, + alignment, + false, // upperAddress + suballocType, + false, // canMakeOtherLost + strategy, + &dstAllocRequest) && + MoveMakesSense( + dstBlockIndex, dstAllocRequest.offset, srcBlockIndex, srcOffset)) + { + VMA_ASSERT(dstAllocRequest.itemsToMakeLostCount == 0); + + // Reached limit on number of allocations or bytes to move. + if((m_AllocationsMoved + 1 > maxAllocationsToMove) || + (m_BytesMoved + size > maxBytesToMove)) + { + return VK_SUCCESS; + } + + VmaDefragmentationMove move = {}; + move.srcBlockIndex = pSrcBlockInfo->m_OriginalBlockIndex; + move.dstBlockIndex = pDstBlockInfo->m_OriginalBlockIndex; + move.srcOffset = srcOffset; + move.dstOffset = dstAllocRequest.offset; + move.size = size; + move.hAllocation = allocInfo.m_hAllocation; + move.pSrcBlock = pSrcBlockInfo->m_pBlock; + move.pDstBlock = pDstBlockInfo->m_pBlock; + + moves.push_back(move); + + pDstBlockInfo->m_pBlock->m_pMetadata->Alloc( + dstAllocRequest, + suballocType, + size, + allocInfo.m_hAllocation); + + if(freeOldAllocations) + { + pSrcBlockInfo->m_pBlock->m_pMetadata->FreeAtOffset(srcOffset); + allocInfo.m_hAllocation->ChangeBlockAllocation(m_hAllocator, pDstBlockInfo->m_pBlock, dstAllocRequest.offset); + } + + if(allocInfo.m_pChanged != VMA_NULL) + { + *allocInfo.m_pChanged = VK_TRUE; + } + + ++m_AllocationsMoved; + m_BytesMoved += size; + + VmaVectorRemove(pSrcBlockInfo->m_Allocations, srcAllocIndex); + + break; + } + } + + // If not processed, this allocInfo remains in pBlockInfo->m_Allocations for next round. 
+ + if(srcAllocIndex > 0) + { + --srcAllocIndex; + } + else + { + if(srcBlockIndex > 0) + { + --srcBlockIndex; + srcAllocIndex = SIZE_MAX; + } + else + { + return VK_SUCCESS; + } + } + } +} + +size_t VmaDefragmentationAlgorithm_Generic::CalcBlocksWithNonMovableCount() const +{ + size_t result = 0; + for (const auto i : c10::irange(m_Blocks.size())) { + if(m_Blocks[i]->m_HasNonMovableAllocations) + { + ++result; + } + } + return result; +} + +VkResult VmaDefragmentationAlgorithm_Generic::Defragment( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + VmaDefragmentationFlags flags) +{ + if(!m_AllAllocations && m_AllocationCount == 0) + { + return VK_SUCCESS; + } + + const size_t blockCount = m_Blocks.size(); + for (const auto blockIndex : c10::irange(blockCount)) { + BlockInfo* pBlockInfo = m_Blocks[blockIndex]; + + if(m_AllAllocations) + { + VmaBlockMetadata_Generic* pMetadata = (VmaBlockMetadata_Generic*)pBlockInfo->m_pBlock->m_pMetadata; + for(VmaSuballocationList::const_iterator it = pMetadata->m_Suballocations.begin(); + it != pMetadata->m_Suballocations.end(); + ++it) + { + if(it->type != VMA_SUBALLOCATION_TYPE_FREE) + { + AllocationInfo allocInfo = AllocationInfo(it->hAllocation, VMA_NULL); + pBlockInfo->m_Allocations.push_back(allocInfo); + } + } + } + + pBlockInfo->CalcHasNonMovableAllocations(); + + // This is a choice based on research. + // Option 1: + pBlockInfo->SortAllocationsByOffsetDescending(); + // Option 2: + //pBlockInfo->SortAllocationsBySizeDescending(); + } + + // Sort m_Blocks this time by the main criterium, from most "destination" to most "source" blocks. + VMA_SORT(m_Blocks.begin(), m_Blocks.end(), BlockInfoCompareMoveDestination()); + + // This is a choice based on research. + const uint32_t roundCount = 2; + + // Execute defragmentation rounds (the main part). 
+ VkResult result = VK_SUCCESS; + for(uint32_t round = 0; (round < roundCount) && (result == VK_SUCCESS); ++round) + { + result = DefragmentRound(moves, maxBytesToMove, maxAllocationsToMove, !(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)); + } + + return result; +} + +bool VmaDefragmentationAlgorithm_Generic::MoveMakesSense( + size_t dstBlockIndex, VkDeviceSize dstOffset, + size_t srcBlockIndex, VkDeviceSize srcOffset) +{ + if(dstBlockIndex < srcBlockIndex) + { + return true; + } + if(dstBlockIndex > srcBlockIndex) + { + return false; + } + if(dstOffset < srcOffset) + { + return true; + } + return false; +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaDefragmentationAlgorithm_Fast + +VmaDefragmentationAlgorithm_Fast::VmaDefragmentationAlgorithm_Fast( + VmaAllocator hAllocator, + VmaBlockVector* pBlockVector, + uint32_t currentFrameIndex, + bool overlappingMoveSupported) : + VmaDefragmentationAlgorithm(hAllocator, pBlockVector, currentFrameIndex), + m_OverlappingMoveSupported(overlappingMoveSupported), + m_AllocationCount(0), + m_AllAllocations(false), + m_BytesMoved(0), + m_AllocationsMoved(0), + m_BlockInfos(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) +{ + VMA_ASSERT(VMA_DEBUG_MARGIN == 0); + +} + +VmaDefragmentationAlgorithm_Fast::~VmaDefragmentationAlgorithm_Fast() +{ +} + +VkResult VmaDefragmentationAlgorithm_Fast::Defragment( + VmaVector< VmaDefragmentationMove, VmaStlAllocator >& moves, + VkDeviceSize maxBytesToMove, + uint32_t maxAllocationsToMove, + VmaDefragmentationFlags flags) +{ + VMA_ASSERT(m_AllAllocations || m_pBlockVector->CalcAllocationCount() == m_AllocationCount); + + const size_t blockCount = m_pBlockVector->GetBlockCount(); + if(blockCount == 0 || maxBytesToMove == 0 || maxAllocationsToMove == 0) + { + return VK_SUCCESS; + } + + PreprocessMetadata(); + + // Sort blocks in order from most destination. 
+ + m_BlockInfos.resize(blockCount); + for (const auto i : c10::irange(blockCount)) { + m_BlockInfos[i].origBlockIndex = i; + } + + VMA_SORT(m_BlockInfos.begin(), m_BlockInfos.end(), [this](const BlockInfo& lhs, const BlockInfo& rhs) -> bool { + return m_pBlockVector->GetBlock(lhs.origBlockIndex)->m_pMetadata->GetSumFreeSize() < + m_pBlockVector->GetBlock(rhs.origBlockIndex)->m_pMetadata->GetSumFreeSize(); + }); + + // THE MAIN ALGORITHM + + FreeSpaceDatabase freeSpaceDb; + + size_t dstBlockInfoIndex = 0; + size_t dstOrigBlockIndex = m_BlockInfos[dstBlockInfoIndex].origBlockIndex; + VmaDeviceMemoryBlock* pDstBlock = m_pBlockVector->GetBlock(dstOrigBlockIndex); + VmaBlockMetadata_Generic* pDstMetadata = (VmaBlockMetadata_Generic*)pDstBlock->m_pMetadata; + VkDeviceSize dstBlockSize = pDstMetadata->GetSize(); + VkDeviceSize dstOffset = 0; + + bool end = false; + for(size_t srcBlockInfoIndex = 0; !end && srcBlockInfoIndex < blockCount; ++srcBlockInfoIndex) + { + const size_t srcOrigBlockIndex = m_BlockInfos[srcBlockInfoIndex].origBlockIndex; + VmaDeviceMemoryBlock* const pSrcBlock = m_pBlockVector->GetBlock(srcOrigBlockIndex); + VmaBlockMetadata_Generic* const pSrcMetadata = (VmaBlockMetadata_Generic*)pSrcBlock->m_pMetadata; + for(VmaSuballocationList::iterator srcSuballocIt = pSrcMetadata->m_Suballocations.begin(); + !end && srcSuballocIt != pSrcMetadata->m_Suballocations.end(); ) + { + VmaAllocation_T* const pAlloc = srcSuballocIt->hAllocation; + const VkDeviceSize srcAllocAlignment = pAlloc->GetAlignment(); + const VkDeviceSize srcAllocSize = srcSuballocIt->size; + if(m_AllocationsMoved == maxAllocationsToMove || + m_BytesMoved + srcAllocSize > maxBytesToMove) + { + end = true; + break; + } + const VkDeviceSize srcAllocOffset = srcSuballocIt->offset; + + VmaDefragmentationMove move = {}; + // Try to place it in one of free spaces from the database. 
+ size_t freeSpaceInfoIndex; + VkDeviceSize dstAllocOffset; + if(freeSpaceDb.Fetch(srcAllocAlignment, srcAllocSize, + freeSpaceInfoIndex, dstAllocOffset)) + { + size_t freeSpaceOrigBlockIndex = m_BlockInfos[freeSpaceInfoIndex].origBlockIndex; + VmaDeviceMemoryBlock* pFreeSpaceBlock = m_pBlockVector->GetBlock(freeSpaceOrigBlockIndex); + VmaBlockMetadata_Generic* pFreeSpaceMetadata = (VmaBlockMetadata_Generic*)pFreeSpaceBlock->m_pMetadata; + + // Same block + if(freeSpaceInfoIndex == srcBlockInfoIndex) + { + VMA_ASSERT(dstAllocOffset <= srcAllocOffset); + + // MOVE OPTION 1: Move the allocation inside the same block by decreasing offset. + + VmaSuballocation suballoc = *srcSuballocIt; + suballoc.offset = dstAllocOffset; + suballoc.hAllocation->ChangeOffset(dstAllocOffset); + m_BytesMoved += srcAllocSize; + ++m_AllocationsMoved; + + VmaSuballocationList::iterator nextSuballocIt = srcSuballocIt; + ++nextSuballocIt; + pSrcMetadata->m_Suballocations.erase(srcSuballocIt); + srcSuballocIt = nextSuballocIt; + + InsertSuballoc(pFreeSpaceMetadata, suballoc); + + move.srcBlockIndex = srcOrigBlockIndex; + move.dstBlockIndex = freeSpaceOrigBlockIndex; + move.srcOffset = srcAllocOffset; + move.dstOffset = dstAllocOffset; + move.size = srcAllocSize; + + moves.push_back(move); + } + // Different block + else + { + // MOVE OPTION 2: Move the allocation to a different block. 
+ + VMA_ASSERT(freeSpaceInfoIndex < srcBlockInfoIndex); + + VmaSuballocation suballoc = *srcSuballocIt; + suballoc.offset = dstAllocOffset; + suballoc.hAllocation->ChangeBlockAllocation(m_hAllocator, pFreeSpaceBlock, dstAllocOffset); + m_BytesMoved += srcAllocSize; + ++m_AllocationsMoved; + + VmaSuballocationList::iterator nextSuballocIt = srcSuballocIt; + ++nextSuballocIt; + pSrcMetadata->m_Suballocations.erase(srcSuballocIt); + srcSuballocIt = nextSuballocIt; + + InsertSuballoc(pFreeSpaceMetadata, suballoc); + + move.srcBlockIndex = srcOrigBlockIndex; + move.dstBlockIndex = freeSpaceOrigBlockIndex; + move.srcOffset = srcAllocOffset; + move.dstOffset = dstAllocOffset; + move.size = srcAllocSize; + + moves.push_back(move); + } + } + else + { + dstAllocOffset = VmaAlignUp(dstOffset, srcAllocAlignment); + + // If the allocation doesn't fit before the end of dstBlock, forward to next block. + while(dstBlockInfoIndex < srcBlockInfoIndex && + dstAllocOffset + srcAllocSize > dstBlockSize) + { + // But before that, register remaining free space at the end of dst block. + freeSpaceDb.Register(dstBlockInfoIndex, dstOffset, dstBlockSize - dstOffset); + + ++dstBlockInfoIndex; + dstOrigBlockIndex = m_BlockInfos[dstBlockInfoIndex].origBlockIndex; + pDstBlock = m_pBlockVector->GetBlock(dstOrigBlockIndex); + pDstMetadata = (VmaBlockMetadata_Generic*)pDstBlock->m_pMetadata; + dstBlockSize = pDstMetadata->GetSize(); + dstOffset = 0; + dstAllocOffset = 0; + } + + // Same block + if(dstBlockInfoIndex == srcBlockInfoIndex) + { + VMA_ASSERT(dstAllocOffset <= srcAllocOffset); + + const bool overlap = dstAllocOffset + srcAllocSize > srcAllocOffset; + + bool skipOver = overlap; + if(overlap && m_OverlappingMoveSupported && dstAllocOffset < srcAllocOffset) + { + // If destination and source place overlap, skip if it would move it + // by only < 1/64 of its size. 
+ skipOver = (srcAllocOffset - dstAllocOffset) * 64 < srcAllocSize; + } + + if(skipOver) + { + freeSpaceDb.Register(dstBlockInfoIndex, dstOffset, srcAllocOffset - dstOffset); + + dstOffset = srcAllocOffset + srcAllocSize; + ++srcSuballocIt; + } + // MOVE OPTION 1: Move the allocation inside the same block by decreasing offset. + else + { + srcSuballocIt->offset = dstAllocOffset; + srcSuballocIt->hAllocation->ChangeOffset(dstAllocOffset); + dstOffset = dstAllocOffset + srcAllocSize; + m_BytesMoved += srcAllocSize; + ++m_AllocationsMoved; + ++srcSuballocIt; + + move.srcBlockIndex = srcOrigBlockIndex; + move.dstBlockIndex = dstOrigBlockIndex; + move.srcOffset = srcAllocOffset; + move.dstOffset = dstAllocOffset; + move.size = srcAllocSize; + + moves.push_back(move); + } + } + // Different block + else + { + // MOVE OPTION 2: Move the allocation to a different block. + + VMA_ASSERT(dstBlockInfoIndex < srcBlockInfoIndex); + VMA_ASSERT(dstAllocOffset + srcAllocSize <= dstBlockSize); + + VmaSuballocation suballoc = *srcSuballocIt; + suballoc.offset = dstAllocOffset; + suballoc.hAllocation->ChangeBlockAllocation(m_hAllocator, pDstBlock, dstAllocOffset); + dstOffset = dstAllocOffset + srcAllocSize; + m_BytesMoved += srcAllocSize; + ++m_AllocationsMoved; + + VmaSuballocationList::iterator nextSuballocIt = srcSuballocIt; + ++nextSuballocIt; + pSrcMetadata->m_Suballocations.erase(srcSuballocIt); + srcSuballocIt = nextSuballocIt; + + pDstMetadata->m_Suballocations.push_back(suballoc); + + move.srcBlockIndex = srcOrigBlockIndex; + move.dstBlockIndex = dstOrigBlockIndex; + move.srcOffset = srcAllocOffset; + move.dstOffset = dstAllocOffset; + move.size = srcAllocSize; + + moves.push_back(move); + } + } + } + } + + m_BlockInfos.clear(); + + PostprocessMetadata(); + + return VK_SUCCESS; +} + +void VmaDefragmentationAlgorithm_Fast::PreprocessMetadata() +{ + const size_t blockCount = m_pBlockVector->GetBlockCount(); + for (const auto blockIndex : c10::irange(blockCount)) { + 
VmaBlockMetadata_Generic* const pMetadata = + (VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata; + pMetadata->m_FreeCount = 0; + pMetadata->m_SumFreeSize = pMetadata->GetSize(); + pMetadata->m_FreeSuballocationsBySize.clear(); + for(VmaSuballocationList::iterator it = pMetadata->m_Suballocations.begin(); + it != pMetadata->m_Suballocations.end(); ) + { + if(it->type == VMA_SUBALLOCATION_TYPE_FREE) + { + VmaSuballocationList::iterator nextIt = it; + ++nextIt; + pMetadata->m_Suballocations.erase(it); + it = nextIt; + } + else + { + ++it; + } + } + } +} + +void VmaDefragmentationAlgorithm_Fast::PostprocessMetadata() +{ + const size_t blockCount = m_pBlockVector->GetBlockCount(); + for (const auto blockIndex : c10::irange(blockCount)) { + VmaBlockMetadata_Generic* const pMetadata = + (VmaBlockMetadata_Generic*)m_pBlockVector->GetBlock(blockIndex)->m_pMetadata; + const VkDeviceSize blockSize = pMetadata->GetSize(); + + // No allocations in this block - entire area is free. + if(pMetadata->m_Suballocations.empty()) + { + pMetadata->m_FreeCount = 1; + //pMetadata->m_SumFreeSize is already set to blockSize. + VmaSuballocation suballoc = { + 0, // offset + blockSize, // size + VMA_NULL, // hAllocation + VMA_SUBALLOCATION_TYPE_FREE }; + pMetadata->m_Suballocations.push_back(suballoc); + pMetadata->RegisterFreeSuballocation(pMetadata->m_Suballocations.begin()); + } + // There are some allocations in this block. + else + { + VkDeviceSize offset = 0; + VmaSuballocationList::iterator it; + for(it = pMetadata->m_Suballocations.begin(); + it != pMetadata->m_Suballocations.end(); + ++it) + { + VMA_ASSERT(it->type != VMA_SUBALLOCATION_TYPE_FREE); + VMA_ASSERT(it->offset >= offset); + + // Need to insert preceding free space. 
+ if(it->offset > offset) + { + ++pMetadata->m_FreeCount; + const VkDeviceSize freeSize = it->offset - offset; + VmaSuballocation suballoc = { + offset, // offset + freeSize, // size + VMA_NULL, // hAllocation + VMA_SUBALLOCATION_TYPE_FREE }; + VmaSuballocationList::iterator precedingFreeIt = pMetadata->m_Suballocations.insert(it, suballoc); + if(freeSize >= VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + pMetadata->m_FreeSuballocationsBySize.push_back(precedingFreeIt); + } + } + + pMetadata->m_SumFreeSize -= it->size; + offset = it->offset + it->size; + } + + // Need to insert trailing free space. + if(offset < blockSize) + { + ++pMetadata->m_FreeCount; + const VkDeviceSize freeSize = blockSize - offset; + VmaSuballocation suballoc = { + offset, // offset + freeSize, // size + VMA_NULL, // hAllocation + VMA_SUBALLOCATION_TYPE_FREE }; + VMA_ASSERT(it == pMetadata->m_Suballocations.end()); + VmaSuballocationList::iterator trailingFreeIt = pMetadata->m_Suballocations.insert(it, suballoc); + if(freeSize > VMA_MIN_FREE_SUBALLOCATION_SIZE_TO_REGISTER) + { + pMetadata->m_FreeSuballocationsBySize.push_back(trailingFreeIt); + } + } + + VMA_SORT( + pMetadata->m_FreeSuballocationsBySize.begin(), + pMetadata->m_FreeSuballocationsBySize.end(), + VmaSuballocationItemSizeLess()); + } + + VMA_HEAVY_ASSERT(pMetadata->Validate()); + } +} + +void VmaDefragmentationAlgorithm_Fast::InsertSuballoc(VmaBlockMetadata_Generic* pMetadata, const VmaSuballocation& suballoc) +{ + // TODO: Optimize somehow. Remember iterator instead of searching for it linearly. 
+ VmaSuballocationList::iterator it = pMetadata->m_Suballocations.begin(); + while(it != pMetadata->m_Suballocations.end()) + { + if(it->offset < suballoc.offset) + { + ++it; + } + } + pMetadata->m_Suballocations.insert(it, suballoc); +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaBlockVectorDefragmentationContext + +VmaBlockVectorDefragmentationContext::VmaBlockVectorDefragmentationContext( + VmaAllocator hAllocator, + VmaPool hCustomPool, + VmaBlockVector* pBlockVector, + uint32_t currFrameIndex) : + res(VK_SUCCESS), + mutexLocked(false), + blockContexts(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), + defragmentationMoves(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), + defragmentationMovesProcessed(0), + defragmentationMovesCommitted(0), + hasDefragmentationPlan(0), + m_hAllocator(hAllocator), + m_hCustomPool(hCustomPool), + m_pBlockVector(pBlockVector), + m_CurrFrameIndex(currFrameIndex), + m_pAlgorithm(VMA_NULL), + m_Allocations(VmaStlAllocator(hAllocator->GetAllocationCallbacks())), + m_AllAllocations(false) +{ +} + +VmaBlockVectorDefragmentationContext::~VmaBlockVectorDefragmentationContext() +{ + vma_delete(m_hAllocator, m_pAlgorithm); +} + +void VmaBlockVectorDefragmentationContext::AddAllocation(VmaAllocation hAlloc, VkBool32* pChanged) +{ + AllocInfo info = { hAlloc, pChanged }; + m_Allocations.push_back(info); +} + +void VmaBlockVectorDefragmentationContext::Begin(bool overlappingMoveSupported, VmaDefragmentationFlags flags) +{ + const bool allAllocations = m_AllAllocations || + m_Allocations.size() == m_pBlockVector->CalcAllocationCount(); + + /******************************** + HERE IS THE CHOICE OF DEFRAGMENTATION ALGORITHM. + ********************************/ + + /* + Fast algorithm is supported only when certain criteria are met: + - VMA_DEBUG_MARGIN is 0. + - All allocations in this block vector are moveable. + - There is no possibility of image/buffer granularity conflict. 
+ - The defragmentation is not incremental + */ + if(VMA_DEBUG_MARGIN == 0 && + allAllocations && + !m_pBlockVector->IsBufferImageGranularityConflictPossible() && + !(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL)) + { + m_pAlgorithm = vma_new(m_hAllocator, VmaDefragmentationAlgorithm_Fast)( + m_hAllocator, m_pBlockVector, m_CurrFrameIndex, overlappingMoveSupported); + } + else + { + m_pAlgorithm = vma_new(m_hAllocator, VmaDefragmentationAlgorithm_Generic)( + m_hAllocator, m_pBlockVector, m_CurrFrameIndex, overlappingMoveSupported); + } + + if(allAllocations) + { + m_pAlgorithm->AddAll(); + } + else + { + for(size_t i = 0, count = m_Allocations.size(); i < count; ++i) + { + m_pAlgorithm->AddAllocation(m_Allocations[i].hAlloc, m_Allocations[i].pChanged); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaDefragmentationContext + +VmaDefragmentationContext_T::VmaDefragmentationContext_T( + VmaAllocator hAllocator, + uint32_t currFrameIndex, + uint32_t flags, + VmaDefragmentationStats* pStats) : + m_hAllocator(hAllocator), + m_CurrFrameIndex(currFrameIndex), + m_Flags(flags), + m_pStats(pStats), + m_CustomPoolContexts(VmaStlAllocator(hAllocator->GetAllocationCallbacks())) +{ + memset(m_DefaultPoolContexts, 0, sizeof(m_DefaultPoolContexts)); +} + +VmaDefragmentationContext_T::~VmaDefragmentationContext_T() +{ + for(size_t i = m_CustomPoolContexts.size(); i--; ) + { + VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_CustomPoolContexts[i]; + pBlockVectorCtx->GetBlockVector()->DefragmentationEnd(pBlockVectorCtx, m_Flags, m_pStats); + vma_delete(m_hAllocator, pBlockVectorCtx); + } + for(size_t i = m_hAllocator->m_MemProps.memoryTypeCount; i--; ) + { + VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_DefaultPoolContexts[i]; + if(pBlockVectorCtx) + { + pBlockVectorCtx->GetBlockVector()->DefragmentationEnd(pBlockVectorCtx, m_Flags, m_pStats); + vma_delete(m_hAllocator, pBlockVectorCtx); + } + } +} + +void 
VmaDefragmentationContext_T::AddPools(uint32_t poolCount, const VmaPool* pPools) +{ + for (const auto poolIndex : c10::irange(poolCount)) { + VmaPool pool = pPools[poolIndex]; + VMA_ASSERT(pool); + // Pools with algorithm other than default are not defragmented. + if(pool->m_BlockVector.GetAlgorithm() == 0) + { + VmaBlockVectorDefragmentationContext* pBlockVectorDefragCtx = VMA_NULL; + + for(size_t i = m_CustomPoolContexts.size(); i--; ) + { + if(m_CustomPoolContexts[i]->GetCustomPool() == pool) + { + pBlockVectorDefragCtx = m_CustomPoolContexts[i]; + break; + } + } + + if(!pBlockVectorDefragCtx) + { + pBlockVectorDefragCtx = vma_new(m_hAllocator, VmaBlockVectorDefragmentationContext)( + m_hAllocator, + pool, + &pool->m_BlockVector, + m_CurrFrameIndex); + m_CustomPoolContexts.push_back(pBlockVectorDefragCtx); + } + + pBlockVectorDefragCtx->AddAll(); + } + } +} + +void VmaDefragmentationContext_T::AddAllocations( + uint32_t allocationCount, + const VmaAllocation* pAllocations, + VkBool32* pAllocationsChanged) +{ + // Dispatch pAllocations among defragmentators. Create them when necessary. + for (const auto allocIndex : c10::irange(allocationCount)) { + const VmaAllocation hAlloc = pAllocations[allocIndex]; + VMA_ASSERT(hAlloc); + // DedicatedAlloc cannot be defragmented. + if((hAlloc->GetType() == VmaAllocation_T::ALLOCATION_TYPE_BLOCK) && + // Lost allocation cannot be defragmented. + (hAlloc->GetLastUseFrameIndex() != VMA_FRAME_INDEX_LOST)) + { + VmaBlockVectorDefragmentationContext* pBlockVectorDefragCtx = VMA_NULL; + + const VmaPool hAllocPool = hAlloc->GetBlock()->GetParentPool(); + // This allocation belongs to custom pool. + if(hAllocPool != VK_NULL_HANDLE) + { + // Pools with algorithm other than default are not defragmented. 
+ if(hAllocPool->m_BlockVector.GetAlgorithm() == 0) + { + for(size_t i = m_CustomPoolContexts.size(); i--; ) + { + if(m_CustomPoolContexts[i]->GetCustomPool() == hAllocPool) + { + pBlockVectorDefragCtx = m_CustomPoolContexts[i]; + break; + } + } + if(!pBlockVectorDefragCtx) + { + pBlockVectorDefragCtx = vma_new(m_hAllocator, VmaBlockVectorDefragmentationContext)( + m_hAllocator, + hAllocPool, + &hAllocPool->m_BlockVector, + m_CurrFrameIndex); + m_CustomPoolContexts.push_back(pBlockVectorDefragCtx); + } + } + } + // This allocation belongs to default pool. + else + { + const uint32_t memTypeIndex = hAlloc->GetMemoryTypeIndex(); + pBlockVectorDefragCtx = m_DefaultPoolContexts[memTypeIndex]; + if(!pBlockVectorDefragCtx) + { + pBlockVectorDefragCtx = vma_new(m_hAllocator, VmaBlockVectorDefragmentationContext)( + m_hAllocator, + VMA_NULL, // hCustomPool + m_hAllocator->m_pBlockVectors[memTypeIndex], + m_CurrFrameIndex); + m_DefaultPoolContexts[memTypeIndex] = pBlockVectorDefragCtx; + } + } + + if(pBlockVectorDefragCtx) + { + VkBool32* const pChanged = (pAllocationsChanged != VMA_NULL) ? 
+ &pAllocationsChanged[allocIndex] : VMA_NULL; + pBlockVectorDefragCtx->AddAllocation(hAlloc, pChanged); + } + } + } +} + +VkResult VmaDefragmentationContext_T::Defragment( + VkDeviceSize maxCpuBytesToMove, uint32_t maxCpuAllocationsToMove, + VkDeviceSize maxGpuBytesToMove, uint32_t maxGpuAllocationsToMove, + VkCommandBuffer commandBuffer, VmaDefragmentationStats* pStats, VmaDefragmentationFlags flags) +{ + if(pStats) + { + memset(pStats, 0, sizeof(VmaDefragmentationStats)); + } + + if(flags & VMA_DEFRAGMENTATION_FLAG_INCREMENTAL) + { + // For incremental defragmetnations, we just earmark how much we can move + // The real meat is in the defragmentation steps + m_MaxCpuBytesToMove = maxCpuBytesToMove; + m_MaxCpuAllocationsToMove = maxCpuAllocationsToMove; + + m_MaxGpuBytesToMove = maxGpuBytesToMove; + m_MaxGpuAllocationsToMove = maxGpuAllocationsToMove; + + if(m_MaxCpuBytesToMove == 0 && m_MaxCpuAllocationsToMove == 0 && + m_MaxGpuBytesToMove == 0 && m_MaxGpuAllocationsToMove == 0) + return VK_SUCCESS; + + return VK_NOT_READY; + } + + if(commandBuffer == VK_NULL_HANDLE) + { + maxGpuBytesToMove = 0; + maxGpuAllocationsToMove = 0; + } + + VkResult res = VK_SUCCESS; + + // Process default pools. + for(uint32_t memTypeIndex = 0; + memTypeIndex < m_hAllocator->GetMemoryTypeCount() && res >= VK_SUCCESS; + ++memTypeIndex) + { + VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex]; + if(pBlockVectorCtx) + { + VMA_ASSERT(pBlockVectorCtx->GetBlockVector()); + pBlockVectorCtx->GetBlockVector()->Defragment( + pBlockVectorCtx, + pStats, flags, + maxCpuBytesToMove, maxCpuAllocationsToMove, + maxGpuBytesToMove, maxGpuAllocationsToMove, + commandBuffer); + if(pBlockVectorCtx->res != VK_SUCCESS) + { + res = pBlockVectorCtx->res; + } + } + } + + // Process custom pools. 
+ for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size(); + customCtxIndex < customCtxCount && res >= VK_SUCCESS; + ++customCtxIndex) + { + VmaBlockVectorDefragmentationContext* pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex]; + VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector()); + pBlockVectorCtx->GetBlockVector()->Defragment( + pBlockVectorCtx, + pStats, flags, + maxCpuBytesToMove, maxCpuAllocationsToMove, + maxGpuBytesToMove, maxGpuAllocationsToMove, + commandBuffer); + if(pBlockVectorCtx->res != VK_SUCCESS) + { + res = pBlockVectorCtx->res; + } + } + + return res; +} + +VkResult VmaDefragmentationContext_T::DefragmentPassBegin(VmaDefragmentationPassInfo* pInfo) +{ + VmaDefragmentationPassMoveInfo* pCurrentMove = pInfo->pMoves; + uint32_t movesLeft = pInfo->moveCount; + + // Process default pools. + for(uint32_t memTypeIndex = 0; + memTypeIndex < m_hAllocator->GetMemoryTypeCount(); + ++memTypeIndex) + { + VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex]; + if(pBlockVectorCtx) + { + VMA_ASSERT(pBlockVectorCtx->GetBlockVector()); + + if(!pBlockVectorCtx->hasDefragmentationPlan) + { + pBlockVectorCtx->GetBlockVector()->Defragment( + pBlockVectorCtx, + m_pStats, m_Flags, + m_MaxCpuBytesToMove, m_MaxCpuAllocationsToMove, + m_MaxGpuBytesToMove, m_MaxGpuAllocationsToMove, + VK_NULL_HANDLE); + + if(pBlockVectorCtx->res < VK_SUCCESS) + continue; + + pBlockVectorCtx->hasDefragmentationPlan = true; + } + + const uint32_t processed = pBlockVectorCtx->GetBlockVector()->ProcessDefragmentations( + pBlockVectorCtx, + pCurrentMove, movesLeft); + + movesLeft -= processed; + pCurrentMove += processed; + } + } + + // Process custom pools. 
+ for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size(); + customCtxIndex < customCtxCount; + ++customCtxIndex) + { + VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex]; + VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector()); + + if(!pBlockVectorCtx->hasDefragmentationPlan) + { + pBlockVectorCtx->GetBlockVector()->Defragment( + pBlockVectorCtx, + m_pStats, m_Flags, + m_MaxCpuBytesToMove, m_MaxCpuAllocationsToMove, + m_MaxGpuBytesToMove, m_MaxGpuAllocationsToMove, + VK_NULL_HANDLE); + + if(pBlockVectorCtx->res < VK_SUCCESS) + continue; + + pBlockVectorCtx->hasDefragmentationPlan = true; + } + + const uint32_t processed = pBlockVectorCtx->GetBlockVector()->ProcessDefragmentations( + pBlockVectorCtx, + pCurrentMove, movesLeft); + + movesLeft -= processed; + pCurrentMove += processed; + } + + pInfo->moveCount = pInfo->moveCount - movesLeft; + + return VK_SUCCESS; +} +VkResult VmaDefragmentationContext_T::DefragmentPassEnd() +{ + VkResult res = VK_SUCCESS; + + // Process default pools. + for(uint32_t memTypeIndex = 0; + memTypeIndex < m_hAllocator->GetMemoryTypeCount(); + ++memTypeIndex) + { + VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_DefaultPoolContexts[memTypeIndex]; + if(pBlockVectorCtx) + { + VMA_ASSERT(pBlockVectorCtx->GetBlockVector()); + + if(!pBlockVectorCtx->hasDefragmentationPlan) + { + res = VK_NOT_READY; + continue; + } + + pBlockVectorCtx->GetBlockVector()->CommitDefragmentations( + pBlockVectorCtx, m_pStats); + + if(pBlockVectorCtx->defragmentationMoves.size() != pBlockVectorCtx->defragmentationMovesCommitted) + res = VK_NOT_READY; + } + } + + // Process custom pools. 
+ for(size_t customCtxIndex = 0, customCtxCount = m_CustomPoolContexts.size(); + customCtxIndex < customCtxCount; + ++customCtxIndex) + { + VmaBlockVectorDefragmentationContext *pBlockVectorCtx = m_CustomPoolContexts[customCtxIndex]; + VMA_ASSERT(pBlockVectorCtx && pBlockVectorCtx->GetBlockVector()); + + if(!pBlockVectorCtx->hasDefragmentationPlan) + { + res = VK_NOT_READY; + continue; + } + + pBlockVectorCtx->GetBlockVector()->CommitDefragmentations( + pBlockVectorCtx, m_pStats); + + if(pBlockVectorCtx->defragmentationMoves.size() != pBlockVectorCtx->defragmentationMovesCommitted) + res = VK_NOT_READY; + } + + return res; +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaRecorder + +#if VMA_RECORDING_ENABLED + +VmaRecorder::VmaRecorder() : + m_UseMutex(true), + m_Flags(0), + m_File(VMA_NULL), + m_RecordingStartTime(std::chrono::high_resolution_clock::now()) +{ +} + +VkResult VmaRecorder::Init(const VmaRecordSettings& settings, bool useMutex) +{ + m_UseMutex = useMutex; + m_Flags = settings.flags; + +#if defined(_WIN32) + // Open file for writing. + errno_t err = fopen_s(&m_File, settings.pFilePath, "wb"); + + if(err != 0) + { + return VK_ERROR_INITIALIZATION_FAILED; + } +#else + // Open file for writing. + m_File = fopen(settings.pFilePath, "wb"); + + if(m_File == 0) + { + return VK_ERROR_INITIALIZATION_FAILED; + } +#endif + + // Write header. 
+ fprintf(m_File, "%s\n", "Vulkan Memory Allocator,Calls recording"); + fprintf(m_File, "%s\n", "1,8"); + + return VK_SUCCESS; +} + +VmaRecorder::~VmaRecorder() +{ + if(m_File != VMA_NULL) + { + fclose(m_File); + } +} + +void VmaRecorder::RecordCreateAllocator(uint32_t frameIndex) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaCreateAllocator\n", callParams.threadId, callParams.time, frameIndex); + Flush(); +} + +void VmaRecorder::RecordDestroyAllocator(uint32_t frameIndex) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaDestroyAllocator\n", callParams.threadId, callParams.time, frameIndex); + Flush(); +} + +void VmaRecorder::RecordCreatePool(uint32_t frameIndex, const VmaPoolCreateInfo& createInfo, VmaPool pool) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaCreatePool,%u,%u,%llu,%llu,%llu,%u,%p\n", callParams.threadId, callParams.time, frameIndex, + createInfo.memoryTypeIndex, + createInfo.flags, + createInfo.blockSize, + (uint64_t)createInfo.minBlockCount, + (uint64_t)createInfo.maxBlockCount, + createInfo.frameInUseCount, + pool); + Flush(); +} + +void VmaRecorder::RecordDestroyPool(uint32_t frameIndex, VmaPool pool) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaDestroyPool,%p\n", callParams.threadId, callParams.time, frameIndex, + pool); + Flush(); +} + +void VmaRecorder::RecordAllocateMemory(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + const VmaAllocationCreateInfo& createInfo, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr(createInfo.flags, createInfo.pUserData); + 
fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemory,%llu,%llu,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + vkMemReq.size, + vkMemReq.alignment, + vkMemReq.memoryTypeBits, + createInfo.flags, + createInfo.usage, + createInfo.requiredFlags, + createInfo.preferredFlags, + createInfo.memoryTypeBits, + createInfo.pool, + allocation, + userDataStr.GetString()); + Flush(); +} + +void VmaRecorder::RecordAllocateMemoryPages(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + const VmaAllocationCreateInfo& createInfo, + uint64_t allocationCount, + const VmaAllocation* pAllocations) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr(createInfo.flags, createInfo.pUserData); + fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemoryPages,%llu,%llu,%u,%u,%u,%u,%u,%u,%p,", callParams.threadId, callParams.time, frameIndex, + vkMemReq.size, + vkMemReq.alignment, + vkMemReq.memoryTypeBits, + createInfo.flags, + createInfo.usage, + createInfo.requiredFlags, + createInfo.preferredFlags, + createInfo.memoryTypeBits, + createInfo.pool); + PrintPointerList(allocationCount, pAllocations); + fprintf(m_File, ",%s\n", userDataStr.GetString()); + Flush(); +} + +void VmaRecorder::RecordAllocateMemoryForBuffer(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + bool requiresDedicatedAllocation, + bool prefersDedicatedAllocation, + const VmaAllocationCreateInfo& createInfo, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr(createInfo.flags, createInfo.pUserData); + fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemoryForBuffer,%llu,%llu,%u,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + vkMemReq.size, + vkMemReq.alignment, + vkMemReq.memoryTypeBits, + requiresDedicatedAllocation ? 1 : 0, + prefersDedicatedAllocation ? 
1 : 0, + createInfo.flags, + createInfo.usage, + createInfo.requiredFlags, + createInfo.preferredFlags, + createInfo.memoryTypeBits, + createInfo.pool, + allocation, + userDataStr.GetString()); + Flush(); +} + +/* Appends a vmaAllocateMemoryForImage row to the recording file: thread id, time, frame index, the VkMemoryRequirements fields, the two dedicated-allocation hints as 0/1, the VmaAllocationCreateInfo fields, the allocation handle and the user-data text; then calls Flush(). */ void VmaRecorder::RecordAllocateMemoryForImage(uint32_t frameIndex, + const VkMemoryRequirements& vkMemReq, + bool requiresDedicatedAllocation, + bool prefersDedicatedAllocation, + const VmaAllocationCreateInfo& createInfo, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr(createInfo.flags, createInfo.pUserData); + fprintf(m_File, "%u,%.3f,%u,vmaAllocateMemoryForImage,%llu,%llu,%u,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + vkMemReq.size, + vkMemReq.alignment, + vkMemReq.memoryTypeBits, + requiresDedicatedAllocation ? 1 : 0, + prefersDedicatedAllocation ? 1 : 0, + createInfo.flags, + createInfo.usage, + createInfo.requiredFlags, + createInfo.preferredFlags, + createInfo.memoryTypeBits, + createInfo.pool, + allocation, + userDataStr.GetString()); + Flush(); +} + +/* Appends a vmaFreeMemory row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordFreeMemory(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaFreeMemory,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaFreeMemoryPages row: the common prefix, then the handles written by PrintPointerList, then the newline; calls Flush() afterwards. */ void VmaRecorder::RecordFreeMemoryPages(uint32_t frameIndex, + uint64_t allocationCount, + const VmaAllocation* pAllocations) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaFreeMemoryPages,", callParams.threadId, callParams.time, frameIndex); + PrintPointerList(allocationCount, pAllocations); + fprintf(m_File, "\n"); + Flush(); +} + +/* Appends a vmaSetAllocationUserData row. The copy-string flag handed to UserDataString is derived from allocation->IsUserDataString(), so pUserData is logged as text only for allocations that hold string user data. */ void VmaRecorder::RecordSetAllocationUserData(uint32_t frameIndex, + VmaAllocation allocation, + const void* 
pUserData) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr( + allocation->IsUserDataString() ? VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT : 0, + pUserData); + fprintf(m_File, "%u,%.3f,%u,vmaSetAllocationUserData,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + allocation, + userDataStr.GetString()); + Flush(); +} + +/* Appends a vmaCreateLostAllocation row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordCreateLostAllocation(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaCreateLostAllocation,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaMapMemory row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordMapMemory(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaMapMemory,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaUnmapMemory row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordUnmapMemory(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaUnmapMemory,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaFlushAllocation row: allocation handle followed by offset and size, then calls Flush(). */ void VmaRecorder::RecordFlushAllocation(uint32_t frameIndex, + VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaFlushAllocation,%p,%llu,%llu\n", callParams.threadId, callParams.time, frameIndex, + allocation, + offset, + size); + Flush(); +} + +/* Appends a vmaInvalidateAllocation row: allocation handle followed by offset and size, then calls Flush(). */ void VmaRecorder::RecordInvalidateAllocation(uint32_t frameIndex, + VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) +{ + CallParams callParams; + 
GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaInvalidateAllocation,%p,%llu,%llu\n", callParams.threadId, callParams.time, frameIndex, + allocation, + offset, + size); + Flush(); +} + +/* Appends a vmaCreateBuffer row: four VkBufferCreateInfo fields (flags, size, usage, sharingMode), the VmaAllocationCreateInfo fields, the allocation handle and the user-data text; then calls Flush(). */ void VmaRecorder::RecordCreateBuffer(uint32_t frameIndex, + const VkBufferCreateInfo& bufCreateInfo, + const VmaAllocationCreateInfo& allocCreateInfo, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr(allocCreateInfo.flags, allocCreateInfo.pUserData); + fprintf(m_File, "%u,%.3f,%u,vmaCreateBuffer,%u,%llu,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + bufCreateInfo.flags, + bufCreateInfo.size, + bufCreateInfo.usage, + bufCreateInfo.sharingMode, + allocCreateInfo.flags, + allocCreateInfo.usage, + allocCreateInfo.requiredFlags, + allocCreateInfo.preferredFlags, + allocCreateInfo.memoryTypeBits, + allocCreateInfo.pool, + allocation, + userDataStr.GetString()); + Flush(); +} + +/* Appends a vmaCreateImage row: thirteen VkImageCreateInfo values (flags through initialLayout, with the extent split into width, height, depth), the VmaAllocationCreateInfo fields, the allocation handle and the user-data text; then calls Flush(). */ void VmaRecorder::RecordCreateImage(uint32_t frameIndex, + const VkImageCreateInfo& imageCreateInfo, + const VmaAllocationCreateInfo& allocCreateInfo, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + UserDataString userDataStr(allocCreateInfo.flags, allocCreateInfo.pUserData); + fprintf(m_File, "%u,%.3f,%u,vmaCreateImage,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%p,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + imageCreateInfo.flags, + imageCreateInfo.imageType, + imageCreateInfo.format, + imageCreateInfo.extent.width, + imageCreateInfo.extent.height, + imageCreateInfo.extent.depth, + imageCreateInfo.mipLevels, + imageCreateInfo.arrayLayers, + imageCreateInfo.samples, + imageCreateInfo.tiling, + imageCreateInfo.usage, + imageCreateInfo.sharingMode, + imageCreateInfo.initialLayout, + 
allocCreateInfo.flags, + allocCreateInfo.usage, + allocCreateInfo.requiredFlags, + allocCreateInfo.preferredFlags, + allocCreateInfo.memoryTypeBits, + allocCreateInfo.pool, + allocation, + userDataStr.GetString()); + Flush(); +} + +/* Appends a vmaDestroyBuffer row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordDestroyBuffer(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaDestroyBuffer,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaDestroyImage row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordDestroyImage(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaDestroyImage,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaTouchAllocation row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordTouchAllocation(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaTouchAllocation,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaGetAllocationInfo row (thread id, time, frame index, allocation handle), then calls Flush(). */ void VmaRecorder::RecordGetAllocationInfo(uint32_t frameIndex, + VmaAllocation allocation) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaGetAllocationInfo,%p\n", callParams.threadId, callParams.time, frameIndex, + allocation); + Flush(); +} + +/* Appends a vmaMakePoolAllocationsLost row (thread id, time, frame index, pool handle), then calls Flush(). */ void VmaRecorder::RecordMakePoolAllocationsLost(uint32_t frameIndex, + VmaPool pool) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaMakePoolAllocationsLost,%p\n", callParams.threadId, callParams.time, frameIndex, + pool); + Flush(); +} + +/* Appends a vmaDefragmentationBegin row: flags, the allocation list, the pool list, the four CPU and GPU move limits, the command buffer and the context handle; then calls Flush(). */ void VmaRecorder::RecordDefragmentationBegin(uint32_t frameIndex, + const VmaDefragmentationInfo2& info, + 
VmaDefragmentationContext ctx) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaDefragmentationBegin,%u,", callParams.threadId, callParams.time, frameIndex, + info.flags); + PrintPointerList(info.allocationCount, info.pAllocations); + fprintf(m_File, ","); + PrintPointerList(info.poolCount, info.pPools); + fprintf(m_File, ",%llu,%u,%llu,%u,%p,%p\n", + info.maxCpuBytesToMove, + info.maxCpuAllocationsToMove, + info.maxGpuBytesToMove, + info.maxGpuAllocationsToMove, + info.commandBuffer, + ctx); + Flush(); +} + +/* Appends a vmaDefragmentationEnd row (thread id, time, frame index, context handle), then calls Flush(). */ void VmaRecorder::RecordDefragmentationEnd(uint32_t frameIndex, + VmaDefragmentationContext ctx) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaDefragmentationEnd,%p\n", callParams.threadId, callParams.time, frameIndex, + ctx); + Flush(); +} + +/* Appends a vmaSetPoolName row (pool handle and name); a null name is written as an empty field. Calls Flush() afterwards. */ void VmaRecorder::RecordSetPoolName(uint32_t frameIndex, + VmaPool pool, + const char* name) +{ + CallParams callParams; + GetBasicParams(callParams); + + VmaMutexLock lock(m_FileMutex, m_UseMutex); + fprintf(m_File, "%u,%.3f,%u,vmaSetPoolName,%p,%s\n", callParams.threadId, callParams.time, frameIndex, + pool, name != VMA_NULL ? name : ""); + Flush(); +} + +/* Chooses the text that GetString() will report for a user-data pointer: the pointed-to C string when the copy-string flag is set, the pointer value formatted with %p otherwise, and an empty string when pUserData is null. In the string case m_Str aliases pUserData; nothing is copied here. */ VmaRecorder::UserDataString::UserDataString(VmaAllocationCreateFlags allocFlags, const void* pUserData) +{ + if(pUserData != VMA_NULL) + { + if((allocFlags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0) + { + m_Str = (const char*)pUserData; + } + else + { + // If VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT is not specified, pUserData is an opaque pointer: format its address as text and store that.
+ /* NOTE(review): 17 is presumably the size of m_PtrStr (declared outside this chunk); snprintf truncates longer %p output - confirm. */ snprintf(m_PtrStr, 17, "%p", pUserData); + m_Str = m_PtrStr; + } + } + else + { + m_Str = ""; + } +} + +/* Writes the Config,Begin ... Config,End header of the recording file: Vulkan API version, physical device properties and limits, every memory heap and memory type, the enabled-extension switches as 0/1, and the values of the VMA_* configuration macros. Does not take the file mutex and does not call Flush(). */ void VmaRecorder::WriteConfiguration( + const VkPhysicalDeviceProperties& devProps, + const VkPhysicalDeviceMemoryProperties& memProps, + uint32_t vulkanApiVersion, + bool dedicatedAllocationExtensionEnabled, + bool bindMemory2ExtensionEnabled, + bool memoryBudgetExtensionEnabled, + bool deviceCoherentMemoryExtensionEnabled) +{ + fprintf(m_File, "Config,Begin\n"); + + fprintf(m_File, "VulkanApiVersion,%u,%u\n", VK_VERSION_MAJOR(vulkanApiVersion), VK_VERSION_MINOR(vulkanApiVersion)); + + fprintf(m_File, "PhysicalDevice,apiVersion,%u\n", devProps.apiVersion); + fprintf(m_File, "PhysicalDevice,driverVersion,%u\n", devProps.driverVersion); + fprintf(m_File, "PhysicalDevice,vendorID,%u\n", devProps.vendorID); + fprintf(m_File, "PhysicalDevice,deviceID,%u\n", devProps.deviceID); + fprintf(m_File, "PhysicalDevice,deviceType,%u\n", devProps.deviceType); + fprintf(m_File, "PhysicalDevice,deviceName,%s\n", devProps.deviceName); + + fprintf(m_File, "PhysicalDeviceLimits,maxMemoryAllocationCount,%u\n", devProps.limits.maxMemoryAllocationCount); + fprintf(m_File, "PhysicalDeviceLimits,bufferImageGranularity,%llu\n", devProps.limits.bufferImageGranularity); + fprintf(m_File, "PhysicalDeviceLimits,nonCoherentAtomSize,%llu\n", devProps.limits.nonCoherentAtomSize); + + fprintf(m_File, "PhysicalDeviceMemory,HeapCount,%u\n", memProps.memoryHeapCount); + for (const auto i : c10::irange(memProps.memoryHeapCount)) { + fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,size,%llu\n", i, memProps.memoryHeaps[i].size); + fprintf(m_File, "PhysicalDeviceMemory,Heap,%u,flags,%u\n", i, memProps.memoryHeaps[i].flags); + } + fprintf(m_File, "PhysicalDeviceMemory,TypeCount,%u\n", memProps.memoryTypeCount); + for (const auto i : c10::irange(memProps.memoryTypeCount)) { + fprintf(m_File, "PhysicalDeviceMemory,Type,%u,heapIndex,%u\n", i, memProps.memoryTypes[i].heapIndex); + fprintf(m_File, 
"PhysicalDeviceMemory,Type,%u,propertyFlags,%u\n", i, memProps.memoryTypes[i].propertyFlags); + } + + fprintf(m_File, "Extension,VK_KHR_dedicated_allocation,%u\n", dedicatedAllocationExtensionEnabled ? 1 : 0); + fprintf(m_File, "Extension,VK_KHR_bind_memory2,%u\n", bindMemory2ExtensionEnabled ? 1 : 0); + fprintf(m_File, "Extension,VK_EXT_memory_budget,%u\n", memoryBudgetExtensionEnabled ? 1 : 0); + fprintf(m_File, "Extension,VK_AMD_device_coherent_memory,%u\n", deviceCoherentMemoryExtensionEnabled ? 1 : 0); + + fprintf(m_File, "Macro,VMA_DEBUG_ALWAYS_DEDICATED_MEMORY,%u\n", VMA_DEBUG_ALWAYS_DEDICATED_MEMORY ? 1 : 0); + fprintf(m_File, "Macro,VMA_DEBUG_ALIGNMENT,%llu\n", (VkDeviceSize)VMA_DEBUG_ALIGNMENT); + fprintf(m_File, "Macro,VMA_DEBUG_MARGIN,%llu\n", (VkDeviceSize)VMA_DEBUG_MARGIN); + fprintf(m_File, "Macro,VMA_DEBUG_INITIALIZE_ALLOCATIONS,%u\n", VMA_DEBUG_INITIALIZE_ALLOCATIONS ? 1 : 0); + fprintf(m_File, "Macro,VMA_DEBUG_DETECT_CORRUPTION,%u\n", VMA_DEBUG_DETECT_CORRUPTION ? 1 : 0); + fprintf(m_File, "Macro,VMA_DEBUG_GLOBAL_MUTEX,%u\n", VMA_DEBUG_GLOBAL_MUTEX ? 1 : 0); + fprintf(m_File, "Macro,VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY,%llu\n", (VkDeviceSize)VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY); + fprintf(m_File, "Macro,VMA_SMALL_HEAP_MAX_SIZE,%llu\n", (VkDeviceSize)VMA_SMALL_HEAP_MAX_SIZE); + fprintf(m_File, "Macro,VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE,%llu\n", (VkDeviceSize)VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE); + + fprintf(m_File, "Config,End\n"); +} + +/* Fills outParams with the two values that prefix every recorded row: a 32-bit id of the calling thread and the time in seconds since m_RecordingStartTime. */ void VmaRecorder::GetBasicParams(CallParams& outParams) +{ + #if defined(_WIN32) + outParams.threadId = GetCurrentThreadId(); + #else + // std::thread::id has no portable integer form, so hash it and keep the low 32 bits. + // The previous approach streamed the id to text and parsed it with std::stoi, which throws + // std::out_of_range when the printed id does not fit in an int (e.g. 64-bit pthread handles) + // and std::invalid_argument when the text is not a decimal number. 
+ outParams.threadId = static_cast<uint32_t>(std::hash<std::thread::id>()(std::this_thread::get_id())); + #endif + + const auto current_time = std::chrono::high_resolution_clock::now(); + + // duration<double> has a period of one second, so count() yields fractional seconds. + outParams.time = std::chrono::duration<double>(current_time - m_RecordingStartTime).count(); +} + +/* Writes count handles with %p, separated by single spaces; writes nothing when count is 0. */ void VmaRecorder::PrintPointerList(uint64_t count, const VmaAllocation* pItems) +{ + if(count) + { + fprintf(m_File, "%p", pItems[0]); + for(uint64_t i = 1; i < count; ++i) + { + fprintf(m_File, " %p", pItems[i]); + } + } +} + +/* Calls fflush on the recording file only when VMA_RECORD_FLUSH_AFTER_CALL_BIT is set in m_Flags. */ void VmaRecorder::Flush() +{ + if((m_Flags & VMA_RECORD_FLUSH_AFTER_CALL_BIT) != 0) + { + fflush(m_File); + } +} + +#endif // #if VMA_RECORDING_ENABLED + +//////////////////////////////////////////////////////////////////////////////// +// VmaAllocationObjectAllocator + +/* Constructs the underlying m_Allocator with the given callbacks and the constant 1024 (presumably its block capacity - confirm against the allocator type). */ VmaAllocationObjectAllocator::VmaAllocationObjectAllocator(const VkAllocationCallbacks* pAllocationCallbacks) : + m_Allocator(pAllocationCallbacks, 1024) +{ +} + +/* Locks m_Mutex and forwards the arguments to m_Allocator.Alloc, returning the new allocation object. */ template<typename... Types> VmaAllocation VmaAllocationObjectAllocator::Allocate(Types... args) +{ + VmaMutexLock mutexLock(m_Mutex); + return m_Allocator.Alloc(std::forward<Types>(args)...); +} + +/* Locks m_Mutex and returns hAlloc to m_Allocator. */ void VmaAllocationObjectAllocator::Free(VmaAllocation hAlloc) +{ + VmaMutexLock mutexLock(m_Mutex); + m_Allocator.Free(hAlloc); +} + +//////////////////////////////////////////////////////////////////////////////// +// VmaAllocator_T + +VmaAllocator_T::VmaAllocator_T(const VmaAllocatorCreateInfo* pCreateInfo) : + m_UseMutex((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXTERNALLY_SYNCHRONIZED_BIT) == 0), + m_VulkanApiVersion(pCreateInfo->vulkanApiVersion != 0 ? 
pCreateInfo->vulkanApiVersion : VK_API_VERSION_1_0), + m_UseKhrDedicatedAllocation((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT) != 0), + m_UseKhrBindMemory2((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT) != 0), + m_UseExtMemoryBudget((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT) != 0), + m_UseAmdDeviceCoherentMemory((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT) != 0), + m_UseKhrBufferDeviceAddress((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT) != 0), + m_UseExtMemoryPriority((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT) != 0), + m_hDevice(pCreateInfo->device), + m_hInstance(pCreateInfo->instance), + m_AllocationCallbacksSpecified(pCreateInfo->pAllocationCallbacks != VMA_NULL), + m_AllocationCallbacks(pCreateInfo->pAllocationCallbacks ? + *pCreateInfo->pAllocationCallbacks : VmaEmptyAllocationCallbacks), + m_AllocationObjectAllocator(&m_AllocationCallbacks), + m_HeapSizeLimitMask(0), + m_DeviceMemoryCount(0), + m_PreferredLargeHeapBlockSize(0), + m_PhysicalDevice(pCreateInfo->physicalDevice), + m_CurrentFrameIndex(0), + m_GpuDefragmentationMemoryTypeBits(UINT32_MAX), + m_Pools(VmaStlAllocator(GetAllocationCallbacks())), + m_NextPoolId(0), + m_GlobalMemoryTypeBits(UINT32_MAX) +#if VMA_RECORDING_ENABLED + ,m_pRecorder(VMA_NULL) +#endif +{ + /* For API version 1.1 and above the two KHR extension switches are cleared; the Vulkan 1.1 branches of the ImportVulkanFunctions_* helpers supply the entry points instead. */ if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + m_UseKhrDedicatedAllocation = false; + m_UseKhrBindMemory2 = false; + } + + if(VMA_DEBUG_DETECT_CORRUPTION) + { + // Needs to be a multiple of uint32_t size because we are going to write VMA_CORRUPTION_DETECTION_MAGIC_VALUE to it.
+ VMA_ASSERT(VMA_DEBUG_MARGIN % sizeof(uint32_t) == 0); + } + + VMA_ASSERT(pCreateInfo->physicalDevice && pCreateInfo->device && pCreateInfo->instance); + + /* The checks below assert when a feature is requested through the create flags or API version but was compiled out by the corresponding preprocessor switch. */ if(m_VulkanApiVersion < VK_MAKE_VERSION(1, 1, 0)) + { +#if !(VMA_DEDICATED_ALLOCATION) + if((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT) != 0) + { + VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_KHR_DEDICATED_ALLOCATION_BIT set but required extensions are disabled by preprocessor macros."); + } +#endif +#if !(VMA_BIND_MEMORY2) + if((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT) != 0) + { + VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_KHR_BIND_MEMORY2_BIT set but required extension is disabled by preprocessor macros."); + } +#endif + } +#if !(VMA_MEMORY_BUDGET) + if((pCreateInfo->flags & VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT) != 0) + { + VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_EXT_MEMORY_BUDGET_BIT set but required extension is disabled by preprocessor macros."); + } +#endif +#if !(VMA_BUFFER_DEVICE_ADDRESS) + if(m_UseKhrBufferDeviceAddress) + { + VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT is set but required extension or Vulkan 1.2 is not available in your Vulkan header or its support in VMA has been disabled by a preprocessor macro."); + } +#endif +#if VMA_VULKAN_VERSION < 1002000 + if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 2, 0)) + { + VMA_ASSERT(0 && "vulkanApiVersion >= VK_API_VERSION_1_2 but required Vulkan version is disabled by preprocessor macros."); + } +#endif +#if VMA_VULKAN_VERSION < 1001000 + if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + VMA_ASSERT(0 && "vulkanApiVersion >= VK_API_VERSION_1_1 but required Vulkan version is disabled by preprocessor macros."); + } +#endif +#if !(VMA_MEMORY_PRIORITY) + if(m_UseExtMemoryPriority) + { + VMA_ASSERT(0 && "VMA_ALLOCATOR_CREATE_EXT_MEMORY_PRIORITY_BIT is set but required extension is not available in your Vulkan header or its support in VMA has been disabled by a preprocessor macro."); + } 
+#endif + + /* Zero the plain-data members before they are filled in below. */ memset(&m_DeviceMemoryCallbacks, 0 ,sizeof(m_DeviceMemoryCallbacks)); + memset(&m_PhysicalDeviceProperties, 0, sizeof(m_PhysicalDeviceProperties)); + memset(&m_MemProps, 0, sizeof(m_MemProps)); + + memset(&m_pBlockVectors, 0, sizeof(m_pBlockVectors)); + memset(&m_pDedicatedAllocations, 0, sizeof(m_pDedicatedAllocations)); + memset(&m_VulkanFunctions, 0, sizeof(m_VulkanFunctions)); + + if(pCreateInfo->pDeviceMemoryCallbacks != VMA_NULL) + { + m_DeviceMemoryCallbacks.pUserData = pCreateInfo->pDeviceMemoryCallbacks->pUserData; + m_DeviceMemoryCallbacks.pfnAllocate = pCreateInfo->pDeviceMemoryCallbacks->pfnAllocate; + m_DeviceMemoryCallbacks.pfnFree = pCreateInfo->pDeviceMemoryCallbacks->pfnFree; + } + + /* Function pointers must be imported first: the two property queries that follow are called through m_VulkanFunctions. */ ImportVulkanFunctions(pCreateInfo->pVulkanFunctions); + + (*m_VulkanFunctions.vkGetPhysicalDeviceProperties)(m_PhysicalDevice, &m_PhysicalDeviceProperties); + (*m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties)(m_PhysicalDevice, &m_MemProps); + + VMA_ASSERT(VmaIsPow2(VMA_DEBUG_ALIGNMENT)); + VMA_ASSERT(VmaIsPow2(VMA_DEBUG_MIN_BUFFER_IMAGE_GRANULARITY)); + VMA_ASSERT(VmaIsPow2(m_PhysicalDeviceProperties.limits.bufferImageGranularity)); + VMA_ASSERT(VmaIsPow2(m_PhysicalDeviceProperties.limits.nonCoherentAtomSize)); + + m_PreferredLargeHeapBlockSize = (pCreateInfo->preferredLargeHeapBlockSize != 0) ? 
+ pCreateInfo->preferredLargeHeapBlockSize : static_cast(VMA_DEFAULT_LARGE_HEAP_BLOCK_SIZE); + + m_GlobalMemoryTypeBits = CalculateGlobalMemoryTypeBits(); + + /* Apply user heap size limits: each heap whose limit is not VK_WHOLE_SIZE gets its bit set in m_HeapSizeLimitMask, and its reported size is lowered to the limit when the limit is smaller. */ if(pCreateInfo->pHeapSizeLimit != VMA_NULL) + { + for (const auto heapIndex : c10::irange(GetMemoryHeapCount())) { + const VkDeviceSize limit = pCreateInfo->pHeapSizeLimit[heapIndex]; + if(limit != VK_WHOLE_SIZE) + { + m_HeapSizeLimitMask |= 1u << heapIndex; + if(limit < m_MemProps.memoryHeaps[heapIndex].size) + { + m_MemProps.memoryHeaps[heapIndex].size = limit; + } + } + } + } + + /* Create one default block vector and one dedicated-allocation list per memory type. */ for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + const VkDeviceSize preferredBlockSize = CalcPreferredBlockSize(memTypeIndex); + + m_pBlockVectors[memTypeIndex] = vma_new(this, VmaBlockVector)( + this, + VK_NULL_HANDLE, // hParentPool + memTypeIndex, + preferredBlockSize, + 0, + SIZE_MAX, + GetBufferImageGranularity(), + pCreateInfo->frameInUseCount, + false, // explicitBlockSize + false, // linearAlgorithm + 0.5f); // priority (0.5 is the default per Vulkan spec) + // No need to call m_pBlockVectors[memTypeIndex][blockVectorTypeIndex]->CreateMinBlocks here, + // because minBlockCount is 0.
+ m_pDedicatedAllocations[memTypeIndex] = vma_new(this, AllocationVectorType)(VmaStlAllocator(GetAllocationCallbacks())); + + } +} + +/* Second-stage initialization. When pRecordSettings names a non-empty file path: with recording compiled in, creates and initializes the recorder (returning its error on failure), writes the configuration header and records allocator creation; with recording compiled out, asserts and returns VK_ERROR_FEATURE_NOT_PRESENT. Afterwards refreshes the budget when the memory-budget extension is in use. Returns VK_SUCCESS otherwise. */ VkResult VmaAllocator_T::Init(const VmaAllocatorCreateInfo* pCreateInfo) +{ + VkResult res = VK_SUCCESS; + + if(pCreateInfo->pRecordSettings != VMA_NULL && + !VmaStrIsEmpty(pCreateInfo->pRecordSettings->pFilePath)) + { +#if VMA_RECORDING_ENABLED + m_pRecorder = vma_new(this, VmaRecorder)(); + res = m_pRecorder->Init(*pCreateInfo->pRecordSettings, m_UseMutex); + if(res != VK_SUCCESS) + { + return res; + } + m_pRecorder->WriteConfiguration( + m_PhysicalDeviceProperties, + m_MemProps, + m_VulkanApiVersion, + m_UseKhrDedicatedAllocation, + m_UseKhrBindMemory2, + m_UseExtMemoryBudget, + m_UseAmdDeviceCoherentMemory); + m_pRecorder->RecordCreateAllocator(GetCurrentFrameIndex()); +#else + VMA_ASSERT(0 && "VmaAllocatorCreateInfo::pRecordSettings used, but not supported due to VMA_RECORDING_ENABLED not defined to 1."); + return VK_ERROR_FEATURE_NOT_PRESENT; +#endif + } + +#if VMA_MEMORY_BUDGET + if(m_UseExtMemoryBudget) + { + UpdateVulkanBudget(); + } +#endif // #if VMA_MEMORY_BUDGET + + return res; +} + +/* Records allocator destruction and deletes the recorder if one exists, asserts that no pools and no dedicated allocations remain, then deletes the per-memory-type dedicated-allocation lists and block vectors from the highest index down. */ VmaAllocator_T::~VmaAllocator_T() +{ +#if VMA_RECORDING_ENABLED + if(m_pRecorder != VMA_NULL) + { + m_pRecorder->RecordDestroyAllocator(GetCurrentFrameIndex()); + vma_delete(this, m_pRecorder); + } +#endif + + VMA_ASSERT(m_Pools.empty()); + + for(size_t i = GetMemoryTypeCount(); i--; ) + { + if(m_pDedicatedAllocations[i] != VMA_NULL && !m_pDedicatedAllocations[i]->empty()) + { + VMA_ASSERT(0 && "Unfreed dedicated allocations found."); + } + + vma_delete(this, m_pDedicatedAllocations[i]); + vma_delete(this, m_pBlockVectors[i]); + } +} + +/* Fills m_VulkanFunctions from up to three sources, in this order: statically linked symbols (if enabled), the caller-supplied table (if not null), and dynamic lookup (if enabled); then validates the result. */ void VmaAllocator_T::ImportVulkanFunctions(const VmaVulkanFunctions* pVulkanFunctions) +{ +#if VMA_STATIC_VULKAN_FUNCTIONS == 1 + ImportVulkanFunctions_Static(); +#endif + + if(pVulkanFunctions != VMA_NULL) + { + ImportVulkanFunctions_Custom(pVulkanFunctions); + } + +#if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 + 
ImportVulkanFunctions_Dynamic(); +#endif + + ValidateVulkanFunctions(); +} + +#if VMA_STATIC_VULKAN_FUNCTIONS == 1 + +/* Stores the addresses of the statically linked Vulkan 1.0 entry points and, when compiled for and running with API version 1.1 or above, the core 1.1 entry points (kept in the members whose names end in 2KHR). */ void VmaAllocator_T::ImportVulkanFunctions_Static() +{ + // Vulkan 1.0 + m_VulkanFunctions.vkGetPhysicalDeviceProperties = (PFN_vkGetPhysicalDeviceProperties)vkGetPhysicalDeviceProperties; + m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties = (PFN_vkGetPhysicalDeviceMemoryProperties)vkGetPhysicalDeviceMemoryProperties; + m_VulkanFunctions.vkAllocateMemory = (PFN_vkAllocateMemory)vkAllocateMemory; + m_VulkanFunctions.vkFreeMemory = (PFN_vkFreeMemory)vkFreeMemory; + m_VulkanFunctions.vkMapMemory = (PFN_vkMapMemory)vkMapMemory; + m_VulkanFunctions.vkUnmapMemory = (PFN_vkUnmapMemory)vkUnmapMemory; + m_VulkanFunctions.vkFlushMappedMemoryRanges = (PFN_vkFlushMappedMemoryRanges)vkFlushMappedMemoryRanges; + m_VulkanFunctions.vkInvalidateMappedMemoryRanges = (PFN_vkInvalidateMappedMemoryRanges)vkInvalidateMappedMemoryRanges; + m_VulkanFunctions.vkBindBufferMemory = (PFN_vkBindBufferMemory)vkBindBufferMemory; + m_VulkanFunctions.vkBindImageMemory = (PFN_vkBindImageMemory)vkBindImageMemory; + m_VulkanFunctions.vkGetBufferMemoryRequirements = (PFN_vkGetBufferMemoryRequirements)vkGetBufferMemoryRequirements; + m_VulkanFunctions.vkGetImageMemoryRequirements = (PFN_vkGetImageMemoryRequirements)vkGetImageMemoryRequirements; + m_VulkanFunctions.vkCreateBuffer = (PFN_vkCreateBuffer)vkCreateBuffer; + m_VulkanFunctions.vkDestroyBuffer = (PFN_vkDestroyBuffer)vkDestroyBuffer; + m_VulkanFunctions.vkCreateImage = (PFN_vkCreateImage)vkCreateImage; + m_VulkanFunctions.vkDestroyImage = (PFN_vkDestroyImage)vkDestroyImage; + m_VulkanFunctions.vkCmdCopyBuffer = (PFN_vkCmdCopyBuffer)vkCmdCopyBuffer; + + // Vulkan 1.1 +#if VMA_VULKAN_VERSION >= 1001000 + if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + m_VulkanFunctions.vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2)vkGetBufferMemoryRequirements2; + 
m_VulkanFunctions.vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2)vkGetImageMemoryRequirements2; + m_VulkanFunctions.vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2)vkBindBufferMemory2; + m_VulkanFunctions.vkBindImageMemory2KHR = (PFN_vkBindImageMemory2)vkBindImageMemory2; + m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2)vkGetPhysicalDeviceMemoryProperties2; + } +#endif +} + +#endif // #if VMA_STATIC_VULKAN_FUNCTIONS == 1 + +/* Copies every non-null pointer of the caller-supplied table into m_VulkanFunctions; null entries leave the existing value untouched. */ void VmaAllocator_T::ImportVulkanFunctions_Custom(const VmaVulkanFunctions* pVulkanFunctions) +{ + VMA_ASSERT(pVulkanFunctions != VMA_NULL); + +#define VMA_COPY_IF_NOT_NULL(funcName) \ + if(pVulkanFunctions->funcName != VMA_NULL) m_VulkanFunctions.funcName = pVulkanFunctions->funcName; + + VMA_COPY_IF_NOT_NULL(vkGetPhysicalDeviceProperties); + VMA_COPY_IF_NOT_NULL(vkGetPhysicalDeviceMemoryProperties); + VMA_COPY_IF_NOT_NULL(vkAllocateMemory); + VMA_COPY_IF_NOT_NULL(vkFreeMemory); + VMA_COPY_IF_NOT_NULL(vkMapMemory); + VMA_COPY_IF_NOT_NULL(vkUnmapMemory); + VMA_COPY_IF_NOT_NULL(vkFlushMappedMemoryRanges); + VMA_COPY_IF_NOT_NULL(vkInvalidateMappedMemoryRanges); + VMA_COPY_IF_NOT_NULL(vkBindBufferMemory); + VMA_COPY_IF_NOT_NULL(vkBindImageMemory); + VMA_COPY_IF_NOT_NULL(vkGetBufferMemoryRequirements); + VMA_COPY_IF_NOT_NULL(vkGetImageMemoryRequirements); + VMA_COPY_IF_NOT_NULL(vkCreateBuffer); + VMA_COPY_IF_NOT_NULL(vkDestroyBuffer); + VMA_COPY_IF_NOT_NULL(vkCreateImage); + VMA_COPY_IF_NOT_NULL(vkDestroyImage); + VMA_COPY_IF_NOT_NULL(vkCmdCopyBuffer); + +#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + VMA_COPY_IF_NOT_NULL(vkGetBufferMemoryRequirements2KHR); + VMA_COPY_IF_NOT_NULL(vkGetImageMemoryRequirements2KHR); +#endif + +#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000 + VMA_COPY_IF_NOT_NULL(vkBindBufferMemory2KHR); + VMA_COPY_IF_NOT_NULL(vkBindImageMemory2KHR); +#endif + +#if VMA_MEMORY_BUDGET + 
VMA_COPY_IF_NOT_NULL(vkGetPhysicalDeviceMemoryProperties2KHR); +#endif + +#undef VMA_COPY_IF_NOT_NULL +} + +#if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 + +/* Looks up, with vkGetInstanceProcAddr or vkGetDeviceProcAddr, every member of m_VulkanFunctions that is still null. Core 1.1 names are tried first when the API version allows it; the KHR-suffixed names are fetched only when the matching extension switch is on and the member is still null. */ void VmaAllocator_T::ImportVulkanFunctions_Dynamic() +{ +#define VMA_FETCH_INSTANCE_FUNC(memberName, functionPointerType, functionNameString) \ + if(m_VulkanFunctions.memberName == VMA_NULL) \ + m_VulkanFunctions.memberName = \ + (functionPointerType)vkGetInstanceProcAddr(m_hInstance, functionNameString); +#define VMA_FETCH_DEVICE_FUNC(memberName, functionPointerType, functionNameString) \ + if(m_VulkanFunctions.memberName == VMA_NULL) \ + m_VulkanFunctions.memberName = \ + (functionPointerType)vkGetDeviceProcAddr(m_hDevice, functionNameString); + + VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceProperties, PFN_vkGetPhysicalDeviceProperties, "vkGetPhysicalDeviceProperties"); + VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceMemoryProperties, PFN_vkGetPhysicalDeviceMemoryProperties, "vkGetPhysicalDeviceMemoryProperties"); + VMA_FETCH_DEVICE_FUNC(vkAllocateMemory, PFN_vkAllocateMemory, "vkAllocateMemory"); + VMA_FETCH_DEVICE_FUNC(vkFreeMemory, PFN_vkFreeMemory, "vkFreeMemory"); + VMA_FETCH_DEVICE_FUNC(vkMapMemory, PFN_vkMapMemory, "vkMapMemory"); + VMA_FETCH_DEVICE_FUNC(vkUnmapMemory, PFN_vkUnmapMemory, "vkUnmapMemory"); + VMA_FETCH_DEVICE_FUNC(vkFlushMappedMemoryRanges, PFN_vkFlushMappedMemoryRanges, "vkFlushMappedMemoryRanges"); + VMA_FETCH_DEVICE_FUNC(vkInvalidateMappedMemoryRanges, PFN_vkInvalidateMappedMemoryRanges, "vkInvalidateMappedMemoryRanges"); + VMA_FETCH_DEVICE_FUNC(vkBindBufferMemory, PFN_vkBindBufferMemory, "vkBindBufferMemory"); + VMA_FETCH_DEVICE_FUNC(vkBindImageMemory, PFN_vkBindImageMemory, "vkBindImageMemory"); + VMA_FETCH_DEVICE_FUNC(vkGetBufferMemoryRequirements, PFN_vkGetBufferMemoryRequirements, "vkGetBufferMemoryRequirements"); + VMA_FETCH_DEVICE_FUNC(vkGetImageMemoryRequirements, PFN_vkGetImageMemoryRequirements, "vkGetImageMemoryRequirements"); + VMA_FETCH_DEVICE_FUNC(vkCreateBuffer, PFN_vkCreateBuffer, 
"vkCreateBuffer"); + VMA_FETCH_DEVICE_FUNC(vkDestroyBuffer, PFN_vkDestroyBuffer, "vkDestroyBuffer"); + VMA_FETCH_DEVICE_FUNC(vkCreateImage, PFN_vkCreateImage, "vkCreateImage"); + VMA_FETCH_DEVICE_FUNC(vkDestroyImage, PFN_vkDestroyImage, "vkDestroyImage"); + VMA_FETCH_DEVICE_FUNC(vkCmdCopyBuffer, PFN_vkCmdCopyBuffer, "vkCmdCopyBuffer"); + +#if VMA_VULKAN_VERSION >= 1001000 + if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + VMA_FETCH_DEVICE_FUNC(vkGetBufferMemoryRequirements2KHR, PFN_vkGetBufferMemoryRequirements2, "vkGetBufferMemoryRequirements2"); + VMA_FETCH_DEVICE_FUNC(vkGetImageMemoryRequirements2KHR, PFN_vkGetImageMemoryRequirements2, "vkGetImageMemoryRequirements2"); + VMA_FETCH_DEVICE_FUNC(vkBindBufferMemory2KHR, PFN_vkBindBufferMemory2, "vkBindBufferMemory2"); + VMA_FETCH_DEVICE_FUNC(vkBindImageMemory2KHR, PFN_vkBindImageMemory2, "vkBindImageMemory2"); + VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceMemoryProperties2KHR, PFN_vkGetPhysicalDeviceMemoryProperties2, "vkGetPhysicalDeviceMemoryProperties2"); + } +#endif + +#if VMA_DEDICATED_ALLOCATION + if(m_UseKhrDedicatedAllocation) + { + VMA_FETCH_DEVICE_FUNC(vkGetBufferMemoryRequirements2KHR, PFN_vkGetBufferMemoryRequirements2KHR, "vkGetBufferMemoryRequirements2KHR"); + VMA_FETCH_DEVICE_FUNC(vkGetImageMemoryRequirements2KHR, PFN_vkGetImageMemoryRequirements2KHR, "vkGetImageMemoryRequirements2KHR"); + } +#endif + +#if VMA_BIND_MEMORY2 + if(m_UseKhrBindMemory2) + { + VMA_FETCH_DEVICE_FUNC(vkBindBufferMemory2KHR, PFN_vkBindBufferMemory2KHR, "vkBindBufferMemory2KHR"); + VMA_FETCH_DEVICE_FUNC(vkBindImageMemory2KHR, PFN_vkBindImageMemory2KHR, "vkBindImageMemory2KHR"); + } +#endif // #if VMA_BIND_MEMORY2 + +#if VMA_MEMORY_BUDGET + if(m_UseExtMemoryBudget) + { + VMA_FETCH_INSTANCE_FUNC(vkGetPhysicalDeviceMemoryProperties2KHR, PFN_vkGetPhysicalDeviceMemoryProperties2KHR, "vkGetPhysicalDeviceMemoryProperties2KHR"); + } +#endif // #if VMA_MEMORY_BUDGET + +#undef VMA_FETCH_DEVICE_FUNC +#undef VMA_FETCH_INSTANCE_FUNC 
+} + +#endif // #if VMA_DYNAMIC_VULKAN_FUNCTIONS == 1 + +/* Asserts that every Vulkan 1.0 entry point is set, and that the memory-requirements-2, bind-memory-2 and memory-properties-2 entry points are set whenever the API version or the matching extension switch calls for them. */ void VmaAllocator_T::ValidateVulkanFunctions() +{ + VMA_ASSERT(m_VulkanFunctions.vkGetPhysicalDeviceProperties != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkAllocateMemory != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkFreeMemory != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkMapMemory != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkUnmapMemory != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkFlushMappedMemoryRanges != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkInvalidateMappedMemoryRanges != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkBindBufferMemory != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkBindImageMemory != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkGetBufferMemoryRequirements != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkGetImageMemoryRequirements != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkCreateBuffer != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkDestroyBuffer != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkCreateImage != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkDestroyImage != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkCmdCopyBuffer != VMA_NULL); + +#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0) || m_UseKhrDedicatedAllocation) + { + VMA_ASSERT(m_VulkanFunctions.vkGetBufferMemoryRequirements2KHR != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkGetImageMemoryRequirements2KHR != VMA_NULL); + } +#endif + +#if VMA_BIND_MEMORY2 || VMA_VULKAN_VERSION >= 1001000 + if(m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0) || m_UseKhrBindMemory2) + { + VMA_ASSERT(m_VulkanFunctions.vkBindBufferMemory2KHR != VMA_NULL); + VMA_ASSERT(m_VulkanFunctions.vkBindImageMemory2KHR != VMA_NULL); + } +#endif + +#if VMA_MEMORY_BUDGET || VMA_VULKAN_VERSION >= 1001000 + if(m_UseExtMemoryBudget || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + 
VMA_ASSERT(m_VulkanFunctions.vkGetPhysicalDeviceMemoryProperties2KHR != VMA_NULL); + } +#endif +} + +/* Preferred block size for a memory type: one eighth of the heap size for heaps no larger than VMA_SMALL_HEAP_MAX_SIZE, otherwise m_PreferredLargeHeapBlockSize; the result is passed through VmaAlignUp with an alignment of 32. */ VkDeviceSize VmaAllocator_T::CalcPreferredBlockSize(uint32_t memTypeIndex) +{ + const uint32_t heapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex); + const VkDeviceSize heapSize = m_MemProps.memoryHeaps[heapIndex].size; + const bool isSmallHeap = heapSize <= VMA_SMALL_HEAP_MAX_SIZE; + return VmaAlignUp(isSmallHeap ? (heapSize / 8) : m_PreferredLargeHeapBlockSize, (VkDeviceSize)32); +} + +/* Creates allocationCount allocations of the given size from a single memory type. Works on a copy of createInfo: MAPPED is dropped for memory types that are not host-visible, and lazily allocated usage forces dedicated memory. Dedicated memory is also chosen when requested, when forced by VMA_DEBUG_ALWAYS_DEDICATED_MEMORY, or when size exceeds half the preferred block size (unless NEVER_ALLOCATE is set or a custom pool is given). Otherwise the block vector is tried first and dedicated memory is the fallback, which is refused when NEVER_ALLOCATE is set or when more than 3/4 of maxMemoryAllocationCount is already in use. Returns VK_SUCCESS or the failing VkResult. */ VkResult VmaAllocator_T::AllocateMemoryOfType( + VkDeviceSize size, + VkDeviceSize alignment, + bool dedicatedAllocation, + VkBuffer dedicatedBuffer, + VkBufferUsageFlags dedicatedBufferUsage, + VkImage dedicatedImage, + const VmaAllocationCreateInfo& createInfo, + uint32_t memTypeIndex, + VmaSuballocationType suballocType, + size_t allocationCount, + VmaAllocation* pAllocations) +{ + VMA_ASSERT(pAllocations != VMA_NULL); + VMA_DEBUG_LOG(" AllocateMemory: MemoryTypeIndex=%u, AllocationCount=%zu, Size=%llu", memTypeIndex, allocationCount, size); + + VmaAllocationCreateInfo finalCreateInfo = createInfo; + + // If memory type is not HOST_VISIBLE, disable MAPPED. + if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0 && + (m_MemProps.memoryTypes[memTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) + { + finalCreateInfo.flags &= ~VMA_ALLOCATION_CREATE_MAPPED_BIT; + } + // If memory is lazily allocated, it should be always dedicated. + if(finalCreateInfo.usage == VMA_MEMORY_USAGE_GPU_LAZILY_ALLOCATED) + { + finalCreateInfo.flags |= VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT; + } + + VmaBlockVector* const blockVector = m_pBlockVectors[memTypeIndex]; + VMA_ASSERT(blockVector); + + const VkDeviceSize preferredBlockSize = blockVector->GetPreferredBlockSize(); + bool preferDedicatedMemory = + VMA_DEBUG_ALWAYS_DEDICATED_MEMORY || + dedicatedAllocation || + // Heuristics: Allocate dedicated memory if requested size is greater than half of preferred block size.
+ size > preferredBlockSize / 2; + + if(preferDedicatedMemory && + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) == 0 && + finalCreateInfo.pool == VK_NULL_HANDLE) + { + finalCreateInfo.flags |= VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT; + } + + if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT) != 0) + { + if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) + { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + else + { + return AllocateDedicatedMemory( + size, + suballocType, + memTypeIndex, + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT) != 0, + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0, + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0, + finalCreateInfo.pUserData, + finalCreateInfo.priority, + dedicatedBuffer, + dedicatedBufferUsage, + dedicatedImage, + allocationCount, + pAllocations); + } + } + else + { + VkResult res = blockVector->Allocate( + m_CurrentFrameIndex.load(), + size, + alignment, + finalCreateInfo, + suballocType, + allocationCount, + pAllocations); + if(res == VK_SUCCESS) + { + return res; + } + + // Block-vector allocation failed: try dedicated memory. + if((finalCreateInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) + { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + // Protection against creating each allocation as dedicated when we reach or exceed heap size/budget, + // which can quickly deplete maxMemoryAllocationCount: Don't try dedicated allocations when above + // 3/4 of the maximum allocation count.
+ if(m_DeviceMemoryCount.load() > m_PhysicalDeviceProperties.limits.maxMemoryAllocationCount * 3 / 4) + { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + res = AllocateDedicatedMemory( + size, + suballocType, + memTypeIndex, + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT) != 0, + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0, + (finalCreateInfo.flags & VMA_ALLOCATION_CREATE_USER_DATA_COPY_STRING_BIT) != 0, + finalCreateInfo.pUserData, + finalCreateInfo.priority, + dedicatedBuffer, + dedicatedBufferUsage, + dedicatedImage, + allocationCount, + pAllocations); + if(res == VK_SUCCESS) + { + // Succeeded: AllocateDedicatedMemory function already filled pMemory, nothing more to do here. + VMA_DEBUG_LOG(" Allocated as DedicatedMemory"); + return VK_SUCCESS; + } + else + { + // Everything failed: Return error code. + VMA_DEBUG_LOG(" vkAllocateMemory FAILED"); + return res; + } + } +} + +VkResult VmaAllocator_T::AllocateDedicatedMemory( + VkDeviceSize size, + VmaSuballocationType suballocType, + uint32_t memTypeIndex, + bool withinBudget, + bool map, + bool isUserDataString, + void* pUserData, + float priority, + VkBuffer dedicatedBuffer, + VkBufferUsageFlags dedicatedBufferUsage, + VkImage dedicatedImage, + size_t allocationCount, + VmaAllocation* pAllocations) +{ + VMA_ASSERT(allocationCount > 0 && pAllocations); + + if(withinBudget) + { + const uint32_t heapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex); + VmaBudget heapBudget = {}; + GetBudget(&heapBudget, heapIndex, 1); + if(heapBudget.usage + size * allocationCount > heapBudget.budget) + { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + } + + VkMemoryAllocateInfo allocInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO }; + allocInfo.memoryTypeIndex = memTypeIndex; + allocInfo.allocationSize = size; + +#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + VkMemoryDedicatedAllocateInfoKHR dedicatedAllocInfo = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR }; + 
if(m_UseKhrDedicatedAllocation || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + if(dedicatedBuffer != VK_NULL_HANDLE) + { + VMA_ASSERT(dedicatedImage == VK_NULL_HANDLE); + dedicatedAllocInfo.buffer = dedicatedBuffer; + VmaPnextChainPushFront(&allocInfo, &dedicatedAllocInfo); + } + else if(dedicatedImage != VK_NULL_HANDLE) + { + dedicatedAllocInfo.image = dedicatedImage; + VmaPnextChainPushFront(&allocInfo, &dedicatedAllocInfo); + } + } +#endif // #if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + +#if VMA_BUFFER_DEVICE_ADDRESS + VkMemoryAllocateFlagsInfoKHR allocFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO_KHR }; + if(m_UseKhrBufferDeviceAddress) + { + bool canContainBufferWithDeviceAddress = true; + if(dedicatedBuffer != VK_NULL_HANDLE) + { + canContainBufferWithDeviceAddress = dedicatedBufferUsage == UINT32_MAX || // Usage flags unknown + (dedicatedBufferUsage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_EXT) != 0; + } + else if(dedicatedImage != VK_NULL_HANDLE) + { + canContainBufferWithDeviceAddress = false; + } + if(canContainBufferWithDeviceAddress) + { + allocFlagsInfo.flags = VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR; + VmaPnextChainPushFront(&allocInfo, &allocFlagsInfo); + } + } +#endif // #if VMA_BUFFER_DEVICE_ADDRESS + +#if VMA_MEMORY_PRIORITY + VkMemoryPriorityAllocateInfoEXT priorityInfo = { VK_STRUCTURE_TYPE_MEMORY_PRIORITY_ALLOCATE_INFO_EXT }; + if(m_UseExtMemoryPriority) + { + priorityInfo.priority = priority; + VmaPnextChainPushFront(&allocInfo, &priorityInfo); + } +#endif // #if VMA_MEMORY_PRIORITY + + size_t allocIndex; + VkResult res = VK_SUCCESS; + for(allocIndex = 0; allocIndex < allocationCount; ++allocIndex) + { + res = AllocateDedicatedMemoryPage( + size, + suballocType, + memTypeIndex, + allocInfo, + map, + isUserDataString, + pUserData, + pAllocations + allocIndex); + if(res != VK_SUCCESS) + { + break; + } + } + + if(res == VK_SUCCESS) + { + // Register them in m_pDedicatedAllocations. 
+ { + VmaMutexLockWrite lock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); + AllocationVectorType* pDedicatedAllocations = m_pDedicatedAllocations[memTypeIndex]; + VMA_ASSERT(pDedicatedAllocations); + for(allocIndex = 0; allocIndex < allocationCount; ++allocIndex) + { + VmaVectorInsertSorted(*pDedicatedAllocations, pAllocations[allocIndex]); + } + } + + VMA_DEBUG_LOG(" Allocated DedicatedMemory Count=%zu, MemoryTypeIndex=#%u", allocationCount, memTypeIndex); + } + else + { + // Free all already created allocations. + while(allocIndex--) + { + VmaAllocation currAlloc = pAllocations[allocIndex]; + VkDeviceMemory hMemory = currAlloc->GetMemory(); + + /* + There is no need to call this, because Vulkan spec allows to skip vkUnmapMemory + before vkFreeMemory. + + if(currAlloc->GetMappedData() != VMA_NULL) + { + (*m_VulkanFunctions.vkUnmapMemory)(m_hDevice, hMemory); + } + */ + + FreeVulkanMemory(memTypeIndex, currAlloc->GetSize(), hMemory); + m_Budget.RemoveAllocation(MemoryTypeIndexToHeapIndex(memTypeIndex), currAlloc->GetSize()); + currAlloc->SetUserData(this, VMA_NULL); + m_AllocationObjectAllocator.Free(currAlloc); + } + + memset(pAllocations, 0, sizeof(VmaAllocation) * allocationCount); + } + + return res; +} + +VkResult VmaAllocator_T::AllocateDedicatedMemoryPage( + VkDeviceSize size, + VmaSuballocationType suballocType, + uint32_t memTypeIndex, + const VkMemoryAllocateInfo& allocInfo, + bool map, + bool isUserDataString, + void* pUserData, + VmaAllocation* pAllocation) +{ + VkDeviceMemory hMemory = VK_NULL_HANDLE; + VkResult res = AllocateVulkanMemory(&allocInfo, &hMemory); + if(res < 0) + { + VMA_DEBUG_LOG(" vkAllocateMemory FAILED"); + return res; + } + + void* pMappedData = VMA_NULL; + if(map) + { + res = (*m_VulkanFunctions.vkMapMemory)( + m_hDevice, + hMemory, + 0, + VK_WHOLE_SIZE, + 0, + &pMappedData); + if(res < 0) + { + VMA_DEBUG_LOG(" vkMapMemory FAILED"); + FreeVulkanMemory(memTypeIndex, size, hMemory); + return res; + } + } + + *pAllocation = 
m_AllocationObjectAllocator.Allocate(m_CurrentFrameIndex.load(), isUserDataString); + (*pAllocation)->InitDedicatedAllocation(memTypeIndex, hMemory, suballocType, pMappedData, size); + (*pAllocation)->SetUserData(this, pUserData); + m_Budget.AddAllocation(MemoryTypeIndexToHeapIndex(memTypeIndex), size); + if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) + { + FillAllocation(*pAllocation, VMA_ALLOCATION_FILL_PATTERN_CREATED); + } + + return VK_SUCCESS; +} + +void VmaAllocator_T::GetBufferMemoryRequirements( + VkBuffer hBuffer, + VkMemoryRequirements& memReq, + bool& requiresDedicatedAllocation, + bool& prefersDedicatedAllocation) const +{ +#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + if(m_UseKhrDedicatedAllocation || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + VkBufferMemoryRequirementsInfo2KHR memReqInfo = { VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR }; + memReqInfo.buffer = hBuffer; + + VkMemoryDedicatedRequirementsKHR memDedicatedReq = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR }; + + VkMemoryRequirements2KHR memReq2 = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR }; + VmaPnextChainPushFront(&memReq2, &memDedicatedReq); + + (*m_VulkanFunctions.vkGetBufferMemoryRequirements2KHR)(m_hDevice, &memReqInfo, &memReq2); + + memReq = memReq2.memoryRequirements; + requiresDedicatedAllocation = (memDedicatedReq.requiresDedicatedAllocation != VK_FALSE); + prefersDedicatedAllocation = (memDedicatedReq.prefersDedicatedAllocation != VK_FALSE); + } + else +#endif // #if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + { + (*m_VulkanFunctions.vkGetBufferMemoryRequirements)(m_hDevice, hBuffer, &memReq); + requiresDedicatedAllocation = false; + prefersDedicatedAllocation = false; + } +} + +void VmaAllocator_T::GetImageMemoryRequirements( + VkImage hImage, + VkMemoryRequirements& memReq, + bool& requiresDedicatedAllocation, + bool& prefersDedicatedAllocation) const +{ +#if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + 
if(m_UseKhrDedicatedAllocation || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) + { + VkImageMemoryRequirementsInfo2KHR memReqInfo = { VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR }; + memReqInfo.image = hImage; + + VkMemoryDedicatedRequirementsKHR memDedicatedReq = { VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR }; + + VkMemoryRequirements2KHR memReq2 = { VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR }; + VmaPnextChainPushFront(&memReq2, &memDedicatedReq); + + (*m_VulkanFunctions.vkGetImageMemoryRequirements2KHR)(m_hDevice, &memReqInfo, &memReq2); + + memReq = memReq2.memoryRequirements; + requiresDedicatedAllocation = (memDedicatedReq.requiresDedicatedAllocation != VK_FALSE); + prefersDedicatedAllocation = (memDedicatedReq.prefersDedicatedAllocation != VK_FALSE); + } + else +#endif // #if VMA_DEDICATED_ALLOCATION || VMA_VULKAN_VERSION >= 1001000 + { + (*m_VulkanFunctions.vkGetImageMemoryRequirements)(m_hDevice, hImage, &memReq); + requiresDedicatedAllocation = false; + prefersDedicatedAllocation = false; + } +} + +VkResult VmaAllocator_T::AllocateMemory( + const VkMemoryRequirements& vkMemReq, + bool requiresDedicatedAllocation, + bool prefersDedicatedAllocation, + VkBuffer dedicatedBuffer, + VkBufferUsageFlags dedicatedBufferUsage, + VkImage dedicatedImage, + const VmaAllocationCreateInfo& createInfo, + VmaSuballocationType suballocType, + size_t allocationCount, + VmaAllocation* pAllocations) +{ + memset(pAllocations, 0, sizeof(VmaAllocation) * allocationCount); + + VMA_ASSERT(VmaIsPow2(vkMemReq.alignment)); + + if(vkMemReq.size == 0) + { + return VK_ERROR_VALIDATION_FAILED_EXT; + } + if((createInfo.flags & VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT) != 0 && + (createInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) + { + VMA_ASSERT(0 && "Specifying VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT together with VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT makes no sense."); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + if((createInfo.flags & 
VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0 && + (createInfo.flags & VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT) != 0) + { + VMA_ASSERT(0 && "Specifying VMA_ALLOCATION_CREATE_MAPPED_BIT together with VMA_ALLOCATION_CREATE_CAN_BECOME_LOST_BIT is invalid."); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + if(requiresDedicatedAllocation) + { + if((createInfo.flags & VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT) != 0) + { + VMA_ASSERT(0 && "VMA_ALLOCATION_CREATE_NEVER_ALLOCATE_BIT specified while dedicated allocation is required."); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + if(createInfo.pool != VK_NULL_HANDLE) + { + VMA_ASSERT(0 && "Pool specified while dedicated allocation is required."); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + } + if((createInfo.pool != VK_NULL_HANDLE) && + ((createInfo.flags & (VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT)) != 0)) + { + VMA_ASSERT(0 && "Specifying VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT when pool != null is invalid."); + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + + if(createInfo.pool != VK_NULL_HANDLE) + { + const VkDeviceSize alignmentForPool = VMA_MAX( + vkMemReq.alignment, + GetMemoryTypeMinAlignment(createInfo.pool->m_BlockVector.GetMemoryTypeIndex())); + + VmaAllocationCreateInfo createInfoForPool = createInfo; + // If memory type is not HOST_VISIBLE, disable MAPPED. + if((createInfoForPool.flags & VMA_ALLOCATION_CREATE_MAPPED_BIT) != 0 && + (m_MemProps.memoryTypes[createInfo.pool->m_BlockVector.GetMemoryTypeIndex()].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) + { + createInfoForPool.flags &= ~VMA_ALLOCATION_CREATE_MAPPED_BIT; + } + + return createInfo.pool->m_BlockVector.Allocate( + m_CurrentFrameIndex.load(), + vkMemReq.size, + alignmentForPool, + createInfoForPool, + suballocType, + allocationCount, + pAllocations); + } + else + { + // Bit mask of memory Vulkan types acceptable for this allocation. 
+ uint32_t memoryTypeBits = vkMemReq.memoryTypeBits; + uint32_t memTypeIndex = UINT32_MAX; + VkResult res = vmaFindMemoryTypeIndex(this, memoryTypeBits, &createInfo, &memTypeIndex); + if(res == VK_SUCCESS) + { + VkDeviceSize alignmentForMemType = VMA_MAX( + vkMemReq.alignment, + GetMemoryTypeMinAlignment(memTypeIndex)); + + res = AllocateMemoryOfType( + vkMemReq.size, + alignmentForMemType, + requiresDedicatedAllocation || prefersDedicatedAllocation, + dedicatedBuffer, + dedicatedBufferUsage, + dedicatedImage, + createInfo, + memTypeIndex, + suballocType, + allocationCount, + pAllocations); + // Succeeded on first try. + if(res == VK_SUCCESS) + { + return res; + } + // Allocation from this memory type failed. Try other compatible memory types. + else + { + for(;;) + { + // Remove old memTypeIndex from list of possibilities. + memoryTypeBits &= ~(1u << memTypeIndex); + // Find alternative memTypeIndex. + res = vmaFindMemoryTypeIndex(this, memoryTypeBits, &createInfo, &memTypeIndex); + if(res == VK_SUCCESS) + { + alignmentForMemType = VMA_MAX( + vkMemReq.alignment, + GetMemoryTypeMinAlignment(memTypeIndex)); + + res = AllocateMemoryOfType( + vkMemReq.size, + alignmentForMemType, + requiresDedicatedAllocation || prefersDedicatedAllocation, + dedicatedBuffer, + dedicatedBufferUsage, + dedicatedImage, + createInfo, + memTypeIndex, + suballocType, + allocationCount, + pAllocations); + // Allocation from this alternative memory type succeeded. + if(res == VK_SUCCESS) + { + return res; + } + // else: Allocation from this memory type failed. Try next one - next loop iteration. + } + // No other matching memory type index could be found. + else + { + // Not returning res, which is VK_ERROR_FEATURE_NOT_PRESENT, because we already failed to allocate once. + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + } + } + } + // Can't find any single memory type maching requirements. res is VK_ERROR_FEATURE_NOT_PRESENT. 
+ else + return res; + } +} + +void VmaAllocator_T::FreeMemory( + size_t allocationCount, + const VmaAllocation* pAllocations) +{ + VMA_ASSERT(pAllocations); + + for(size_t allocIndex = allocationCount; allocIndex--; ) + { + VmaAllocation allocation = pAllocations[allocIndex]; + + if(allocation != VK_NULL_HANDLE) + { + if(TouchAllocation(allocation)) + { + if(VMA_DEBUG_INITIALIZE_ALLOCATIONS) + { + FillAllocation(allocation, VMA_ALLOCATION_FILL_PATTERN_DESTROYED); + } + + switch(allocation->GetType()) + { + case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: + { + VmaBlockVector* pBlockVector = VMA_NULL; + VmaPool hPool = allocation->GetBlock()->GetParentPool(); + if(hPool != VK_NULL_HANDLE) + { + pBlockVector = &hPool->m_BlockVector; + } + else + { + const uint32_t memTypeIndex = allocation->GetMemoryTypeIndex(); + pBlockVector = m_pBlockVectors[memTypeIndex]; + } + pBlockVector->Free(allocation); + } + break; + case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: + FreeDedicatedMemory(allocation); + break; + default: + VMA_ASSERT(0); + } + } + + // Do this regardless of whether the allocation is lost. Lost allocations still account to Budget.AllocationBytes. + m_Budget.RemoveAllocation(MemoryTypeIndexToHeapIndex(allocation->GetMemoryTypeIndex()), allocation->GetSize()); + allocation->SetUserData(this, VMA_NULL); + m_AllocationObjectAllocator.Free(allocation); + } + } +} + +void VmaAllocator_T::CalculateStats(VmaStats* pStats) +{ + // Initialize. + InitStatInfo(pStats->total); + for (const auto i : c10::irange(VK_MAX_MEMORY_TYPES))InitStatInfo(pStats->memoryType[i]); + for (const auto i : c10::irange(VK_MAX_MEMORY_HEAPS))InitStatInfo(pStats->memoryHeap[i]); + + // Process default pools. + for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex]; + VMA_ASSERT(pBlockVector); + pBlockVector->AddStats(pStats); + } + + // Process custom pools. 
+ { + VmaMutexLockRead lock(m_PoolsMutex, m_UseMutex); + for(size_t poolIndex = 0, poolCount = m_Pools.size(); poolIndex < poolCount; ++poolIndex) + { + m_Pools[poolIndex]->m_BlockVector.AddStats(pStats); + } + } + + // Process dedicated allocations. + for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + const uint32_t memHeapIndex = MemoryTypeIndexToHeapIndex(memTypeIndex); + VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); + AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex]; + VMA_ASSERT(pDedicatedAllocVector); + for(size_t allocIndex = 0, allocCount = pDedicatedAllocVector->size(); allocIndex < allocCount; ++allocIndex) + { + VmaStatInfo allocationStatInfo; + (*pDedicatedAllocVector)[allocIndex]->DedicatedAllocCalcStatsInfo(allocationStatInfo); + VmaAddStatInfo(pStats->total, allocationStatInfo); + VmaAddStatInfo(pStats->memoryType[memTypeIndex], allocationStatInfo); + VmaAddStatInfo(pStats->memoryHeap[memHeapIndex], allocationStatInfo); + } + } + + // Postprocess. 
+ VmaPostprocessCalcStatInfo(pStats->total); + for (const auto i : c10::irange(GetMemoryTypeCount()))VmaPostprocessCalcStatInfo(pStats->memoryType[i]); + for (const auto i : c10::irange(GetMemoryHeapCount()))VmaPostprocessCalcStatInfo(pStats->memoryHeap[i]); +} + +void VmaAllocator_T::GetBudget(VmaBudget* outBudget, uint32_t firstHeap, uint32_t heapCount) +{ +#if VMA_MEMORY_BUDGET + if(m_UseExtMemoryBudget) + { + if(m_Budget.m_OperationsSinceBudgetFetch < 30) + { + VmaMutexLockRead lockRead(m_Budget.m_BudgetMutex, m_UseMutex); + for(uint32_t i = 0; i < heapCount; ++i, ++outBudget) + { + const uint32_t heapIndex = firstHeap + i; + + outBudget->blockBytes = m_Budget.m_BlockBytes[heapIndex]; + outBudget->allocationBytes = m_Budget.m_AllocationBytes[heapIndex]; + + if(m_Budget.m_VulkanUsage[heapIndex] + outBudget->blockBytes > m_Budget.m_BlockBytesAtBudgetFetch[heapIndex]) + { + outBudget->usage = m_Budget.m_VulkanUsage[heapIndex] + + outBudget->blockBytes - m_Budget.m_BlockBytesAtBudgetFetch[heapIndex]; + } + else + { + outBudget->usage = 0; + } + + // Have to take MIN with heap size because explicit HeapSizeLimit is included in it. + outBudget->budget = VMA_MIN( + m_Budget.m_VulkanBudget[heapIndex], m_MemProps.memoryHeaps[heapIndex].size); + } + } + else + { + UpdateVulkanBudget(); // Outside of mutex lock + GetBudget(outBudget, firstHeap, heapCount); // Recursion + } + } + else +#endif + { + for(uint32_t i = 0; i < heapCount; ++i, ++outBudget) + { + const uint32_t heapIndex = firstHeap + i; + + outBudget->blockBytes = m_Budget.m_BlockBytes[heapIndex]; + outBudget->allocationBytes = m_Budget.m_AllocationBytes[heapIndex]; + + outBudget->usage = outBudget->blockBytes; + outBudget->budget = m_MemProps.memoryHeaps[heapIndex].size * 8 / 10; // 80% heuristics. 
+ } + } +} + +static const uint32_t VMA_VENDOR_ID_AMD = 4098; + +VkResult VmaAllocator_T::DefragmentationBegin( + const VmaDefragmentationInfo2& info, + VmaDefragmentationStats* pStats, + VmaDefragmentationContext* pContext) +{ + if(info.pAllocationsChanged != VMA_NULL) + { + memset(info.pAllocationsChanged, 0, info.allocationCount * sizeof(VkBool32)); + } + + *pContext = vma_new(this, VmaDefragmentationContext_T)( + this, m_CurrentFrameIndex.load(), info.flags, pStats); + + (*pContext)->AddPools(info.poolCount, info.pPools); + (*pContext)->AddAllocations( + info.allocationCount, info.pAllocations, info.pAllocationsChanged); + + VkResult res = (*pContext)->Defragment( + info.maxCpuBytesToMove, info.maxCpuAllocationsToMove, + info.maxGpuBytesToMove, info.maxGpuAllocationsToMove, + info.commandBuffer, pStats, info.flags); + + if(res != VK_NOT_READY) + { + vma_delete(this, *pContext); + *pContext = VMA_NULL; + } + + return res; +} + +VkResult VmaAllocator_T::DefragmentationEnd( + VmaDefragmentationContext context) +{ + vma_delete(this, context); + return VK_SUCCESS; +} + +VkResult VmaAllocator_T::DefragmentationPassBegin( + VmaDefragmentationPassInfo* pInfo, + VmaDefragmentationContext context) +{ + return context->DefragmentPassBegin(pInfo); +} +VkResult VmaAllocator_T::DefragmentationPassEnd( + VmaDefragmentationContext context) +{ + return context->DefragmentPassEnd(); + +} + +void VmaAllocator_T::GetAllocationInfo(VmaAllocation hAllocation, VmaAllocationInfo* pAllocationInfo) +{ + if(hAllocation->CanBecomeLost()) + { + /* + Warning: This is a carefully designed algorithm. 
+ Do not modify unless you really know what you're doing :) + */ + const uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); + uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); + for(;;) + { + if(localLastUseFrameIndex == VMA_FRAME_INDEX_LOST) + { + pAllocationInfo->memoryType = UINT32_MAX; + pAllocationInfo->deviceMemory = VK_NULL_HANDLE; + pAllocationInfo->offset = 0; + pAllocationInfo->size = hAllocation->GetSize(); + pAllocationInfo->pMappedData = VMA_NULL; + pAllocationInfo->pUserData = hAllocation->GetUserData(); + return; + } + else if(localLastUseFrameIndex == localCurrFrameIndex) + { + pAllocationInfo->memoryType = hAllocation->GetMemoryTypeIndex(); + pAllocationInfo->deviceMemory = hAllocation->GetMemory(); + pAllocationInfo->offset = hAllocation->GetOffset(); + pAllocationInfo->size = hAllocation->GetSize(); + pAllocationInfo->pMappedData = VMA_NULL; + pAllocationInfo->pUserData = hAllocation->GetUserData(); + return; + } + else // Last use time earlier than current time. + { + if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) + { + localLastUseFrameIndex = localCurrFrameIndex; + } + } + } + } + else + { +#if VMA_STATS_STRING_ENABLED + uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); + uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); + for(;;) + { + VMA_ASSERT(localLastUseFrameIndex != VMA_FRAME_INDEX_LOST); + if(localLastUseFrameIndex == localCurrFrameIndex) + { + break; + } + else // Last use time earlier than current time. 
+ { + if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) + { + localLastUseFrameIndex = localCurrFrameIndex; + } + } + } +#endif + + pAllocationInfo->memoryType = hAllocation->GetMemoryTypeIndex(); + pAllocationInfo->deviceMemory = hAllocation->GetMemory(); + pAllocationInfo->offset = hAllocation->GetOffset(); + pAllocationInfo->size = hAllocation->GetSize(); + pAllocationInfo->pMappedData = hAllocation->GetMappedData(); + pAllocationInfo->pUserData = hAllocation->GetUserData(); + } +} + +bool VmaAllocator_T::TouchAllocation(VmaAllocation hAllocation) +{ + // This is a stripped-down version of VmaAllocator_T::GetAllocationInfo. + if(hAllocation->CanBecomeLost()) + { + uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); + uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); + for(;;) + { + if(localLastUseFrameIndex == VMA_FRAME_INDEX_LOST) + { + return false; + } + else if(localLastUseFrameIndex == localCurrFrameIndex) + { + return true; + } + else // Last use time earlier than current time. + { + if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) + { + localLastUseFrameIndex = localCurrFrameIndex; + } + } + } + } + else + { +#if VMA_STATS_STRING_ENABLED + uint32_t localCurrFrameIndex = m_CurrentFrameIndex.load(); + uint32_t localLastUseFrameIndex = hAllocation->GetLastUseFrameIndex(); + for(;;) + { + VMA_ASSERT(localLastUseFrameIndex != VMA_FRAME_INDEX_LOST); + if(localLastUseFrameIndex == localCurrFrameIndex) + { + break; + } + else // Last use time earlier than current time. 
+ { + if(hAllocation->CompareExchangeLastUseFrameIndex(localLastUseFrameIndex, localCurrFrameIndex)) + { + localLastUseFrameIndex = localCurrFrameIndex; + } + } + } +#endif + + return true; + } +} + +VkResult VmaAllocator_T::CreatePool(const VmaPoolCreateInfo* pCreateInfo, VmaPool* pPool) +{ + VMA_DEBUG_LOG(" CreatePool: MemoryTypeIndex=%u, flags=%u", pCreateInfo->memoryTypeIndex, pCreateInfo->flags); + + VmaPoolCreateInfo newCreateInfo = *pCreateInfo; + + if(newCreateInfo.maxBlockCount == 0) + { + newCreateInfo.maxBlockCount = SIZE_MAX; + } + if(newCreateInfo.minBlockCount > newCreateInfo.maxBlockCount) + { + return VK_ERROR_INITIALIZATION_FAILED; + } + // Memory type index out of range or forbidden. + if(pCreateInfo->memoryTypeIndex >= GetMemoryTypeCount() || + ((1u << pCreateInfo->memoryTypeIndex) & m_GlobalMemoryTypeBits) == 0) + { + return VK_ERROR_FEATURE_NOT_PRESENT; + } + + const VkDeviceSize preferredBlockSize = CalcPreferredBlockSize(newCreateInfo.memoryTypeIndex); + + *pPool = vma_new(this, VmaPool_T)(this, newCreateInfo, preferredBlockSize); + + VkResult res = (*pPool)->m_BlockVector.CreateMinBlocks(); + if(res != VK_SUCCESS) + { + vma_delete(this, *pPool); + *pPool = VMA_NULL; + return res; + } + + // Add to m_Pools. + { + VmaMutexLockWrite lock(m_PoolsMutex, m_UseMutex); + (*pPool)->SetId(m_NextPoolId++); + VmaVectorInsertSorted(m_Pools, *pPool); + } + + return VK_SUCCESS; +} + +void VmaAllocator_T::DestroyPool(VmaPool pool) +{ + // Remove from m_Pools. 
+ { + VmaMutexLockWrite lock(m_PoolsMutex, m_UseMutex); + bool success = VmaVectorRemoveSorted(m_Pools, pool); + VMA_ASSERT(success && "Pool not found in Allocator."); + } + + vma_delete(this, pool); +} + +void VmaAllocator_T::GetPoolStats(VmaPool pool, VmaPoolStats* pPoolStats) +{ + pool->m_BlockVector.GetPoolStats(pPoolStats); +} + +void VmaAllocator_T::SetCurrentFrameIndex(uint32_t frameIndex) +{ + m_CurrentFrameIndex.store(frameIndex); + +#if VMA_MEMORY_BUDGET + if(m_UseExtMemoryBudget) + { + UpdateVulkanBudget(); + } +#endif // #if VMA_MEMORY_BUDGET +} + +void VmaAllocator_T::MakePoolAllocationsLost( + VmaPool hPool, + size_t* pLostAllocationCount) +{ + hPool->m_BlockVector.MakePoolAllocationsLost( + m_CurrentFrameIndex.load(), + pLostAllocationCount); +} + +VkResult VmaAllocator_T::CheckPoolCorruption(VmaPool hPool) +{ + return hPool->m_BlockVector.CheckCorruption(); +} + +VkResult VmaAllocator_T::CheckCorruption(uint32_t memoryTypeBits) +{ + VkResult finalRes = VK_ERROR_FEATURE_NOT_PRESENT; + + // Process default pools. + for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + if(((1u << memTypeIndex) & memoryTypeBits) != 0) + { + VmaBlockVector* const pBlockVector = m_pBlockVectors[memTypeIndex]; + VMA_ASSERT(pBlockVector); + VkResult localRes = pBlockVector->CheckCorruption(); + switch(localRes) + { + case VK_ERROR_FEATURE_NOT_PRESENT: + break; + case VK_SUCCESS: + finalRes = VK_SUCCESS; + break; + default: + return localRes; + } + } + } + + // Process custom pools. 
+ { + VmaMutexLockRead lock(m_PoolsMutex, m_UseMutex); + for(size_t poolIndex = 0, poolCount = m_Pools.size(); poolIndex < poolCount; ++poolIndex) + { + if(((1u << m_Pools[poolIndex]->m_BlockVector.GetMemoryTypeIndex()) & memoryTypeBits) != 0) + { + VkResult localRes = m_Pools[poolIndex]->m_BlockVector.CheckCorruption(); + switch(localRes) + { + case VK_ERROR_FEATURE_NOT_PRESENT: + break; + case VK_SUCCESS: + finalRes = VK_SUCCESS; + break; + default: + return localRes; + } + } + } + } + + return finalRes; +} + +void VmaAllocator_T::CreateLostAllocation(VmaAllocation* pAllocation) +{ + *pAllocation = m_AllocationObjectAllocator.Allocate(VMA_FRAME_INDEX_LOST, false); + (*pAllocation)->InitLost(); +} + +// An object that increments given atomic but decrements it back in the destructor unless Commit() is called. +template +struct AtomicTransactionalIncrement +{ +public: + typedef std::atomic AtomicT; + ~AtomicTransactionalIncrement() + { + if(m_Atomic) + --(*m_Atomic); + } + T Increment(AtomicT* atomic) + { + m_Atomic = atomic; + return m_Atomic->fetch_add(1); + } + void Commit() + { + m_Atomic = nullptr; + } + +private: + AtomicT* m_Atomic = nullptr; +}; + +VkResult VmaAllocator_T::AllocateVulkanMemory(const VkMemoryAllocateInfo* pAllocateInfo, VkDeviceMemory* pMemory) +{ + AtomicTransactionalIncrement deviceMemoryCountIncrement; + const uint64_t prevDeviceMemoryCount = deviceMemoryCountIncrement.Increment(&m_DeviceMemoryCount); +#if VMA_DEBUG_DONT_EXCEED_MAX_MEMORY_ALLOCATION_COUNT + if(prevDeviceMemoryCount >= m_PhysicalDeviceProperties.limits.maxMemoryAllocationCount) + { + return VK_ERROR_TOO_MANY_OBJECTS; + } +#endif + + const uint32_t heapIndex = MemoryTypeIndexToHeapIndex(pAllocateInfo->memoryTypeIndex); + + // HeapSizeLimit is in effect for this heap. 
+ if((m_HeapSizeLimitMask & (1u << heapIndex)) != 0) + { + const VkDeviceSize heapSize = m_MemProps.memoryHeaps[heapIndex].size; + VkDeviceSize blockBytes = m_Budget.m_BlockBytes[heapIndex]; + for(;;) + { + const VkDeviceSize blockBytesAfterAllocation = blockBytes + pAllocateInfo->allocationSize; + if(blockBytesAfterAllocation > heapSize) + { + return VK_ERROR_OUT_OF_DEVICE_MEMORY; + } + if(m_Budget.m_BlockBytes[heapIndex].compare_exchange_strong(blockBytes, blockBytesAfterAllocation)) + { + break; + } + } + } + else + { + m_Budget.m_BlockBytes[heapIndex] += pAllocateInfo->allocationSize; + } + + // VULKAN CALL vkAllocateMemory. + VkResult res = (*m_VulkanFunctions.vkAllocateMemory)(m_hDevice, pAllocateInfo, GetAllocationCallbacks(), pMemory); + + if(res == VK_SUCCESS) + { +#if VMA_MEMORY_BUDGET + ++m_Budget.m_OperationsSinceBudgetFetch; +#endif + + // Informative callback. + if(m_DeviceMemoryCallbacks.pfnAllocate != VMA_NULL) + { + (*m_DeviceMemoryCallbacks.pfnAllocate)(this, pAllocateInfo->memoryTypeIndex, *pMemory, pAllocateInfo->allocationSize, m_DeviceMemoryCallbacks.pUserData); + } + + deviceMemoryCountIncrement.Commit(); + } + else + { + m_Budget.m_BlockBytes[heapIndex] -= pAllocateInfo->allocationSize; + } + + return res; +} + +void VmaAllocator_T::FreeVulkanMemory(uint32_t memoryType, VkDeviceSize size, VkDeviceMemory hMemory) +{ + // Informative callback. + if(m_DeviceMemoryCallbacks.pfnFree != VMA_NULL) + { + (*m_DeviceMemoryCallbacks.pfnFree)(this, memoryType, hMemory, size, m_DeviceMemoryCallbacks.pUserData); + } + + // VULKAN CALL vkFreeMemory. 
+ (*m_VulkanFunctions.vkFreeMemory)(m_hDevice, hMemory, GetAllocationCallbacks()); + + m_Budget.m_BlockBytes[MemoryTypeIndexToHeapIndex(memoryType)] -= size; + + --m_DeviceMemoryCount; +} + +VkResult VmaAllocator_T::BindVulkanBuffer( + VkDeviceMemory memory, + VkDeviceSize memoryOffset, + VkBuffer buffer, + const void* pNext) +{ + if(pNext != VMA_NULL) + { +#if VMA_VULKAN_VERSION >= 1001000 || VMA_BIND_MEMORY2 + if((m_UseKhrBindMemory2 || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) && + m_VulkanFunctions.vkBindBufferMemory2KHR != VMA_NULL) + { + VkBindBufferMemoryInfoKHR bindBufferMemoryInfo = { VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO_KHR }; + bindBufferMemoryInfo.pNext = pNext; + bindBufferMemoryInfo.buffer = buffer; + bindBufferMemoryInfo.memory = memory; + bindBufferMemoryInfo.memoryOffset = memoryOffset; + return (*m_VulkanFunctions.vkBindBufferMemory2KHR)(m_hDevice, 1, &bindBufferMemoryInfo); + } + else +#endif // #if VMA_VULKAN_VERSION >= 1001000 || VMA_BIND_MEMORY2 + { + return VK_ERROR_EXTENSION_NOT_PRESENT; + } + } + else + { + return (*m_VulkanFunctions.vkBindBufferMemory)(m_hDevice, buffer, memory, memoryOffset); + } +} + +VkResult VmaAllocator_T::BindVulkanImage( + VkDeviceMemory memory, + VkDeviceSize memoryOffset, + VkImage image, + const void* pNext) +{ + if(pNext != VMA_NULL) + { +#if VMA_VULKAN_VERSION >= 1001000 || VMA_BIND_MEMORY2 + if((m_UseKhrBindMemory2 || m_VulkanApiVersion >= VK_MAKE_VERSION(1, 1, 0)) && + m_VulkanFunctions.vkBindImageMemory2KHR != VMA_NULL) + { + VkBindImageMemoryInfoKHR bindBufferMemoryInfo = { VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO_KHR }; + bindBufferMemoryInfo.pNext = pNext; + bindBufferMemoryInfo.image = image; + bindBufferMemoryInfo.memory = memory; + bindBufferMemoryInfo.memoryOffset = memoryOffset; + return (*m_VulkanFunctions.vkBindImageMemory2KHR)(m_hDevice, 1, &bindBufferMemoryInfo); + } + else +#endif // #if VMA_BIND_MEMORY2 + { + return VK_ERROR_EXTENSION_NOT_PRESENT; + } + } + else + { + return 
(*m_VulkanFunctions.vkBindImageMemory)(m_hDevice, image, memory, memoryOffset); + } +} + +VkResult VmaAllocator_T::Map(VmaAllocation hAllocation, void** ppData) +{ + if(hAllocation->CanBecomeLost()) + { + return VK_ERROR_MEMORY_MAP_FAILED; + } + + switch(hAllocation->GetType()) + { + case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: + { + VmaDeviceMemoryBlock* const pBlock = hAllocation->GetBlock(); + char *pBytes = VMA_NULL; + VkResult res = pBlock->Map(this, 1, (void**)&pBytes); + if(res == VK_SUCCESS) + { + *ppData = pBytes + (ptrdiff_t)hAllocation->GetOffset(); + hAllocation->BlockAllocMap(); + } + return res; + } + case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: + return hAllocation->DedicatedAllocMap(this, ppData); + default: + VMA_ASSERT(0); + return VK_ERROR_MEMORY_MAP_FAILED; + } +} + +void VmaAllocator_T::Unmap(VmaAllocation hAllocation) +{ + switch(hAllocation->GetType()) + { + case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: + { + VmaDeviceMemoryBlock* const pBlock = hAllocation->GetBlock(); + hAllocation->BlockAllocUnmap(); + pBlock->Unmap(this, 1); + } + break; + case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: + hAllocation->DedicatedAllocUnmap(this); + break; + default: + VMA_ASSERT(0); + } +} + +VkResult VmaAllocator_T::BindBufferMemory( + VmaAllocation hAllocation, + VkDeviceSize allocationLocalOffset, + VkBuffer hBuffer, + const void* pNext) +{ + VkResult res = VK_SUCCESS; + switch(hAllocation->GetType()) + { + case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: + res = BindVulkanBuffer(hAllocation->GetMemory(), allocationLocalOffset, hBuffer, pNext); + break; + case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: + { + VmaDeviceMemoryBlock* const pBlock = hAllocation->GetBlock(); + VMA_ASSERT(pBlock && "Binding buffer to allocation that doesn't belong to any block. 
Is the allocation lost?"); + res = pBlock->BindBufferMemory(this, hAllocation, allocationLocalOffset, hBuffer, pNext); + break; + } + default: + VMA_ASSERT(0); + } + return res; +} + +VkResult VmaAllocator_T::BindImageMemory( + VmaAllocation hAllocation, + VkDeviceSize allocationLocalOffset, + VkImage hImage, + const void* pNext) +{ + VkResult res = VK_SUCCESS; + switch(hAllocation->GetType()) + { + case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: + res = BindVulkanImage(hAllocation->GetMemory(), allocationLocalOffset, hImage, pNext); + break; + case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: + { + VmaDeviceMemoryBlock* pBlock = hAllocation->GetBlock(); + VMA_ASSERT(pBlock && "Binding image to allocation that doesn't belong to any block. Is the allocation lost?"); + res = pBlock->BindImageMemory(this, hAllocation, allocationLocalOffset, hImage, pNext); + break; + } + default: + VMA_ASSERT(0); + } + return res; +} + +VkResult VmaAllocator_T::FlushOrInvalidateAllocation( + VmaAllocation hAllocation, + VkDeviceSize offset, VkDeviceSize size, + VMA_CACHE_OPERATION op) +{ + VkResult res = VK_SUCCESS; + + VkMappedMemoryRange memRange = {}; + if(GetFlushOrInvalidateRange(hAllocation, offset, size, memRange)) + { + switch(op) + { + case VMA_CACHE_FLUSH: + res = (*GetVulkanFunctions().vkFlushMappedMemoryRanges)(m_hDevice, 1, &memRange); + break; + case VMA_CACHE_INVALIDATE: + res = (*GetVulkanFunctions().vkInvalidateMappedMemoryRanges)(m_hDevice, 1, &memRange); + break; + default: + VMA_ASSERT(0); + } + } + // else: Just ignore this call. 
+ return res; +} + +VkResult VmaAllocator_T::FlushOrInvalidateAllocations( + uint32_t allocationCount, + const VmaAllocation* allocations, + const VkDeviceSize* offsets, const VkDeviceSize* sizes, + VMA_CACHE_OPERATION op) +{ + typedef VmaStlAllocator RangeAllocator; + typedef VmaSmallVector RangeVector; + RangeVector ranges = RangeVector(RangeAllocator(GetAllocationCallbacks())); + + for (const auto allocIndex : c10::irange(allocationCount)) { + const VmaAllocation alloc = allocations[allocIndex]; + const VkDeviceSize offset = offsets != VMA_NULL ? offsets[allocIndex] : 0; + const VkDeviceSize size = sizes != VMA_NULL ? sizes[allocIndex] : VK_WHOLE_SIZE; + VkMappedMemoryRange newRange; + if(GetFlushOrInvalidateRange(alloc, offset, size, newRange)) + { + ranges.push_back(newRange); + } + } + + VkResult res = VK_SUCCESS; + if(!ranges.empty()) + { + switch(op) + { + case VMA_CACHE_FLUSH: + res = (*GetVulkanFunctions().vkFlushMappedMemoryRanges)(m_hDevice, (uint32_t)ranges.size(), ranges.data()); + break; + case VMA_CACHE_INVALIDATE: + res = (*GetVulkanFunctions().vkInvalidateMappedMemoryRanges)(m_hDevice, (uint32_t)ranges.size(), ranges.data()); + break; + default: + VMA_ASSERT(0); + } + } + // else: Just ignore this call. + return res; +} + +void VmaAllocator_T::FreeDedicatedMemory(const VmaAllocation allocation) +{ + VMA_ASSERT(allocation && allocation->GetType() == VmaAllocation_T::ALLOCATION_TYPE_DEDICATED); + + const uint32_t memTypeIndex = allocation->GetMemoryTypeIndex(); + { + VmaMutexLockWrite lock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); + AllocationVectorType* const pDedicatedAllocations = m_pDedicatedAllocations[memTypeIndex]; + VMA_ASSERT(pDedicatedAllocations); + bool success = VmaVectorRemoveSorted(*pDedicatedAllocations, allocation); + VMA_ASSERT(success); + } + + VkDeviceMemory hMemory = allocation->GetMemory(); + + /* + There is no need to call this, because Vulkan spec allows to skip vkUnmapMemory + before vkFreeMemory. 
+ + if(allocation->GetMappedData() != VMA_NULL) + { + (*m_VulkanFunctions.vkUnmapMemory)(m_hDevice, hMemory); + } + */ + + FreeVulkanMemory(memTypeIndex, allocation->GetSize(), hMemory); + + VMA_DEBUG_LOG(" Freed DedicatedMemory MemoryTypeIndex=%u", memTypeIndex); +} + +uint32_t VmaAllocator_T::CalculateGpuDefragmentationMemoryTypeBits() const +{ + VkBufferCreateInfo dummyBufCreateInfo; + VmaFillGpuDefragmentationBufferCreateInfo(dummyBufCreateInfo); + + uint32_t memoryTypeBits = 0; + + // Create buffer. + VkBuffer buf = VK_NULL_HANDLE; + VkResult res = (*GetVulkanFunctions().vkCreateBuffer)( + m_hDevice, &dummyBufCreateInfo, GetAllocationCallbacks(), &buf); + if(res == VK_SUCCESS) + { + // Query for supported memory types. + VkMemoryRequirements memReq; + (*GetVulkanFunctions().vkGetBufferMemoryRequirements)(m_hDevice, buf, &memReq); + memoryTypeBits = memReq.memoryTypeBits; + + // Destroy buffer. + (*GetVulkanFunctions().vkDestroyBuffer)(m_hDevice, buf, GetAllocationCallbacks()); + } + + return memoryTypeBits; +} + +uint32_t VmaAllocator_T::CalculateGlobalMemoryTypeBits() const +{ + // Make sure memory information is already fetched. + VMA_ASSERT(GetMemoryTypeCount() > 0); + + uint32_t memoryTypeBits = UINT32_MAX; + + if(!m_UseAmdDeviceCoherentMemory) + { + // Exclude memory types that have VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD. 
+ for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + if((m_MemProps.memoryTypes[memTypeIndex].propertyFlags & VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY) != 0) + { + memoryTypeBits &= ~(1u << memTypeIndex); + } + } + } + + return memoryTypeBits; +} + +bool VmaAllocator_T::GetFlushOrInvalidateRange( + VmaAllocation allocation, + VkDeviceSize offset, VkDeviceSize size, + VkMappedMemoryRange& outRange) const +{ + const uint32_t memTypeIndex = allocation->GetMemoryTypeIndex(); + if(size > 0 && IsMemoryTypeNonCoherent(memTypeIndex)) + { + const VkDeviceSize nonCoherentAtomSize = m_PhysicalDeviceProperties.limits.nonCoherentAtomSize; + const VkDeviceSize allocationSize = allocation->GetSize(); + VMA_ASSERT(offset <= allocationSize); + + outRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + outRange.pNext = VMA_NULL; + outRange.memory = allocation->GetMemory(); + + switch(allocation->GetType()) + { + case VmaAllocation_T::ALLOCATION_TYPE_DEDICATED: + outRange.offset = VmaAlignDown(offset, nonCoherentAtomSize); + if(size == VK_WHOLE_SIZE) + { + outRange.size = allocationSize - outRange.offset; + } + else + { + VMA_ASSERT(offset + size <= allocationSize); + outRange.size = VMA_MIN( + VmaAlignUp(size + (offset - outRange.offset), nonCoherentAtomSize), + allocationSize - outRange.offset); + } + break; + case VmaAllocation_T::ALLOCATION_TYPE_BLOCK: + { + // 1. Still within this allocation. + outRange.offset = VmaAlignDown(offset, nonCoherentAtomSize); + if(size == VK_WHOLE_SIZE) + { + size = allocationSize - offset; + } + else + { + VMA_ASSERT(offset + size <= allocationSize); + } + outRange.size = VmaAlignUp(size + (offset - outRange.offset), nonCoherentAtomSize); + + // 2. Adjust to whole block. 
+ const VkDeviceSize allocationOffset = allocation->GetOffset(); + VMA_ASSERT(allocationOffset % nonCoherentAtomSize == 0); + const VkDeviceSize blockSize = allocation->GetBlock()->m_pMetadata->GetSize(); + outRange.offset += allocationOffset; + outRange.size = VMA_MIN(outRange.size, blockSize - outRange.offset); + + break; + } + default: + VMA_ASSERT(0); + } + return true; + } + return false; +} + +#if VMA_MEMORY_BUDGET + +void VmaAllocator_T::UpdateVulkanBudget() +{ + VMA_ASSERT(m_UseExtMemoryBudget); + + VkPhysicalDeviceMemoryProperties2KHR memProps = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR }; + + VkPhysicalDeviceMemoryBudgetPropertiesEXT budgetProps = { VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT }; + VmaPnextChainPushFront(&memProps, &budgetProps); + + GetVulkanFunctions().vkGetPhysicalDeviceMemoryProperties2KHR(m_PhysicalDevice, &memProps); + + { + VmaMutexLockWrite lockWrite(m_Budget.m_BudgetMutex, m_UseMutex); + + for (const auto heapIndex : c10::irange(GetMemoryHeapCount())) { + m_Budget.m_VulkanUsage[heapIndex] = budgetProps.heapUsage[heapIndex]; + m_Budget.m_VulkanBudget[heapIndex] = budgetProps.heapBudget[heapIndex]; + m_Budget.m_BlockBytesAtBudgetFetch[heapIndex] = m_Budget.m_BlockBytes[heapIndex].load(); + + // Some bugged drivers return the budget incorrectly, e.g. 0 or much bigger than heap size. + if(m_Budget.m_VulkanBudget[heapIndex] == 0) + { + m_Budget.m_VulkanBudget[heapIndex] = m_MemProps.memoryHeaps[heapIndex].size * 8 / 10; // 80% heuristics. 
+ } + else if(m_Budget.m_VulkanBudget[heapIndex] > m_MemProps.memoryHeaps[heapIndex].size) + { + m_Budget.m_VulkanBudget[heapIndex] = m_MemProps.memoryHeaps[heapIndex].size; + } + if(m_Budget.m_VulkanUsage[heapIndex] == 0 && m_Budget.m_BlockBytesAtBudgetFetch[heapIndex] > 0) + { + m_Budget.m_VulkanUsage[heapIndex] = m_Budget.m_BlockBytesAtBudgetFetch[heapIndex]; + } + } + m_Budget.m_OperationsSinceBudgetFetch = 0; + } +} + +#endif // #if VMA_MEMORY_BUDGET + +void VmaAllocator_T::FillAllocation(const VmaAllocation hAllocation, uint8_t pattern) +{ + if(VMA_DEBUG_INITIALIZE_ALLOCATIONS && + !hAllocation->CanBecomeLost() && + (m_MemProps.memoryTypes[hAllocation->GetMemoryTypeIndex()].propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) + { + void* pData = VMA_NULL; + VkResult res = Map(hAllocation, &pData); + if(res == VK_SUCCESS) + { + memset(pData, (int)pattern, (size_t)hAllocation->GetSize()); + FlushOrInvalidateAllocation(hAllocation, 0, VK_WHOLE_SIZE, VMA_CACHE_FLUSH); + Unmap(hAllocation); + } + else + { + VMA_ASSERT(0 && "VMA_DEBUG_INITIALIZE_ALLOCATIONS is enabled, but couldn't map memory to fill allocation."); + } + } +} + +uint32_t VmaAllocator_T::GetGpuDefragmentationMemoryTypeBits() +{ + uint32_t memoryTypeBits = m_GpuDefragmentationMemoryTypeBits.load(); + if(memoryTypeBits == UINT32_MAX) + { + memoryTypeBits = CalculateGpuDefragmentationMemoryTypeBits(); + m_GpuDefragmentationMemoryTypeBits.store(memoryTypeBits); + } + return memoryTypeBits; +} + +#if VMA_STATS_STRING_ENABLED + +void VmaAllocator_T::PrintDetailedMap(VmaJsonWriter& json) +{ + bool dedicatedAllocationsStarted = false; + for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + VmaMutexLockRead dedicatedAllocationsLock(m_DedicatedAllocationsMutex[memTypeIndex], m_UseMutex); + AllocationVectorType* const pDedicatedAllocVector = m_pDedicatedAllocations[memTypeIndex]; + VMA_ASSERT(pDedicatedAllocVector); + if(pDedicatedAllocVector->empty() == false) + { + 
if(dedicatedAllocationsStarted == false) + { + dedicatedAllocationsStarted = true; + json.WriteString("DedicatedAllocations"); + json.BeginObject(); + } + + json.BeginString("Type "); + json.ContinueString(memTypeIndex); + json.EndString(); + + json.BeginArray(); + + for(size_t i = 0; i < pDedicatedAllocVector->size(); ++i) + { + json.BeginObject(true); + const VmaAllocation hAlloc = (*pDedicatedAllocVector)[i]; + hAlloc->PrintParameters(json); + json.EndObject(); + } + + json.EndArray(); + } + } + if(dedicatedAllocationsStarted) + { + json.EndObject(); + } + + { + bool allocationsStarted = false; + for (const auto memTypeIndex : c10::irange(GetMemoryTypeCount())) { + if(m_pBlockVectors[memTypeIndex]->IsEmpty() == false) + { + if(allocationsStarted == false) + { + allocationsStarted = true; + json.WriteString("DefaultPools"); + json.BeginObject(); + } + + json.BeginString("Type "); + json.ContinueString(memTypeIndex); + json.EndString(); + + m_pBlockVectors[memTypeIndex]->PrintDetailedMap(json); + } + } + if(allocationsStarted) + { + json.EndObject(); + } + } + + // Custom pools + { + VmaMutexLockRead lock(m_PoolsMutex, m_UseMutex); + const size_t poolCount = m_Pools.size(); + if(poolCount > 0) + { + json.WriteString("Pools"); + json.BeginObject(); + for (const auto poolIndex : c10::irange(poolCount)) { + json.BeginString(); + json.ContinueString(m_Pools[poolIndex]->GetId()); + json.EndString(); + + m_Pools[poolIndex]->m_BlockVector.PrintDetailedMap(json); + } + json.EndObject(); + } + } +} + +#endif // #if VMA_STATS_STRING_ENABLED + +//////////////////////////////////////////////////////////////////////////////// +// Public interface + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateAllocator( + const VmaAllocatorCreateInfo* pCreateInfo, + VmaAllocator* pAllocator) +{ + VMA_ASSERT(pCreateInfo && pAllocator); + VMA_ASSERT(pCreateInfo->vulkanApiVersion == 0 || + (VK_VERSION_MAJOR(pCreateInfo->vulkanApiVersion) == 1 && VK_VERSION_MINOR(pCreateInfo->vulkanApiVersion) <= 
2)); + VMA_DEBUG_LOG("vmaCreateAllocator"); + *pAllocator = vma_new(pCreateInfo->pAllocationCallbacks, VmaAllocator_T)(pCreateInfo); + return (*pAllocator)->Init(pCreateInfo); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyAllocator( + VmaAllocator allocator) +{ + if(allocator != VK_NULL_HANDLE) + { + VMA_DEBUG_LOG("vmaDestroyAllocator"); + VkAllocationCallbacks allocationCallbacks = allocator->m_AllocationCallbacks; + vma_delete(&allocationCallbacks, allocator); + } +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocatorInfo(VmaAllocator allocator, VmaAllocatorInfo* pAllocatorInfo) +{ + VMA_ASSERT(allocator && pAllocatorInfo); + pAllocatorInfo->instance = allocator->m_hInstance; + pAllocatorInfo->physicalDevice = allocator->GetPhysicalDevice(); + pAllocatorInfo->device = allocator->m_hDevice; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetPhysicalDeviceProperties( + VmaAllocator allocator, + const VkPhysicalDeviceProperties **ppPhysicalDeviceProperties) +{ + VMA_ASSERT(allocator && ppPhysicalDeviceProperties); + *ppPhysicalDeviceProperties = &allocator->m_PhysicalDeviceProperties; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryProperties( + VmaAllocator allocator, + const VkPhysicalDeviceMemoryProperties** ppPhysicalDeviceMemoryProperties) +{ + VMA_ASSERT(allocator && ppPhysicalDeviceMemoryProperties); + *ppPhysicalDeviceMemoryProperties = &allocator->m_MemProps; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetMemoryTypeProperties( + VmaAllocator allocator, + uint32_t memoryTypeIndex, + VkMemoryPropertyFlags* pFlags) +{ + VMA_ASSERT(allocator && pFlags); + VMA_ASSERT(memoryTypeIndex < allocator->GetMemoryTypeCount()); + *pFlags = allocator->m_MemProps.memoryTypes[memoryTypeIndex].propertyFlags; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaSetCurrentFrameIndex( + VmaAllocator allocator, + uint32_t frameIndex) +{ + VMA_ASSERT(allocator); + VMA_ASSERT(frameIndex != VMA_FRAME_INDEX_LOST); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + allocator->SetCurrentFrameIndex(frameIndex); +} + 
+VMA_CALL_PRE void VMA_CALL_POST vmaCalculateStats( + VmaAllocator allocator, + VmaStats* pStats) +{ + VMA_ASSERT(allocator && pStats); + VMA_DEBUG_GLOBAL_MUTEX_LOCK + allocator->CalculateStats(pStats); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetBudget( + VmaAllocator allocator, + VmaBudget* pBudget) +{ + VMA_ASSERT(allocator && pBudget); + VMA_DEBUG_GLOBAL_MUTEX_LOCK + allocator->GetBudget(pBudget, 0, allocator->GetMemoryHeapCount()); +} + +#if VMA_STATS_STRING_ENABLED + +VMA_CALL_PRE void VMA_CALL_POST vmaBuildStatsString( + VmaAllocator allocator, + char** ppStatsString, + VkBool32 detailedMap) +{ + VMA_ASSERT(allocator && ppStatsString); + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VmaStringBuilder sb(allocator); + { + VmaJsonWriter json(allocator->GetAllocationCallbacks(), sb); + json.BeginObject(); + + VmaBudget budget[VK_MAX_MEMORY_HEAPS]; + allocator->GetBudget(budget, 0, allocator->GetMemoryHeapCount()); + + VmaStats stats; + allocator->CalculateStats(&stats); + + json.WriteString("Total"); + VmaPrintStatInfo(json, stats.total); + + for(uint32_t heapIndex = 0; heapIndex < allocator->GetMemoryHeapCount(); ++heapIndex) + { + json.BeginString("Heap "); + json.ContinueString(heapIndex); + json.EndString(); + json.BeginObject(); + + json.WriteString("Size"); + json.WriteNumber(allocator->m_MemProps.memoryHeaps[heapIndex].size); + + json.WriteString("Flags"); + json.BeginArray(true); + if((allocator->m_MemProps.memoryHeaps[heapIndex].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) + { + json.WriteString("DEVICE_LOCAL"); + } + json.EndArray(); + + json.WriteString("Budget"); + json.BeginObject(); + { + json.WriteString("BlockBytes"); + json.WriteNumber(budget[heapIndex].blockBytes); + json.WriteString("AllocationBytes"); + json.WriteNumber(budget[heapIndex].allocationBytes); + json.WriteString("Usage"); + json.WriteNumber(budget[heapIndex].usage); + json.WriteString("Budget"); + json.WriteNumber(budget[heapIndex].budget); + } + json.EndObject(); + + 
if(stats.memoryHeap[heapIndex].blockCount > 0) + { + json.WriteString("Stats"); + VmaPrintStatInfo(json, stats.memoryHeap[heapIndex]); + } + + for(uint32_t typeIndex = 0; typeIndex < allocator->GetMemoryTypeCount(); ++typeIndex) + { + if(allocator->MemoryTypeIndexToHeapIndex(typeIndex) == heapIndex) + { + json.BeginString("Type "); + json.ContinueString(typeIndex); + json.EndString(); + + json.BeginObject(); + + json.WriteString("Flags"); + json.BeginArray(true); + VkMemoryPropertyFlags flags = allocator->m_MemProps.memoryTypes[typeIndex].propertyFlags; + if((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) + { + json.WriteString("DEVICE_LOCAL"); + } + if((flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) != 0) + { + json.WriteString("HOST_VISIBLE"); + } + if((flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0) + { + json.WriteString("HOST_COHERENT"); + } + if((flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) != 0) + { + json.WriteString("HOST_CACHED"); + } + if((flags & VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT) != 0) + { + json.WriteString("LAZILY_ALLOCATED"); + } +#if VMA_VULKAN_VERSION >= 1001000 + if((flags & VK_MEMORY_PROPERTY_PROTECTED_BIT) != 0) + { + json.WriteString("PROTECTED"); + } +#endif // #if VMA_VULKAN_VERSION >= 1001000 +#if VK_AMD_device_coherent_memory + if((flags & VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY) != 0) + { + json.WriteString("DEVICE_COHERENT"); + } + if((flags & VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD_COPY) != 0) + { + json.WriteString("DEVICE_UNCACHED"); + } +#endif // #if VK_AMD_device_coherent_memory + json.EndArray(); + + if(stats.memoryType[typeIndex].blockCount > 0) + { + json.WriteString("Stats"); + VmaPrintStatInfo(json, stats.memoryType[typeIndex]); + } + + json.EndObject(); + } + } + + json.EndObject(); + } + if(detailedMap == VK_TRUE) + { + allocator->PrintDetailedMap(json); + } + + json.EndObject(); + } + + const size_t len = sb.GetLength(); + char* const pChars = vma_new_array(allocator, char, len + 1); + if(len > 0) + { 
+ memcpy(pChars, sb.GetData(), len); + } + pChars[len] = '\0'; + *ppStatsString = pChars; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaFreeStatsString( + VmaAllocator allocator, + char* pStatsString) +{ + if(pStatsString != VMA_NULL) + { + VMA_ASSERT(allocator); + size_t len = strlen(pStatsString); + vma_delete_array(allocator, pStatsString, len + 1); + } +} + +#endif // #if VMA_STATS_STRING_ENABLED + +/* +This function is not protected by any mutex because it just reads immutable data. +*/ +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndex( + VmaAllocator allocator, + uint32_t memoryTypeBits, + const VmaAllocationCreateInfo* pAllocationCreateInfo, + uint32_t* pMemoryTypeIndex) +{ + VMA_ASSERT(allocator != VK_NULL_HANDLE); + VMA_ASSERT(pAllocationCreateInfo != VMA_NULL); + VMA_ASSERT(pMemoryTypeIndex != VMA_NULL); + + memoryTypeBits &= allocator->GetGlobalMemoryTypeBits(); + + if(pAllocationCreateInfo->memoryTypeBits != 0) + { + memoryTypeBits &= pAllocationCreateInfo->memoryTypeBits; + } + + uint32_t requiredFlags = pAllocationCreateInfo->requiredFlags; + uint32_t preferredFlags = pAllocationCreateInfo->preferredFlags; + uint32_t notPreferredFlags = 0; + + // Convert usage to requiredFlags and preferredFlags. 
+ switch(pAllocationCreateInfo->usage) + { + case VMA_MEMORY_USAGE_UNKNOWN: + break; + case VMA_MEMORY_USAGE_GPU_ONLY: + if(!allocator->IsIntegratedGpu() || (preferredFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) + { + preferredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + } + break; + case VMA_MEMORY_USAGE_CPU_ONLY: + requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; + break; + case VMA_MEMORY_USAGE_CPU_TO_GPU: + requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + if(!allocator->IsIntegratedGpu() || (preferredFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) == 0) + { + preferredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + } + break; + case VMA_MEMORY_USAGE_GPU_TO_CPU: + requiredFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + preferredFlags |= VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + break; + case VMA_MEMORY_USAGE_CPU_COPY: + notPreferredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + case VMA_MEMORY_USAGE_GPU_LAZILY_ALLOCATED: + requiredFlags |= VK_MEMORY_PROPERTY_LAZILY_ALLOCATED_BIT; + break; + default: + VMA_ASSERT(0); + break; + } + + // Avoid DEVICE_COHERENT unless explicitly requested. + if(((pAllocationCreateInfo->requiredFlags | pAllocationCreateInfo->preferredFlags) & + (VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY | VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD_COPY)) == 0) + { + notPreferredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD_COPY; + } + + *pMemoryTypeIndex = UINT32_MAX; + uint32_t minCost = UINT32_MAX; + for(uint32_t memTypeIndex = 0, memTypeBit = 1; + memTypeIndex < allocator->GetMemoryTypeCount(); + ++memTypeIndex, memTypeBit <<= 1) + { + // This memory type is acceptable according to memoryTypeBits bitmask. + if((memTypeBit & memoryTypeBits) != 0) + { + const VkMemoryPropertyFlags currFlags = + allocator->m_MemProps.memoryTypes[memTypeIndex].propertyFlags; + // This memory type contains requiredFlags. 
+ if((requiredFlags & ~currFlags) == 0) + { + // Calculate cost as number of bits from preferredFlags not present in this memory type. + uint32_t currCost = VmaCountBitsSet(preferredFlags & ~currFlags) + + VmaCountBitsSet(currFlags & notPreferredFlags); + // Remember memory type with lowest cost. + if(currCost < minCost) + { + *pMemoryTypeIndex = memTypeIndex; + if(currCost == 0) + { + return VK_SUCCESS; + } + minCost = currCost; + } + } + } + } + return (*pMemoryTypeIndex != UINT32_MAX) ? VK_SUCCESS : VK_ERROR_FEATURE_NOT_PRESENT; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForBufferInfo( + VmaAllocator allocator, + const VkBufferCreateInfo* pBufferCreateInfo, + const VmaAllocationCreateInfo* pAllocationCreateInfo, + uint32_t* pMemoryTypeIndex) +{ + VMA_ASSERT(allocator != VK_NULL_HANDLE); + VMA_ASSERT(pBufferCreateInfo != VMA_NULL); + VMA_ASSERT(pAllocationCreateInfo != VMA_NULL); + VMA_ASSERT(pMemoryTypeIndex != VMA_NULL); + + const VkDevice hDev = allocator->m_hDevice; + VkBuffer hBuffer = VK_NULL_HANDLE; + VkResult res = allocator->GetVulkanFunctions().vkCreateBuffer( + hDev, pBufferCreateInfo, allocator->GetAllocationCallbacks(), &hBuffer); + if(res == VK_SUCCESS) + { + VkMemoryRequirements memReq = {}; + allocator->GetVulkanFunctions().vkGetBufferMemoryRequirements( + hDev, hBuffer, &memReq); + + res = vmaFindMemoryTypeIndex( + allocator, + memReq.memoryTypeBits, + pAllocationCreateInfo, + pMemoryTypeIndex); + + allocator->GetVulkanFunctions().vkDestroyBuffer( + hDev, hBuffer, allocator->GetAllocationCallbacks()); + } + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFindMemoryTypeIndexForImageInfo( + VmaAllocator allocator, + const VkImageCreateInfo* pImageCreateInfo, + const VmaAllocationCreateInfo* pAllocationCreateInfo, + uint32_t* pMemoryTypeIndex) +{ + VMA_ASSERT(allocator != VK_NULL_HANDLE); + VMA_ASSERT(pImageCreateInfo != VMA_NULL); + VMA_ASSERT(pAllocationCreateInfo != VMA_NULL); + VMA_ASSERT(pMemoryTypeIndex != 
VMA_NULL); + + const VkDevice hDev = allocator->m_hDevice; + VkImage hImage = VK_NULL_HANDLE; + VkResult res = allocator->GetVulkanFunctions().vkCreateImage( + hDev, pImageCreateInfo, allocator->GetAllocationCallbacks(), &hImage); + if(res == VK_SUCCESS) + { + VkMemoryRequirements memReq = {}; + allocator->GetVulkanFunctions().vkGetImageMemoryRequirements( + hDev, hImage, &memReq); + + res = vmaFindMemoryTypeIndex( + allocator, + memReq.memoryTypeBits, + pAllocationCreateInfo, + pMemoryTypeIndex); + + allocator->GetVulkanFunctions().vkDestroyImage( + hDev, hImage, allocator->GetAllocationCallbacks()); + } + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreatePool( + VmaAllocator allocator, + const VmaPoolCreateInfo* pCreateInfo, + VmaPool* pPool) +{ + VMA_ASSERT(allocator && pCreateInfo && pPool); + + VMA_DEBUG_LOG("vmaCreatePool"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkResult res = allocator->CreatePool(pCreateInfo, pPool); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordCreatePool(allocator->GetCurrentFrameIndex(), *pCreateInfo, *pPool); + } +#endif + + return res; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyPool( + VmaAllocator allocator, + VmaPool pool) +{ + VMA_ASSERT(allocator); + + if(pool == VK_NULL_HANDLE) + { + return; + } + + VMA_DEBUG_LOG("vmaDestroyPool"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordDestroyPool(allocator->GetCurrentFrameIndex(), pool); + } +#endif + + allocator->DestroyPool(pool); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolStats( + VmaAllocator allocator, + VmaPool pool, + VmaPoolStats* pPoolStats) +{ + VMA_ASSERT(allocator && pool && pPoolStats); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + allocator->GetPoolStats(pool, pPoolStats); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaMakePoolAllocationsLost( + VmaAllocator allocator, + VmaPool pool, + size_t* 
pLostAllocationCount) +{ + VMA_ASSERT(allocator && pool); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordMakePoolAllocationsLost(allocator->GetCurrentFrameIndex(), pool); + } +#endif + + allocator->MakePoolAllocationsLost(pool, pLostAllocationCount); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckPoolCorruption(VmaAllocator allocator, VmaPool pool) +{ + VMA_ASSERT(allocator && pool); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VMA_DEBUG_LOG("vmaCheckPoolCorruption"); + + return allocator->CheckPoolCorruption(pool); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetPoolName( + VmaAllocator allocator, + VmaPool pool, + const char** ppName) +{ + VMA_ASSERT(allocator && pool && ppName); + + VMA_DEBUG_LOG("vmaGetPoolName"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + *ppName = pool->GetName(); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaSetPoolName( + VmaAllocator allocator, + VmaPool pool, + const char* pName) +{ + VMA_ASSERT(allocator && pool); + + VMA_DEBUG_LOG("vmaSetPoolName"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + pool->SetName(pName); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordSetPoolName(allocator->GetCurrentFrameIndex(), pool, pName); + } +#endif +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemory( + VmaAllocator allocator, + const VkMemoryRequirements* pVkMemoryRequirements, + const VmaAllocationCreateInfo* pCreateInfo, + VmaAllocation* pAllocation, + VmaAllocationInfo* pAllocationInfo) +{ + VMA_ASSERT(allocator && pVkMemoryRequirements && pCreateInfo && pAllocation); + + VMA_DEBUG_LOG("vmaAllocateMemory"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkResult result = allocator->AllocateMemory( + *pVkMemoryRequirements, + false, // requiresDedicatedAllocation + false, // prefersDedicatedAllocation + VK_NULL_HANDLE, // dedicatedBuffer + UINT32_MAX, // dedicatedBufferUsage + VK_NULL_HANDLE, // dedicatedImage + *pCreateInfo, + 
VMA_SUBALLOCATION_TYPE_UNKNOWN, + 1, // allocationCount + pAllocation); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordAllocateMemory( + allocator->GetCurrentFrameIndex(), + *pVkMemoryRequirements, + *pCreateInfo, + *pAllocation); + } +#endif + + if(pAllocationInfo != VMA_NULL && result == VK_SUCCESS) + { + allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); + } + + return result; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryPages( + VmaAllocator allocator, + const VkMemoryRequirements* pVkMemoryRequirements, + const VmaAllocationCreateInfo* pCreateInfo, + size_t allocationCount, + VmaAllocation* pAllocations, + VmaAllocationInfo* pAllocationInfo) +{ + if(allocationCount == 0) + { + return VK_SUCCESS; + } + + VMA_ASSERT(allocator && pVkMemoryRequirements && pCreateInfo && pAllocations); + + VMA_DEBUG_LOG("vmaAllocateMemoryPages"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkResult result = allocator->AllocateMemory( + *pVkMemoryRequirements, + false, // requiresDedicatedAllocation + false, // prefersDedicatedAllocation + VK_NULL_HANDLE, // dedicatedBuffer + UINT32_MAX, // dedicatedBufferUsage + VK_NULL_HANDLE, // dedicatedImage + *pCreateInfo, + VMA_SUBALLOCATION_TYPE_UNKNOWN, + allocationCount, + pAllocations); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordAllocateMemoryPages( + allocator->GetCurrentFrameIndex(), + *pVkMemoryRequirements, + *pCreateInfo, + (uint64_t)allocationCount, + pAllocations); + } +#endif + + if(pAllocationInfo != VMA_NULL && result == VK_SUCCESS) + { + for (const auto i : c10::irange(allocationCount)) { + allocator->GetAllocationInfo(pAllocations[i], pAllocationInfo + i); + } + } + + return result; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForBuffer( + VmaAllocator allocator, + VkBuffer buffer, + const VmaAllocationCreateInfo* pCreateInfo, + VmaAllocation* pAllocation, + 
VmaAllocationInfo* pAllocationInfo) +{ + VMA_ASSERT(allocator && buffer != VK_NULL_HANDLE && pCreateInfo && pAllocation); + + VMA_DEBUG_LOG("vmaAllocateMemoryForBuffer"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkMemoryRequirements vkMemReq = {}; + bool requiresDedicatedAllocation = false; + bool prefersDedicatedAllocation = false; + allocator->GetBufferMemoryRequirements(buffer, vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation); + + VkResult result = allocator->AllocateMemory( + vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation, + buffer, // dedicatedBuffer + UINT32_MAX, // dedicatedBufferUsage + VK_NULL_HANDLE, // dedicatedImage + *pCreateInfo, + VMA_SUBALLOCATION_TYPE_BUFFER, + 1, // allocationCount + pAllocation); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordAllocateMemoryForBuffer( + allocator->GetCurrentFrameIndex(), + vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation, + *pCreateInfo, + *pAllocation); + } +#endif + + if(pAllocationInfo && result == VK_SUCCESS) + { + allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); + } + + return result; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaAllocateMemoryForImage( + VmaAllocator allocator, + VkImage image, + const VmaAllocationCreateInfo* pCreateInfo, + VmaAllocation* pAllocation, + VmaAllocationInfo* pAllocationInfo) +{ + VMA_ASSERT(allocator && image != VK_NULL_HANDLE && pCreateInfo && pAllocation); + + VMA_DEBUG_LOG("vmaAllocateMemoryForImage"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkMemoryRequirements vkMemReq = {}; + bool requiresDedicatedAllocation = false; + bool prefersDedicatedAllocation = false; + allocator->GetImageMemoryRequirements(image, vkMemReq, + requiresDedicatedAllocation, prefersDedicatedAllocation); + + VkResult result = allocator->AllocateMemory( + vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation, + VK_NULL_HANDLE, // dedicatedBuffer + 
UINT32_MAX, // dedicatedBufferUsage + image, // dedicatedImage + *pCreateInfo, + VMA_SUBALLOCATION_TYPE_IMAGE_UNKNOWN, + 1, // allocationCount + pAllocation); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordAllocateMemoryForImage( + allocator->GetCurrentFrameIndex(), + vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation, + *pCreateInfo, + *pAllocation); + } +#endif + + if(pAllocationInfo && result == VK_SUCCESS) + { + allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); + } + + return result; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemory( + VmaAllocator allocator, + VmaAllocation allocation) +{ + VMA_ASSERT(allocator); + + if(allocation == VK_NULL_HANDLE) + { + return; + } + + VMA_DEBUG_LOG("vmaFreeMemory"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordFreeMemory( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + allocator->FreeMemory( + 1, // allocationCount + &allocation); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaFreeMemoryPages( + VmaAllocator allocator, + size_t allocationCount, + const VmaAllocation* pAllocations) +{ + if(allocationCount == 0) + { + return; + } + + VMA_ASSERT(allocator); + + VMA_DEBUG_LOG("vmaFreeMemoryPages"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordFreeMemoryPages( + allocator->GetCurrentFrameIndex(), + (uint64_t)allocationCount, + pAllocations); + } +#endif + + allocator->FreeMemory(allocationCount, pAllocations); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaGetAllocationInfo( + VmaAllocator allocator, + VmaAllocation allocation, + VmaAllocationInfo* pAllocationInfo) +{ + VMA_ASSERT(allocator && allocation && pAllocationInfo); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + 
allocator->GetRecorder()->RecordGetAllocationInfo( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + allocator->GetAllocationInfo(allocation, pAllocationInfo); +} + +VMA_CALL_PRE VkBool32 VMA_CALL_POST vmaTouchAllocation( + VmaAllocator allocator, + VmaAllocation allocation) +{ + VMA_ASSERT(allocator && allocation); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordTouchAllocation( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + return allocator->TouchAllocation(allocation); +} + +VMA_CALL_PRE void VMA_CALL_POST vmaSetAllocationUserData( + VmaAllocator allocator, + VmaAllocation allocation, + void* pUserData) +{ + VMA_ASSERT(allocator && allocation); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + allocation->SetUserData(allocator, pUserData); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordSetAllocationUserData( + allocator->GetCurrentFrameIndex(), + allocation, + pUserData); + } +#endif +} + +VMA_CALL_PRE void VMA_CALL_POST vmaCreateLostAllocation( + VmaAllocator allocator, + VmaAllocation* pAllocation) +{ + VMA_ASSERT(allocator && pAllocation); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK; + + allocator->CreateLostAllocation(pAllocation); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordCreateLostAllocation( + allocator->GetCurrentFrameIndex(), + *pAllocation); + } +#endif +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaMapMemory( + VmaAllocator allocator, + VmaAllocation allocation, + void** ppData) +{ + VMA_ASSERT(allocator && allocation && ppData); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkResult res = allocator->Map(allocation, ppData); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordMapMemory( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + return res; +} + 
+VMA_CALL_PRE void VMA_CALL_POST vmaUnmapMemory( + VmaAllocator allocator, + VmaAllocation allocation) +{ + VMA_ASSERT(allocator && allocation); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordUnmapMemory( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + allocator->Unmap(allocation); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocation(VmaAllocator allocator, VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) +{ + VMA_ASSERT(allocator && allocation); + + VMA_DEBUG_LOG("vmaFlushAllocation"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + const VkResult res = allocator->FlushOrInvalidateAllocation(allocation, offset, size, VMA_CACHE_FLUSH); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordFlushAllocation( + allocator->GetCurrentFrameIndex(), + allocation, offset, size); + } +#endif + + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocation(VmaAllocator allocator, VmaAllocation allocation, VkDeviceSize offset, VkDeviceSize size) +{ + VMA_ASSERT(allocator && allocation); + + VMA_DEBUG_LOG("vmaInvalidateAllocation"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + const VkResult res = allocator->FlushOrInvalidateAllocation(allocation, offset, size, VMA_CACHE_INVALIDATE); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordInvalidateAllocation( + allocator->GetCurrentFrameIndex(), + allocation, offset, size); + } +#endif + + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaFlushAllocations( + VmaAllocator allocator, + uint32_t allocationCount, + const VmaAllocation* allocations, + const VkDeviceSize* offsets, + const VkDeviceSize* sizes) +{ + VMA_ASSERT(allocator); + + if(allocationCount == 0) + { + return VK_SUCCESS; + } + + VMA_ASSERT(allocations); + + VMA_DEBUG_LOG("vmaFlushAllocations"); + + 
VMA_DEBUG_GLOBAL_MUTEX_LOCK + + const VkResult res = allocator->FlushOrInvalidateAllocations(allocationCount, allocations, offsets, sizes, VMA_CACHE_FLUSH); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + //TODO + } +#endif + + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaInvalidateAllocations( + VmaAllocator allocator, + uint32_t allocationCount, + const VmaAllocation* allocations, + const VkDeviceSize* offsets, + const VkDeviceSize* sizes) +{ + VMA_ASSERT(allocator); + + if(allocationCount == 0) + { + return VK_SUCCESS; + } + + VMA_ASSERT(allocations); + + VMA_DEBUG_LOG("vmaInvalidateAllocations"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + const VkResult res = allocator->FlushOrInvalidateAllocations(allocationCount, allocations, offsets, sizes, VMA_CACHE_INVALIDATE); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + //TODO + } +#endif + + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCheckCorruption(VmaAllocator allocator, uint32_t memoryTypeBits) +{ + VMA_ASSERT(allocator); + + VMA_DEBUG_LOG("vmaCheckCorruption"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + return allocator->CheckCorruption(memoryTypeBits); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragment( + VmaAllocator allocator, + const VmaAllocation* pAllocations, + size_t allocationCount, + VkBool32* pAllocationsChanged, + const VmaDefragmentationInfo *pDefragmentationInfo, + VmaDefragmentationStats* pDefragmentationStats) +{ + // Deprecated interface, reimplemented using new one. 
+ + VmaDefragmentationInfo2 info2 = {}; + info2.allocationCount = (uint32_t)allocationCount; + info2.pAllocations = pAllocations; + info2.pAllocationsChanged = pAllocationsChanged; + if(pDefragmentationInfo != VMA_NULL) + { + info2.maxCpuAllocationsToMove = pDefragmentationInfo->maxAllocationsToMove; + info2.maxCpuBytesToMove = pDefragmentationInfo->maxBytesToMove; + } + else + { + info2.maxCpuAllocationsToMove = UINT32_MAX; + info2.maxCpuBytesToMove = VK_WHOLE_SIZE; + } + // info2.flags, maxGpuAllocationsToMove, maxGpuBytesToMove, commandBuffer deliberately left zero. + + VmaDefragmentationContext ctx; + VkResult res = vmaDefragmentationBegin(allocator, &info2, pDefragmentationStats, &ctx); + if(res == VK_NOT_READY) + { + res = vmaDefragmentationEnd( allocator, ctx); + } + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationBegin( + VmaAllocator allocator, + const VmaDefragmentationInfo2* pInfo, + VmaDefragmentationStats* pStats, + VmaDefragmentationContext *pContext) +{ + VMA_ASSERT(allocator && pInfo && pContext); + + // Degenerate case: Nothing to defragment. 
+ if(pInfo->allocationCount == 0 && pInfo->poolCount == 0) + { + return VK_SUCCESS; + } + + VMA_ASSERT(pInfo->allocationCount == 0 || pInfo->pAllocations != VMA_NULL); + VMA_ASSERT(pInfo->poolCount == 0 || pInfo->pPools != VMA_NULL); + VMA_HEAVY_ASSERT(VmaValidatePointerArray(pInfo->allocationCount, pInfo->pAllocations)); + VMA_HEAVY_ASSERT(VmaValidatePointerArray(pInfo->poolCount, pInfo->pPools)); + + VMA_DEBUG_LOG("vmaDefragmentationBegin"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + VkResult res = allocator->DefragmentationBegin(*pInfo, pStats, pContext); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordDefragmentationBegin( + allocator->GetCurrentFrameIndex(), *pInfo, *pContext); + } +#endif + + return res; +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaDefragmentationEnd( + VmaAllocator allocator, + VmaDefragmentationContext context) +{ + VMA_ASSERT(allocator); + + VMA_DEBUG_LOG("vmaDefragmentationEnd"); + + if(context != VK_NULL_HANDLE) + { + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordDefragmentationEnd( + allocator->GetCurrentFrameIndex(), context); + } +#endif + + return allocator->DefragmentationEnd(context); + } + else + { + return VK_SUCCESS; + } +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBeginDefragmentationPass( + VmaAllocator allocator, + VmaDefragmentationContext context, + VmaDefragmentationPassInfo* pInfo + ) +{ + VMA_ASSERT(allocator); + VMA_ASSERT(pInfo); + + VMA_DEBUG_LOG("vmaBeginDefragmentationPass"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + if(context == VK_NULL_HANDLE) + { + pInfo->moveCount = 0; + return VK_SUCCESS; + } + + return allocator->DefragmentationPassBegin(pInfo, context); +} +VMA_CALL_PRE VkResult VMA_CALL_POST vmaEndDefragmentationPass( + VmaAllocator allocator, + VmaDefragmentationContext context) +{ + VMA_ASSERT(allocator); + + VMA_DEBUG_LOG("vmaEndDefragmentationPass"); + 
VMA_DEBUG_GLOBAL_MUTEX_LOCK + + if(context == VK_NULL_HANDLE) + return VK_SUCCESS; + + return allocator->DefragmentationPassEnd(context); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory( + VmaAllocator allocator, + VmaAllocation allocation, + VkBuffer buffer) +{ + VMA_ASSERT(allocator && allocation && buffer); + + VMA_DEBUG_LOG("vmaBindBufferMemory"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + return allocator->BindBufferMemory(allocation, 0, buffer, VMA_NULL); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindBufferMemory2( + VmaAllocator allocator, + VmaAllocation allocation, + VkDeviceSize allocationLocalOffset, + VkBuffer buffer, + const void* pNext) +{ + VMA_ASSERT(allocator && allocation && buffer); + + VMA_DEBUG_LOG("vmaBindBufferMemory2"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + return allocator->BindBufferMemory(allocation, allocationLocalOffset, buffer, pNext); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory( + VmaAllocator allocator, + VmaAllocation allocation, + VkImage image) +{ + VMA_ASSERT(allocator && allocation && image); + + VMA_DEBUG_LOG("vmaBindImageMemory"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + return allocator->BindImageMemory(allocation, 0, image, VMA_NULL); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaBindImageMemory2( + VmaAllocator allocator, + VmaAllocation allocation, + VkDeviceSize allocationLocalOffset, + VkImage image, + const void* pNext) +{ + VMA_ASSERT(allocator && allocation && image); + + VMA_DEBUG_LOG("vmaBindImageMemory2"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + return allocator->BindImageMemory(allocation, allocationLocalOffset, image, pNext); +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateBuffer( + VmaAllocator allocator, + const VkBufferCreateInfo* pBufferCreateInfo, + const VmaAllocationCreateInfo* pAllocationCreateInfo, + VkBuffer* pBuffer, + VmaAllocation* pAllocation, + VmaAllocationInfo* pAllocationInfo) +{ + VMA_ASSERT(allocator && pBufferCreateInfo && pAllocationCreateInfo && pBuffer && pAllocation); + 
+ if(pBufferCreateInfo->size == 0) + { + return VK_ERROR_VALIDATION_FAILED_EXT; + } + if((pBufferCreateInfo->usage & VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_COPY) != 0 && + !allocator->m_UseKhrBufferDeviceAddress) + { + VMA_ASSERT(0 && "Creating a buffer with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT is not valid if VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT was not used."); + return VK_ERROR_VALIDATION_FAILED_EXT; + } + + VMA_DEBUG_LOG("vmaCreateBuffer"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + *pBuffer = VK_NULL_HANDLE; + *pAllocation = VK_NULL_HANDLE; + + // 1. Create VkBuffer. + VkResult res = (*allocator->GetVulkanFunctions().vkCreateBuffer)( + allocator->m_hDevice, + pBufferCreateInfo, + allocator->GetAllocationCallbacks(), + pBuffer); + if(res >= 0) + { + // 2. vkGetBufferMemoryRequirements. + VkMemoryRequirements vkMemReq = {}; + bool requiresDedicatedAllocation = false; + bool prefersDedicatedAllocation = false; + allocator->GetBufferMemoryRequirements(*pBuffer, vkMemReq, + requiresDedicatedAllocation, prefersDedicatedAllocation); + + // 3. Allocate memory using allocator. + res = allocator->AllocateMemory( + vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation, + *pBuffer, // dedicatedBuffer + pBufferCreateInfo->usage, // dedicatedBufferUsage + VK_NULL_HANDLE, // dedicatedImage + *pAllocationCreateInfo, + VMA_SUBALLOCATION_TYPE_BUFFER, + 1, // allocationCount + pAllocation); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordCreateBuffer( + allocator->GetCurrentFrameIndex(), + *pBufferCreateInfo, + *pAllocationCreateInfo, + *pAllocation); + } +#endif + + if(res >= 0) + { + // 3. Bind buffer with memory. + if((pAllocationCreateInfo->flags & VMA_ALLOCATION_CREATE_DONT_BIND_BIT) == 0) + { + res = allocator->BindBufferMemory(*pAllocation, 0, *pBuffer, VMA_NULL); + } + if(res >= 0) + { + // All steps succeeded. 
+ #if VMA_STATS_STRING_ENABLED + (*pAllocation)->InitBufferImageUsage(pBufferCreateInfo->usage); + #endif + if(pAllocationInfo != VMA_NULL) + { + allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); + } + + return VK_SUCCESS; + } + allocator->FreeMemory( + 1, // allocationCount + pAllocation); + *pAllocation = VK_NULL_HANDLE; + (*allocator->GetVulkanFunctions().vkDestroyBuffer)(allocator->m_hDevice, *pBuffer, allocator->GetAllocationCallbacks()); + *pBuffer = VK_NULL_HANDLE; + return res; + } + (*allocator->GetVulkanFunctions().vkDestroyBuffer)(allocator->m_hDevice, *pBuffer, allocator->GetAllocationCallbacks()); + *pBuffer = VK_NULL_HANDLE; + return res; + } + return res; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyBuffer( + VmaAllocator allocator, + VkBuffer buffer, + VmaAllocation allocation) +{ + VMA_ASSERT(allocator); + + if(buffer == VK_NULL_HANDLE && allocation == VK_NULL_HANDLE) + { + return; + } + + VMA_DEBUG_LOG("vmaDestroyBuffer"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordDestroyBuffer( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + if(buffer != VK_NULL_HANDLE) + { + (*allocator->GetVulkanFunctions().vkDestroyBuffer)(allocator->m_hDevice, buffer, allocator->GetAllocationCallbacks()); + } + + if(allocation != VK_NULL_HANDLE) + { + allocator->FreeMemory( + 1, // allocationCount + &allocation); + } +} + +VMA_CALL_PRE VkResult VMA_CALL_POST vmaCreateImage( + VmaAllocator allocator, + const VkImageCreateInfo* pImageCreateInfo, + const VmaAllocationCreateInfo* pAllocationCreateInfo, + VkImage* pImage, + VmaAllocation* pAllocation, + VmaAllocationInfo* pAllocationInfo) +{ + VMA_ASSERT(allocator && pImageCreateInfo && pAllocationCreateInfo && pImage && pAllocation); + + if(pImageCreateInfo->extent.width == 0 || + pImageCreateInfo->extent.height == 0 || + pImageCreateInfo->extent.depth == 0 || + pImageCreateInfo->mipLevels == 0 || 
+ pImageCreateInfo->arrayLayers == 0) + { + return VK_ERROR_VALIDATION_FAILED_EXT; + } + + VMA_DEBUG_LOG("vmaCreateImage"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + + *pImage = VK_NULL_HANDLE; + *pAllocation = VK_NULL_HANDLE; + + // 1. Create VkImage. + VkResult res = (*allocator->GetVulkanFunctions().vkCreateImage)( + allocator->m_hDevice, + pImageCreateInfo, + allocator->GetAllocationCallbacks(), + pImage); + if(res >= 0) + { + VmaSuballocationType suballocType = pImageCreateInfo->tiling == VK_IMAGE_TILING_OPTIMAL ? + VMA_SUBALLOCATION_TYPE_IMAGE_OPTIMAL : + VMA_SUBALLOCATION_TYPE_IMAGE_LINEAR; + + // 2. Allocate memory using allocator. + VkMemoryRequirements vkMemReq = {}; + bool requiresDedicatedAllocation = false; + bool prefersDedicatedAllocation = false; + allocator->GetImageMemoryRequirements(*pImage, vkMemReq, + requiresDedicatedAllocation, prefersDedicatedAllocation); + + res = allocator->AllocateMemory( + vkMemReq, + requiresDedicatedAllocation, + prefersDedicatedAllocation, + VK_NULL_HANDLE, // dedicatedBuffer + UINT32_MAX, // dedicatedBufferUsage + *pImage, // dedicatedImage + *pAllocationCreateInfo, + suballocType, + 1, // allocationCount + pAllocation); + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordCreateImage( + allocator->GetCurrentFrameIndex(), + *pImageCreateInfo, + *pAllocationCreateInfo, + *pAllocation); + } +#endif + + if(res >= 0) + { + // 3. Bind image with memory. + if((pAllocationCreateInfo->flags & VMA_ALLOCATION_CREATE_DONT_BIND_BIT) == 0) + { + res = allocator->BindImageMemory(*pAllocation, 0, *pImage, VMA_NULL); + } + if(res >= 0) + { + // All steps succeeded. 
+ #if VMA_STATS_STRING_ENABLED + (*pAllocation)->InitBufferImageUsage(pImageCreateInfo->usage); + #endif + if(pAllocationInfo != VMA_NULL) + { + allocator->GetAllocationInfo(*pAllocation, pAllocationInfo); + } + + return VK_SUCCESS; + } + allocator->FreeMemory( + 1, // allocationCount + pAllocation); + *pAllocation = VK_NULL_HANDLE; + (*allocator->GetVulkanFunctions().vkDestroyImage)(allocator->m_hDevice, *pImage, allocator->GetAllocationCallbacks()); + *pImage = VK_NULL_HANDLE; + return res; + } + (*allocator->GetVulkanFunctions().vkDestroyImage)(allocator->m_hDevice, *pImage, allocator->GetAllocationCallbacks()); + *pImage = VK_NULL_HANDLE; + return res; + } + return res; +} + +VMA_CALL_PRE void VMA_CALL_POST vmaDestroyImage( + VmaAllocator allocator, + VkImage image, + VmaAllocation allocation) +{ + VMA_ASSERT(allocator); + + if(image == VK_NULL_HANDLE && allocation == VK_NULL_HANDLE) + { + return; + } + + VMA_DEBUG_LOG("vmaDestroyImage"); + + VMA_DEBUG_GLOBAL_MUTEX_LOCK + +#if VMA_RECORDING_ENABLED + if(allocator->GetRecorder() != VMA_NULL) + { + allocator->GetRecorder()->RecordDestroyImage( + allocator->GetCurrentFrameIndex(), + allocation); + } +#endif + + if(image != VK_NULL_HANDLE) + { + (*allocator->GetVulkanFunctions().vkDestroyImage)(allocator->m_hDevice, image, allocator->GetAllocationCallbacks()); + } + if(allocation != VK_NULL_HANDLE) + { + allocator->FreeMemory( + 1, // allocationCount + &allocation); + } +} + +#endif // #ifdef VMA_IMPLEMENTATION diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index f952e611547c13..f94daa2f0b73f3 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -32,7 +33,7 @@ inline bool is_pointwise(const IntArrayRef filter) { bool all_lessthan(const IntArrayRef arr, const int t) { bool retval = true; - for (size_t i 
= 0; i < arr.size(); i++) { + for (const auto i : c10::irange(arr.size())) { retval = retval && (arr[i] < t); } return retval; @@ -173,8 +174,8 @@ vTensor pack_weights_2d( for (int64_t src_ic = 0; src_ic < src_filter[Layout::Filter::input]; ++src_ic) { const int64_t dst_ic4 = src_ic / 4; - for (int64_t src_ih = 0; src_ih < src_kh_sz; ++src_ih) { - for (int64_t src_iw = 0; src_iw < src_kw_sz; ++src_iw) { + for (const auto src_ih : c10::irange(src_kh_sz)) { + for (const auto src_iw : c10::irange(src_kw_sz)) { memcpy( dst_weight_c_ptr + (dst_oh * src_kh_sz + src_ih) * dst_kw_sz + dst_ic4 * src_kw_sz * 4 + src_iw * 4 + src_ic % 4, @@ -225,11 +226,11 @@ vTensor pack_weights_2d_winograd_2_3( float* const dst_weight_ptr = v_weight_payload.get(); memset(dst_weight_ptr, 0, v_weight.nbytes()); - for (int64_t src_oc = 0; src_oc < src_oc_sz; ++src_oc) { + for (const auto src_oc : c10::irange(src_oc_sz)) { const int64_t dst_oh = src_oc / 4; const int64_t dst_iw = src_oc % 4; - for (int64_t src_ic = 0; src_ic < src_ic_sz; ++src_ic) { + for (const auto src_ic : c10::irange(src_ic_sz)) { const int64_t dst_ow = src_ic / 4; const int64_t dst_c = src_ic % 4; @@ -344,7 +345,7 @@ vTensor pack_biases( float* const dst_bias_ptr = v_bias_payload.get(); memset(dst_bias_ptr, 0, v_bias.nbytes()); - for (int64_t i = 0; i < src_w; ++i) { + for (const auto i : c10::irange(src_w)) { const int64_t c = i % 4; const int64_t x = i / 4; dst_bias_ptr[c * packed_w + x] = src_bias_ptr[i]; diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index 7d4d45106d44fc..b19f02af0b7e95 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -1,4 +1,5 @@ #include +#include namespace at { namespace native { @@ -47,8 +48,8 @@ vTensor pack_weights( float* const dst_weight_ptr = v_weight_payload.get(); memset(dst_weight_ptr, 0, v_weight.nbytes()); - for (int64_t src_h = 0; src_h < src_kh_sz; ++src_h) { - for (int64_t src_w = 0; src_w 
< src_kw_sz; ++src_w) { + for (const auto src_h : c10::irange(src_kh_sz)) { + for (const auto src_w : c10::irange(src_kw_sz)) { int64_t dst_plane = 2*(src_h%2) + (src_w%2); int64_t dst_index = (src_h/2)*dst_kw_sz + (src_w/2); memcpy( @@ -109,8 +110,8 @@ vTensor pack_biases( float* const dst_bias_ptr = v_bias_payload.get(); memset(dst_bias_ptr, 0, v_bias.nbytes()); - for (int64_t src_h = 0; src_h < src_kh_sz; ++src_h) { - for (int64_t src_w = 0; src_w < src_kw_sz; ++src_w) { + for (const auto src_h : c10::irange(src_kh_sz)) { + for (const auto src_w : c10::irange(src_kw_sz)) { int64_t dst_plane = 2*(src_h%2) + (src_w%2); int64_t dst_index = (src_h/2)*dst_kw_sz + (src_w/2); memcpy( diff --git a/aten/src/ATen/native/vulkan/ops/Padding.cpp b/aten/src/ATen/native/vulkan/ops/Padding.cpp index 4e8eb87e2e068c..8d16093bd384ad 100644 --- a/aten/src/ATen/native/vulkan/ops/Padding.cpp +++ b/aten/src/ATen/native/vulkan/ops/Padding.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace at { @@ -35,7 +36,7 @@ Tensor reflection_pad2d(const Tensor& self_arg, IntArrayRef padding) { const vTensor& v_self = convert(self); c10::SmallVector output_size(input_dim); - for (size_t d = 0; d < input_dim; ++d) { + for (const auto d : c10::irange(input_dim)) { if (d == input_dim - 1) { output_size[d] = input_size[d] + pad_right + pad_left; } else if (d == input_dim - 2) { diff --git a/aten/src/ATen/native/xnnpack/Convolution.cpp b/aten/src/ATen/native/xnnpack/Convolution.cpp index f46052d9c5ef63..5ec3c1818fba57 100644 --- a/aten/src/ATen/native/xnnpack/Convolution.cpp +++ b/aten/src/ATen/native/xnnpack/Convolution.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace at { namespace native { @@ -150,11 +151,11 @@ const Tensor reorder_weights_for_transpose_conv(const Tensor& weight_nhwc, float* in_ptr = weight_nhwc.data_ptr(); int out_index = 0; - for (int g = 0; g < num_groups; g++) { - for (int o = 0; o < output_channels_per_group; o++) { - for (int w = 0; w < kernel_width; 
w++) { - for (int h = 0; h < kernel_height; h++) { - for (int i = 0; i < input_channels_per_group; i++) { + for (const auto g : c10::irange(num_groups)) { + for (const auto o : c10::irange(output_channels_per_group)) { + for (const auto w : c10::irange(kernel_width)) { + for (const auto h : c10::irange(kernel_height)) { + for (const auto i : c10::irange(input_channels_per_group)) { int in_index = (g*g_offset) + (i*i_offset) + (h*h_offset) + (w*w_offset) + (o*o_offset); out_ptr[out_index] = in_ptr[in_index]; out_index++; @@ -210,7 +211,7 @@ ContextConv2D create( if (transposed) { const Tensor weight_reordered = reorder_weights_for_transpose_conv(weight_nhwc, groups); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { weight_sizes[i] = weight_reordered.size(i); } create_status = xnn_create_deconvolution2d_nhwc_f32( @@ -238,7 +239,7 @@ ContextConv2D create( 0u, // flags &convolution_op); // operator } else { - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { weight_sizes[i] = weight_nhwc.size(i); } create_status = xnn_create_convolution2d_nhwc_f32( diff --git a/aten/src/ATen/nnapi/nnapi_bind.cpp b/aten/src/ATen/nnapi/nnapi_bind.cpp index 0ab8997476c9b1..ab9eb6b9a03f97 100644 --- a/aten/src/ATen/nnapi/nnapi_bind.cpp +++ b/aten/src/ATen/nnapi/nnapi_bind.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace torch { namespace nnapi { @@ -103,7 +104,7 @@ void NnapiCompilation::run( TORCH_CHECK((int32_t)inputs.size() == num_inputs_); TORCH_CHECK((int32_t)outputs.size() == num_outputs_); - for (size_t i = 0; i < inputs.size(); i++) { + for (const auto i : c10::irange(inputs.size())) { auto& t = inputs[i]; // TODO: Check contiguous and dtype. ANeuralNetworksOperandType op_type; @@ -117,7 +118,7 @@ void NnapiCompilation::run( t.nbytes()); } - for (size_t i = 0; i < outputs.size(); i++) { + for (const auto i : c10::irange(outputs.size())) { auto& t = outputs[i]; // TODO: Check contiguous and dtype. 
check_nnapi->Execution_setOutput( @@ -131,7 +132,7 @@ void NnapiCompilation::run( check_nnapi->Execution_compute(execution); // TODO: Maybe skip this for fixed-size outputs? - for (size_t i = 0; i < outputs.size(); i++) { + for (const auto i : c10::irange(outputs.size())) { auto& t = outputs[i]; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) uint32_t rank; diff --git a/aten/src/ATen/test/apply_utils_test.cpp b/aten/src/ATen/test/apply_utils_test.cpp index 56fd77ed708448..9028ae303e7344 100644 --- a/aten/src/ATen/test/apply_utils_test.cpp +++ b/aten/src/ATen/test/apply_utils_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include using namespace std; @@ -10,7 +11,7 @@ using namespace at; void fill_tensor(int64_t scalar, Tensor& t_) { auto t = t_.view(-1); - for (int64_t i = 0; i < t.numel(); i++) { + for (const auto i : c10::irange(t.numel())) { t[i] = (i + 1) * scalar; } } @@ -42,7 +43,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6 auto a4 = at::empty({0}, at::TensorOptions(kCPU).dtype(kDouble)); std::vector tensors({a0, a1, a2, a3, a4}); - for (size_t i = 0; i < tensors.size(); i++) { + for (const auto i : c10::irange(tensors.size())) { tensors[i].resize_(shape); fill_tensor(i + 1, tensors[i]); if (a >= 0 && b >= 0) { @@ -55,7 +56,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6 a0, a1, [](scalar_t& y, const scalar_t& x) { y = x * x; }); CPU_tensor_apply2( a4, a1, [](double& y, scalar_t x) { y = (double)(x * x); }); - for (int64_t i = 0; i < a0.numel(); i++) { + for (const auto i : c10::irange(a0.numel())) { auto target = a1.data_ptr()[i] * a1.data_ptr()[i]; ASSERT(a0.data_ptr()[i] == target); ASSERT(a4.data_ptr()[i] == target); @@ -71,7 +72,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6 a4, a1, a2, [](double& y, const scalar_t& x, const scalar_t& z) { y = (double)(x * x + z); }); - for (int64_t i = 0; i < a0.numel(); i++) { + for 
(const auto i : c10::irange(a0.numel())) { auto target = a1.data_ptr()[i] * a1.data_ptr()[i]; target = target + a2.data_ptr()[i]; ASSERT(a0.data_ptr()[i] == target); @@ -97,7 +98,7 @@ void test(DeprecatedTypeProperties& type, IntArrayRef shape, int64_t a = 0, int6 [](double& y, const scalar_t& x, const scalar_t& z, const scalar_t& a) { y = (double)(x * x + z * a); }); - for (int64_t i = 0; i < a0.numel(); i++) { + for (const auto i : c10::irange(a0.numel())) { auto target = a1.data_ptr()[i] * a1.data_ptr()[i]; target = target + a2.data_ptr()[i] * a3.data_ptr()[i]; ASSERT(a0.data_ptr()[i] == target); diff --git a/aten/src/ATen/test/atest.cpp b/aten/src/ATen/test/atest.cpp index 408898bd54bc6c..6ea874fdb4ad7b 100644 --- a/aten/src/ATen/test/atest.cpp +++ b/aten/src/ATen/test/atest.cpp @@ -1,6 +1,7 @@ #include #include +#include #include using namespace std; @@ -102,7 +103,7 @@ void trace() { auto foo_a = foo.accessor(); float trace = 0; - for (int i = 0; i < foo_a.size(0); i++) { + for (const auto i : c10::irange(foo_a.size(0))) { trace += foo_a[i][i]; } @@ -237,8 +238,8 @@ TEST_F(atest, atest) { // foo = foo[3]; auto foo_v = foo.accessor(); - for (int i = 0; i < foo_v.size(0); i++) { - for (int j = 0; j < foo_v.size(1); j++) { + for (const auto i : c10::irange(foo_v.size(0))) { + for (const auto j : c10::irange(foo_v.size(1))) { foo_v[i][j]++; } } diff --git a/aten/src/ATen/test/basic.cpp b/aten/src/ATen/test/basic.cpp index 2e4f8d27f427c1..f041ed4a73654c 100644 --- a/aten/src/ATen/test/basic.cpp +++ b/aten/src/ATen/test/basic.cpp @@ -4,6 +4,7 @@ #include #include #include +#include // for TH compat test only... 
struct THFloatTensor; @@ -84,7 +85,7 @@ void TestAdd(DeprecatedTypeProperties& type) { void TestZeros(DeprecatedTypeProperties& type) { auto begin = std::chrono::high_resolution_clock::now(); Tensor a = zeros({1024, 1024}, type); - for (int i = 1; i < 1000; ++i) { + for (const auto i : c10::irange(1, 1000)) { a = zeros({128, 128}, type); } auto end = std::chrono::high_resolution_clock::now(); @@ -102,7 +103,7 @@ void TestLoadsOfAdds(DeprecatedTypeProperties& type) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for (auto i = 0; i < 100000; i++) { + for (const auto i : c10::irange(100000)) { add_out(r, r, d); } auto end = std::chrono::high_resolution_clock::now(); @@ -119,7 +120,7 @@ void TestLoadOfAddsWithCopy(DeprecatedTypeProperties& type) { auto begin = std::chrono::high_resolution_clock::now(); Tensor d = ones({3, 4}, type); Tensor r = zeros({3, 4}, type); - for (auto i = 0; i < 100000; i++) { + for (const auto i : c10::irange(100000)) { r = add(r, d); } auto end = std::chrono::high_resolution_clock::now(); @@ -176,7 +177,7 @@ void TestCopyBroadcasting(DeprecatedTypeProperties& type) { Tensor a = zeros({4, 3}, type); Tensor e = rand({3}, type); a.copy_(e); - for (int i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { ASSERT_TRUE(a[i].equal(e)); } } @@ -247,13 +248,13 @@ void TestToString() { void TestIndexingByScalar() { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); - for (int64_t i = 0; i < tensor.numel(); ++i) { + for (const auto i : c10::irange(tensor.numel())) { ASSERT_TRUE(tensor[i].equal(one * i)); } for (size_t i = 0; i < static_cast(tensor.numel()); ++i) { ASSERT_TRUE(tensor[i].equal(one * static_cast(i))); } - for (int i = 0; i < tensor.numel(); ++i) { + for (const auto i : c10::irange(tensor.numel())) { ASSERT_TRUE(tensor[i].equal(one * i)); } // NOLINTNEXTLINE(bugprone-too-small-loop-variable) @@ -272,7 +273,7 @@ void TestIndexingByScalar() { void 
TestIndexingByZerodimTensor() { Tensor tensor = arange(0, 10, kInt); Tensor one = ones({}, kInt); - for (int i = 0; i < tensor.numel(); ++i) { + for (const auto i : c10::irange(tensor.numel())) { ASSERT_TRUE(tensor[one * i].equal(one * i)); } // Throw StartsWith( diff --git a/aten/src/ATen/test/cpu_generator_test.cpp b/aten/src/ATen/test/cpu_generator_test.cpp index c686c4e79531cc..40b2fbf3bd59f4 100644 --- a/aten/src/ATen/test/cpu_generator_test.cpp +++ b/aten/src/ATen/test/cpu_generator_test.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -160,7 +161,7 @@ TEST(CPUGeneratorImpl, TestPhiloxEngineOffset1) { // So if you want to skip 8 values, offset would // be 2, since 2*4=8. at::Philox4_32_10 engine2(123, 1, 2); - for(int i = 0; i < 8; i++){ + for (const auto i : c10::irange(8)) { // Note: instead of using the engine() call 8 times // we could have achieved the same functionality by // calling the incr() function twice. @@ -221,14 +222,14 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) { // test with zero seed at::mt19937 engine1(0); std::mt19937 engine2(0); - for(int i = 0; i < 10000; i++) { + for (const auto i : c10::irange(10000)) { ASSERT_EQ(engine1(), engine2()); } // test with large seed engine1 = at::mt19937(2147483647); engine2 = std::mt19937(2147483647); - for(int i = 0; i < 10000; i++) { + for (const auto i : c10::irange(10000)) { ASSERT_EQ(engine1(), engine2()); } @@ -237,7 +238,7 @@ TEST(CPUGeneratorImpl, TestMT19937EngineReproducibility) { auto seed = rd(); engine1 = at::mt19937(seed); engine2 = std::mt19937(seed); - for(int i = 0; i < 10000; i++) { + for (const auto i : c10::irange(10000)) { ASSERT_EQ(engine1(), engine2()); } diff --git a/aten/src/ATen/test/cuda_tensor_interop_test.cpp b/aten/src/ATen/test/cuda_tensor_interop_test.cpp index 2a2833b7b2748a..0b027bbfde96fb 100644 --- a/aten/src/ATen/test/cuda_tensor_interop_test.cpp +++ b/aten/src/ATen/test/cuda_tensor_interop_test.cpp @@ -2,6 +2,7 @@ 
#include #include +#include #include #include #include @@ -34,7 +35,7 @@ TEST(CUDACaffe2ToPytorch, SimpleLegacy) { auto at_cpu = at_tensor.cpu(); auto it = at_cpu.data_ptr(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(it[i], 777); } } @@ -53,7 +54,7 @@ TEST(CUDACaffe2ToPytorch, Simple) { auto at_cpu = at_tensor.cpu(); auto it = at_cpu.data_ptr(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(it[i], 777); } } @@ -109,7 +110,7 @@ TEST(CUDAPytorchToCaffe2, Op) { ASSERT_EQ(result.GetDeviceType(), caffe2::CUDA); auto data = result.data(); - for (int64_t i = 0; i < 25; i++) { + for (const auto i : c10::irange(25)) { ASSERT_EQ(cuda_get(data + i), 3.0); } at::Tensor at_result(result); diff --git a/aten/src/ATen/test/ivalue_test.cpp b/aten/src/ATen/test/ivalue_test.cpp index 50d2beaab369aa..cd0e82a2b0b8bc 100644 --- a/aten/src/ATen/test/ivalue_test.cpp +++ b/aten/src/ATen/test/ivalue_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include // Snippets for checking assembly. 
@@ -640,7 +641,7 @@ TEST(IValueTest, IdentityComparisonAndHashing) { auto moreSampleIValues = makeMoreSampleIValues(); ASSERT_EQ(sampleIValues.size(), moreSampleIValues.size()); - for (int ii = 0; ii < sampleIValues.size(); ++ii) { + for (const auto ii : c10::irange(sampleIValues.size())) { if (sampleIValues[ii].isComplexDouble() || sampleIValues[ii].isBlob() || sampleIValues[ii].isList() || diff --git a/aten/src/ATen/test/math_kernel_test.cpp b/aten/src/ATen/test/math_kernel_test.cpp index b6d7b5ae355a7b..15ce0af4001d5a 100644 --- a/aten/src/ATen/test/math_kernel_test.cpp +++ b/aten/src/ATen/test/math_kernel_test.cpp @@ -2,6 +2,7 @@ #include #include +#include using namespace at; @@ -115,7 +116,7 @@ TEST(MathKernelTest, MishBackward) { TEST(MathKernelTest, NarrowCopy) { auto x = rand({5, 8, 7}); - for (int64_t dim = 0; dim < 3; ++dim) { + for (const auto dim : c10::irange(3)) { const int64_t start = 1, length = 4; auto y_ref = x.narrow(dim, start, length); auto y_test = at::native::narrow_copy_dense(x, dim, start, length); diff --git a/aten/src/ATen/test/native_test.cpp b/aten/src/ATen/test/native_test.cpp index 2b7ebb2a789f7e..5f27ce4886e479 100644 --- a/aten/src/ATen/test/native_test.cpp +++ b/aten/src/ATen/test/native_test.cpp @@ -1,6 +1,7 @@ #include #include +#include using namespace at; @@ -16,7 +17,7 @@ using namespace at; void requireEqualTensorList(TensorList t1, TensorList t2) { ASSERT_EQ(t1.size(), t2.size()); - for (size_t i = 0; i < t1.size(); ++i) { + for (const auto i : c10::irange(t1.size())) { ASSERT_EQUAL(t1[i], t2[i]); } } @@ -74,7 +75,7 @@ void TestStack(TensorOptions T, Tensor& t) { auto z = rand({2, 3, 4}); auto inputs = {x, y, z}; - for (int64_t dim = 0; dim < 4; ++dim) { + for (const auto dim : c10::irange(4)) { _test_stack(inputs, dim, at::stack); } } @@ -85,7 +86,7 @@ void TestStack(TensorOptions T, Tensor& t) { auto z = rand({2, 3, 4}); auto inputs = {x, y, z}; - for (int64_t dim = 0; dim < 4; ++dim) { + for (const auto dim : 
c10::irange(4)) { _test_stack(inputs, dim, at::native::_stack); } } @@ -96,7 +97,7 @@ void TestStack(TensorOptions T, Tensor& t) { auto z = rand({2, 3, 4}); auto inputs = {x, y, z}; - for (int64_t dim = 0; dim < 4; ++dim) { + for (const auto dim : c10::irange(4)) { _test_stack(inputs, dim, at::native::_stack_cpu); } } diff --git a/aten/src/ATen/test/packedtensoraccessor_test.cpp b/aten/src/ATen/test/packedtensoraccessor_test.cpp index 69b09835fb6993..2a56fb4a436e37 100644 --- a/aten/src/ATen/test/packedtensoraccessor_test.cpp +++ b/aten/src/ATen/test/packedtensoraccessor_test.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -34,7 +35,7 @@ TEST(PackedtensoraccessorTest, TransposeTest) { t = rand({size}, CPU(kFloat)); auto original_1d = t.packed_accessor64(); auto transposed_1d = original_1d.transpose(0, 0); - for (int i = 0; i < size; i++){ + for (const auto i : c10::irange(size)) { ASSERT_EQ(original_1d[i], transposed_1d[i]); } diff --git a/aten/src/ATen/test/pow_test.cpp b/aten/src/ATen/test/pow_test.cpp index 24ba446e031a7b..64cec8a1699d73 100644 --- a/aten/src/ATen/test/pow_test.cpp +++ b/aten/src/ATen/test/pow_test.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -203,7 +204,7 @@ void tensor_pow_tensor(const Vals vals, c10::ScalarType vals_dtype, Pows pows, c std::cout.precision(dbl::max_digits10); const auto vals_tensor = torch::tensor(vals, vals_dtype); - for (size_t shift = 0; shift < pows.size(); shift++) { + for (const auto shift : c10::irange(pows.size())) { const auto pows_tensor = torch::tensor(pows, pows_dtype); const auto actual_pow = vals_tensor.pow(pows_tensor); diff --git a/aten/src/ATen/test/quantized_test.cpp b/aten/src/ATen/test/quantized_test.cpp index 91b17e0be61547..06373d30edd023 100644 --- a/aten/src/ATen/test/quantized_test.cpp +++ b/aten/src/ATen/test/quantized_test.cpp @@ -11,6 +11,7 @@ // For quantize_val #include #include +#include #include using namespace at; @@ -30,14 +31,14 @@ 
TEST(TestQTensor, QuantDequantAPIs) { // int_repr Tensor int_repr = qr.int_repr(); auto* int_repr_data = int_repr.data_ptr(); - for (auto i = 0; i < num_elements; ++i) { + for (const auto i : c10::irange(num_elements)) { ASSERT_EQ(int_repr_data[i], 3); } // Check for correct quantization auto r_data = r.data_ptr(); auto qr_data = qr.data_ptr(); - for (auto i = 0; i < num_elements; ++i) { + for (const auto i : c10::irange(num_elements)) { ASSERT_EQ( native::quantize_val(scale, zero_point, r_data[i]).val_, qr_data[i].val_); @@ -46,10 +47,10 @@ TEST(TestQTensor, QuantDequantAPIs) { // Check for correct dequantization Tensor rqr = qr.dequantize(); auto rqr_data = rqr.data_ptr(); - for (auto i = 0; i < num_elements; ++i) { + for (const auto i : c10::irange(num_elements)) { ASSERT_EQ(r_data[i], rqr_data[i]); } - for (auto i = 0; i < num_elements; ++i) { + for (const auto i : c10::irange(num_elements)) { ASSERT_EQ( r_data[i], native::dequantize_val(qr.q_scale(), qr.q_zero_point(), qr_data[i])); @@ -60,7 +61,7 @@ TEST(TestQTensor, QuantDequantAPIs) { int64_t new_zero_point = 1; Tensor reqr = at::quantize_per_tensor(r, new_scale, new_zero_point, kQInt8); auto reqr_data = reqr.data_ptr(); - for (auto i = 0; i < num_elements; ++i) { + for (const auto i : c10::irange(num_elements)) { reqr_data[i].val_ = native::requantize_val( scale, zero_point, new_scale, new_zero_point, qr_data[i]) @@ -85,7 +86,7 @@ TEST(TestQTensor, RoundingMode) { Tensor qx = at::quantize_per_tensor(x, /*scale=*/1.0, zero_point, kQUInt8); auto qx_data = qx.data_ptr(); - for (size_t idx = 0; idx < x_values.size(); ++idx) { + for (const auto idx : c10::irange(x_values.size())) { ASSERT_EQ(qx_expect[idx], qx_data[idx].val_) << "Tie breaking during rounding element " << idx << " failed!"; } @@ -108,14 +109,14 @@ TEST(TestQTensor, EmptyQuantized) { {numel}, at::device(at::kCPU).dtype(kQUInt8), scale, zero_point); // Assigning to QTensor auto* q_data = q.data_ptr(); - for (int i = 0; i < numel; ++i) { + for 
(const auto i : c10::irange(numel)) { q_data[i].val_ = val; } // dequantize auto r = q.dequantize(); auto* r_data = r.data_ptr(); - for (int i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { ASSERT_EQ(r_data[i], (val - zero_point) * scale); } } @@ -134,14 +135,14 @@ TEST(TestQTensor, EmptyPerchannelQuantized) { at::device(at::kCPU).dtype(kQUInt8)); // Assigning to QTensor auto* q_data = q.data_ptr(); - for (int i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { q_data[i].val_ = val; } // dequantize auto r = q.dequantize(); auto* r_data = r.data_ptr(); - for (int i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { ASSERT_EQ( r_data[i], (val - zero_points[i].item().to()) * scales[i].item().to()); @@ -222,7 +223,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) { custom_vec->reserve(numel); uint8_t* custom_data = custom_vec->data(); - for (auto i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { custom_data[i] = i; } bool customDataDeleted{false}; @@ -236,7 +237,7 @@ TEST(TestQTensor, FromBlobQuantizedPerTensor) { Tensor qtensor = at::from_blob_quantized_per_tensor_affine(custom_data, shape, deleter, scale, zero_point, options); uint8_t* q_data = (uint8_t*)qtensor.data_ptr(); - for (auto i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { ASSERT_EQ((int)custom_data[i], (int)q_data[i]); } ASSERT_EQ((float)qtensor.q_scale(), (float)scale); @@ -258,7 +259,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) { custom_vec->reserve(numel); uint8_t* custom_data = custom_vec->data(); - for (auto i = 0; i < numel; ++i) { + for (const auto i : c10::irange(numel)) { custom_data[i] = i; } bool customDataDeleted{false}; @@ -271,7 +272,7 @@ TEST(TestQTensor, FromBlobQuantizedPerChannel) { { Tensor qtensor = at::from_blob_quantized_per_channel_affine(custom_data, shape, deleter, scales, zero_points, ch_axis, options); uint8_t* q_data = (uint8_t*)qtensor.data_ptr(); - for (auto i = 0; i < numel; 
++i) { + for (const auto i : c10::irange(numel)) { ASSERT_EQ((int)custom_data[i], (int)q_data[i]); } ASSERT_TRUE(at::allclose(qtensor.q_per_channel_scales(), scales)); diff --git a/aten/src/ATen/test/tensor_interop_test.cpp b/aten/src/ATen/test/tensor_interop_test.cpp index 7e82cd8addb749..c04dcdf77c1ed2 100644 --- a/aten/src/ATen/test/tensor_interop_test.cpp +++ b/aten/src/ATen/test/tensor_interop_test.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -8,13 +9,13 @@ TEST(Caffe2ToPytorch, SimpleLegacy) { caffe2::Tensor c2_tensor(caffe2::CPU); c2_tensor.Resize(4, 4); auto data = c2_tensor.mutable_data(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { data[i] = i; } at::Tensor at_tensor(c2_tensor); auto it = at_tensor.data_ptr(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(it[i], i); } } @@ -22,13 +23,13 @@ TEST(Caffe2ToPytorch, SimpleLegacy) { TEST(Caffe2ToPytorch, Simple) { caffe2::Tensor c2_tensor = caffe2::empty({4, 4}, at::kLong); auto data = c2_tensor.mutable_data(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { data[i] = i; } at::Tensor at_tensor(c2_tensor); auto it = at_tensor.data_ptr(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(it[i], i); } } @@ -37,7 +38,7 @@ TEST(Caffe2ToPytorch, ExternalData) { caffe2::Tensor c2_tensor = caffe2::empty({4, 4}, at::kLong); // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers) int64_t buf[16]; - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { buf[i] = i; } c2_tensor.ShareExternalPointer(buf, 16 * sizeof(int64_t)); @@ -48,7 +49,7 @@ TEST(Caffe2ToPytorch, ExternalData) { at_tensor.permute({1, 0}); at_tensor.permute({1, 0}); auto it = at_tensor.data_ptr(); - for (int64_t i = 0; i < 16; i++) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(it[i], i); } 
ASSERT_FALSE(at_tensor.storage().resizable()); @@ -60,7 +61,7 @@ TEST(Caffe2ToPytorch, Op) { caffe2::Tensor c2_tensor(caffe2::CPU); c2_tensor.Resize(3, 3); auto data = c2_tensor.mutable_data(); - for (int64_t i = 0; i < 9; i++) { + for (const auto i : c10::irange(9)) { data[i] = i; } at::Tensor at_tensor(c2_tensor); @@ -107,7 +108,7 @@ TEST(Caffe2ToPytorch, PartiallyInitialized) { TEST(Caffe2ToPytorch, MutualResizes) { caffe2::Tensor c2_tensor = caffe2::empty({5, 5}, at::kFloat); auto data = c2_tensor.mutable_data(); - for (int64_t i = 0; i < 25; i++) { + for (const auto i : c10::irange(25)) { data[i] = 0; } @@ -171,7 +172,7 @@ TEST(PytorchToCaffe2, Op) { auto result = XBlobGetMutableTensor(workspace.CreateBlob("d"), {5, 5}, at::kCPU); auto it = result.data(); - for (int64_t i = 0; i < 25; i++) { + for (const auto i : c10::irange(25)) { ASSERT_EQ(it[i], 3.0); } at::Tensor at_result(result); @@ -202,7 +203,7 @@ TEST(PytorchToCaffe2, SharedStorageRead) { auto result = XBlobGetMutableTensor(workspace.CreateBlob("c"), {5, 5}, at::kCPU); auto it = result.data(); - for (int64_t i = 0; i < 25; i++) { + for (const auto i : c10::irange(25)) { ASSERT_EQ(it[i], 2.0); } at::Tensor at_result(result); @@ -259,7 +260,7 @@ TEST(PytorchToCaffe2, Strided) { ASSERT_ANY_THROW(caffe2::Tensor c2_tensor(at_tensor)); // but calling contiguous is fine caffe2::Tensor c2_tensor(at_tensor.contiguous()); - for (int64_t i = 0; i < 25; i++) { + for (const auto i : c10::irange(25)) { ASSERT_EQ(c2_tensor.data()[i], 1.0); } } diff --git a/aten/src/ATen/test/thread_init_test.cpp b/aten/src/ATen/test/thread_init_test.cpp index 55df55f3b58cf0..b7f7452e9bea30 100644 --- a/aten/src/ATen/test/thread_init_test.cpp +++ b/aten/src/ATen/test/thread_init_test.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ void test(int given_num_threads) { ASSERT_TRUE(given_num_threads >= 0); ASSERT_EQ(at::get_num_threads(), given_num_threads); auto t_sum = t.sum(); - for (int i = 0; i < 
1000; ++i) { + for (const auto i : c10::irange(1000)) { t_sum = t_sum + t.sum(); } } diff --git a/aten/src/ATen/test/vec_test_all_types.cpp b/aten/src/ATen/test/vec_test_all_types.cpp index a049dfce419e87..becb9c4dd5bc49 100644 --- a/aten/src/ATen/test/vec_test_all_types.cpp +++ b/aten/src/ATen/test/vec_test_all_types.cpp @@ -1,4 +1,5 @@ #include +#include namespace { #if GTEST_HAS_TYPED_TEST template @@ -455,7 +456,7 @@ namespace { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) CACHE_ALIGN VT expected_vals[vec::size()]; auto vals = 1 << (vec::size()); - for (int val = 0; val < vals; ++val) { + for (const auto val : c10::irange(vals)) { for (int i = 0; i < vec::size(); ++i) { if (val & (1 << i)) { test_vals[i] = std::numeric_limits::quiet_NaN(); @@ -747,7 +748,7 @@ namespace { CACHE_ALIGN VT test_vals[vec::size()]; //all sets will be within 0 2^(n-1) auto power_sets = 1 << (vec::size()); - for (int expected = 0; expected < power_sets; expected++) { + for (const auto expected : c10::irange(power_sets)) { // generate test_val based on expected for (int i = 0; i < vec::size(); ++i) { @@ -894,7 +895,7 @@ namespace { void blend_init(T(&a)[N], T(&b)[N]) { a[0] = (T)1.0; b[0] = a[0] + (T)N; - for (int i = 1; i < N; i++) { + for (const auto i : c10::irange(1, N)) { a[i] = a[i - 1] + (T)(1.0); b[i] = b[i - 1] + (T)(1.0); } @@ -905,7 +906,7 @@ namespace { auto add = Complex(1., 100.); a[0] = Complex(1., 100.); b[0] = Complex(5., 1000.); - for (int i = 1; i < 4; i++) { + for (const auto i : c10::irange(1, 4)) { a[i] = a[i - 1] + add; b[i] = b[i - 1] + add; } @@ -1051,7 +1052,8 @@ namespace { float minv = static_cast(static_cast(min_val) * 2.0); float maxv = static_cast(static_cast(max_val) * 2.0); ValueGen gen(minv, maxv, seed.add(2)); - for (int i = 0; i < trials; i++) { + for (const auto i : c10::irange(trials)) { + (void)i; // Suppress unused variable warning float scale = generator_sc.get(); float inv_scale = 1.0f / static_cast(scale); auto 
zero_point_val = generator_zp.get(); @@ -1088,7 +1090,8 @@ namespace { ValueGen generator(min_val, max_val, seed.add(1)); //scale ValueGen generator_sc(1.f, 15.f, seed.add(2)); - for (int i = 0; i < trials; i++) { + for (const auto i : c10::irange(trials)) { + (void)i; // Suppress unused variable warning float scale = generator_sc.get(); int32_t zero_point_val = generator.get(); float scale_zp_premul = -(scale * zero_point_val); @@ -1135,7 +1138,8 @@ namespace { ValueGen generator(min_val, max_val, seed); //scale ValueGen generator_sc(1.f, 15.f, seed.add(1)); - for (int i = 0; i < trials; i++) { + for (const auto i : c10::irange(trials)) { + (void)i; // Suppress unused variable warning float multiplier = 1.f / (generator_sc.get()); auto zero_point_val = generator.get(); int index = 0; @@ -1172,7 +1176,8 @@ namespace { typename vec::int_vec_return_type expected_int_ret; auto seed = TestSeed(); ValueGen generator(min_val, max_val, seed); - for (int i = 0; i < trials; i++) { + for (const auto i : c10::irange(trials)) { + (void)i; // Suppress unused variable warning //generate vals for (int j = 0; j < vec::size(); j++) { qint_vals[j] = generator.get(); @@ -1251,7 +1256,7 @@ namespace { CACHE_ALIGN VT ref_y[N]; auto seed = TestSeed(); ValueGen generator(VT(-100), VT(100), seed); - for (int64_t i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { x1[i] = generator.get(); x2[i] = generator.get(); x3[i] = generator.get(); @@ -1263,19 +1268,19 @@ namespace { }; // test map: y = x1 at::vec::map([](vec x) { return x; }, y, x1, N); - for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i]; } + for (const auto i : c10::irange(N)) { ref_y[i] = x1[i]; } cmp(y, ref_y); // test map2: y = x1 + x2 at::vec::map2([](vec x1, vec x2) { return x1 + x2; }, y, x1, x2, N); - for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i]; } + for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i]; } cmp(y, ref_y); // test map3: y = x1 + x2 + x3 at::vec::map3([](vec x1, vec x2, vec x3) 
{ return x1 + x2 + x3; }, y, x1, x2, x3, N); - for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i] + x3[i]; } + for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i] + x3[i]; } cmp(y, ref_y); // test map4: y = x1 + x2 + x3 + x4 at::vec::map4([](vec x1, vec x2, vec x3, vec x4) { return x1 + x2 + x3 + x4; }, y, x1, x2, x3, x4, N); - for (int64_t i = 0; i < N; i++) { ref_y[i] = x1[i] + x2[i] + x3[i] + x4[i]; } + for (const auto i : c10::irange(N)) { ref_y[i] = x1[i] + x2[i] + x3[i] + x4[i]; } cmp(y, ref_y); } TYPED_TEST(FunctionalBF16Tests, Reduce) { @@ -1294,7 +1299,7 @@ namespace { CACHE_ALIGN VT x_b3[N]; auto seed = TestSeed(); ValueGen generator(RT(-1), RT(1), seed); - for (int64_t i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { x_f1[i] = generator.get(); x_f2[i] = generator.get(); x_f3[i] = generator.get(); @@ -1362,7 +1367,7 @@ namespace { CACHE_ALIGN VT y_b[N]; auto seed = TestSeed(); ValueGen generator(RT(-1), RT(1), seed); - for (int64_t i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { x_f1[i] = generator.get(); x_f2[i] = generator.get(); x_f3[i] = generator.get(); @@ -1379,7 +1384,7 @@ namespace { for (int64_t len = 1; len <= N; len++) { at::vec::map([](auto x) { return x; }, y_f, x_f1, len); at::vec::map([](auto x) { return x; }, y_b, x_b1, len); - for (int64_t i = 0; i < len; i++) { + for (const auto i : c10::irange(len)) { ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed << "\nmap, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]); } @@ -1388,7 +1393,7 @@ namespace { for (int64_t len = 1; len <= N; len++) { at::vec::map2([](auto x, auto y) { return x + y; }, y_f, x_f1, x_f2, len); at::vec::map2([](auto x, auto y) { return x + y; }, y_b, x_b1, x_b2, len); - for (int64_t i = 0; i < len; i++) { + for (const auto i : c10::irange(len)) { ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed 
<< "\nmap2, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]); } @@ -1397,7 +1402,7 @@ namespace { for (int64_t len = 1; len <= N; len++) { at::vec::map3([](auto x, auto y, auto z) { return x + y * z; }, y_f, x_f1, x_f2, x_f3, len); at::vec::map3([](auto x, auto y, auto z) { return x + y * z; }, y_b, x_b1, x_b2, x_b3, len); - for (int64_t i = 0; i < len; i++) { + for (const auto i : c10::irange(len)) { ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed << "\nmap3, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]); } @@ -1406,7 +1411,7 @@ namespace { for (int64_t len = 1; len <= N; len++) { at::vec::map4([](auto x, auto y, auto z, auto w) { return x + y * z - w; }, y_f, x_f1, x_f2, x_f3, x_f4, len); at::vec::map4([](auto x, auto y, auto z, auto w) { return x + y * z - w; }, y_b, x_b1, x_b2, x_b3, x_b4, len); - for (int64_t i = 0; i < len; i++) { + for (const auto i : c10::irange(len)) { ASSERT_TRUE(cmp(y_f[i], y_b[i])) << "Failure Details:\nTest Seed to reproduce: " << seed << "\nmap4, Length: " << len << "; index: " << i << "; fp32 reference: " << y_f[i] << "; bf16 value: " << RT(y_b[i]); } diff --git a/aten/src/ATen/test/vec_test_all_types.h b/aten/src/ATen/test/vec_test_all_types.h index 8b0854866a946c..49a01b54b080e1 100644 --- a/aten/src/ATen/test/vec_test_all_types.h +++ b/aten/src/ATen/test/vec_test_all_types.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -869,8 +870,7 @@ class AssertVectorized act.store(actArr); if (bitwise) { - for (int i = 0; i < sizeX; i++) - { + for (const auto i : c10::irange(sizeX)) { BVT b_exp = bit_cast(expArr[i]); BVT b_act = bit_cast(actArr[i]); EXPECT_EQ(b_exp, b_act) << getDetail(i / unitStorageCount); @@ -880,8 +880,7 @@ class AssertVectorized } else if (checkWithTolerance) { - for (int i = 0; i < sizeX; i++) - { + for (const auto i : 
c10::irange(sizeX)) { EXPECT_EQ(nearlyEqual(expArr[i], actArr[i], absErr), true) << expArr[i] << "!=" << actArr[i] << "\n" << getDetail(i / unitStorageCount); if (::testing::Test::HasFailure()) return true; @@ -889,8 +888,7 @@ class AssertVectorized } else { - for (int i = 0; i < sizeX; i++) - { + for (const auto i : c10::irange(sizeX)) { if (std::is_same::value) { if (!check_both_nan(expArr[i], actArr[i])) { @@ -952,8 +950,9 @@ void test_unary( UVT start = dmn_argc > 0 ? dmn.ArgsDomain[0].start : default_start; UVT end = dmn_argc > 0 ? dmn.ArgsDomain[0].end : default_end; ValueGen generator(start, end, seed.add(changeSeedBy)); - for (int trial = 0; trial < trialCount; trial++) { - for (int k = 0; k < el_count; k++) { + for (const auto trial : c10::irange(trialCount)) { + (void)trial; // Suppress unused variable warning + for (const auto k : c10::irange(el_count)) { vals[k] = generator.get(); call_filter(filter, vals[k]); //map operator @@ -1011,8 +1010,9 @@ void test_binary( UVT end1 = dmn_argc > 1 ? 
dmn.ArgsDomain[1].end : default_end; ValueGen generator0(start0, end0, seed.add(changeSeedBy)); ValueGen generator1(start1, end1, seed.add(changeSeedBy + 1)); - for (int trial = 0; trial < trialCount; trial++) { - for (int k = 0; k < el_count; k++) { + for (const auto trial : c10::irange(trialCount)) { + (void)trial; // Suppress unused variable warning + for (const auto k : c10::irange(el_count)) { vals0[k] = generator0.get(); vals1[k] = generator1.get(); call_filter(filter, vals0[k], vals1[k]); @@ -1076,8 +1076,9 @@ void test_ternary( ValueGen generator1(start1, end1, seed.add(changeSeedBy + 1)); ValueGen generator2(start2, end2, seed.add(changeSeedBy + 2)); - for (int trial = 0; trial < trialCount; trial++) { - for (int k = 0; k < el_count; k++) { + for (const auto trial : c10::irange(trialCount)) { + (void)trial; // Suppress unused variable warning + for (const auto k : c10::irange(el_count)) { vals0[k] = generator0.get(); vals1[k] = generator1.get(); vals2[k] = generator2.get(); diff --git a/aten/src/ATen/test/vitals.cpp b/aten/src/ATen/test/vitals.cpp index a486fb49dabcaa..93b2337f2b694d 100644 --- a/aten/src/ATen/test/vitals.cpp +++ b/aten/src/ATen/test/vitals.cpp @@ -3,6 +3,7 @@ #include #include +#include #include using namespace at::vitals; @@ -62,7 +63,7 @@ TEST(Vitals, MultiString) { } TEST(Vitals, OnAndOff) { - for (auto i = 0; i < 2; ++i) { + for (const auto i : c10::irange(2)) { std::stringstream buffer; std::streambuf* sbuf = std::cout.rdbuf(); diff --git a/aten/src/ATen/test/vmap_test.cpp b/aten/src/ATen/test/vmap_test.cpp index 28befede6483f2..5087a16ac4b922 100644 --- a/aten/src/ATen/test/vmap_test.cpp +++ b/aten/src/ATen/test/vmap_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include using namespace at; @@ -55,7 +56,7 @@ TEST(VmapTest, TestBatchedTensor) { // returns {{lvl=0,dim=0}, {lvl=1,dim=1}, ..., {lvl=kVmapNumLevels-1,dim=kVmapNumLevels-1}}; static BatchDims maxBatchDimsAtFront() { BatchDims result; - for (int64_t lvl = 0; lvl < 
kVmapNumLevels; lvl++) { + for (const auto lvl : c10::irange(kVmapNumLevels)) { result.emplace_back(lvl, /*dim=*/lvl); } return result; @@ -169,7 +170,7 @@ TEST(VmapTest, TestBatchedTensorActualDim) { { // ActualDim on kVmapMaxTensorDims sized underlying tensor auto tensor = ones({}); - for (int64_t i = 0; i < kVmapMaxTensorDims; i++) { + for (const auto i : c10::irange(kVmapMaxTensorDims)) { tensor = tensor.unsqueeze(0); } ASSERT_EQ(tensor.dim(), kVmapMaxTensorDims); @@ -260,7 +261,7 @@ TEST(VmapTest, TestMultiBatchVmapTransform) { BatchDims batch_dims = { {0, 2}, {1, 1}, {2, kVmapNumLevels - 1}, {3, 5}, {4, 0}, {5, 3}, {6, 4} }; - for (int64_t level = 7; level < kVmapNumLevels; level++ ) { + for (const auto level : c10::irange(7, kVmapNumLevels)) { batch_dims.emplace_back(level, /*dim=*/level - 1); } auto tensor = ones(sizes); @@ -303,7 +304,7 @@ TEST(VmapTest, TestVmapPhysicalViewGetPhysicalDims) { static void checkBatchDimsEqual(BatchDimsRef bdims, BatchDimsRef expected_bdims) { ASSERT_EQ(bdims.size(), expected_bdims.size()); - for (int64_t idx = 0; idx < bdims.size(); idx++) { + for (const auto idx : c10::irange(bdims.size())) { ASSERT_EQ(bdims[idx].dim(), expected_bdims[idx].dim()); ASSERT_EQ(bdims[idx].level(), expected_bdims[idx].level()); } @@ -394,7 +395,7 @@ TEST(VmapTest, TestBatchedTensorSum) { static void checkBroadcastingVmapTransform(TensorList inputs, TensorList expected_outputs) { auto outputs = BroadcastingVmapTransform::logicalToPhysical(inputs); ASSERT_EQ(outputs.size(), expected_outputs.size()); - for (int64_t idx = 0; idx < outputs.size(); idx++) { + for (const auto idx : c10::irange(outputs.size())) { const auto& output = outputs[idx].tensor(); ASSERT_EQ(output.data_ptr(), expected_outputs[idx].data_ptr()); ASSERT_TRUE(at::allclose(output, expected_outputs[idx])); @@ -878,7 +879,7 @@ TEST(VmapTest, TestBatchedTensorPermute) { static void checkMultiBatchVmapTransform(TensorList inputs, TensorList expected_outputs) { auto outputs = 
MultiBatchVmapTransform::logicalToPhysical(inputs); ASSERT_EQ(outputs.size(), expected_outputs.size()); - for (int64_t idx = 0; idx < outputs.size(); idx++) { + for (const auto idx : c10::irange(outputs.size())) { const auto& output = outputs[idx].tensor(); ASSERT_EQ(output.data_ptr(), expected_outputs[idx].data_ptr()); ASSERT_EQ(output.sizes(), expected_outputs[idx].sizes()); diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp index f1d8b2e44036d8..09c98fa214c189 100644 --- a/aten/src/ATen/test/vulkan_test.cpp +++ b/aten/src/ATen/test/vulkan_test.cpp @@ -5,6 +5,7 @@ #include #include #include +#include bool checkRtol(const at::Tensor& diff, const std::vector inputs) { double maxValue = 0.0; @@ -145,7 +146,7 @@ TEST(VulkanTest, addScalar) { auto t_in = at::rand({3, 2, 2, 3}, at::device(at::kCPU).dtype(at::kFloat)); float* data = t_in.data_ptr(); auto numel = t_in.numel(); - for (int i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) data[i] = i; } @@ -772,7 +773,7 @@ TEST(VulkanTest, tensor5d_transpose) { at::empty({1, 2, 3, 2, 1}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); float* data = t_in.data_ptr(); auto numel = t_in.numel(); - for (int i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) data[i] = i; } @@ -816,7 +817,7 @@ TEST(VulkanTest, slice) { at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); float* data = t_in.data_ptr(); auto numel = t_in.numel(); - for (int i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) data[i] = i; } @@ -841,7 +842,7 @@ TEST(VulkanTest, select) { at::empty({1, 4, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); float* data = t_in.data_ptr(); auto 
numel = t_in.numel(); - for (int i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) data[i] = i; } @@ -866,7 +867,7 @@ TEST(VulkanTest, unsqueeze) { at::empty({1, 2, 2}, at::TensorOptions(at::kCPU).dtype(at::kFloat)); float* data = t_in.data_ptr(); auto numel = t_in.numel(); - for (int i = 0; i < numel; i++) { + for (const auto i : c10::irange(numel)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) data[i] = i; } diff --git a/aten/src/THC/THCGeneral.cpp b/aten/src/THC/THCGeneral.cpp index 60f9eb23dcf1e4..1e7ab2591445a3 100644 --- a/aten/src/THC/THCGeneral.cpp +++ b/aten/src/THC/THCGeneral.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -57,16 +58,15 @@ void THCudaInit(THCState* state) // Currently the max number of gpus in P2P group is 8, so if there are more // we enable P2P in groups of 8 state->p2pAccessEnabled = (int**) calloc(numDevices, sizeof(int*)); - for (int i = 0; i < numDevices; ++i) { + for (const auto i : c10::irange(numDevices)) { state->p2pAccessEnabled[i] = (int*) calloc(numDevices, sizeof(int)); - for (int j = 0; j < numDevices; ++j) - if (i == j) + for (const auto j : c10::irange(numDevices))if (i == j) state->p2pAccessEnabled[i][j] = 1; else state->p2pAccessEnabled[i][j] = -1; } - for (int i = 0; i < numDevices; ++i) { + for (const auto i : c10::irange(numDevices)) { THCCudaResourcesPerDevice* res = THCState_getDeviceResourcePtr(state, i); THCudaCheck(cudaSetDevice(i)); @@ -96,7 +96,7 @@ void THCudaShutdown(THCState* state) THCudaCheck(cudaGetDeviceCount(&deviceCount)); /* cleanup p2p access state */ - for (int dev = 0; dev < deviceCount; ++dev) { + for (const auto dev : c10::irange(deviceCount)) { free(state->p2pAccessEnabled[dev]); } free(state->p2pAccessEnabled); diff --git a/aten/src/THC/THCTensor.cpp b/aten/src/THC/THCTensor.cpp index de76d11b265b70..9732ce00630991 
100644 --- a/aten/src/THC/THCTensor.cpp +++ b/aten/src/THC/THCTensor.cpp @@ -16,6 +16,7 @@ #include #include +#include void THCTensor_resizeAs(THCState *state, THCTensor *self, THCTensor *src) { int isSame = 0; diff --git a/aten/src/THC/generic/THCTensor.cpp b/aten/src/THC/generic/THCTensor.cpp index a65b7543dd1d81..f96aa7ed4f0c3b 100644 --- a/aten/src/THC/generic/THCTensor.cpp +++ b/aten/src/THC/generic/THCTensor.cpp @@ -4,6 +4,7 @@ #include #include +#include /**** creation methods ****/ diff --git a/benchmarks/cpp/tensorexpr/bench_concat.cpp b/benchmarks/cpp/tensorexpr/bench_concat.cpp index 70bfb4246f15cf..0a6ee26c7e8436 100644 --- a/benchmarks/cpp/tensorexpr/bench_concat.cpp +++ b/benchmarks/cpp/tensorexpr/bench_concat.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -15,14 +16,14 @@ class ConcatBench : public benchmark::Fixture { input_sizes_ = std::move(input_sizes); concat_dim_ = concat_dim; inputs_.resize(input_sizes_.size()); - for (size_t i = 0; i < input_sizes_.size(); ++i) { + for (const auto i : c10::irange(input_sizes_.size())) { inputs_[i] = torch::ones({input_sizes_[i][0], input_sizes_[i][1]}); } output_size_.resize(input_sizes_.front().size()); - for (size_t i = 0; i < output_size_.size(); ++i) { + for (const auto i : c10::irange(output_size_.size())) { if (i == static_cast(concat_dim_)) { output_size_[i] = 0; - for (size_t j = 0; j < input_sizes_.size(); ++j) { + for (const auto j : c10::irange(input_sizes_.size())) { output_size_[i] += input_sizes_[j][i]; } } else { @@ -65,7 +66,7 @@ class ConcatBench : public benchmark::Fixture { [&](const VarHandle& m, const VarHandle& n) { int d = 0; std::vector cumulative_concat_dim_sizes(num_inputs); - for (size_t i = 0; i < num_inputs; ++i) { + for (const auto i : c10::irange(num_inputs)) { cumulative_concat_dim_sizes[i] = d; d += input_sizes_[i][concat_dim_]; } @@ -121,7 +122,7 @@ class ConcatBench : public benchmark::Fixture { {input_sizes_[i][0], input_sizes_[i][1]}, kFloat)); 
std::vector for_vars(num_inputs); - for (size_t d = 0; d < num_dims; ++d) { + for (const auto d : c10::irange(num_dims)) { for_vars[d] = alloc("i" + std::to_string(i) + "_" + std::to_string(d), kInt); } diff --git a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp b/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp index 5a31312d635c01..d1e8d13bab5b3d 100644 --- a/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp +++ b/benchmarks/cpp/tensorexpr/bench_fuser_overhead.cpp @@ -2,6 +2,7 @@ #include #include #include +#include using namespace torch::jit; @@ -22,7 +23,7 @@ static void FusedOverhead(benchmark::State& state) { auto z = torch::ones({1}); // Warmup. - for (int i = 0; i < 8; i++) { + for (const auto i : c10::irange(8)) { m.run_method("two_adds", x, y, z); } @@ -43,7 +44,7 @@ static void UnfusedOverhead(benchmark::State& state) { auto z = torch::ones({1}); // Warmup. - for (int i = 0; i < 8; i++) { + for (const auto i : c10::irange(8)) { m.run_method("two_adds", x, y, z); } diff --git a/benchmarks/cpp/tensorexpr/bench_parallel.cpp b/benchmarks/cpp/tensorexpr/bench_parallel.cpp index 8f98c98a443950..1449ec33498bf7 100644 --- a/benchmarks/cpp/tensorexpr/bench_parallel.cpp +++ b/benchmarks/cpp/tensorexpr/bench_parallel.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -54,7 +55,7 @@ BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) { float* c_ptr = C.data_ptr(); std::vector args({c_ptr, a_ptr, b_ptr}); cg.value(args); - for (int i = 0; i < M; i++) { + for (const auto i : c10::irange(M)) { float diff = fabs(a_ptr[i] + b_ptr[i] - c_ptr[i]); TORCH_CHECK(diff < 1e-5); } diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp index 0db6753f6e9bdf..4ef999579304ec 100644 --- a/benchmarks/cpp/tensorexpr/bench_reduce.cpp +++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -72,7 +73,7 @@ static void reduce1d_naive(at::Tensor& A, 
at::Tensor& B) { int size = A.numel(); TORCH_CHECK(B.numel() == 1); *pB = 0.; - for (int i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { *pB += pA[i]; } } @@ -95,18 +96,18 @@ static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) { TORCH_CHECK(size % kChunkSize == 0); *pB = 0.; float temp[kChunkSize]; - for (int j = 0; j < kChunkSize; j++) { + for (const auto j : c10::irange(kChunkSize)) { temp[j] = 0; } int chunk_count = size / kChunkSize; - for (int i = 0; i < chunk_count; i++) { - for (int j = 0; j < kChunkSize; j++) { + for (const auto i : c10::irange(chunk_count)) { + for (const auto j : c10::irange(kChunkSize)) { temp[j] += pA[i * kChunkSize + j]; } } - for (int j = 0; j < kChunkSize; j++) { + for (const auto j : c10::irange(kChunkSize)) { *pB += temp[j]; } } @@ -157,7 +158,7 @@ static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) { temp = _mm256_setzero_ps(); int tile_count = size / kChunkSize; - for (int i = 0; i < tile_count; i++) { + for (const auto i : c10::irange(tile_count)) { __m256 data = _mm256_load_ps(pA + i * kChunkSize); temp = _mm256_add_ps(temp, data); } @@ -184,14 +185,14 @@ static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) { TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1"); TORCH_CHECK(size % kChunkSize == 0, "Invalid size: ", size, " % ", kChunkSize , " ! 
= 0"); __m256 t[kTileSize]; - for (int j = 0; j < kTileSize; j++) { + for (const auto j : c10::irange(kTileSize)) { t[j] = _mm256_setzero_ps(); } int tile_count = size / kChunkSize / kTileSize; - for (int i = 0; i < tile_count; i++) { + for (const auto i : c10::irange(tile_count)) { #pragma unroll - for (int j = 0; j < kTileSize; j++) { + for (const auto j : c10::irange(kTileSize)) { float *p = pA + (i * kTileSize + j) * kChunkSize; __m256 data = _mm256_loadu_ps(p); t[j] = _mm256_add_ps(t[j], data); @@ -199,7 +200,7 @@ static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) { } float result = sum_f32x8(t[0]); - for (int j = 1; j < kTileSize; j++) { + for (const auto j : c10::irange(1, kTileSize)) { result += sum_f32x8(t[j]); } *pB = result; @@ -531,15 +532,15 @@ BENCHMARK_DEFINE_F(Reduce2DRow, Hand)(benchmark::State& state) { for (int m_outer = 0; m_outer < M; m_outer += Mb) { float bregs[Mb][Nb] = {0.0f}; for (int n_outer = 0; n_outer < N; n_outer += Nb) { - for (int m_inner = 0; m_inner < Mb; m_inner++) { - for (int n_inner = 0; n_inner < Nb; n_inner++) { + for (const auto m_inner : c10::irange(Mb)) { + for (const auto n_inner : c10::irange(Nb)) { bregs[m_inner][n_inner] += a[(m_outer + m_inner) * N + n_outer + n_inner]; } } } - for (int m_inner = 0; m_inner < Mb; m_inner++) { + for (const auto m_inner : c10::irange(Mb)) { b[m_outer + m_inner] = 0.f; - for (int n_inner = 0; n_inner < Nb; n_inner++) { + for (const auto n_inner : c10::irange(Nb)) { b[m_outer + m_inner] += bregs[m_inner][n_inner]; } } diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index 6dfb3162e2c96b..bd48be7ff3a30e 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -24,6 +24,7 @@ #include "caffe2/core/operator.h" #include "caffe2/utils/string_utils.h" #include "c10/util/string_utils.h" +#include using std::map; using std::shared_ptr; @@ -55,12 +56,12 @@ void writeTextOutput( int dims_size = tensor_proto.dims_size(); long long elem_dim_size 
= dims_size > 1 ? tensor_proto.dims(1) : tensor_proto.dims(0); - for (int i = 2; i < dims_size; i++) { + for (const auto i : c10::irange(2, dims_size)) { elem_dim_size *= tensor_proto.dims(i); } std::vector lines; std::string dims; - for (int i = 0; i < dims_size; i++) { + for (const auto i : c10::irange(dims_size)) { int dim = tensor_proto.dims(i); if (i > 0) { dims += ", "; diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index fe75969c49b443..578d6ddac49ad0 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -2,6 +2,7 @@ #include #include #include +#include // TODO: rename flags to C10 C10_DEFINE_bool( @@ -30,7 +31,7 @@ void memset_junk(void* data, size_t num) { int32_t int64_count = num / sizeof(kJunkPattern64); int32_t remaining_bytes = num % sizeof(kJunkPattern64); int64_t* data_i64 = reinterpret_cast(data); - for (int i = 0; i < int64_count; i++) { + for (const auto i : c10::irange(int64_count)) { data_i64[i] = kJunkPattern64; } if (remaining_bytes > 0) { diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index fe8ad99e421af9..a5b7790f757449 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -5,6 +5,7 @@ #include #include #include +#include C10_DEFINE_bool( caffe2_keep_on_shrink, @@ -335,7 +336,7 @@ bool TensorImpl::compute_non_overlapping_and_dense() const { } SmallVector perm; perm.resize(dim()); - for (int64_t i = 0; i < dim(); i++) { + for (const auto i : c10::irange(dim())) { perm[i] = i; } // Sort by strides, leaving 0 and 1 sized dims at the end of the array @@ -349,7 +350,7 @@ bool TensorImpl::compute_non_overlapping_and_dense() const { sizes_and_strides_.stride_at_unchecked(b); }); auto require_stride = 1; - for (int64_t i = 0; i < dim(); i++) { + for (const auto i : c10::irange(dim())) { const auto size_perm_i = sizes_and_strides_.size_at_unchecked(perm[i]); if (size_perm_i < 2) { return true; diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 
857e28fe113ac4..8a01515740c5a7 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -19,6 +19,7 @@ #include #include #include +#include #include // A global boolean variable to control whether we free memory when a Tensor @@ -68,7 +69,7 @@ inline std::vector ToVectorint64_t(ArrayRef src) { */ inline int64_t size_from_dim_(int k, IntArrayRef dims) { int64_t r = 1; - for (size_t i = k; i < dims.size(); ++i) { + for (const auto i : c10::irange(k, dims.size())) { r *= dims[i]; } return r; @@ -78,7 +79,7 @@ inline int64_t size_from_dim_(int k, IntArrayRef dims) { inline int64_t size_to_dim_(int k, IntArrayRef dims) { TORCH_CHECK((unsigned)k <= dims.size()); int64_t r = 1; - for (int i = 0; i < k; ++i) { + for (const auto i : c10::irange(k)) { r *= dims[i]; } return r; @@ -2143,7 +2144,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { auto old_numel = numel_; sizes_and_strides_.resize(src.size()); int64_t new_numel = 1; - for (size_t i = 0; i < src.size(); ++i) { + for (const auto i : c10::irange(src.size())) { new_numel *= src[i]; sizes_and_strides_.size_at_unchecked(i) = src[i]; } diff --git a/c10/core/impl/InlineStreamGuard.h b/c10/core/impl/InlineStreamGuard.h index 295e3095e7a17d..7f4691e84a7903 100644 --- a/c10/core/impl/InlineStreamGuard.h +++ b/c10/core/impl/InlineStreamGuard.h @@ -2,6 +2,7 @@ #include #include +#include namespace c10 { namespace impl { @@ -237,7 +238,7 @@ class InlineMultiStreamGuard { static DeviceType getDeviceTypeOfStreams(ArrayRef streams) { TORCH_INTERNAL_ASSERT(!streams.empty()); DeviceType type = streams[0].device_type(); - for (size_t idx = 1; idx < streams.size(); idx++) { + for (const auto idx : c10::irange(1, streams.size())) { TORCH_CHECK_VALUE( streams[idx].device_type() == type, "Streams have a mix of device types: stream 0 is on ", diff --git a/c10/test/core/impl/SizesAndStrides_test.cpp b/c10/test/core/impl/SizesAndStrides_test.cpp index e7d988ed90fd4e..7031e18b2ff6d3 100644 --- 
a/c10/test/core/impl/SizesAndStrides_test.cpp +++ b/c10/test/core/impl/SizesAndStrides_test.cpp @@ -201,7 +201,7 @@ static SizesAndStrides makeBig(int offset = 0) { static void checkSmall(const SizesAndStrides& sm, int offset = 0) { std::vector sizes(3), strides(3); - for (int ii = 0; ii < 3; ++ii) { + for (const auto ii : c10::irange(3)) { sizes[ii] = ii + 1 + offset; strides[ii] = 2 * (ii + 1 + offset); } @@ -210,7 +210,7 @@ static void checkSmall(const SizesAndStrides& sm, int offset = 0) { static void checkBig(const SizesAndStrides& big, int offset = 0) { std::vector sizes(8), strides(8); - for (int ii = 0; ii < 8; ++ii) { + for (const auto ii : c10::irange(8)) { sizes[ii] = ii - 1 + offset; strides[ii] = 2 * (ii - 1 + offset); } diff --git a/c10/test/util/Bitset_test.cpp b/c10/test/util/Bitset_test.cpp index 12d5fcf01571f2..33c546d7d83ccd 100644 --- a/c10/test/util/Bitset_test.cpp +++ b/c10/test/util/Bitset_test.cpp @@ -1,6 +1,7 @@ #include #include +#include using c10::utils::bitset; @@ -37,7 +38,7 @@ TEST(BitsetTest, givenEmptyBitset_whenSettingBit_thenIsSet) { TEST(BitsetTest, givenEmptyBitset_whenSettingBit_thenOthersStayUnset) { bitset b; b.set(6); - for (size_t i = 0; i < 6; ++i) { + for (const auto i : c10::irange(6)) { EXPECT_FALSE(b.get(i)); } for (size_t i = 7; i < bitset::NUM_BITS(); ++i) { @@ -56,10 +57,10 @@ TEST(BitsetTest, givenNonemptyBitset_whenSettingBit_thenOthersStayAtOldValue) { bitset b; b.set(6); b.set(30); - for (size_t i = 0; i < 6; ++i) { + for (const auto i : c10::irange(6)) { EXPECT_FALSE(b.get(i)); } - for (size_t i = 7; i < 30; ++i) { + for (const auto i : c10::irange(7, 30)) { EXPECT_FALSE(b.get(i)); } for (size_t i = 31; i < bitset::NUM_BITS(); ++i) { @@ -82,7 +83,7 @@ TEST( b.set(6); b.set(30); b.unset(6); - for (size_t i = 0; i < 30; ++i) { + for (const auto i : c10::irange(30)) { EXPECT_FALSE(b.get(i)); } EXPECT_TRUE(b.get(30)); @@ -100,7 +101,7 @@ struct IndexCallbackMock final { void 
expect_was_called_for_indices(std::vector expected_indices) { EXPECT_EQ(expected_indices.size(), called_for_indices.size()); - for (size_t i = 0; i < expected_indices.size(); ++i) { + for (const auto i : c10::irange(expected_indices.size())) { EXPECT_EQ(expected_indices[i], called_for_indices[i]); } } diff --git a/c10/test/util/bfloat16_test.cpp b/c10/test/util/bfloat16_test.cpp index 2cce81f0e01ade..0de6882b46a14d 100644 --- a/c10/test/util/bfloat16_test.cpp +++ b/c10/test/util/bfloat16_test.cpp @@ -1,6 +1,7 @@ // clang-format off #include #include +#include // clang-format on #include @@ -24,7 +25,7 @@ float float_from_bytes(uint32_t sign, uint32_t exponent, uint32_t fraction) { TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) in[i] = i + 1.25; } @@ -34,7 +35,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float out[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { bfloats[i].x = c10::detail::bits_from_f32(in[i]); out[i] = c10::detail::f32_from_bits(bfloats[i].x); @@ -47,7 +48,7 @@ TEST(BFloat16Conversion, FloatToBFloat16AndBack) { TEST(BFloat16Conversion, FloatToBFloat16RNEAndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float in[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions,cppcoreguidelines-avoid-magic-numbers) in[i] = i + 1.25; } @@ -57,7 +58,7 @@ TEST(BFloat16Conversion, 
FloatToBFloat16RNEAndBack) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers,modernize-avoid-c-arrays) float out[100]; - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { bfloats[i].x = c10::detail::round_to_nearest_even(in[i]); out[i] = c10::detail::f32_from_bits(bfloats[i].x); diff --git a/c10/test/util/ordered_preserving_dict_test.cpp b/c10/test/util/ordered_preserving_dict_test.cpp index 82533c08c1f3d0..7a71c0b11f6666 100644 --- a/c10/test/util/ordered_preserving_dict_test.cpp +++ b/c10/test/util/ordered_preserving_dict_test.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -15,7 +16,7 @@ using dict_int_int = ska_ordered::order_preserving_flat_hash_map; dict_int_int test_dict(dict_int_int& dict) { - for (int64_t i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { dict[i] = i + 1; } @@ -33,18 +34,18 @@ dict_int_int test_dict(dict_int_int& dict) { // erase via iterators auto begin = dict.begin(); - for (size_t i = 0; i < 20; ++i) + for (const auto i : c10::irange(20)) begin++; auto end = begin; - for (size_t i = 0; i < 20; ++i) { + for (const auto i : c10::irange(20)) { erase_set.insert(end->first); end++; } dict.erase(begin, end); std::vector order; - for (size_t i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { if (!erase_set.count(i)) { order.push_back(i); } @@ -113,7 +114,7 @@ TEST(OrderedPreservingDictTest, DictCollisions) { for (auto init_dict_size : {27, 34, 41}) { bad_hash_dict dict; - for (int64_t i = 0; i < init_dict_size; ++i) { + for (const auto i : c10::irange(init_dict_size)) { dict[i] = i + 1; } @@ -131,18 +132,18 @@ TEST(OrderedPreservingDictTest, DictCollisions) { // erase a few entries via iterator auto begin = dict.begin(); - for (size_t i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { begin++; } auto end = begin; - for (size_t i = 0; i < 7; ++i) { + for (const auto i : c10::irange(7)) { erase_set.insert(end->first); 
end++; } dict.erase(begin, end); std::vector order; - for (int64_t i = 0; i < init_dict_size; ++i) { + for (const auto i : c10::irange(init_dict_size)) { if (!erase_set.count(i)) { order.push_back(i); } @@ -167,7 +168,7 @@ TEST(OrderedPreservingDictTest, test_range_insert) { // check values const int nb_values = 1000; std::vector> values; - for (int i = 0; i < nb_values; i++) { + for (const auto i : c10::irange(nb_values)) { // NOLINTNEXTLINE(modernize-use-emplace,performance-inefficient-vector-operation) values.push_back(std::make_pair(i, i + 1)); } @@ -190,7 +191,7 @@ TEST(OrderedPreservingDictTest, test_range_erase_all) { // insert x values, delete all const std::size_t nb_values = 1000; dict_int_int map; - for (size_t i = 0; i < nb_values; ++i) { + for (const auto i : c10::irange(nb_values)) { map[i] = i + 1; } auto it = map.erase(map.begin(), map.end()); @@ -206,7 +207,7 @@ TEST(OrderedPreservingDictTest, test_range_erase) { const std::size_t nb_values = 1000; HMap map; - for (size_t i = 0; i < nb_values; ++i) { + for (const auto i : c10::irange(nb_values)) { map[c10::guts::to_string(i)] = i; auto begin = map.begin(); for (size_t j = 0; j <= i; ++j, begin++) { @@ -305,7 +306,7 @@ TEST(OrderedPreservingDictTest, test_copy_constructor_and_operator) { const std::size_t nb_values = 100; HMap map; - for (size_t i = 0; i < nb_values; ++i) { + for (const auto i : c10::irange(nb_values)) { map[c10::guts::to_string(i)] = c10::guts::to_string(i); } diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp index 2c5e2e4cdca16f..d19f8d0ba5fd45 100644 --- a/c10/util/Backtrace.cpp +++ b/c10/util/Backtrace.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -281,8 +282,7 @@ std::string get_backtrace( // Toggles to true after the first skipped python frame. 
bool has_skipped_python_frames = false; - for (size_t frame_number = 0; frame_number < callstack.size(); - ++frame_number) { + for (const auto frame_number : c10::irange(callstack.size())) { const auto frame = parse_frame_information(symbols[frame_number]); if (skip_python_frames && frame && is_python_frame(*frame)) { diff --git a/c10/util/typeid.h b/c10/util/typeid.h index c651d4e7ad448a..396ea3eefdf72e 100644 --- a/c10/util/typeid.h +++ b/c10/util/typeid.h @@ -27,6 +27,7 @@ #include #include +#include /* * TypeIdentifier is a small type containing an id. @@ -170,7 +171,7 @@ struct TypeMetaData final { template inline void _PlacementNew(void* ptr, size_t n) { T* typed_ptr = static_cast(ptr); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { new (typed_ptr + i) T; } } @@ -234,7 +235,7 @@ template inline void _Copy(const void* src, void* dst, size_t n) { const T* typed_src = static_cast(src); T* typed_dst = static_cast(dst); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { typed_dst[i] = typed_src[i]; } } @@ -274,7 +275,7 @@ inline constexpr TypeMetaData::Copy* _PickCopy() { template inline void _PlacementDelete(void* ptr, size_t n) { T* typed_ptr = static_cast(ptr); - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { typed_ptr[i].~T(); } } diff --git a/caffe2/contrib/aten/aten_op_template.h b/caffe2/contrib/aten/aten_op_template.h index 68b1feda93b7dc..a5d1ea40e27a8b 100644 --- a/caffe2/contrib/aten/aten_op_template.h +++ b/caffe2/contrib/aten/aten_op_template.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -130,7 +131,7 @@ class ATenOp : public Operator { void assignListStartingAt( size_t offset, const std::vector& tensors) { - for (size_t i = 0; i < tensors.size(); i++) { + for (const auto i : c10::irange(tensors.size())) { assignTo(Output(offset + i), tensors[i]); } } @@ -176,7 +177,7 @@ class ATenOp : public Operator { std::stringstream descriptor; 
descriptor << op; std::vector attrs; - for(size_t i = 0; i < operator_def.arg_size(); i++) { + for (const auto i : c10::irange(operator_def.arg_size())) { auto & attr = operator_def.arg(i); if(attr.name() == "operator" || attr.name() == "type" ) continue; @@ -223,7 +224,7 @@ class ATenOp : public Operator { std::vector ints = OperatorBase::GetRepeatedArgument(name, {}); std::array result; - for (size_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { result[i] = ints.at(i); } return result; diff --git a/caffe2/contrib/fakelowp/fp16_fc_acc_op.h b/caffe2/contrib/fakelowp/fp16_fc_acc_op.h index 28554d4ec37d2d..49859ed1a373d9 100644 --- a/caffe2/contrib/fakelowp/fp16_fc_acc_op.h +++ b/caffe2/contrib/fakelowp/fp16_fc_acc_op.h @@ -118,8 +118,8 @@ class Fp16FCAccOp final : public Operator { if (!W_fbgemm->packed()) { float* W_fp16_trans = new float[W_size]; fbgemm::Float16ToFloat_avx2(W_fbgemm->pmat(), W_fp16_trans, W_size); - for (int i = 0; i < N; i++) { - for (int j = 0; j < K; j++) { + for (const auto i : c10::irange(N)) { + for (const auto j : c10::irange(K)) { W_fp16_[j * N + i] = W_fp16_trans[i * K + j]; } } @@ -136,8 +136,8 @@ class Fp16FCAccOp final : public Operator { const auto& W = Input(1); W_data = W.template data(); // Transpose W - for (int i = 0; i < N; i++) { - for (int j = 0; j < K; j++) { + for (const auto i : c10::irange(N)) { + for (const auto j : c10::irange(K)) { W_fp16_[j * N + i] = W_data[i * K + j]; } } @@ -352,7 +352,7 @@ class Fp16FCAccOp final : public Operator { #ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG float compute_L2_norm(float* A, int size) { float square_sum = 0.0; - for (int i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { square_sum += A[i] * A[i]; } return std::sqrt(square_sum); @@ -360,7 +360,7 @@ class Fp16FCAccOp final : public Operator { float compute_relative_error(float* A, float* A_ref, int size) { float error = 0.0; - for (int i = 0; i < size; i++) { + for (const auto i : 
c10::irange(size)) { error += (A[i] - A_ref[i]) * (A[i] - A_ref[i]); } error = std::sqrt(error); diff --git a/caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h b/caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h index 6769a978c39184..2c4cd39edfd85d 100644 --- a/caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h +++ b/caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h @@ -22,7 +22,7 @@ void Int8DequantizeNNPI( const float X_scale, const int32_t X_offset) { float X_scale_fp32 = 1.0f / X_scale; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { out[i] = (float)(static_cast(in[i]) - X_offset) / X_scale_fp32; } } // namespace diff --git a/caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h b/caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h index 9e10134c9ea0c1..723e2f741b0e4a 100644 --- a/caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h +++ b/caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h @@ -53,12 +53,12 @@ void Int8QuantizeNNPI( std::vector inv_scalev(N, inv_scale_fp16); std::vector offsetv(N, -offset_tmp); fake_fp16::fma_fp16(N, in_fp16.data(), inv_scalev.data(), offsetv.data()); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { offsetv[i] = round(offsetv[i]); } fbgemm::RoundToFloat16( offsetv.data(), offsetv.data(), N, false /* no clamping */); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { float halfRes = offsetv[i]; if (std::isinf(halfRes)) { if (halfRes > 0) { diff --git a/caffe2/contrib/fakelowp/int8_swish_op_nnpi.h b/caffe2/contrib/fakelowp/int8_swish_op_nnpi.h index a5185cc171fa4a..990af90616bcbf 100644 --- a/caffe2/contrib/fakelowp/int8_swish_op_nnpi.h +++ b/caffe2/contrib/fakelowp/int8_swish_op_nnpi.h @@ -29,7 +29,7 @@ void SwishFakeInt8NNPI( int32_t quant_val = 0; uint8_t result = 0; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { deq_val = (static_cast(in[i]) - X_offset) / X_scale_fp32; deq_swish = deq_val / (1 + exp(-deq_val)); quant_val = round(deq_swish / Y_scale + 
Y_offset); diff --git a/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h b/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h index 654836642501db..535b0a2460ecdb 100644 --- a/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/layernorm_fp16_fake_op.h @@ -129,7 +129,7 @@ class LayerNormFakeFp16Op final : public Operator { FLAGS_caffe2_fbgemm_fake_fp16_clamp, false /*USE_ACC_FP16*/); - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { // fma_fp16(A, B, Out) -> Out = A * B + Out std::vector out(N); std::memcpy(out.data(), bias_data.data(), sizeof(float) * N); @@ -169,7 +169,7 @@ class LayerNormFakeFp16Op final : public Operator { const int32_t qmin = std::numeric_limits::min(); const int32_t qmax = std::numeric_limits::max(); - for (int i = 0; i < Nout; i++) { + for (const auto i : c10::irange(Nout)) { float halfRes = offsetv[i]; halfRes = round(halfRes); if (std::isinf(halfRes)) { diff --git a/caffe2/contrib/fakelowp/lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h b/caffe2/contrib/fakelowp/lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h index 3316ad46af0783..9ef11ed1c870da 100644 --- a/caffe2/contrib/fakelowp/lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h @@ -85,7 +85,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator { const auto scale_bias_offset = 2 * sizeof(at::Half); const int64_t input_fused_block_size = input_block_size + scale_bias_offset; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { if (!use_fp16_for_embedding_only) { memset(rowTempSums[0].data(), 0, sizeof(float) * output_block_size); memset(rowTempSums[1].data(), 0, sizeof(float) * output_block_size); @@ -135,7 +135,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator { // Unpack int4 elements std::vector input_rounded(output_block_size); int k = 0; - for (int j 
= 0; j < input_block_size; j++) { + for (const auto j : c10::irange(input_block_size)) { input_rounded[k++] = input[input_fused_block_size * indices_data[current] + j] & 0x0f; input_rounded[k++] = @@ -150,7 +150,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator { input_rounded.data(), product_rounded.data()); - for (int j = 0; j < output_block_size; ++j) { + for (const auto j : c10::irange(output_block_size)) { product_rounded[j] += bias; } @@ -190,7 +190,7 @@ class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator { } if (!use_fp16_for_embedding_only) { - for (int j = 0; j < output_block_size; ++j) { + for (const auto j : c10::irange(output_block_size)) { out[j] = rowTempSums[0][j] + rowTempSums[1][j]; } fbgemm::RoundToFloat16( diff --git a/caffe2/contrib/fakelowp/lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h b/caffe2/contrib/fakelowp/lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h index 143a3d3fcd60fa..a5f8a19466c55b 100644 --- a/caffe2/contrib/fakelowp/lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h @@ -84,7 +84,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { const auto scale_bias_offset = 8 / sizeof(uint8_t); const int64_t fused_block_size = block_size + scale_bias_offset; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(out, 0, sizeof(float) * block_size); memset(rowTempSums[0].data(), 0, sizeof(float) * block_size); memset(rowTempSums[1].data(), 0, sizeof(float) * block_size); @@ -152,7 +152,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { // Fake fp16 rounding of input/ it is already ints std::vector input_rounded(block_size); - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { input_rounded[j] = input[fused_block_size * indices_data[current] + j]; } @@ -164,7 +164,7 @@ 
class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { TypedAxpy( block_size, scale, input_rounded.data(), product_rounded.data()); - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { product_rounded[j] += bias; } @@ -215,7 +215,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { block_size, FLAGS_caffe2_fbgemm_fake_fp16_clamp); - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { product_rounded[j] += bias; } // Fake fp16 rounding of w x scale x input + w x bias @@ -239,7 +239,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { block_size, FLAGS_caffe2_fbgemm_fake_fp16_clamp); } else if (use_acc_fp32) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float deqVal = fake_fp16::fmafp32_avx_emulation( scale, input_rounded[j], @@ -256,7 +256,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { TypedAxpy(block_size, scale, input_rounded.data(), out); - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { out[j] += bias; } } @@ -264,7 +264,7 @@ class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator { } if (use_nnpi_fma || use_acc_fp32) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { out[j] = rowTempSums[0][j] + rowTempSums[1][j]; } } diff --git a/caffe2/contrib/fakelowp/lengths_reducer_ops.h b/caffe2/contrib/fakelowp/lengths_reducer_ops.h index b626a61ef2db15..f451b83001eea9 100644 --- a/caffe2/contrib/fakelowp/lengths_reducer_ops.h +++ b/caffe2/contrib/fakelowp/lengths_reducer_ops.h @@ -94,7 +94,7 @@ class SparseLengthsReductionFakeFp16Op final : public Operator { float* out = out_data; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(out, 0, sizeof(float) * block_size); if (current + lengths[m] > 
index_size) { return false; diff --git a/caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h b/caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h index d6210362aa17d6..a0310a9f5def14 100644 --- a/caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h @@ -39,7 +39,7 @@ class TanhInt8QuantizeNNPIOp final : public Operator { Y_scale = 1.0f / Y_scale; // create table once - for (int i = 0; i < lutSize; i++) { + for (const auto i : c10::irange(lutSize)) { short input = i + tanhLUTMinOffset; float x = _cvtsh_ss(input); float tanh_x = tanh(x); @@ -54,7 +54,7 @@ class TanhInt8QuantizeNNPIOp final : public Operator { } const float* X_data = X.template data(); - for (int i = 0; i < X.numel(); i++) { + for (const auto i : c10::irange(X.numel())) { short val = _cvtss_sh(X_data[i], 0); unsigned short max16BitPositive = 0x7FFF; unsigned short input16Bit = (*(unsigned short*)& val); diff --git a/caffe2/contrib/fakelowp/spatial_batch_norm_fp16_fake_op.h b/caffe2/contrib/fakelowp/spatial_batch_norm_fp16_fake_op.h index 06c70f4c0d664c..9b909695040d42 100644 --- a/caffe2/contrib/fakelowp/spatial_batch_norm_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/spatial_batch_norm_fp16_fake_op.h @@ -159,7 +159,7 @@ class SpatialBNFakeLoweredFp16Op : public Operator { const int stride = C * HxW; const float* X_ptr = X; float* Y_ptr = Y; - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { EigenArrayMap(Y_ptr, HxW, C) = ConstEigenArrayMap(X_ptr, HxW, C).rowwise() - mean_arr.transpose(); @@ -356,9 +356,9 @@ class SpatialBNFakeFp16Op : public Operator { float* Y_ptr = Y; // Do Y = X * scale + bias - for (int i = 0; i < N; i++) { - for (int j = 0; j < C; j++) { - for (int k = 0; k < HxW; k++) { + for (const auto i : c10::irange(N)) { + for (const auto j : c10::irange(C)) { + for (const auto k : c10::irange(HxW)) { Y_ptr[HxW * j + k] = bias[j]; } diff --git a/caffe2/contrib/fakelowp/sum_fp16_fake_op.h 
b/caffe2/contrib/fakelowp/sum_fp16_fake_op.h index 947b156ba493a7..0f654110321f21 100644 --- a/caffe2/contrib/fakelowp/sum_fp16_fake_op.h +++ b/caffe2/contrib/fakelowp/sum_fp16_fake_op.h @@ -18,7 +18,7 @@ class SumFP16FP16AccOp : public Operator { size_t N = input0.numel(); auto* output = Output(0, input0.sizes(), at::dtype()); // Dimension checking - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { if (output->sizes() != Input(i).sizes()) { CAFFE_THROW( "Check failed: output->sizes() == Input(i).sizes().", @@ -37,7 +37,7 @@ class SumFP16FP16AccOp : public Operator { std::vector t1(N); std::vector t2(N); - for (auto i = 0; i < InputSize(); i++) { + for (const auto i : c10::irange(InputSize())) { fbgemm::RoundToFloat16( Input(i).template data(), t1.data(), diff --git a/caffe2/contrib/gloo/allgather_ops.h b/caffe2/contrib/gloo/allgather_ops.h index 0347cc485507a3..f2a690dd432f5e 100644 --- a/caffe2/contrib/gloo/allgather_ops.h +++ b/caffe2/contrib/gloo/allgather_ops.h @@ -85,13 +85,13 @@ class AllgatherOp final : public Operator { // Verify tensors all have same size size_t size = Input(1).numel(); - for (auto i = 2; i < InputSize(); i++) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE_EQ(Input(i).numel(), size); } // Verify tensors all have same type TypeMeta meta = Input(1).dtype(); - for (auto i = 2; i < InputSize(); i++) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE(Input(i).dtype() == meta); } @@ -113,7 +113,7 @@ class AllgatherOp final : public Operator { params.inputs.resize(InputSize() - 1); params.size = Input(1).numel(); params.meta = Input(1).dtype(); - for (auto i = 0; i < params.inputs.size(); i++) { + for (const auto i : c10::irange(params.inputs.size())) { params.inputs[i] = Input(i + 1).raw_data(); } params.outputs.resize(OutputSize()); diff --git a/caffe2/contrib/gloo/allreduce_ops.h b/caffe2/contrib/gloo/allreduce_ops.h index 475fc0b4d99be3..b3242ac47075cf 100644 
--- a/caffe2/contrib/gloo/allreduce_ops.h +++ b/caffe2/contrib/gloo/allreduce_ops.h @@ -65,19 +65,19 @@ class AllreduceOp final : public Operator { // Verify inputs == outputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); - for (auto i = 0U; i < init_.inputs.size(); i++) { + for (const auto i : c10::irange(0U, init_.inputs.size())) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); } // Verify tensors all have same size auto size = Input(1).numel(); - for (auto i = 2; i < InputSize(); i++) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE_EQ(Input(i).numel(), size); } // Verify tensors all have same type TypeMeta meta = Input(1).dtype(); - for (auto i = 2; i < InputSize(); i++) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE(Input(i).dtype() == meta); } @@ -115,7 +115,7 @@ class AllreduceOp final : public Operator { params.context = OperatorBase::Input>(0); params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); - for (auto i = 0U; i < params.inputs.size(); i++) { + for (const auto i : c10::irange(0U, params.inputs.size())) { params.inputs[i] = Input(i + 1).raw_data(); params.outputs[i] = Output(i)->raw_mutable_data(); } diff --git a/caffe2/contrib/gloo/broadcast_ops.h b/caffe2/contrib/gloo/broadcast_ops.h index 5c3af429bd4cc2..33c498b00c5ad8 100644 --- a/caffe2/contrib/gloo/broadcast_ops.h +++ b/caffe2/contrib/gloo/broadcast_ops.h @@ -60,19 +60,19 @@ class BroadcastOp final : public Operator { // Verify inputs == outputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); - for (auto i = 0; i < init_.inputs.size(); i++) { + for (const auto i : c10::irange(init_.inputs.size())) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); } // Verify tensors all have same size size_t size = Input(1).numel(); - for (auto i = 2; i < InputSize(); i++) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE_EQ(Input(i).numel(), size); } // Verify tensors all have same size 
TypeMeta meta = Input(1).dtype(); - for (auto i = 2; i < InputSize(); i++) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE(Input(i).dtype() == meta); } @@ -94,7 +94,7 @@ class BroadcastOp final : public Operator { params.context = OperatorBase::Input>(0); params.inputs.resize(InputSize() - 1); params.outputs.resize(OutputSize()); - for (auto i = 0; i < params.inputs.size(); i++) { + for (const auto i : c10::irange(params.inputs.size())) { params.inputs[i] = Input(i + 1).raw_data(); params.outputs[i] = Output(i)->raw_mutable_data(); } diff --git a/caffe2/contrib/gloo/reduce_scatter_ops.h b/caffe2/contrib/gloo/reduce_scatter_ops.h index 56113807d54f2a..9019ff549e51e5 100644 --- a/caffe2/contrib/gloo/reduce_scatter_ops.h +++ b/caffe2/contrib/gloo/reduce_scatter_ops.h @@ -75,7 +75,7 @@ class ReduceScatterOp final : public Operator { // Verify inputs == outputs CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size()); - for (auto i = 0; i < init_.inputs.size(); i++) { + for (const auto i : c10::irange(init_.inputs.size())) { CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]); } @@ -107,7 +107,7 @@ class ReduceScatterOp final : public Operator { params.context = OperatorBase::Input>(0); params.inputs.resize(InputSize() - 2); params.outputs.resize(OutputSize() - 1); - for (auto i = 0; i < params.inputs.size(); i++) { + for (const auto i : c10::irange(params.inputs.size())) { params.inputs[i] = Input(i + 1).raw_data(); params.outputs[i] = Output(i)->raw_mutable_data(); } diff --git a/caffe2/contrib/opencl/OpenCL/cl.hpp b/caffe2/contrib/opencl/OpenCL/cl.hpp index 87cc4a17d1078c..bb33da683ff02f 100644 --- a/caffe2/contrib/opencl/OpenCL/cl.hpp +++ b/caffe2/contrib/opencl/OpenCL/cl.hpp @@ -1241,7 +1241,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) return err; } - for(int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { (*param)[i] = value[i]; } diff --git a/caffe2/core/blob_serialization.h 
b/caffe2/core/blob_serialization.h index 0162e8393ee23b..68ba36956938bc 100644 --- a/caffe2/core/blob_serialization.h +++ b/caffe2/core/blob_serialization.h @@ -9,6 +9,8 @@ #include "caffe2/core/blob.h" #include "caffe2/core/blob_serializer_base.h" #include "caffe2/core/tensor.h" + +#include #include #include "caffe2/core/types.h" #include "caffe2/utils/simple_queue.h" @@ -201,7 +203,7 @@ void ExtendRepeatedField( #else // We unfortunately do still need to support old protobuf versions in some // build configurations. - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { field->Add(0); } #endif @@ -236,7 +238,7 @@ inline void CopyToProtoWithCast( context->template CopyToCPU(size, src, buffer.get()); context->FinishDeviceComputation(); field->Reserve(size); - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { field->Add(static_cast(buffer[i])); } } @@ -267,7 +269,7 @@ inline void CopyFromProtoWithCast( // CPUContext. Remove it if it is performance critical. 
unique_ptr buffer(new DstType[size]); const SrcType* src = field.data(); - for (size_t i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { buffer[i] = static_cast(src[i]); } context->template CopyFromCPU(size, buffer.get(), dst); diff --git a/caffe2/core/context.h b/caffe2/core/context.h index ac2096b33a7c6e..36fd4e400fe8c5 100644 --- a/caffe2/core/context.h +++ b/caffe2/core/context.h @@ -17,6 +17,7 @@ #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) #include +#include #include #include #else @@ -155,7 +156,7 @@ class TORCH_API CPUContext final : public BaseContext { static_cast(src), static_cast(dst)); } else { - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { dst[i] = src[i]; } } diff --git a/caffe2/core/db.h b/caffe2/core/db.h index 1c2a5a808b33d7..d045e97a25d408 100644 --- a/caffe2/core/db.h +++ b/caffe2/core/db.h @@ -4,6 +4,7 @@ #include #include +#include #include #include "caffe2/core/blob_serialization.h" #include "caffe2/proto/caffe2_pb.h" @@ -248,7 +249,8 @@ class TORCH_API DBReader { *value = cursor_->value(); // In sharded mode, each read skips num_shards_ records - for (uint32_t s = 0; s < num_shards_; s++) { + for (const auto s : c10::irange(num_shards_)) { + (void)s; // Suppress unused variable cursor_->Next(); if (!cursor_->Valid()) { MoveToBeginning(); @@ -292,7 +294,8 @@ class TORCH_API DBReader { void MoveToBeginning() const { cursor_->SeekToFirst(); - for (uint32_t s = 0; s < shard_id_; s++) { + for (const auto s : c10::irange(shard_id_)) { + (void)s; // Suppress unused variable cursor_->Next(); CAFFE_ENFORCE( cursor_->Valid(), "Db has fewer rows than shard id: ", s, shard_id_); diff --git a/caffe2/core/export_c10_op_to_caffe2.h b/caffe2/core/export_c10_op_to_caffe2.h index 9dda158c63d6b2..b8bbfda84a50ee 100644 --- a/caffe2/core/export_c10_op_to_caffe2.h +++ b/caffe2/core/export_c10_op_to_caffe2.h @@ -12,6 +12,7 @@ #include #include #include "caffe2/core/export_caffe2_op_to_c10.h" +#include 
namespace caffe2 { @@ -136,7 +137,7 @@ class C10OperatorWrapper final : public Operator { void popOutputs_() { AT_ASSERT(stack_.size() == op_.schema().returns().size()); - for (size_t i = 0; i < op_.schema().returns().size(); ++i) { + for (const auto i : c10::irange(op_.schema().returns().size())) { OperatorBase::SetOutputTensor(i, Tensor(std::move(stack_[i]).toTensor())); } stack_.clear(); @@ -146,7 +147,7 @@ class C10OperatorWrapper final : public Operator { c10::List result; result.reserve(InputSize()); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { result.emplace_back(Input(i)); } return result; @@ -156,7 +157,7 @@ class C10OperatorWrapper final : public Operator { c10::List result; result.reserve(OutputSize()); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { result.emplace_back(OperatorBase::OutputTensorOrUndefined(i)); } return result; diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h index bac3b0fd584656..66ffdf21a1085c 100644 --- a/caffe2/core/export_caffe2_op_to_c10.h +++ b/caffe2/core/export_caffe2_op_to_c10.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -94,7 +95,7 @@ inline void _call_caffe2_op_from_c10( // We should not unwrap the list if we expect tensor list in the schema. 
torch::jit::push(*stack, outputs); } else { - for (size_t i = 0; i < outputs.size(); ++i) { + for (const auto i : c10::irange(outputs.size())) { torch::jit::push(*stack, outputs.extract(i)); } } diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Converters/Dot.h b/caffe2/core/nomnigraph/include/nomnigraph/Converters/Dot.h index c0ff13f9245666..bf1ab5c5200cd3 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Converters/Dot.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Converters/Dot.h @@ -1,6 +1,7 @@ #ifndef NOM_CONVERTERS_DOT_H #define NOM_CONVERTERS_DOT_H +#include "c10/util/irange.h" #include "nomnigraph/Graph/Algorithms.h" #include "nomnigraph/Graph/Graph.h" #include "nomnigraph/Support/Casting.h" @@ -42,7 +43,7 @@ class DotGenerator { for (const auto& node : sg.getNodes()) { generateNode(node, sg, output); } - for (size_t i = 0; i < subgraphs.size(); ++i) { + for (const auto i : c10::irange(subgraphs.size())) { const auto& subgraph = subgraphs[i]; output << "subgraph cluster" << i << " {\n"; output << "style=dotted;\n"; diff --git a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h index 1cc80b71617a15..da2c60a627a091 100644 --- a/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h +++ b/caffe2/core/nomnigraph/include/nomnigraph/Transformations/SubgraphMatcher.h @@ -1,6 +1,7 @@ #ifndef NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H #define NOM_TRANFORMATIONS_SUBGRAPH_MATCHER_H +#include "c10/util/irange.h" #include "caffe2/core/common.h" #include "nomnigraph/Graph/Graph.h" @@ -240,8 +241,7 @@ class MatchGraph : public Graph> { // criteria in the given order. int currentEdgeIdx = 0; - for (int criteriaIdx = 0; criteriaIdx < numChildrenCriteria; - criteriaIdx++) { + for (const auto criteriaIdx : c10::irange(numChildrenCriteria)) { auto childrenCriteriaRef = invertGraphTraversal ? 
criteriaEdges[criteriaIdx]->tail() : criteriaEdges[criteriaIdx]->head(); diff --git a/caffe2/core/operator_schema.h b/caffe2/core/operator_schema.h index 0d048eb8d26e99..138dff0523df3b 100644 --- a/caffe2/core/operator_schema.h +++ b/caffe2/core/operator_schema.h @@ -10,6 +10,7 @@ #include #include "c10/util/Registry.h" +#include #include "caffe2/core/common.h" #include "caffe2/core/logging.h" #include "caffe2/core/types.h" @@ -519,7 +520,7 @@ inline uint64_t nElemFromDim(const TensorShape& X, int dim = 0) { CAFFE_ENFORCE_GE(dim, 0, "Invalid maximum index specified"); uint64_t nElem = 1; - for (int i = dim; i < X.dims_size(); ++i) { + for (const auto i : c10::irange(dim, X.dims_size())) { nElem *= X.dims(i); } return nElem; @@ -531,7 +532,7 @@ inline uint64_t nElemBetweenDim(const TensorShape& X, int start, int stop) { CAFFE_ENFORCE_LE(stop, X.dims_size(), "Invalid maximum index specified"); uint64_t nElem = 1; - for (int i = start; i < stop; ++i) { + for (const auto i : c10::irange(start, stop)) { nElem *= X.dims(i); } return nElem; @@ -560,7 +561,7 @@ OpSchema::Cost PointwiseCostInference( const TensorShape X = inputs[0]; uint64_t nElemX = nElemFromDim(X); uint64_t nElemRead = 0; - for (size_t i = 0; i < inputs.size(); ++i) { + for (const auto i : c10::irange(inputs.size())) { nElemRead += nElemFromDim(inputs[i]); } diff --git a/caffe2/core/qtensor.h b/caffe2/core/qtensor.h index dcebae66f29fd2..a34da6918bcd2f 100644 --- a/caffe2/core/qtensor.h +++ b/caffe2/core/qtensor.h @@ -5,6 +5,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/tensor.h" #include +#include #include #include @@ -218,7 +219,7 @@ class C10_EXPORT QTensor { */ inline int64_t size_from_dim(int k) const { int64_t r = 1; - for (int i = k; i < dims_.size(); ++i) { + for (const auto i : c10::irange(k, dims_.size())) { r *= dims_[i]; } return r; @@ -230,7 +231,7 @@ class C10_EXPORT QTensor { inline int64_t size_to_dim(int k) const { CAFFE_ENFORCE(k < dims_.size()); int64_t r = 1; - for (int i 
= 0; i < k; ++i) { + for (const auto i : c10::irange(k)) { r *= dims_[i]; } return r; diff --git a/caffe2/core/qtensor_serialization.h b/caffe2/core/qtensor_serialization.h index 007174368a442d..fa8295ae2d43e9 100644 --- a/caffe2/core/qtensor_serialization.h +++ b/caffe2/core/qtensor_serialization.h @@ -46,7 +46,7 @@ void QTensorSerializer::Serialize( blob_proto.set_type(kQTensorBlobQType); QTensorProto& proto = *blob_proto.mutable_qtensor(); proto.set_name(name); - for (int i = 0; i < qtensor.ndim(); ++i) { + for (const auto i : c10::irange(qtensor.ndim())) { proto.add_dims(qtensor.dim32(i)); } proto.set_precision(qtensor.precision()); diff --git a/caffe2/core/stats.h b/caffe2/core/stats.h index a2ba948cc8cf4c..26fbdbe4a753ca 100644 --- a/caffe2/core/stats.h +++ b/caffe2/core/stats.h @@ -73,7 +73,7 @@ TORCH_API ExportedStatMap toMap(const ExportedStatList& stats); * int main() { * MyCaffeClass a("first"); * MyCaffeClass b("second"); - * for (int i = 0; i < 10; ++i) { + * for (const auto i : c10::irange(10)) { * a.run(10); * b.run(5); * } diff --git a/caffe2/core/test_utils.h b/caffe2/core/test_utils.h index 316985b509fca2..920157e222c238 100644 --- a/caffe2/core/test_utils.h +++ b/caffe2/core/test_utils.h @@ -6,6 +6,7 @@ #include "caffe2/utils/proto_utils.h" #include +#include #include #include @@ -34,7 +35,7 @@ void assertTensorEquals( float epsilon = 0.1f) { CAFFE_ENFORCE(tensor.IsType()); CAFFE_ENFORCE_EQ(tensor.numel(), data.size()); - for (auto idx = 0; idx < tensor.numel(); ++idx) { + for (const auto idx : c10::irange(tensor.numel())) { if (tensor.IsType()) { assertNear(tensor.data()[idx], data[idx], epsilon); } else { @@ -88,7 +89,7 @@ void randomFill( std::mt19937 gen(42); std::uniform_real_distribution dis( static_cast(min), static_cast(max)); - for (size_t i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { data[i] = dis(gen); } } diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h index 80c2ba2ec582d1..53a5b381ba848a 
100644 --- a/caffe2/cuda_rtc/common_rtc.h +++ b/caffe2/cuda_rtc/common_rtc.h @@ -109,7 +109,7 @@ inline std::string GetUniqueName() { std::stringstream ss; ss << "_cuda_kernel_"; - for (int i = 0; i < len; ++i) { + for (const auto i : c10::irange(len)) { ss << alpha[rand() % (sizeof(alpha) - 1)]; } return ss.str(); diff --git a/caffe2/experiments/operators/fully_connected_op_prune.h b/caffe2/experiments/operators/fully_connected_op_prune.h index 462ada3f3e02c6..268f308b5ebd6d 100644 --- a/caffe2/experiments/operators/fully_connected_op_prune.h +++ b/caffe2/experiments/operators/fully_connected_op_prune.h @@ -32,7 +32,7 @@ namespace caffe2 { const std::vector& shape(Shape vs) { static thread_local std::vector cache; cache.resize(vs.size()); - for (auto i = 0; i < vs.size(); ++i) { + for (const auto i : c10::irange(vs.size())) { cache[i] = vs[i]; } return cache; @@ -70,8 +70,8 @@ namespace caffe2 { void MaskMatrix( const float* mask, float* mat, int M, int N) { int offset = 0; - for (int i = 0; i < M; ++i) { - for (int j = 0; j < N; ++j) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { mat[offset] = mask[offset]? 
mat[offset] : 0; offset++; } @@ -86,7 +86,7 @@ namespace caffe2 { int /*N*/, int seq_len, float target) { - for (int i = 0; i < seq_len; ++i) { + for (const auto i : c10::irange(seq_len)) { // assume that the mask_seq is smaller than size // Although it seems that random access gets bad performance, // we make sure that seq is in order; @@ -107,8 +107,8 @@ namespace caffe2 { float* mask_seq, int M, int N) { int seq_len = 0; int offset = 0; - for (int i = 0 ; i < M; ++i) { - for (int j = 0; j < N; ++j) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { if (mat[offset] != 0 && (mat[offset] < thres && mat[offset] > -thres)) { mask_seq[seq_len++] = static_cast(offset); diff --git a/caffe2/experiments/operators/fully_connected_op_sparse.h b/caffe2/experiments/operators/fully_connected_op_sparse.h index 99726bc6abac71..2b9ca4d73809b2 100644 --- a/caffe2/experiments/operators/fully_connected_op_sparse.h +++ b/caffe2/experiments/operators/fully_connected_op_sparse.h @@ -35,7 +35,7 @@ template const std::vector& shape(Shape vs) { static thread_local std::vector cache; cache.resize(vs.size()); - for (auto i = 0; i < vs.size(); ++i) { + for (const auto i : c10::irange(vs.size())) { cache[i] = vs[i]; } return cache; @@ -63,8 +63,8 @@ void trans_mat( int m, int n, CPUContext* /*context*/) { - for(int i = 0; i < m; ++i){ - for(int j = 0; j < n; ++j){ + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(n)) { t[j*m+i]=o[i*n+j]; } } diff --git a/caffe2/experiments/operators/funhash_op.h b/caffe2/experiments/operators/funhash_op.h index b07ca68c928eb8..3f52a906a27658 100644 --- a/caffe2/experiments/operators/funhash_op.h +++ b/caffe2/experiments/operators/funhash_op.h @@ -67,7 +67,7 @@ class FunHashOp : public Operator { int64_t n_segments = num_segments_; if (num_segments_ == -1) { - for (int64_t i = 0; i < num_nz_ent; ++i) { + for (const auto i : c10::irange(num_nz_ent)) { if (seg_data[i] > n_segments) { n_segments = 
seg_data[i]; } @@ -86,14 +86,14 @@ class FunHashOp : public Operator { const auto* val_data = val.template data(); const auto* key_data = key.template data(); - for (int64_t j = 0; j < num_nz_ent; ++j) { + for (const auto j : c10::irange(num_nz_ent)) { int64_t cur_seg = seg_data[j]; int64_t cur_key = key_data[j]; T cur_val = val_data[j]; int64_t output_stride = cur_seg * num_outputs_; - for (int64_t i = 0; i < num_outputs_; ++i) { + for (const auto i : c10::irange(num_outputs_)) { T sum = 0; - for (int64_t k = 0; k < num_alpha; ++k) { + for (const auto k : c10::irange(num_alpha)) { uint64_t hash; // The hash function takes as input four integers: // 1. feature index @@ -186,14 +186,14 @@ class FunHashGradientOp : public Operator { memset(grad_weight_data, 0, sizeof(T) * num_weight); - for (int64_t j = 0; j < num_nz_ent; ++j) { + for (const auto j : c10::irange(num_nz_ent)) { int64_t cur_seg = seg_data[j]; int64_t cur_key = key_data[j]; T cur_val = val_data[j]; int64_t grad_out_stride = cur_seg * num_outputs_; - for (int64_t i = 0; i < num_outputs_; ++i) { + for (const auto i : c10::irange(num_outputs_)) { T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val; - for (int64_t k = 0; k < num_alpha; ++k) { + for (const auto k : c10::irange(num_alpha)) { uint64_t hash; hash_data[0] = cur_key; hash_data[1] = i; diff --git a/caffe2/experiments/operators/sparse_funhash_op.h b/caffe2/experiments/operators/sparse_funhash_op.h index 58603315e325ef..1ba2cc5a342d76 100644 --- a/caffe2/experiments/operators/sparse_funhash_op.h +++ b/caffe2/experiments/operators/sparse_funhash_op.h @@ -66,7 +66,7 @@ class SparseFunHashOp : public Operator { int64_t n_segments = num_segments_; if (num_segments_ == -1) { - for (int64_t i = 0; i < num_nz_ent; ++i) { + for (const auto i : c10::irange(num_nz_ent)) { if (seg_data[i] > n_segments) { n_segments = seg_data[i]; } @@ -85,14 +85,14 @@ class SparseFunHashOp : public Operator { const auto* val_data = val.template data(); const auto* 
key_data = key.template data(); - for (int64_t j = 0; j < num_nz_ent; ++j) { + for (const auto j : c10::irange(num_nz_ent)) { int64_t cur_seg = seg_data[j]; int64_t cur_key = key_data[j]; T cur_val = val_data[j]; int64_t output_stride = cur_seg * num_outputs_; - for (int64_t i = 0; i < num_outputs_; ++i) { + for (const auto i : c10::irange(num_outputs_)) { T sum = 0; - for (int64_t k = 0; k < num_alpha; ++k) { + for (const auto k : c10::irange(num_alpha)) { // The hash function takes as input three integers: // 1. feature index // 2. output index @@ -190,14 +190,14 @@ class SparseFunHashGradientOp : public Operator { const auto* key_data = key.template data(); int64_t w_ind = 0; - for (int64_t j = 0; j < num_nz_ent; ++j) { + for (const auto j : c10::irange(num_nz_ent)) { int64_t cur_seg = seg_data[j]; int64_t cur_key = key_data[j]; T cur_val = val_data[j]; int64_t grad_out_stride = cur_seg * num_outputs_; - for (int64_t i = 0; i < num_outputs_; ++i) { + for (const auto i : c10::irange(num_outputs_)) { T grad_out_scale = grad_out_data[grad_out_stride + i] * cur_val; - for (int64_t k = 0; k < num_alpha; ++k) { + for (const auto k : c10::irange(num_alpha)) { hash_data[0] = cur_key; hash_data[1] = i; hash_data[2] = k; diff --git a/caffe2/experiments/operators/sparse_matrix_reshape_op.h b/caffe2/experiments/operators/sparse_matrix_reshape_op.h index e48665fb8465e9..298ffcf7f47afe 100644 --- a/caffe2/experiments/operators/sparse_matrix_reshape_op.h +++ b/caffe2/experiments/operators/sparse_matrix_reshape_op.h @@ -111,7 +111,7 @@ class SparseMatrixReshapeOp : public Operator { auto* new_col_data = new_col->template mutable_data(); auto* new_row_data = new_row->template mutable_data(); - for (int i = 0; i < nnz; ++i) { + for (const auto i : c10::irange(nnz)) { int64_t offset = old_row_data[i] * old_stride_ + old_col_data[i]; new_row_data[i] = offset / new_stride_; new_col_data[i] = offset % new_stride_; diff --git a/caffe2/ideep/operators/conv_pool_base_op.h 
b/caffe2/ideep/operators/conv_pool_base_op.h index 0a1e2c5d886a7e..61238dce105c0b 100644 --- a/caffe2/ideep/operators/conv_pool_base_op.h +++ b/caffe2/ideep/operators/conv_pool_base_op.h @@ -53,7 +53,7 @@ class IDEEPConvPoolOpBase : public ConvPoolOpBase { bool RunOnDevice() override { if (!global_pooling_) { - for (int dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GT(kernel_[dim], 0); } } diff --git a/caffe2/ideep/operators/conv_transpose_unpool_base_op.h b/caffe2/ideep/operators/conv_transpose_unpool_base_op.h index 11d0f5f7365bd4..aa28621804a02a 100644 --- a/caffe2/ideep/operators/conv_transpose_unpool_base_op.h +++ b/caffe2/ideep/operators/conv_transpose_unpool_base_op.h @@ -109,7 +109,7 @@ class IDEEPConvTransposeUnpoolBase : public IDEEPOperator { CAFFE_ENFORCE_EQ(pads_.size(), 2 * kernel_.size()); } - for (int dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GT(kernel_[dim], 0); CAFFE_ENFORCE_GT(stride_[dim], 0); CAFFE_ENFORCE_GE(adj_[dim], 0); @@ -143,7 +143,7 @@ class IDEEPConvTransposeUnpoolBase : public IDEEPOperator { auto input_dims = input.get_dims(); itensor::dims dims; dims.assign(input_dims.begin() + 2, input_dims.end()); - for (int dim = 0; dim < dims.size(); ++dim) { + for (const auto dim : c10::irange(dims.size())) { int dim_size = 0; ComputeSizeAndPad( dims[dim], diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h index 9d6be29435272f..e7925d9e5d1e17 100644 --- a/caffe2/ideep/operators/operator_fallback_ideep.h +++ b/caffe2/ideep/operators/operator_fallback_ideep.h @@ -52,7 +52,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { // Create output blobs in parent workspace, // then forward output blobs to local workspace. 
std::unordered_map forwarded_output_blobs; - for (int i = 0; i < base_def_.output_size(); i++) { + for (const auto i : c10::irange(base_def_.output_size())) { // For in-place case, the in/output tensor for local_ws must be // re-created, instead of forwarding from current workspace. string parent_name(base_def_.output(i)); @@ -81,7 +81,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { } bool RunOnDevice() override { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { if (InputIsType(i) && (Input(i).has_scale() || Input(i).get_data_type() == idtype::f32)) { @@ -128,7 +128,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { return false; } - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { if (SkipOutputCopy::Contains(i)) { VLOG(1) << "Copy output: index " << i << " skipped."; continue; diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 7df6763b1baf79..88503420989025 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -83,7 +83,7 @@ class IDEEPContext final : public BaseContext { static_cast(src), static_cast(dst)); } else { - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { dst[i] = src[i]; } } diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index 5262b6a116a4bc..5d72898bfc6946 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -8,6 +8,7 @@ #include #include "c10/core/thread_pool.h" +#include #include "caffe2/core/common.h" #include "caffe2/core/db.h" #include "caffe2/image/transform_gpu.h" @@ -387,7 +388,7 @@ ImageInputOp::ImageInputOp( << "."; std::mt19937 meta_randgen(time(nullptr)); - for (int i = 0; i < num_decode_threads_; ++i) { + for (const auto i : c10::irange(num_decode_threads_)) { randgen_per_thread_.emplace_back(meta_randgen()); } ReinitializeTensor( @@ -406,7 +407,7 @@ ImageInputOp::ImageInputOp( 
// data type for prefetched_label_ is actually not known here.. ReinitializeTensor(&prefetched_label_, sizes, at::dtype().device(CPU)); - for (int i = 0; i < additional_output_sizes_.size(); ++i) { + for (const auto i : c10::irange(additional_output_sizes_.size())) { prefetched_additional_outputs_on_device_.emplace_back(); prefetched_additional_outputs_.emplace_back(); } @@ -423,7 +424,7 @@ bool RandomSizedCropping(cv::Mat* img, const int crop, std::mt19937* randgen) { std::uniform_real_distribution<> aspect_ratio_dis(3.0 / 4.0, 4.0 / 3.0); cv::Mat cropping; - for (int i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { int target_area = int(ceil(area_dis(*randgen) * area)); float aspect_ratio = aspect_ratio_dis(*randgen); int nh = floor(std::sqrt(((float)target_area / aspect_ratio))); @@ -499,12 +500,12 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( } else { // Datum stores things in CHW order, let's do HWC for images to make // things more consistent with conventional image storage. 
- for (int c = 0; c < 3; ++c) { + for (const auto c : c10::irange(3)) { const char* datum_buffer = datum.data().data() + datum.height() * datum.width() * c; uchar* ptr = src.ptr(0) + c; - for (int h = 0; h < datum.height(); ++h) { - for (int w = 0; w < datum.width(); ++w) { + for (const auto h : c10::irange(datum.height())) { + for (const auto w : c10::irange(datum.width())) { *ptr = *(datum_buffer++); ptr += 3; } @@ -522,7 +523,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( vector additional_output_protos; int start = additional_inputs_offset_; int end = start + additional_inputs_count_; - for (int i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { additional_output_protos.push_back(protos.protos(i)); } @@ -588,7 +589,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(float) * num_labels_); - for (int i = 0; i < label_proto.float_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.float_data_size())) { label_data[(int)label_proto.float_data(i)] = 1.0; } } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) { @@ -596,7 +597,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(float) * num_labels_); - for (int i = 0; i < label_proto.float_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.float_data_size())) { label_data[(int)label_proto.float_data(i)] = weight_proto.float_data(i); } @@ -605,7 +606,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( CAFFE_ENFORCE(label_proto.float_data_size() == num_labels_); float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; - for (int i = 0; i < label_proto.float_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.float_data_size())) { label_data[i] = label_proto.float_data(i); } } else { 
@@ -620,7 +621,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( int* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(int) * num_labels_); - for (int i = 0; i < label_proto.int32_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.int32_data_size())) { label_data[label_proto.int32_data(i)] = 1; } } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) { @@ -628,7 +629,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(float) * num_labels_); - for (int i = 0; i < label_proto.int32_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.int32_data_size())) { label_data[label_proto.int32_data(i)] = weight_proto.float_data(i); } } else if ( @@ -636,7 +637,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( CAFFE_ENFORCE(label_proto.int32_data_size() == num_labels_); int* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; - for (int i = 0; i < label_proto.int32_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.int32_data_size())) { label_data[i] = label_proto.int32_data(i); } } else { @@ -646,14 +647,14 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( LOG(FATAL) << "Unsupported label data type."; } - for (int i = 0; i < additional_output_protos.size(); ++i) { + for (const auto i : c10::irange(additional_output_protos.size())) { auto additional_output_proto = additional_output_protos[i]; if (additional_output_proto.data_type() == TensorProto::FLOAT) { float* additional_output = prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.float_data_size(); - for (int j = 0; j < additional_output_proto.float_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.float_data_size())) { additional_output[j] = additional_output_proto.float_data(j); } } else if 
(additional_output_proto.data_type() == TensorProto::INT32) { @@ -661,7 +662,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.int32_data_size(); - for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.int32_data_size())) { additional_output[j] = additional_output_proto.int32_data(j); } } else if (additional_output_proto.data_type() == TensorProto::INT64) { @@ -669,7 +670,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.int64_data_size(); - for (int j = 0; j < additional_output_proto.int64_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.int64_data_size())) { additional_output[j] = additional_output_proto.int64_data(j); } } else if (additional_output_proto.data_type() == TensorProto::UINT8) { @@ -677,7 +678,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.int32_data_size(); - for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.int32_data_size())) { additional_output[j] = static_cast(additional_output_proto.int32_data(j)); } @@ -799,11 +800,11 @@ void Saturation( std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { float gray_color = img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + img[3 * p + 2] * 0.299f; - for (int c = 0; c < 3; ++c) { + for (const auto c : c10::irange(3)) { img[3 * p + c] = img[3 * p + c] * alpha + gray_color 
* (1.0f - alpha); } p++; @@ -821,9 +822,9 @@ void Brightness( float alpha = 1.0f + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < 3; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(3)) { img[p++] *= alpha; } } @@ -839,8 +840,8 @@ void Contrast( std::mt19937* randgen) { float gray_mean = 0; int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 gray_mean += img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + img[3 * p + 2] * 0.299f; @@ -852,9 +853,9 @@ void Contrast( float alpha = 1.0f + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < 3; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(3)) { img[p] = img[p] * alpha + gray_mean * (1.0f - alpha); p++; } @@ -880,7 +881,7 @@ void ColorJitter( jitter_order.end(), std::default_random_engine(seed)); - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { if (jitter_order[i] == 0) { Saturation(img, img_size, saturation, randgen); } else if (jitter_order[i] == 1) { @@ -902,21 +903,21 @@ void ColorLighting( std::mt19937* randgen) { std::normal_distribution d(0, alpha_std); std::vector alphas(3); - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { alphas[i] = d(*randgen); } std::vector delta_rgb(3, 0.0); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 3; ++j) { + for (const auto i : c10::irange(3)) { + for (const auto j : c10::irange(3)) { delta_rgb[i] += eigvecs[i][j] * 
eigvals[j] * alphas[j]; } } int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < 3; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(3)) { img[p++] += delta_rgb[2 - c]; } } @@ -933,9 +934,9 @@ void ColorNormalization( const std::vector& mean, const std::vector& std) { int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < channels; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(channels)) { img[p] = (img[p] - mean[c]) * std[c]; p++; } @@ -987,7 +988,7 @@ void TransformImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset + crop - 1; w >= width_offset; --w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(image_data_ptr++) = static_cast(cv_data[c]); } } @@ -997,7 +998,7 @@ void TransformImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset; w < width_offset + crop; ++w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(image_data_ptr++) = static_cast(cv_data[c]); } } @@ -1057,7 +1058,7 @@ void CropTransposeImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset + crop - 1; w >= width_offset; --w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(cropped_data++) = cv_data[c]; } } @@ -1067,7 +1068,7 @@ void CropTransposeImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset; w < width_offset + crop; ++w) { const uint8_t* cv_data = 
scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(cropped_data++) = cv_data[c]; } } @@ -1166,7 +1167,7 @@ bool ImageInputOp::Prefetch() { prefetched_label_.mutable_data(); // Prefetching handled with a thread pool of "decode_threads" threads. - for (int item_id = 0; item_id < batch_size_; ++item_id) { + for (const auto item_id : c10::irange(batch_size_)) { std::string key, value; cv::Mat img; @@ -1189,7 +1190,7 @@ bool ImageInputOp::Prefetch() { LOG(FATAL) << "Unsupported label type."; } - for (int i = 0; i < additional_inputs_count_; ++i) { + for (const auto i : c10::irange(additional_inputs_count_)) { int index = additional_inputs_offset_ + i; TensorProto additional_output_proto = protos.protos(index); auto sizes = @@ -1264,7 +1265,7 @@ bool ImageInputOp::Prefetch() { ReinitializeAndCopyFrom( &prefetched_label_on_device_, device, prefetched_label_); - for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) { + for (const auto i : c10::irange(prefetched_additional_outputs_on_device_.size())) { ReinitializeAndCopyFrom( &prefetched_additional_outputs_on_device_[i], device, @@ -1290,7 +1291,7 @@ bool ImageInputOp::CopyPrefetched() { OperatorBase::OutputTensorCopyFrom( 1, options, prefetched_label_, /* async */ true); - for (int i = 2; i < OutputSize(); ++i) { + for (const auto i : c10::irange(2, OutputSize())) { OperatorBase::OutputTensorCopyFrom( i, options, prefetched_additional_outputs_[i - 2], /* async */ true); } @@ -1331,7 +1332,7 @@ bool ImageInputOp::CopyPrefetched() { OperatorBase::OutputTensorCopyFrom( 1, type, prefetched_label_on_device_, /* async */ true); - for (int i = 2; i < OutputSize(); ++i) { + for (const auto i : c10::irange(2, OutputSize())) { OperatorBase::OutputTensorCopyFrom( i, type, diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h index 38e54cd59bff44..c82c723020ad2a 100644 --- 
a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h @@ -523,7 +523,7 @@ kernel void col2im( } } else { half4 components(0, 0, 0, 0); - for (auto i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { ushort c_col_i = n * divRoundUp(kernel_h * kernel_w * C, 4) * 4 + h_k * kernel_w * C + w_k * C + c * 4 + i; ushort c_col_i_z = c_col_i / 4; @@ -826,7 +826,7 @@ kernel void concat( ushort2 gid_ = ushort2(gid.x, gid.y); half4 value; - for (int off = 0; off < 4; ++off) { + for (const auto off : c10::irange(4)) { ushort cur_channel = c * 4 + off; ushort cur_idx = 0; if (cur_channel >= C) { @@ -1013,8 +1013,8 @@ kernel void roi_warp(texture2d_array ina[[texture(0), func const RoIT count = iy_upper * ix_upper; RoIT4 output_val = 0.0; - for (int iy = 0; iy < iy_upper; iy++) { - for (int ix = 0; ix < ix_upper; ix++) { + for (const auto iy : c10::irange(iy_upper)) { + for (const auto ix : c10::irange(ix_upper)) { const RoIT y = roi_start_h + ph * bin_size_h + iy * bin_size_h / static_cast(roi_bin_grid_h); const RoIT x = @@ -1141,7 +1141,7 @@ kernel void channel_shuffle( const ushort c = gid.z - n * divRoundUp(C, 4); half4 value; ushort2 gid_ = gid.xy; - for (int off = 0; off < 4; ++off) { + for (const auto off : c10::irange(4)) { ushort cur_channel = c * 4 + off; if (cur_channel >= C) { break; diff --git a/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp b/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp index f3badf7775085f..1945198afa7f0b 100644 --- a/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp +++ b/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp @@ -1196,7 +1196,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) return err; } - for(int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { (*param)[i] = value[i]; } diff --git a/caffe2/operators/arg_ops.h b/caffe2/operators/arg_ops.h index 782401da214736..d9aa624bb9f4a5 100644 --- 
a/caffe2/operators/arg_ops.h +++ b/caffe2/operators/arg_ops.h @@ -43,7 +43,7 @@ class ArgOp final : public Operator { Y_dims.reserve(ndim); int prev_size = 1; int next_size = 1; - for (int i = 0; i < axis_; ++i) { + for (const auto i : c10::irange(axis_)) { Y_dims.push_back(X_dims[i]); prev_size *= X_dims[i]; } diff --git a/caffe2/operators/assert_op.h b/caffe2/operators/assert_op.h index 097427834c89ca..bc09adc59f41c8 100644 --- a/caffe2/operators/assert_op.h +++ b/caffe2/operators/assert_op.h @@ -23,7 +23,7 @@ class AssertOp final : public Operator { cmp_tensor_.CopyFrom(Input(0)); auto* cmp_data = cmp_tensor_.template data(); - for (int64_t i = 0; i < cmp_tensor_.numel(); ++i) { + for (const auto i : c10::irange(cmp_tensor_.numel())) { CAFFE_ENFORCE((bool)cmp_data[i], [&]() { std::stringstream ss; ss << "Assert failed for element " << i diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h index 3d00970b04b21f..8736b475b8515e 100644 --- a/caffe2/operators/batch_gather_ops.h +++ b/caffe2/operators/batch_gather_ops.h @@ -80,7 +80,7 @@ class BatchGatherGradientOp final : public Operator { CAFFE_ENFORCE_GE(data.dim(), 2, "DATA should be at least 2-D"); // Outer dimensions of input data and gradient should be the same // because they are preserved for gathers with axis > 0. 
- for (int acheck = 0; acheck < axis; acheck++) { + for (const auto acheck : c10::irange(axis)) { CAFFE_ENFORCE_EQ( data.size(acheck), grad.size(acheck), @@ -105,7 +105,7 @@ class BatchGatherGradientOp final : public Operator { auto idx_inner_dims_product = indices.size_from_dim(axis); if (match_outer) { CAFFE_ENFORCE_GE(axis, 1, "Axis should be at least 1"); - for (auto i = 0; i < axis; i++) { + for (const auto i : c10::irange(axis)) { CAFFE_ENFORCE_EQ( data.size(i), indices.size(i), @@ -120,11 +120,11 @@ class BatchGatherGradientOp final : public Operator { gather_helper::check_indexarray_range( idxs, N, src_indexing_axis_dim, false); - for (auto batch = 0; batch < outer_dims_product; ++batch) { + for (const auto batch : c10::irange(outer_dims_product)) { auto grad_batch_base = grad_data + batch * gathered_grad_batch_size; auto out_batch_base = out_data + batch * batch_size; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (match_outer) { idx = idxs[batch * idx_inner_dims_product + i]; diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h index 5e924b293a850b..5ee90977641467 100644 --- a/caffe2/operators/bisect_percentile_op.h +++ b/caffe2/operators/bisect_percentile_op.h @@ -74,11 +74,12 @@ class BisectPercentileOp final : public Operator { int feature_length = 0; int cur_index = 0; - for (int i = 0; i < num_features; ++i) { + for (const auto i : c10::irange(num_features)) { cur_index = i; feature_start_index = index[i]; feature_length = pct_lens_[i]; - for (int j = 0; j < batch_size; ++j) { + for (const auto j : c10::irange(batch_size)) { + (void)j; // Suppress unused variable warning pct_output[cur_index] = compute_percentile( pct_raw_.begin() + feature_start_index, pct_mapping_.begin() + feature_start_index, diff --git a/caffe2/operators/byte_weight_dequant_op.h b/caffe2/operators/byte_weight_dequant_op.h index c7b786325bb48b..cf31c497946bfb 100644 --- 
a/caffe2/operators/byte_weight_dequant_op.h +++ b/caffe2/operators/byte_weight_dequant_op.h @@ -25,7 +25,7 @@ class ByteWeightDequantOp : public Operator { auto* Y = Output(0, shape_, at::dtype()); float bin_interval = (max_ - min_) / 255.0; int total = 1; - for (auto i = 0U; i < shape_.size(); i++) { + for (const auto i : c10::irange(0U, shape_.size())) { total *= Y->size(i); } const uint8_t* Xdata; diff --git a/caffe2/operators/cast_op.h b/caffe2/operators/cast_op.h index 478f2f30380c28..9c4c593db81d29 100644 --- a/caffe2/operators/cast_op.h +++ b/caffe2/operators/cast_op.h @@ -41,7 +41,7 @@ class CastOp : public Operator { const auto* data = input.template data(); auto* out = output->template mutable_data(); auto N = input.size(); - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { out[i] = static_cast(data[i]); } return true; diff --git a/caffe2/operators/cc_bmm_bg_op.h b/caffe2/operators/cc_bmm_bg_op.h index 3560d6a59dc2eb..0076d4546c0d0d 100644 --- a/caffe2/operators/cc_bmm_bg_op.h +++ b/caffe2/operators/cc_bmm_bg_op.h @@ -38,7 +38,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { int adj_size = input_zero.dim() + 1; int canonical_axis = 1; CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range."); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE( Input(i).dtype() == input_zero.dtype(), "All inputs must have the same type, expected: ", @@ -50,7 +50,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { } int before = 1, after = 1; - for (int i = 0; i < input_zero.dim(); ++i) { + for (const auto i : c10::irange(input_zero.dim())) { int dim = input_zero.dim32(i); if (i < canonical_axis) { before *= dim; @@ -58,7 +58,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { after *= dim; } // check the input dims are compatible. 
- for (int j = 2; j < InputSize(); ++j) { + for (const auto j : c10::irange(2, InputSize())) { int dim_j = Input(j).dim32(i); CAFFE_ENFORCE( dim == dim_j, @@ -93,7 +93,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { auto* output = Output(0, output_dims, at::dtype()); // std::stringstream ss; // ss << "["; - // for(int i = 0; i < output_dims.size(); i++) ss << output_dims[i]; + // for (const auto i : c10::irange(output_dims.size()))ss << output_dims[i]; // ss << "]"; // LOG(INFO) << "output size: " << ss.str(); @@ -107,7 +107,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { #pragma omp for for (int b = 0; b < batch_size; ++b) { // concat input to scratch - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { auto* input_data = Input(i).template data(); memcpy( &scratch_input[(i - 1) * embed_size], @@ -130,7 +130,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { // do gather int64_t output_offset = b * gather_size; - for (int i = 0; i < gather_size; i++) { + for (const auto i : c10::irange(gather_size)) { output_data[output_offset + i] = scratch_output[indices_data[i]]; } } diff --git a/caffe2/operators/ceil_op.h b/caffe2/operators/ceil_op.h index 3283fbe8d9f1f5..93605e1d5f71bd 100644 --- a/caffe2/operators/ceil_op.h +++ b/caffe2/operators/ceil_op.h @@ -21,7 +21,7 @@ class CeilOp final : public Operator { const float* Xdata = X.template data(); float* Ydata = Y->template mutable_data(); - for (int i = 0; i < X.numel(); ++i) { + for (const auto i : c10::irange(X.numel())) { Ydata[i] = std::ceil(Xdata[i]); } return true; diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h index 3338e9b9ae35cc..a1144e6d4d5cfb 100644 --- a/caffe2/operators/concat_split_op.h +++ b/caffe2/operators/concat_split_op.h @@ -161,7 +161,7 @@ bool SplitOp::RunOnDevice() { input_channels); vector output_dims(input.sizes().vec()); int before = 1, after = 1; - for (int i = 0; i < canonical_axis; ++i) 
{ + for (const auto i : c10::irange(canonical_axis)) { before *= input.dim32(i); } for (int i = canonical_axis + 1; i < input.dim(); ++i) { @@ -174,7 +174,7 @@ bool SplitOp::RunOnDevice() { const auto *const input_ptr = static_cast(input.raw_data()); size_t input_offset = 0; - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto *const output = Output(i); const auto axis_dim = add_axis_ ? 1 : axis_data[i]; if (!add_axis_) { @@ -264,7 +264,7 @@ bool SplitByLengthsOp::RunOnDevice() { dim_multiplier = 1; } - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto* output = Output(i); const auto* axis_offset = axis_data + lengths_length / OutputSize() * i; auto axis_dim = @@ -301,7 +301,7 @@ bool ConcatOp::RunOnDevice() { int adj_size = input_zero.dim() + (add_axis_ ? 1 : 0); int canonical_axis = canonical_axis_index_(axis_, adj_size); CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range."); - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), @@ -315,7 +315,7 @@ bool ConcatOp::RunOnDevice() { int before = 1, after = 1; vector output_dims(input_zero.sizes().vec()); - for (int i = 0; i < input_zero.dim(); ++i) { + for (const auto i : c10::irange(input_zero.dim())) { if (i == canonical_axis && !add_axis_) { continue; } @@ -326,7 +326,7 @@ bool ConcatOp::RunOnDevice() { after *= dim; } // check the input dims are compatible. - for (int j = 1; j < InputSize(); ++j) { + for (const auto j : c10::irange(1, InputSize())) { int dim_j = Input(j).dim32(i); CAFFE_ENFORCE_EQ( dim, @@ -351,7 +351,7 @@ bool ConcatOp::RunOnDevice() { } int output_channels = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { axis_data[i] = add_axis_ ? 
1 : Input(i).dim32(canonical_axis); output_channels += axis_data[i]; } @@ -368,7 +368,7 @@ bool ConcatOp::RunOnDevice() { } size_t output_offset = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { auto& input = Input(i); auto axis_dim = add_axis_ ? 1 : input.dim32(canonical_axis); math::CopyMatrix( diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h index 29294a1c0ea1b4..d95fc5f9e48c51 100644 --- a/caffe2/operators/conv_op_impl.h +++ b/caffe2/operators/conv_op_impl.h @@ -93,7 +93,8 @@ bool ConvOp::RunOnDeviceWithOrderNCHW() { col_buffer->Resize(buffer_shape); T* col_buffer_data = col_buffer->template mutable_data(); // Im2Col, followed by gemm. - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning if (kernel_.size() == 2) { math::Im2Col( C, @@ -277,7 +278,8 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { col_buffer->Resize(buffer_shape); T* col_buffer_data = col_buffer->template mutable_data(); // Im2Col, followed by gemm. 
- for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning if (kernel_.size() <= 2) { math::Im2Col( C, @@ -314,7 +316,7 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { group_); } // Weight term - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { // col_buffer_data in G (H W) (R S C/G) layout // filter_data in G K/G (R S C/G) layout math::GemmEx( @@ -398,8 +400,8 @@ bool ConvOp::Run1x1ConvOnDeviceWithOrderNCHW( std::vector X_ptr(N * G); std::vector W_ptr(N * G); std::vector Y_ptr(N * G); - for (int i = 0; i < N; ++i) { - for (int j = 0; j < G; ++j) { + for (const auto i : c10::irange(N)) { + for (const auto j : c10::irange(G)) { const int index = i * G + j; X_ptr[index] = X + index * X_stride; W_ptr[index] = filter + j * W_stride; @@ -454,7 +456,7 @@ bool ConvOp::Run1x1ConvOnDeviceWithOrderNHWC( T* Y) { const int G = group_; const int kernel_dim = C / G; - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::GemmEx( CblasNoTrans, CblasTrans, @@ -511,7 +513,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { int kernel_dims_size = 1; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -588,8 +590,9 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { const int input_offset = C / group_ * input_image_size; const int output_offset = dY.numel() / dY.dim32(0) / group_; const int filter_offset = filter.numel() / group_; - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { // 
When we compute the gradient with respect to the filters, we need to do // im2col to allow gemm-type computation. if (kernel_.size() == 2) { @@ -662,8 +665,9 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype()); T* dXdata = dX->template mutable_data(); dYdata = dY.template data(); - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { // Compute gradient into col_buffer. math::Gemm( CblasTrans, @@ -739,7 +743,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(C, filter.dim32(filter.dim() - 1) * group_); int kernel_dims_size = 1; - for (size_t i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -812,7 +816,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { // image. const size_t input_offset = C * input_image_size; const size_t output_offset = dY.numel() / dY.dim32(0); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // When we compute the gradient with respect to the filters, we need to do // im2col to allow gemm-type computation. if (kernel_.size() <= 2) { @@ -851,7 +855,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { group_); } // Gradient with respect to filter. - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::GemmEx( CblasTrans, CblasNoTrans, @@ -890,9 +894,9 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { auto* dX = Output( no_bias_ ? 
BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype()); T* dXdata = dX->template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Compute gradient into col_buffer. - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::GemmEx( CblasNoTrans, CblasNoTrans, diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index b356ef952d79c3..1a52101c30636e 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -139,7 +139,7 @@ class ConvPoolOpBase : public Operator { } if (global_pooling_) { - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE( pads_[2 * dim] == 0 && pads_[2 * dim + 1] == 0 && dilation_[dim] == 1 && stride_[dim] == 1, @@ -152,7 +152,7 @@ class ConvPoolOpBase : public Operator { // need to clean this up. if (operator_def.name().find("Conv") == 0 || operator_def.name().find("Pool") != std::string::npos) { - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GE(pads_[dim], 0); CAFFE_ENFORCE_GE(pads_[kernel_.size() + dim], 0); CAFFE_ENFORCE( @@ -162,7 +162,7 @@ class ConvPoolOpBase : public Operator { } } - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GE(kernel_[dim], 0); CAFFE_ENFORCE_GE(dilation_[dim], 0); CAFFE_ENFORCE_GE(stride_[dim], 0); @@ -281,7 +281,7 @@ class ConvPoolOpBase : public Operator { std::copy_n(input_dims.cbegin() + offset, ndim, kernel->begin()); std::fill_n(output_dims->begin() + offset, ndim, 1LL); } else { - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { ComputeSizeAndPad( input_dims[i + offset], stride[i], @@ -320,7 +320,7 @@ class ConvPoolOpBase : public Operator { 
std::copy_n(input_dims.cbegin() + offset, ndim, kernel->begin()); std::fill_n(output_dims->begin() + offset, ndim, 1LL); } else { - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { ComputeSizeAndPad64( input_dims[i + offset], stride[i], @@ -342,7 +342,7 @@ class ConvPoolOpBase : public Operator { } else if (legacy_pad_ != LegacyPadding::NOTSET) { int output_unused; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int dim = 0; dim < dims.size(); ++dim) { + for (const auto dim : c10::irange(dims.size())) { ComputeSizeAndPad( dims[dim], stride_[dim], @@ -381,7 +381,7 @@ class ConvPoolOpBase : public Operator { reset_tensor_device_ = true; } else { const int* tensor_data = tensor->template data(); - for (int d_i = 0; d_i < data.size(); ++d_i) { + for (const auto d_i : c10::irange(data.size())) { if (tensor_data[d_i] != data[d_i]) { reset_tensor_device_ = true; break; @@ -411,7 +411,7 @@ class ConvPoolOpBase : public Operator { bool RunOnDevice() override { if (!global_pooling_) { - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GT(kernel_[dim], 0); } } diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h index e3be8a81ce033c..a7e989f0fccde8 100644 --- a/caffe2/operators/conv_transpose_op_impl.h +++ b/caffe2/operators/conv_transpose_op_impl.h @@ -78,7 +78,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNCHW() { buffer_shape, at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer->template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Weight term if (G == 1) { math::Gemm( @@ -231,7 +231,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNHWC() { buffer_shape, at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const 
auto image_id : c10::irange(N)) { // Weight term if (G == 1) { math::Gemm( @@ -247,7 +247,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNHWC() { col_buffer_data, &context_); } else { - for (int group_id = 0; group_id < G; ++group_id) { + for (const auto group_id : c10::irange(G)) { math::GemmEx( CblasNoTrans, CblasNoTrans, @@ -374,7 +374,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNCHW() { at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // gradient w.r.t. filters. Im2Col followed by Gemm // Im2Col. math::Im2Col( @@ -539,7 +539,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // gradient w.r.t. filters. Im2Col followed by Gemm // Im2Col. 
math::Im2Col( @@ -575,7 +575,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { dfilter_data, &context_); } else { - for (int group_id = 0; group_id < G; ++group_id) { + for (const auto group_id : c10::irange(G)) { math::GemmEx( CblasTrans, CblasNoTrans, @@ -610,7 +610,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { dX_data + image_id * M * X_HxW, &context_); } else { - for (int group_id = 0; group_id < G; ++group_id) { + for (const auto group_id : c10::irange(G)) { math::GemmEx( CblasNoTrans, CblasTrans, diff --git a/caffe2/operators/conv_transpose_op_mobile_impl.h b/caffe2/operators/conv_transpose_op_mobile_impl.h index 45fc78ce9bc9e5..25ac65190bbf81 100644 --- a/caffe2/operators/conv_transpose_op_mobile_impl.h +++ b/caffe2/operators/conv_transpose_op_mobile_impl.h @@ -76,7 +76,7 @@ void runTileContiguous( int colBlockSize = (W + kernelW / strideW); int numColBlocks = strideW; - for (int c = 0; c < kernelDataSize; ++c) { + for (const auto c : c10::irange(kernelDataSize)) { int w_offset = c % kernelW; int h_offset = (c / kernelW) % kernelH; int c_im = c / kernelH / kernelW; @@ -276,13 +276,13 @@ void reinterleaveRows( float32x4_t v0[kStrideW]; float32x4_t v1[kStrideW]; - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v0[i] = vld1q_f32(src + i * colBlockSize); v1[i] = vld1q_f32(src + i * colBlockSize + 4); } // add per-channel bias - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v0[i] = vaddq_f32(v0[i], biasV); v1[i] = vaddq_f32(v1[i], biasV); } @@ -300,12 +300,12 @@ void reinterleaveRows( for (; w < inputW - 1; ++w) { float v[kStrideW]; - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v[i] = src[i * colBlockSize]; } // add per-channel bias - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v[i] += b; } @@ -614,12 +614,12 @@ bool ConvTransposeMobileOp::RunOnDeviceWithOrderNCHW() { numThreads 
* threadColBufferSize); // Group together thread buffers for accumulation std::vector toSum(numThreads - 1); - for (int i = 1; i < numThreads; ++i) { + for (const auto i : c10::irange(1, numThreads)) { toSum[i - 1] = threadBuffer->template mutable_data() + i * threadYBufferSizeAligned; } - for (auto image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Each time through, we have to reset all per-thread output // buffers, since the output buffer is only per-batch element // The column buffers are overwritten by the matrix multiplication diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h index db7fedf32a7657..6619ac379875a4 100644 --- a/caffe2/operators/conv_transpose_unpool_op_base.h +++ b/caffe2/operators/conv_transpose_unpool_op_base.h @@ -121,7 +121,7 @@ class ConvTransposeUnpoolBase : public Operator { } // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GT(kernel_[dim], 0); CAFFE_ENFORCE_GT(stride_[dim], 0); CAFFE_ENFORCE_GE(adj_[dim], 0); diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 0dfb1564acc195..011b1bf9204b32 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -77,7 +77,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { 1); int kernel_dims_size = 1; - for (int i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -155,8 +155,8 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { col_buffer->Resize(buffer_shape); T* col_buffer_data = col_buffer->template mutable_data(); // Im2col, followed by gemm. 
- for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + for (const auto group_id : c10::irange(group_)) { DeformableIm2col( Xdata + group_id * input_offset, offset_data, @@ -271,7 +271,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { 1); int kernel_dims_size = 1; - for (int i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -342,8 +342,8 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { math::Set(dX->numel(), 0, dXdata, &context_); } - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + for (const auto group_id : c10::irange(group_)) { math::Gemm( CblasTrans, CblasNoTrans, @@ -378,7 +378,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { DeformableIm2col( Xdata, offset_data, X.sizes(), col_buffer_shape, col_buffer_data); - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::Gemm( CblasNoTrans, CblasTrans, diff --git a/caffe2/operators/dense_vector_to_id_list_op.h b/caffe2/operators/dense_vector_to_id_list_op.h index b532500d4b1017..35e94db661130a 100644 --- a/caffe2/operators/dense_vector_to_id_list_op.h +++ b/caffe2/operators/dense_vector_to_id_list_op.h @@ -33,9 +33,9 @@ class DenseVectorToIdListOp : public Operator { auto v_pos = 0; auto l_pos = 0; - for (auto i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { auto length = 0; - for (int j = 0; j < col_num; j++) { + for (const auto j : c10::irange(col_num)) { if ((int)(input_data[i * col_num + j] + 0.5) != 0) { out_values_data[v_pos++] = j; length++; diff --git a/caffe2/operators/distance_op.h b/caffe2/operators/distance_op.h index 11b43b630b76a2..31f505ce98ff5f 
100644 --- a/caffe2/operators/distance_op.h +++ b/caffe2/operators/distance_op.h @@ -37,7 +37,7 @@ class SquaredL2DistanceGradientOp final : public Operator { int N = X.dim() > 0 ? X.dim32(0) : 1; int D = N > 0 ? X.numel() / N : 0; CAFFE_ENFORCE(X.dim() == Y.dim()); - for (int i = 0; i < X.dim(); ++i) { + for (const auto i : c10::irange(X.dim())) { CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i)); } CAFFE_ENFORCE(dDistance.dim() == 1); @@ -50,7 +50,7 @@ class SquaredL2DistanceGradientOp final : public Operator { Y.template data(), dX->template mutable_data(), &context_); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { math::Scale( D, dDistance.template data() + i, @@ -227,7 +227,7 @@ class DotProductWithPaddingGradientOp final : public Operator { const auto* dDot_data = dDot.template data(); auto* dX_data = dX->template mutable_data(); auto* dY_data = dY->template mutable_data(); - for (int i = 0; i < N; ++i) { // TODO: multithreading + for (const auto i : c10::irange(N)) { // TODO: multithreading auto offsetX = i * DX; auto offsetY = i * DY; if (replicate_) { diff --git a/caffe2/operators/do_op.h b/caffe2/operators/do_op.h index a6a31344507815..a0dae5fbe5d502 100644 --- a/caffe2/operators/do_op.h +++ b/caffe2/operators/do_op.h @@ -46,7 +46,7 @@ class DoOp final : public Operator { const auto& outer_blob_names = checkAndGetOuterNames(operator_def); std::unordered_set used_outer_names; - for (size_t blob_idx = 0; blob_idx < inner_blobs.size(); ++blob_idx) { + for (const auto blob_idx : c10::irange(inner_blobs.size())) { CAFFE_ENFORCE( !blob_bindings_.count(inner_blobs[blob_idx]), "Invalid blob bindings: redefinition of inner blob " + @@ -154,7 +154,7 @@ class DoOp final : public Operator { const OperatorDef& operator_def) const { std::vector names; names.reserve(operator_def.input_size()); - for (auto idx = 0; idx < operator_def.input_size(); ++idx) { + for (const auto idx : c10::irange(operator_def.input_size())) { 
names.push_back(operator_def.input(idx)); } return names; @@ -164,7 +164,7 @@ class DoOp final : public Operator { const OperatorDef& operator_def) const { std::vector names; names.reserve(operator_def.output_size()); - for (auto idx = 0; idx < operator_def.output_size(); ++idx) { + for (const auto idx : c10::irange(operator_def.output_size())) { names.push_back(operator_def.output(idx)); } return names; diff --git a/caffe2/operators/elementwise_logical_ops.h b/caffe2/operators/elementwise_logical_ops.h index 1d74e1e1ca47d6..df7e0d09d734e2 100644 --- a/caffe2/operators/elementwise_logical_ops.h +++ b/caffe2/operators/elementwise_logical_ops.h @@ -51,7 +51,7 @@ class WhereOp final : public Operator { if (enable_broadcast_) { size_t block_size = left.size_from_dim(1); - for (int i = 0; i < select.numel(); i++) { + for (const auto i : c10::irange(select.numel())) { size_t offset = i * block_size; if (select_data[i]) { context_.CopyItemsSameDevice( @@ -68,7 +68,7 @@ class WhereOp final : public Operator { } } } else { - for (int i = 0; i < select.numel(); ++i) { + for (const auto i : c10::irange(select.numel())) { output_data[i] = select_data[i] ? 
left_data[i] : right_data[i]; } } @@ -159,7 +159,7 @@ class IsMemberOfOp final : public Operator { const T* input_data = input.template data(); bool* output_data = output->template mutable_data(); - for (int i = 0; i < input.numel(); ++i) { + for (const auto i : c10::irange(input.numel())) { output_data[i] = values.find(input_data[i]) != values.end(); } return true; diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index 7dcdbc1c0684e5..a26cfa83662d1d 100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -62,7 +62,7 @@ void elementwiseAnd() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{true, false, false, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -83,7 +83,7 @@ void elementwiseAnd() { EXPECT_EQ(Z.numel(), M * N); std::vector result{ true, false, false, false, true, false, false, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -108,7 +108,7 @@ void elementwiseOr() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{true, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -128,7 +128,7 @@ void elementwiseOr() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), M * N); std::vector result{true, true, true, false, true, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -153,7 +153,7 @@ void elementwiseXor() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{false, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) 
{ + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -174,7 +174,7 @@ void elementwiseXor() { EXPECT_EQ(Z.numel(), M * N); std::vector result{ false, true, true, false, false, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -198,7 +198,7 @@ void elementwiseNot() { caffe2::Tensor Y(blob->Get(), caffe2::CPU); EXPECT_EQ(Y.numel(), N); std::vector result{false, true}; - for (size_t i = 0; i < Y.numel(); ++i) { + for (const auto i : c10::irange(Y.numel())) { EXPECT_EQ(Y.template data()[i], result[i]); } } @@ -220,7 +220,7 @@ void elementwiseEQ() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{false, true, false, true}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -237,7 +237,7 @@ void elementwiseEQ() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{true, true, false, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -257,7 +257,7 @@ void elementwiseEQ() { EXPECT_EQ(Z.numel(), M * N); std::vector result{ true, false, false, true, false, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } diff --git a/caffe2/operators/enforce_finite_op.h b/caffe2/operators/enforce_finite_op.h index c1a788151ae515..cf7aaadeaa826c 100644 --- a/caffe2/operators/enforce_finite_op.h +++ b/caffe2/operators/enforce_finite_op.h @@ -32,7 +32,7 @@ class EnforceFiniteOp final : public Operator { const T* input_data = input.template data(); auto size = input.numel(); - for (auto i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { auto isfinite = 
std::isfinite(input_data[i]); if (!isfinite) { LogBlobFiniteness(); diff --git a/caffe2/operators/expand_op.h b/caffe2/operators/expand_op.h index 2532277c386036..a56a7548608fd4 100644 --- a/caffe2/operators/expand_op.h +++ b/caffe2/operators/expand_op.h @@ -95,7 +95,7 @@ class ExpandGradientOp final : public Operator { auto* dX = Output(0, X.sizes(), at::dtype()); std::vector axes; const int offset = ndim - X.dim(); - for (int i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { if (i < offset || dX_dims[i - offset] == 1) { axes.push_back(i); } diff --git a/caffe2/operators/expand_squeeze_dims_op.h b/caffe2/operators/expand_squeeze_dims_op.h index be2f17bcd31061..2ff2f909be95f5 100644 --- a/caffe2/operators/expand_squeeze_dims_op.h +++ b/caffe2/operators/expand_squeeze_dims_op.h @@ -91,7 +91,7 @@ class SqueezeOp : public Operator { const std::vector& dims) { size_t j = 0; std::vector newDims; - for (size_t i = 0; i < inputDims.size(); ++i) { + for (const auto i : c10::irange(inputDims.size())) { // NOLINTNEXTLINE(clang-diagnostic-sign-compare) if (j < dims.size() && dims[j] == i) { CAFFE_ENFORCE_EQ( diff --git a/caffe2/operators/feature_maps_ops.h b/caffe2/operators/feature_maps_ops.h index be091373096411..2595cfdce6c907 100644 --- a/caffe2/operators/feature_maps_ops.h +++ b/caffe2/operators/feature_maps_ops.h @@ -32,8 +32,8 @@ class MergeDenseFeatureTensorsOp : public Operator { const bool* inPresenceData = Input(1).template data(); int totalNumFeatures = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatures; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatures)) { if (inPresenceData[exampleIndex * numFeatures + inputIndex]) { ++totalNumFeatures; } @@ -51,10 +51,10 @@ class MergeDenseFeatureTensorsOp : public Operator { Input(0).template data(); int keysOffset = 0; - for (int exampleIndex = 0; 
exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; auto offset = exampleIndex * numFeatures; - for (int inputIndex = 0; inputIndex < numFeatures; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatures)) { if (inPresenceData[offset]) { ++outLengthsData[exampleIndex]; outKeysData[keysOffset] = featureIDs_[inputIndex]; @@ -94,10 +94,10 @@ class MergeSingleScalarFeatureTensorsOp : public Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); int totalNumFeatures = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 1).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { ++totalNumFeatures; } @@ -113,9 +113,9 @@ class MergeSingleScalarFeatureTensorsOp : public Operator { T* outValuesData = outValues->template mutable_data(); int keysOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const T* inData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = @@ -158,7 +158,7 @@ class MergeSingleScalarFeatureTensorsGradientOp : public Operator { template bool DoRunWithType() { int numExamples = Input(0).numel(); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { Output(inputIndex)->ResizeLike(Input(inputIndex)); } @@ -166,8 +166,8 @@ class MergeSingleScalarFeatureTensorsGradientOp : public Operator { T 
default_value = T(); int valuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const bool* inPresenceData = Input(inputIndex).template data(); T* outFeatureData = Output(inputIndex)->template mutable_data(); if (inPresenceData[exampleIndex]) { @@ -210,12 +210,12 @@ class MergeSingleListFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 2).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { ++totalNumFeatures; totalNumValues += inLengthsData[exampleIndex]; @@ -237,12 +237,12 @@ class MergeSingleListFeatureTensorsOp : public Operator { int keysOffset = 0; int valuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const auto& inValues = Input(kNumTensorsPerInput * inputIndex + 1); @@ -295,13 +295,13 @@ class 
MergeSingleListOrMapFeatureTensorsGradientOp : public Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); std::vector outValuesOffset(numFeatureInputs_); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { int inputNumValues = 0; const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 1).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { inputNumValues += inLengthsData[exampleIndex]; } @@ -313,8 +313,8 @@ class MergeSingleListOrMapFeatureTensorsGradientOp : public Operator { const T* inValuesValuesGradData = inValuesValuesGrad.template data(); int inValuesValuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = @@ -371,12 +371,12 @@ class MergeSingleMapFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 3).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { ++totalNumFeatures; totalNumValues += 
inLengthsData[exampleIndex]; @@ -400,12 +400,12 @@ class MergeSingleMapFeatureTensorsOp : public Operator { int keysOffset = 0; int valuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const auto& inKeys = Input(kNumTensorsPerInput * inputIndex + 1); @@ -465,7 +465,7 @@ class MergeMultiScalarFeatureTensorsOp : public Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); int totalNumFeatures = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); } @@ -478,12 +478,12 @@ class MergeMultiScalarFeatureTensorsOp : public Operator { T* outValuesData = outValues->template mutable_data(); int outKeysOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inKeysOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); auto inputKeysBlobIdx = kNumTensorsPerInput * inputIndex + 1; @@ -537,11 +537,11 @@ class MergeMultiScalarFeatureTensorsGradientOp : public Operator { bool DoRunWithType() 
{ int numExamples = Input(0).numel(); std::vector outValuesOffset(numFeatureInputs_); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { int inputNumValues = 0; const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { inputNumValues += inLengthsData[exampleIndex]; } Output(inputIndex)->Resize(inputNumValues); @@ -551,8 +551,8 @@ class MergeMultiScalarFeatureTensorsGradientOp : public Operator { const T* inValuesGradData = inValuesGrad.template data(); int inValuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); if (inLengthsData[exampleIndex] > 0) { @@ -600,7 +600,7 @@ class MergeMultiListFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); totalNumValues += Input(kNumTensorsPerInput * inputIndex + 3).numel(); } @@ -619,13 +619,13 @@ class MergeMultiListFeatureTensorsOp : public Operator { int outKeysOffset = 0; int outValuesValuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inKeysOffset_[inputIndex] = 0; inValuesValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto 
exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1) @@ -699,7 +699,7 @@ class MergeMultiMapFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); totalNumValues += Input(kNumTensorsPerInput * inputIndex + 4).numel(); } @@ -720,13 +720,13 @@ class MergeMultiMapFeatureTensorsOp : public Operator { int outKeysOffset = 0; int outValuesValuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inKeysOffset_[inputIndex] = 0; inValuesValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1) @@ -798,13 +798,12 @@ class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator { int numExamples = Input(0).numel(); std::vector outValuesLengthOffset(numFeatureInputs_); std::vector outValuesValuesOffset(numFeatureInputs_); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { int inputNumValues = 0; auto& 
inValuesLength = Input(kNumTensorsPerInput * inputIndex + 1); const int32_t* inValuesLengthsData = inValuesLength.template data(); - for (int valuesIndex = 0; valuesIndex < inValuesLength.numel(); - ++valuesIndex) { + for (const auto valuesIndex : c10::irange(inValuesLength.numel())) { inputNumValues += inValuesLengthsData[valuesIndex]; } Output(inputIndex)->Resize(inputNumValues); @@ -814,8 +813,8 @@ class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator { const T* inValuesValuesGradData = inValuesValuesGrad.template data(); int inValuesValuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const int32_t* inValuesLengthsData = diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h index 7e01a01792bf50..0175f77a6f2abf 100644 --- a/caffe2/operators/filler_op.h +++ b/caffe2/operators/filler_op.h @@ -536,7 +536,7 @@ class LengthsRangeFillOp : public Operator { auto* output_data = output->template mutable_data(); int32_t offset = 0; - for (int i = 0; i < input.numel(); ++i) { + for (const auto i : c10::irange(input.numel())) { auto len = input_data[i]; auto start = output_data + offset; std::iota( diff --git a/caffe2/operators/find_duplicate_elements_op.h b/caffe2/operators/find_duplicate_elements_op.h index 681d56e47c0b5c..4e3e1e7743cefb 100644 --- a/caffe2/operators/find_duplicate_elements_op.h +++ b/caffe2/operators/find_duplicate_elements_op.h @@ -44,7 +44,7 @@ class FindDuplicateElementsOp final : public Operator { auto* output = Output(0, {static_cast(dupSize)}, at::dtype()); auto* out_ptr = output->template mutable_data(); - for (size_t i = 0; i < dupSize; ++i) { + for (const auto i : c10::irange(dupSize)) { 
out_ptr[i] = dupIndices[i]; } diff --git a/caffe2/operators/find_op.h b/caffe2/operators/find_op.h index 4b5bfdc852f14e..e59aa846f89134 100644 --- a/caffe2/operators/find_op.h +++ b/caffe2/operators/find_op.h @@ -42,7 +42,7 @@ class FindOp final : public Operator { // index into a map if (needles.numel() < 16) { // Brute force O(nm) - for (int i = 0; i < needles.numel(); i++) { + for (const auto i : c10::irange(needles.numel())) { T x = needles_data[i]; T res = static_cast(missing_value_); for (int j = idx_size - 1; j >= 0; j--) { @@ -56,10 +56,10 @@ class FindOp final : public Operator { } else { // O(n + m) std::unordered_map idx_map; - for (int j = 0; j < idx_size; j++) { + for (const auto j : c10::irange(idx_size)) { idx_map[idx_data[j]] = j; } - for (int i = 0; i < needles.numel(); i++) { + for (const auto i : c10::irange(needles.numel())) { T x = needles_data[i]; auto it = idx_map.find(x); res_data[i] = (it == idx_map.end() ? missing_value_ : it->second); diff --git a/caffe2/operators/floor_op.h b/caffe2/operators/floor_op.h index 6af9b414814a2f..fa3436d23e5068 100644 --- a/caffe2/operators/floor_op.h +++ b/caffe2/operators/floor_op.h @@ -21,7 +21,7 @@ class FloorOp final : public Operator { const float* Xdata = X.template data(); float* Ydata = Y->template mutable_data(); - for (int i = 0; i < X.numel(); ++i) { + for (const auto i : c10::irange(X.numel())) { Ydata[i] = std::floor(Xdata[i]); } return true; diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.h b/caffe2/operators/fused_rowwise_8bit_conversion_ops.h index cf593a10e0f497..23c7f968bf1663 100644 --- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/operators/reducer_functors.h" @@ -82,7 +83,7 @@ class FloatToFused8BitRowwiseQuantizedOp 
: public Operator { vector tmp(input_columns); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t row = 0; row < input_rows; ++row) { + for (const auto row : c10::irange(input_rows)) { convert(tmp.data(), input_data + row * input_columns, input_columns); if (out_sb_half) { FloatToFusedNBitRowwiseQuantizedSBHalf( @@ -163,7 +164,7 @@ class Fused8BitRowwiseQuantizedToFloatOp : public Operator { vector tmp(input_columns); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t row = 0; row < input_rows; ++row) { + for (const auto row : c10::irange(input_rows)) { if (in_sb_half) { FusedNBitRowwiseQuantizedSBHalfToFloat( 8, diff --git a/caffe2/operators/fused_rowwise_nbit_conversion_ops.h b/caffe2/operators/fused_rowwise_nbit_conversion_ops.h index 363743eebeb325..a03fdf65521a05 100644 --- a/caffe2/operators/fused_rowwise_nbit_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_nbit_conversion_ops.h @@ -131,7 +131,7 @@ class FloatToFusedNBitRowwiseQuantizedOp final : public Operator { *output_row_scale = scale; *output_row_bias = Xmin; - for (int col = 0; col < input_columns; ++col) { + for (const auto col : c10::irange(input_columns)) { float X = tmp[col]; std::uint8_t quantized = std::max( 0, @@ -206,7 +206,7 @@ class FusedNBitRowwiseQuantizedToFloatOp final : public Operator { std::vector tmp(output_columns); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t row = 0; row < input_rows; ++row) { + for (const auto row : c10::irange(input_rows)) { const std::uint8_t* input_row = input_data + row * input_columns; float scale = *reinterpret_cast( input_row + @@ -216,7 +216,7 @@ class FusedNBitRowwiseQuantizedToFloatOp final : public Operator { (output_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + sizeof(at::Half)); - for (int col = 0; col < output_columns; ++col) { + for (const auto col : c10::irange(output_columns)) { std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; 
quantized &= (1 << BIT_RATE) - 1; diff --git a/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h b/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h index 105f9902b21599..9e506aa7d94321 100644 --- a/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h @@ -118,7 +118,7 @@ class FloatToFusedNBitFakeRowwiseQuantizedOp final output_row_scale_bias[1] = minimum_element; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t col = 0; col < input_columns; ++col) { + for (const auto col : c10::irange(input_columns)) { output_row[col] = std::max( 0, std::min( diff --git a/caffe2/operators/gather_fused_8bit_rowwise_op.h b/caffe2/operators/gather_fused_8bit_rowwise_op.h index a111ff3eca4dbb..4892b4cbb407d7 100644 --- a/caffe2/operators/gather_fused_8bit_rowwise_op.h +++ b/caffe2/operators/gather_fused_8bit_rowwise_op.h @@ -37,7 +37,7 @@ class GatherFused8BitRowwiseOp : public Operator { const Index* idxs = indices.template data(); auto out = output->template mutable_data(); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; CAFFE_ENFORCE( 0 <= idx && idx < data.size(0), diff --git a/caffe2/operators/gather_op.h b/caffe2/operators/gather_op.h index 52f45c19898369..309d07b8b0fcfb 100644 --- a/caffe2/operators/gather_op.h +++ b/caffe2/operators/gather_op.h @@ -44,7 +44,7 @@ static void check_indexarray_range( IndexType indexing_axis_dim, bool wrap_indices) { // - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (wrap_indices && idx < 0) { idx = idx + indexing_axis_dim; @@ -114,7 +114,7 @@ static bool gather_impl( auto N = indices.numel(); if (match_outer) { CAFFE_ENFORCE_GE(axis, 1, "Axis should be at least 1"); - for (auto i = 0; i < axis; i++) { + for (const auto i : c10::irange(axis)) { CAFFE_ENFORCE_EQ( data.size(i), indices.size(i), @@ -129,12 +129,12 @@ static bool gather_impl( // Special-case 
single-float copy for efficiency if (data.template IsType() && block_size == 1) { - for (auto batch = 0; batch < outer_dims_product; ++batch) { + for (const auto batch : c10::irange(outer_dims_product)) { const float* src_floats = (const float*)(src_base + batch * src_batch_bytesize); float* dst_floats = (float*)(out + batch * gathered_batch_bytesize); - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (match_outer) { idx = idxs[batch * idx_inner_dims_product + i]; @@ -148,8 +148,8 @@ static bool gather_impl( } else { // outer_dims_product specifies how many times we repeat inner dimensions, // so we just iterate over it to cover all outer dimensions. - for (auto batch = 0; batch < outer_dims_product; ++batch) { - for (auto i = 0; i < N; ++i) { + for (const auto batch : c10::irange(outer_dims_product)) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (match_outer) { idx = idxs[batch * idx_inner_dims_product + i]; diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index 0fdc430ea441ea..7e314f703aa976 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -42,7 +43,8 @@ class GatherRangesToDenseOp final : public Operator { CAFFE_ENFORCE_GT( minObservation_, 0, "The number of observations is at least 1"); // Initialize the empty and mismatch counter. 
- for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { + (void)i; // Suppress unused variable warning emptyRanges_.push_back(0); mismatchedRanges_.push_back(0); mismatchedLengths_.push_back(set()); @@ -105,7 +107,7 @@ class GatherRangesToDenseOp final : public Operator { auto batchSize = ranges.size(0); vector outputDims{batchSize, 0}; vector outputRawData; - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto* output = Output(i); outputDims[1] = lengths_[i]; output->Resize(outputDims); @@ -114,8 +116,8 @@ class GatherRangesToDenseOp final : public Operator { outputRawData.push_back(ptr); } - for (int i = 0; i < batchSize; ++i) { - for (int j = 0; j < OutputSize(); ++j) { + for (const auto i : c10::irange(batchSize)) { + for (const auto j : c10::irange(OutputSize())) { auto rangeStart = rangesData[rangesDataOffset++]; auto rangeLength = rangesData[rangesDataOffset++]; @@ -143,7 +145,7 @@ class GatherRangesToDenseOp final : public Operator { auto& key = Input(KEY); auto* key_data = key.template data(); vector> buffer; - for (int b_i = 0; b_i < rangeLength; ++b_i) { + for (const auto b_i : c10::irange(rangeLength)) { int64_t one_key_item = key_data[rangeStart + b_i]; auto* one_data_item = rawData + (rangeStart + b_i) * itemsize; buffer.emplace_back(one_key_item, one_data_item); @@ -155,7 +157,7 @@ class GatherRangesToDenseOp final : public Operator { const std::pair& right) { return left.first < right.first; }); - for (int b_i = 0; b_i < rangeLength; ++b_i) { + for (const auto b_i : c10::irange(rangeLength)) { // Since this CPU only, directly copy to the destination. std::memcpy( outputRawData[j] + (i * lengths_[j] + b_i) * itemsize, @@ -170,7 +172,7 @@ class GatherRangesToDenseOp final : public Operator { // Check whether the empty and mismatch ratio exceeded the threshold. 
totalRanges_ += batchSize; - for (int j = 0; j < OutputSize(); ++j) { + for (const auto j : c10::irange(OutputSize())) { // Only check when the ratio is not set to allow all mismatches. if (maxMismatchedRatio_ < 1.0) { CAFFE_ENFORCE_GE( diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index dacee9eb7db603..0a402cdb6a3c12 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -294,7 +294,7 @@ EArrXXt clip_boxes_rotated( EArrXXt ret(boxes.rows(), boxes.cols()); ret = boxes; - for (int i = 0; i < upright_boxes.rows(); ++i) { + for (const auto i : c10::irange(upright_boxes.rows())) { ret.row(indices[i]) = upright_boxes.row(i); } return ret; diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index ff68d49251c1e3..7fb9c3767cf351 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -247,7 +247,7 @@ int rotated_rect_intersection_pts( // Specical case of rect1 == rect2 bool same = true; - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { if (fabs(pts1[i].x() - pts2[i].x()) > samePointEps || (fabs(pts1[i].y() - pts2[i].y()) > samePointEps)) { same = false; @@ -256,7 +256,7 @@ int rotated_rect_intersection_pts( } if (same) { - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { intersections[i] = pts1[i]; } num = 4; @@ -265,14 +265,14 @@ int rotated_rect_intersection_pts( // Line vector // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { vec1[i] = pts1[(i + 1) % 4] - pts1[i]; vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } // Line test - test all line combos for intersection - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { + for (const auto i : c10::irange(4)) { + for (const auto j : 
c10::irange(4)) { // Solve for 2x2 Ax=b // This takes care of parallel lines @@ -298,7 +298,7 @@ int rotated_rect_intersection_pts( const auto& DA = vec2[3]; auto ABdotAB = AB.squaredNorm(); auto ADdotAD = DA.squaredNorm(); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { // assume ABCD is the rectangle, and P is the point to be judged // P is inside ABCD iff. P's projection on AB lies within AB // and P's projection on AD lies within AD @@ -321,7 +321,7 @@ int rotated_rect_intersection_pts( const auto& DA = vec1[3]; auto ABdotAB = AB.squaredNorm(); auto ADdotAD = DA.squaredNorm(); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { auto AP = pts2[i] - pts1[0]; auto APdotAB = AP.dot(AB); @@ -351,7 +351,7 @@ int convex_hull_graham( // if more than 1 points have the same minimum y, // pick the one with the mimimum x. int t = 0; - for (int i = 1; i < num_in; i++) { + for (const auto i : c10::irange(1, num_in)) { if (p[i].y() < p[t].y() || (p[i].y() == p[t].y() && p[i].x() < p[t].x())) { t = i; } @@ -360,7 +360,7 @@ int convex_hull_graham( // Step 2: // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { + for (const auto i : c10::irange(num_in)) { q[i] = p[i] - s; } @@ -415,8 +415,7 @@ int convex_hull_graham( // But if we're only interested in getting the area/perimeter of the shape // We can simply return. 
if (!shift_to_zero) { - for (int i = 0; i < m; i++) - q[i] += s; + for (const auto i : c10::irange(m))q[i] += s; } return m; @@ -518,8 +517,8 @@ Eigen::ArrayXXf bbox_overlaps_rotated( const auto& query_boxes_areas = query_boxes.col(2) * query_boxes.col(3); Eigen::ArrayXXf overlaps(boxes.rows(), query_boxes.rows()); - for (int i = 0; i < boxes.rows(); ++i) { - for (int j = 0; j < query_boxes.rows(); ++j) { + for (const auto i : c10::irange(boxes.rows())) { + for (const auto j : c10::irange(query_boxes.rows())) { auto inter = bbox_intersection_rotated(boxes.row(i), query_boxes.row(j)); overlaps(i, j) = (inter == 0.0) ? 0.0 @@ -554,7 +553,7 @@ std::vector nms_cpu_rotated( EArrX areas = widths * heights; std::vector rotated_rects(proposals.rows()); - for (int i = 0; i < proposals.rows(); ++i) { + for (const auto i : c10::irange(proposals.rows())) { rotated_rects[i] = bbox_to_rotated_rect(proposals.row(i)); } @@ -616,7 +615,7 @@ std::vector soft_nms_cpu_rotated( EArrX areas = widths * heights; std::vector rotated_rects(proposals.rows()); - for (int i = 0; i < proposals.rows(); ++i) { + for (const auto i : c10::irange(proposals.rows())) { rotated_rects[i] = bbox_to_rotated_rect(proposals.row(i)); } diff --git a/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h b/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h index 13a504017c12d0..8a6d4ddd58b1e3 100644 --- a/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h +++ b/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h @@ -61,7 +61,7 @@ class GivenTensorByteStringToUInt8FillOp final : public FillerOp { at::dtype().device(CPU)); uint8_t* values_data = values_.template mutable_data(); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < str.size(); i++) { + for (const auto i : c10::irange(str.size())) { values_data[i] = static_cast(str[i]); } } diff --git a/caffe2/operators/given_tensor_fill_op.h b/caffe2/operators/given_tensor_fill_op.h index 
9b975e910b1074..8a20aa813d6129 100644 --- a/caffe2/operators/given_tensor_fill_op.h +++ b/caffe2/operators/given_tensor_fill_op.h @@ -68,7 +68,7 @@ class GivenTensorFillOp final : public FillerOp { at::dtype().device(CPU)); Type* values_data = values_.template mutable_data(); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < source_values.size(); i++) { + for (const auto i : c10::irange(source_values.size())) { values_data[i] = static_cast(source_values[i]); } body_ = &GivenTensorFillOp::FillWithType; diff --git a/caffe2/operators/gru_unit_op.h b/caffe2/operators/gru_unit_op.h index 721b882797e08c..5b23dd0995b2ba 100644 --- a/caffe2/operators/gru_unit_op.h +++ b/caffe2/operators/gru_unit_op.h @@ -29,10 +29,10 @@ void GRUUnit( bool drop_states, T* H, Context* /*context*/) { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const bool valid = seqLengths == nullptr || t < seqLengths[n]; - for (int d = 0; d < D; ++d) { + for (const auto d : c10::irange(D)) { if (!valid) { if (drop_states) { H[d] = 0; @@ -68,10 +68,10 @@ void GRUUnitGradient( T* H_prev_diff, T* X_diff, Context* /*context*/) { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const bool valid = seqLengths == nullptr || t < seqLengths[n]; - for (int d = 0; d < D; ++d) { + for (const auto d : c10::irange(D)) { T* h_prev_diff = H_prev_diff + d; T* reset_diff = X_diff + 0 * D + d; T* update_diff = X_diff + 1 * D + d; diff --git a/caffe2/operators/h_softmax_op.h b/caffe2/operators/h_softmax_op.h index 395f8f651b54bc..3943e874772021 100644 --- a/caffe2/operators/h_softmax_op.h +++ b/caffe2/operators/h_softmax_op.h @@ -2,6 +2,7 @@ #define CAFFE2_OPERATORS_H_SOFTMAX_OP_H_ #include +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -51,7 +52,7 @@ class HSoftmaxOpBase : public Operator { int M, std::unordered_map& hierarchy) const { int size = 0; - for (int label = 0; label < M; ++label) { 
+ for (const auto label : c10::irange(M)) { int word_id = labels[label]; const auto& path = hierarchy[word_id]; size += std::accumulate( diff --git a/caffe2/operators/histogram_op.h b/caffe2/operators/histogram_op.h index 29000898393575..abd6943a7b29d4 100644 --- a/caffe2/operators/histogram_op.h +++ b/caffe2/operators/histogram_op.h @@ -19,7 +19,7 @@ class HistogramOp final : public Operator { 2, "Number of bin edges must be greater than or equal to 2."); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 1; i < bin_edges_.size(); i++) { + for (const auto i : c10::irange(1, bin_edges_.size())) { CAFFE_ENFORCE_GT( bin_edges_[i], bin_edges_[i - 1], @@ -41,11 +41,11 @@ class HistogramOp final : public Operator { math::Set( bin_edges_.size() - 1, 0, histogram_data, &context_); - for (int input_idx = 0; input_idx < InputSize(); input_idx++) { + for (const auto input_idx : c10::irange(InputSize())) { const auto& x = Input(input_idx); const int64_t N = x.numel(); const auto* x_data = x.template data(); - for (int64_t data_idx = 0; data_idx < N; data_idx++) { + for (const auto data_idx : c10::irange(N)) { const auto bisection_it = std::upper_bound( bin_edges_.begin(), bin_edges_.end(), x_data[data_idx]); const int bisection_idx = bisection_it - bin_edges_.begin(); @@ -67,7 +67,7 @@ class HistogramOp final : public Operator { void CheckInputs() { const auto& input_zero = Input(0); - for (int i = 1; i < InputSize(); i++) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), diff --git a/caffe2/operators/im2col_op.h b/caffe2/operators/im2col_op.h index 5bb07ea41f43dc..40dcae2dd1ef45 100644 --- a/caffe2/operators/im2col_op.h +++ b/caffe2/operators/im2col_op.h @@ -84,7 +84,7 @@ class Im2ColOp final : public Operator { const size_t dx = X.numel() / N; const size_t dy = Y->numel() / N; - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); 
auto* ydata = Y->template mutable_data() + (n * dy); math::Im2Col( @@ -114,7 +114,7 @@ class Im2ColOp final : public Operator { const size_t dx = X.numel() / N; const size_t dy = Y->numel() / N; - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Im2Col( @@ -230,7 +230,7 @@ class Col2ImOp final : public Operator { // could template-specialize this, but it's test code... switch (order_) { case StorageOrder::NCHW: { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Col2Im( @@ -253,7 +253,7 @@ class Col2ImOp final : public Operator { } }; break; case StorageOrder::NHWC: { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Col2Im( diff --git a/caffe2/operators/index_hash_ops.h b/caffe2/operators/index_hash_ops.h index df3eeb6a867146..c26331c381c73c 100644 --- a/caffe2/operators/index_hash_ops.h +++ b/caffe2/operators/index_hash_ops.h @@ -2,6 +2,7 @@ #define CAFFE2_OPERATORS_INDEX_HASH_OPS_H_ #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -42,7 +43,7 @@ class IndexHashOp : public Operator { auto* indices_data = indices.template data(); auto* hashed_indices_data = hashed_indices->template mutable_data(); - for (auto i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { hashed_indices_data[i] = hash(indices_data[i]); } diff --git a/caffe2/operators/index_ops.h b/caffe2/operators/index_ops.h index e0c00286781e9f..b0e9e644d07c28 100644 --- a/caffe2/operators/index_ops.h +++ b/caffe2/operators/index_ops.h @@ -64,7 +64,7 @@ struct Index : IndexBase { } std::lock_guard lock(dictMutex_); // 
NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < numKeys; ++i) { + for (const auto i : c10::irange(numKeys)) { auto it = dict_.find(keys[i]); if (it != dict_.end()) { values[i] = it->second; @@ -84,7 +84,7 @@ struct Index : IndexBase { numKeys <= maxElements_, "Cannot load index: Tensor is larger than max_elements."); decltype(dict_) dict; - for (auto i = 0U; i < numKeys; ++i) { + for (const auto i : c10::irange(0U, numKeys)) { CAFFE_ENFORCE( dict.insert({keys[i], i + 1}).second, "Repeated elements found: cannot load into dictionary."); @@ -111,7 +111,7 @@ struct Index : IndexBase { private: void FrozenGet(const T* keys, int64_tValue* values, size_t numKeys) { - for (auto i = 0U; i < numKeys; ++i) { + for (const auto i : c10::irange(0U, numKeys)) { auto it = dict_.find(keys[i]); values[i] = it != dict_.end() ? it->second : 0; } diff --git a/caffe2/operators/inference_lstm_op.h b/caffe2/operators/inference_lstm_op.h index dbbe7d33ce2907..2a7a4851ce24f5 100644 --- a/caffe2/operators/inference_lstm_op.h +++ b/caffe2/operators/inference_lstm_op.h @@ -7,6 +7,7 @@ #include #include "caffe2/core/blob_serialization.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" #include "caffe2/utils/eigen_utils.h" @@ -125,7 +126,7 @@ struct FullLSTMLayer : Layer { std::vector step_outputs; auto hidden = copy_ctor(input_hidden); - for (size_t i = 0; i < step_inputs.size(); i++) { + for (const auto i : c10::irange(step_inputs.size())) { hidden = cell_(step_inputs[i], hidden, params); step_outputs.push_back(copy_ctor(std::get<0>(hidden))); } @@ -203,7 +204,7 @@ LayerOutput> apply_layer_stack( auto hidden_it = hiddens.begin(); auto weight_it = weights.begin(); std::vector final_hiddens(num_layers); - for (int64_t l = 0; l < num_layers; ++l) { + for (const auto l : c10::irange(num_layers)) { auto layer_output = layer(layer_input, *(hidden_it++), *(weight_it++)); final_hiddens.at(l) = 
std::move(layer_output.final_hidden); layer_input = std::move(layer_output.outputs); @@ -225,7 +226,7 @@ std::tuple _lstm_impl( int64_t total_layers = layer_hx.size(); std::vector> hiddens; hiddens.reserve(total_layers); - for (int64_t i = 0; i < total_layers; ++i) { + for (const auto i : c10::irange(total_layers)) { hiddens.emplace_back(std::move(layer_hx[i]), std::move(layer_cx[i])); } LSTMCell cell(context); diff --git a/caffe2/operators/key_split_ops.h b/caffe2/operators/key_split_ops.h index f3eb3cd47b2a68..b8d057879a6bc2 100644 --- a/caffe2/operators/key_split_ops.h +++ b/caffe2/operators/key_split_ops.h @@ -26,21 +26,21 @@ class KeySplitOp : public Operator { const T* keys_data = keys.template data(); std::vector counts(categorical_limit_); std::vector eids(categorical_limit_); - for (int k = 0; k < categorical_limit_; k++) { + for (const auto k : c10::irange(categorical_limit_)) { counts[k] = 0; } - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { int k = keys_data[i]; CAFFE_ENFORCE_GT(categorical_limit_, k); CAFFE_ENFORCE_GE(k, 0); counts[k]++; } - for (int k = 0; k < categorical_limit_; k++) { + for (const auto k : c10::irange(categorical_limit_)) { auto* eid = Output(k, {counts[k]}, at::dtype()); eids[k] = eid->template mutable_data(); counts[k] = 0; } - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { int k = keys_data[i]; eids[k][counts[k]++] = i; } diff --git a/caffe2/operators/length_split_op.h b/caffe2/operators/length_split_op.h index a5bee082d77b9a..426f8ab884c396 100644 --- a/caffe2/operators/length_split_op.h +++ b/caffe2/operators/length_split_op.h @@ -55,11 +55,11 @@ class LengthsSplitOp final : public Operator { const int32_t* Ldata = L.template data(); int32_t* Ydata = Y->template mutable_data(); - for (int i = 0; i < M; i++) { + for (const auto i : c10::irange(M)) { int32_t mod = Ldata[i] % n_split_; int32_t res = mod != 0 ? 
math::DivUp(Ldata[i], n_split_) : Ldata[i] / n_split_ + 1; - for (int j = 0; j < n_split_; j++) { + for (const auto j : c10::irange(n_split_)) { Ydata[(i * n_split_) + j] = mod-- > 0 ? res : res - 1; } } diff --git a/caffe2/operators/lengths_pad_op.h b/caffe2/operators/lengths_pad_op.h index c0019b6f4ee3f2..fede82d65ce75a 100644 --- a/caffe2/operators/lengths_pad_op.h +++ b/caffe2/operators/lengths_pad_op.h @@ -56,7 +56,7 @@ class LengthsPadOp : public Operator { math::Set( output->numel(), static_cast(padding_value_), out_data, &context_); - for (int64_t i = 0; i < lengths_size; ++i) { + for (const auto i : c10::irange(lengths_size)) { auto length = lengths_data[i]; CAFFE_ENFORCE_GE(length, 0); CAFFE_ENFORCE_GE( diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h index e12c1e9106950d..db22264aff3179 100644 --- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h @@ -121,7 +121,7 @@ class SparseLengthsFused8BitRowwiseOp : public Operator { auto indices_data = indices.template data(); int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { for (int i = 0; i < lengths_data[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); IndexType idx = indices_data[current]; diff --git a/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h index 58be36fb9bc56f..a0dd91b34d94f2 100644 --- a/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h @@ -137,7 +137,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { // Error handling int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { for (int i = 0; i < lengths_data[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); IndexType idx 
= indices_data[current]; @@ -164,7 +164,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { << "Running slow path because FBGEMM is not available"; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(output_data, 0, block_size * sizeof(float)); if (current + lengths_data[m] > index_size) { return false; @@ -185,7 +185,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { const float scale = weight * scale_bias[0]; const float bias = weight * scale_bias[1]; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { uint8_t quantized = input_data[idx * data.size(1) + j / NUM_ELEM_PER_BYTE]; quantized >>= (j % NUM_ELEM_PER_BYTE) * BIT_RATE; @@ -196,7 +196,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { } // for each i if (is_mean && lengths_data[m]) { float scale = 1.0f / lengths_data[m]; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { output_data[j] *= scale; } } @@ -284,13 +284,14 @@ class SparseLengthsSumSparseLookupOp final : public Operator { const IndexType compressed_data_size = compressed_indices_mapping.size(0); IndexType current = 0; IndexType current_output = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { const auto current_length = lengths_data[m]; if (current + current_length > index_size) { return false; } int32_t skipped = 0; - for (int i = 0; i < current_length; ++i) { + for (const auto i : c10::irange(current_length)) { + (void)i; // Suppress unused variable warning IndexType compressed_idx = indices_data[current]; if (compressed_idx < 0 || compressed_idx >= compressed_data_size) { return false; @@ -554,7 +555,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { // Error handling int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { for 
(int i = 0; i < lengths_data[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); IndexType idx = indices_data[current]; @@ -592,7 +593,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { << "Running slow path because FBGEMM is not available"; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(output_data, 0, block_size * sizeof(float)); if (current + lengths_data[m] > index_size) { return false; @@ -632,7 +633,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { bias = weight * reinterpret_cast(scale_bias)[1]; } - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { uint8_t quantized = input_data[idx * data.size(1) + j / NUM_ELEM_PER_BYTE]; quantized >>= (j % NUM_ELEM_PER_BYTE) * BIT_RATE; @@ -643,7 +644,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { } // for each i if (is_mean && lengths_data[m]) { float scale = 1.0f / lengths_data[m]; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { output_data[j] *= scale; } } diff --git a/caffe2/operators/lengths_reducer_ops.h b/caffe2/operators/lengths_reducer_ops.h index e01da722074cf8..9615e9bad1f45d 100644 --- a/caffe2/operators/lengths_reducer_ops.h +++ b/caffe2/operators/lengths_reducer_ops.h @@ -192,7 +192,7 @@ class CPUSparseLengthsReductionOp : public Operator { } int64_t current = 0; - for (int m = 0; m < M; ++m) { + for (const auto m : c10::irange(M)) { for (int i = 0; i < lengths[m]; ++i) { CAFFE_ENFORCE_LT( current, @@ -280,7 +280,7 @@ class TTSparseLengthsSumOp final : public Operator { emb_size(this->template GetSingleArgument("emb_size", 64)) { // cumprod of i, used for index slice l_cumprod.push_back(1); - for (size_t i = 1; i < factor_i.size(); ++i) { + for (const auto i : c10::irange(1, factor_i.size())) { l_cumprod.push_back(l_cumprod[i - 1] * factor_i[i - 1]); } } @@ -290,7 +290,7 @@ class TTSparseLengthsSumOp 
final : public Operator { void Ind2Sub(int64_t* out_factor_index, const int64_t* indices, int len) { // TODO: vectorization auto N = factor_i.size(); - for (int j = 0; j < len; j++) { + for (const auto j : c10::irange(len)) { auto idx = indices[j]; for (int i = N; i > 0; i--) { out_factor_index[j * N + i - 1] = idx / l_cumprod[i - 1]; @@ -307,7 +307,7 @@ class TTSparseLengthsSumOp final : public Operator { int idx) { // implement the functinality index_select(core, 1, ind_slice) auto num_of_elements = ranks[idx] * factor_j[idx] * ranks[idx + 1]; - for (int i = 0; i < bs; i++) { + for (const auto i : c10::irange(bs)) { memcpy( tgt_slice[i].data(), core + ind_slice[i] * num_of_elements, @@ -345,16 +345,16 @@ class TTSparseLengthsSumOp final : public Operator { // Store the intermediate result in each layer vector Z_ptr(bs); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { Y_ptr[b] = res[b].data(); Z_ptr[b] = int_res[b].data(); } vector ind_slice(bs); int rows = 0; - for (int i = 0; i < x_len; i++) { + for (const auto i : c10::irange(x_len)) { // slice cur - for (int j = 0; j < bs; j++) { + for (const auto j : c10::irange(bs)) { ind_slice[j] = ind[x_len * j + i]; } if (i == 0) { @@ -364,7 +364,7 @@ class TTSparseLengthsSumOp final : public Operator { std::vector> slice( bs, std::vector(ranks[i] * factor_j[i] * ranks[i + 1], 0)); vector X_ptr(bs); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { X_ptr[b] = slice[b].data(); } GetSlice(slice, cores[i], ind_slice, bs, i); @@ -382,7 +382,7 @@ class TTSparseLengthsSumOp final : public Operator { 0.0f, Z_ptr.data(), &context_); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { std::memcpy(Y_ptr[b], Z_ptr[b], (emb_size * max_rank) * sizeof(T)); } rows *= factor_j[i]; @@ -393,7 +393,7 @@ class TTSparseLengthsSumOp final : public Operator { if (i < 2) { auto* core_data = Output(i + 1, shape, at::dtype()); T* out_core = core_data->template mutable_data(); 
- for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { std::memcpy( out_core + b * rows * ranks[i + 1], Y_ptr[b], @@ -404,7 +404,7 @@ class TTSparseLengthsSumOp final : public Operator { // reduction and store back to output vector cum_lengths(segments); - for (int seg = 0; seg < segments; seg++) { + for (const auto seg : c10::irange(segments)) { cum_lengths[seg] = seg == 0 ? lengths[0] : lengths[seg] + cum_lengths[seg - 1]; } @@ -549,7 +549,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { int64_t* index_out_data = index_out.template mutable_data(); vector> index_slice(bs, vector(3, 0)); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { memcpy(index_slice[b].data(), index_out_data + b * 3, 3 * sizeof(int64_t)); } @@ -563,7 +563,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { // expand the gradient into all indices vector> core2_out_grad(bs, vector(emb_size, 0)); int64_t data_index = 0; - for (int64_t range_index = 0; range_index < num_segments; ++range_index) { + for (const auto range_index : c10::irange(num_segments)) { for (int64_t start = data_index; data_index < start + lengths_data[range_index]; ++data_index) { @@ -582,7 +582,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { bs, vector(core2_shape[1] * core2_shape[2] * core2_shape[3], 0)); const T* core1_out_data = core1_out.template data(); // const T* core1_out_p[bs]; - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core1_out_data + b * core1_out.size(1) * core1_out.size(2); B_ptr[b] = core2_out_grad[b].data(); C_ptr[b] = dCore2_data_slice_grad[b].data(); @@ -609,8 +609,8 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core2_slice( bs, vector(core2_shape[1] * core2_shape[2] * core2_shape[3], 0)); - for (int64_t b = 0; b < bs; b++) { - for (int i = 0; i < num_of_elements; i++) { + for (const auto b : c10::irange(bs)) { + for (const auto i : c10::irange(num_of_elements)) { 
dCore2_data[index_slice[b][2] * num_of_elements + i] += C_ptr[b][i]; } memcpy( @@ -623,7 +623,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core1_out_grad( bs, vector(core1_out_shape[1] * core1_out_shape[2], 0)); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core2_out_grad[b].data(); B_ptr[b] = core2_slice[b].data(); C_ptr[b] = core1_out_grad[b].data(); @@ -650,7 +650,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> dCore1_data_slice_grad( bs, vector(core1_shape[1] * core1_shape[2] * core1_shape[3], 0)); const T* core0_out_data = core0_out.template data(); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core0_out_data + b * core0_out.size(1) * core0_out.size(2); B_ptr[b] = core1_out_grad[b].data(); C_ptr[b] = dCore1_data_slice_grad[b].data(); @@ -676,8 +676,8 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core1_slice( bs, vector(core1_shape[1] * core1_shape[2] * core1_shape[3], 0)); - for (int64_t b = 0; b < bs; b++) { - for (int i = 0; i < num_of_elements; i++) { + for (const auto b : c10::irange(bs)) { + for (const auto i : c10::irange(num_of_elements)) { dCore1_data[index_slice[b][1] * num_of_elements + i] += C_ptr[b][i]; } memcpy( @@ -690,7 +690,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core0_out_grad( bs, vector(core0_out_shape[1] * core0_out_shape[2], 0)); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core1_out_grad[b].data(); B_ptr[b] = core1_slice[b].data(); C_ptr[b] = core0_out_grad[b].data(); @@ -712,8 +712,8 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { num_of_elements = core0_shape[1] * core0_shape[2] * core0_shape[3]; - for (int64_t b = 0; b < bs; b++) { - for (int i = 0; i < num_of_elements; i++) { + for (const auto b : c10::irange(bs)) { + for (const auto i : c10::irange(num_of_elements)) { dCore0_data[index_slice[b][0] * num_of_elements + i] += 
C_ptr[b][i]; } } diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h index 67d792f4ca23ce..acf5d442ca7245 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h @@ -110,7 +110,7 @@ class FloatToRowwiseQuantized8BitsOp : public Operator { float* scale_bias_data = scale_bias->template mutable_data(); size_t n_blocks = input.size(0); size_t block_size = input.size_from_dim(1); - for (size_t i = 0; i < n_blocks; ++i) { + for (const auto i : c10::irange(n_blocks)) { ConstEigenVectorArrayMap input_row( input_data + i * block_size, block_size); EigenVectorArrayMap output_row( @@ -164,7 +164,7 @@ class Rowwise8BitQuantizedToFloatOp : public Operator { size_t block_size = input.size_from_dim(1); size_t n_blocks = input.size(0); - for (size_t i = 0; i < n_blocks; ++i) { + for (const auto i : c10::irange(n_blocks)) { ConstEigenVectorArrayMap input_row( input_data + i * block_size, block_size); EigenVectorArrayMap output_row( diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h index 64a90eb0c442b9..78fd0b9d51337a 100644 --- a/caffe2/operators/load_save_op.h +++ b/caffe2/operators/load_save_op.h @@ -5,6 +5,8 @@ #include #include + +#include #include #include "caffe2/core/blob_serialization.h" #include "caffe2/core/context.h" @@ -129,13 +131,13 @@ class LoadOp final : public Operator { int total_loaded_blobs = 0; std::unordered_map blob_states; if (InputSize() > 0) { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const db::DBReader& reader = this->template Input(i); extract(i, reader.cursor(), &blob_states, &total_loaded_blobs); } } else { // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < db_names_.size(); ++i) { + for (const auto i : c10::irange(db_names_.size())) { string full_db_name = absolute_path_ ? 
db_names_[i] : (ws_->RootFolder() + "/" + db_names_[i]); diff --git a/caffe2/operators/locally_connected_op_impl.h b/caffe2/operators/locally_connected_op_impl.h index df05cad403e92d..4c6312ab3a5f95 100644 --- a/caffe2/operators/locally_connected_op_impl.h +++ b/caffe2/operators/locally_connected_op_impl.h @@ -45,7 +45,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNCHW() { shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); const std::vector output_image_dims = GetDims(*Y); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } @@ -82,7 +82,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNCHW() { if (InputSize() == 3) { const auto& bias = Input(BIAS); CAFFE_ENFORCE_EQ(bias.dim(), image_ndim + 1); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(bias.dim32(i), output_image_dims[i]); } CAFFE_ENFORCE_EQ(bias.dim32(image_ndim), shape.M); @@ -129,7 +129,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNHWC() { shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); const std::vector output_image_dims = GetDims(*Y); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } @@ -159,7 +159,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNHWC() { if (InputSize() == 3) { const auto& bias = Input(BIAS); CAFFE_ENFORCE_EQ(bias.dim(), image_ndim + 1); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(bias.dim32(i), output_image_dims[i]); } CAFFE_ENFORCE_EQ(bias.dim32(image_ndim), shape.M); @@ -200,8 +200,9 @@ void LocallyConnectedOp::RunOnDeviceWithOrderNCHWImpl( T* column_buffer_data = column_buffer->template mutable_data(); T* Y_transposed_buffer_data = Y_transposed_buffer->template mutable_data(); - for (int 
image_id = 0; image_id < shape.N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { if (kernel_.size() == 2) { math::Im2Col( shape.C / group_, @@ -302,7 +303,7 @@ void LocallyConnectedOp::RunOnDeviceWithOrderNHWCImpl( Y_transposed_buffer->Resize(shape.Y_transposed_dims); T* column_buffer_data = column_buffer->template mutable_data(); T* Y_transposed_buffer_data = Y_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { + for (const auto image_id : c10::irange(shape.N)) { math::Im2Col( shape.C, shape.X_dims[0], @@ -387,7 +388,7 @@ bool LocallyConnectedGradientOp::RunOnDeviceWithOrderNCHW() { shape.input_image_size = GetDimsSize(X); const std::vector output_image_dims = GetDims(dY); shape.output_image_size = GetDimsSize(dY); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } ConvPoolOpBase::ComputePads(input_image_dims); @@ -484,7 +485,7 @@ bool LocallyConnectedGradientOp::RunOnDeviceWithOrderNHWC() { shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(dY); const std::vector output_image_dims = GetDims(dY); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } @@ -568,8 +569,9 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNCHWImpl( T* dY_transposed_buffer_data = dY_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { if (kernel_.size() == 2) { 
math::Im2Col( shape.C / group_, @@ -681,8 +683,9 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNCHWImpl( column_buffer->template mutable_data(), &context_); const T* const_column_buffer_data = column_buffer->template data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { if (kernel_.size() == 2) { math::Col2Im( shape.C / group_, @@ -743,7 +746,7 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNHWCImpl( T* column_buffer_data = column_buffer->template mutable_data(); T* dY_transposed_buffer_data = dY_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { + for (const auto image_id : c10::irange(shape.N)) { math::Im2Col( shape.C, shape.X_dims[0], @@ -835,7 +838,8 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNHWCImpl( column_buffer->template mutable_data(), &context_); const T* const_column_buffer_data = column_buffer->template data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning math::Col2Im( shape.C, shape.X_dims[0], diff --git a/caffe2/operators/lstm_utils.h b/caffe2/operators/lstm_utils.h index 0f564792215b67..cd2da4224ed0cb 100644 --- a/caffe2/operators/lstm_utils.h +++ b/caffe2/operators/lstm_utils.h @@ -78,7 +78,7 @@ template static std::vector unpair_vec(std::vector>&& vals) { std::vector result; result.reserve(vals.size() * 2); - for (int64_t i = 0; i < vals.size(); i++) { + for (const auto i : c10::irange(vals.size())) { result.push_back(std::move(vals[i].first)); result.push_back(std::move(vals[i].second)); } @@ -150,7 +150,7 @@ chunk(const Tensor& input, int chunks, int axis, CPUContext* context) { auto split_size = input_channels / 
chunks; vector output_dims(input.sizes().vec()); int before = 1, after = 1; - for (int i = 0; i < canonical_axis; ++i) { + for (const auto i : c10::irange(canonical_axis)) { before *= input.dim32(i); } for (int i = canonical_axis + 1; i < input.dim(); ++i) { @@ -158,7 +158,8 @@ chunk(const Tensor& input, int chunks, int axis, CPUContext* context) { } size_t input_offset = 0; std::vector outputs; - for (int i = 0; i < chunks; ++i) { + for (const auto i : c10::irange(chunks)) { + (void)i; // Suppress unused variable warning auto axis_dim = split_size; output_dims[canonical_axis] = split_size; Tensor output(output_dims, CPU); @@ -187,7 +188,7 @@ std::vector unbind(const Tensor& input, int axis, CPUContext* context) { newDims.erase(newDims.begin() + axis); // 3 - Reshape chunks to drop the extra dimension - for (int i = 0; i < chunks.size(); i++) { + for (const auto i : c10::irange(chunks.size())) { CAFFE_ENFORCE_EQ( chunks[i].sizes()[axis], 1, "Got an unexpected chunk size"); chunks[i].Reshape(newDims); @@ -201,14 +202,14 @@ cat(const std::vector& tensorList, int axis, CPUContext* context) { auto input_zero = copy_ctor(tensorList.at(0)); vector outputDims(input_zero.sizes().vec()); CAFFE_ENFORCE(outputDims.size() > 0); - for (int i = 1; i < tensorList.size(); i++) { + for (const auto i : c10::irange(1, tensorList.size())) { CAFFE_ENFORCE(input_zero.dtype() == tensorList.at(i).dtype()); outputDims[axis] += tensorList.at(i).sizes()[axis]; } auto output_channels = outputDims[axis]; Tensor output(outputDims, CPU); int before = 1, after = 1; - for (int i = 0; i < tensorList.at(0).dim(); ++i) { + for (const auto i : c10::irange(tensorList.at(0).dim())) { if (i == axis) { continue; } @@ -245,7 +246,7 @@ stack(const std::vector& tensorList, int axis, CPUContext* context) { std::vector newDims(tensorList[0].sizes().vec()); std::vector expandedTensorList; newDims.insert(newDims.begin() + axis, 1); - for (int i = 0; i < tensorList.size(); i++) { + for (const auto i : 
c10::irange(tensorList.size())) { expandedTensorList.emplace_back(tensorList[i].Clone()); expandedTensorList.at(i).Reshape(newDims); } @@ -301,7 +302,7 @@ Tensor transpose(const Tensor& X, int dim0, int dim1, CPUContext* context) { std::swap(axes[dim0], axes[dim1]); const std::vector X_dims = X.sizes().vec(); std::vector Y_dims(ndim); - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { Y_dims[i] = X_dims[axes[i]]; } Tensor Y(Y_dims, CPU); diff --git a/caffe2/operators/map_ops.h b/caffe2/operators/map_ops.h index f870172a69faba..a6d80f3924195c 100644 --- a/caffe2/operators/map_ops.h +++ b/caffe2/operators/map_ops.h @@ -130,7 +130,7 @@ class KeyValueToMapOp final : public Operator { auto* map_data = this->template Output(MAP); - for (int i = 0; i < key_input.numel(); ++i) { + for (const auto i : c10::irange(key_input.numel())) { map_data->emplace(key_data[i], value_data[i]); } @@ -257,7 +257,7 @@ class MapDeserializer : public BlobDeserializerBase { auto* value_data = value_tensor.data(); auto* map_ptr = blob->template GetMutable(); - for (int i = 0; i < key_tensor.numel(); ++i) { + for (const auto i : c10::irange(key_tensor.numel())) { map_ptr->emplace(key_data[i], value_data[i]); } } diff --git a/caffe2/operators/mean_op.h b/caffe2/operators/mean_op.h index beb0b0440505dd..0dbc2ca4ad916a 100644 --- a/caffe2/operators/mean_op.h +++ b/caffe2/operators/mean_op.h @@ -29,7 +29,7 @@ class MeanOp final : public Operator { } // Dimension checking - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { if (output->sizes() != Input(i).sizes()) { CAFFE_THROW( "Check failed: output->sizes() == Input(i).sizes().", @@ -43,7 +43,7 @@ class MeanOp final : public Operator { } T* output_data = output->template mutable_data(); - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { math::Add( output->numel(), output_data, @@ -101,7 +101,7 @@ class MeanGradientOp : public Operator { 
size, scale, dY_data, dX0->template mutable_data(), &context_); // Copy the rest dX - for (int i = 1; i < num_inputs; i++) { + for (const auto i : c10::irange(1, num_inputs)) { auto* cur_dX = Output(i); cur_dX->ResizeLike(dY); cur_dX->CopyFrom(*dX0, true /*async*/); diff --git a/caffe2/operators/merge_id_lists_op.h b/caffe2/operators/merge_id_lists_op.h index e01abbecc486fd..6619a57ca2f87c 100644 --- a/caffe2/operators/merge_id_lists_op.h +++ b/caffe2/operators/merge_id_lists_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(MergeIdLists); @@ -50,7 +51,7 @@ class MergeIdListsOp : public Operator { // TODO(badri): Use unordered_set if performance is an issue std::set deduped; std::vector offsets(InputSize(), 0); - for (auto sample = 0; sample < batch_size; sample++) { + for (const auto sample : c10::irange(batch_size)) { for (size_t i = 0; i < InputSize(); i += 2) { auto& lengths = Input(i); const auto* lengths_data = lengths.template data(); diff --git a/caffe2/operators/minmax_ops.h b/caffe2/operators/minmax_ops.h index 2191a96fccaeb4..1c4c38cc7ed8b8 100644 --- a/caffe2/operators/minmax_ops.h +++ b/caffe2/operators/minmax_ops.h @@ -39,7 +39,7 @@ class MaxOp final : public Operator { Y->sizes()); const T* X1_data = X1.template data(); math::Max(N, X0_data, X1_data, Y_data, &context_); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { const auto& Xi = Input(i); CAFFE_ENFORCE_EQ( Xi.sizes(), @@ -87,7 +87,7 @@ class MinOp final : public Operator { Y->sizes()); const T* X1_data = X1.template data(); math::Min(N, X0_data, X1_data, Y_data, &context_); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { const auto& Xi = Input(i); CAFFE_ENFORCE_EQ( Xi.sizes(), diff --git a/caffe2/operators/moments_op.h b/caffe2/operators/moments_op.h index 
136c4b10e9b0c1..cf81244875856b 100644 --- a/caffe2/operators/moments_op.h +++ b/caffe2/operators/moments_op.h @@ -45,7 +45,7 @@ class MomentsOp final : public Operator { std::vector output_dims; output_dims.reserve(ndim); std::size_t cur_axis = 0; - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { if (cur_axis < axes_.size() && i == axes_[cur_axis]) { if (keep_dims_) { output_dims.push_back(1); diff --git a/caffe2/operators/ngram_ops.h b/caffe2/operators/ngram_ops.h index 162338a8ac416c..97c65e895853a5 100644 --- a/caffe2/operators/ngram_ops.h +++ b/caffe2/operators/ngram_ops.h @@ -35,9 +35,9 @@ class NGramFromCategoricalOp : public Operator { } int base = 1; int idx = 0; - for (int k = 0; k < col_num_; k++) { + for (const auto k : c10::irange(col_num_)) { int l = categorical_limits_[k]; - for (int m = 0; m < l; m++) { + for (const auto m : c10::irange(l)) { int v = vals_[idx++]; ngram_maps_[k][v] = m * base; } @@ -56,8 +56,8 @@ class NGramFromCategoricalOp : public Operator { math::Set(output->numel(), 0, output_data, &context_); CAFFE_ENFORCE_GT(D, max_col_id_); - for (int i = 0; i < N; i++) { - for (int k = 0; k < col_num_; k++) { + for (const auto i : c10::irange(N)) { + for (const auto k : c10::irange(col_num_)) { int j = col_ids_[k]; int v = round(floats_data[i * D + j]); // for out-of-vocabulary values, we always treat them the same as the diff --git a/caffe2/operators/normalize_op.h b/caffe2/operators/normalize_op.h index ae1bb0f57f33c9..1b4be7c8d20fbf 100644 --- a/caffe2/operators/normalize_op.h +++ b/caffe2/operators/normalize_op.h @@ -48,7 +48,7 @@ class NormalizeOp final : public Operator { using ConstStridedVec = Eigen::Map, 0, InnerStride>; - for (int i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto base = (i / sf) * sf * m + (i % sf); ConstStridedVec xVec(xData + base, 1, m, InnerStride(sf)); auto norm = xVec.template lpNorm<2>(); diff --git a/caffe2/operators/numpy_tile_op.h 
b/caffe2/operators/numpy_tile_op.h index 7fc745afccfc4c..b69fe7fac2f691 100644 --- a/caffe2/operators/numpy_tile_op.h +++ b/caffe2/operators/numpy_tile_op.h @@ -33,7 +33,7 @@ class NumpyTileOp : public Operator { " number of elements as `inputs` has dimensions."); const int64_t* repeats_data = repeats.template data(); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < repeats.numel(); ++i) { + for (const auto i : c10::irange(repeats.numel())) { CAFFE_ENFORCE_GE(repeats_data[i], 0); } @@ -45,7 +45,7 @@ class NumpyTileOp : public Operator { Tensor *src = &buffer, *dst = output; src->CopyFrom(input); vector output_dims(input.sizes().vec()); - for (size_t i = 0; i < repeats.numel(); ++i) { + for (const auto i : c10::irange(repeats.numel())) { if (repeats_data[i] == 1) { continue; } @@ -100,8 +100,10 @@ class NumpyTileOp : public Operator { int64_t num_tiles, const char* input_data, char* output_data) { - for (auto i = 0; i < outer_dim; ++i) { - for (auto t = 0; t < num_tiles; ++t) { + for (const auto i : c10::irange(outer_dim)) { + (void)i; // Suppress unused variable warning + for (const auto t : c10::irange(num_tiles)) { + (void)t; // Suppress unused variable warning context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data); output_data += inner_dim * item_size; } diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index 85b5d87ef1f309..428e63fcd23232 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -94,7 +94,7 @@ class ONNXWhileOp final : public Operator { "outputs"); // Copy initial loop-carried dependencies - for (int i = 0; i < num_loop_carried_deps; ++i) { + for (const auto i : c10::irange(num_loop_carried_deps)) { scope_->lcd_tensor(i)->CopyFrom(Input(i + num_inputs_before_lcds)); } @@ -126,7 +126,7 @@ class ONNXWhileOp final : public Operator { }; // Allocate scan_outputs for zero-iteration case - for (int i = 0; i < num_scan_outputs; ++i) { + for (const 
auto i : c10::irange(num_scan_outputs)) { Output(i + num_loop_carried_deps)->Resize(0); Output(i + num_loop_carried_deps)->template mutable_data(); } @@ -154,13 +154,13 @@ class ONNXWhileOp final : public Operator { } // Copy forward loop-carried dependencies - for (int i = 0; i < num_loop_carried_deps; ++i) { + for (const auto i : c10::irange(num_loop_carried_deps)) { Blob* b = cur_ws->GetBlob(scope_->net()->external_output()[i + 1]); const Tensor& t = b->template Get(); scope_->lcd_tensor(i)->CopyFrom(t); } // Copy out scan_outputs - for (int i = 0; i < num_scan_outputs; ++i) { + for (const auto i : c10::irange(num_scan_outputs)) { int net_output_idx = i + 1 + num_loop_carried_deps; const Tensor& scan_output = cur_ws->GetBlob(scope_->net()->external_output()[net_output_idx]) @@ -202,7 +202,7 @@ class ONNXWhileOp final : public Operator { } // Copy out final loop-carried dependencies - for (int i = 0; i < num_loop_carried_deps; ++i) { + for (const auto i : c10::irange(num_loop_carried_deps)) { Output(i)->CopyFrom(*scope_->lcd_tensor(i)); } diff --git a/caffe2/operators/op_utils_cudnn.h b/caffe2/operators/op_utils_cudnn.h index 0ea76855b8430b..ca5c19e6291823 100644 --- a/caffe2/operators/op_utils_cudnn.h +++ b/caffe2/operators/op_utils_cudnn.h @@ -36,7 +36,7 @@ inline void LogCuDNNPerfStats( const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat, int returned_algo_count) { VLOG(1) << "Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { + for (const auto i : c10::irange(returned_algo_count)) { const auto& stat = perf_stat[i]; VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " " << stat.memory; diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index 72b430f00d3614..a728b79b4916f7 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -62,7 +62,7 @@ class GPUFallbackOpEx final : public Operator { } bool RunOnDevice() 
override { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { if (this->InputIsTensorType(i, CUDA)) { // use sync copy BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i)); @@ -82,7 +82,7 @@ class GPUFallbackOpEx final : public Operator { << ProtoDebugString(this->debug_def()); return false; } - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { if (SkipOutputCopy::Contains(i)) { VLOG(1) << "Copy output: index " << i << " skipped."; continue; diff --git a/caffe2/operators/order_switch_ops.h b/caffe2/operators/order_switch_ops.h index 2dab5f72e8ca6c..3573ddfa038db7 100644 --- a/caffe2/operators/order_switch_ops.h +++ b/caffe2/operators/order_switch_ops.h @@ -30,7 +30,7 @@ class NHWC2NCHWOp final : public Operator { Y_dims[0] = N; Y_dims[1] = C; int HxW = 1; - for (int i = 2; i < ndim; ++i) { + for (const auto i : c10::irange(2, ndim)) { Y_dims[i] = X.dim32(i - 1); HxW *= Y_dims[i]; } diff --git a/caffe2/operators/pack_rnn_sequence_op.h b/caffe2/operators/pack_rnn_sequence_op.h index 0dc597d20ee61a..a22c96ed5a3973 100644 --- a/caffe2/operators/pack_rnn_sequence_op.h +++ b/caffe2/operators/pack_rnn_sequence_op.h @@ -69,7 +69,7 @@ class PackRNNSequenceOpBase : public Operator { math::Set(output->numel(), 0, output_data, &context_); int32_t offset = 0; - for (int c = 0; c < cols; c++) { + for (const auto c : c10::irange(cols)) { for (int r = 0; r < lengths_vec[c]; r++) { auto input_offset = Forward ? (offset + r) : (r * cols + c); auto output_offset = Forward ? 
(r * cols + c) : (offset + r); diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h index fa8a27c39605ed..78ca127b433a27 100644 --- a/caffe2/operators/partition_ops.h +++ b/caffe2/operators/partition_ops.h @@ -48,7 +48,7 @@ class GatherByKeyOp : public Operator { CAFFE_ENFORCE_GE(outShape.size(), 1); auto totalSize = in0Shape[0]; auto meta = Input(1).dtype(); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { const auto& input = Input(i); CAFFE_ENFORCE(meta == input.dtype()); CAFFE_ENFORCE_GE(input.dim(), 1); @@ -66,7 +66,7 @@ class GatherByKeyOp : public Operator { const auto blockSize = outTensor->size_from_dim(1); inputDatas_.resize(numPartitions); - for (int i = 0; i < numPartitions; ++i) { + for (const auto i : c10::irange(numPartitions)) { inputDatas_[i] = static_cast(Input(i + 1).raw_data()); } inStartOffsets_.assign(numPartitions, 0); @@ -127,7 +127,7 @@ class PartitionOpBase : public Operator { int64_t size = main_input.numel(); const Index* data = main_input.template data(); counts_.assign(partitions, 0); - for (int64_t p = 0; p < size; p++) { + for (const auto p : c10::irange(size)) { int shard = moduloPartition(data[p], partitions); ++counts_[shard]; } @@ -136,7 +136,7 @@ class PartitionOpBase : public Operator { block_sizes_.resize(inputSize); metas_.resize(inputSize); out_datas_.resize(OutputSize()); - for (int i = mainInputIndex; i < inputSize; ++i) { + for (const auto i : c10::irange(mainInputIndex, inputSize)) { auto& input = Input(i); if (i > mainInputIndex) { CAFFE_ENFORCE_GE( @@ -145,7 +145,7 @@ class PartitionOpBase : public Operator { "Prefix of extra input's shape must match main input's shape, ", "input: ", i); - for (int j = 0; j < main_input.dim(); ++j) { + for (const auto j : c10::irange(main_input.dim())) { CAFFE_ENFORCE_GE( input.size(j), main_input.size(j), @@ -162,7 +162,7 @@ class PartitionOpBase : public Operator { // shape = partition_size + suffix of input 
dims vector shape( input.sizes().begin() + main_input.dim() - 1, input.sizes().end()); - for (int j = 0; j < partitions; ++j) { + for (const auto j : c10::irange(partitions)) { int out_idx = i + j * inputSize; auto output = Output(out_idx); shape[0] = counts_[j]; @@ -172,7 +172,7 @@ class PartitionOpBase : public Operator { } counts_.assign(partitions, 0); - for (int64_t p = 0; p < size; p++) { + for (const auto p : c10::irange(size)) { int shard = moduloPartition(data[p], partitions); int64_t idx = counts_[shard]++; @@ -254,7 +254,7 @@ class LengthsPartitionOp : public PartitionOpBase { if (partitions == 1) { // Specialization when partitions == 1 which just becomes a copy. - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { auto& input = Input(i); auto& output = *Output(i); output.ResizeLike(input); @@ -279,14 +279,14 @@ class LengthsPartitionOp : public PartitionOpBase { int64_t elements = length_input.numel(); const int32_t* lengths_data = length_input.template data(); out_length_.resize(partitions); - for (int i = 0; i < partitions; ++i) { + for (const auto i : c10::irange(partitions)) { auto& output = *Output(i * InputSize()); output.Resize(elements); out_length_[i] = output.template mutable_data(); } int total_length = 0; - for (int i = 0; i < elements; ++i) { + for (const auto i : c10::irange(elements)) { total_length += lengths_data[i]; } CAFFE_ENFORCE( @@ -294,8 +294,8 @@ class LengthsPartitionOp : public PartitionOpBase { "Total length is not matching to the number of elements"); int index = 0; - for (int i = 0; i < elements; ++i) { - for (int j = 0; j < partitions; ++j) { + for (const auto i : c10::irange(elements)) { + for (const auto j : c10::irange(partitions)) { out_length_[j][i] = 0; } for (int j = 0; j < lengths_data[i]; ++j, ++index) { diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h index d1fdc65369706f..9dcf0021f1c2e9 100644 --- 
a/caffe2/operators/piecewise_linear_transform_op.h +++ b/caffe2/operators/piecewise_linear_transform_op.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(PiecewiseLinearTransform); @@ -61,7 +62,7 @@ class PiecewiseLinearTransformOp final : public Operator { const int64_t num_bounds_per_group, const int64_t num_group) { const T* start = bounds; - for (int64_t i = 0; i < num_group; i++) { + for (const auto i : c10::irange(num_group)) { if (!std::is_sorted(start, start + num_bounds_per_group)) { return false; } @@ -153,11 +154,11 @@ class PiecewiseLinearTransformOp final : public Operator { &bounds, &slopes, &intercepts, &num_func_per_group, &num_group); CAFFE_ENFORCE_EQ(num_group, M); - for (int64_t j = 0; j < M; ++j) { + for (const auto j : c10::irange(M)) { const T* bounds_group = bounds + j * (num_func_per_group + 1); const T* slopes_group = slopes + j * num_func_per_group; const T* intercepts_group = intercepts + j * num_func_per_group; - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { Ydata[i * M + j] = PiecewiseLinearTransform( Xdata[i * M + j], bounds_group, @@ -192,12 +193,12 @@ class PiecewiseLinearTransformOp final : public Operator { CAFFE_ENFORCE_EQ(num_group, 1); if (M == 1) { - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { Ydata[i] = PiecewiseLinearTransform( Xdata[i], bounds, slopes, intercepts, num_func_per_group); } } else { - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { Ydata[i * M + 1] = PiecewiseLinearTransform( Xdata[i * M + 1], bounds, slopes, intercepts, num_func_per_group); Ydata[i * M] = 1.0f - Ydata[i * M + 1]; diff --git a/caffe2/operators/pool_op.h b/caffe2/operators/pool_op.h index 77d0f0659eb14a..855d69404e6a9a 100644 --- a/caffe2/operators/pool_op.h +++ b/caffe2/operators/pool_op.h @@ -20,12 +20,12 @@ class PoolOp final : 
public ConvPoolOpBase { explicit PoolOp(Args&&... args) : ConvPoolOpBase(std::forward(args)...), functor_(*this) { const int kernel_size = kernel_.size(); - for (int i = 0; i < kernel_size; ++i) { + for (const auto i : c10::irange(kernel_size)) { CAFFE_ENFORCE_EQ( dilation_[i], 1, "Pooling op does not support dilation right now."); } if (!global_pooling_) { - for (int i = 0; i < kernel_size; ++i) { + for (const auto i : c10::irange(kernel_size)) { CAFFE_ENFORCE( pads_[i] < kernel_[i] && pads_[i + kernel_size] < kernel_[i], "Pad should be smaller than kernel."); diff --git a/caffe2/operators/prepend_dim_op.h b/caffe2/operators/prepend_dim_op.h index cf425942a5100b..396abc647efb66 100644 --- a/caffe2/operators/prepend_dim_op.h +++ b/caffe2/operators/prepend_dim_op.h @@ -35,7 +35,7 @@ class PrependDimOp : public Operator { actual_new_shape[0] = dim_size_; actual_new_shape[1] = input.size(0) / dim_size_; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 1; i < input.sizes().size(); ++i) { + for (const auto i : c10::irange(1, input.sizes().size())) { actual_new_shape[i + 1] = input.size(i); } output->Resize(actual_new_shape); diff --git a/caffe2/operators/quant_decode_op.h b/caffe2/operators/quant_decode_op.h index 560f6abd3f6d42..1eeb4f2db8ad2e 100644 --- a/caffe2/operators/quant_decode_op.h +++ b/caffe2/operators/quant_decode_op.h @@ -1,6 +1,8 @@ #ifndef QUANT_DECODE_OP_H_ #define QUANT_DECODE_OP_H_ + +#include #include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" @@ -34,7 +36,7 @@ void Decode( } int sz = output->numel(); - for (int i = 0; i < sz; i++) { + for (const auto i : c10::irange(sz)) { DCHECK_LE(*code_ptr, cb_size); *out_ptr++ = cb_ptr[*code_ptr++]; } @@ -116,7 +118,7 @@ class QuantDecodeOp final : public Operator { const auto& codebook = Input(0); CAFFE_ENFORCE(codebook.template IsType(), codebook.dtype().name()); - for (int i = 0; i < OutputSize(); i++) { + for (const auto i : c10::irange(OutputSize())) { auto& ci = 
Input(i + 1); auto* co = Output(i); @@ -157,7 +159,7 @@ class QuantDecodeGradientOp final : public Operator { auto* gradient_ptr = gradient->template mutable_data(); std::fill(gradient_ptr, gradient_ptr + gradient->numel(), 0); - for (int i = 0; i < num_code_tensors; i++) { + for (const auto i : c10::irange(num_code_tensors)) { auto& codes_i = Input(i + 1); auto& output_gradient_i = Input(i + num_code_tensors + 1); DecodeGeneral(codebook, codes_i, &output_gradient_i, gradient, false); diff --git a/caffe2/operators/quantile_op.h b/caffe2/operators/quantile_op.h index 165addc137c3c8..e20ea1d07be419 100644 --- a/caffe2/operators/quantile_op.h +++ b/caffe2/operators/quantile_op.h @@ -42,7 +42,7 @@ class QuantileOp final : public Operator { auto& input_zero = Input(0); int64_t numel = input_zero.numel(); - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), @@ -116,9 +116,9 @@ class QuantileOp final : public Operator { void GetRangeFromInputs(T* lo, T* hi) { *hi = std::numeric_limits::lowest(); *lo = std::numeric_limits::max(); - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto* input = Input(i).template data(); - for (int j = 0; j < Input(i).numel(); j++) { + for (const auto j : c10::irange(Input(i).numel())) { const T val = abs_ ? std::abs(input[j]) : input[j]; if (*hi < val) { *hi = val; @@ -133,9 +133,9 @@ class QuantileOp final : public Operator { template int64_t CountLowerEq(const T& thd) { int64_t count = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto* input = Input(i).template data(); - for (int j = 0; j < Input(i).numel(); j++) { + for (const auto j : c10::irange(Input(i).numel())) { const T val = abs_ ? 
std::abs(input[j]) : input[j]; if (val <= thd) { count++; diff --git a/caffe2/operators/quantized/int8_concat_op.h b/caffe2/operators/quantized/int8_concat_op.h index b501bc128fae2e..3db1bcde780c84 100644 --- a/caffe2/operators/quantized/int8_concat_op.h +++ b/caffe2/operators/quantized/int8_concat_op.h @@ -46,10 +46,10 @@ class Int8ConcatOp final : public Operator { if (this->template GetSingleArgument("order", "") == "NHWC") { CHECK_EQ(Y_dims.size(), 4); } - for (auto i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { const auto& Xi = Inputs()[i]->template Get(); CHECK_EQ(Xi.t.dim(), Y_dims.size()); - for (auto j = 0; j < Y_dims.size(); ++j) { + for (const auto j : c10::irange(Y_dims.size())) { if (j != axis_) { CHECK_EQ(Xi.t.size(j), Y_dims[j]); } @@ -61,7 +61,7 @@ class Int8ConcatOp final : public Operator { int after = X0.t.size_from_dim(axis_ + 1); const auto C_total = Y_dims[axis_]; size_t C_offset = 0; - for (auto i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto& Xi = Inputs()[i]->template Get(); // Copy the NxHxWxC input slice to NxHxWx[C_offset:C_offset + C]. 
const auto Ci = Xi.t.size(axis_); diff --git a/caffe2/operators/quantized/int8_dequantize_op.h b/caffe2/operators/quantized/int8_dequantize_op.h index eeecf915454248..ae0787e802d735 100644 --- a/caffe2/operators/quantized/int8_dequantize_op.h +++ b/caffe2/operators/quantized/int8_dequantize_op.h @@ -18,7 +18,7 @@ void Int8Dequantize( const int64_t N, const float X_scale, const int32_t X_offset) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { out[i] = (static_cast(in[i]) - X_offset) * X_scale; } } diff --git a/caffe2/operators/quantized/int8_given_tensor_fill_op.h b/caffe2/operators/quantized/int8_given_tensor_fill_op.h index bd55c9a5481439..8080ca78b344dc 100644 --- a/caffe2/operators/quantized/int8_given_tensor_fill_op.h +++ b/caffe2/operators/quantized/int8_given_tensor_fill_op.h @@ -40,7 +40,7 @@ class Int8GivenTensorFillOp final : public Operator { {static_cast(source_values.size())}, at::dtype().device(CPU)); uint8_t* values_data = values_.template mutable_data(); - for (int i = 0; i < source_values.size(); i++) { + for (const auto i : c10::irange(source_values.size())) { values_data[i] = static_cast(source_values[i]); } } @@ -92,7 +92,7 @@ class Int8GivenIntTensorFillOp final : public Operator { {static_cast(source_values.size())}, at::dtype().device(CPU)); auto* values_data = values_.template mutable_data(); - for (int i = 0; i < source_values.size(); i++) { + for (const auto i : c10::irange(source_values.size())) { values_data[i] = static_cast(source_values[i]); } } diff --git a/caffe2/operators/quantized/int8_resize_nearest_op.h b/caffe2/operators/quantized/int8_resize_nearest_op.h index 102cc35fdaaaf5..94adaa8dc23024 100644 --- a/caffe2/operators/quantized/int8_resize_nearest_op.h +++ b/caffe2/operators/quantized/int8_resize_nearest_op.h @@ -54,10 +54,10 @@ class Int8ResizeNearestOp final : public Operator { const uint8_t* Xdata = X.t.data(); uint8_t* Ydata = Y->t.mutable_data(); - for (int n = 0; n < N; ++n) { - for (int y = 0; 
y < OH; ++y) { + for (const auto n : c10::irange(N)) { + for (const auto y : c10::irange(OH)) { const int in_y = std::min((int)(y / height_scale_), (IH - 1)); - for (int x = 0; x < OW; ++x) { + for (const auto x : c10::irange(OW)) { const int in_x = std::min((int)(x / width_scale_), (IW - 1)); std::memcpy( &Ydata[C * x + C * OW * y + C * OW * OH * n], diff --git a/caffe2/operators/quantized/int8_roi_align_op.h b/caffe2/operators/quantized/int8_roi_align_op.h index 710476e0520176..c941e1d5b99827 100644 --- a/caffe2/operators/quantized/int8_roi_align_op.h +++ b/caffe2/operators/quantized/int8_roi_align_op.h @@ -44,13 +44,13 @@ void pre_calc_for_bilinear_interpolate( int pre_calc_index = 0; // boltnn use a smaller multiplier here. Sometimes w will shrink to 0. const float w_multiplier = 255.0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { + for (const auto ph : c10::irange(pooled_height)) { + for (const auto pw : c10::irange(pooled_width)) { + for (const auto iy : c10::irange(iy_upper)) { const float yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { + for (const auto ix : c10::irange(ix_upper)) { const float xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); @@ -152,7 +152,7 @@ void ROIAlignForward( int n_rois = nthreads / channels / pooled_width / pooled_height; - for (int n = 0; n < n_rois; n++) { + for (const auto n : c10::irange(n_rois)) { int index_n = n * channels * pooled_width * pooled_height; // roi could have 4 or 5 columns @@ -224,19 +224,19 @@ void ROIAlignForward( const uint8_t* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { + for (const auto ph : 
c10::irange(pooled_height)) { + for (const auto pw : c10::irange(pooled_width)) { vector acc_buffer(channels, 0); - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { + for (const auto iy : c10::irange(roi_bin_grid_h)) { + for (const auto ix : c10::irange(roi_bin_grid_w)) { PreCalc pc = pre_calc[pre_calc_index]; const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1; const uint8_t* data_2 = offset_bottom_data + channels * pc.pos2; const uint8_t* data_3 = offset_bottom_data + channels * pc.pos3; const uint8_t* data_4 = offset_bottom_data + channels * pc.pos4; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { acc_buffer[c] += (uint32_t)(pc.w1) * (uint32_t)(data_1[c]); acc_buffer[c] += (uint32_t)(pc.w2) * (uint32_t)(data_2[c]); acc_buffer[c] += (uint32_t)(pc.w3) * (uint32_t)(data_3[c]); @@ -251,7 +251,7 @@ void ROIAlignForward( } int index_nhw = index_n + (ph * pooled_width + pw) * channels; uint8_t* out_ptr = top_data + index_nhw; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { int32_t a_mul = MultiplyByQuantizedMultiplierSmallerThanOne( acc_buffer[c], Y_multiplier, Y_shift) + y_offset; diff --git a/caffe2/operators/quantized/int8_test_utils.h b/caffe2/operators/quantized/int8_test_utils.h index f4a96be75cd834..a7bc2421324188 100644 --- a/caffe2/operators/quantized/int8_test_utils.h +++ b/caffe2/operators/quantized/int8_test_utils.h @@ -77,7 +77,7 @@ inline std::unique_ptr biasdq(const int8::Int8TensorCPU& XQ) { #define EXPECT_TENSOR_EQ(_YA, _YE) \ do { \ EXPECT_TRUE((_YA).sizes() == (_YE).sizes()); \ - for (auto i = 0; i < (_YA).numel(); ++i) { \ + for (const auto i : c10::irange((_YA).numel())) { \ EXPECT_FLOAT_EQ((_YA).data()[i], (_YE).data()[i]); \ } \ } while (0); @@ -85,7 +85,7 @@ inline std::unique_ptr biasdq(const int8::Int8TensorCPU& XQ) { #define EXPECT_TENSOR_APPROX_EQ(_YA, _YE, _tol) \ do { \ EXPECT_TRUE((_YA).sizes() == 
(_YE).sizes()); \ - for (auto i = 0; i < (_YA).numel(); ++i) { \ + for (const auto i : c10::irange((_YA).numel())) { \ EXPECT_NEAR((_YA).data()[i], (_YE).data()[i], (_tol)); \ } \ } while (0); diff --git a/caffe2/operators/reduce_front_back_max_ops.h b/caffe2/operators/reduce_front_back_max_ops.h index 8f064954d98581..25486dcade4265 100644 --- a/caffe2/operators/reduce_front_back_max_ops.h +++ b/caffe2/operators/reduce_front_back_max_ops.h @@ -35,7 +35,7 @@ class MaxReduceDimsOp final : public Operator { int start_index = FIRSTDIMS ? num_reduce_dims_ : 0; int end_index = FIRSTDIMS ? X.dim() : X.dim() - num_reduce_dims_; - for (int i = start_index; i < end_index; ++i) { + for (const auto i : c10::irange(start_index, end_index)) { output_shape.push_back(X.sizes()[i]); } auto* Y = Output(0, output_shape, at::dtype()); diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.h b/caffe2/operators/reduce_front_back_sum_mean_ops.h index d99efc335e8c34..869e37687ce035 100644 --- a/caffe2/operators/reduce_front_back_sum_mean_ops.h +++ b/caffe2/operators/reduce_front_back_sum_mean_ops.h @@ -35,7 +35,7 @@ class SumReduceDimsOp final : public Operator { vector output_shape; int start_index = FIRSTDIMS ? num_reduce_dims_ : 0; int end_index = FIRSTDIMS ? 
X.dim() : X.dim() - num_reduce_dims_; - for (int i = start_index; i < end_index; ++i) { + for (const auto i : c10::irange(start_index, end_index)) { output_shape.push_back(X.sizes()[i]); } auto* Y = Output(0, output_shape, at::dtype()); diff --git a/caffe2/operators/reduce_ops.h b/caffe2/operators/reduce_ops.h index 4fba06a528b051..24d0c1a6fcd3d4 100644 --- a/caffe2/operators/reduce_ops.h +++ b/caffe2/operators/reduce_ops.h @@ -50,7 +50,7 @@ class ReduceOp final : public Operator { std::vector output_dims; output_dims.reserve(ndim); std::size_t cur_axis = 0; - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { if (cur_axis < axes_.size() && i == axes_[cur_axis]) { if (keep_dims_) { output_dims.push_back(1); diff --git a/caffe2/operators/reducer_functors.h b/caffe2/operators/reducer_functors.h index 762e8c41a9ffa9..0159e030d26371 100644 --- a/caffe2/operators/reducer_functors.h +++ b/caffe2/operators/reducer_functors.h @@ -50,7 +50,7 @@ class SumRangeReducerGradient { const T* /*data_out*/, // unused Context* context) { // do we have some op that does it smartly with minimum number of memcpy? 
- for (int64_t i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { context->template CopySameDevice( block_size, segment_grad, data_grad + block_size * i); } @@ -83,13 +83,13 @@ class LogSumExpRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { max_value = std::max(max_value, in[i * block_size + j]); } T scaled_exp_sum = 0; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { scaled_exp_sum += std::exp(in[i * block_size + j] - max_value); } *(out++) = std::log(scaled_exp_sum) + max_value; @@ -109,10 +109,10 @@ class LogSumExpRangeReducerGradient { const T* data_in, // I const T* data_out, // O Context* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); const T offset = *(data_out++); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; data_grad[idx] = out_grad * std::exp(data_in[idx] - offset); } @@ -145,13 +145,13 @@ class LogMeanExpRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { max_value = std::max(max_value, in[i * block_size + j]); } T scaled_exp_sum = 0; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { scaled_exp_sum += std::exp(in[i * block_size + j] - max_value); } scaled_exp_sum /= blocks; @@ -171,10 +171,10 @@ class LogMeanExpRangeReducerGradient { const T* data_in, // I const T* data_out, // O Context* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const 
auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); const T offset = *(data_out++); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks; } @@ -207,9 +207,9 @@ class MeanRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T avg_value = 0; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { avg_value += in[i * block_size + j] / blocks; } *(out++) = avg_value; @@ -229,9 +229,9 @@ class MeanRangeReducerGradient { const T* /*data_out*/, // O Context* /*context*/) { const auto in_grad = 1.0 / blocks; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; data_grad[idx] = out_grad * in_grad; } @@ -266,9 +266,9 @@ class MaxRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { max_value = std::max(max_value, in[i * block_size + j]); } *(out++) = max_value; @@ -289,10 +289,10 @@ class MaxRangeReducerGradient { Context* /*context*/) { std::memset( static_cast(data_grad), 0, blocks * block_size * sizeof(T)); - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); const T out = data_out[j]; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; if (out == data_in[idx]) { data_grad[idx] = out_grad; @@ -813,7 +813,7 @@ class MaxReducerGradient : public 
BaseReducerGradient { int64_t /*offset*/, Context* /*context*/, const int /*length*/) { - for (int64_t i = 0; i < meta.block_size; ++i) { + for (const auto i : c10::irange(meta.block_size)) { data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0; } } diff --git a/caffe2/operators/reduction_ops.h b/caffe2/operators/reduction_ops.h index 896ae657a17ee0..1e371fa8e9dca5 100644 --- a/caffe2/operators/reduction_ops.h +++ b/caffe2/operators/reduction_ops.h @@ -164,7 +164,7 @@ class MaxReductionOp : public Operator { &context_); } else { const int input_size = N * M; - for (int i = 0; i < batch_size; ++i) { + for (const auto i : c10::irange(batch_size)) { math::ColwiseMax( M, N, diff --git a/caffe2/operators/remove_data_blocks_op.h b/caffe2/operators/remove_data_blocks_op.h index 5f409bf08dc728..bb97582e1dca64 100644 --- a/caffe2/operators/remove_data_blocks_op.h +++ b/caffe2/operators/remove_data_blocks_op.h @@ -40,7 +40,7 @@ class RemoveDataBlocksOp final : public Operator { const auto* ind_ptr = indices.template data(); std::vector ind_vec; - for (int64_t i = 0; i < indices_size; i++) { + for (const auto i : c10::irange(indices_size)) { ind_vec.push_back(ind_ptr[i]); } std::sort(ind_vec.begin(), ind_vec.end()); @@ -60,7 +60,7 @@ class RemoveDataBlocksOp final : public Operator { ind_vec.insert(ind_vec.begin(), -1); int64_t ind_vec_size = ind_vec.size(); - for (auto i = 0; i < ind_vec_size; i++) { + for (const auto i : c10::irange(ind_vec_size)) { int64_t interval_start = ind_vec[i] + 1; int64_t interval_end = (i == ind_vec_size - 1) ? 
outer_size : ind_vec[i + 1]; diff --git a/caffe2/operators/reshape_op.h b/caffe2/operators/reshape_op.h index 57b5174467386a..765921e1eaf3cd 100644 --- a/caffe2/operators/reshape_op.h +++ b/caffe2/operators/reshape_op.h @@ -97,7 +97,7 @@ class ReshapeOp : public Operator { } int unknown_idx = -1; - for (int i = 0; i < actual_new_shape.size(); ++i) { + for (const auto i : c10::irange(actual_new_shape.size())) { const auto dim = actual_new_shape[i]; if (dim == -1) { CAFFE_ENFORCE( @@ -153,7 +153,7 @@ class ReshapeOp : public Operator { old_shape->Resize(input.sizes().size()); T* old_shape_data = old_shape->template mutable_data(); std::vector old_shape_vector(input.sizes().begin(), input.sizes().end()); - for (int i = 0; i < old_shape_vector.size(); ++i) { + for (const auto i : c10::irange(old_shape_vector.size())) { old_shape_data[i] = old_shape_vector[i]; } diff --git a/caffe2/operators/reverse_packed_segs_op.h b/caffe2/operators/reverse_packed_segs_op.h index ac6dfdfc53d6d3..1d92e81f0a93cd 100644 --- a/caffe2/operators/reverse_packed_segs_op.h +++ b/caffe2/operators/reverse_packed_segs_op.h @@ -62,7 +62,7 @@ class ReversePackedSegsOp final : public Operator { context_.FinishDeviceComputation(); T* rev_data_ptr = output->template mutable_data(); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { const auto& seg_length = lengths_host[i]; CAFFE_ENFORCE_LE(seg_length, max_length); int64_t j = 0; diff --git a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h index ebcf50e397d367..13f542ca53099c 100644 --- a/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h +++ b/caffe2/operators/rnn/recurrent_network_blob_fetcher_op.h @@ -32,7 +32,7 @@ class RecurrentNetworkBlobFetcherOp final : public Operator { std::vector blob_names_vector = {}; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int64_t i = 0; i < stepWorkspaces.size(); i++) { + for (const auto i 
: c10::irange(stepWorkspaces.size())) { Workspace* currentStepWorkspace = stepWorkspaces[i].get(); std::vector blob_names = currentStepWorkspace->LocalBlobs(); diff --git a/caffe2/operators/rnn/recurrent_network_executor.h b/caffe2/operators/rnn/recurrent_network_executor.h index eecccf7774926f..179bb7c0b37b19 100644 --- a/caffe2/operators/rnn/recurrent_network_executor.h +++ b/caffe2/operators/rnn/recurrent_network_executor.h @@ -38,7 +38,7 @@ class RecurrentNetworkExecutorBase { recurrent_input_map_(recurrent_input_map), timestep_blob_(timestep_blob) { const bool net_def_has_device_option = step_net_def_.has_device_option(); - for (int i = 0; i < step_net_def_.op_size(); i++) { + for (const auto i : c10::irange(step_net_def_.op_size())) { if (net_def_has_device_option) { // In the case when net def specifies device option, final device option // will be equal to merge of operator and net def device options, with @@ -86,7 +86,7 @@ class RecurrentNetworkExecutorBase { for (auto& rnn_op : timestep_ops_template_) { rnn_op.has_timestep_blob = false; const OperatorDef& op = step_net_def_.op(rnn_op.order); - for (int i = 0; i < op.input_size(); i++) { + for (const auto i : c10::irange(op.input_size())) { if (op.input(i) == timestep_blob_) { rnn_op.has_timestep_blob = true; break; @@ -137,7 +137,7 @@ class RecurrentNetworkExecutorBase { if (rnn_op.has_timestep_blob) { OperatorDef op_copy = step_net_def_.op(rnn_op.order); - for (int i = 0; i < op_copy.input_size(); i++) { + for (const auto i : c10::irange(op_copy.input_size())) { if (op_copy.input(i) == timestep_blob_) { op_copy.set_input(i, this_timestep_blob); } @@ -283,7 +283,7 @@ class RecurrentNetworkExecutorBase { int opidx, std::vector& rnn_ops, std::unordered_set* dep_ops) { - for (int i = 0; i < rnn_ops.size(); i++) { + for (const auto i : c10::irange(rnn_ops.size())) { if (i == opidx) { continue; } @@ -315,7 +315,7 @@ class RecurrentNetworkExecutorBase { * for each timestep. 
*/ void CalculateInternalDependencies() { - for (int i = 0; i < step_net_def_.op_size(); i++) { + for (const auto i : c10::irange(step_net_def_.op_size())) { timestep_ops_template_.push_back(RNNNetOperator(step_net_def_.op(i), i)); } // Then see which outputs appear as inputs, and those are diff --git a/caffe2/operators/rnn/recurrent_network_op.h b/caffe2/operators/rnn/recurrent_network_op.h index de82c9fa3643f3..803606d061bb33 100644 --- a/caffe2/operators/rnn/recurrent_network_op.h +++ b/caffe2/operators/rnn/recurrent_network_op.h @@ -103,7 +103,7 @@ void repeatCopy( T* dst, Context* context) { // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < repeat_n; ++i) { + for (const auto i : c10::irange(repeat_n)) { context->template CopySameDevice(n, src, dst + i * n); } } @@ -228,7 +228,7 @@ class RecurrentNetworkOp final : public Operator { CAFFE_ENFORCE_EQ(states.size(), inputs.size(), "states/inputs mismatch"); std::vector ris; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (auto i = 0; i < states.size(); ++i) { + for (const auto i : c10::irange(states.size())) { // States need to be "global" (since they are shared between // forward and backward). sharedWs->CreateBlob(states[i]); @@ -254,7 +254,7 @@ class RecurrentNetworkOp final : public Operator { dst.size() == offset.size(), "alias_dst/alias_offset mismatch"); std::vector aliases; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (auto i = 0; i < src.size(); ++i) { + for (const auto i : c10::irange(src.size())) { detail::OffsetAlias oc; oc.src = src[i]; oc.dst = dst[i]; @@ -343,7 +343,7 @@ class RecurrentNetworkOp final : public Operator { stepWorkspaces.resize(num_workspaces_on_fwd_only); } - for (auto t = 0; t < seqLen; ++t) { + for (const auto t : c10::irange(seqLen)) { auto& currentStepWorkspace = (has_backward_pass ? 
stepWorkspaces[t] : stepWorkspaces[t % num_workspaces_on_fwd_only]); @@ -472,7 +472,7 @@ class RecurrentNetworkGradientOp final : public Operator { } void renameOpInputOutput(std::string from_name, std::string to_name) { - for (int j = 0; j < stepNetDef_.op_size(); j++) { + for (const auto j : c10::irange(stepNetDef_.op_size())) { auto* op = stepNetDef_.mutable_op(j); for (int i = 0; i < op->input_size(); i++) { if (op->input(i) == from_name) { @@ -498,7 +498,7 @@ class RecurrentNetworkGradientOp final : public Operator { " != ", param_grads.size()); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < param.size(); ++i) { + for (const auto i : c10::irange(param.size())) { detail::Param p; // Forward inputs come after [outputs_with_grads] gradient inputs p.param = operator_def.input(param[i] + gradInputs_.size()); @@ -526,17 +526,17 @@ class RecurrentNetworkGradientOp final : public Operator { this->template GetRepeatedArgument("alias_offset"); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (auto i = 0; i < recurrent.size(); ++i) { + for (const auto i : c10::irange(recurrent.size())) { detail::RecurrentGradient rg; rg.param = recurrent[i]; rg.grad = remappedName(recurrent[i] + "_grad"); - for (int j = 0; j < alias_src.size(); ++j) { + for (const auto j : c10::irange(alias_src.size())) { if (alias_src[j] != recurrent[i]) { continue; } int idx = -1; - for (int k = 0; k < gradInputs_.size(); ++k) { + for (const auto k : c10::irange(gradInputs_.size())) { if (gradInputs_[k] == j) { idx = k; } @@ -575,7 +575,7 @@ class RecurrentNetworkGradientOp final : public Operator { "", &links); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < links.size(); i++) { + for (const auto i : c10::irange(links.size())) { links[i] = remappedLink(links[i]); } return links; @@ -715,7 +715,7 @@ class RecurrentNetworkGradientOp final : public Operator { // This code assumes that there are several inputs // sequences. 
Actually it is not supported by the rest of the code, // and numSequences_ is a constant, equal to 1. - for (int i = 0; i < numSequences_; ++i) { + for (const auto i : c10::irange(numSequences_)) { // Offseting as the first gradInputs_.size() inputs of the op // are from GO. Then all I(0..N). const int gradientInputIndex = i + gradInputs_.size(); @@ -790,7 +790,7 @@ class RecurrentNetworkGradientOp final : public Operator { CAFFE_ENFORCE_EQ(recurrentInputIds_.size(), recurrentGradients_.size()); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < recurrentInputIds_.size(); ++i) { + for (const auto i : c10::irange(recurrentInputIds_.size())) { // See GetRecurrentNetworkGradient to understand offseting here // Outputs of the gradient are inputs of the forward pass. // So we need to offset on all inputs that go before recurrent diff --git a/caffe2/operators/rowmul_op.h b/caffe2/operators/rowmul_op.h index ef866272cdb441..cf0339e732ec82 100644 --- a/caffe2/operators/rowmul_op.h +++ b/caffe2/operators/rowmul_op.h @@ -32,9 +32,9 @@ class RowMulOp : public Operator { "Length of w should be equal to the first dim of mat"); auto block_size = mat.size_from_dim(1); - for (int i = 0; i < w.numel(); i++) { + for (const auto i : c10::irange(w.numel())) { size_t offset = i * block_size; - for (int j = 0; j < block_size; j++) { + for (const auto j : c10::irange(block_size)) { output_data[offset + j] = mat_data[offset + j] * w_data[i]; } } @@ -60,10 +60,10 @@ class ReduceTailSumOp : public Operator { T* output_data = output->template mutable_data(); const T* mat_data = mat.template data(); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { output_data[i] = 0; size_t offset = i * block_size; - for (int j = 0; j < block_size; j++) { + for (const auto j : c10::irange(block_size)) { output_data[i] += mat_data[offset + j]; } } diff --git a/caffe2/operators/scale_blobs_op.h b/caffe2/operators/scale_blobs_op.h index dce69e31555955..af7d80083408de 
100644 --- a/caffe2/operators/scale_blobs_op.h +++ b/caffe2/operators/scale_blobs_op.h @@ -20,7 +20,7 @@ class ScaleBlobsOp final : public Operator { bool DoRunWithType() { int batchSize = InputSize(); - for (int i = 0; i < batchSize; ++i) { + for (const auto i : c10::irange(batchSize)) { const auto& X = Input(i); auto* Y = Output(i, X.sizes(), at::dtype()); math::Scale( @@ -34,7 +34,7 @@ class ScaleBlobsOp final : public Operator { } bool RunOnDevice() override { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { auto& input = this->template Input(i, CPU); auto* output = this->template Output(i, CPU); output->ResizeLike(input); diff --git a/caffe2/operators/segment_reduction_op.h b/caffe2/operators/segment_reduction_op.h index f126b109ea9b4e..b4075813db0943 100644 --- a/caffe2/operators/segment_reduction_op.h +++ b/caffe2/operators/segment_reduction_op.h @@ -2,6 +2,7 @@ #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -335,7 +336,7 @@ class AbstractReduceFrontOrBackOp : public Operator { const int num_blocks = block_size > 0 ? 
data.numel() / block_size : 0; Reducer r(ctx, out, &context_); - for (int64_t i = 0; i < num_blocks; ++i) { + for (const auto i : c10::irange(num_blocks)) { r.template process( ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_); } @@ -406,7 +407,7 @@ class AbstractReduceFrontOrBackGradientOp : public Operator { T* out = data_grads->template mutable_data(); ReducerGradient r(ctx, r_grad, &context_); - for (int64_t i = 0; i < block_num; ++i) { + for (const auto i : c10::irange(block_num)) { r.template fillGrad( ctx, out + block_size * i, @@ -1070,7 +1071,7 @@ class AbstractUnsortedSegmentOp : public Operator { K = num_segments_; } else { K = 0; - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { K = std::max(K, s_ids[i] + 1); } } @@ -1086,11 +1087,11 @@ class AbstractUnsortedSegmentOp : public Operator { reducers_.clear(); reducers_.reserve(K); - for (int64_t i = 0; i < K; ++i) { + for (const auto i : c10::irange(K)) { reducers_.emplace_back(ctx, out + out_block_size * i, &context_); } - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto s_id = s_ids[i]; CAFFE_ENFORCE( 0 <= s_id && s_id < K, @@ -1114,7 +1115,7 @@ class AbstractUnsortedSegmentOp : public Operator { ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_); } - for (int64_t i = 0; i < K; ++i) { + for (const auto i : c10::irange(K)) { reducers_[i].template finish(ctx, &context_); } // call reducers destructors (if there is any) @@ -1188,7 +1189,7 @@ class AbstractUnsortedSegmentGradientOp : public Operator { if (ReducerGradient::computeLength()) { segment_length_.resize(K, 0); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto s_id = s_ids[i]; CAFFE_ENFORCE( 0 <= s_id && s_id < K, @@ -1206,7 +1207,7 @@ class AbstractUnsortedSegmentGradientOp : public Operator { reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_); } - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { 
auto s_id = s_ids[i]; if (ReducerGradient::computeLength()) { reducers_[s_id].template fillGrad( @@ -1462,7 +1463,7 @@ class AbstractLengthsOp : public Operator { TData* out = output->template mutable_data(); int64_t dataIndex = 0; - for (int64_t rangeIndex = 0; rangeIndex < outputSize; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(outputSize)) { Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_); for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { @@ -1551,7 +1552,7 @@ class AbstractLengthsGradientOp : public Operator { CAFFE_ENFORCE(segmentGradsInput.dim() > 0); CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0)); const TLengths* lengths = lengthsInput.template data(); - for (int64_t i = 0; i < numSegments; ++i) { + for (const auto i : c10::irange(numSegments)) { reducedDataSize += lengths[i]; } @@ -1580,7 +1581,7 @@ class AbstractLengthsGradientOp : public Operator { T* dataGrads = dataGradsOutput->template mutable_data(); int64_t dataIndex = 0; - for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { ReducerGradient reducer( ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; @@ -1690,7 +1691,7 @@ class AbstractLengthsWithMainInputGradientOp : public Operator { const Tembedding* data = dataInput.template data(); int64_t dataIndex = 0; - for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { ReducerGradient reducer( ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; @@ -1788,7 +1789,7 @@ class AbstractLengthsWithMainInputAndForwardOutputGradientOp const T* data = dataInput.template data(); int64_t dataIndex = 0; - for (int64_t rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) 
{ + for (const auto rangeIndex : c10::irange(numSegments)) { ReducerGradient reducer( ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; diff --git a/caffe2/operators/self_binning_histogram_op.h b/caffe2/operators/self_binning_histogram_op.h index 9ad95a47617453..3e2f303abd7df3 100644 --- a/caffe2/operators/self_binning_histogram_op.h +++ b/caffe2/operators/self_binning_histogram_op.h @@ -59,12 +59,12 @@ class SelfBinningHistogramOp final : public Operator { T max = 0; T min = 0; int64_t total_count = 0; - for (int input_idx = 0; input_idx < InputSize(); input_idx++) { + for (const auto input_idx : c10::irange(InputSize())) { const auto& x = Input(input_idx); const int64_t N = x.numel(); total_count += N; const auto* x_data = x.template data(); - for (int64_t data_idx = 0; data_idx < N; data_idx++) { + for (const auto data_idx : c10::irange(N)) { const T val = this->abs_ ? abs(x_data[data_idx]) : x_data[data_idx]; if (!first_seen) { max = val; @@ -91,7 +91,7 @@ class SelfBinningHistogramOp final : public Operator { scaled_max = min + (max - min) * RANGE_SCALING; T scaled_range = (scaled_max - min); // Avoid underflow by calculating advancement through multiplication. - for (int i = 0; i < num_edges_; i++) { + for (const auto i : c10::irange(num_edges_)) { T advancement_ratio = T(i) / num_bins_; histogram_values_data[i] = min + advancement_ratio * scaled_range; } @@ -112,7 +112,7 @@ class SelfBinningHistogramOp final : public Operator { T log_multiplier_numerator =log(scaled_max) - log(min); // Avoid underflow by: // - Calculating each advancement separately for each i. 
- for (int i = 0; i < num_edges_; i++) { + for (const auto i : c10::irange(num_edges_)) { T advancement_ratio = T(i)/num_bins_; histogram_values_data[i] = min * exp(log_multiplier_numerator * advancement_ratio); } @@ -127,11 +127,11 @@ class SelfBinningHistogramOp final : public Operator { histogram_counts_data[0] = total_count; } else { - for (int input_idx = 0; input_idx < InputSize(); input_idx++) { + for (const auto input_idx : c10::irange(InputSize())) { const auto& x = Input(input_idx); const int64_t N = x.numel(); const auto* x_data = x.template data(); - for (int64_t data_idx = 0; data_idx < N; data_idx++) { + for (const auto data_idx : c10::irange(N)) { const T val = this->abs_ ? abs(x_data[data_idx]) : x_data[data_idx]; const auto bisection_it = std::upper_bound( histogram_values_data, @@ -163,7 +163,7 @@ class SelfBinningHistogramOp final : public Operator { void CheckInputs() { const auto& input_zero = Input(0); - for (int i = 1; i < InputSize(); i++) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), diff --git a/caffe2/operators/shape_op.h b/caffe2/operators/shape_op.h index fd45cbeb190216..4c3750e520140b 100644 --- a/caffe2/operators/shape_op.h +++ b/caffe2/operators/shape_op.h @@ -34,7 +34,7 @@ class ShapeOp : public Operator { auto* output = Output(0, {numAxes}, at::dtype()); auto src = reinterpret_cast(data.sizes().data()); auto out = reinterpret_cast(output->template mutable_data()); - for (int i = 0; i < numAxes; i++) { + for (const auto i : c10::irange(numAxes)) { auto axis = axes_[i]; CAFFE_ENFORCE_LT(axis, numDims, "Axis out of range"); CAFFE_ENFORCE_GE(axis, 0, "Each axis should be non-negative"); diff --git a/caffe2/operators/sinusoid_position_encoding_op.h b/caffe2/operators/sinusoid_position_encoding_op.h index 9b75d333905791..6cf308cc21ef8b 100644 --- a/caffe2/operators/sinusoid_position_encoding_op.h +++ b/caffe2/operators/sinusoid_position_encoding_op.h @@ -51,7 +51,7 @@ class 
SinusoidPositionEncodingOp : public Operator { float max_alpha_pow = ((float)embedding_size_ - 1.0f) / (float)embedding_size_; - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { float pos = (float)idxs[i * K]; // Compute the embedding for position i, example 0 first @@ -72,7 +72,7 @@ class SinusoidPositionEncodingOp : public Operator { row_array = amplitude_ * row_array.sin().eval(); // Copy the embedding to position i in the other examples - for (int j = 1; j < K; ++j) { + for (const auto j : c10::irange(1, K)) { int base = i * K * embedding_size_; std::copy( &out[base], diff --git a/caffe2/operators/slice_op.h b/caffe2/operators/slice_op.h index 9706472315b612..973a8c57201ec2 100644 --- a/caffe2/operators/slice_op.h +++ b/caffe2/operators/slice_op.h @@ -30,7 +30,7 @@ bool SliceImpl( std::vector ends_idx(data.dim()); std::vector dst_sizes(data.dim()); - for (int i = 0; i < data.dim(); ++i) { + for (const auto i : c10::irange(data.dim())) { if (i >= starts.numel()) { starts_idx[i] = 0; ends_idx[i] = data.size(i); @@ -78,7 +78,7 @@ bool SliceImpl( } // for now only supports slicing in 1 dimension int dim = -1; - for (int i = 0; i < data.dim(); ++i) { + for (const auto i : c10::irange(data.dim())) { if (starts_idx[i] > 0 || ends_idx[i] < data.size(i)) { CAFFE_ENFORCE_EQ( dim, -1, "Currently only possible to slice in 1 dimension."); @@ -131,7 +131,7 @@ bool SliceImpl( char* src_offset_bytes = src_bytes + itemsize * src_offset; char* dst_offset_bytes = dst_bytes; - for (size_t i = 0; i < num_blocks; ++i) { + for (const auto i : c10::irange(num_blocks)) { char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; char* local_dst_offset_bytes = @@ -177,7 +177,7 @@ bool SliceImpl( return true; } - for (size_t i = 0; i < num_blocks; ++i) { + for (const auto i : c10::irange(num_blocks)) { char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; char* local_dst_offset_bytes = diff --git 
a/caffe2/operators/space_batch_op.h b/caffe2/operators/space_batch_op.h index 4c80711b2c1335..ce772275b20694 100644 --- a/caffe2/operators/space_batch_op.h +++ b/caffe2/operators/space_batch_op.h @@ -29,14 +29,14 @@ void spaceToBatch( const int input_height = input.dim32(2); const int input_width = input.dim32(3); - for (int out_b = 0; out_b < output_batch; ++out_b) { + for (const auto out_b : c10::irange(output_batch)) { const int in_b = out_b % input_batch; const int offset_w = (out_b / input_batch) % block_size; const int offset_h = (out_b / input_batch) / block_size; - for (int d = 0; d < input_depth; ++d) { - for (int out_h = 0; out_h < output_height; ++out_h) { + for (const auto d : c10::irange(input_depth)) { + for (const auto out_h : c10::irange(output_height)) { const int in_h = out_h * block_size + offset_h - pad_t; - for (int out_w = 0; out_w < output_width; ++out_w) { + for (const auto out_w : c10::irange(output_width)) { const int in_w = out_w * block_size + offset_w - pad_l; const auto output_offset = ((out_b * output_depth + d) * output_height + out_h) * @@ -80,14 +80,14 @@ void batchToSpace( const int input_width = input.dim32(3); CAFFE_ENFORCE(input_depth == output_depth); - for (int in_b = 0; in_b < input_batch; ++in_b) { + for (const auto in_b : c10::irange(input_batch)) { const int out_b = in_b % output_batch; const int offset_w = (in_b / output_batch) % block_size; const int offset_h = (in_b / output_batch) / block_size; - for (int d = 0; d < input_depth; ++d) { - for (int in_h = 0; in_h < input_height; ++in_h) { + for (const auto d : c10::irange(input_depth)) { + for (const auto in_h : c10::irange(input_height)) { const int out_h = in_h * block_size + offset_h - pad_t; - for (int in_w = 0; in_w < input_width; ++in_w) { + for (const auto in_w : c10::irange(input_width)) { const int out_w = in_w * block_size + offset_w - pad_l; if (out_h >= 0 && out_w >= 0 && out_h < output_height && out_w < output_width) { diff --git 
a/caffe2/operators/sparse_to_dense_mask_op.h b/caffe2/operators/sparse_to_dense_mask_op.h index 36a7da30781ab0..67a63e199ebc30 100644 --- a/caffe2/operators/sparse_to_dense_mask_op.h +++ b/caffe2/operators/sparse_to_dense_mask_op.h @@ -6,6 +6,7 @@ #include #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" #include "caffe2/utils/math.h" @@ -29,7 +30,7 @@ class SparseToDenseMaskBase : public Operator { auto biggest = *std::max_element(mask.begin(), mask.end()); dense_.assign(std::min(kMaxDenseSize, biggest + 1), -1); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < mask.size(); i++) { + for (const auto i : c10::irange(mask.size())) { int64_t id = mask[i]; CAFFE_ENFORCE_GE(id, 0, "Only positive IDs are allowed."); if (id >= kMaxDenseSize) { @@ -155,7 +156,7 @@ class SparseToDenseMaskOp : public SparseToDenseMaskBase { } int64_t offset = 0; - for (int r = 0; r < rows; r++) { + for (const auto r : c10::irange(rows)) { bool skippedSparseIndex = false; for (int c = 0; c < lengths_vec[r]; c++) { const auto sparse_index = sparse_indices_vec[offset + c]; @@ -272,7 +273,7 @@ class SparseToDenseMaskGradientOp : public SparseToDenseMaskBase { // SparseToDenseMask is not injective; gradient_used records // if the gradient is used for other input value from the same row vector gradient_used(cols, false); - for (int r = 0; r < rows; r++) { + for (const auto r : c10::irange(rows)) { std::fill(gradient_used.begin(), gradient_used.end(), false); for (int c = lengths_vec[r] - 1; c >= 0; c--) { int idx = this->getFeatureIdx(sparse_indices_vec[offset + c]); diff --git a/caffe2/operators/sparse_to_dense_op.h b/caffe2/operators/sparse_to_dense_op.h index 40498a9510ddd3..574454022b573f 100644 --- a/caffe2/operators/sparse_to_dense_op.h +++ b/caffe2/operators/sparse_to_dense_op.h @@ -89,7 +89,7 @@ class SparseToDenseOp final : public Operator { const auto 
block_nitems = sparse_values.size_from_dim(1); const TData* sparse_values_vec = sparse_values.template data(); - for (int32_t i = 0; i < sparse_indices_len; i++) { + for (const auto i : c10::irange(sparse_indices_len)) { const TInd idx = sparse_indices_vec[i]; CAFFE_ENFORCE_GE(idx, 0); CAFFE_ENFORCE_LT(idx, output_first_dim); diff --git a/caffe2/operators/square_root_divide_op.h b/caffe2/operators/square_root_divide_op.h index 9adaff46c41d2e..d23c808b655c67 100644 --- a/caffe2/operators/square_root_divide_op.h +++ b/caffe2/operators/square_root_divide_op.h @@ -41,7 +41,7 @@ class SquareRootDivideOp final : public Operator { auto* scalePtr = scale.template data(); auto* dataPtr = data.template data(); auto* yPtr = Y->template mutable_data(); - for (auto i = 0U; i < batchSize; ++i) { + for (const auto i : c10::irange(0U, batchSize)) { auto scale = scalePtr[i]; CAFFE_ENFORCE(scale >= 0, scale, " < 0"); auto multiplier = scale == 0 ? 1.0 : 1 / std::sqrt(scale); diff --git a/caffe2/operators/string_ops.h b/caffe2/operators/string_ops.h index 49cc322267f281..2642bd3daabe18 100644 --- a/caffe2/operators/string_ops.h +++ b/caffe2/operators/string_ops.h @@ -20,7 +20,7 @@ struct ForEach { template bool operator()(int n, const In* in, Out* out, Context* /*c*/) { - for (int i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { out[i] = functor(in[i]); } return true; diff --git a/caffe2/operators/tensor_protos_db_input.h b/caffe2/operators/tensor_protos_db_input.h index d07763ccb43d00..04cf0ec002ddbe 100644 --- a/caffe2/operators/tensor_protos_db_input.h +++ b/caffe2/operators/tensor_protos_db_input.h @@ -51,7 +51,7 @@ bool TensorProtosDBInput::Prefetch() { TensorProtos protos; CAFFE_ENFORCE(protos.ParseFromString(value_)); CAFFE_ENFORCE(protos.protos_size() == OutputSize()); - for (int i = 0; i < protos.protos_size(); ++i) { + for (const auto i : c10::irange(protos.protos_size())) { if (protos.protos(i).has_device_detail()) { 
protos.mutable_protos(i)->clear_device_detail(); } @@ -62,14 +62,14 @@ bool TensorProtosDBInput::Prefetch() { // CPU)); } } else { - for (int item_id = 0; item_id < batch_size_; ++item_id) { + for (const auto item_id : c10::irange(batch_size_)) { reader.Read(&key_, &value_); TensorProtos protos; CAFFE_ENFORCE(protos.ParseFromString(value_)); CAFFE_ENFORCE(protos.protos_size() == OutputSize()); // Note: shape_inferred_ is ignored, we'll always get dimensions from // proto - for (int i = 0; i < protos.protos_size(); ++i) { + for (const auto i : c10::irange(protos.protos_size())) { vector dims( protos.protos(i).dims().begin(), protos.protos(i).dims().end()); dims.insert(dims.begin(), batch_size_); @@ -94,7 +94,7 @@ bool TensorProtosDBInput::Prefetch() { template bool TensorProtosDBInput::CopyPrefetched() { - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { OperatorBase::template Output(i, Context::GetDeviceType()) ->CopyFrom( prefetched_blobs_[i].template Get(), /* async */ true); diff --git a/caffe2/operators/tile_op.h b/caffe2/operators/tile_op.h index 360b58a0934b95..2a960fc93e4262 100644 --- a/caffe2/operators/tile_op.h +++ b/caffe2/operators/tile_op.h @@ -113,12 +113,12 @@ class TileOp final : public Operator { bool DoTile(const int outer_size, const int inner_size, const T* X, T* Y) { if (inner_size == 1) { EigenArrayMap Y_arr(Y, tiles_, outer_size); - for (int i = 0; i < outer_size; ++i) { + for (const auto i : c10::irange(outer_size)) { Y_arr.col(i) = X[i]; } } else { ConstEigenArrayMap X_arr(X, inner_size, outer_size); - for (int i = 0; i < outer_size; ++i) { + for (const auto i : c10::irange(outer_size)) { EigenArrayMap(Y + i * tiles_ * inner_size, inner_size, tiles_) .colwise() = X_arr.col(i); } @@ -245,10 +245,10 @@ class TileGradientOp final : public Operator { dX, inner_size, &context_); - for (int i = 0; i < outer_size; ++i) { + for (const auto i : c10::irange(outer_size)) { const T* dY_ptr = dY + i * tiles_ * 
inner_size; T* dX_ptr = dX + i * inner_size; - for (int j = 1; j < tiles_; ++j) { + for (const auto j : c10::irange(1, tiles_)) { math::Add( inner_size, dX_ptr, dY_ptr + j * inner_size, dX_ptr, &context_); } diff --git a/caffe2/operators/transpose_op.h b/caffe2/operators/transpose_op.h index cfd2e6341cb89a..c17d0d0d0baecb 100644 --- a/caffe2/operators/transpose_op.h +++ b/caffe2/operators/transpose_op.h @@ -49,7 +49,7 @@ class TransposeOp : public Operator { } const at::IntArrayRef X_dims = X.sizes(); std::vector Y_dims(ndim); - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { Y_dims[i] = X_dims[axes_[i]]; } Y->Resize(Y_dims); diff --git a/caffe2/operators/tt_linear_op.h b/caffe2/operators/tt_linear_op.h index d4dead22d17ab2..6f452d50dcb378 100644 --- a/caffe2/operators/tt_linear_op.h +++ b/caffe2/operators/tt_linear_op.h @@ -127,7 +127,7 @@ class TTLinearOp final : public Operator { // Check that output size of Y is the element-wise product of out_sizes int prod_out_sizes = 1; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < out_sizes_.size(); i++) { + for (const auto i : c10::irange(out_sizes_.size())) { prod_out_sizes *= out_sizes_[i]; } CAFFE_ENFORCE( diff --git a/caffe2/operators/unsafe_coalesce.h b/caffe2/operators/unsafe_coalesce.h index bb0f58a6555893..4070312fd0d96d 100644 --- a/caffe2/operators/unsafe_coalesce.h +++ b/caffe2/operators/unsafe_coalesce.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" @@ -16,7 +17,7 @@ class UnsafeCoalesceOp final : public Operator { bool RunOnDevice() override { size_t coalesced_size = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { // For now only float type is supported CAFFE_ENFORCE( Input(i).dtype().template Match(), @@ -24,14 +25,14 @@ class UnsafeCoalesceOp final : public Operator { i); } - for (int i = 0; i < InputSize(); ++i) { 
+ for (const auto i : c10::irange(InputSize())) { coalesced_size += Input(i).numel(); } auto* coalesced = Output(OutputSize() - 1, coalesced_size, at::dtype()); auto coalesced_data = coalesced->template mutable_data(); size_t coalesced_offset = 0; - for (auto i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto num_elems = Input(i).numel(); auto input_sizes = Input(i).sizes().vec(); // Don't do anything if both tensors are already pointing on the same data diff --git a/caffe2/operators/utility_ops.h b/caffe2/operators/utility_ops.h index e76649346dc31d..8e8a72ac136fc7 100644 --- a/caffe2/operators/utility_ops.h +++ b/caffe2/operators/utility_ops.h @@ -8,6 +8,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -64,7 +65,7 @@ class IsNanOp final : public Operator { const auto* X_data = X.template data(); uint8_t* Y_data = Y->template mutable_data(); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < X.numel(); i++) { + for (const auto i : c10::irange(X.numel())) { Y_data[i] = (uint8_t)(std::isnan(X_data[i])); } return true; @@ -299,7 +300,7 @@ class SumOp : public Operator { auto* output = Output(0, input0.sizes(), at::dtype()); T* output_data = output->template mutable_data(); // Dimension checking - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { if (output->sizes() != Input(i).sizes()) { CAFFE_THROW( "Check failed: output->sizes() == Input(i).sizes().", @@ -320,7 +321,7 @@ class SumOp : public Operator { output_data, &context_); // Add remaining. 
- for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { math::Add( output->numel(), output_data, @@ -577,7 +578,7 @@ class ScatterWeightedSumOp : public Operator { float w0 = *weight0.template data(); // It's most likely a constant so exact comparison is fine if (w0 != 1.0) { - for (int i = 0; i < K; ++i) { + for (const auto i : c10::irange(K)) { Index idx = idxs[i]; CAFFE_ENFORCE( 0 <= idx && idx < N, @@ -600,7 +601,7 @@ class ScatterWeightedSumOp : public Operator { CAFFE_ENFORCE_EQ(weight.numel(), 1); const T* x_data = X.template data(); float w = *weight.template data(); - for (int i = 0; i < K; ++i) { + for (const auto i : c10::irange(K)) { Index idx = idxs[i]; // double-checking the indices, but it's fine as it's DCHECK only DCHECK(0 <= idx && idx < N) @@ -746,7 +747,7 @@ class ScatterAssignOp : public Operator { int64_t N, int64_t K, int64_t block_size) { - for (int i = 0; i < K; ++i) { + for (const auto i : c10::irange(K)) { Index idx = idxs[i]; // double-checking the indices, but it's fine as it's DCHECK only DCHECK(0 <= idx && idx < N) @@ -838,11 +839,9 @@ class ScatterOp : public Operator { // dst should have the same rank as idxs and src, but the dimension of dim // axis can be different. That is why in the above equation, there is the // difference of J_src and J_dst. 
- for (int64_t outer_batch = 0; outer_batch < outer_dims_product; - ++outer_batch) { - for (int64_t i = 0; i < N; ++i) { - for (int64_t inner_batch = 0; inner_batch < idxs_block_size; - ++inner_batch) { + for (const auto outer_batch : c10::irange(outer_dims_product)) { + for (const auto i : c10::irange(N)) { + for (const auto inner_batch : c10::irange(idxs_block_size)) { auto idxs_elem_idx = outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch; auto src_elem_idx = @@ -867,7 +866,7 @@ class ScatterOp : public Operator { const IndexType* indices, int64_t n, IndexType indexing_axis_dim) { - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; CAFFE_ENFORCE( 0 <= idx && idx < indexing_axis_dim, @@ -900,7 +899,7 @@ class LengthsToSegmentIdsOp : public Operator { output->Resize(total_length); auto* output_data = output->template mutable_data(); - for (int i = 0; i < input.numel(); ++i) { + for (const auto i : c10::irange(input.numel())) { auto len = input_data[i]; std::fill(output_data, output_data + len, i); output_data += len; @@ -927,7 +926,7 @@ class LengthsToRangesOp : public Operator { auto* output_data = output->template mutable_data(); int32_t offset = 0; - for (int i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { auto len = input_data[i]; output_data[i * 2] = offset; output_data[i * 2 + 1] = len; @@ -961,7 +960,7 @@ class LengthsToOffsetsOp : public Operator { auto* output_data = output->template mutable_data(); int32_t offset = 0; - for (int i = 0; i < size; ++i) { + for (const auto i : c10::irange(size)) { auto len = input_data[i]; output_data[i] = offset; offset += len; @@ -1018,7 +1017,7 @@ class SegmentIdsToLengthsOp : public Operator { } std::fill(output_data, output_data + num_segments, 0); Index prev = 0; // Assume that segment_id >= 0. 
- for (int64_t i = 0; i < input_size; i++) { + for (const auto i : c10::irange(input_size)) { CAFFE_ENFORCE( prev <= input_data[i], "Segment ids must be sorted: ", @@ -1069,7 +1068,7 @@ class SegmentIdsToRangesOp : public Operator { } std::fill(output_data, output_data + num_segments * 2, 0); Index prev = input_data[0]; - for (int64_t i = 0; i < input_size; i++) { + for (const auto i : c10::irange(input_size)) { CAFFE_ENFORCE( prev <= input_data[i], "Segment ids must be sorted: ", @@ -1109,7 +1108,7 @@ class LengthsToWeightsOp : public Operator { auto* output = Output(0); int64_t output_size = 0; - for (auto i = 0; i < input_size; i++) { + for (const auto i : c10::irange(input_size)) { CAFFE_ENFORCE_GE(input_data[i], 0, "unexpected negative length value"); output_size += input_data[i]; } @@ -1132,7 +1131,7 @@ class LengthsToWeightsOp : public Operator { output->Resize(output_size); auto* output_data = output->template mutable_data(); int64_t cnt = 0; - for (auto i = 0; i < input_size; i++) { + for (const auto i : c10::irange(input_size)) { auto len = input_data[i]; if (len == 0) { continue; @@ -1159,7 +1158,7 @@ class HasElementsOp : public Operator { bool RunOnDevice() override { bool res = false; - for (auto i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto& input = Input(i); res = res || input.numel() > 0; } @@ -1208,7 +1207,7 @@ class LengthsToShapeOp : public Operator { auto size = input.numel(); auto first = input_data[0]; - for (int i = 1; i < size; i++) { + for (const auto i : c10::irange(1, size)) { CAFFE_ENFORCE( input_data[i] == first, "All elements of input must be same "); } @@ -1255,7 +1254,7 @@ class GatherRangesOp : public Operator { size_t start = 0; size_t blockSize = ranges.size_from_dim(1); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < batchSize; ++i) { + for (const auto i : c10::irange(batchSize)) { auto end = start + blockSize; outputLengthsPtr[i] = accumulate(rangesData, 
start, end); start = end; @@ -1329,7 +1328,7 @@ class LengthsGatherOp : public Operator { int64_t total_length = 0; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < indices.numel(); ++i) { + for (const auto i : c10::irange(indices.numel())) { auto idx = indices_data[i]; CAFFE_ENFORCE_LT(idx, lengths.numel()); total_length += lengths_data[idx]; @@ -1341,7 +1340,7 @@ class LengthsGatherOp : public Operator { offsets_.clear(); int64_t running_offset = 0; offsets_.reserve(lengths.numel()); - for (size_t i = 0; i < lengths.numel(); ++i) { + for (const auto i : c10::irange(lengths.numel())) { offsets_.push_back(running_offset); running_offset += lengths_data[i]; } @@ -1355,7 +1354,7 @@ class LengthsGatherOp : public Operator { auto block_bytesize = block_size * items.itemsize(); auto out = static_cast(output->raw_mutable_data(items.dtype())); - for (size_t i = 0; i < indices.numel(); ++i) { + for (const auto i : c10::irange(indices.numel())) { auto idx = indices_data[i]; auto length = lengths_data[idx]; context_.CopyItemsSameDevice( @@ -1406,7 +1405,7 @@ class AccumulateHistogramOp : public Operator { math::Set( num_output_buckets_, 0, cur_hist_data, &context_); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { int bucket_index = -1; if (X_data[i] < lower_bound_) { bucket_index = 0; @@ -1419,7 +1418,7 @@ class AccumulateHistogramOp : public Operator { accumulate_hist_[bucket_index] += 1; } - for (int i = 0; i < num_output_buckets_; i++) { + for (const auto i : c10::irange(num_output_buckets_)) { acc_hist_data[i] = accumulate_hist_[i]; } @@ -1464,7 +1463,7 @@ class RangeOp : public Operator { T start = 0; T step = 1; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { CAFFE_ENFORCE_EQ( Input(i).numel(), 1, "All inputs must be scalar/1D tensor."); } diff --git a/caffe2/operators/variable_length_sequence_padding.h b/caffe2/operators/variable_length_sequence_padding.h index 
f86964d639e321..4a594bc74b12c9 100644 --- a/caffe2/operators/variable_length_sequence_padding.h +++ b/caffe2/operators/variable_length_sequence_padding.h @@ -17,7 +17,7 @@ void VariableLengthSequencePadding( const int32_t* seqLengths, const T padValue, Context* /*context*/) { - for (int j = 0; j < B; j++) { + for (const auto j : c10::irange(B)) { for (int i = seqLengths[j]; i < N; i++) { EigenVectorArrayMap(X + B * M * i + M * j, M).setConstant(padValue); } diff --git a/caffe2/opt/custom/cc_amrc.h b/caffe2/opt/custom/cc_amrc.h index 806e2bb15dc401..d6b099decb3f57 100644 --- a/caffe2/opt/custom/cc_amrc.h +++ b/caffe2/opt/custom/cc_amrc.h @@ -54,7 +54,7 @@ class ConcatAddMulReplaceNaNClipOp final : public Operator { } int before = 1, after = 1; vector output_dims(concat_input_0.sizes().vec()); - for (int i = 0; i < concat_input_0.dim(); ++i) { + for (const auto i : c10::irange(concat_input_0.dim())) { if (i == canonical_axis) { continue; } @@ -65,7 +65,7 @@ class ConcatAddMulReplaceNaNClipOp final : public Operator { after *= dim; } // check the input dims are compatible. 
- for (int j = concat_input_start; j < InputSize(); ++j) { + for (const auto j : c10::irange(concat_input_start, InputSize())) { int dim_j = Input(j).dim32(i); CAFFE_ENFORCE( dim == dim_j, @@ -93,7 +93,7 @@ class ConcatAddMulReplaceNaNClipOp final : public Operator { "Cannot handle fused concat with dim > 2, please update your fusion logic"); int output_channels = 0; - for (int i = concat_input_start; i < InputSize(); ++i) { + for (const auto i : c10::irange(concat_input_start, InputSize())) { axis_data[i - concat_input_start] = Input(i).dim32(canonical_axis); output_channels += Input(i).dim32(canonical_axis); } @@ -101,7 +101,7 @@ class ConcatAddMulReplaceNaNClipOp final : public Operator { auto* output = Output(0, output_dims, at::dtype()); size_t output_offset = 0; - for (int i = concat_input_start; i < InputSize(); ++i) { + for (const auto i : c10::irange(concat_input_start, InputSize())) { auto& input = Input(i); auto axis_dim = input.dim32(canonical_axis); math::CopyMatrix( @@ -127,7 +127,7 @@ class ConcatAddMulReplaceNaNClipOp final : public Operator { const auto _zeros = _mm256_set1_ps(0.f); output_offset = 0; - for (auto outer = 0; outer < before; ++outer) { + for (const auto outer : c10::irange(before)) { auto axis_dim = output->dim32(canonical_axis); size_t inner_size = axis_dim * after; auto inner = 0; @@ -148,7 +148,7 @@ class ConcatAddMulReplaceNaNClipOp final : public Operator { _mm256_storeu_ps(&output_data[output_offset + inner], out_val); } - for (auto inner_omp = inner; inner_omp < inner_size; ++inner_omp) { + for (const auto inner_omp : c10::irange(inner, inner_size)) { float elem = output_data[output_offset + inner_omp]; float add_elem = add_input_data[inner_omp]; float mul_elem = mul_input_data[inner_omp]; diff --git a/caffe2/opt/nql/ast.h b/caffe2/opt/nql/ast.h index 2fee8fc1dc224c..0d6aefc90c4e13 100644 --- a/caffe2/opt/nql/ast.h +++ b/caffe2/opt/nql/ast.h @@ -1,4 +1,5 @@ #pragma once +#include "c10/util/irange.h" #include #include #include 
@@ -20,8 +21,7 @@ struct ASTExpr { return starInputsFlag; } void dump(int level = 0) const { - for (int i = 0; i < level; i++) - std::cout << " "; + for (const auto i : c10::irange(level))std::cout << " "; if (!isCall()) std::cout << "Var: " << name << std::endl; else { @@ -41,8 +41,7 @@ struct ASTStmt { delete rhs; } void dump(int level = 0) const { - for (int i = 0; i < level; i++) - std::cout << " "; + for (const auto i : c10::irange(level))std::cout << " "; std::cout << "LHS:" << std::endl; for (auto s : lhs) { for (int i = 0; i < level + 1; i++) diff --git a/caffe2/opt/onnxifi_op.h b/caffe2/opt/onnxifi_op.h index 9e3aa800c4e1de..99f25a4453d804 100644 --- a/caffe2/opt/onnxifi_op.h +++ b/caffe2/opt/onnxifi_op.h @@ -6,6 +6,7 @@ #include #include +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -138,7 +139,7 @@ class OnnxifiOp final : public Operator { if (use_passed_output_shapes_) { // Populate output_shapes_per_bs_ - for (int bs = 1; bs < max_batch_size_; ++bs) { + for (const auto bs : c10::irange(1, max_batch_size_)) { auto output_shapes_tp = helper.GetRepeatedArgument("output_shapes_bs_" + caffe2::to_string(bs)); auto output_qshapes_tp = helper.GetRepeatedArgument("output_qshapes_bs_" + caffe2::to_string(bs)); CAFFE_ENFORCE_EQ(output_names_.size(), output_shapes_tp.size() + output_qshapes_tp.size()); @@ -267,7 +268,7 @@ class OnnxifiOp final : public Operator { ONNXIFI_STATUS_SUCCESS); // Release unused backend ids. 
- for (size_t i = 0; i < num_backends; ++i) { + for (const auto i : c10::irange(num_backends)) { if (i == static_cast(backend_index)) { continue; } @@ -287,7 +288,7 @@ class OnnxifiOp final : public Operator { // Extra weight shapes std::unordered_map weight_shape_info; - for (size_t i = 0; i < weight_names.size(); ++i) { + for (const auto i : c10::irange(weight_names.size())) { TensorShape shape; const auto& shape0 = weight_shapes[i]; for (const auto d : shape0) { diff --git a/caffe2/perfkernels/adagrad.h b/caffe2/perfkernels/adagrad.h index 12cd41056ec3ea..f030e3e09d601f 100644 --- a/caffe2/perfkernels/adagrad.h +++ b/caffe2/perfkernels/adagrad.h @@ -6,6 +6,7 @@ #include #endif #include +#include namespace caffe2 { @@ -26,7 +27,7 @@ static inline void adagrad_update_base_inlined( float epsilon, float lr, float weight_decay = 0.f) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = std::fma(weight_decay, w[i], g[i]); float hi = decay * h[i] + gi * gi; nh[i] = hi; diff --git a/caffe2/perfkernels/lstm_unit_cpu-impl.h b/caffe2/perfkernels/lstm_unit_cpu-impl.h index 857899a0d9f69f..e4adcbbd9f2ad0 100644 --- a/caffe2/perfkernels/lstm_unit_cpu-impl.h +++ b/caffe2/perfkernels/lstm_unit_cpu-impl.h @@ -2,6 +2,7 @@ #include #include #include +#include "c10/util/irange.h" #include "caffe2/utils/conversions.h" #if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG) @@ -53,7 +54,7 @@ inline void LstmUnitImpl( T* H, const float forget_bias) { const T forgetBias = convert::To(forget_bias); - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const bool valid = seqLengths == nullptr || t < seqLengths[n]; if (!valid) { if (drop_states) { @@ -67,7 +68,7 @@ inline void LstmUnitImpl( const T* X_D = &X[D]; const T* X_2D = &X[2 * D]; const T* X_3D = &X[3 * D]; - VECTOR_LOOP for (int d = 0; d < D; ++d) { + VECTOR_LOOP for (const auto d : c10::irange(D)) { const T i = sigmoid(X[d]); const T f = sigmoid(X_D[d] + 
forgetBias); const T o = sigmoid(X_2D[d]); @@ -105,7 +106,7 @@ inline void LstmUnitGradientImpl( T* X_diff, const float forget_bias) { const T localForgetBias = convert::To(forget_bias); - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const bool valid = seqLengths == nullptr || t < seqLengths[n]; if (!valid) { @@ -118,7 +119,7 @@ inline void LstmUnitGradientImpl( } memset(X_diff, 0, 4 * sizeof(T) * D); } else { - VECTOR_LOOP for (int d = 0; d < D; ++d) { + VECTOR_LOOP for (const auto d : c10::irange(D)) { T* c_prev_diff = C_prev_diff + d; T* h_prev_diff = H_prev_diff + d; T* i_diff = X_diff + d; diff --git a/caffe2/predictor/emulator/data_filler.h b/caffe2/predictor/emulator/data_filler.h index e3021f624e4508..55aed11f3f1048 100644 --- a/caffe2/predictor/emulator/data_filler.h +++ b/caffe2/predictor/emulator/data_filler.h @@ -59,12 +59,12 @@ class DataNetFiller : public Filler { : init_net_(init_net), data_net_(data_net) { // The output of the data_net_ will be served as the input int op_size = data_net_.op_size(); - for (int i = 0; i < op_size; ++i) { + for (const auto i : c10::irange(op_size)) { OperatorDef op_def = data_net_.op(i); // We rely on Fill op to generate inputs CAFFE_ENFORCE(op_def.type().find("Fill") != std::string::npos); int output_size = op_def.output_size(); - for (int j = 0; j < output_size; ++j) { + for (const auto j : c10::irange(output_size)) { input_names_.push_back(op_def.output(j)); } } @@ -105,7 +105,7 @@ class DataRandomFiller : public Filler { int input_index, const std::vector>& input_dims) { Workspace ws; - for (int i = 0; i < op_def.input_size(); ++i) { + for (const auto i : c10::irange(op_def.input_size())) { // CreateOperator requires all input blobs present ws.CreateBlob(op_def.input(i)); } diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h index 6513f216a9bee0..b6fcf6d866d448 100644 --- a/caffe2/python/pybind_state.h +++ b/caffe2/python/pybind_state.h @@ -153,12 +153,12 @@ class 
TensorFetcher : public BlobFetcherBase { if (numpy_type == NPY_OBJECT) { PyObject** outObj = reinterpret_cast(outPtr); auto* str = tensor.template data(); - for (int i = 0; i < tensor.numel(); ++i) { + for (const auto i : c10::irange(tensor.numel())) { outObj[i] = PyBytes_FromStringAndSize(str->data(), str->size()); str++; // cleanup on failure if (outObj[i] == nullptr) { - for (int j = 0; j < i; ++j) { + for (const auto j : c10::irange(i)) { Py_DECREF(outObj[j]); } CAFFE_THROW("Failed to allocate string for ndarray of strings."); @@ -212,7 +212,7 @@ class TensorFeeder : public BlobFeederBase { int ndim = PyArray_NDIM(array); npy_intp* npy_dims = PyArray_DIMS(array); std::vector dims; - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { dims.push_back(npy_dims[i]); } @@ -229,7 +229,7 @@ class TensorFeeder : public BlobFeederBase { dims, at::dtype().device(Context::GetDeviceType())); } auto* outPtr = tensor.template mutable_data(); - for (int i = 0; i < tensor.numel(); ++i) { + for (const auto i : c10::irange(tensor.numel())) { char* str; Py_ssize_t strSize; if (PyBytes_Check(input[i])) { @@ -375,7 +375,7 @@ class PythonOpBase : public Operator { std::vector inputs; inputs.reserve(InputSize()); - for (auto i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto* blob = &InputBlob(i); // Allow CPU tensors in addition to operator context's tensors py::object py_obj; @@ -395,7 +395,7 @@ class PythonOpBase : public Operator { } std::vector outputs; outputs.reserve(OutputSize()); - for (auto i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto* blob = OutputBlob(i); // Python op is always used with CPUContext only and treats inputs and diff --git a/caffe2/quantization/server/elementwise_dnnlowp_op.h b/caffe2/quantization/server/elementwise_dnnlowp_op.h index aac1020143f99e..c7d1d24ec5e57e 100644 --- a/caffe2/quantization/server/elementwise_dnnlowp_op.h +++ 
b/caffe2/quantization/server/elementwise_dnnlowp_op.h @@ -127,7 +127,7 @@ class BinaryElementwiseDNNLowPOp : public DNNLowPOp { size_t n, \ size_t post, \ CPUContext*) { \ - for (int i = 0; i < pre; ++i) { \ + for (const auto i : c10::irange(pre)) { \ EigenArrayMap(out + i * n * post, post, n) = eigen_op( \ (ConstEigenArrayMap(a + i * n * post, post, n).rowwise()), \ (Eigen::Map>(b, n))); \ diff --git a/caffe2/quantization/server/im2col_dnnlowp.h b/caffe2/quantization/server/im2col_dnnlowp.h index 92f7b272ac399f..dc347142b640f5 100644 --- a/caffe2/quantization/server/im2col_dnnlowp.h +++ b/caffe2/quantization/server/im2col_dnnlowp.h @@ -50,7 +50,7 @@ static void Im2ColNCHW( auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) + kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w); const auto* src = data_im + nip * (height * width); - for (auto y = 0; y < output_h; y++) { + for (const auto y : c10::irange(output_h)) { const auto iy = y * stride_h + kh; const auto ix = kw; if (stride_w == 1) { @@ -59,7 +59,7 @@ static void Im2ColNCHW( src + (iy * width + ix), sizeof(T) * output_w); } else { - for (auto x = 0; x < output_w; x++) { + for (const auto x : c10::irange(output_w)) { memcpy( dst + (y * output_w + x), src + (iy * width + ix + x * stride_w), @@ -78,8 +78,8 @@ static void Im2ColNCHW( const int pad_w = pad_l; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { - for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { - for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { + for (const auto kernel_row : c10::irange(kernel_h)) { + for (const auto kernel_col : c10::irange(kernel_w)) { int input_row = -pad_h + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!utils::IsAGeZeroAndALtB(input_row, height)) { @@ -113,12 +113,12 @@ static void Im2ColNCHW( int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; 
int channels_col = channels * kernel_h * kernel_w; - for (int c = 0; c < channels_col; ++c) { + for (const auto c : c10::irange(channels_col)) { int w_offset = c % kernel_w; int h_offset = (c / kernel_w) % kernel_h; int c_im = c / kernel_h / kernel_w; - for (int h = 0; h < height_col; ++h) { - for (int w = 0; w < width_col; ++w) { + for (const auto h : c10::irange(height_col)) { + for (const auto w : c10::irange(width_col)) { int h_pad = h * stride_h - pad_t + h_offset * dilation_h; int w_pad = w * stride_w - pad_l + w_offset * dilation_w; if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) @@ -152,20 +152,20 @@ static void Im2ColNdNCHW( kernel_shape, kernel_shape + N, 1, std::multiplies()); std::vector d_offset(N, 0); std::vector d_iter(N, 0); - for (int i = 0; i < outer_size; ++i) { + for (const auto i : c10::irange(outer_size)) { // Loop over spatial axes in reverse order to compute a per-axis offset. int offset = i; for (int d_i = N - 1; d_i >= 0; --d_i) { d_offset[d_i] = offset % kernel_shape[d_i]; offset /= kernel_shape[d_i]; } - for (int j = 0; j < inner_size; ++j) { + for (const auto j : c10::irange(inner_size)) { // Loop over spatial axes in forward order to compute the indices in the // image and column, and whether the index lies in the padding. 
const int col_index = i * inner_size + j; int img_index = i / kernel_size; bool is_padding = false; - for (int d_i = 0; d_i < N; ++d_i) { + for (const auto d_i : c10::irange(N)) { const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; is_padding |= d_img < 0 || d_img >= img_shape[d_i + 1]; @@ -216,13 +216,13 @@ static void Im2ColNHWC( T* data_col_temp = data_col + h * width_col * kernel_h * kernel_w * channels; int w_pad = -pad_l; - for (int w = 0; w < width_col; ++w) { + for (const auto w : c10::irange(width_col)) { int r = 0; for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { int s = 0; for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) { if (ih >= 0 && ih < height && iw >= 0 && iw < width) { - for (int g = 0; g < groups; ++g) { + for (const auto g : c10::irange(groups)) { memcpy( data_col_temp + ((g * kernel_h + r) * kernel_w + s) * (channels / groups), @@ -232,7 +232,7 @@ static void Im2ColNHWC( } } else { // This should be simply padded with zero. 
- for (int g = 0; g < groups; ++g) { + for (const auto g : c10::irange(groups)) { for (int i = 0; i < channels / groups; ++i) { data_col_temp [(((g * kernel_h + r) * kernel_w) + s) * @@ -293,12 +293,12 @@ static void Im2Col3DNHWC( #endif for (int t = 0; t < frame_col; ++t) { int t_pad = -pad_p + t * stride_t; - for (int h = 0; h < height_col; ++h) { + for (const auto h : c10::irange(height_col)) { int h_pad = -pad_t + h * stride_h; T* data_col_temp = data_col + (t * height_col + h) * width_col * kernel_t * kernel_h * kernel_w * channels; - for (int w = 0; w < width_col; ++w) { + for (const auto w : c10::irange(width_col)) { int w_pad = -pad_l + w * stride_w; int q = 0; for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) { @@ -309,7 +309,7 @@ static void Im2Col3DNHWC( iw += dilation_w, ++s) { if (it >= 0 && it < num_frames && ih >= 0 && ih < height && iw >= 0 && iw < width) { - for (int g = 0; g < groups; ++g) { + for (const auto g : c10::irange(groups)) { memcpy( data_col_temp + (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * @@ -320,7 +320,7 @@ static void Im2Col3DNHWC( } } else { // This should be simply padded with zero. 
- for (int g = 0; g < groups; ++g) { + for (const auto g : c10::irange(groups)) { for (int i = 0; i < channels / groups; ++i) { data_col_temp [((((g * kernel_t + q) * kernel_h + r) * kernel_w) + diff --git a/caffe2/quantization/server/mmio.h b/caffe2/quantization/server/mmio.h index 91564e5d90b3b1..b52c408e6a3fae 100644 --- a/caffe2/quantization/server/mmio.h +++ b/caffe2/quantization/server/mmio.h @@ -36,8 +36,8 @@ void StoreMatrixInMatrixMarketFormat( } fprintf(fp, "%d %d\n", m, n); // matrix market array format uses column-major order - for (int j = 0; j < n; ++j) { - for (int i = 0; i < m; ++i) { + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { if (is_integral::value) { // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) fprintf(fp, "%d\n", static_cast(a[j * m + i])); diff --git a/caffe2/quantization/server/utility_dnnlowp_ops.h b/caffe2/quantization/server/utility_dnnlowp_ops.h index 1a0d830f96b4ac..9818e7191c6a6d 100644 --- a/caffe2/quantization/server/utility_dnnlowp_ops.h +++ b/caffe2/quantization/server/utility_dnnlowp_ops.h @@ -54,7 +54,7 @@ class GatherDNNLowPOp final : public GatherOp { const Index* idxs = indices.template data(); auto out = static_cast(output->raw_mutable_data(data.dtype())); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; CAFFE_ENFORCE( 0 <= idx && idx < data.size(0), diff --git a/caffe2/queue/queue_ops.h b/caffe2/queue/queue_ops.h index 1ecf9d59a3f62e..43d7548e73435e 100644 --- a/caffe2/queue/queue_ops.h +++ b/caffe2/queue/queue_ops.h @@ -147,7 +147,7 @@ class SafeDequeueBlobsOp final : public Operator { } const int kTensorGrowthPct = 40; - for (int i = 0; i < numRecords_; ++i) { + for (const auto i : c10::irange(numRecords_)) { if (!queue->blockingRead(blobPtrs_)) { // if we read at least one record, status is still true return i > 0; diff --git a/caffe2/queue/rebatching_queue_ops.h b/caffe2/queue/rebatching_queue_ops.h index 5c9059c05b6ebd..49479fbcaef978 
100644 --- a/caffe2/queue/rebatching_queue_ops.h +++ b/caffe2/queue/rebatching_queue_ops.h @@ -32,7 +32,7 @@ class EnqueueRebatchingQueueOp : public Operator { CAFFE_ENFORCE_EQ(InputSize(), queue->numBlobs() + 1); std::vector inputTensors; inputTensors.reserve(InputSize() - 1); - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { inputTensors.push_back(&Input(i)); } @@ -56,7 +56,7 @@ class DequeueRebatchingQueueOp : public Operator { std::vector outputTensors; outputTensors.reserve(OutputSize()); - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { outputTensors.push_back(Output(i)); } diff --git a/caffe2/sgd/adadelta_op.h b/caffe2/sgd/adadelta_op.h index 402edf74228296..d24ba2af2c33b6 100644 --- a/caffe2/sgd/adadelta_op.h +++ b/caffe2/sgd/adadelta_op.h @@ -18,7 +18,7 @@ void AdadeltaUpdate( float* nh, float* nd, Context* /*context*/) { - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float di = d[i]; float hi = nh[i] = decay * h[i] + (1.0f - decay) * gi * gi; @@ -120,7 +120,7 @@ class SparseAdadeltaOp final : public Operator { } auto block_size = Input(GRAD).numel() / n; - for (int i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { float gi = gradIn[i]; diff --git a/caffe2/sgd/adagrad_fused.h b/caffe2/sgd/adagrad_fused.h index 29c506f324749f..9f1843333d92de 100644 --- a/caffe2/sgd/adagrad_fused.h +++ b/caffe2/sgd/adagrad_fused.h @@ -82,8 +82,8 @@ class SparseAdagradFusedWithSparseLengthsSumGradientOp final auto* grad_buffer_data = is_mean ? 
grad_buffer_.template mutable_data() : NULL; if (is_mean) { - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { - for (auto tmpIndex = 0; tmpIndex < block_size; ++tmpIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { + for (const auto tmpIndex : c10::irange(block_size)) { auto offsetI = rangeIndex * block_size; grad_buffer_data[offsetI + tmpIndex] = lengths[rangeIndex] > 0 ? gradIn[offsetI + tmpIndex] / lengths[rangeIndex] @@ -92,7 +92,7 @@ class SparseAdagradFusedWithSparseLengthsSumGradientOp final } } - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { std::size_t idx = indices[dataIndex]; @@ -243,7 +243,7 @@ class SparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final // ignores this dependency and fuses these two loops. std::vector temp_grad(block_size); int dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { std::size_t idx = indices[dataIndex]; @@ -277,7 +277,7 @@ class SparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final CAFFE_ENFORCE_EQ(dataIndex, n); dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { std::size_t idx = indices[dataIndex]; @@ -285,7 +285,7 @@ class SparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final auto offsetIdx = idx * block_size; auto localOffset = dataIndex - start; - for (int i = 0; i < block_size; ++i) { + for (const auto i : c10::irange(block_size)) { temp_grad[i] = auxParamIn[localOffset] * gradIn[offsetI + i]; } @@ -409,7 +409,7 @@ class 
SparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp final std::vector temp_grad(block_size); int dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { std::size_t idx = indices[dataIndex]; @@ -440,7 +440,7 @@ class SparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp final auxGrad + dataIndex, &context_); - for (int i = 0; i < block_size; ++i) { + for (const auto i : c10::irange(block_size)) { temp_grad[i] = auxParamIn[localOffset] * gradIn[offsetI + i]; } diff --git a/caffe2/sgd/adagrad_op.h b/caffe2/sgd/adagrad_op.h index b683b7ec3cb959..8646d01872daec 100644 --- a/caffe2/sgd/adagrad_op.h +++ b/caffe2/sgd/adagrad_op.h @@ -39,7 +39,7 @@ void adagrad_update_output_effective_lr( const float* lr, Context* /*context*/, float weight_decay = 0.f) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float grad = std::fma(weight_decay, paramIn[i], gradIn[i]); float moment = momentOut[i] = decay * momentIn[i] + grad * grad; float effective_lr = effectiveLROut[i] = @@ -63,7 +63,7 @@ void adagrad_update_output_effective_lr_and_update( const float* lr, Context* /*context*/, float weight_decay = 0.f) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float grad = std::fma(weight_decay, paramIn[i], gradIn[i]); float moment = momentOut[i] = decay * momentIn[i] + grad * grad; float effective_lr = effectiveLROut[i] = @@ -300,7 +300,7 @@ class SparseAdagradOp final : public Operator { const auto* momentIn = Input(MOMENT_1).template data(); std::vector grad(block_size); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; auto offsetI = i * block_size; auto offsetIdx = idx * block_size; @@ -504,7 +504,7 @@ class RowWiseSparseAdagradOp final : public Operator { #else VLOG(1) << "using plain adagrad 
updates in RowWiseSparseAdagradOp"; - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; float freq = (counter_halflife_ > 0 && count[idx] > 0) ? counter_halflife_ / count[idx] @@ -542,13 +542,13 @@ class RowWiseSparseAdagradOp final : public Operator { const float* g = gradIn + offsetI; float* h = moment + idx; float hs = 0.; - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float gj = std::fma(weight_decay_ * freq, w[j], g[j]); hs += gj * gj; } float hi = h[0] = h[0] + hs / block_size; float step = lr[0] / (std::sqrt(hi) + epsilon_); - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float gj = std::fma(weight_decay_ * freq, w[j], g[j]); w[j] = w[j] + gj * step; } diff --git a/caffe2/sgd/adam_op.h b/caffe2/sgd/adam_op.h index ca1c5aefacebc2..d0aa6dd7aed728 100644 --- a/caffe2/sgd/adam_op.h +++ b/caffe2/sgd/adam_op.h @@ -21,7 +21,7 @@ void adam_update( float correction, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); @@ -45,7 +45,7 @@ void adam_compute( float correction, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); @@ -74,7 +74,7 @@ void adam_compute_smart_decay( Context* /*context*/) { float k = (float)(t - lastSeenIn[0]); lastSeenOut[0] = t; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; // The number of steps since this param was last seen. // We don't need integer precision for k. Float is fine and it's faster to convert here. 
@@ -107,7 +107,7 @@ void adam_compute_output_grad( float correction, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); @@ -135,7 +135,7 @@ void radam_update( float r_correction, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); @@ -169,7 +169,7 @@ void radam_compute( float r_correction, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); @@ -204,7 +204,7 @@ void radam_compute_output_grad( float r_correction, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; float mi = nm[i] = m[i] * beta1 + gi * (1 - beta1); float vi = nv[i] = v[i] * beta2 + gi * gi * (1 - beta2); @@ -350,7 +350,7 @@ class SparseAdamOp final : public Operator { auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); if (OutputSize() == 3) { - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { @@ -444,7 +444,7 @@ class SparseAdamOp final : public Operator { } else { Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { @@ -593,7 +593,7 @@ class SmartDecaySparseAdamOp final : public Operator { auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); int64_t* lastSeenOut = 
Output(OUTPUT_LAST_SEEN)->template mutable_data(); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; auto offsetI = i * block_size; auto offsetIdx = idx * block_size; @@ -673,7 +673,7 @@ class RowWiseSparseAdamOp final : public Operator { auto* moment2Out = Output(OUTPUT_MOMENT_2)->template mutable_data(); if (OutputSize() == 3) { - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { @@ -719,13 +719,13 @@ class RowWiseSparseAdamOp final : public Operator { float* nm2 = moment2Out + idx; float m2_sum = 0.; - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float gj = g[j]; m2_sum += gj * gj; } float vi = nm2[0] = m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); nw[j] = w[j] + lr[0] * correction * mi / (std::sqrt(vi) + epsilon_); } @@ -734,7 +734,7 @@ class RowWiseSparseAdamOp final : public Operator { } else { Output(OUTPUT_GRAD)->ResizeLike(Input(GRAD)); auto* gradOut = Output(OUTPUT_GRAD)->template mutable_data(); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { @@ -781,13 +781,13 @@ class RowWiseSparseAdamOp final : public Operator { float* ng = gradOut + offsetI; float m2_sum = 0.; - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float gj = g[j]; m2_sum += gj * gj; } float vi = nm2[0] = m2[0] * beta2_ + (m2_sum / block_size) * (1 - beta2_); - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float mi = nm1[j] = m1[j] * beta1_ + g[j] * (1 - beta1_); float ngi = ng[j] = correction * mi / (std::sqrt(vi) + epsilon_); nw[j] = w[j] + lr[0] * ngi; diff --git a/caffe2/sgd/learning_rate_adaption_op.h 
b/caffe2/sgd/learning_rate_adaption_op.h index ff3e30f47398de..10a44807e02728 100644 --- a/caffe2/sgd/learning_rate_adaption_op.h +++ b/caffe2/sgd/learning_rate_adaption_op.h @@ -21,7 +21,7 @@ void lr_update( float x = 0; float y = 0, z = 0; const float kEps = 1e-12f; - for (auto i = 0; i < n; i++) { + for (const auto i : c10::irange(n)) { x += grad[i] * effgrad[i]; if (normalized_lr_adaption) { y += grad[i] * grad[i]; diff --git a/caffe2/sgd/learning_rate_op.h b/caffe2/sgd/learning_rate_op.h index fb0998a65d7149..74387f47db73a1 100644 --- a/caffe2/sgd/learning_rate_op.h +++ b/caffe2/sgd/learning_rate_op.h @@ -5,6 +5,7 @@ #include #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" #include "caffe2/sgd/learning_rate_functors.h" @@ -162,7 +163,7 @@ class LearningRateOp final : public Operator { sub_policy_num_iters.size(), 0, "Must specify at least one sub learning rate policy."); - for (size_t i = 0; i < sub_policy_num_iters.size(); ++i) { + for (const auto i : c10::irange(sub_policy_num_iters.size())) { CAFFE_ENFORCE_GT( sub_policy_num_iters[i], 0, diff --git a/caffe2/sgd/momentum_sgd_op.h b/caffe2/sgd/momentum_sgd_op.h index 89b13526bcc96a..5ef49ad443e436 100644 --- a/caffe2/sgd/momentum_sgd_op.h +++ b/caffe2/sgd/momentum_sgd_op.h @@ -17,7 +17,7 @@ void momentum_sgd_update( float* param, Context* /*context*/) { const float LR = lr[0]; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { if (!nesterov) { const float adjusted_gradient = LR * g[i] + momentum * m[i]; nm[i] = adjusted_gradient; @@ -154,7 +154,7 @@ class SparseMomentumSGDUpdateOp final : public Operator { auto* momentumOut = Output(OUTPUT_MOMENTUM)->template mutable_data(); auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data(); - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; auto offsetI = i * block_size; auto offsetIdx = idx * block_size; diff --git 
a/caffe2/sgd/rowwise_adagrad_fused.h b/caffe2/sgd/rowwise_adagrad_fused.h index 953ccbe5f30691..1d1076afa3c616 100644 --- a/caffe2/sgd/rowwise_adagrad_fused.h +++ b/caffe2/sgd/rowwise_adagrad_fused.h @@ -217,8 +217,8 @@ class RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final auto* grad_buffer_data = is_mean ? grad_buffer_.template mutable_data() : NULL; if (is_mean) { - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { - for (auto tmpIndex = 0; tmpIndex < block_size; ++tmpIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { + for (const auto tmpIndex : c10::irange(block_size)) { auto offsetI = rangeIndex * block_size; grad_buffer_data[offsetI + tmpIndex] = lengths[rangeIndex] > 0 ? gradIn[offsetI + tmpIndex] / lengths[rangeIndex] @@ -269,7 +269,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final T counter_halflife, rowWiseAdagradT& kernel) { int dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { auto offsetI = rangeIndex * block_size; const float* g = gradIn + offsetI; @@ -557,7 +557,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final // ignores this dependency and fuses these two loops. 
std::vector temp_grad(block_size); int dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; ++dataIndex) { std::size_t idx = indices[dataIndex]; @@ -591,7 +591,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final CAFFE_ENFORCE_EQ(dataIndex, n); dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { auto offsetI = rangeIndex * block_size; const float* g = gradIn + offsetI; @@ -606,7 +606,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final auto offsetIdx = idx * block_size; auto localOffset = dataIndex - start; - for (int i = 0; i < block_size; ++i) { + for (const auto i : c10::irange(block_size)) { temp_grad[i] = auxParamIn[localOffset] * g[i]; } @@ -839,7 +839,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp std::vector temp_grad(block_size); int dataIndex = 0; - for (auto rangeIndex = 0; rangeIndex < numSegments; ++rangeIndex) { + for (const auto rangeIndex : c10::irange(numSegments)) { auto offsetI = rangeIndex * block_size; const float* g = gradIn + offsetI; @@ -902,7 +902,7 @@ class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp alignas(64) float temp[VLEN]; _mm256_store_ps(temp, acc_v); - for (int j = 0; j < VLEN; ++j) { + for (const auto j : c10::irange(VLEN)) { acc += temp[j]; } #endif diff --git a/caffe2/sgd/rowwise_counter.h b/caffe2/sgd/rowwise_counter.h index fb0647d596f26d..db8fa19ee222cb 100644 --- a/caffe2/sgd/rowwise_counter.h +++ b/caffe2/sgd/rowwise_counter.h @@ -40,7 +40,7 @@ class RowWiseCounterOp final : public Operator { return true; } - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { const std::size_t idx = indices[i]; CAFFE_ENFORCE_GE( Input(COUNTER).numel(), diff 
--git a/caffe2/sgd/storm_op.h b/caffe2/sgd/storm_op.h index 0ecb0fad8e8643..5abf0c806d5c72 100644 --- a/caffe2/sgd/storm_op.h +++ b/caffe2/sgd/storm_op.h @@ -19,7 +19,7 @@ void storm_update( const float beta, Context* /*context*/) { float gradSqSumTmp = 0.0; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { const float gi = gradIn[i]; gradSqSumTmp += gi * gi; } @@ -27,7 +27,7 @@ void storm_update( const float nlr = lr[0] * std::pow(beta + gradSqSumOut[0], -1.0 / 3.0); const float alpha = momentum * nlr * nlr; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { const float gi = gradIn[i]; const float mi = momentIn[i]; float new_mi = momentOut[i] = gi + (1.0 - alpha) * (mi - gi); @@ -120,7 +120,7 @@ class SparseStormOp final : public Operator { } float gradSqSumTmp = 0.0; - for (auto i = 0; i < Input(GRAD).numel(); ++i) { + for (const auto i : c10::irange(Input(GRAD).numel())) { const float gi = gradIn[i]; gradSqSumTmp += gi * gi; } @@ -130,7 +130,7 @@ class SparseStormOp final : public Operator { const float alpha = momentum_ * nlr * nlr; const auto block_size = Input(GRAD).numel() / n; - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { const float gi = gradIn[i]; @@ -162,7 +162,7 @@ class SparseStormOp final : public Operator { i); #endif - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const float gi = gradIn[offsetI + j]; const float mi = momentIn[offsetIdx + j]; float new_mi = momentOut[offsetIdx + j] = diff --git a/caffe2/sgd/wngrad_op.h b/caffe2/sgd/wngrad_op.h index e9e1fd1c987671..862efa99d20e59 100644 --- a/caffe2/sgd/wngrad_op.h +++ b/caffe2/sgd/wngrad_op.h @@ -15,12 +15,12 @@ void wngrad_update( float epsilon, const float* lr, Context* /*context*/) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; nw[i] = w[i] + lr[0] * gi / (h[0] + epsilon); } float nhTmp = 0.0; 
- for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = g[i]; nhTmp += gi * gi; } @@ -42,13 +42,13 @@ void wngrad_update_output_effective_lr( Context* /*context*/) { effectiveLROut[0] = lr[0] / (seqBIn[0] + epsilon); float seqBTmp = 0.0; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = gradIn[i]; seqBTmp += gi * gi; } seqBTmp /= (seqBIn[0] + epsilon); seqBOut[0] = seqBIn[0] + seqBTmp; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float grad = gradIn[i]; paramOut[i] = paramIn[i] + effectiveLROut[0] * grad; } @@ -69,14 +69,14 @@ void wngrad_update_output_effective_lr_and_update( Context* /*context*/) { effectiveLROut[0] = lr[0] / (seqBIn[0] + epsilon); float seqBTmp = 0.0; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float gi = gradIn[i]; seqBTmp += gi * gi; } seqBTmp /= (seqBIn[0] + epsilon); seqBOut[0] = seqBIn[0] + seqBTmp; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { float grad = gradIn[i]; float update = updateOut[i] = effectiveLROut[0] * grad; paramOut[i] = paramIn[i] + update; @@ -193,7 +193,7 @@ class SparseWngradOp final : public Operator { auto block_size = Input(GRAD).numel() / n; - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (block_size == 1) { float gi = gradIn[i]; @@ -222,7 +222,7 @@ class SparseWngradOp final : public Operator { " for input i:", i); #endif - for (auto j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { float gi = gradIn[offsetI + j]; paramOut[offsetIdx + j] = paramIn[offsetIdx + j] + lr[0] * gi / (seqBIn[0] + epsilon_); @@ -230,7 +230,7 @@ class SparseWngradOp final : public Operator { } } float seqBTmp = 0.0; - for (auto i = 0; i < Input(GRAD).numel(); ++i) { + for (const auto i : c10::irange(Input(GRAD).numel())) { float gi = gradIn[i]; seqBTmp += gi * gi; } diff --git a/caffe2/sgd/yellowfin_op.h 
b/caffe2/sgd/yellowfin_op.h index a3608aaa6475f1..3ae7dd1e53a913 100644 --- a/caffe2/sgd/yellowfin_op.h +++ b/caffe2/sgd/yellowfin_op.h @@ -133,7 +133,7 @@ CAFFE_ENFORCE_EQ(param_tensor.dim(), moment_tensor.dim()); CAFFE_ENFORCE_EQ(param_tensor.dim(), g_avg_tensor.dim()); CAFFE_ENFORCE_EQ(param_tensor.dim(), g2_avg_tensor.dim()); CAFFE_ENFORCE_EQ(param_tensor.dim(), grad_tensor.dim()); -for (int i = 0; i < param_tensor.dim(); ++i) { +for (const auto i : c10::irange(param_tensor.dim())) { CAFFE_ENFORCE_EQ(param_tensor.dim32(i), moment_tensor.dim32(i)); CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g_avg_tensor.dim32(i)); CAFFE_ENFORCE_EQ(param_tensor.dim32(i), g2_avg_tensor.dim32(i)); diff --git a/caffe2/transforms/pattern_net_transform.h b/caffe2/transforms/pattern_net_transform.h index 95638f4a839c58..bb183292437dda 100644 --- a/caffe2/transforms/pattern_net_transform.h +++ b/caffe2/transforms/pattern_net_transform.h @@ -28,7 +28,7 @@ class TORCH_API PatternNetTransform : public Transform { "External outputs do not match!"); ordered_ops_ = GetPatternTraversalOrder(p_); inverse_ops_.resize(ordered_ops_.size()); - for (size_t i = 0; i < ordered_ops_.size(); i++) { + for (const auto i : c10::irange(ordered_ops_.size())) { inverse_ops_[ordered_ops_[i]] = i; } } diff --git a/caffe2/utils/proto_utils.h b/caffe2/utils/proto_utils.h index b5c6b312b3ab36..a6903425ab4e31 100644 --- a/caffe2/utils/proto_utils.h +++ b/caffe2/utils/proto_utils.h @@ -9,6 +9,7 @@ #include #include +#include #include "caffe2/utils/proto_wrap.h" #include "caffe2/proto/caffe2_pb.h" diff --git a/caffe2/utils/threadpool/WorkersPool.h b/caffe2/utils/threadpool/WorkersPool.h index 145dbc160b1e11..8126b82aa2089f 100644 --- a/caffe2/utils/threadpool/WorkersPool.h +++ b/caffe2/utils/threadpool/WorkersPool.h @@ -4,6 +4,7 @@ #include #include #include "c10/util/thread_name.h" +#include #include "caffe2/core/common.h" #include "caffe2/core/logging.h" @@ -339,7 +340,7 @@ class WorkersPool { 
CreateWorkers(workers_count); DCHECK_LE(workers_count, (int)workers_.size()); counter_to_decrement_when_ready_.Reset(workers_count); - for (size_t task = 1; task < tasks.size(); ++task) { + for (const auto task : c10::irange(1, tasks.size())) { workers_[task - 1]->StartWork(tasks[task].get()); } // Execute the remaining workload immediately on the current thread. diff --git a/caffe2/video/video_input_op.h b/caffe2/video/video_input_op.h index e168c74cb3dcf9..2711984fc25b36 100644 --- a/caffe2/video/video_input_op.h +++ b/caffe2/video/video_input_op.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -225,7 +226,7 @@ void VideoInputOp::CheckParamsAndPrint() { if (random_sampling_rate_) { LOG(INFO) << "random sampling with max:" << random_sampling_rate_; } - for (int i = 0; i < channels_rgb_; i++) { + for (const auto i : c10::irange(channels_rgb_)) { LOG(INFO) << " RGB " << i << "-th channel mean: " << mean_rgb_[i] << " std: " << 1.f / inv_std_rgb_[i]; } @@ -237,7 +238,7 @@ void VideoInputOp::CheckParamsAndPrint() { << "and a sampling rate of 1:" << sampling_rate_of_ << " flow_data_type_: " << flow_data_type_ << " flow_alg_type_: " << flow_alg_type_; - for (int i = 0; i < channels_of_; i++) { + for (const auto i : c10::irange(channels_of_)) { LOG(INFO) << " Optical flow" << i << "-th channel mean: " << mean_of_[i] << " std: " << 1.f / inv_std_of_[i]; @@ -257,7 +258,7 @@ void VideoInputOp::CheckParamsAndPrint() { if (video_res_type_ == VideoResType::USE_SHORT_EDGE) { if (jitter_scales_.size() > 0) { LOG(INFO) << "Using scale jittering:"; - for (int idx = 0; idx < jitter_scales_.size(); idx++) { + for (const auto idx : c10::irange(jitter_scales_.size())) { LOG(INFO) << "scale " << idx << ": " << jitter_scales_[idx]; } } else { @@ -390,7 +391,7 @@ VideoInputOp::VideoInputOp( } channels_rgb_ = 3; - for (int i = 4; i < 7; i++) { + for (const auto i : c10::irange(4, 7)) { mean_rgb_.push_back(InputDataMean[i]); inv_std_rgb_.push_back(1.f / 
InputDataStd[i]); } @@ -403,7 +404,7 @@ VideoInputOp::VideoInputOp( get_optical_flow_ = false; get_rgb_ = true; sampling_rate_rgb_ = 1; - for (int i = 4; i < 7; i++) { + for (const auto i : c10::irange(4, 7)) { mean_rgb_.push_back(InputDataMean[i]); inv_std_rgb_.push_back(1.f / InputDataStd[i]); } @@ -420,7 +421,7 @@ VideoInputOp::VideoInputOp( switch (flow_data_type_) { case FlowDataType::Flow2C: channels_of_ = 2; - for (int i = 0; i < channels_of_; i++) { + for (const auto i : c10::irange(channels_of_)) { mean_of_.push_back(InputDataMean[i]); inv_std_of_.push_back(1.f / InputDataStd[i]); } @@ -428,7 +429,7 @@ VideoInputOp::VideoInputOp( case FlowDataType::Flow3C: channels_of_ = 3; - for (int i = 0; i < channels_of_; i++) { + for (const auto i : c10::irange(channels_of_)) { mean_of_.push_back(InputDataMean[i]); inv_std_of_.push_back(1.f / InputDataStd[i]); } @@ -437,7 +438,7 @@ VideoInputOp::VideoInputOp( // early fusion with gray case FlowDataType::FlowWithGray: channels_of_ = 3; - for (int i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { mean_of_.push_back(InputDataMean[i]); inv_std_of_.push_back(1.f / InputDataStd[i]); } @@ -448,11 +449,11 @@ VideoInputOp::VideoInputOp( // early fusion with RGB case FlowDataType::FlowWithRGB: channels_of_ = 5; - for (int i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { mean_of_.push_back(InputDataMean[i]); inv_std_of_.push_back(1.f / InputDataStd[i]); } - for (int i = 4; i < 7; i++) { + for (const auto i : c10::irange(4, 7)) { mean_of_.push_back(InputDataMean[i]); inv_std_of_.push_back(1.f / InputDataStd[i]); } @@ -527,15 +528,15 @@ void VideoInputOp::GetLabelsFromProto( int* label_data) { int num_clips = clip_per_video_ * crop_per_clip_; if (!do_multi_label_) { - for (int i = 0; i < num_clips; i++) { + for (const auto i : c10::irange(num_clips)) { label_data[i] = label_proto.int32_data(0); } } else { // For multiple label case, output label is a binary vector // where presented concepts are marked 1 
memset(label_data, 0, sizeof(int) * num_of_class_ * num_clips); - for (int i = 0; i < num_clips; i++) { - for (int j = 0; j < label_proto.int32_data_size(); j++) { + for (const auto i : c10::irange(num_clips)) { + for (const auto j : c10::irange(label_proto.int32_data_size())) { CAFFE_ENFORCE_LT( label_proto.int32_data(j), num_of_class_, @@ -657,7 +658,7 @@ bool VideoInputOp::GetClipsAndLabelsFromDBValue( const TensorProto& start_frm_proto = protos.protos(curr_proto_idx++); start_frm = start_frm_proto.int32_data(0); if (get_start_frame_) { - for (int i = 0; i < num_clips; i++) { + for (const auto i : c10::irange(num_clips)) { start_frame_data[i] = start_frm; } } @@ -667,7 +668,7 @@ bool VideoInputOp::GetClipsAndLabelsFromDBValue( CAFFE_ENFORCE_GE( protos.protos_size(), curr_proto_idx + 1, "Video Id not provided"); const TensorProto& video_id_proto = protos.protos(curr_proto_idx); - for (int i = 0; i < num_clips; i++) { + for (const auto i : c10::irange(num_clips)) { video_id_data[i] = video_id_proto.int64_data(0); } } @@ -772,7 +773,7 @@ void VideoInputOp::DecodeAndTransform( int clip_offset_of = channels_of_ * length_of_ * crop_size_ * crop_size_; for (int i = 0; i < std::min(clip_per_video_, int(buffer_rgb.size())); i++) { - for (int j = 0; j < crop_per_clip_; j++) { + for (const auto j : c10::irange(crop_per_clip_)) { // get the rectangle for cropping int h_off = 0; int w_off = 0; @@ -855,7 +856,7 @@ void VideoInputOp::DecodeAndTransform( } } if (buffer_rgb.size() > 0) { - for (int i = 0; i < buffer_rgb.size(); i++) { + for (const auto i : c10::irange(buffer_rgb.size())) { unsigned char* buff = buffer_rgb[i]; delete[] buff; } @@ -884,12 +885,12 @@ bool VideoInputOp::Prefetch() { // Prefetching handled with a thread pool of "decode_threads" threads. 
std::mt19937 meta_randgen(time(nullptr)); std::vector randgen_per_thread; - for (int i = 0; i < num_decode_threads_; ++i) { + for (const auto i : c10::irange(num_decode_threads_)) { randgen_per_thread.emplace_back(meta_randgen()); } std::bernoulli_distribution mirror_this_clip(0.5); - for (int item_id = 0; item_id < batch_size_; ++item_id) { + for (const auto item_id : c10::irange(batch_size_)) { std::mt19937* randgen = &randgen_per_thread[item_id % num_decode_threads_]; diff --git a/test/cpp/api/dataloader.cpp b/test/cpp/api/dataloader.cpp index b49330c7e6b0c0..cd4056c53827fa 100644 --- a/test/cpp/api/dataloader.cpp +++ b/test/cpp/api/dataloader.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -173,7 +174,7 @@ TEST(DataTest, InfiniteStreamDataset) { for (auto& batch : *data_loader) { ASSERT_LT(batch_index, 3); ASSERT_EQ(batch.size(), kBatchSize); - for (size_t j = 0; j < kBatchSize; ++j) { + for (const auto j : c10::irange(kBatchSize)) { ASSERT_EQ(batch.at(j), 1 + (batch_index * kBatchSize) + j); } batch_index += 1; @@ -837,7 +838,7 @@ TEST(DataTest, CanUseCustomTypeAsIndexType) { size_t i = 0; for (auto batch : *data_loader) { - for (int j = 0; j < kBatchSize; ++j) { + for (const auto j : c10::irange(kBatchSize)) { ASSERT_EQ(batch.at(j), 10 + j); } i += 1; @@ -857,7 +858,7 @@ TEST(DataTest, DistributedRandomSamplerSingleReplicaProduceCorrectSamples) { ASSERT_EQ(res.size(), sample_count); std::sort(res.begin(), res.end()); - for (size_t i = 0; i < res.size(); ++i) { + for (const auto i : c10::irange(res.size())) { ASSERT_EQ(res[i], i); } } @@ -872,14 +873,14 @@ TEST(DataTest, DistributedRandomSamplerMultiReplicaProduceCorrectSamples) { size_t batch_size) { std::vector> samplers; - for (size_t i = 0; i < num_replicas; ++i) { + for (const auto i : c10::irange(num_replicas)) { samplers.emplace_back( torch::make_unique( sample_count, num_replicas, i, allow_duplicates)); } std::vector res; - for (size_t i = 0; i < num_replicas; ++i) { + for (const 
auto i : c10::irange(num_replicas)) { (*samplers[i]).reset(); torch::optional> idx; while ((idx = (*samplers[i]).next(batch_size)).has_value()) { @@ -953,7 +954,7 @@ TEST(DataTest, DistributedSequentialSamplerSingleReplicaProduceCorrectSamples) { ASSERT_EQ(res.size(), sample_count); std::sort(res.begin(), res.end()); - for (size_t i = 0; i < res.size(); ++i) { + for (const auto i : c10::irange(res.size())) { ASSERT_EQ(res[i], i); } } @@ -969,14 +970,14 @@ TEST(DataTest, DistributedSequentialSamplerMultiReplicaProduceCorrectSamples) { std::vector> samplers; - for (size_t i = 0; i < num_replicas; ++i) { + for (const auto i : c10::irange(num_replicas)) { samplers.emplace_back( torch::make_unique( sample_count, num_replicas, i, allow_duplicates)); } std::vector res; - for (size_t i = 0; i < num_replicas; ++i) { + for (const auto i : c10::irange(num_replicas)) { (*samplers[i]).reset(); torch::optional> idx; while ((idx = (*samplers[i]).next(batch_size)).has_value()) { @@ -1490,7 +1491,7 @@ TEST(DataLoaderTest, StatefulDatasetWithNoWorkers) { auto data_loader = torch::data::make_data_loader(D{}); - for (size_t i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { const auto number_of_iterations = std::distance(data_loader->begin(), data_loader->end()); ASSERT_EQ( @@ -1531,7 +1532,7 @@ TEST(DataLoaderTest, StatefulDatasetWithManyWorkers) { torch::data::datasets::make_shared_dataset(), DataLoaderOptions().workers(kNumberOfWorkers)); - for (size_t i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { const auto number_of_iterations = std::distance(data_loader->begin(), data_loader->end()); ASSERT_EQ( @@ -1574,7 +1575,7 @@ TEST(DataLoaderTest, StatefulDatasetWithMap) { })), DataLoaderOptions{}); - for (size_t i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { const auto number_of_iterations = std::distance(data_loader->begin(), data_loader->end()); ASSERT_EQ( @@ -1675,7 +1676,7 @@ TEST(DataLoaderTest, ChunkDataSetGetBatch) { dataset, 
DataLoaderOptions(batch_size).workers(dataloader_worker_count)); - for (int epoch_index = 0; epoch_index < epoch_count; ++epoch_index) { + for (const auto epoch_index : c10::irange(epoch_count)) { std::vector result(total_example_count, false); int iteration_count = 0; for (auto iterator = data_loader->begin(); @@ -1687,11 +1688,11 @@ TEST(DataLoaderTest, ChunkDataSetGetBatch) { // When prefetch_count is equal to 1 and no worker thread, the batch // order is deterministic. So we can verify elements in each batch. if (prefetch_count == 1 && dataloader_worker_count == 0) { - for (size_t j = 0; j < batch_size; ++j) { + for (const auto j : c10::irange(batch_size)) { ASSERT_EQ(batch[j], iteration_count * batch_size + j); } } - for (size_t j = 0; j < batch_size; ++j) { + for (const auto j : c10::irange(batch_size)) { result[batch[j]] = true; } } @@ -1978,7 +1979,7 @@ TEST(DataLoaderTest, ChunkDatasetSave) { dataset, DataLoaderOptions(batch_size).workers(dataloader_worker_count)); - for (int epoch_index = 0; epoch_index < epoch_count; ++epoch_index) { + for (const auto epoch_index : c10::irange(epoch_count)) { int iteration_count = 0; for (auto iterator = data_loader->begin(); iterator != data_loader->end(); ++iterator, ++iteration_count) { @@ -2079,7 +2080,7 @@ TEST(DataLoaderTest, ChunkDatasetLoad) { auto data_loader = torch::data::make_data_loader( dataset, DataLoaderOptions(batch_size).workers(dataloader_worker_count)); - for (int epoch_index = 0; epoch_index < epoch_count; ++epoch_index) { + for (const auto epoch_index : c10::irange(epoch_count)) { int iteration_count = 0; // For the first epoch, the returned batch should be returned from the @@ -2128,7 +2129,7 @@ TEST(DataLoaderTest, ChunkDatasetCrossChunkShuffle) { size_t index = 0; // Repeatly sample every 5 indices. 
- for (size_t i = 0; i < batch_size; ++i) { + for (const auto i : c10::irange(batch_size)) { for (size_t j = 0; j < size_ / batch_size; ++j) { indices_[index++] = i + batch_size * j; } @@ -2225,8 +2226,8 @@ TEST(DataLoaderTest, ChunkDatasetCrossChunkShuffle) { for (int i = 0; i < (chunk_count + cross_chunk_shuffle_count - 1) / cross_chunk_shuffle_count; i++) { - for (int j = 0; j < chunk_size; ++j) { - for (int k = 0; k < cross_chunk_shuffle_count; ++k) { + for (const auto j : c10::irange(chunk_size)) { + for (const auto k : c10::irange(cross_chunk_shuffle_count)) { if (i * cross_chunk_shuffle_count + k < chunk_count) { expected_result.push_back(i * cross_chunk_shuffle_count + k); } diff --git a/test/cpp/api/dispatch.cpp b/test/cpp/api/dispatch.cpp index 6416fe3e809158..ba5300659b39e0 100644 --- a/test/cpp/api/dispatch.cpp +++ b/test/cpp/api/dispatch.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -24,7 +25,7 @@ TEST_F(DispatchTest, TestAVX2) { setenv("ATEN_CPU_CAPABILITY", "avx2", 1); #endif const auto actual_pow_avx2 = vals_tensor.pow(pows_tensor); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { ASSERT_EQ(result[i], actual_pow_avx2[i].item()); } } @@ -40,7 +41,7 @@ TEST_F(DispatchTest, TestAVX512) { setenv("ATEN_CPU_CAPABILITY", "avx512", 1); #endif const auto actual_pow_avx512 = vals_tensor.pow(pows_tensor); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { ASSERT_EQ(result[i], actual_pow_avx512[i].item()); } } @@ -56,7 +57,7 @@ TEST_F(DispatchTest, TestDefault) { setenv("ATEN_CPU_CAPABILITY", "default", 1); #endif const auto actual_pow_default = vals_tensor.pow(pows_tensor); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { ASSERT_EQ(result[i], actual_pow_default[i].item()); } } diff --git a/test/cpp/api/expanding-array.cpp b/test/cpp/api/expanding-array.cpp index 0ad6dd6fbe8e25..b3e725756ff9c2 100644 --- a/test/cpp/api/expanding-array.cpp +++ 
b/test/cpp/api/expanding-array.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -13,7 +14,7 @@ struct ExpandingArrayTest : torch::test::SeedingFixture {}; TEST_F(ExpandingArrayTest, CanConstructFromInitializerList) { torch::ExpandingArray<5> e({1, 2, 3, 4, 5}); ASSERT_EQ(e.size(), 5); - for (size_t i = 0; i < e.size(); ++i) { + for (const auto i : c10::irange(e.size())) { ASSERT_EQ((*e)[i], i + 1); } } @@ -21,7 +22,7 @@ TEST_F(ExpandingArrayTest, CanConstructFromInitializerList) { TEST_F(ExpandingArrayTest, CanConstructFromVector) { torch::ExpandingArray<5> e(std::vector{1, 2, 3, 4, 5}); ASSERT_EQ(e.size(), 5); - for (size_t i = 0; i < e.size(); ++i) { + for (const auto i : c10::irange(e.size())) { ASSERT_EQ((*e)[i], i + 1); } } @@ -29,7 +30,7 @@ TEST_F(ExpandingArrayTest, CanConstructFromVector) { TEST_F(ExpandingArrayTest, CanConstructFromArray) { torch::ExpandingArray<5> e(std::array({1, 2, 3, 4, 5})); ASSERT_EQ(e.size(), 5); - for (size_t i = 0; i < e.size(); ++i) { + for (const auto i : c10::irange(e.size())) { ASSERT_EQ((*e)[i], i + 1); } } @@ -37,7 +38,7 @@ TEST_F(ExpandingArrayTest, CanConstructFromArray) { TEST_F(ExpandingArrayTest, CanConstructFromSingleValue) { torch::ExpandingArray<5> e(5); ASSERT_EQ(e.size(), 5); - for (size_t i = 0; i < e.size(); ++i) { + for (const auto i : c10::irange(e.size())) { ASSERT_EQ((*e)[i], 5); } } diff --git a/test/cpp/api/fft.cpp b/test/cpp/api/fft.cpp index 5648a3de009036..5b6452d0a853fd 100644 --- a/test/cpp/api/fft.cpp +++ b/test/cpp/api/fft.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -14,15 +15,15 @@ torch::Tensor naive_dft(torch::Tensor x, bool forward=true) { // Roots of unity, exp(-2*pi*j*n/N) for n in [0, N), reversed for inverse transform std::vector> roots(len); const auto angle_base = (forward ? 
-2.0 : 2.0) * M_PI / len; - for (int64_t i = 0; i < len; ++i) { + for (const auto i : c10::irange(len)) { auto angle = i * angle_base; roots[i] = c10::complex(std::cos(angle), std::sin(angle)); } const auto in = x.data_ptr>(); const auto out = out_tensor.data_ptr>(); - for (int64_t i = 0; i < len; ++i) { - for (int64_t j = 0; j < len; ++j) { + for (const auto i : c10::irange(len)) { + for (const auto j : c10::irange(len)) { out[i] += roots[(j * i) % len] * in[j]; } } diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 8b7889f1841ef1..62e4f4dfdbbb1d 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -1127,7 +1128,7 @@ TEST_F(FunctionalTest, GumbelSoftmax) { int dims[] = {1, -1}; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-magic-numbers) int expected[] = {5*3, 5*4}; - for(auto i=0; i<2; i++) { + for (const auto i : c10::irange(2)) { auto logits = torch::randn({5, 4, 3}); int expected_count = expected[i]; auto y_draw = F::gumbel_softmax(logits, F::GumbelSoftmaxFuncOptions().hard(true).dim(dims[i])); @@ -1149,7 +1150,7 @@ TEST_F(FunctionalTest, GumbelSoftmax) { auto counts = torch::zeros_like(logits); torch::Tensor y_draw; - for (auto i=0; i +#include #include #include @@ -14,7 +15,7 @@ void check_exact_values( const std::vector>& expected_parameters) { ASSERT_EQ(parameters.size(), expected_parameters.size()); - for (size_t i = 0; i < parameters.size(); i++) { + for (const auto i : c10::irange(parameters.size())) { auto layerParameters = parameters[i]; auto expectedLayerParameters = expected_parameters[i]; @@ -27,7 +28,7 @@ void check_exact_values( ASSERT_TRUE(false); } - for (size_t p = 0; p < layerParameters.size(0); p++) { + for (const auto p : c10::irange(layerParameters.size(0))) { // Always compare using double dtype, regardless of the original dtype of the tensors auto tensor = 
layerParameters[p].to(torch::kFloat64); auto expectedTensor = expectedLayerParameters[p].to(torch::kFloat64); diff --git a/test/cpp/api/integration.cpp b/test/cpp/api/integration.cpp index 7a57d82cb33942..5c65803891215f 100644 --- a/test/cpp/api/integration.cpp +++ b/test/cpp/api/integration.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -122,7 +123,7 @@ bool test_mnist( torch::Device device(with_cuda ? torch::kCUDA : torch::kCPU); model->to(device); - for (size_t epoch = 0; epoch < number_of_epochs; epoch++) { + for (const auto epoch : c10::irange(number_of_epochs)) { // NOLINTNEXTLINE(performance-for-range-copy) for (torch::data::Example<> batch : *data_loader) { auto data = batch.data.to(device), targets = batch.target.to(device); @@ -196,7 +197,7 @@ TEST_F(IntegrationTest, CartPole) { std::vector policy_loss; std::vector value_loss; - for (auto i = 0U; i < saved_log_probs.size(); i++) { + for (const auto i : c10::irange(0U, saved_log_probs.size())) { auto advantage = r_t[i] - saved_values[i].item(); policy_loss.push_back(-advantage * saved_log_probs[i]); value_loss.push_back( diff --git a/test/cpp/api/module.cpp b/test/cpp/api/module.cpp index fc9f57355cb2ca..c99330fb6efa3b 100644 --- a/test/cpp/api/module.cpp +++ b/test/cpp/api/module.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -699,7 +700,7 @@ TEST_F(ModuleTest, ModulesReturnsExpectedSubmodulesForFlatModel) { std::vector> expected = { model.ptr(), model[0], model[1], model[2]}; ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { // Assert pointer equality. 
ASSERT_EQ(modules[i].get(), expected[i].get()); } @@ -712,7 +713,7 @@ TEST_F(ModuleTest, ModulesExcludesSelfWhenIncludeSelfSetToFalse) { std::vector> expected = { model[0], model[1], model[2]}; ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { // Assert pointer equality. ASSERT_EQ(modules[i].get(), expected[i].get()); } @@ -725,7 +726,7 @@ TEST_F(ModuleTest, NamedModulesReturnsExpectedNamedSubmodulesForFlatModel) { std::vector> expected = { model.ptr(), model[0], model[1], model[2]}; ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { // Assert pointer equality. ASSERT_EQ(modules[i].key(), i ? std::to_string(i - 1) : std::string()); ASSERT_EQ(modules[i].value().get(), expected[i].get()); @@ -740,7 +741,7 @@ TEST_F(ModuleTest, NamedModulesExcludesSelfWhenIncludeSelfSetToFalse) { std::vector> expected = { model[0], model[1], model[2]}; ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { // Assert pointer equality. ASSERT_EQ(modules[i].key(), std::to_string(i)); ASSERT_EQ(modules[i].value().get(), expected[i].get()); @@ -753,7 +754,7 @@ TEST_F(ModuleTest, ChildrenReturnsExpectedSubmodulesForFlatModel) { std::vector> expected = { model[0], model[1], model[2]}; ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { // Assert pointer equality. 
ASSERT_EQ(modules[i].get(), expected[i].get()); } @@ -769,7 +770,7 @@ TEST_F(ModuleTest, NamedChildrenReturnsExpectedNamedSubmodulesForFlatModel) { std::vector> expected = { model[0], model[1], model[2]}; ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { // Assert pointer equality. ASSERT_EQ(modules[i].key(), std::to_string(i)); ASSERT_EQ(modules[i].value().get(), expected[i].get()); @@ -817,7 +818,7 @@ TEST_F(ModuleTest, NamedBuffersReturnsExpectedTensorsForFlatModel) { struct TestContainer : torch::nn::Module { TestContainer(int64_t number, std::vector modules = {}) : tensor(torch::tensor(number)) { - for (size_t i = 0; i < modules.size(); ++i) { + for (const auto i : c10::irange(modules.size())) { register_module( std::to_string(i), std::make_shared(std::move(modules[i]))); @@ -861,7 +862,7 @@ TEST_F(ModuleTest, ModulesReturnsExpectedSubmodulesForDeepModel) { std::vector> modules = model->modules(); ASSERT_EQ(modules.size(), 10); - for (size_t i = 0; i < modules.size(); ++i) { + for (const auto i : c10::irange(modules.size())) { ASSERT_EQ(get_test_container_item(modules[i]), i); } } @@ -874,7 +875,7 @@ TEST_F(ModuleTest, NamedModulesReturnsExpectedNamedSubmodulesForDeepModel) { ASSERT_EQ(modules.size(), expected.size()); - for (size_t i = 0; i < expected.size(); ++i) { + for (const auto i : c10::irange(expected.size())) { ASSERT_EQ(modules[i].key(), expected[i].first); ASSERT_EQ(get_test_container_item(modules[i].value()), expected[i].second); } diff --git a/test/cpp/api/modulelist.cpp b/test/cpp/api/modulelist.cpp index 98effb9780abeb..aa4fd05c11d29e 100644 --- a/test/cpp/api/modulelist.cpp +++ b/test/cpp/api/modulelist.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -118,7 +119,7 @@ TEST_F(ModuleListTest, AccessWithAt) { ASSERT_EQ(list->size(), 3); // returns the correct module for a given index - for (size_t i = 0; i < modules.size(); ++i) { + for 
(const auto i : c10::irange(modules.size())) { ASSERT_EQ(&list->at(i), modules[i].get()); } @@ -143,7 +144,7 @@ TEST_F(ModuleListTest, AccessWithPtr) { ASSERT_EQ(list->size(), 3); // returns the correct module for a given index - for (size_t i = 0; i < modules.size(); ++i) { + for (const auto i : c10::irange(modules.size())) { ASSERT_EQ(list->ptr(i).get(), modules[i].get()); ASSERT_EQ(list[i].get(), modules[i].get()); ASSERT_EQ(list->ptr(i).get(), modules[i].get()); diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 0c40cf1fa70255..7ef2f63040d954 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -1148,7 +1149,7 @@ TEST_F(ModulesTest, LayerNorm) { s.backward(); ASSERT_EQ(y.ndimension(), 2); ASSERT_EQ(s.ndimension(), 0); - for (auto i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { ASSERT_EQ(y.size(i), 2); } @@ -1166,7 +1167,7 @@ TEST_F(ModulesTest, GroupNorm) { s.backward(); ASSERT_EQ(y.ndimension(), 2); ASSERT_EQ(s.ndimension(), 0); - for (auto i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { ASSERT_EQ(y.size(i), 2); } @@ -2595,7 +2596,7 @@ TEST_F(ModulesTest, Softmax) { auto output = m(input); auto sum = torch::sum(torch::exp(input), 1); - for (int i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { auto expected = torch::exp(input[i]) / sum[i]; ASSERT_TRUE(torch::allclose(output[i], expected)); } @@ -2607,7 +2608,7 @@ TEST_F(ModulesTest, Softmin) { auto output = m(input); auto sum = torch::sum(torch::exp(-input), 1); - for (int i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { auto expected = torch::exp(-input[i]) / sum[i]; ASSERT_TRUE(torch::allclose(output[i], expected)); } @@ -2619,7 +2620,7 @@ TEST_F(ModulesTest, LogSoftmax) { auto output = m(input); auto sum = torch::sum(torch::exp(input), 1); - for (int i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { auto expected = torch::log(torch::exp(input[i]) / sum[i]); 
ASSERT_TRUE(torch::allclose(output[i], expected)); } @@ -2656,7 +2657,7 @@ TEST_F(ModulesTest, AdaptiveLogSoftmaxWithLoss) { auto logprob_out = asfm->log_prob(x); NLLLoss nll_loss; - for (int64_t v = 0; v < 4; ++v) { + for (const auto v : c10::irange(4)) { auto y = torch::full({4}, v, torch::kLong); auto asm_out = asfm(x, y); auto out = asm_out.output; @@ -2675,10 +2676,10 @@ TEST_F(ModulesTest, Softmax2d) { auto output = m(input); auto sum = torch::sum(torch::exp(input), 1); - for (int i = 0; i < 1; i++) { - for (int j = 0; j < 2; j++) { - for (int k = 0; k < 3; k++) { - for (int l = 0; l < 4; l++) { + for (const auto i : c10::irange(1)) { + for (const auto j : c10::irange(2)) { + for (const auto k : c10::irange(3)) { + for (const auto l : c10::irange(4)) { auto expected = torch::exp(input[i][j][k][l]) / sum[i][k][l]; ASSERT_TRUE(torch::allclose(output[i][j][k][l], expected)); } @@ -3389,8 +3390,8 @@ namespace detail { TORCH_INTERNAL_ASSERT(a.size(0) == b.size(0)); TORCH_INTERNAL_ASSERT(a.size(1) == b.size(1)); auto retval = torch::zeros({a.size(0), a.size(1), a.size(2), b.size(3)}, torch::kFloat32); - for (int i = 0; i < a.size(0); i++) { - for (int j = 0; j < a.size(1); j++) { + for (const auto i : c10::irange(a.size(0))) { + for (const auto j : c10::irange(a.size(1))) { retval[i][j] = torch::matmul(a[i][j], b[i][j]); } } @@ -3399,9 +3400,9 @@ namespace detail { torch::Tensor _softmax(const torch::Tensor& x) { auto output = torch::zeros(x.sizes()); - for (int i = 0; i < x.size(0); i++) { - for (int j = 0; j < x.size(1); j++) { - for (int k = 0; k < x.size(2); k++) { + for (const auto i : c10::irange(x.size(0))) { + for (const auto j : c10::irange(x.size(1))) { + for (const auto k : c10::irange(x.size(2))) { const auto& x_curr = x[i][j][k]; const auto e_x = torch::exp(x_curr - torch::max(x_curr)); output[i][j][k] = e_x / torch::sum(e_x); @@ -3424,10 +3425,10 @@ namespace detail { const auto s1 = QKT.size(2); const auto s2 = QKT.size(3); if (unseen_mask.defined() 
|| key_padding_mask.defined()) { - for (int i = 0; i < b1; i++) { - for (int j = 0; j < b2; j++) { - for (int m = 0; m < s1; m++) { - for (int n = 0; n < s2; n++) { + for (const auto i : c10::irange(b1)) { + for (const auto j : c10::irange(b2)) { + for (const auto m : c10::irange(s1)) { + for (const auto n : c10::irange(s2)) { if (unseen_mask.defined() && unseen_mask[m][n].item() == 0) { QKT[i][j][m][n] = -std::numeric_limits::infinity(); } @@ -3475,7 +3476,7 @@ namespace detail { std::uniform_int_distribution d_2_10(2, 10); std::uniform_int_distribution d_3_10(3, 10); bool registration_checked = false; - for (int i = 0; i < 100; i++) { + for (const auto i : c10::irange(100)) { const auto batch_sz = d_2_10(generator); const auto seq_len = d_2_10(generator); const auto d_head = d_3_10(generator); diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp index 37c2676fa53553..634391d721edd5 100644 --- a/test/cpp/api/nn_utils.cpp +++ b/test/cpp/api/nn_utils.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -40,7 +41,7 @@ TEST_F(NNUtilsTest, ClipGradNorm) { auto compare_scaling = [&](const std::vector& grads) -> torch::Tensor { std::vector p_scale; - for (int i = 0; i < grads.size(); i++) { + for (const auto i : c10::irange(grads.size())) { auto param = l->parameters()[i]; auto grad = grads[i]; p_scale.push_back(param.grad().data().div(grad).view(-1)); @@ -61,7 +62,7 @@ TEST_F(NNUtilsTest, ClipGradNorm) { std::numeric_limits::infinity(), }; for (auto norm_type : norm_types) { - for (int i = 0; i < grads.size(); i++) { + for (const auto i : c10::irange(grads.size())) { l->parameters()[i].mutable_grad() = grads[i].clone().view_as(l->parameters()[i].data()); } @@ -80,7 +81,7 @@ TEST_F(NNUtilsTest, ClipGradNorm) { torch::ones(10).div(500), }; for (auto norm_type : norm_types) { - for (int i = 0; i < grads.size(); i++) { + for (const auto i : c10::irange(grads.size())) { l->parameters()[i].grad().data().copy_(grads[i]); } auto norm_before = 
compute_norm(norm_type); @@ -227,7 +228,7 @@ TEST_F(NNUtilsTest, ClipGradNormErrorIfNonfinite) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto,hicpp-avoid-goto) EXPECT_THROW(utils::clip_grad_norm_(parameters, 1., norm_type, true), std::exception) << msg; // Grads should not change if error is thrown - for (int64_t p_idx = 0; p_idx < parameters.size(); p_idx++) { + for (const auto p_idx : c10::irange(parameters.size())) { ASSERT_TRUE(torch::allclose(parameters[p_idx].grad(), grads_before[p_idx], 1.0, 0.0, /*equal_nan*/ true)) << msg; } } else { @@ -285,7 +286,7 @@ TEST_F(NNUtilsTest, ClipGradValue) { std::vector> grad_lists = { {grad_w, grad_b}, {grad_w, torch::Tensor()}}; for (auto grad_list : grad_lists) { - for (int i = 0; i < grad_list.size(); i++) { + for (const auto i : c10::irange(grad_list.size())) { auto p = l->parameters()[i]; auto g = grad_list[i]; p.mutable_grad() = g.defined() ? g.clone().view_as(p.data()) : g; @@ -335,7 +336,7 @@ TEST_F(NNUtilsTest, ConvertParameters) { }; utils::vector_to_parameters(vector, zero_parameters); - for (int i = 0; i < zero_parameters.size(); ++i) { + for (const auto i : c10::irange(zero_parameters.size())) { ASSERT_TRUE(zero_parameters[i].allclose(parameters[i])); } @@ -368,7 +369,7 @@ int64_t PackedSequenceTest_max_length = 6; std::vector PackedSequenceTest_ordered_sequence(torch::ScalarType tensor_type) { std::vector seqs; seqs.reserve(PackedSequenceTest_batch_size); - for (int64_t i = 0; i < PackedSequenceTest_batch_size; i++) { + for (const auto i : c10::irange(PackedSequenceTest_batch_size)) { seqs.emplace_back(torch::empty({ torch::randint(1, PackedSequenceTest_max_length, {1}).item() }, tensor_type)); @@ -390,7 +391,7 @@ std::tuple PackedSequenceTest_padded_sequence(torc // Create Tensor of random padded sequences auto ordered = PackedSequenceTest_ordered_sequence(tensor_type); auto lengths = torch::empty({(int64_t)ordered.size()}, torch::kInt64); - for (int64_t i = 0; i < ordered.size(); i++) { + for (const auto i 
: c10::irange(ordered.size())) { lengths[i] = ordered[i].size(0); } auto padded_tensor = rnn_utils::pad_sequence(ordered); @@ -619,9 +620,9 @@ TEST_F(NNUtilsTest, PackPaddedSequence) { } auto padded = torch::cat(tensors_to_be_cat, 1); std::vector expected_data_vec; - for (int64_t n = 0; n < batch_sizes.size(0); n++) { + for (const auto n : c10::irange(batch_sizes.size(0))) { int64_t batch_size = batch_sizes[n].item(); - for (int64_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { expected_data_vec.emplace_back(torch::arange(1., 6) + (i + 1) * 100 + 5 * n); } } @@ -631,7 +632,7 @@ TEST_F(NNUtilsTest, PackPaddedSequence) { if (should_shuffle) { // Shuffle the padded sequence to create an unsorted sequence std::vector permutation; - for (int64_t i = 0; i < sorted_lengths.size(); i++) { + for (const auto i : c10::irange(sorted_lengths.size())) { permutation.emplace_back(i); } std::shuffle( @@ -702,7 +703,7 @@ TEST_F(NNUtilsTest, PackPaddedSequence) { if (batch_first) { grad_output.transpose_(0, 1); } - for (int64_t i = 0; i < lengths.size(0); i++) { + for (const auto i : c10::irange(lengths.size(0))) { int64_t l = lengths[i].item(); ASSERT_TRUE(torch::allclose( padded.grad().narrow(0, 0, l).select(1, i), diff --git a/test/cpp/api/operations.cpp b/test/cpp/api/operations.cpp index 49c205ae403e82..e51eb91384e435 100644 --- a/test/cpp/api/operations.cpp +++ b/test/cpp/api/operations.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -11,7 +12,7 @@ struct OperationTest : torch::test::SeedingFixture { }; TEST_F(OperationTest, Lerp) { - for (auto i = 0; i < TEST_AMOUNT; i++) { + for (const auto i : c10::irange(TEST_AMOUNT)) { // test lerp_kernel_scalar auto start = torch::rand({3, 5}); auto end = torch::rand({3, 5}); @@ -35,13 +36,13 @@ TEST_F(OperationTest, Lerp) { } TEST_F(OperationTest, Cross) { - for (auto i = 0; i < TEST_AMOUNT; i++) { + for (const auto i : c10::irange(TEST_AMOUNT)) { // input auto a = torch::rand({10, 3}); auto b 
= torch::rand({10, 3}); // expected auto exp = torch::empty({10, 3}); - for (int j = 0; j < 10; j++) { + for (const auto j : c10::irange(10)) { auto u1 = a[j][0], u2 = a[j][1], u3 = a[j][2]; auto v1 = b[j][0], v2 = b[j][1], v3 = b[j][2]; exp[j][0] = u2 * v3 - v2 * u3; diff --git a/test/cpp/api/optim.cpp b/test/cpp/api/optim.cpp index 026e7eba2a38a3..098cf8d04f2abb 100644 --- a/test/cpp/api/optim.cpp +++ b/test/cpp/api/optim.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -36,7 +37,7 @@ bool test_optimizer_xor(Options options) { while (running_loss > 0.1) { auto inputs = torch::empty({kBatchSize, 2}); auto labels = torch::empty({kBatchSize}); - for (size_t i = 0; i < kBatchSize; i++) { + for (const auto i : c10::irange(kBatchSize)) { inputs[i] = torch::randint(2, {2}, torch::kInt64); labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } @@ -112,7 +113,7 @@ void check_exact_values( torch::Tensor input = torch::tensor({0.1, 0.2, 0.3, 0.4, 0.5, 0.6}, torch::kFloat64).reshape({3, 2}); - for (size_t i = 0; i < kIterations; ++i) { + for (const auto i : c10::irange(kIterations)) { optimizer.zero_grad(); auto output = model->forward(input); auto loss = output.sum(); @@ -124,7 +125,7 @@ void check_exact_values( if (i % kSampleEvery == 0) { ASSERT_TRUE( expected_parameters.at(i / kSampleEvery).size() == parameters.size()); - for (size_t p = 0; p < parameters.size(); ++p) { + for (const auto p : c10::irange(parameters.size())) { ASSERT_TRUE(parameters[p]->defined()); // Always compare using double dtype, regardless of the original dtype of the tensors auto computed = parameters[p]->flatten().to(torch::kFloat64); @@ -143,7 +144,7 @@ void check_exact_values( TEST(OptimTest, OptimizerAccessors) { auto options = AdagradOptions(1.0); std::vector params; - for (size_t i = 0; i < 3; i++) { + for (const auto i : c10::irange(3)) { params.push_back(torch::randn(10)); } auto optimizer = Adagrad(params, options); @@ -155,7 +156,7 @@ TEST(OptimTest, OptimizerAccessors) { // 
NOLINTNEXTLINE(modernize-use-emplace) params_groups.push_back(OptimizerParamGroup(params)); auto& params_1 = params_groups[1].params(); - for (size_t i = 0; i < params_1.size(); i++) { + for (const auto i : c10::irange(params_1.size())) { torch::equal(params[i], params_1[i]); } @@ -225,7 +226,7 @@ TEST(OptimTest, OldInterface) { std::vector params_; OLD_INTERFACE_WARNING_CHECK(params_ = optimizer.parameters()); - for (size_t p = 0; p < size; ++p) { + for (const auto p : c10::irange(size)) { ASSERT_TRUE(params_[p].allclose(parameters[p])); } } diff --git a/test/cpp/api/parallel.cpp b/test/cpp/api/parallel.cpp index bec553ed76624f..1789fe5000877e 100644 --- a/test/cpp/api/parallel.cpp +++ b/test/cpp/api/parallel.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -86,7 +87,7 @@ TEST_F(ParallelTest, Replicate_MultiCUDA) { } replicas[0]->to(torch::kCPU); ASSERT_EQ(replica1_parameters.size(), original_parameters.size()); - for (size_t i = 0; i < original_parameters.size(); ++i) { + for (const auto i : c10::irange(original_parameters.size())) { ASSERT_TRUE(replica1_parameters[i].allclose(original_parameters[i])); ASSERT_TRUE( replica1_parameters[i].data_ptr() != @@ -99,7 +100,7 @@ TEST_F(ParallelTest, Replicate_MultiCUDA) { } replicas[1]->to(torch::kCPU); ASSERT_EQ(replica2_parameters.size(), original_parameters.size()); - for (size_t i = 0; i < original_parameters.size(); ++i) { + for (const auto i : c10::irange(original_parameters.size())) { ASSERT_TRUE(replica2_parameters[i].allclose(original_parameters[i])); ASSERT_TRUE( replica2_parameters[i].data_ptr() != @@ -222,7 +223,7 @@ TEST_F(ParallelTest, DataParallelUsesAllAvailableCUDADevices_CUDA) { auto output = parallel::data_parallel(m, input); ASSERT_EQ(output.numel(), device_count); - for (size_t i = 0; i < device_count; ++i) { + for (const auto i : c10::irange(device_count)) { ASSERT_EQ(output[i].item(), i); } } @@ -258,7 +259,7 @@ TEST_F(ParallelTest, DataParallelNumericalEquivalence_MultiCUDA) { 
auto model_dp = std::dynamic_pointer_cast(model->clone()); // run 3 training iterations - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { input += i; input_dp += i; diff --git a/test/cpp/api/parameterlist.cpp b/test/cpp/api/parameterlist.cpp index d4c52133ce6dd3..d5b464c6ec2ff4 100644 --- a/test/cpp/api/parameterlist.cpp +++ b/test/cpp/api/parameterlist.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -78,11 +79,11 @@ TEST_F(ParameterListTest, AccessWithAt) { ASSERT_EQ(list->size(), 4); // returns the correct module for a given index - for (size_t i = 0; i < params.size(); ++i) { + for (const auto i : c10::irange(params.size())) { ASSERT_TRUE(torch::all(torch::eq(list->at(i), params[i])).item()); } - for (size_t i = 0; i < params.size(); ++i) { + for (const auto i : c10::irange(params.size())) { ASSERT_TRUE(torch::all(torch::eq(list[i], params[i])).item()); } diff --git a/test/cpp/api/sequential.cpp b/test/cpp/api/sequential.cpp index ed245963c7fdba..cac719af6cde59 100644 --- a/test/cpp/api/sequential.cpp +++ b/test/cpp/api/sequential.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -172,7 +173,7 @@ TEST_F(SequentialTest, AccessWithAt) { ASSERT_EQ(sequential->size(), 3); // returns the correct module for a given index - for (size_t i = 0; i < modules.size(); ++i) { + for (const auto i : c10::irange(modules.size())) { ASSERT_EQ(&sequential->at(i), modules[i].get()); } @@ -201,7 +202,7 @@ TEST_F(SequentialTest, AccessWithPtr) { ASSERT_EQ(sequential->size(), 3); // returns the correct module for a given index - for (size_t i = 0; i < modules.size(); ++i) { + for (const auto i : c10::irange(modules.size())) { ASSERT_EQ(sequential->ptr(i).get(), modules[i].get()); ASSERT_EQ(sequential[i].get(), modules[i].get()); ASSERT_EQ(sequential->ptr(i).get(), modules[i].get()); diff --git a/test/cpp/api/serialize.cpp b/test/cpp/api/serialize.cpp index 46ede0f156ac1d..b422662aa3623f 100644 --- a/test/cpp/api/serialize.cpp +++ 
b/test/cpp/api/serialize.cpp @@ -2,6 +2,7 @@ #include #include +#include #include @@ -41,7 +42,7 @@ void is_optimizer_param_group_equal(const OptimizerParamGroup& lhs, const Optimi const auto& rhs_params = rhs.params(); ASSERT_TRUE(lhs_params.size() == rhs_params.size()); - for (size_t j = 0; j < lhs_params.size(); j++) { + for (const auto j : c10::irange(lhs_params.size())) { ASSERT_TRUE(torch::equal(lhs_params[j], rhs_params[j])); } ASSERT_TRUE(static_cast(lhs.options()) == static_cast(rhs.options())); @@ -136,7 +137,7 @@ void test_serialize_optimizer(DerivedOptimizerOptions options, bool only_has_glo ASSERT_TRUE(optim3_2_state.size() == optim3_state.size()); // checking correctness of serialization logic for optimizer.param_groups_ and optimizer.state_ - for (int i = 0; i < optim3_2_param_groups.size(); i++) { + for (const auto i : c10::irange(optim3_2_param_groups.size())) { is_optimizer_param_group_equal( optim3_2_param_groups[i], optim3_param_groups[i]); is_optimizer_state_equal(optim3_2_state, optim3_state); @@ -173,7 +174,7 @@ void write_tensors_to_archive( const BufferContainer& buffers) { archive.write( key + "/size", torch::tensor(static_cast(buffers.size()))); - for (size_t index = 0; index < buffers.size(); ++index) { + for (const auto index : c10::irange(buffers.size())) { archive.write( key + "/" + c10::to_string(index), buffers[index], /*is_buffer=*/true); } @@ -203,7 +204,7 @@ void write_step_buffers( TEST(SerializeTest, KeysFunc) { auto tempfile = c10::make_tempfile(); torch::serialize::OutputArchive output_archive; - for (size_t i = 0; i < 3; i++) { + for (const auto i : c10::irange(3)) { output_archive.write("element/" + c10::to_string(i), c10::IValue(static_cast(i))); } output_archive.save_to(tempfile.name); @@ -211,7 +212,7 @@ TEST(SerializeTest, KeysFunc) { input_archive.load_from(tempfile.name); std::vector keys = input_archive.keys(); ASSERT_EQ(keys.size(), 3); - for (size_t i = 0; i < keys.size(); i++) { + for (const auto i : 
c10::irange(keys.size())) { ASSERT_EQ(keys[i], "element/" + c10::to_string(i)); } } @@ -219,7 +220,7 @@ TEST(SerializeTest, KeysFunc) { TEST(SerializeTest, TryReadFunc) { auto tempfile = c10::make_tempfile(); torch::serialize::OutputArchive output_archive; - for (size_t i = 0; i < 3; i++) { + for (const auto i : c10::irange(3)) { output_archive.write("element/" + c10::to_string(i), c10::IValue(static_cast(i))); } output_archive.save_to(tempfile.name); @@ -363,7 +364,7 @@ TEST(SerializeTest, XOR) { auto getLoss = [](Sequential model, uint32_t batch_size) { auto inputs = torch::empty({batch_size, 2}); auto labels = torch::empty({batch_size}); - for (size_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { inputs[i] = torch::randint(2, {2}, torch::kInt64); labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } @@ -533,7 +534,7 @@ TEST(SerializeTest, Optim_SGD) { int64_t iteration_{0}; const auto& params_ = optim1.param_groups()[0].params(); const auto& optim1_state = optim1.state(); - for (size_t i = 0; i < params_.size(); i++) { + for (const auto i : c10::irange(params_.size())) { if(i != (params_.size() - 1)) { auto key_ = c10::guts::to_string(params_[i].unsafeGetTensorImpl()); const SGDParamState& curr_state_ = static_cast(*(optim1_state.at(key_).get())); @@ -577,7 +578,7 @@ TEST(SerializeTest, Optim_Adam) { std::vector max_exp_average_sq_buffers; const auto& params_ = optim1.param_groups()[0].params(); const auto& optim1_state = optim1.state(); - for (size_t i = 0; i < params_.size(); i++) { + for (const auto i : c10::irange(params_.size())) { if(i != (params_.size() - 1)) { auto key_ = c10::guts::to_string(params_[i].unsafeGetTensorImpl()); const AdamParamState& curr_state_ = static_cast(*(optim1_state.at(key_).get())); @@ -627,7 +628,7 @@ TEST(SerializeTest, Optim_AdamW) { std::vector max_exp_average_sq_buffers; const auto& params_ = optim1.param_groups()[0].params(); const auto& optim1_state = optim1.state(); - for (size_t i = 0; 
i < params_.size(); i++) { + for (const auto i : c10::irange(params_.size())) { if(i != (params_.size() - 1)) { auto key_ = c10::guts::to_string(params_[i].unsafeGetTensorImpl()); const AdamWParamState& curr_state_ = static_cast(*(optim1_state.at(key_).get())); @@ -678,7 +679,7 @@ TEST(SerializeTest, Optim_RMSprop) { std::vector grad_average_buffers; const auto& params_ = optim1.param_groups()[0].params(); const auto& optim1_state = optim1.state(); - for (size_t i = 0; i < params_.size(); i++) { + for (const auto i : c10::irange(params_.size())) { if(i != (params_.size() - 1)) { auto key_ = c10::guts::to_string(params_[i].unsafeGetTensorImpl()); const RMSpropParamState& curr_state_ = static_cast(*(optim1_state.at(key_).get())); @@ -703,7 +704,7 @@ TEST(SerializeTest, Optim_RMSprop) { const auto& params1_2_ = optim1_2.param_groups()[0].params(); auto& optim1_2_state = optim1_2.state(); // old RMSprop didn't track step value - for (size_t i = 0; i < params1_2_.size(); i++) { + for (const auto i : c10::irange(params1_2_.size())) { if(i != (params1_2_.size() - 1)) { auto key_ = c10::guts::to_string(params_[i].unsafeGetTensorImpl()); auto key1_2_ = c10::guts::to_string(params1_2_[i].unsafeGetTensorImpl()); @@ -788,7 +789,7 @@ TEST(SerializeTest, XOR_CUDA) { inputs = inputs.cuda(); labels = labels.cuda(); } - for (size_t i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { inputs[i] = torch::randint(2, {2}, torch::kInt64); labels[i] = inputs[i][0].item() ^ inputs[i][1].item(); } @@ -879,7 +880,7 @@ TEST(SerializeTest, VectorOfTensors) { std::vector y_vec; torch::load(y_vec, stream); - for (int64_t i = 0; i < x_vec.size(); i++) { + for (const auto i : c10::irange(x_vec.size())) { auto& x = x_vec[i]; auto& y = y_vec[i]; ASSERT_TRUE(y.defined()); diff --git a/test/cpp/api/static.cpp b/test/cpp/api/static.cpp index df90d1fc03763b..36bb5deb00cfea 100644 --- a/test/cpp/api/static.cpp +++ b/test/cpp/api/static.cpp @@ -1,5 +1,6 @@ #include +#include 
#include #include #include @@ -95,7 +96,7 @@ TEST(TestStatic, Apply) { std::vector v; torch::apply([&v](int x) { v.push_back(x); }, 1, 2, 3, 4, 5); ASSERT_EQ(v.size(), 5); - for (size_t i = 0; i < v.size(); ++i) { + for (const auto i : c10::irange(v.size())) { ASSERT_EQ(v.at(i), i + 1); } } diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp index 362cc6af1285b5..eac3a04205ddb3 100644 --- a/test/cpp/api/tensor.cpp +++ b/test/cpp/api/tensor.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include @@ -263,7 +264,7 @@ TEST(TensorTest, AtTensorCtorSingleDim) { tensor = at::tensor(v); ASSERT_EQ(tensor.numel(), v.size()); ASSERT_EQ(tensor.dtype(), at::kInt); - for (size_t i = 0; i < v.size(); ++i) { + for (const auto i : c10::irange(v.size())) { ASSERT_TRUE(exactly_equal(tensor[i], v.at(i))); } @@ -271,7 +272,7 @@ TEST(TensorTest, AtTensorCtorSingleDim) { tensor = at::tensor(w); ASSERT_EQ(tensor.numel(), w.size()); ASSERT_EQ(tensor.dtype(), at::kDouble); - for (size_t i = 0; i < w.size(); ++i) { + for (const auto i : c10::irange(w.size())) { ASSERT_TRUE(almost_equal(tensor[i], w.at(i))); } @@ -282,7 +283,7 @@ TEST(TensorTest, AtTensorCtorSingleDim) { tensor = at::tensor(x); ASSERT_EQ(tensor.numel(), x.size()); ASSERT_EQ(tensor.dtype(), at::kComplexDouble); - for (size_t i = 0; i < x.size(); ++i) { + for (const auto i : c10::irange(x.size())) { ASSERT_TRUE(almost_equal(tensor[i], x.at(i))); } } @@ -913,8 +914,8 @@ TEST(TensorTest, FromBlobWithStrides) { ASSERT_EQ(tensor.numel(), 9); const std::vector expected_strides = {1, 3}; ASSERT_EQ(tensor.strides(), expected_strides); - for (int64_t i = 0; i < tensor.size(0); ++i) { - for (int64_t j = 0; j < tensor.size(1); ++j) { + for (const auto i : c10::irange(tensor.size(0))) { + for (const auto j : c10::irange(tensor.size(1))) { // NOTE: This is column major because the strides are swapped. 
EXPECT_EQ(tensor[i][j].item(), 1 + (j * tensor.size(1)) + i); } diff --git a/test/cpp/c10d/ProcessGroupGlooTest.cpp b/test/cpp/c10d/ProcessGroupGlooTest.cpp index 5c490fe28e2e19..b278bfd3518754 100644 --- a/test/cpp/c10d/ProcessGroupGlooTest.cpp +++ b/test/cpp/c10d/ProcessGroupGlooTest.cpp @@ -242,7 +242,7 @@ void checkProfiledEvents( auto match = !strcmp(evt.name(), expected_profile_str); if (verify_shapes && match) { auto shapesVec = evt.shapes(); - for (int i = 0; i < expected_count; i++) { + for (const auto i : c10::irange(expected_count)) { // Assumptions: no two expected shapes are the same if (shapesVec[0] == expected_shapes[i]) { matched_shapes[i] = true; diff --git a/test/cpp/c10d/ProcessGroupNCCLTest.cpp b/test/cpp/c10d/ProcessGroupNCCLTest.cpp index 410e7470642d7d..65b9c00de2b7a1 100644 --- a/test/cpp/c10d/ProcessGroupNCCLTest.cpp +++ b/test/cpp/c10d/ProcessGroupNCCLTest.cpp @@ -312,8 +312,7 @@ class ReduceScatterBaseNCCLTest : public NCCLTest { : NCCLTest(path, worldSize) { output_tensor_ = at::empty({1}, at::kCUDA); input_tensor_ = at::empty({worldSize}, at::kCUDA); - for(int i = 0; i < worldSize; i++) - { + for (const auto i : c10::irange(worldSize)) { input_tensor_[i] = i; } } diff --git a/test/cpp/rpc/test_tensorpipe_serialization.cpp b/test/cpp/rpc/test_tensorpipe_serialization.cpp index 1de13fd353c287..b561e71aec9954 100644 --- a/test/cpp/rpc/test_tensorpipe_serialization.cpp +++ b/test/cpp/rpc/test_tensorpipe_serialization.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -64,7 +65,7 @@ TEST(TensorpipeSerialize, Base) { // Mimic tensorpipe data transfer EXPECT_EQ( recvingTpAllocation.payloads.size(), sendingTpMessage.payloads.size()); - for (int i = 0; i < recvingTpAllocation.payloads.size(); i++) { + for (const auto i : c10::irange(recvingTpAllocation.payloads.size())) { tensorpipe::Message::Payload& srcPayload = sendingTpMessage.payloads[i]; tensorpipe::Allocation::Payload& dstPayload = recvingTpAllocation.payloads[i]; @@ 
-76,7 +77,7 @@ TEST(TensorpipeSerialize, Base) { } EXPECT_EQ( recvingTpAllocation.tensors.size(), sendingTpMessage.tensors.size()); - for (int i = 0; i < recvingTpAllocation.tensors.size(); i++) { + for (const auto i : c10::irange(recvingTpAllocation.tensors.size())) { tensorpipe::Message::Tensor& srcTensor = sendingTpMessage.tensors[i]; tensorpipe::Allocation::Tensor& dstTensor = recvingTpAllocation.tensors[i]; memcpy( diff --git a/test/cpp/rpc/test_wire_serialization.cpp b/test/cpp/rpc/test_wire_serialization.cpp index 9c949dd94c03af..ed96b52fa18b5e 100644 --- a/test/cpp/rpc/test_wire_serialization.cpp +++ b/test/cpp/rpc/test_wire_serialization.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -27,7 +28,7 @@ TEST(WireSerialize, Base) { EXPECT_TRUE( memcmp(deser.first.data(), payload.data(), payload.size()) == 0); } - for (size_t i = 0; i < tensors.size(); ++i) { + for (const auto i : c10::irange(tensors.size())) { EXPECT_TRUE(torch::equal(tensors[i], deser.second[i])); } }; diff --git a/test/cpp/tensorexpr/padded_buffer.cpp b/test/cpp/tensorexpr/padded_buffer.cpp index a78e7c29436b92..2b6202557fdb8e 100644 --- a/test/cpp/tensorexpr/padded_buffer.cpp +++ b/test/cpp/tensorexpr/padded_buffer.cpp @@ -1,6 +1,7 @@ #include "test/cpp/tensorexpr/padded_buffer.h" #include +#include #include namespace torch { @@ -10,7 +11,7 @@ namespace tensorexpr { int PaddedBufferBase::Index(const std::vector& indices) const { DCHECK_EQ(dims_.size(), indices.size()); int total_index = 0; - for (size_t i = 0; i < dims_.size(); i++) { + for (const auto i : c10::irange(dims_.size())) { total_index += indices[i] * strides_[i]; } return total_index; diff --git a/test/cpp/tensorexpr/padded_buffer.h b/test/cpp/tensorexpr/padded_buffer.h index ab050d5c8f1a91..b3e5227ae7e625 100644 --- a/test/cpp/tensorexpr/padded_buffer.h +++ b/test/cpp/tensorexpr/padded_buffer.h @@ -3,6 +3,7 @@ #include #include +#include #include "torch/csrc/jit/tensorexpr/eval.h" namespace torch { @@ -168,7 +169,7 
@@ class PaddedBuffer : public PaddedBufferBase { // Verify the watermarks in the paddings are intact. void ValidateWatermark() const { - for (int i = 0; i < kPaddingSize; i++) { + for (const auto i : c10::irange(kPaddingSize)) { ASSERT_EQ(data_[i], kPaddingValue); ASSERT_EQ(data_[i + total_size_ + kPaddingSize], kPaddingValue); } @@ -178,7 +179,7 @@ class PaddedBuffer : public PaddedBufferBase { ValidateWatermark(); DCHECK(backup_data_.size() == data_.size()) << "Please make sure you have call Backup() before calling CheckBackup()"; - for (int i = 0; i < total_size_; i++) { + for (const auto i : c10::irange(total_size_)) { ASSERT_EQ(data_[i + kPaddingSize], backup_data_[i + kPaddingSize]); } } @@ -214,7 +215,7 @@ void ExpectAllEqual(const PaddedBuffer& f1, const PaddedBuffer& f2) { ASSERT_EQ(v1.size(), v2.size()); f1.ValidateWatermark(); f2.ValidateWatermark(); - for (int i = 0; i < total_size; i++) { + for (const auto i : c10::irange(total_size)) { ASSERT_EQ(v1[kPaddingSize + i], v2[kPaddingSize + i]); } } @@ -231,7 +232,7 @@ void ExpectAllNear( ASSERT_EQ(v1.size(), v2.size()); f1.ValidateWatermark(); f2.ValidateWatermark(); - for (int i = 0; i < total_size; i++) { + for (const auto i : c10::irange(total_size)) { ASSERT_NEAR(v1[kPaddingSize + i], v2[kPaddingSize + i], abs_error); } } diff --git a/test/cpp/tensorexpr/test_aten.cpp b/test/cpp/tensorexpr/test_aten.cpp index ecc6365a79f662..34ce2bd069d55f 100644 --- a/test/cpp/tensorexpr/test_aten.cpp +++ b/test/cpp/tensorexpr/test_aten.cpp @@ -5,6 +5,7 @@ #include #include +#include #include "test/cpp/tensorexpr/padded_buffer.h" #include "test/cpp/tensorexpr/test_base.h" #include "torch/csrc/jit/tensorexpr/ir_printer.h" @@ -28,14 +29,14 @@ TEST(ATen, _cast_Float) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < 
kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), static_cast(i)); } @@ -55,14 +56,14 @@ TEST(ATen, negInt) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), -static_cast(i)); } @@ -82,14 +83,14 @@ TEST(ATen, negFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), -i); } @@ -114,7 +115,7 @@ TEST(ATen, addInt) { PaddedBuffer c_v(kTotalSize); PaddedBuffer d_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -123,7 +124,7 @@ TEST(ATen, addInt) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf}); ir_eval(a_v, b_v, c_v, d_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -150,7 +151,7 @@ TEST(ATen, addFloat) { PaddedBuffer c_v(kTotalSize); PaddedBuffer d_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -159,7 +160,7 @@ TEST(ATen, addFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf}); ir_eval(a_v, b_v, c_v, d_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), 
i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -186,7 +187,7 @@ TEST(ATen, subInt) { PaddedBuffer c_v(kTotalSize); PaddedBuffer d_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -195,7 +196,7 @@ TEST(ATen, subInt) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf}); ir_eval(a_v, b_v, c_v, d_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -222,7 +223,7 @@ TEST(ATen, subFloat) { PaddedBuffer c_v(kTotalSize); PaddedBuffer d_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -231,7 +232,7 @@ TEST(ATen, subFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf}); ir_eval(a_v, b_v, c_v, d_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -258,7 +259,7 @@ TEST(ATen, lerp) { PaddedBuffer c_v(kTotalSize); PaddedBuffer d_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -267,7 +268,7 @@ TEST(ATen, lerp) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf}); ir_eval(a_v, b_v, c_v, d_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -297,7 +298,7 @@ TEST(ATen, addcmulInt) { PaddedBuffer d_v(kTotalSize); PaddedBuffer e_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -307,7 +308,7 @@ TEST(ATen, addcmulInt) { 
SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf, e_buf}); ir_eval(a_v, b_v, c_v, d_v, e_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -338,7 +339,7 @@ TEST(ATen, addcmulFloat) { PaddedBuffer d_v(kTotalSize); PaddedBuffer e_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; c_v(i) = 3 * i + 2; @@ -348,7 +349,7 @@ TEST(ATen, addcmulFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf, d_buf, e_buf}); ir_eval(a_v, b_v, c_v, d_v, e_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), 3 * i + 2); @@ -373,7 +374,7 @@ TEST(ATen, mulInt) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; } @@ -381,7 +382,7 @@ TEST(ATen, mulInt) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), a_v(i) * b_v(i)); @@ -404,7 +405,7 @@ TEST(ATen, mulFloat) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; } @@ -412,7 +413,7 @@ TEST(ATen, mulFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), a_v(i) * b_v(i)); @@ -435,7 +436,7 @@ TEST(ATen, divInt) { PaddedBuffer b_v(kTotalSize); PaddedBuffer 
c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = 2 * i + 1; b_v(i) = i + 1; } @@ -443,7 +444,7 @@ TEST(ATen, divInt) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), 2 * i + 1); ASSERT_EQ(b_v(i), i + 1); ASSERT_EQ(c_v(i), a_v(i) / b_v(i)); @@ -466,7 +467,7 @@ TEST(ATen, divFloat) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = 2 * i + 1; b_v(i) = i + 1; } @@ -474,7 +475,7 @@ TEST(ATen, divFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), 2 * i + 1); ASSERT_EQ(b_v(i), i + 1); ASSERT_EQ(c_v(i), a_v(i) / b_v(i)); @@ -497,7 +498,7 @@ TEST(ATen, maxInt) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; } @@ -505,7 +506,7 @@ TEST(ATen, maxInt) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), std::max(a_v(i), b_v(i))); @@ -528,7 +529,7 @@ TEST(ATen, maxFloat) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; } @@ -536,7 +537,7 @@ TEST(ATen, maxFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 
* i + 1); ASSERT_EQ(c_v(i), std::fmax(a_v(i), b_v(i))); @@ -559,7 +560,7 @@ TEST(ATen, minInt) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; } @@ -567,7 +568,7 @@ TEST(ATen, minInt) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), std::min(a_v(i), b_v(i))); @@ -590,7 +591,7 @@ TEST(ATen, minFloat) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; b_v(i) = 2 * i + 1; } @@ -598,7 +599,7 @@ TEST(ATen, minFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf, c_buf}); ir_eval(a_v, b_v, c_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 2 * i + 1); ASSERT_EQ(c_v(i), std::fmin(a_v(i), b_v(i))); @@ -618,14 +619,14 @@ void __ubsan_ignore_float_divide_by_zero__ testATenreciprocal() { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i); ASSERT_EQ(b_v(i), 1.0f / i); } @@ -644,14 +645,14 @@ TEST(ATen, reluInt) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i - 64; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i - 64); 
ASSERT_EQ(b_v(i), std::max(a_v(i), 0)); } @@ -672,14 +673,14 @@ TEST(ATen, reluFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i - 64; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i - 64); ASSERT_EQ(b_v(i), std::fmax(a_v(i), 0)); } @@ -698,14 +699,14 @@ TEST(ATen, logFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i + 10; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i + 10); ASSERT_EQ(b_v(i), std::log(a_v(i))); } @@ -724,14 +725,14 @@ TEST(ATen, fastLogFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = at::randn({1}).item().to(); } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { auto test = b_v(i); auto ref = std::log(a_v(i)); if (std::isnan(ref)) { @@ -755,14 +756,14 @@ TEST(ATen, fastTanhFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = at::randn({1}).item().to(); } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { auto test = b_v(i); auto ref = std::tanh(a_v(i)); if (std::isnan(ref)) { @@ -786,14 +787,14 @@ TEST(ATen, fastSigmoidFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < 
kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = at::randn({1}).item().to(); } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { auto test = b_v(i); at::Tensor t = at::ones({1}) * a_v(i); float ref = at::sigmoid(t).item().to(); @@ -818,14 +819,14 @@ TEST(ATen, log10Float) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i + 10; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i + 10); ASSERT_EQ(b_v(i), std::log10(a_v(i))); } @@ -844,14 +845,14 @@ TEST(ATen, log2Float) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i + 10; } SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i + 10); ASSERT_EQ(b_v(i), std::log2(a_v(i))); } @@ -870,7 +871,7 @@ TEST(ATen, expFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) a_v(i) = i / 10.0f; } @@ -878,7 +879,7 @@ TEST(ATen, expFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i / 10.0f); ASSERT_EQ(b_v(i), std::exp(a_v(i))); } @@ -897,7 +898,7 @@ TEST(ATen, erfFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; 
++i) { + for (const auto i : c10::irange(kTotalSize)) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) a_v(i) = i / 10.0f; } @@ -905,7 +906,7 @@ TEST(ATen, erfFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i / 10.0f); ASSERT_EQ(b_v(i), std::erf(a_v(i))); } @@ -924,7 +925,7 @@ TEST(ATen, cosFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) a_v(i) = i / 10.0f; } @@ -932,7 +933,7 @@ TEST(ATen, cosFloat) { SimpleIREvaluator ir_eval(stmt, {a_buf, b_buf}); ir_eval(a_v, b_v); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { ASSERT_EQ(a_v(i), i / 10.0f); ASSERT_EQ(b_v(i), std::cos(a_v(i))); } diff --git a/test/cpp/tensorexpr/test_boundsinference.cpp b/test/cpp/tensorexpr/test_boundsinference.cpp index 3a1645147f0a60..7cabee0ce55e33 100644 --- a/test/cpp/tensorexpr/test_boundsinference.cpp +++ b/test/cpp/tensorexpr/test_boundsinference.cpp @@ -5,6 +5,7 @@ #include +#include #include #include #include @@ -26,7 +27,7 @@ static void verifyConstBounds( size_t ndim = ref.size(); ASSERT_EQ(access_info.start.size(), ndim); ASSERT_EQ(access_info.stop.size(), ndim); - for (size_t i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { if (ref[i].first >= 0) { // Negative values are used to skip the check ASSERT_TRUE(access_info.start[i]->isConstant()); int start_i = immediateAs(access_info.start[i]); @@ -524,14 +525,14 @@ TEST(BoundsInference, CacheReads) { // Same number of TensorAccessBoundInfos. 
ASSERT_EQ(pair.second.size(), beforeIt->second.size()); - for (size_t i = 0; i < pair.second.size(); ++i) { + for (const auto i : c10::irange(pair.second.size())) { TensorAccessBoundsInfo& after = pair.second[i]; TensorAccessBoundsInfo& before = beforeIt->second[i]; // Same number of dimensions. ASSERT_EQ(before.start.size(), after.start.size()); // Bounds are equal. - for (size_t j = 0; j < before.start.size(); ++j) { + for (const auto j : c10::irange(before.start.size())) { ASSERT_TRUE(exprEquals(before.start[j], after.start[j])); ASSERT_TRUE(exprEquals(before.stop[j], after.stop[j])); } @@ -550,7 +551,7 @@ TEST(BoundsInference, CacheReads) { ASSERT_EQ(first.start.size(), 2); // bounds for load and store are equal. - for (size_t j = 0; j < first.start.size(); ++j) { + for (const auto j : c10::irange(first.start.size())) { ASSERT_TRUE(exprEquals(first.start[j], second.start[j])); ASSERT_TRUE(exprEquals(first.stop[j], second.stop[j])); } @@ -713,10 +714,10 @@ TEST(BoundsInference, GetPotentialHazardsLoopSplit) { TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) { // Input IR: - // for (int j = 10; j < 100; j++) { + // for (const auto j : c10::irange(10, 100)) { // A[j] = 10 * j; // } - // for (int k = 10; k < 100; k++) { + // for (const auto k : c10::irange(10, 100)) { // A[k-1] = 20 * k; // } BufHandle a_buf("A", {200}, kInt); @@ -735,10 +736,10 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithPartialOverlap) { TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) { // Input IR: - // for (int j = 10; j < 100; j++) { + // for (const auto j : c10::irange(10, 100)) { // A[j] = 10 * j; // } - // for (int k = 10; k < 100; k++) { + // for (const auto k : c10::irange(10, 100)) { // A[k] = 20 * k; // } BufHandle a_buf("A", {200}, kInt); @@ -756,10 +757,10 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlap) { TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) { // Input IR: - // 
for (int j = 10; j < 100; j++) { + // for (const auto j : c10::irange(10, 100)) { // A[j] = 10 * j; // } - // for (int k = 10; k < 100; k++) { + // for (const auto k : c10::irange(10, 100)) { // B[k] = A[k]; // } BufHandle a_buf("A", {200}, kInt); @@ -779,10 +780,10 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferWithFullOverlapRAW) { TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) { // Input IR: - // for (int j = 10; j < 100; j++) { + // for (const auto j : c10::irange(10, 100)) { // A[j] = 10 * j; // } - // for (int k = 10; k < 100; k++) { + // for (const auto k : c10::irange(10, 100)) { // A[k+100] = 20 * k; // } BufHandle a_buf("A", {200}, kInt); @@ -801,13 +802,13 @@ TEST(BoundsInference, HasConflictingOverlapSameBufferNotOverlapping) { TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) { // Input IR: - // for (int i = 0; i < 20; i++) { - // for (int j = 0; j < 100; j++) { + // for (const auto i : c10::irange(20)) { + // for (const auto j : c10::irange(100)) { // A[i,j] = i * j * 500; // } // } - // for (int m = 0; m < 20; m++) { - // for (int n = 0; n < 50; n++) { + // for (const auto m : c10::irange(20)) { + // for (const auto n : c10::irange(50)) { // A[m+1,n] = m + n * 100; // } // } @@ -840,13 +841,13 @@ TEST(BoundsInference, HasConflictingOverlap2DBufferWithOverlap) { TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) { // Input IR: - // for (int i = 0; i < 20; i++) { - // for (int j = 0; j < 100; j++) { + // for (const auto i : c10::irange(20)) { + // for (const auto j : c10::irange(100)) { // A[i,j] = i * j * 500; // } // } - // for (int m = 0; m < 20; m++) { - // for (int n = 0; n < 50; n++) { + // for (const auto m : c10::irange(20)) { + // for (const auto n : c10::irange(50)) { // A[m+20,n+100] = m + n * 100; // } // } @@ -879,13 +880,13 @@ TEST(BoundsInference, HasConflictingOverlap2DBufferWithNoOverlap) { TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) { // Input IR: - // for 
(int i = 0; i < 20; i++) { - // for (int j = 0; j < 100; j++) { + // for (const auto i : c10::irange(20)) { + // for (const auto j : c10::irange(100)) { // A[i,j] = i * j * 500; // } // } - // for (int m = 0; m < 20; m++) { - // for (int n = 0; n < 50; n++) { + // for (const auto m : c10::irange(20)) { + // for (const auto n : c10::irange(50)) { // B[m,n] = m + n * 100; // } // } @@ -917,10 +918,10 @@ TEST(BoundsInference, HasConflictingOverlapDifferentBuffers) { TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) { // Input IR: - // for (int j = 0; j < 100; j++) { + // for (const auto j : c10::irange(100)) { // A[j] = 10 * j; // } - // for (int k = 0; k < 100; k++) { + // for (const auto k : c10::irange(100)) { // B[k] = 20 * A[99-k]; // } BufHandle a_buf("A", {100}, kInt); @@ -944,10 +945,10 @@ TEST(BoundsInference, HasConflictingOverlapDueToRAWDependence) { TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) { // Input IR: - // for (int k = 0; k < 100; k++) { + // for (const auto k : c10::irange(100)) { // B[k] = 20 * A[99-k]; // } - // for (int j = 0; j < 100; j++) { + // for (const auto j : c10::irange(100)) { // A[j] = 10 * j; // } BufHandle a_buf("A", {100}, kInt); @@ -971,10 +972,10 @@ TEST(BoundsInference, HasConflictingOverlapDueToWARDependence) { TEST(BoundsInference, HasConflictingOverlapWithLoads) { // Input IR: - // for (int k = 10; k < 100; k++) { + // for (const auto k : c10::irange(10, 100)) { // B[k] = 20 * A[99-k]; // } - // for (int j = 10; j < 100; j++) { + // for (const auto j : c10::irange(10, 100)) { // C[j] = 10 * A[j]; // } BufHandle a_buf("A", {100}, kInt); @@ -1003,7 +1004,7 @@ TEST(BoundsInference, HasConflictingOverlapWithLoads) { TEST(BoundsInference, IsOverlapping) { // Input IR: - // for (int i = 0; i < 100; i++) { + // for (const auto i : c10::irange(100)) { // A[i] = i * 10; // storeA1 // B[i] = A[99-i] * 20; // loadA1 // C[i] = A[i + 100] * 10; // loadA2 diff --git a/test/cpp/tensorexpr/test_cpp_codegen.cpp 
b/test/cpp/tensorexpr/test_cpp_codegen.cpp index 2603611ec5a4f7..ed7679053637c6 100644 --- a/test/cpp/tensorexpr/test_cpp_codegen.cpp +++ b/test/cpp/tensorexpr/test_cpp_codegen.cpp @@ -2,6 +2,7 @@ #include "test/cpp/tensorexpr/test_base.h" +#include #include #include #include @@ -207,7 +208,7 @@ TEST(CppPrinter, Cond) { TEST(CppPrinter, Intrinsics) { const std::unordered_set> unsupported_ops{ kRand, kSigmoid}; - for (int i = 0; i < kMaxIntrinsicsOp; i++) { + for (const auto i : c10::irange(static_cast(kMaxIntrinsicsOp))) { IntrinsicsOp op = static_cast(i); if (unsupported_ops.count(op)) { continue; diff --git a/test/cpp/tensorexpr/test_cuda.cpp b/test/cpp/tensorexpr/test_cuda.cpp index 6c1a11aa87b7b9..bc267d9158a5ac 100644 --- a/test/cpp/tensorexpr/test_cuda.cpp +++ b/test/cpp/tensorexpr/test_cuda.cpp @@ -19,6 +19,7 @@ #include #include +#include namespace torch { namespace jit { @@ -56,7 +57,7 @@ static void testCudaTestVectorAdd01_impl() { PaddedBuffer c_v(N); PaddedBuffer c_ref(N); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { a_v(i) = ctype(i); b_v(i) = ctype(i * 3 + 7); c_ref(i) = a_v(i) + b_v(i); @@ -119,7 +120,7 @@ TEST(Cuda, Sigmoid_CUDA) { PaddedBuffer c_v(N); PaddedBuffer c_ref(N); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { a_v(i) = float(i); c_ref(i) = sigmoid(sigmoid(a_v(i))); } @@ -182,7 +183,7 @@ static void testCudaTestVectorAdd02_impl(int N, int block_size) { PaddedBuffer c_v(N); PaddedBuffer c_ref(N); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { a_v(i) = i; b_v(i) = i * 3 + 7; c_ref(i) = a_v(i) + b_v(i); @@ -356,7 +357,7 @@ TEST(Cuda, TestRand01_CUDA) { float sum1 = 0; float sum2 = 0; float sum3 = 0; - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { float v = c_v.data()[i]; sum1 += v; sum2 += v * v; @@ -431,10 +432,10 @@ TEST(Cuda, OneBlockOneThreadGlobalReduce1_CUDA) { BufHandle output_buf("output", {1}, kFloat); // The test adds the following 
code for trivial reduction: - // for (int bidx = 0; bidx < 1; bidx++) { // blockIdx.x - // for (int tidx = 0; tidx < 1; tidx++) { // threadIdx.x + // for (const auto bidx : c10::irange(1)) { // blockIdx.x + // for (const auto tidx : c10::irange(1)) { // threadIdx.x // output[0] = 0.f; - // for (int i1 = 0; i1 < 1024; i1++) { + // for (const auto i1 : c10::irange(1024)) { // output[0] = output[0] + data[i1]; // } // } @@ -465,7 +466,7 @@ TEST(Cuda, OneBlockOneThreadGlobalReduce1_CUDA) { PaddedBuffer output_ref(1, "output_ref"); output_ref(0) = 0; - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { data_v(i) = i; output_ref(0) += data_v(i); } @@ -544,7 +545,7 @@ TEST(Cuda, OneBlockMultiThreadGlobalReduce1_CUDA) { PaddedBuffer b_ref(1, "b_ref"); b_ref(0) = 0; - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { a_v(i) = i; b_ref(0) += a_v(i); } @@ -634,11 +635,11 @@ TEST(Cuda, NoThreadIdxWrite_1_CUDA) { PaddedBuffer b_ref(N, "b_ref"); a_ref(0) = 0; - for (int i = 0; i < 2; i++) { + for (const auto i : c10::irange(2)) { a_ref(0) += i; } a_ref(1) = a_ref(0) + 1; - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { b_ref(i) = i; } @@ -772,8 +773,8 @@ TEST(Cuda, SharedMemReduce_1_CUDA) { PaddedBuffer b_ref(1, "b_ref"); b_ref(0) = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { int v = i + j; a_v(0, i, j) = v; b_ref(0) += v; @@ -882,8 +883,8 @@ TEST(Cuda, LocalMemReduce_1_CUDA) { PaddedBuffer b_ref(1, "b_ref"); b_ref(0) = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { int v = i + j; a_v(0, i, j) = v; b_ref(0) += v; @@ -1083,7 +1084,7 @@ TEST(Cuda, PrioritizeDependents_CUDA) { VarHandle j("j", kInt); /* - * for (int i = 0; i < 12; ++i) { + * for (const auto i : c10::irange(12)) { * c[i] = (i < 10 ? 
a[i] + b[i] : b[i]); * } */ @@ -1102,13 +1103,13 @@ TEST(Cuda, PrioritizeDependents_CUDA) { PaddedBuffer c_v(12, "c_v"); PaddedBuffer c_ref(12, "c_ref"); - for (int i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { a_v(i) = i * 100; b_v(i) = i; c_v(i) = 0; } - for (int i = 10; i < 12; ++i) { + for (const auto i : c10::irange(10, 12)) { b_v(i) = i; c_v(i) = 0; } @@ -1131,7 +1132,7 @@ TEST(Cuda, PrioritizeDependents_CUDA) { cudaMemcpy(c_v.data(), c_dev, 12 * sizeof(float), cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); - for (int i = 0; i < 12; ++i) { + for (const auto i : c10::irange(12)) { if (i < 10) { c_ref(i) = i + i * 100; } else { @@ -1193,12 +1194,12 @@ TEST(Cuda, MaskBlockDim_CUDA) { PaddedBuffer c_ref(A_SIZE); PaddedBuffer d_ref(B_SIZE); - for (int i = 0; i < A_SIZE; i++) { + for (const auto i : c10::irange(A_SIZE)) { a_v(i) = (float)i; c_ref(i) = (float)(i + 10); } - for (int i = 0; i < B_SIZE; i++) { + for (const auto i : c10::irange(B_SIZE)) { b_v(i) = (float)(B_SIZE - i); d_ref(i) = a_v(i) + b_v(i); } @@ -1285,12 +1286,12 @@ TEST(Cuda, MaskThreadDim_CUDA) { PaddedBuffer c_ref(A_SIZE); PaddedBuffer d_ref(B_SIZE); - for (int i = 0; i < A_SIZE; i++) { + for (const auto i : c10::irange(A_SIZE)) { a_v(i) = (float)i; c_ref(i) = (float)(i + 10); } - for (int i = 0; i < B_SIZE; i++) { + for (const auto i : c10::irange(B_SIZE)) { b_v(i) = (float)(B_SIZE - i); d_ref(i) = a_v(i / 2) + b_v(i); } @@ -1378,12 +1379,12 @@ TEST(Cuda, MaskMultiBlockDim_CUDA) { PaddedBuffer c_ref(A_SIZE); PaddedBuffer d_ref(B_SIZE); - for (int i = 0; i < A_SIZE; i++) { + for (const auto i : c10::irange(A_SIZE)) { a_v(i) = (float)i; c_ref(i) = (float)(i + 10); } - for (int i = 0; i < B_SIZE; i++) { + for (const auto i : c10::irange(B_SIZE)) { b_v(i) = (float)(B_SIZE - i); d_ref(i) = a_v(i) + b_v(i); } @@ -1471,12 +1472,12 @@ TEST(Cuda, MaskBlockAndThreadDim_CUDA) { PaddedBuffer c_ref(A_SIZE); PaddedBuffer d_ref(B_SIZE); - for (int i = 0; i < A_SIZE; i++) { + for (const 
auto i : c10::irange(A_SIZE)) { a_v(i) = (float)i; c_ref(i) = (float)(i + 10); } - for (int i = 0; i < B_SIZE; i++) { + for (const auto i : c10::irange(B_SIZE)) { b_v(i) = (float)(B_SIZE - i); d_ref(i) = a_v(i) + b_v(i); } @@ -1572,15 +1573,15 @@ TEST(Cuda, MaskMultiDim_CUDA) { PaddedBuffer c_ref(OUTER_SIZE, A_SIZE); PaddedBuffer d_ref(OUTER_SIZE, B_SIZE); - for (int o = 0; o < OUTER_SIZE; ++o) { - for (int i = 0; i < A_SIZE; i++) { + for (const auto o : c10::irange(OUTER_SIZE)) { + for (const auto i : c10::irange(A_SIZE)) { a_v(o, i) = (float)i; c_ref(o, i) = (float)(i * 2); } } - for (int o = 0; o < OUTER_SIZE; ++o) { - for (int i = 0; i < B_SIZE; i++) { + for (const auto o : c10::irange(OUTER_SIZE)) { + for (const auto i : c10::irange(B_SIZE)) { b_v(o, i) = (float)(B_SIZE - i); d_ref(o, i) = c_ref(o, i * 2) + b_v(o, i); } @@ -1706,15 +1707,15 @@ TEST(Cuda, MaskMultiDimSymbolic_CUDA) { PaddedBuffer c_ref(OUTER_EXTENT, A_EXTENT); PaddedBuffer d_ref(OUTER_EXTENT, B_EXTENT); - for (int o = 0; o < OUTER_EXTENT; ++o) { - for (int i = 0; i < A_EXTENT; i++) { + for (const auto o : c10::irange(OUTER_EXTENT)) { + for (const auto i : c10::irange(A_EXTENT)) { a_v(o, i) = (float)i; c_ref(o, i) = (float)(i * 2); } } - for (int o = 0; o < OUTER_EXTENT; ++o) { - for (int i = 0; i < B_EXTENT; i++) { + for (const auto o : c10::irange(OUTER_EXTENT)) { + for (const auto i : c10::irange(B_EXTENT)) { b_v(o, i) = (float)(B_EXTENT - i); d_ref(o, i) = c_ref(o, i * 2) + b_v(o, i); } @@ -1847,12 +1848,12 @@ TEST(Cuda, MaskCompoundInnerLoop_CUDA) { PaddedBuffer c_ref(OUTER_SIZE, A_SIZE); PaddedBuffer d_ref(OUTER_SIZE, B_SIZE); - for (int o = 0; o < OUTER_SIZE; ++o) { - for (int i = 0; i < A_SIZE; i++) { + for (const auto o : c10::irange(OUTER_SIZE)) { + for (const auto i : c10::irange(A_SIZE)) { a_v(o, i) = (float)i; c_ref(o, i) = (float)(i * 2); } - for (int i = 0; i < B_SIZE; i++) { + for (const auto i : c10::irange(B_SIZE)) { b_v(o, i) = (float)(B_SIZE - i); d_ref(o, i) = c_ref(o, i * 
2) + b_v(o, i); } @@ -1985,12 +1986,12 @@ TEST(Cuda, MaskInnerLoopOneBlock_CUDA) { PaddedBuffer c_ref(OUTER_SIZE, A_SIZE); PaddedBuffer d_ref(OUTER_SIZE, B_SIZE); - for (int o = 0; o < OUTER_SIZE; ++o) { - for (int i = 0; i < A_SIZE; i++) { + for (const auto o : c10::irange(OUTER_SIZE)) { + for (const auto i : c10::irange(A_SIZE)) { a_v(o, i) = (float)i; c_ref(o, i) = (float)(i * 2); } - for (int i = 0; i < B_SIZE; i++) { + for (const auto i : c10::irange(B_SIZE)) { b_v(o, i) = (float)(B_SIZE - i); d_ref(o, i) = c_ref(o, i * 2) + b_v(o, i); } @@ -2112,15 +2113,15 @@ TEST(Cuda, MaskMultiDimMultiAxis_CUDA) { PaddedBuffer c_ref(OUTER_SIZE, A_SIZE); PaddedBuffer d_ref(OUTER_SIZE, B_SIZE); - for (int o = 0; o < OUTER_SIZE; ++o) { - for (int i = 0; i < A_SIZE; i++) { + for (const auto o : c10::irange(OUTER_SIZE)) { + for (const auto i : c10::irange(A_SIZE)) { a_v(o, i) = (float)i; c_ref(o, i) = (float)(i * 2); } } - for (int o = 0; o < OUTER_SIZE; ++o) { - for (int i = 0; i < B_SIZE; i++) { + for (const auto o : c10::irange(OUTER_SIZE)) { + for (const auto i : c10::irange(B_SIZE)) { b_v(o, i) = (float)(B_SIZE - i); d_ref(o, i) = c_ref(o, i * 2) + b_v(o, i); } @@ -2243,15 +2244,15 @@ TEST(Cuda, MaskMultiDimMultiLevel_CUDA) { PaddedBuffer c_ref(OUTER_A_SIZE, A_SIZE); PaddedBuffer d_ref(OUTER_B_SIZE, B_SIZE); - for (int o = 0; o < OUTER_A_SIZE; ++o) { - for (int i = 0; i < A_SIZE; i++) { + for (const auto o : c10::irange(OUTER_A_SIZE)) { + for (const auto i : c10::irange(A_SIZE)) { a_v(o, i) = (float)i; c_ref(o, i) = (float)(i * 2); } } - for (int o = 0; o < OUTER_B_SIZE; ++o) { - for (int i = 0; i < B_SIZE; i++) { + for (const auto o : c10::irange(OUTER_B_SIZE)) { + for (const auto i : c10::irange(B_SIZE)) { b_v(o, i) = (float)(B_SIZE - i); d_ref(o, i) = c_ref(o, i * 2) + b_v(o, i); } diff --git a/test/cpp/tensorexpr/test_expr.cpp b/test/cpp/tensorexpr/test_expr.cpp index b7389786766e51..520fca4f40d1f5 100644 --- a/test/cpp/tensorexpr/test_expr.cpp +++ 
b/test/cpp/tensorexpr/test_expr.cpp @@ -2,6 +2,7 @@ #include +#include #include #include #include @@ -163,7 +164,7 @@ TEST(Expr, VectorAdd01) { /* Build the following: - for (int index = 0; index < kVectorCount; index++) { + for (const auto index : c10::irange(kVectorCount)) { store(c_buf, ramp(index * 8, 1, 8), load(a_buf, ramp(index * 8, 1, 8) + load(b_buf, ramp(index * 8, 1, 8)))) @@ -187,7 +188,7 @@ TEST(Expr, VectorAdd01) { PaddedBuffer b_v(kTotalSize); PaddedBuffer c_v(kTotalSize); PaddedBuffer c_ref(kTotalSize); - for (int i = 0; i < kTotalSize; i++) { + for (const auto i : c10::irange(kTotalSize)) { a_v(i) = i * i; b_v(i) = i * i * 4; c_ref(i) = a_v(i) + b_v(i); @@ -568,7 +569,7 @@ void testCond01() { SimpleIREvaluator(for_stmt, {a_buf})(a_v); PaddedBuffer a_ref(N); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { if (i % 2 == 0) { a_ref(i) = i * 2; } else { diff --git a/test/cpp/tensorexpr/test_kernel.cpp b/test/cpp/tensorexpr/test_kernel.cpp index abb211b3bf11cc..f05269f2086782 100644 --- a/test/cpp/tensorexpr/test_kernel.cpp +++ b/test/cpp/tensorexpr/test_kernel.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -416,13 +417,13 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { // Check sizes CHECK_EQ(o.sizes().size(), ref.sizes().size()); size_t num_el = 1; - for (size_t idx = 0; idx < ref.sizes().size(); idx++) { + for (const auto idx : c10::irange(ref.sizes().size())) { CHECK_EQ(o.sizes()[idx], ref.sizes()[idx]); num_el *= ref.sizes()[idx]; } // Check the contents - for (size_t i = 0; i < num_el; i++) { + for (const auto i : c10::irange(num_el)) { CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); } } @@ -469,13 +470,13 @@ TEST_F(Kernel, DISABLED_Shape_Inference) { // Check sizes CHECK_EQ(o.sizes().size(), ref.sizes().size()); size_t num_el = 1; - for (size_t idx = 0; idx < ref.sizes().size(); idx++) { + for (const auto idx : c10::irange(ref.sizes().size())) { CHECK_EQ(o.sizes()[idx], 
ref.sizes()[idx]); num_el *= ref.sizes()[idx]; } // Check the contents - for (size_t i = 0; i < num_el; i++) { + for (const auto i : c10::irange(num_el)) { CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); } } @@ -569,13 +570,13 @@ TEST_F(Kernel, CatInputTypesPromotion) { CHECK_EQ(o.sizes().size(), ref.sizes().size()); CHECK_EQ(o.dtype(), ref.dtype()); size_t num_el = 1; - for (size_t idx = 0; idx < ref.sizes().size(); idx++) { + for (const auto idx : c10::irange(ref.sizes().size())) { CHECK_EQ(o.sizes()[idx], ref.sizes()[idx]); num_el *= ref.sizes()[idx]; } // Check the contents - for (size_t i = 0; i < num_el; i++) { + for (const auto i : c10::irange(num_el)) { CHECK_EQ(((double*)o.data_ptr())[i], ((double*)ref.data_ptr())[i]); } } @@ -658,13 +659,13 @@ TEST_F(Kernel, CatWoConditionals) { CHECK_EQ(o.sizes().size(), ref.sizes().size()); CHECK_EQ(o.dtype(), ref.dtype()); size_t num_el = 1; - for (size_t idx = 0; idx < ref.sizes().size(); idx++) { + for (const auto idx : c10::irange(ref.sizes().size())) { CHECK_EQ(o.sizes()[idx], ref.sizes()[idx]); num_el *= ref.sizes()[idx]; } // Check the contents - for (size_t i = 0; i < num_el; i++) { + for (const auto i : c10::irange(num_el)) { CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); } getCatWoConditionals() = false; @@ -723,13 +724,13 @@ TEST_F(Kernel, OptimizeConditionals) { CHECK_EQ(o.sizes().size(), ref.sizes().size()); CHECK_EQ(o.dtype(), ref.dtype()); size_t num_el = 1; - for (size_t idx = 0; idx < ref.sizes().size(); idx++) { + for (const auto idx : c10::irange(ref.sizes().size())) { CHECK_EQ(o.sizes()[idx], ref.sizes()[idx]); num_el *= ref.sizes()[idx]; } // Check the contents - for (size_t i = 0; i < num_el; i++) { + for (const auto i : c10::irange(num_el)) { CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]); } getOptConditionals() = old_opt_conditionals; @@ -904,7 +905,7 @@ TEST_F(Kernel, SumMultipleAxes) { // Only iterate over positive values of axes to keep 
the running time // reasonable, since the number of pairs is quadratic. - for (int dim1 = 0; dim1 < a.dim(); ++dim1) { + for (const auto dim1 : c10::irange(a.dim())) { for (int dim2 = dim1 + 1; dim2 < a.dim(); ++dim2) { for (bool keepdim : {false, true}) { TemplateEnv env; @@ -977,7 +978,7 @@ TEST_F(Kernel, Softmax2D) { # CHECK-NEXT: aten_softmax)IR"; for (auto log_softmax : {false, true}) { - for (int softmax_dim = 0; softmax_dim < a.dim(); ++softmax_dim) { + for (const auto softmax_dim : c10::irange(a.dim())) { auto softmax_dim_size = a.sizes()[softmax_dim]; auto other_dim = (softmax_dim + 1) % a.dim(); auto ref = @@ -1046,10 +1047,10 @@ TEST_F(Kernel, Softmax3D) { # CHECK-NEXT: aten_softmax)IR"; for (auto log_softmax : {false, true}) { - for (int softmax_dim = 0; softmax_dim < a.dim(); ++softmax_dim) { + for (const auto softmax_dim : c10::irange(a.dim())) { auto softmax_dim_size = a.sizes()[softmax_dim]; std::vector other_dims; - for (int i = 0; i < a.dim(); ++i) { + for (const auto i : c10::irange(a.dim())) { if (i != softmax_dim) { other_dims.push_back(i); } @@ -1127,10 +1128,10 @@ TEST_F(Kernel, Softmax4D) { # CHECK-NEXT: aten_softmax)IR"; for (auto log_softmax : {false, true}) { - for (int softmax_dim = 0; softmax_dim < a.dim(); ++softmax_dim) { + for (const auto softmax_dim : c10::irange(a.dim())) { auto softmax_dim_size = a.sizes()[softmax_dim]; std::vector other_dims; - for (int i = 0; i < a.dim(); ++i) { + for (const auto i : c10::irange(a.dim())) { if (i != softmax_dim) { other_dims.push_back(i); } diff --git a/test/cpp/tensorexpr/test_llvm.cpp b/test/cpp/tensorexpr/test_llvm.cpp index 61748b02a054eb..489c9c85e06337 100644 --- a/test/cpp/tensorexpr/test_llvm.cpp +++ b/test/cpp/tensorexpr/test_llvm.cpp @@ -3,6 +3,7 @@ #include +#include #include #include #include @@ -242,14 +243,14 @@ TEST(LLVM, fastLogFloat) { PaddedBuffer a_v(kTotalSize); PaddedBuffer b_v(kTotalSize); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : 
c10::irange(kTotalSize)) { a_v(i) = at::randn({1}).item().to(); } LLVMCodeGen ir_eval(stmt, {a_buf, b_buf}); ir_eval.call({a_v, b_v}); - for (int i = 0; i < kTotalSize; ++i) { + for (const auto i : c10::irange(kTotalSize)) { auto test = b_v(i); auto ref = std::log(a_v(i)); if (std::isnan(ref)) { @@ -516,7 +517,7 @@ TEST(LLVM, VecLoadStoreTest) { LLVMCodeGen cg(store, {a, b}); \ std::vector args({a_buffer.data(), b_buffer.data()}); \ ASSERT_EQ(cg.value(args), 0); \ - for (int i = 0; i < Lanes; i++) { \ + for (const auto i : c10::irange(Lanes)) { \ ASSERT_FLOAT_EQ(a_buffer[i], val); \ } \ } // namespace jit @@ -554,7 +555,7 @@ FLOAT_INTRINSICS_TEST(lgamma, 8) LLVMCodeGen cg(store, {a, b}); \ std::vector args({a_buffer.data(), b_buffer.data()}); \ ASSERT_EQ(cg.value(args), 0); \ - for (int i = 0; i < Lanes; i++) { \ + for (const auto i : c10::irange(Lanes)) { \ ASSERT_FLOAT_EQ(a_buffer[i], val); \ } \ } // namespace jit @@ -619,7 +620,7 @@ TEST(LLVM, VectorizeBitCast) { std::vector a_vec(128); std::vector c_vec(128); - for (auto i = 0; i < 128; ++i) { + for (const auto i : c10::irange(128)) { a_vec[i] = raw_bitcast(1337.f); } std::vector args({a_vec.data(), c_vec.data()}); @@ -985,7 +986,7 @@ TEST(LLVM, CompareSelectIntEQ) { ASSERT_EQ(c_buffer.size(), N); assertAllEqual(a_buffer, 1); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { ASSERT_EQ(c_ref[i], c_buffer[i]); } } @@ -1058,7 +1059,7 @@ TEST(LLVM, CompareSelectByteGT) { ASSERT_EQ(c_buffer.size(), N); assertAllEqual(b_buffer, uint8_t(0)); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { ASSERT_EQ(c_ref[i], c_buffer[i]); } } @@ -1093,7 +1094,7 @@ TEST(LLVM, CompareSelectByteGE) { ASSERT_EQ(c_buffer.size(), N); assertAllEqual(b_buffer, uint8_t(0)); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { ASSERT_EQ(c_ref[i], c_buffer[i]); } } @@ -1133,7 +1134,7 @@ TEST(LLVM, CompareSelectByteLT) { ASSERT_EQ(c_buffer.size(), N); assertAllEqual(b_buffer, 
uint8_t(128)); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { ASSERT_EQ(c_ref[i], c_buffer[i]); } } @@ -1168,7 +1169,7 @@ TEST(LLVM, CompareSelectByteLE) { ASSERT_EQ(c_buffer.size(), N); assertAllEqual(b_buffer, uint8_t(128)); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { ASSERT_EQ(c_ref[i], c_buffer[i]); } } @@ -1198,7 +1199,7 @@ TEST(LLVM, SimpleMath01) { int value = cg.value(args); ASSERT_EQ(value, 0); PaddedBuffer f_ref(N, "f_ref"); - for (int i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { f_ref(i) = i * i + 1; } ExpectAllNear(f_v, f_ref, 1e-5); @@ -1251,8 +1252,8 @@ TEST(LLVM, BroadcastAdd) { std::vector args({av.data(), bv.data(), cv.data()}); ASSERT_EQ(cg.value(args), 0); - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { ASSERT_EQ(cv[i * N + j], av[i * N + j] + bv[j]); } } @@ -1421,8 +1422,8 @@ TEST(LLVM, SimpleReduction) { PaddedBuffer b_ref(1, "b_ref"); b_ref(0) = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { int v = i + j; a_v(0, i, j) = v; b_ref(0) += v; @@ -1469,8 +1470,8 @@ TEST(LLVM, RFactorReduction) { PaddedBuffer b_ref(1, "b_ref"); b_ref(0) = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { int v = i + j; a_v(0, i, j) = v; b_ref(0) += v; @@ -1516,8 +1517,8 @@ TEST(LLVM, RFactorVectorizedReduction) { PaddedBuffer b_ref(1, "b_ref"); b_ref(0) = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { int v = i + j; a_v(0, i, j) = v; b_ref(0) += v; @@ -1558,8 +1559,8 @@ static void testSimpleParallel() { int value = cg.value(args); ASSERT_EQ(value, 0); PaddedBuffer f_ref(M, N, "f_ref"); - for (int m = 0; m < M; 
m++) { - for (int n = 0; n < N; n++) { + for (const auto m : c10::irange(M)) { + for (const auto n : c10::irange(N)) { f_ref(m, n) = m + n; } } @@ -1584,7 +1585,7 @@ TEST(LLVM, CompositeParallel) { int test_count = 1 << loop_count; // Compute a composite operation, and try all loop-axis combination to be // parallel or sequential. - for (int test_cfg = 0; test_cfg < test_count; test_cfg++) { + for (const auto test_cfg : c10::irange(test_count)) { int M = 5; int N = 7; Tensor t1 = @@ -1624,7 +1625,7 @@ TEST(LLVM, CompositeParallel) { loop_list.push_back(loops[1]); } ASSERT_EQ(loop_list.size(), loop_count); - for (int i = 0; i < loop_count; i++) { + for (const auto i : c10::irange(loop_count)) { if (test_cfg & (1 << i)) { loop_list[i]->set_parallel(); } @@ -1638,8 +1639,8 @@ TEST(LLVM, CompositeParallel) { int value = cg.value(args); ASSERT_EQ(value, 0); PaddedBuffer t4_ref(M, N, "t4_ref"); - for (int m = 0; m < M; m++) { - for (int n = 0; n < N; n++) { + for (const auto m : c10::irange(M)) { + for (const auto n : c10::irange(N)) { t4_ref(m, n) = (m + 1) * (n + 2) + m + n; } } @@ -1715,10 +1716,10 @@ TEST(LLVM, VectorizedGEMM) { PaddedBuffer c_v(M, N, "c_v"); PaddedBuffer c_ref(M, N, "c_ref"); - for (int m = 0; m < M; m++) { - for (int n = 0; n < N; n++) { + for (const auto m : c10::irange(M)) { + for (const auto n : c10::irange(N)) { c_ref(m, n) = 0.f; - for (int k = 0; k < K; k++) { + for (const auto k : c10::irange(K)) { c_ref(m, n) += a_v(m, k) * b_v(k, n); } } @@ -1754,8 +1755,8 @@ TEST(LLVM, CallRaw) { LLVMCodeGen cg(s, {a, b, BufHandle(c.buf()), N}); cg.call_raw(args); - for (int i = 0; i < M; i++) { - for (int j = 0; j < N_value; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N_value)) { ASSERT_EQ(cv[i * N_value + j], av[i * N_value + j] + bv[j]); } } @@ -1763,8 +1764,8 @@ TEST(LLVM, CallRaw) { SimpleIREvaluator eval(s, {a, b, BufHandle(c.buf()), N}); eval.call_raw(args); - for (int i = 0; i < M; i++) { - for (int j = 0; j < 
N_value; j++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N_value)) { ASSERT_EQ(cv[i * N_value + j], av[i * N_value + j] + bv[j]); } } diff --git a/test/cpp/tensorexpr/test_reductions.cpp b/test/cpp/tensorexpr/test_reductions.cpp index d86f0463f11207..ede514d13a2a16 100644 --- a/test/cpp/tensorexpr/test_reductions.cpp +++ b/test/cpp/tensorexpr/test_reductions.cpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -28,7 +29,7 @@ TEST(Reductions, ReduceSum0D_1) { BufHandle b("b", {M}, kFloat); std::vector in(M); - for (int j = 0; j < M; ++j) { + for (const auto j : c10::irange(M)) { in[j] = j; } @@ -43,7 +44,7 @@ TEST(Reductions, ReduceSum0D_1) { SimpleIREvaluator cg(s, {b, c}); cg.call({in, out}); - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { ASSERT_EQ(out[i], in[i]); } } @@ -73,7 +74,7 @@ TEST(Reductions, ReduceSum0D_2) { TEST(Reductions, ReduceSum1D) { BufHandle b("b", {10}, kFloat); std::vector in(10); - for (int j = 0; j < 10; ++j) { + for (const auto j : c10::irange(10)) { in[j] = j; } @@ -100,8 +101,8 @@ TEST(Reductions, ReduceSum2D) { BufHandle b("b", {m, n}, kFloat); std::vector in(M * N); - for (int i = 0; i < M; ++i) { - for (int j = 0; j < N; ++j) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { in[i * N + j] = j; } } @@ -119,12 +120,12 @@ TEST(Reductions, ReduceSum2D) { cg.call({in, out, 5, 7}); float expected = 0; - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) expected += i; } - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { ASSERT_EQ(out[i], expected); } } @@ -151,14 +152,14 @@ TEST(Reductions, ReduceSum3D) { std::vector eData(2, 1.0f); for (int i = 0; i < 2 * 3; ++i) { - for (int j = 0; j < M; ++j) { + for (const auto j : c10::irange(M)) { bData[i * M + j] = j; } } cg.call({bData, cData, M}); float expected = 0; 
- for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) expected += i; } @@ -179,7 +180,7 @@ TEST(Reductions, ReduceSum3D) { // We're combining an additional dimension of 3, so the sum is 3x. expected = expected * 3; - for (int i = 0; i < 2; ++i) { + for (const auto i : c10::irange(2)) { ASSERT_EQ(dData[i], expected); } @@ -194,7 +195,7 @@ TEST(Reductions, ReduceSum3D) { SimpleIREvaluator cg3(s3, {c, e}); cg3.call({cData, eData}); - for (int i = 0; i < 2; ++i) { + for (const auto i : c10::irange(2)) { ASSERT_EQ(eData[i], expected); } } @@ -226,7 +227,7 @@ TEST(Reductions, ReduceSum10D) { // NOLINTNEXTLINE(bugprone-integer-division) float expected = InputSize / OutputSize; - for (int i = 0; i < OutputSize; ++i) { + for (const auto i : c10::irange(OutputSize)) { ASSERT_EQ(out[i], expected); } } @@ -238,8 +239,8 @@ TEST(Reductions, ReduceProduct) { BufHandle b("b", {M, N}, kFloat); std::vector in(M * N); - for (int i = 0; i < M; ++i) { - for (int j = 0; j < N; ++j) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { in[i * N + j] = 2 + j; } } @@ -260,12 +261,12 @@ TEST(Reductions, ReduceProduct) { cg.call({in, out}); float expected = 1; - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) expected *= 2 + i; } - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { ASSERT_EQ(out[i], expected); } } @@ -276,7 +277,7 @@ TEST(Reductions, ReduceMax) { std::vector in(10); std::vector out(1, -1.f); - for (int j = 0; j < 10; ++j) { + for (const auto j : c10::irange(10)) { in[j] = j; } @@ -316,7 +317,7 @@ TEST(Reductions, ReduceMinCustomInitializer) { std::vector in(10); std::vector out(1, -1.f); - for (int j = 0; j < 10; ++j) { + for (const auto j : c10::irange(10)) { in[j] = 10 + j; } @@ -374,7 +375,7 @@ 
TEST(Reductions, ReduceAnyAll) { std::vector out(4, 0); // input has 0-39 in 4 rows. - for (int i = 0; i < 40; ++i) { + for (const auto i : c10::irange(40)) { in[i] = i; } cg.call({in, out, 1}); @@ -438,8 +439,8 @@ TEST(Reductions, ReduceMatmul2D) { std::vector tB_(6); std::vector out(9, -1.f); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 2; ++j) { + for (const auto i : c10::irange(3)) { + for (const auto j : c10::irange(2)) { tA_[i * 2 + j] = i * 2 + j; tB_[j * 3 + i] = i * 2 + j; } @@ -465,7 +466,7 @@ TEST(Reductions, ReduceMatmul2D) { std::vector expected( {1.f, 3.f, 5.f, 3.f, 13.f, 23.f, 5.f, 23.f, 41.f}); - for (int i = 0; i < 9; ++i) { + for (const auto i : c10::irange(9)) { ASSERT_EQ(out[i], expected[i]); } } @@ -473,7 +474,7 @@ TEST(Reductions, ReduceMatmul2D) { TEST(Reductions, ReduceRfactorLike) { BufHandle in("in", {10, 10}, kFloat); std::vector in_(100); - for (int i = 0; i < 100; ++i) { + for (const auto i : c10::irange(100)) { in_[i] = i; } std::vector in_rf_(10, -2.f); @@ -522,14 +523,14 @@ TEST(Reductions, ReduceAsProducer) { for (int i = 0; i < 2 * 3; ++i) { aData[i] = 6 - i; - for (int j = 0; j < M; ++j) { + for (const auto j : c10::irange(M)) { bData[i * M + j] = j; } } cg.call({aData, bData, dData, M}); float expected = 0; - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) expected += i; } @@ -564,7 +565,7 @@ TEST(Reductions, ReduceAsConsumer) { std::vector dData(2, 6.0f); for (int i = 0; i < 2 * 3; ++i) { - for (int j = 0; j < M; ++j) { + for (const auto j : c10::irange(M)) { bData[i * M + j] = j + 1; aData[i * M + j] = 6 - i; } @@ -573,16 +574,16 @@ TEST(Reductions, ReduceAsConsumer) { cg.call({aData, bData, dData, M}); // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) float expected[2] = {0, 0}; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < M; ++k) { + for 
(const auto i : c10::irange(2)) { + for (const auto j : c10::irange(3)) { + for (const auto k : c10::irange(M)) { // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) expected[i] += (k + 1) * (6 - (i * 3 + j)); } } } - for (int i = 0; i < 2; ++i) { + for (const auto i : c10::irange(2)) { ASSERT_EQ(dData[i], expected[i]); } } @@ -591,8 +592,8 @@ TEST(Reductions, SplitReduceAxis) { BufHandle in("in", {16, 8}, kFloat); std::vector in_(16 * 8); - for (int i = 0; i < 16; ++i) { - for (int j = 0; j < 8; ++j) { + for (const auto i : c10::irange(16)) { + for (const auto j : c10::irange(8)) { in_[i * 8 + j] = i; } } @@ -611,7 +612,7 @@ TEST(Reductions, SplitReduceAxis) { SimpleIREvaluator cg(s, {in, tensor}); cg.call({in_, out}); - for (int i = 0; i < 16; ++i) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(out[i], i * 8); } } @@ -620,8 +621,8 @@ TEST(Reductions, SplitNonReduceAxis) { BufHandle in("in", {16, 8}, kFloat); std::vector in_(16 * 8); - for (int i = 0; i < 16; ++i) { - for (int j = 0; j < 8; ++j) { + for (const auto i : c10::irange(16)) { + for (const auto j : c10::irange(8)) { in_[i * 8 + j] = i; } } @@ -640,7 +641,7 @@ TEST(Reductions, SplitNonReduceAxis) { SimpleIREvaluator cg(s, {in, tensor}); cg.call({in_, out}); - for (int i = 0; i < 16; ++i) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(out[i], i * 8); } } @@ -689,7 +690,7 @@ TEST(Reductions, ReorderedReductionInitializer) { SimpleIREvaluator cg2(s, {in, tensor}); cg2.call({in_, out2}); - for (int i = 0; i < 16; ++i) { + for (const auto i : c10::irange(16)) { ASSERT_EQ(out1[i], out2[i]); } } @@ -807,7 +808,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { LoopNest orig_loop({c}); // Try rfactoring N outer loops - for (int rfac_number = 1; rfac_number < 5; rfac_number++) { + for (const auto rfac_number : c10::irange(1, 5)) { LoopNest refloop(orig_loop); LoopNest loop(orig_loop); refloop.prepareForCodegen(); @@ -817,7 
+818,7 @@ TEST(Reductions, ReduceRepeatedInternalRfactor) { BufPtr tmp_buf = c.buf(); - for (int idx = 0; idx < rfac_number; idx++) { + for (const auto idx : c10::irange(rfac_number)) { auto reduce = loop.getAllWritesToBuf(tmp_buf)[1]; ASSERT_TRUE(loop.rfactor( reduce, loop.getLoopStmtsFor(tmp_buf).at(idx), &tmp_buf)); @@ -846,7 +847,7 @@ TEST(Reductions, ReduceSplitTail) { in[j] = j; } - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); @@ -876,7 +877,7 @@ TEST(Reductions, ReduceSplitNoTail) { in[j] = j; } - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); @@ -908,7 +909,7 @@ TEST(Reductions, ReduceOverSplitTail) { in[j] = j; } - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); @@ -939,7 +940,7 @@ TEST(Reductions, ReduceSplitMask) { in[j] = j; } - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); @@ -969,7 +970,7 @@ TEST(Reductions, ReduceSplitNoMask) { in[j] = j; } - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); @@ -1000,7 +1001,7 @@ TEST(Reductions, ReduceOverSplitMask) { in[j] = j; } - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { std::vector out(M, -1.f); Tensor c = Reduce("sum", {{M, "m"}}, Sum(), b, {{N, "n"}, {K, "k"}}); @@ -1029,7 +1030,7 @@ TEST(Reductions, ReduceSplitRfactor) { BufHandle b("b", {M, N, K}, kFloat); std::vector in(M * N * K); - for (int m = 0; m < M; ++m) { + for (const auto m : c10::irange(M)) { for (int j = 0; j < N * K; ++j) { in[m * N * 
K + j] = j; } @@ -1056,7 +1057,7 @@ TEST(Reductions, ReduceSplitRfactor) { SimpleIREvaluator cg(s, {b, c}); cg.call({in, out}); - for (int i = 0; i < M; ++i) { + for (const auto i : c10::irange(M)) { ASSERT_EQ(out[0], 4950); } } @@ -1134,12 +1135,12 @@ TEST(Reductions, ReduceInlineReduction) { PaddedBuffer a_v(M); PaddedBuffer b_v(M, N, K); - for (int i = 0; i < M; i++) { + for (const auto i : c10::irange(M)) { a_v(i) = i * i; } - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < K; k++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { + for (const auto k : c10::irange(K)) { b_v(i, j, k) = j * j * k; } } @@ -1169,9 +1170,9 @@ TEST(Reductions, ReduceInlineConsumer) { PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < K; k++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { + for (const auto k : c10::irange(K)) { a_v(i, j, k) = i * i + k; b_v(i, j, k) = j * j + k; } @@ -1226,9 +1227,9 @@ TEST(Reductions, ReduceInlineReducerInternal) { PaddedBuffer a_v(M, N, K); PaddedBuffer b_v(M, N, K); - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < K; k++) { + for (const auto i : c10::irange(M)) { + for (const auto j : c10::irange(N)) { + for (const auto k : c10::irange(K)) { a_v(i, j, k) = i * i + k; b_v(i, j, k) = j * j + k; } @@ -1319,9 +1320,9 @@ TEST(Reductions, ReductionCacheAccessesOperatorAxis) { PaddedBuffer e_before(L, "e_before"); PaddedBuffer e_after(L, "e_after"); - for (int l = 0; l < L; l++) { - for (int m = 0; m < M; m++) { - for (int n = 0; n < N; n++) { + for (const auto l : c10::irange(L)) { + for (const auto m : c10::irange(M)) { + for (const auto n : c10::irange(N)) { a_v(l, m, n) = at::randn({1}).item().to(); b_v(l, m, n) = at::randn({1}).item().to(); } @@ -1392,9 +1393,9 @@ TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { 
PaddedBuffer e_before(L, "e_before"); PaddedBuffer e_after(L, "e_after"); - for (int l = 0; l < L; l++) { - for (int m = 0; m < M; m++) { - for (int n = 0; n < N; n++) { + for (const auto l : c10::irange(L)) { + for (const auto m : c10::irange(M)) { + for (const auto n : c10::irange(N)) { a_v(l, m, n) = at::randn({1}).item().to(); b_v(l, m, n) = at::randn({1}).item().to(); } @@ -1465,9 +1466,9 @@ TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { PaddedBuffer e_before(L, "e_before"); PaddedBuffer e_after(L, "e_after"); - for (int l = 0; l < L; l++) { - for (int m = 0; m < M; m++) { - for (int n = 0; n < N; n++) { + for (const auto l : c10::irange(L)) { + for (const auto m : c10::irange(M)) { + for (const auto n : c10::irange(N)) { a_v(l, m, n) = at::randn({1}).item().to(); b_v(l, m, n) = at::randn({1}).item().to(); } @@ -1782,8 +1783,8 @@ TEST(Reductions, ReductionRfactorCacheTempInner) { TEST(Reductions, ReductionVectorize) { std::vector in_(8 * 8); - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { + for (const auto i : c10::irange(8)) { + for (const auto j : c10::irange(8)) { in_[i * 8 + j] = i; } } @@ -1820,7 +1821,7 @@ TEST(Reductions, ReductionVectorize) { s = IRSimplifier::simplify(l.root_stmt()); SimpleIREvaluator cg_after(s, {in, tensor}); cg_after.call({in_, out_after}); - for (int i = 0; i < 8; ++i) { + for (const auto i : c10::irange(8)) { ASSERT_EQ(out_before[i], out_after[i]); } } @@ -1836,8 +1837,8 @@ TEST(Reductions, ReductionVectorizeInner) { TEST(Reductions, ReductionVectorizeRfactor) { std::vector in_(8 * 8); - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { + for (const auto i : c10::irange(8)) { + for (const auto j : c10::irange(8)) { in_[i * 8 + j] = i; } } diff --git a/test/cpp/tensorexpr/test_simplify.cpp b/test/cpp/tensorexpr/test_simplify.cpp index f71f907c2732af..60c45fe78b34b7 100644 --- a/test/cpp/tensorexpr/test_simplify.cpp +++ b/test/cpp/tensorexpr/test_simplify.cpp @@ -1,6 +1,7 @@ #include #include 
+#include #include #include #include @@ -1132,7 +1133,7 @@ TEST(Simplify, SimplifyDivWithLoopContext0) { TEST(Simplify, SimplifyDivWithLoopContext1) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { + // for (const auto i : c10::irange(6)) { // A[i] = (i + 24) / 6; //} VarHandle i("i", kInt); @@ -1153,7 +1154,7 @@ TEST(Simplify, SimplifyDivWithLoopContext1) { TEST(Simplify, SimplifyDivWithLoopContext2) { // Stmt to simplify: - // for (int i = 0; i < 5; i++) { + // for (const auto i : c10::irange(5)) { // A[i] = (i + 25) / 6; //} VarHandle i("i", kInt); @@ -1174,7 +1175,7 @@ TEST(Simplify, SimplifyDivWithLoopContext2) { TEST(Simplify, SimplifyDivWithLoopContext3) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { + // for (const auto i : c10::irange(6)) { // A[i] = (i + 24) / (-6); //} VarHandle i("i", kInt); @@ -1195,7 +1196,7 @@ TEST(Simplify, SimplifyDivWithLoopContext3) { TEST(Simplify, SimplifyDivWithLoopContext4) { // Stmt to simplify: - // for (int i = 0; i < 5; i++) { + // for (const auto i : c10::irange(5)) { // A[i] = (i - 5) / 6; //} VarHandle i("i", kInt); @@ -1216,8 +1217,8 @@ TEST(Simplify, SimplifyDivWithLoopContext4) { TEST(Simplify, SimplifyDivWithLoopContext5) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { - // for (int j = 0; j < 10; j++) { + // for (const auto i : c10::irange(6)) { + // for (const auto j : c10::irange(10)) { // A[i, j] = (i + 6*j) / 6; // } //} @@ -1242,7 +1243,7 @@ TEST(Simplify, SimplifyDivWithLoopContext5) { TEST(Simplify, SimplifyDivWithLoopContext6) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { + // for (const auto i : c10::irange(6)) { // for (int j = -1; j < 9; j++) { // A[i, j+1] = (i + 6*j) / 6; // } @@ -1269,8 +1270,8 @@ TEST(Simplify, SimplifyDivWithLoopContext6) { TEST(Simplify, SimplifyDivWithLoopContext7) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { - // for (int j = 0; j < 10; j++) { + // for (const auto i : c10::irange(6)) { + // for (const auto j : 
c10::irange(10)) { // A[i, j] = (i + 6*j) / (-6); // } //} @@ -1296,7 +1297,7 @@ TEST(Simplify, SimplifyDivWithLoopContext7) { TEST(Simplify, SimplifyModWithLoopContext0) { // Stmt to simplify: - // for (int i = 0; i < 100; i++) { + // for (const auto i : c10::irange(100)) { // A[i] = i % 100; //} VarHandle i("i", kInt); @@ -1317,7 +1318,7 @@ TEST(Simplify, SimplifyModWithLoopContext0) { TEST(Simplify, SimplifyModWithLoopContext1) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { + // for (const auto i : c10::irange(6)) { // A[i] = (i + 24) % 6; //} VarHandle i("i", kInt); @@ -1338,7 +1339,7 @@ TEST(Simplify, SimplifyModWithLoopContext1) { TEST(Simplify, SimplifyModWithLoopContext2) { // Stmt to simplify: - // for (int i = 0; i < 5; i++) { + // for (const auto i : c10::irange(5)) { // A[i] = (i + 25) % 6; //} VarHandle i("i", kInt); @@ -1359,7 +1360,7 @@ TEST(Simplify, SimplifyModWithLoopContext2) { TEST(Simplify, SimplifyModWithLoopContext3) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { + // for (const auto i : c10::irange(6)) { // A[i] = (i + 24) % (-6); //} VarHandle i("i", kInt); @@ -1380,7 +1381,7 @@ TEST(Simplify, SimplifyModWithLoopContext3) { TEST(Simplify, SimplifyModWithLoopContext4) { // Stmt to simplify: - // for (int i = 0; i < 5; i++) { + // for (const auto i : c10::irange(5)) { // A[i] = (i - 5) % 6; //} VarHandle i("i", kInt); @@ -1401,8 +1402,8 @@ TEST(Simplify, SimplifyModWithLoopContext4) { TEST(Simplify, SimplifyModWithLoopContext5) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { - // for (int j = 0; j < 10; j++) { + // for (const auto i : c10::irange(6)) { + // for (const auto j : c10::irange(10)) { // A[i, j] = (i + 6*j) % 6; // } //} @@ -1427,7 +1428,7 @@ TEST(Simplify, SimplifyModWithLoopContext5) { TEST(Simplify, SimplifyModWithLoopContext6) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { + // for (const auto i : c10::irange(6)) { // for (int j = -1; j < 9; j++) { // A[i, j+1] = (i + 6*j) % 6; // } @@ 
-1454,8 +1455,8 @@ TEST(Simplify, SimplifyModWithLoopContext6) { TEST(Simplify, SimplifyModWithLoopContext7) { // Stmt to simplify: - // for (int i = 0; i < 6; i++) { - // for (int j = 0; j < 10; j++) { + // for (const auto i : c10::irange(6)) { + // for (const auto j : c10::irange(10)) { // A[i, j] = (i + 6*j) % (-6); // } //} @@ -3884,7 +3885,7 @@ TEST(Simplify, SimplifyEliminateEmptyFor) { { // Flatten many layers around an empty block to an empty block. StmtPtr last = alloc(std::vector({})); - for (int i = 0; i < 11; ++i) { + for (const auto i : c10::irange(11)) { VarHandle loopVar("loopVar", kInt); last = For::make(loopVar, 0, 10, last); } @@ -3968,7 +3969,7 @@ TEST(Simplify, SimplifyFlattenBlock) { { // Flatten many layers around an empty block to an empty block. StmtPtr last = alloc(std::vector({})); - for (int i = 0; i < 11; ++i) { + for (const auto i : c10::irange(11)) { last = alloc(std::vector({last})); } @@ -4817,18 +4818,18 @@ TEST(Simplify, SimplifyBroadcastTermExpander) { SimpleIREvaluator eval(store, {buf}); std::vector output(num_lanes); eval(output); - for (int i = 0; i < num_lanes; ++i) { + for (const auto i : c10::irange(num_lanes)) { ASSERT_EQ(output[i], 2); } } TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { // Before: - // for (int n = 1; n < N; n++) { + // for (const auto n : c10::irange(1, N)) { // b[n] = n < 1 ? 0.f : 1.f; // } // After: - // for (int n = 1; n < N; n++) { + // for (const auto n : c10::irange(1, N)) { // b[n] = 1.f; // } constexpr int N = 8; @@ -4848,11 +4849,11 @@ TEST(Simplify, DISABLED_CompareSelectCondAlwaysInLoopBounds) { TEST(Simplify, DISABLED_IfThenCondAlwaysInLoopBounds) { // Before: - // for (int n = 1; n < N; n++) { + // for (const auto n : c10::irange(1, N)) { // b[n] = IfThenElse(n < 1 ? 
1 : 0, 0.f, 1.f); // } // After: - // for (int n = 1; n < N; n++) { + // for (const auto n : c10::irange(1, N)) { // b[n] = 1.f; // } constexpr int N = 8; @@ -4875,13 +4876,13 @@ TEST(Simplify, DISABLED_MultiClauseCondAlwaysInLoopBounds) { // conditional that is provably satisfied (or unsatisfied) by the entire loop // range. // Before: - // for (int i = 1; i < 7; i++) { - // for (int j = 1; j < 7; j++) { + // for (const auto i : c10::irange(1, 7)) { + // for (const auto j : c10::irange(1, 7)) { // b[i, j] = IfThenElse( // j>=7 ? 1 : (i>=7 ? 1 : (j<1 ? 1 : (i<1 ? 1 : 0))), 0.f, 1.f); // After: - // for (int i = 1; i < 7; i++) { - // for (int j = 1; j < 7; j++) { + // for (const auto i : c10::irange(1, 7)) { + // for (const auto j : c10::irange(1, 7)) { // b[i, j] = 1.f; constexpr int N = 8; BufHandle b("b", {N, N}, kFloat); @@ -4910,13 +4911,13 @@ TEST(Simplify, DISABLED_SimplifyLoopBounds) { // could be solved by peeling, and applying the range-based conditional // simplification in the previous tests. // Before: - // for (int i = 0; i < 3; i++) { - // for (int j = 0; j < 3; j++) { + // for (const auto i : c10::irange(3)) { + // for (const auto j : c10::irange(3)) { // b[i, j] = (b[i, j]) + (IfThenElse( // j>=7 ? 1 : (i>=7 ? 1 : (j<1 ? 1 : (i<1 ? 
1 : 0))), 0.f, a[i, j])); // After: - // for (int i = 1; i < 3; i++) { - // for (int j = 1; j < 3; j++) { + // for (const auto i : c10::irange(1, 3)) { + // for (const auto j : c10::irange(1, 3)) { // b[i, j] = (b[i, j]) + 1.f; constexpr int N = 8; constexpr int K = 3; @@ -4937,8 +4938,8 @@ TEST(Simplify, DISABLED_SimplifyLoopBounds) { oss << *s; torch::jit::testing::FileCheck().run( R"IR( -# CHECK: for (int i = 1; i < 3; i++) { -# CHECK: for (int j = 1; j < 3; j++) { +# CHECK: for (const auto i : c10::irange(1, 3)) { +# CHECK: for (const auto j : c10::irange(1, 3)) { # CHECK-NOT: IfThenElse )IR", oss.str()); diff --git a/test/cpp/tensorexpr/tutorial.cpp b/test/cpp/tensorexpr/tutorial.cpp index 0ec0968bebf8fa..b89fcc3396df56 100644 --- a/test/cpp/tensorexpr/tutorial.cpp +++ b/test/cpp/tensorexpr/tutorial.cpp @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -146,8 +147,8 @@ int main(int argc, char* argv[]) { std::cout << "Nested for loops: " << std::endl << *loop_i_a << std::endl; // Prints: // Nested for loops: - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // A[i, j] = i + j; // } // } @@ -166,13 +167,13 @@ int main(int argc, char* argv[]) { // Prints: // Compound Block statement: // { - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // A[i, j] = i + j; // } // } - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // B[i, j] = A[i, j]; // } // } @@ -193,8 +194,8 @@ int main(int argc, char* argv[]) { << *C.stmt() << std::endl; // Prints: // Stmt produced by 'Compute' API: - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : 
c10::irange(32)) { // C[i, j] = i * j; // } // } @@ -239,13 +240,13 @@ int main(int argc, char* argv[]) { // Prints: // Stmt produced by 'Compute' API: // { - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // C[i, j] = i * (j + 1); // } // } - // for (int i_1 = 0; i_1 < 64; i_1++) { - // for (int j_1 = 0; j_1 < 32; j_1++) { + // for (const auto i_1 : c10::irange(64)) { + // for (const auto j_1 : c10::irange(32)) { // D[i_1, j_1] = (C[i_1, j_1]) - i_1; // } // } @@ -265,13 +266,13 @@ int main(int argc, char* argv[]) { // Prints: // LoopNest root stmt: // { - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // C[i, j] = i * (j + 1); // } // } - // for (int i_1 = 0; i_1 < 64; i_1++) { - // for (int j_1 = 0; j_1 < 32; j_1++) { + // for (const auto i_1 : c10::irange(64)) { + // for (const auto j_1 : c10::irange(32)) { // D[i_1, j_1] = (C[i_1, j_1]) - i_1; // } // } @@ -284,8 +285,8 @@ int main(int argc, char* argv[]) { // Prints: // Stmt after inlining: // { - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // D[i, j] = i * (j + 1) - i; // } // } @@ -298,8 +299,8 @@ int main(int argc, char* argv[]) { // Prints: // Stmt after simplification: // { - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // D[i, j] = i * j; // } // } @@ -319,15 +320,15 @@ int main(int argc, char* argv[]) { // Prints: // Stmt after splitWithTail: // { - // for (int i_outer = 0; i_outer < 4; i_outer++) { - // for (int i_inner = 0; i_inner < 13; i_inner++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i_outer : c10::irange(4)) { + // for (const auto 
i_inner : c10::irange(13)) { + // for (const auto j : c10::irange(32)) { // D[i_inner + 13 * i_outer, j] = i_inner * j + 13 * (i_outer * j); // } // } // } - // for (int i_tail = 0; i_tail < 12; i_tail++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i_tail : c10::irange(12)) { + // for (const auto j : c10::irange(32)) { // D[i_tail + 52, j] = i_tail * j + 52 * j; // } // } @@ -365,8 +366,8 @@ int main(int argc, char* argv[]) { std::cout << *loopnest.root_stmt() << std::endl; // Prints: // { - // for (int i = 0; i < 64; i++) { - // for (int j = 0; j < 32; j++) { + // for (const auto i : c10::irange(64)) { + // for (const auto j : c10::irange(32)) { // X[i, j] = (A[i, j]) + (B[i, j]); // } // } @@ -469,8 +470,8 @@ int main(int argc, char* argv[]) { // Prints: // TE Stmt constructed from TorchScript: // { - // for (int v = 0; v < 5; v++) { - // for (int _tail_tail = 0; _tail_tail < 3; _tail_tail++) { + // for (const auto v : c10::irange(5)) { + // for (const auto _tail_tail : c10::irange(3)) { // aten_add[_tail_tail + 3 * v] = (tA[_tail_tail + 3 * v]) * // ((tA[_tail_tail + 3 * v]) * (tB[_tail_tail + 3 * v])) + // (tB[_tail_tail + 3 * v]); diff --git a/test/custom_operator/op.cpp b/test/custom_operator/op.cpp index dd8ca4344bc1e7..943b6938b93812 100644 --- a/test/custom_operator/op.cpp +++ b/test/custom_operator/op.cpp @@ -1,3 +1,4 @@ +#include #include #include "op.h" @@ -11,7 +12,7 @@ torch::List custom_op( int64_t repeat) { torch::List output; output.reserve(repeat); - for (int64_t i = 0; i < repeat; ++i) { + for (const auto i : c10::irange(repeat)) { output.push_back(tensor * scalar); } return output; diff --git a/test/custom_operator/test_custom_ops.cpp b/test/custom_operator/test_custom_ops.cpp index ec22568c5a3eac..4cedd7ed0367b0 100644 --- a/test/custom_operator/test_custom_ops.cpp +++ b/test/custom_operator/test_custom_ops.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -44,7 +45,7 @@ void get_operator_from_registry_and_execute() { const auto 
manual = custom_op(torch::ones(5), 2.0, 3); TORCH_INTERNAL_ASSERT(output.size() == 3); - for (size_t i = 0; i < output.size(); ++i) { + for (const auto i : c10::irange(output.size())) { TORCH_INTERNAL_ASSERT(output[i].allclose(torch::ones(5) * 2)); TORCH_INTERNAL_ASSERT(output[i].allclose(manual[i])); } diff --git a/test/mobile/custom_build/predictor.cpp b/test/mobile/custom_build/predictor.cpp index aaae9062f77708..fc9786b82a1b12 100644 --- a/test/mobile/custom_build/predictor.cpp +++ b/test/mobile/custom_build/predictor.cpp @@ -4,6 +4,7 @@ #include #include +#include #include using namespace std; @@ -40,7 +41,7 @@ int main(int argc, const char* argv[]) { }(); std::cout << std::setprecision(3) << std::fixed; - for (int i = 0; i < 5; i++) { + for (const auto i : c10::irange(5)) { std::cout << output.data_ptr()[i] << std::endl; } return 0; diff --git a/torch/csrc/autograd/functions/basic_ops.h b/torch/csrc/autograd/functions/basic_ops.h index 2ff2ce8e499d34..8dfe0c3ab1fad0 100644 --- a/torch/csrc/autograd/functions/basic_ops.h +++ b/torch/csrc/autograd/functions/basic_ops.h @@ -4,8 +4,6 @@ #include #include -#include - #include #include #include diff --git a/torch/csrc/autograd/python_function.cpp b/torch/csrc/autograd/python_function.cpp index 76fbd8fb33463e..01b0eae6ab4365 100644 --- a/torch/csrc/autograd/python_function.cpp +++ b/torch/csrc/autograd/python_function.cpp @@ -589,7 +589,7 @@ static void _trace_post_record( // to the original tuple type. 
if (!unpack_output) { std::vector new_tuple_values; - for (int i = 0; i < num_outputs; ++i) { + for (const auto i : c10::irange(num_outputs)) { TypePtr ptr = node->outputs()[i]->type(); new_tuple_values.push_back(ptr); } diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index d566586de85142..415c49e4ec3170 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -1706,7 +1706,7 @@ void concrete_dispatch_fn( } // Find overloaded tensors - for (int64_t idx = 0; idx < arguments.size(); idx++) { + for (const auto idx : c10::irange(arguments.size())) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { const auto& tensor = ivalue.toTensor(); @@ -1728,12 +1728,12 @@ void concrete_dispatch_fn( } // Populate positional arguments - for (int64_t idx = 0; idx < positional_default_start; idx++) { + for (const auto idx : c10::irange(positional_default_start)) { PyTuple_SET_ITEM(args.ptr(), idx, torch::jit::toPyObject(std::move(arguments[idx])).release().ptr()); } // Populate keyword arguments - for (int64_t idx = kwarg_only_start; idx < arguments.size(); idx++) { + for (const auto idx : c10::irange(kwarg_only_start, arguments.size())) { // But don't populate default keyword arguments if (is_default(idx)) continue; const auto& arg = schema.arguments()[idx]; diff --git a/torch/csrc/deploy/loader.cpp b/torch/csrc/deploy/loader.cpp index a95cf6a272cc51..a9519865028f30 100644 --- a/torch/csrc/deploy/loader.cpp +++ b/torch/csrc/deploy/loader.cpp @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -152,7 +153,7 @@ size_t phdr_table_get_load_size( Elf64_Addr max_vaddr = 0; bool found_pt_load = false; - for (size_t i = 0; i < phdr_count; ++i) { + for (const auto i : c10::irange(phdr_count)) { const Elf64_Phdr* phdr = &phdr_table[i]; if (phdr->p_type != PT_LOAD) { @@ -383,7 +384,7 @@ std::pair> load_needed_from_elf_file( auto program_headers = (Elf64_Phdr*)(data + 
header_->e_phoff); auto n_program_headers = header_->e_phnum; const Elf64_Dyn* dynamic = nullptr; - for (size_t i = 0; i < n_program_headers; ++i) { + for (const auto i : c10::irange(n_program_headers)) { const Elf64_Phdr* phdr = &program_headers[i]; if (phdr->p_type == PT_DYNAMIC) { dynamic = reinterpret_cast(data + phdr->p_offset); @@ -405,7 +406,7 @@ std::pair> load_needed_from_elf_file( const char* segment_string_table = data + segment_headers[header_->e_shstrndx].sh_offset; - for (size_t i = 0; i < n_segments; ++i) { + for (const auto i : c10::irange(n_segments)) { const Elf64_Shdr* shdr = &segment_headers[i]; if (shdr->sh_type == SHT_STRTAB && strcmp(".dynstr", segment_string_table + shdr->sh_name) == 0) { @@ -641,7 +642,7 @@ struct AlreadyLoadedSymTable { const Elf64_Phdr* program_headers, size_t n_program_headers) { Elf64_Dyn* dynamic = nullptr; - for (size_t i = 0; i < n_program_headers; ++i) { + for (const auto i : c10::irange(n_program_headers)) { const Elf64_Phdr* phdr = &program_headers[i]; // Segment addresses in memory. @@ -871,7 +872,7 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl void load_segments() { // from bionic - for (size_t i = 0; i < n_program_headers_; ++i) { + for (const auto i : c10::irange(n_program_headers_)) { const Elf64_Phdr* phdr = &program_headers_[i]; // Segment addresses in memory. @@ -1141,17 +1142,17 @@ struct __attribute__((visibility("hidden"))) CustomLibraryImpl } void relocate() { - for (size_t i = 0; i < dyninfo_.n_rela_; ++i) { + for (const auto i : c10::irange(dyninfo_.n_rela_)) { relocate_one(dyninfo_.rela_[i]); } - for (size_t i = 0; i < dyninfo_.n_plt_rela_; ++i) { + for (const auto i : c10::irange(dyninfo_.n_plt_rela_)) { relocate_one(dyninfo_.plt_rela_[i]); } } void initialize() { call_function(dyninfo_.init_func_); - for (size_t i = 0; i < dyninfo_.n_init_array_; ++i) { + for (const auto i : c10::irange(dyninfo_.n_init_array_)) { call_function(dyninfo_.init_array_[i]); } initialized_ = true;