From 1433160a36890bed6cdbefafe8b2ceb0be60cb0d Mon Sep 17 00:00:00 2001
From: Richard Barnes
Date: Tue, 7 Dec 2021 16:06:18 -0800
Subject: [PATCH] use irange for loops 6 (#66742)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/66742

Modified loops in files under fbsource/fbcode/caffe2/ from the format

`for(TYPE var=x0;var<x_max;x++)`

to the format

`for(const auto var: irange(xmax))`

A short `c10::irange` usage sketch appears after the diff below.

diff --git a/caffe2/ideep/operators/conv_pool_base_op.h b/caffe2/ideep/operators/conv_pool_base_op.h
--- a/caffe2/ideep/operators/conv_pool_base_op.h
+++ b/caffe2/ideep/operators/conv_pool_base_op.h
@@ class IDEEPConvPoolOpBase : public ConvPoolOpBase<ideep::tensor> {
   bool RunOnDevice() override {
     if (!global_pooling_) {
-      for (int dim = 0; dim < kernel_.size(); ++dim) {
+      for (const auto dim : c10::irange(kernel_.size())) {
         CAFFE_ENFORCE_GT(kernel_[dim], 0);
       }
     }
diff --git a/caffe2/ideep/operators/conv_transpose_unpool_base_op.h b/caffe2/ideep/operators/conv_transpose_unpool_base_op.h
index 11d0f5f7365bd..aa28621804a02 100644
--- a/caffe2/ideep/operators/conv_transpose_unpool_base_op.h
+++ b/caffe2/ideep/operators/conv_transpose_unpool_base_op.h
@@ -109,7 +109,7 @@ class IDEEPConvTransposeUnpoolBase : public IDEEPOperator {
       CAFFE_ENFORCE_EQ(pads_.size(), 2 * kernel_.size());
     }

-    for (int dim = 0; dim < kernel_.size(); ++dim) {
+    for (const auto dim : c10::irange(kernel_.size())) {
       CAFFE_ENFORCE_GT(kernel_[dim], 0);
       CAFFE_ENFORCE_GT(stride_[dim], 0);
       CAFFE_ENFORCE_GE(adj_[dim], 0);
@@ -143,7 +143,7 @@ class IDEEPConvTransposeUnpoolBase : public IDEEPOperator {
     auto input_dims = input.get_dims();
     itensor::dims dims;
     dims.assign(input_dims.begin() + 2, input_dims.end());
-    for (int dim = 0; dim < dims.size(); ++dim) {
+    for (const auto dim : c10::irange(dims.size())) {
       int dim_size = 0;
       ComputeSizeAndPad(
           dims[dim],
diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h
index 9d6be29435272..e7925d9e5d1e1 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@@ -52,7 +52,7 @@ class IDEEPFallbackOp final : public IDEEPOperator {
     // Create output blobs in parent workspace,
     // then forward output blobs to local workspace.
     std::unordered_map<string, string> forwarded_output_blobs;
-    for (int i = 0; i < base_def_.output_size(); i++) {
+    for (const auto i : c10::irange(base_def_.output_size())) {
       // For in-place case, the in/output tensor for local_ws must be
       // re-created, instead of forwarding from current workspace.
string parent_name(base_def_.output(i)); @@ -81,7 +81,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { } bool RunOnDevice() override { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { if (InputIsType(i) && (Input(i).has_scale() || Input(i).get_data_type() == idtype::f32)) { @@ -128,7 +128,7 @@ class IDEEPFallbackOp final : public IDEEPOperator { return false; } - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { if (SkipOutputCopy::Contains(i)) { VLOG(1) << "Copy output: index " << i << " skipped."; continue; diff --git a/caffe2/ideep/utils/ideep_context.h b/caffe2/ideep/utils/ideep_context.h index 7df6763b1baf7..8850342098902 100644 --- a/caffe2/ideep/utils/ideep_context.h +++ b/caffe2/ideep/utils/ideep_context.h @@ -83,7 +83,7 @@ class IDEEPContext final : public BaseContext { static_cast(src), static_cast(dst)); } else { - for (size_t i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { dst[i] = src[i]; } } diff --git a/caffe2/image/image_input_op.h b/caffe2/image/image_input_op.h index 5262b6a116a4b..5d72898bfc694 100644 --- a/caffe2/image/image_input_op.h +++ b/caffe2/image/image_input_op.h @@ -8,6 +8,7 @@ #include #include "c10/core/thread_pool.h" +#include #include "caffe2/core/common.h" #include "caffe2/core/db.h" #include "caffe2/image/transform_gpu.h" @@ -387,7 +388,7 @@ ImageInputOp::ImageInputOp( << "."; std::mt19937 meta_randgen(time(nullptr)); - for (int i = 0; i < num_decode_threads_; ++i) { + for (const auto i : c10::irange(num_decode_threads_)) { randgen_per_thread_.emplace_back(meta_randgen()); } ReinitializeTensor( @@ -406,7 +407,7 @@ ImageInputOp::ImageInputOp( // data type for prefetched_label_ is actually not known here.. ReinitializeTensor(&prefetched_label_, sizes, at::dtype().device(CPU)); - for (int i = 0; i < additional_output_sizes_.size(); ++i) { + for (const auto i : c10::irange(additional_output_sizes_.size())) { prefetched_additional_outputs_on_device_.emplace_back(); prefetched_additional_outputs_.emplace_back(); } @@ -423,7 +424,7 @@ bool RandomSizedCropping(cv::Mat* img, const int crop, std::mt19937* randgen) { std::uniform_real_distribution<> aspect_ratio_dis(3.0 / 4.0, 4.0 / 3.0); cv::Mat cropping; - for (int i = 0; i < 10; ++i) { + for (const auto i : c10::irange(10)) { int target_area = int(ceil(area_dis(*randgen) * area)); float aspect_ratio = aspect_ratio_dis(*randgen); int nh = floor(std::sqrt(((float)target_area / aspect_ratio))); @@ -499,12 +500,12 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( } else { // Datum stores things in CHW order, let's do HWC for images to make // things more consistent with conventional image storage. 
- for (int c = 0; c < 3; ++c) { + for (const auto c : c10::irange(3)) { const char* datum_buffer = datum.data().data() + datum.height() * datum.width() * c; uchar* ptr = src.ptr(0) + c; - for (int h = 0; h < datum.height(); ++h) { - for (int w = 0; w < datum.width(); ++w) { + for (const auto h : c10::irange(datum.height())) { + for (const auto w : c10::irange(datum.width())) { *ptr = *(datum_buffer++); ptr += 3; } @@ -522,7 +523,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( vector additional_output_protos; int start = additional_inputs_offset_; int end = start + additional_inputs_count_; - for (int i = start; i < end; ++i) { + for (const auto i : c10::irange(start, end)) { additional_output_protos.push_back(protos.protos(i)); } @@ -588,7 +589,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(float) * num_labels_); - for (int i = 0; i < label_proto.float_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.float_data_size())) { label_data[(int)label_proto.float_data(i)] = 1.0; } } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) { @@ -596,7 +597,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(float) * num_labels_); - for (int i = 0; i < label_proto.float_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.float_data_size())) { label_data[(int)label_proto.float_data(i)] = weight_proto.float_data(i); } @@ -605,7 +606,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( CAFFE_ENFORCE(label_proto.float_data_size() == num_labels_); float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; - for (int i = 0; i < label_proto.float_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.float_data_size())) { label_data[i] = label_proto.float_data(i); } } else { @@ -620,7 +621,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( int* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(int) * num_labels_); - for (int i = 0; i < label_proto.int32_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.int32_data_size())) { label_data[label_proto.int32_data(i)] = 1; } } else if (label_type_ == MULTI_LABEL_WEIGHTED_SPARSE) { @@ -628,7 +629,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( float* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; memset(label_data, 0, sizeof(float) * num_labels_); - for (int i = 0; i < label_proto.int32_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.int32_data_size())) { label_data[label_proto.int32_data(i)] = weight_proto.float_data(i); } } else if ( @@ -636,7 +637,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( CAFFE_ENFORCE(label_proto.int32_data_size() == num_labels_); int* label_data = prefetched_label_.mutable_data() + item_id * num_labels_; - for (int i = 0; i < label_proto.int32_data_size(); ++i) { + for (const auto i : c10::irange(label_proto.int32_data_size())) { label_data[i] = label_proto.int32_data(i); } } else { @@ -646,14 +647,14 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( LOG(FATAL) << "Unsupported label data type."; } - for (int i = 0; i < additional_output_protos.size(); ++i) { + for (const auto i : c10::irange(additional_output_protos.size())) { auto additional_output_proto = additional_output_protos[i]; 
if (additional_output_proto.data_type() == TensorProto::FLOAT) { float* additional_output = prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.float_data_size(); - for (int j = 0; j < additional_output_proto.float_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.float_data_size())) { additional_output[j] = additional_output_proto.float_data(j); } } else if (additional_output_proto.data_type() == TensorProto::INT32) { @@ -661,7 +662,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.int32_data_size(); - for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.int32_data_size())) { additional_output[j] = additional_output_proto.int32_data(j); } } else if (additional_output_proto.data_type() == TensorProto::INT64) { @@ -669,7 +670,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.int64_data_size(); - for (int j = 0; j < additional_output_proto.int64_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.int64_data_size())) { additional_output[j] = additional_output_proto.int64_data(j); } } else if (additional_output_proto.data_type() == TensorProto::UINT8) { @@ -677,7 +678,7 @@ bool ImageInputOp::GetImageAndLabelAndInfoFromDBValue( prefetched_additional_outputs_[i].template mutable_data() + item_id * additional_output_proto.int32_data_size(); - for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) { + for (const auto j : c10::irange(additional_output_proto.int32_data_size())) { additional_output[j] = static_cast(additional_output_proto.int32_data(j)); } @@ -799,11 +800,11 @@ void Saturation( std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { float gray_color = img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + img[3 * p + 2] * 0.299f; - for (int c = 0; c < 3; ++c) { + for (const auto c : c10::irange(3)) { img[3 * p + c] = img[3 * p + c] * alpha + gray_color * (1.0f - alpha); } p++; @@ -821,9 +822,9 @@ void Brightness( float alpha = 1.0f + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < 3; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(3)) { img[p++] *= alpha; } } @@ -839,8 +840,8 @@ void Contrast( std::mt19937* randgen) { float gray_mean = 0; int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { // BGR to Gray scale image: R -> 0.299, G -> 0.587, B -> 0.114 gray_mean += img[3 * p] * 0.114f + img[3 * p + 1] * 0.587f + img[3 * p + 2] * 0.299f; @@ -852,9 +853,9 @@ void Contrast( float alpha = 1.0f + std::uniform_real_distribution(-alpha_rand, alpha_rand)(*randgen); p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < 3; ++c) { + for (const auto h : 
c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(3)) { img[p] = img[p] * alpha + gray_mean * (1.0f - alpha); p++; } @@ -880,7 +881,7 @@ void ColorJitter( jitter_order.end(), std::default_random_engine(seed)); - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { if (jitter_order[i] == 0) { Saturation(img, img_size, saturation, randgen); } else if (jitter_order[i] == 1) { @@ -902,21 +903,21 @@ void ColorLighting( std::mt19937* randgen) { std::normal_distribution d(0, alpha_std); std::vector alphas(3); - for (int i = 0; i < 3; ++i) { + for (const auto i : c10::irange(3)) { alphas[i] = d(*randgen); } std::vector delta_rgb(3, 0.0); - for (int i = 0; i < 3; ++i) { - for (int j = 0; j < 3; ++j) { + for (const auto i : c10::irange(3)) { + for (const auto j : c10::irange(3)) { delta_rgb[i] += eigvecs[i][j] * eigvals[j] * alphas[j]; } } int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < 3; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(3)) { img[p++] += delta_rgb[2 - c]; } } @@ -933,9 +934,9 @@ void ColorNormalization( const std::vector& mean, const std::vector& std) { int p = 0; - for (int h = 0; h < img_size; ++h) { - for (int w = 0; w < img_size; ++w) { - for (int c = 0; c < channels; ++c) { + for (const auto h : c10::irange(img_size)) { + for (const auto w : c10::irange(img_size)) { + for (const auto c : c10::irange(channels)) { img[p] = (img[p] - mean[c]) * std[c]; p++; } @@ -987,7 +988,7 @@ void TransformImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset + crop - 1; w >= width_offset; --w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(image_data_ptr++) = static_cast(cv_data[c]); } } @@ -997,7 +998,7 @@ void TransformImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset; w < width_offset + crop; ++w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(image_data_ptr++) = static_cast(cv_data[c]); } } @@ -1057,7 +1058,7 @@ void CropTransposeImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset + crop - 1; w >= width_offset; --w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(cropped_data++) = cv_data[c]; } } @@ -1067,7 +1068,7 @@ void CropTransposeImage( for (int h = height_offset; h < height_offset + crop; ++h) { for (int w = width_offset; w < width_offset + crop; ++w) { const uint8_t* cv_data = scaled_img.ptr(h) + w * channels; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { *(cropped_data++) = cv_data[c]; } } @@ -1166,7 +1167,7 @@ bool ImageInputOp::Prefetch() { prefetched_label_.mutable_data(); // Prefetching handled with a thread pool of "decode_threads" threads. 
- for (int item_id = 0; item_id < batch_size_; ++item_id) { + for (const auto item_id : c10::irange(batch_size_)) { std::string key, value; cv::Mat img; @@ -1189,7 +1190,7 @@ bool ImageInputOp::Prefetch() { LOG(FATAL) << "Unsupported label type."; } - for (int i = 0; i < additional_inputs_count_; ++i) { + for (const auto i : c10::irange(additional_inputs_count_)) { int index = additional_inputs_offset_ + i; TensorProto additional_output_proto = protos.protos(index); auto sizes = @@ -1264,7 +1265,7 @@ bool ImageInputOp::Prefetch() { ReinitializeAndCopyFrom( &prefetched_label_on_device_, device, prefetched_label_); - for (int i = 0; i < prefetched_additional_outputs_on_device_.size(); ++i) { + for (const auto i : c10::irange(prefetched_additional_outputs_on_device_.size())) { ReinitializeAndCopyFrom( &prefetched_additional_outputs_on_device_[i], device, @@ -1290,7 +1291,7 @@ bool ImageInputOp::CopyPrefetched() { OperatorBase::OutputTensorCopyFrom( 1, options, prefetched_label_, /* async */ true); - for (int i = 2; i < OutputSize(); ++i) { + for (const auto i : c10::irange(2, OutputSize())) { OperatorBase::OutputTensorCopyFrom( i, options, prefetched_additional_outputs_[i - 2], /* async */ true); } @@ -1331,7 +1332,7 @@ bool ImageInputOp::CopyPrefetched() { OperatorBase::OutputTensorCopyFrom( 1, type, prefetched_label_on_device_, /* async */ true); - for (int i = 2; i < OutputSize(); ++i) { + for (const auto i : c10::irange(2, OutputSize())) { OperatorBase::OutputTensorCopyFrom( i, type, diff --git a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h index 38e54cd59bff4..c82c723020ad2 100644 --- a/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h +++ b/caffe2/mobile/contrib/ios/mpscnn/mpscnn_kernels.h @@ -523,7 +523,7 @@ kernel void col2im( } } else { half4 components(0, 0, 0, 0); - for (auto i = 0; i < 4; ++i) { + for (const auto i : c10::irange(4)) { ushort c_col_i = n * divRoundUp(kernel_h * kernel_w * C, 4) * 4 + h_k * kernel_w * C + w_k * C + c * 4 + i; ushort c_col_i_z = c_col_i / 4; @@ -826,7 +826,7 @@ kernel void concat( ushort2 gid_ = ushort2(gid.x, gid.y); half4 value; - for (int off = 0; off < 4; ++off) { + for (const auto off : c10::irange(4)) { ushort cur_channel = c * 4 + off; ushort cur_idx = 0; if (cur_channel >= C) { @@ -1013,8 +1013,8 @@ kernel void roi_warp(texture2d_array ina[[texture(0), func const RoIT count = iy_upper * ix_upper; RoIT4 output_val = 0.0; - for (int iy = 0; iy < iy_upper; iy++) { - for (int ix = 0; ix < ix_upper; ix++) { + for (const auto iy : c10::irange(iy_upper)) { + for (const auto ix : c10::irange(ix_upper)) { const RoIT y = roi_start_h + ph * bin_size_h + iy * bin_size_h / static_cast(roi_bin_grid_h); const RoIT x = @@ -1141,7 +1141,7 @@ kernel void channel_shuffle( const ushort c = gid.z - n * divRoundUp(C, 4); half4 value; ushort2 gid_ = gid.xy; - for (int off = 0; off < 4; ++off) { + for (const auto off : c10::irange(4)) { ushort cur_channel = c * 4 + off; if (cur_channel >= C) { break; diff --git a/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp b/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp index f3badf7775085..1945198afa7f0 100644 --- a/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp +++ b/caffe2/mobile/contrib/libopencl-stub/include/CL/cl.hpp @@ -1196,7 +1196,7 @@ inline cl_int getInfoHelper(Func f, cl_uint name, size_t* param, long) return err; } - for(int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { (*param)[i] = value[i]; } diff --git 
a/caffe2/operators/arg_ops.h b/caffe2/operators/arg_ops.h index 782401da21473..b05e97fdb55af 100644 --- a/caffe2/operators/arg_ops.h +++ b/caffe2/operators/arg_ops.h @@ -1,13 +1,14 @@ #ifndef CAFFE2_OPERATORS_ARG_OPS_H_ #define CAFFE2_OPERATORS_ARG_OPS_H_ -#include -#include -#include - #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" +#include + +#include +#include +#include namespace caffe2 { @@ -43,7 +44,7 @@ class ArgOp final : public Operator { Y_dims.reserve(ndim); int prev_size = 1; int next_size = 1; - for (int i = 0; i < axis_; ++i) { + for (const auto i : c10::irange(axis_)) { Y_dims.push_back(X_dims[i]); prev_size *= X_dims[i]; } diff --git a/caffe2/operators/assert_op.h b/caffe2/operators/assert_op.h index 097427834c89c..c2528e5485c3e 100644 --- a/caffe2/operators/assert_op.h +++ b/caffe2/operators/assert_op.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include namespace caffe2 { @@ -21,9 +22,9 @@ class AssertOp final : public Operator { bool DoRunWithType() { // Copy into CPU context for comparison cmp_tensor_.CopyFrom(Input(0)); - auto* cmp_data = cmp_tensor_.template data(); + auto *const cmp_data = cmp_tensor_.template data(); - for (int64_t i = 0; i < cmp_tensor_.numel(); ++i) { + for (const auto i : c10::irange(cmp_tensor_.numel())) { CAFFE_ENFORCE((bool)cmp_data[i], [&]() { std::stringstream ss; ss << "Assert failed for element " << i diff --git a/caffe2/operators/batch_gather_ops.h b/caffe2/operators/batch_gather_ops.h index 3d00970b04b21..8736b475b8515 100644 --- a/caffe2/operators/batch_gather_ops.h +++ b/caffe2/operators/batch_gather_ops.h @@ -80,7 +80,7 @@ class BatchGatherGradientOp final : public Operator { CAFFE_ENFORCE_GE(data.dim(), 2, "DATA should be at least 2-D"); // Outer dimensions of input data and gradient should be the same // because they are preserved for gathers with axis > 0. 
- for (int acheck = 0; acheck < axis; acheck++) { + for (const auto acheck : c10::irange(axis)) { CAFFE_ENFORCE_EQ( data.size(acheck), grad.size(acheck), @@ -105,7 +105,7 @@ class BatchGatherGradientOp final : public Operator { auto idx_inner_dims_product = indices.size_from_dim(axis); if (match_outer) { CAFFE_ENFORCE_GE(axis, 1, "Axis should be at least 1"); - for (auto i = 0; i < axis; i++) { + for (const auto i : c10::irange(axis)) { CAFFE_ENFORCE_EQ( data.size(i), indices.size(i), @@ -120,11 +120,11 @@ class BatchGatherGradientOp final : public Operator { gather_helper::check_indexarray_range( idxs, N, src_indexing_axis_dim, false); - for (auto batch = 0; batch < outer_dims_product; ++batch) { + for (const auto batch : c10::irange(outer_dims_product)) { auto grad_batch_base = grad_data + batch * gathered_grad_batch_size; auto out_batch_base = out_data + batch * batch_size; - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (match_outer) { idx = idxs[batch * idx_inner_dims_product + i]; diff --git a/caffe2/operators/bisect_percentile_op.h b/caffe2/operators/bisect_percentile_op.h index 5e924b293a850..8dc71795df89f 100644 --- a/caffe2/operators/bisect_percentile_op.h +++ b/caffe2/operators/bisect_percentile_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -74,11 +75,12 @@ class BisectPercentileOp final : public Operator { int feature_length = 0; int cur_index = 0; - for (int i = 0; i < num_features; ++i) { + for (const auto i : c10::irange(num_features)) { cur_index = i; feature_start_index = index[i]; feature_length = pct_lens_[i]; - for (int j = 0; j < batch_size; ++j) { + for (const auto j : c10::irange(batch_size)) { + (void)j; // Suppress unused variable warning pct_output[cur_index] = compute_percentile( pct_raw_.begin() + feature_start_index, pct_mapping_.begin() + feature_start_index, diff --git a/caffe2/operators/byte_weight_dequant_op.h b/caffe2/operators/byte_weight_dequant_op.h index c7b786325bb48..cf31c497946bf 100644 --- a/caffe2/operators/byte_weight_dequant_op.h +++ b/caffe2/operators/byte_weight_dequant_op.h @@ -25,7 +25,7 @@ class ByteWeightDequantOp : public Operator { auto* Y = Output(0, shape_, at::dtype()); float bin_interval = (max_ - min_) / 255.0; int total = 1; - for (auto i = 0U; i < shape_.size(); i++) { + for (const auto i : c10::irange(0U, shape_.size())) { total *= Y->size(i); } const uint8_t* Xdata; diff --git a/caffe2/operators/cast_op.h b/caffe2/operators/cast_op.h index 478f2f30380c2..e1e71b7196b57 100644 --- a/caffe2/operators/cast_op.h +++ b/caffe2/operators/cast_op.h @@ -1,5 +1,6 @@ #pragma once +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -41,7 +42,7 @@ class CastOp : public Operator { const auto* data = input.template data(); auto* out = output->template mutable_data(); auto N = input.size(); - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { out[i] = static_cast(data[i]); } return true; diff --git a/caffe2/operators/cc_bmm_bg_op.h b/caffe2/operators/cc_bmm_bg_op.h index 3560d6a59dc2e..537bef8060fea 100644 --- a/caffe2/operators/cc_bmm_bg_op.h +++ b/caffe2/operators/cc_bmm_bg_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/types.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -38,7 +39,7 @@ bool 
ConcatBatchMatMulBatchGatherOp::RunOnDevice() { int adj_size = input_zero.dim() + 1; int canonical_axis = 1; CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range."); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { CAFFE_ENFORCE( Input(i).dtype() == input_zero.dtype(), "All inputs must have the same type, expected: ", @@ -50,7 +51,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { } int before = 1, after = 1; - for (int i = 0; i < input_zero.dim(); ++i) { + for (const auto i : c10::irange(input_zero.dim())) { int dim = input_zero.dim32(i); if (i < canonical_axis) { before *= dim; @@ -58,7 +59,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { after *= dim; } // check the input dims are compatible. - for (int j = 2; j < InputSize(); ++j) { + for (const auto j : c10::irange(2, InputSize())) { int dim_j = Input(j).dim32(i); CAFFE_ENFORCE( dim == dim_j, @@ -93,7 +94,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { auto* output = Output(0, output_dims, at::dtype()); // std::stringstream ss; // ss << "["; - // for(int i = 0; i < output_dims.size(); i++) ss << output_dims[i]; + // for (const auto i : c10::irange(output_dims.size()))ss << output_dims[i]; // ss << "]"; // LOG(INFO) << "output size: " << ss.str(); @@ -107,7 +108,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { #pragma omp for for (int b = 0; b < batch_size; ++b) { // concat input to scratch - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { auto* input_data = Input(i).template data(); memcpy( &scratch_input[(i - 1) * embed_size], @@ -130,7 +131,7 @@ bool ConcatBatchMatMulBatchGatherOp::RunOnDevice() { // do gather int64_t output_offset = b * gather_size; - for (int i = 0; i < gather_size; i++) { + for (const auto i : c10::irange(gather_size)) { output_data[output_offset + i] = scratch_output[indices_data[i]]; } } diff --git a/caffe2/operators/ceil_op.h b/caffe2/operators/ceil_op.h index 3283fbe8d9f1f..e3bbd7d7fcb63 100644 --- a/caffe2/operators/ceil_op.h +++ b/caffe2/operators/ceil_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" +#include namespace caffe2 { @@ -17,11 +18,11 @@ class CeilOp final : public Operator { bool RunOnDevice() override { auto& X = Input(0); - auto* Y = Output(0, X.sizes(), at::dtype()); + auto *const Y = Output(0, X.sizes(), at::dtype()); - const float* Xdata = X.template data(); - float* Ydata = Y->template mutable_data(); - for (int i = 0; i < X.numel(); ++i) { + const float *const Xdata = X.template data(); + float *const Ydata = Y->template mutable_data(); + for (const auto i : c10::irange(X.numel())) { Ydata[i] = std::ceil(Xdata[i]); } return true; diff --git a/caffe2/operators/concat_split_op.h b/caffe2/operators/concat_split_op.h index 3338e9b9ae35c..3173719dcdf13 100644 --- a/caffe2/operators/concat_split_op.h +++ b/caffe2/operators/concat_split_op.h @@ -7,6 +7,7 @@ #include "caffe2/utils/math.h" #include "caffe2/utils/string_utils.h" #include +#include namespace caffe2 { @@ -161,7 +162,7 @@ bool SplitOp::RunOnDevice() { input_channels); vector output_dims(input.sizes().vec()); int before = 1, after = 1; - for (int i = 0; i < canonical_axis; ++i) { + for (const auto i : c10::irange(canonical_axis)) { before *= input.dim32(i); } for (int i = canonical_axis + 1; i < input.dim(); ++i) { @@ -174,7 +175,7 @@ bool SplitOp::RunOnDevice() { const auto *const input_ptr = 
static_cast(input.raw_data()); size_t input_offset = 0; - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto *const output = Output(i); const auto axis_dim = add_axis_ ? 1 : axis_data[i]; if (!add_axis_) { @@ -264,7 +265,7 @@ bool SplitByLengthsOp::RunOnDevice() { dim_multiplier = 1; } - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto* output = Output(i); const auto* axis_offset = axis_data + lengths_length / OutputSize() * i; auto axis_dim = @@ -301,7 +302,7 @@ bool ConcatOp::RunOnDevice() { int adj_size = input_zero.dim() + (add_axis_ ? 1 : 0); int canonical_axis = canonical_axis_index_(axis_, adj_size); CAFFE_ENFORCE_LT(canonical_axis, adj_size, "Axis not in input ndim range."); - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), @@ -315,7 +316,7 @@ bool ConcatOp::RunOnDevice() { int before = 1, after = 1; vector output_dims(input_zero.sizes().vec()); - for (int i = 0; i < input_zero.dim(); ++i) { + for (const auto i : c10::irange(input_zero.dim())) { if (i == canonical_axis && !add_axis_) { continue; } @@ -326,7 +327,7 @@ bool ConcatOp::RunOnDevice() { after *= dim; } // check the input dims are compatible. - for (int j = 1; j < InputSize(); ++j) { + for (const auto j : c10::irange(1, InputSize())) { int dim_j = Input(j).dim32(i); CAFFE_ENFORCE_EQ( dim, @@ -351,7 +352,7 @@ bool ConcatOp::RunOnDevice() { } int output_channels = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { axis_data[i] = add_axis_ ? 1 : Input(i).dim32(canonical_axis); output_channels += axis_data[i]; } @@ -368,7 +369,7 @@ bool ConcatOp::RunOnDevice() { } size_t output_offset = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { auto& input = Input(i); auto axis_dim = add_axis_ ? 1 : input.dim32(canonical_axis); math::CopyMatrix( diff --git a/caffe2/operators/conv_op_impl.h b/caffe2/operators/conv_op_impl.h index 29294a1c0ea1b..d95fc5f9e48c5 100644 --- a/caffe2/operators/conv_op_impl.h +++ b/caffe2/operators/conv_op_impl.h @@ -93,7 +93,8 @@ bool ConvOp::RunOnDeviceWithOrderNCHW() { col_buffer->Resize(buffer_shape); T* col_buffer_data = col_buffer->template mutable_data(); // Im2Col, followed by gemm. - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning if (kernel_.size() == 2) { math::Im2Col( C, @@ -277,7 +278,8 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { col_buffer->Resize(buffer_shape); T* col_buffer_data = col_buffer->template mutable_data(); // Im2Col, followed by gemm. 
- for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning if (kernel_.size() <= 2) { math::Im2Col( C, @@ -314,7 +316,7 @@ bool ConvOp::RunOnDeviceWithOrderNHWC() { group_); } // Weight term - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { // col_buffer_data in G (H W) (R S C/G) layout // filter_data in G K/G (R S C/G) layout math::GemmEx( @@ -398,8 +400,8 @@ bool ConvOp::Run1x1ConvOnDeviceWithOrderNCHW( std::vector X_ptr(N * G); std::vector W_ptr(N * G); std::vector Y_ptr(N * G); - for (int i = 0; i < N; ++i) { - for (int j = 0; j < G; ++j) { + for (const auto i : c10::irange(N)) { + for (const auto j : c10::irange(G)) { const int index = i * G + j; X_ptr[index] = X + index * X_stride; W_ptr[index] = filter + j * W_stride; @@ -454,7 +456,7 @@ bool ConvOp::Run1x1ConvOnDeviceWithOrderNHWC( T* Y) { const int G = group_; const int kernel_dim = C / G; - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::GemmEx( CblasNoTrans, CblasTrans, @@ -511,7 +513,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { int kernel_dims_size = 1; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE_EQ(filter.dim32(i + 2), kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -588,8 +590,9 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { const int input_offset = C / group_ * input_image_size; const int output_offset = dY.numel() / dY.dim32(0) / group_; const int filter_offset = filter.numel() / group_; - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { // When we compute the gradient with respect to the filters, we need to do // im2col to allow gemm-type computation. if (kernel_.size() == 2) { @@ -662,8 +665,9 @@ bool ConvGradientOp::RunOnDeviceWithOrderNCHW() { no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype()); T* dXdata = dX->template mutable_data(); dYdata = dY.template data(); - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { // Compute gradient into col_buffer. math::Gemm( CblasTrans, @@ -739,7 +743,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { CAFFE_ENFORCE_EQ(C, filter.dim32(filter.dim() - 1) * group_); int kernel_dims_size = 1; - for (size_t i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE_EQ(filter.dim32(i + 1), kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -812,7 +816,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { // image. const size_t input_offset = C * input_image_size; const size_t output_offset = dY.numel() / dY.dim32(0); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // When we compute the gradient with respect to the filters, we need to do // im2col to allow gemm-type computation. 
if (kernel_.size() <= 2) { @@ -851,7 +855,7 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { group_); } // Gradient with respect to filter. - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::GemmEx( CblasTrans, CblasNoTrans, @@ -890,9 +894,9 @@ bool ConvGradientOp::RunOnDeviceWithOrderNHWC() { auto* dX = Output( no_bias_ ? BIAS_OR_INPUT_GRAD : INPUT_GRAD, X.sizes(), at::dtype()); T* dXdata = dX->template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Compute gradient into col_buffer. - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::GemmEx( CblasNoTrans, CblasNoTrans, diff --git a/caffe2/operators/conv_pool_op_base.h b/caffe2/operators/conv_pool_op_base.h index b356ef952d79c..1470a5aa4f50d 100644 --- a/caffe2/operators/conv_pool_op_base.h +++ b/caffe2/operators/conv_pool_op_base.h @@ -1,9 +1,7 @@ #ifndef CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_ #define CAFFE2_OPERATORS_CONV_POOL_OP_BASE_H_ -#include -#include - +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -11,6 +9,9 @@ #include "caffe2/proto/caffe2_legacy.pb.h" #include "caffe2/utils/math.h" +#include +#include + // This macro is here just to allow us to experiment with padding values that // determines, when we have an odd number of pads, which side gets the one // additional pad value, the head side, or the tail side. Setting it to false @@ -139,7 +140,7 @@ class ConvPoolOpBase : public Operator { } if (global_pooling_) { - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE( pads_[2 * dim] == 0 && pads_[2 * dim + 1] == 0 && dilation_[dim] == 1 && stride_[dim] == 1, @@ -152,7 +153,7 @@ class ConvPoolOpBase : public Operator { // need to clean this up. 
if (operator_def.name().find("Conv") == 0 || operator_def.name().find("Pool") != std::string::npos) { - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GE(pads_[dim], 0); CAFFE_ENFORCE_GE(pads_[kernel_.size() + dim], 0); CAFFE_ENFORCE( @@ -162,7 +163,7 @@ class ConvPoolOpBase : public Operator { } } - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GE(kernel_[dim], 0); CAFFE_ENFORCE_GE(dilation_[dim], 0); CAFFE_ENFORCE_GE(stride_[dim], 0); @@ -281,7 +282,7 @@ class ConvPoolOpBase : public Operator { std::copy_n(input_dims.cbegin() + offset, ndim, kernel->begin()); std::fill_n(output_dims->begin() + offset, ndim, 1LL); } else { - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { ComputeSizeAndPad( input_dims[i + offset], stride[i], @@ -320,7 +321,7 @@ class ConvPoolOpBase : public Operator { std::copy_n(input_dims.cbegin() + offset, ndim, kernel->begin()); std::fill_n(output_dims->begin() + offset, ndim, 1LL); } else { - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { ComputeSizeAndPad64( input_dims[i + offset], stride[i], @@ -342,7 +343,7 @@ class ConvPoolOpBase : public Operator { } else if (legacy_pad_ != LegacyPadding::NOTSET) { int output_unused; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int dim = 0; dim < dims.size(); ++dim) { + for (const auto dim : c10::irange(dims.size())) { ComputeSizeAndPad( dims[dim], stride_[dim], @@ -381,7 +382,7 @@ class ConvPoolOpBase : public Operator { reset_tensor_device_ = true; } else { const int* tensor_data = tensor->template data(); - for (int d_i = 0; d_i < data.size(); ++d_i) { + for (const auto d_i : c10::irange(data.size())) { if (tensor_data[d_i] != data[d_i]) { reset_tensor_device_ = true; break; @@ -411,7 +412,7 @@ class ConvPoolOpBase : public Operator { bool RunOnDevice() override { if (!global_pooling_) { - for (size_t dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GT(kernel_[dim], 0); } } diff --git a/caffe2/operators/conv_transpose_op_impl.h b/caffe2/operators/conv_transpose_op_impl.h index e3be8a81ce033..a7e989f0fccde 100644 --- a/caffe2/operators/conv_transpose_op_impl.h +++ b/caffe2/operators/conv_transpose_op_impl.h @@ -78,7 +78,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNCHW() { buffer_shape, at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer->template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Weight term if (G == 1) { math::Gemm( @@ -231,7 +231,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNHWC() { buffer_shape, at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Weight term if (G == 1) { math::Gemm( @@ -247,7 +247,7 @@ bool ConvTransposeOp::RunOnDeviceWithOrderNHWC() { col_buffer_data, &context_); } else { - for (int group_id = 0; group_id < G; ++group_id) { + for (const auto group_id : c10::irange(G)) { math::GemmEx( CblasNoTrans, CblasNoTrans, @@ -374,7 +374,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNCHW() { at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto 
image_id : c10::irange(N)) { // gradient w.r.t. filters. Im2Col followed by Gemm // Im2Col. math::Im2Col( @@ -539,7 +539,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { at::dtype().device(Context::GetDeviceType())); T* col_buffer_data = col_buffer_.template mutable_data(); - for (int image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // gradient w.r.t. filters. Im2Col followed by Gemm // Im2Col. math::Im2Col( @@ -575,7 +575,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { dfilter_data, &context_); } else { - for (int group_id = 0; group_id < G; ++group_id) { + for (const auto group_id : c10::irange(G)) { math::GemmEx( CblasTrans, CblasNoTrans, @@ -610,7 +610,7 @@ bool ConvTransposeGradientOp::RunOnDeviceWithOrderNHWC() { dX_data + image_id * M * X_HxW, &context_); } else { - for (int group_id = 0; group_id < G; ++group_id) { + for (const auto group_id : c10::irange(G)) { math::GemmEx( CblasNoTrans, CblasTrans, diff --git a/caffe2/operators/conv_transpose_op_mobile_impl.h b/caffe2/operators/conv_transpose_op_mobile_impl.h index 45fc78ce9bc9e..25ac65190bbf8 100644 --- a/caffe2/operators/conv_transpose_op_mobile_impl.h +++ b/caffe2/operators/conv_transpose_op_mobile_impl.h @@ -76,7 +76,7 @@ void runTileContiguous( int colBlockSize = (W + kernelW / strideW); int numColBlocks = strideW; - for (int c = 0; c < kernelDataSize; ++c) { + for (const auto c : c10::irange(kernelDataSize)) { int w_offset = c % kernelW; int h_offset = (c / kernelW) % kernelH; int c_im = c / kernelH / kernelW; @@ -276,13 +276,13 @@ void reinterleaveRows( float32x4_t v0[kStrideW]; float32x4_t v1[kStrideW]; - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v0[i] = vld1q_f32(src + i * colBlockSize); v1[i] = vld1q_f32(src + i * colBlockSize + 4); } // add per-channel bias - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v0[i] = vaddq_f32(v0[i], biasV); v1[i] = vaddq_f32(v1[i], biasV); } @@ -300,12 +300,12 @@ void reinterleaveRows( for (; w < inputW - 1; ++w) { float v[kStrideW]; - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v[i] = src[i * colBlockSize]; } // add per-channel bias - for (int i = 0; i < kStrideW; ++i) { + for (const auto i : c10::irange(kStrideW)) { v[i] += b; } @@ -614,12 +614,12 @@ bool ConvTransposeMobileOp::RunOnDeviceWithOrderNCHW() { numThreads * threadColBufferSize); // Group together thread buffers for accumulation std::vector toSum(numThreads - 1); - for (int i = 1; i < numThreads; ++i) { + for (const auto i : c10::irange(1, numThreads)) { toSum[i - 1] = threadBuffer->template mutable_data() + i * threadYBufferSizeAligned; } - for (auto image_id = 0; image_id < N; ++image_id) { + for (const auto image_id : c10::irange(N)) { // Each time through, we have to reset all per-thread output // buffers, since the output buffer is only per-batch element // The column buffers are overwritten by the matrix multiplication diff --git a/caffe2/operators/conv_transpose_unpool_op_base.h b/caffe2/operators/conv_transpose_unpool_op_base.h index db7fedf32a765..6619ac379875a 100644 --- a/caffe2/operators/conv_transpose_unpool_op_base.h +++ b/caffe2/operators/conv_transpose_unpool_op_base.h @@ -121,7 +121,7 @@ class ConvTransposeUnpoolBase : public Operator { } // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int dim = 0; dim < kernel_.size(); ++dim) { + for (const auto dim : c10::irange(kernel_.size())) { CAFFE_ENFORCE_GT(kernel_[dim], 
0); CAFFE_ENFORCE_GT(stride_[dim], 0); CAFFE_ENFORCE_GE(adj_[dim], 0); diff --git a/caffe2/operators/deform_conv_op_impl.h b/caffe2/operators/deform_conv_op_impl.h index 0dfb1564acc19..011b1bf9204b3 100644 --- a/caffe2/operators/deform_conv_op_impl.h +++ b/caffe2/operators/deform_conv_op_impl.h @@ -77,7 +77,7 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { 1); int kernel_dims_size = 1; - for (int i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -155,8 +155,8 @@ bool DeformConvOp::RunOnDeviceWithOrderNCHW() { col_buffer->Resize(buffer_shape); T* col_buffer_data = col_buffer->template mutable_data(); // Im2col, followed by gemm. - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + for (const auto group_id : c10::irange(group_)) { DeformableIm2col( Xdata + group_id * input_offset, offset_data, @@ -271,7 +271,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { 1); int kernel_dims_size = 1; - for (int i = 0; i < kernel_.size(); ++i) { + for (const auto i : c10::irange(kernel_.size())) { CAFFE_ENFORCE(filter.dim32(i + 2) == kernel_[i]); kernel_dims_size *= kernel_[i]; } @@ -342,8 +342,8 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { math::Set(dX->numel(), 0, dXdata, &context_); } - for (int image_id = 0; image_id < N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(N)) { + for (const auto group_id : c10::irange(group_)) { math::Gemm( CblasTrans, CblasNoTrans, @@ -378,7 +378,7 @@ bool DeformConvGradientOp::RunOnDeviceWithOrderNCHW() { DeformableIm2col( Xdata, offset_data, X.sizes(), col_buffer_shape, col_buffer_data); - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto group_id : c10::irange(group_)) { math::Gemm( CblasNoTrans, CblasTrans, diff --git a/caffe2/operators/dense_vector_to_id_list_op.h b/caffe2/operators/dense_vector_to_id_list_op.h index b532500d4b101..d2c5f756789a9 100644 --- a/caffe2/operators/dense_vector_to_id_list_op.h +++ b/caffe2/operators/dense_vector_to_id_list_op.h @@ -5,6 +5,7 @@ #include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -33,9 +34,9 @@ class DenseVectorToIdListOp : public Operator { auto v_pos = 0; auto l_pos = 0; - for (auto i = 0; i < batch_size; i++) { + for (const auto i : c10::irange(batch_size)) { auto length = 0; - for (int j = 0; j < col_num; j++) { + for (const auto j : c10::irange(col_num)) { if ((int)(input_data[i * col_num + j] + 0.5) != 0) { out_values_data[v_pos++] = j; length++; diff --git a/caffe2/operators/distance_op.h b/caffe2/operators/distance_op.h index 11b43b630b76a..5b8556611b612 100644 --- a/caffe2/operators/distance_op.h +++ b/caffe2/operators/distance_op.h @@ -4,6 +4,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -37,7 +38,7 @@ class SquaredL2DistanceGradientOp final : public Operator { int N = X.dim() > 0 ? X.dim32(0) : 1; int D = N > 0 ? 
X.numel() / N : 0; CAFFE_ENFORCE(X.dim() == Y.dim()); - for (int i = 0; i < X.dim(); ++i) { + for (const auto i : c10::irange(X.dim())) { CAFFE_ENFORCE(X.dim32(i) == Y.dim32(i)); } CAFFE_ENFORCE(dDistance.dim() == 1); @@ -50,7 +51,7 @@ class SquaredL2DistanceGradientOp final : public Operator { Y.template data(), dX->template mutable_data(), &context_); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { math::Scale( D, dDistance.template data() + i, @@ -227,7 +228,7 @@ class DotProductWithPaddingGradientOp final : public Operator { const auto* dDot_data = dDot.template data(); auto* dX_data = dX->template mutable_data(); auto* dY_data = dY->template mutable_data(); - for (int i = 0; i < N; ++i) { // TODO: multithreading + for (const auto i : c10::irange(N)) { // TODO: multithreading auto offsetX = i * DX; auto offsetY = i * DY; if (replicate_) { diff --git a/caffe2/operators/do_op.h b/caffe2/operators/do_op.h index a6a3134450781..d3d2da0c2079f 100644 --- a/caffe2/operators/do_op.h +++ b/caffe2/operators/do_op.h @@ -11,6 +11,7 @@ #include "caffe2/core/operator.h" #include "caffe2/operators/create_scope_op.h" #include "caffe2/proto/caffe2_pb.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -46,7 +47,7 @@ class DoOp final : public Operator { const auto& outer_blob_names = checkAndGetOuterNames(operator_def); std::unordered_set used_outer_names; - for (size_t blob_idx = 0; blob_idx < inner_blobs.size(); ++blob_idx) { + for (const auto blob_idx : c10::irange(inner_blobs.size())) { CAFFE_ENFORCE( !blob_bindings_.count(inner_blobs[blob_idx]), "Invalid blob bindings: redefinition of inner blob " + @@ -154,7 +155,7 @@ class DoOp final : public Operator { const OperatorDef& operator_def) const { std::vector names; names.reserve(operator_def.input_size()); - for (auto idx = 0; idx < operator_def.input_size(); ++idx) { + for (const auto idx : c10::irange(operator_def.input_size())) { names.push_back(operator_def.input(idx)); } return names; @@ -164,7 +165,7 @@ class DoOp final : public Operator { const OperatorDef& operator_def) const { std::vector names; names.reserve(operator_def.output_size()); - for (auto idx = 0; idx < operator_def.output_size(); ++idx) { + for (const auto idx : c10::irange(operator_def.output_size())) { names.push_back(operator_def.output(idx)); } return names; diff --git a/caffe2/operators/elementwise_logical_ops.h b/caffe2/operators/elementwise_logical_ops.h index 1d74e1e1ca47d..df7e0d09d734e 100644 --- a/caffe2/operators/elementwise_logical_ops.h +++ b/caffe2/operators/elementwise_logical_ops.h @@ -51,7 +51,7 @@ class WhereOp final : public Operator { if (enable_broadcast_) { size_t block_size = left.size_from_dim(1); - for (int i = 0; i < select.numel(); i++) { + for (const auto i : c10::irange(select.numel())) { size_t offset = i * block_size; if (select_data[i]) { context_.CopyItemsSameDevice( @@ -68,7 +68,7 @@ class WhereOp final : public Operator { } } } else { - for (int i = 0; i < select.numel(); ++i) { + for (const auto i : c10::irange(select.numel())) { output_data[i] = select_data[i] ? 
left_data[i] : right_data[i]; } } @@ -159,7 +159,7 @@ class IsMemberOfOp final : public Operator { const T* input_data = input.template data(); bool* output_data = output->template mutable_data(); - for (int i = 0; i < input.numel(); ++i) { + for (const auto i : c10::irange(input.numel())) { output_data[i] = values.find(input_data[i]) != values.end(); } return true; diff --git a/caffe2/operators/elementwise_op_test.h b/caffe2/operators/elementwise_op_test.h index 7dcdbc1c0684e..a26cfa83662d1 100644 --- a/caffe2/operators/elementwise_op_test.h +++ b/caffe2/operators/elementwise_op_test.h @@ -62,7 +62,7 @@ void elementwiseAnd() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{true, false, false, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -83,7 +83,7 @@ void elementwiseAnd() { EXPECT_EQ(Z.numel(), M * N); std::vector result{ true, false, false, false, true, false, false, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -108,7 +108,7 @@ void elementwiseOr() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{true, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -128,7 +128,7 @@ void elementwiseOr() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), M * N); std::vector result{true, true, true, false, true, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -153,7 +153,7 @@ void elementwiseXor() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{false, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -174,7 +174,7 @@ void elementwiseXor() { EXPECT_EQ(Z.numel(), M * N); std::vector result{ false, true, true, false, false, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -198,7 +198,7 @@ void elementwiseNot() { caffe2::Tensor Y(blob->Get(), caffe2::CPU); EXPECT_EQ(Y.numel(), N); std::vector result{false, true}; - for (size_t i = 0; i < Y.numel(); ++i) { + for (const auto i : c10::irange(Y.numel())) { EXPECT_EQ(Y.template data()[i], result[i]); } } @@ -220,7 +220,7 @@ void elementwiseEQ() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{false, true, false, true}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -237,7 +237,7 @@ void elementwiseEQ() { caffe2::Tensor Z(blob->Get(), caffe2::CPU); EXPECT_EQ(Z.numel(), N); std::vector result{true, true, false, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } @@ -257,7 +257,7 @@ void elementwiseEQ() { EXPECT_EQ(Z.numel(), M * N); std::vector result{ true, false, false, true, false, true, true, false}; - for (size_t i = 0; i < Z.numel(); ++i) { + for (const auto i : c10::irange(Z.numel())) { EXPECT_EQ(Z.template data()[i], result[i]); } } diff --git 
a/caffe2/operators/enforce_finite_op.h b/caffe2/operators/enforce_finite_op.h index c1a788151ae51..0030c9f23c5ba 100644 --- a/caffe2/operators/enforce_finite_op.h +++ b/caffe2/operators/enforce_finite_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -32,7 +33,7 @@ class EnforceFiniteOp final : public Operator { const T* input_data = input.template data(); auto size = input.numel(); - for (auto i = 0; i < size; i++) { + for (const auto i : c10::irange(size)) { auto isfinite = std::isfinite(input_data[i]); if (!isfinite) { LogBlobFiniteness(); diff --git a/caffe2/operators/expand_op.h b/caffe2/operators/expand_op.h index 2532277c38603..075ab653ab6ab 100644 --- a/caffe2/operators/expand_op.h +++ b/caffe2/operators/expand_op.h @@ -7,6 +7,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/types.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -95,7 +96,7 @@ class ExpandGradientOp final : public Operator { auto* dX = Output(0, X.sizes(), at::dtype()); std::vector axes; const int offset = ndim - X.dim(); - for (int i = 0; i < ndim; i++) { + for (const auto i : c10::irange(ndim)) { if (i < offset || dX_dims[i - offset] == 1) { axes.push_back(i); } diff --git a/caffe2/operators/expand_squeeze_dims_op.h b/caffe2/operators/expand_squeeze_dims_op.h index be2f17bcd3106..79f76cce958ec 100644 --- a/caffe2/operators/expand_squeeze_dims_op.h +++ b/caffe2/operators/expand_squeeze_dims_op.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_OPERATORS_EXPAND_SQUEEZE_DIMS_OP_H_ #define CAFFE2_OPERATORS_EXPAND_SQUEEZE_DIMS_OP_H_ +#include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" @@ -91,8 +92,7 @@ class SqueezeOp : public Operator { const std::vector& dims) { size_t j = 0; std::vector newDims; - for (size_t i = 0; i < inputDims.size(); ++i) { - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) + for (const auto i : c10::irange(inputDims.size())) { if (j < dims.size() && dims[j] == i) { CAFFE_ENFORCE_EQ( inputDims[i], diff --git a/caffe2/operators/feature_maps_ops.h b/caffe2/operators/feature_maps_ops.h index be09137309641..876e6d23cdf33 100644 --- a/caffe2/operators/feature_maps_ops.h +++ b/caffe2/operators/feature_maps_ops.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -32,8 +33,8 @@ class MergeDenseFeatureTensorsOp : public Operator { const bool* inPresenceData = Input(1).template data(); int totalNumFeatures = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatures; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatures)) { if (inPresenceData[exampleIndex * numFeatures + inputIndex]) { ++totalNumFeatures; } @@ -51,10 +52,10 @@ class MergeDenseFeatureTensorsOp : public Operator { Input(0).template data(); int keysOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; auto offset = exampleIndex * numFeatures; - for (int inputIndex = 0; inputIndex < numFeatures; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatures)) { if (inPresenceData[offset]) { ++outLengthsData[exampleIndex]; outKeysData[keysOffset] = featureIDs_[inputIndex]; @@ -94,10 +95,10 @@ class 
MergeSingleScalarFeatureTensorsOp : public Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); int totalNumFeatures = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 1).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { ++totalNumFeatures; } @@ -113,9 +114,9 @@ class MergeSingleScalarFeatureTensorsOp : public Operator { T* outValuesData = outValues->template mutable_data(); int keysOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const T* inData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = @@ -158,7 +159,7 @@ class MergeSingleScalarFeatureTensorsGradientOp : public Operator { template bool DoRunWithType() { int numExamples = Input(0).numel(); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { Output(inputIndex)->ResizeLike(Input(inputIndex)); } @@ -166,8 +167,8 @@ class MergeSingleScalarFeatureTensorsGradientOp : public Operator { T default_value = T(); int valuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const bool* inPresenceData = Input(inputIndex).template data(); T* outFeatureData = Output(inputIndex)->template mutable_data(); if (inPresenceData[exampleIndex]) { @@ -210,12 +211,12 @@ class MergeSingleListFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 2).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { ++totalNumFeatures; totalNumValues += inLengthsData[exampleIndex]; @@ -237,12 +238,12 @@ class MergeSingleListFeatureTensorsOp : public Operator { int keysOffset = 0; int valuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const auto& inValues = Input(kNumTensorsPerInput * inputIndex + 1); @@ -295,13 +296,13 @@ class MergeSingleListOrMapFeatureTensorsGradientOp : public 
Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); std::vector outValuesOffset(numFeatureInputs_); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { int inputNumValues = 0; const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 1).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { inputNumValues += inLengthsData[exampleIndex]; } @@ -313,8 +314,8 @@ class MergeSingleListOrMapFeatureTensorsGradientOp : public Operator { const T* inValuesValuesGradData = inValuesValuesGrad.template data(); int inValuesValuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = @@ -371,12 +372,12 @@ class MergeSingleMapFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const bool* inPresenceData = Input(kNumTensorsPerInput * inputIndex + 3).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { if (inPresenceData[exampleIndex]) { ++totalNumFeatures; totalNumValues += inLengthsData[exampleIndex]; @@ -400,12 +401,12 @@ class MergeSingleMapFeatureTensorsOp : public Operator { int keysOffset = 0; int valuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const auto& inKeys = Input(kNumTensorsPerInput * inputIndex + 1); @@ -465,7 +466,7 @@ class MergeMultiScalarFeatureTensorsOp : public Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); int totalNumFeatures = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); } @@ -478,12 +479,12 @@ class MergeMultiScalarFeatureTensorsOp : public Operator { T* outValuesData = outValues->template mutable_data(); int outKeysOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inKeysOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 
0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); auto inputKeysBlobIdx = kNumTensorsPerInput * inputIndex + 1; @@ -537,11 +538,11 @@ class MergeMultiScalarFeatureTensorsGradientOp : public Operator { bool DoRunWithType() { int numExamples = Input(0).numel(); std::vector outValuesOffset(numFeatureInputs_); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { int inputNumValues = 0; const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { inputNumValues += inLengthsData[exampleIndex]; } Output(inputIndex)->Resize(inputNumValues); @@ -551,8 +552,8 @@ class MergeMultiScalarFeatureTensorsGradientOp : public Operator { const T* inValuesGradData = inValuesGrad.template data(); int inValuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); if (inLengthsData[exampleIndex] > 0) { @@ -600,7 +601,7 @@ class MergeMultiListFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); totalNumValues += Input(kNumTensorsPerInput * inputIndex + 3).numel(); } @@ -619,13 +620,13 @@ class MergeMultiListFeatureTensorsOp : public Operator { int outKeysOffset = 0; int outValuesValuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inKeysOffset_[inputIndex] = 0; inValuesValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1) @@ -699,7 +700,7 @@ class MergeMultiMapFeatureTensorsOp : public Operator { int numExamples = Input(0).numel(); int totalNumFeatures = 0; int totalNumValues = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); totalNumValues += Input(kNumTensorsPerInput * inputIndex + 4).numel(); } @@ -720,13 +721,13 @@ class MergeMultiMapFeatureTensorsOp : public Operator { int outKeysOffset = 0; int outValuesValuesOffset = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { inKeysOffset_[inputIndex] = 0; inValuesValuesOffset_[inputIndex] = 0; } - for (int exampleIndex = 0; 
exampleIndex < numExamples; ++exampleIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { outLengthsData[exampleIndex] = 0; - for (int inputIndex = 0; inputIndex < numInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1) @@ -798,13 +799,12 @@ class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator { int numExamples = Input(0).numel(); std::vector outValuesLengthOffset(numFeatureInputs_); std::vector outValuesValuesOffset(numFeatureInputs_); - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { int inputNumValues = 0; auto& inValuesLength = Input(kNumTensorsPerInput * inputIndex + 1); const int32_t* inValuesLengthsData = inValuesLength.template data(); - for (int valuesIndex = 0; valuesIndex < inValuesLength.numel(); - ++valuesIndex) { + for (const auto valuesIndex : c10::irange(inValuesLength.numel())) { inputNumValues += inValuesLengthsData[valuesIndex]; } Output(inputIndex)->Resize(inputNumValues); @@ -814,8 +814,8 @@ class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator { const T* inValuesValuesGradData = inValuesValuesGrad.template data(); int inValuesValuesOffset = 0; - for (int exampleIndex = 0; exampleIndex < numExamples; ++exampleIndex) { - for (int inputIndex = 0; inputIndex < numFeatureInputs_; ++inputIndex) { + for (const auto exampleIndex : c10::irange(numExamples)) { + for (const auto inputIndex : c10::irange(numFeatureInputs_)) { const int32_t* inLengthsData = Input(kNumTensorsPerInput * inputIndex).template data(); const int32_t* inValuesLengthsData = diff --git a/caffe2/operators/filler_op.h b/caffe2/operators/filler_op.h index 7e01a01792bf5..b95f8401f761a 100644 --- a/caffe2/operators/filler_op.h +++ b/caffe2/operators/filler_op.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_OPERATORS_FILLER_OP_H_ #define CAFFE2_OPERATORS_FILLER_OP_H_ +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -536,7 +537,7 @@ class LengthsRangeFillOp : public Operator { auto* output_data = output->template mutable_data(); int32_t offset = 0; - for (int i = 0; i < input.numel(); ++i) { + for (const auto i : c10::irange(input.numel())) { auto len = input_data[i]; auto start = output_data + offset; std::iota( diff --git a/caffe2/operators/find_duplicate_elements_op.h b/caffe2/operators/find_duplicate_elements_op.h index 681d56e47c0b5..ca05b616678a4 100644 --- a/caffe2/operators/find_duplicate_elements_op.h +++ b/caffe2/operators/find_duplicate_elements_op.h @@ -1,12 +1,13 @@ #ifndef CAFFE2_OPERATORS_FIND_DUPLICATE_ELEMENTS_OP_H #define CAFFE2_OPERATORS_FIND_DUPLICATE_ELEMENTS_OP_H -#include -#include - #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" +#include "c10/util/irange.h" + +#include +#include namespace caffe2 { @@ -44,7 +45,7 @@ class FindDuplicateElementsOp final : public Operator { auto* output = Output(0, {static_cast(dupSize)}, at::dtype()); auto* out_ptr = output->template mutable_data(); - for (size_t i = 0; i < dupSize; ++i) { + for (const auto i : c10::irange(dupSize)) { out_ptr[i] = dupIndices[i]; } diff --git a/caffe2/operators/find_op.h b/caffe2/operators/find_op.h index 4b5bfdc852f14..bd26b39122a6d 100644 --- a/caffe2/operators/find_op.h +++ b/caffe2/operators/find_op.h 
@@ -4,6 +4,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" +#include "c10/util/irange.h" #include @@ -42,7 +43,7 @@ class FindOp final : public Operator { // index into a map if (needles.numel() < 16) { // Brute force O(nm) - for (int i = 0; i < needles.numel(); i++) { + for (const auto i : c10::irange(needles.numel())) { T x = needles_data[i]; T res = static_cast(missing_value_); for (int j = idx_size - 1; j >= 0; j--) { @@ -56,10 +57,10 @@ class FindOp final : public Operator { } else { // O(n + m) std::unordered_map idx_map; - for (int j = 0; j < idx_size; j++) { + for (const auto j : c10::irange(idx_size)) { idx_map[idx_data[j]] = j; } - for (int i = 0; i < needles.numel(); i++) { + for (const auto i : c10::irange(needles.numel())) { T x = needles_data[i]; auto it = idx_map.find(x); res_data[i] = (it == idx_map.end() ? missing_value_ : it->second); diff --git a/caffe2/operators/floor_op.h b/caffe2/operators/floor_op.h index 6af9b414814a2..5006d6fd0bdae 100644 --- a/caffe2/operators/floor_op.h +++ b/caffe2/operators/floor_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -21,7 +22,7 @@ class FloorOp final : public Operator { const float* Xdata = X.template data(); float* Ydata = Y->template mutable_data(); - for (int i = 0; i < X.numel(); ++i) { + for (const auto i : c10::irange(X.numel())) { Ydata[i] = std::floor(Xdata[i]); } return true; diff --git a/caffe2/operators/fused_rowwise_8bit_conversion_ops.h b/caffe2/operators/fused_rowwise_8bit_conversion_ops.h index cf593a10e0f49..23c7f968bf166 100644 --- a/caffe2/operators/fused_rowwise_8bit_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_8bit_conversion_ops.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/operators/reducer_functors.h" @@ -82,7 +83,7 @@ class FloatToFused8BitRowwiseQuantizedOp : public Operator { vector tmp(input_columns); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t row = 0; row < input_rows; ++row) { + for (const auto row : c10::irange(input_rows)) { convert(tmp.data(), input_data + row * input_columns, input_columns); if (out_sb_half) { FloatToFusedNBitRowwiseQuantizedSBHalf( @@ -163,7 +164,7 @@ class Fused8BitRowwiseQuantizedToFloatOp : public Operator { vector tmp(input_columns); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t row = 0; row < input_rows; ++row) { + for (const auto row : c10::irange(input_rows)) { if (in_sb_half) { FusedNBitRowwiseQuantizedSBHalfToFloat( 8, diff --git a/caffe2/operators/fused_rowwise_nbit_conversion_ops.h b/caffe2/operators/fused_rowwise_nbit_conversion_ops.h index 363743eebeb32..a03fdf65521a0 100644 --- a/caffe2/operators/fused_rowwise_nbit_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_nbit_conversion_ops.h @@ -131,7 +131,7 @@ class FloatToFusedNBitRowwiseQuantizedOp final : public Operator { *output_row_scale = scale; *output_row_bias = Xmin; - for (int col = 0; col < input_columns; ++col) { + for (const auto col : c10::irange(input_columns)) { float X = tmp[col]; std::uint8_t quantized = std::max( 0, @@ -206,7 +206,7 @@ class FusedNBitRowwiseQuantizedToFloatOp final : public Operator { std::vector tmp(output_columns); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t row = 0; row < input_rows; ++row) { + 
for (const auto row : c10::irange(input_rows)) { const std::uint8_t* input_row = input_data + row * input_columns; float scale = *reinterpret_cast( input_row + @@ -216,7 +216,7 @@ class FusedNBitRowwiseQuantizedToFloatOp final : public Operator { (output_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + sizeof(at::Half)); - for (int col = 0; col < output_columns; ++col) { + for (const auto col : c10::irange(output_columns)) { std::uint8_t quantized = input_row[col / NUM_ELEM_PER_BYTE]; quantized >>= (col % NUM_ELEM_PER_BYTE) * BIT_RATE; quantized &= (1 << BIT_RATE) - 1; diff --git a/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h b/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h index 105f9902b2159..9e506aa7d9432 100644 --- a/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h +++ b/caffe2/operators/fused_rowwise_nbitfake_conversion_ops.h @@ -118,7 +118,7 @@ class FloatToFusedNBitFakeRowwiseQuantizedOp final output_row_scale_bias[1] = minimum_element; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t col = 0; col < input_columns; ++col) { + for (const auto col : c10::irange(input_columns)) { output_row[col] = std::max( 0, std::min( diff --git a/caffe2/operators/gather_fused_8bit_rowwise_op.h b/caffe2/operators/gather_fused_8bit_rowwise_op.h index a111ff3eca4db..4892b4cbb407d 100644 --- a/caffe2/operators/gather_fused_8bit_rowwise_op.h +++ b/caffe2/operators/gather_fused_8bit_rowwise_op.h @@ -37,7 +37,7 @@ class GatherFused8BitRowwiseOp : public Operator { const Index* idxs = indices.template data(); auto out = output->template mutable_data(); - for (int i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; CAFFE_ENFORCE( 0 <= idx && idx < data.size(0), diff --git a/caffe2/operators/gather_op.h b/caffe2/operators/gather_op.h index 52f45c1989836..ae1fecf927938 100644 --- a/caffe2/operators/gather_op.h +++ b/caffe2/operators/gather_op.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include namespace caffe2 { @@ -44,7 +45,7 @@ static void check_indexarray_range( IndexType indexing_axis_dim, bool wrap_indices) { // - for (auto i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto idx = indices[i]; if (wrap_indices && idx < 0) { idx = idx + indexing_axis_dim; @@ -114,7 +115,7 @@ static bool gather_impl( auto N = indices.numel(); if (match_outer) { CAFFE_ENFORCE_GE(axis, 1, "Axis should be at least 1"); - for (auto i = 0; i < axis; i++) { + for (const auto i : c10::irange(axis)) { CAFFE_ENFORCE_EQ( data.size(i), indices.size(i), @@ -129,12 +130,12 @@ static bool gather_impl( // Special-case single-float copy for efficiency if (data.template IsType() && block_size == 1) { - for (auto batch = 0; batch < outer_dims_product; ++batch) { + for (const auto batch : c10::irange(outer_dims_product)) { const float* src_floats = (const float*)(src_base + batch * src_batch_bytesize); float* dst_floats = (float*)(out + batch * gathered_batch_bytesize); - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (match_outer) { idx = idxs[batch * idx_inner_dims_product + i]; @@ -148,8 +149,8 @@ static bool gather_impl( } else { // outer_dims_product specifies how many times we repeat inner dimensions, // so we just iterate over it to cover all outer dimensions. 
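// [Editor's illustration -- not part of the patch.] In the nested gather
// loops below, each nesting level converts independently: c10::irange(n)
// yields 0, 1, ..., n-1, so the loop bodies are unchanged. A minimal,
// self-contained sketch of the nested pattern (function and names are
// hypothetical):
#include <c10/util/irange.h>
#include <vector>
void scale_rows(std::vector<float>& data, int rows, int cols) {
  for (const auto r : c10::irange(rows)) {   // r runs 0 .. rows-1
    for (const auto c : c10::irange(cols)) { // c runs 0 .. cols-1
      data[r * cols + c] *= 2.0f;
    }
  }
}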
- for (auto batch = 0; batch < outer_dims_product; ++batch) { - for (auto i = 0; i < N; ++i) { + for (const auto batch : c10::irange(outer_dims_product)) { + for (const auto i : c10::irange(N)) { auto idx = idxs[i]; if (match_outer) { idx = idxs[batch * idx_inner_dims_product + i]; diff --git a/caffe2/operators/gather_ranges_to_dense_op.h b/caffe2/operators/gather_ranges_to_dense_op.h index 0fdc430ea441e..7e314f703aa97 100644 --- a/caffe2/operators/gather_ranges_to_dense_op.h +++ b/caffe2/operators/gather_ranges_to_dense_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/common_omp.h" #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" @@ -42,7 +43,8 @@ class GatherRangesToDenseOp final : public Operator { CAFFE_ENFORCE_GT( minObservation_, 0, "The number of observations is at least 1"); // Initialize the empty and mismatch counter. - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { + (void)i; // Suppress unused variable warning emptyRanges_.push_back(0); mismatchedRanges_.push_back(0); mismatchedLengths_.push_back(set()); @@ -105,7 +107,7 @@ class GatherRangesToDenseOp final : public Operator { auto batchSize = ranges.size(0); vector outputDims{batchSize, 0}; vector outputRawData; - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { auto* output = Output(i); outputDims[1] = lengths_[i]; output->Resize(outputDims); @@ -114,8 +116,8 @@ class GatherRangesToDenseOp final : public Operator { outputRawData.push_back(ptr); } - for (int i = 0; i < batchSize; ++i) { - for (int j = 0; j < OutputSize(); ++j) { + for (const auto i : c10::irange(batchSize)) { + for (const auto j : c10::irange(OutputSize())) { auto rangeStart = rangesData[rangesDataOffset++]; auto rangeLength = rangesData[rangesDataOffset++]; @@ -143,7 +145,7 @@ class GatherRangesToDenseOp final : public Operator { auto& key = Input(KEY); auto* key_data = key.template data(); vector> buffer; - for (int b_i = 0; b_i < rangeLength; ++b_i) { + for (const auto b_i : c10::irange(rangeLength)) { int64_t one_key_item = key_data[rangeStart + b_i]; auto* one_data_item = rawData + (rangeStart + b_i) * itemsize; buffer.emplace_back(one_key_item, one_data_item); @@ -155,7 +157,7 @@ class GatherRangesToDenseOp final : public Operator { const std::pair& right) { return left.first < right.first; }); - for (int b_i = 0; b_i < rangeLength; ++b_i) { + for (const auto b_i : c10::irange(rangeLength)) { // Since this CPU only, directly copy to the destination. std::memcpy( outputRawData[j] + (i * lengths_[j] + b_i) * itemsize, @@ -170,7 +172,7 @@ class GatherRangesToDenseOp final : public Operator { // Check whether the empty and mismatch ratio exceeded the threshold. totalRanges_ += batchSize; - for (int j = 0; j < OutputSize(); ++j) { + for (const auto j : c10::irange(OutputSize())) { // Only check when the ratio is not set to allow all mismatches. 
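// [Editor's illustration -- not part of the patch.] When the index only
// counts iterations, as in the GatherRangesToDenseOp constructor hunk
// above, the conversion keeps c10::irange and adds "(void)i;" so the
// now-unused loop variable does not trigger a compiler warning. A minimal
// sketch (hypothetical names):
#include <c10/util/irange.h>
#include <vector>
std::vector<int> zero_counters(int n) {
  std::vector<int> counters;
  for (const auto i : c10::irange(n)) {
    (void)i; // suppress unused variable warning; only the count matters
    counters.push_back(0);
  }
  return counters;
}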
if (maxMismatchedRatio_ < 1.0) { CAFFE_ENFORCE_GE( diff --git a/caffe2/operators/generate_proposals_op_util_boxes.h b/caffe2/operators/generate_proposals_op_util_boxes.h index dacee9eb7db60..0a402cdb6a3c1 100644 --- a/caffe2/operators/generate_proposals_op_util_boxes.h +++ b/caffe2/operators/generate_proposals_op_util_boxes.h @@ -294,7 +294,7 @@ EArrXXt clip_boxes_rotated( EArrXXt ret(boxes.rows(), boxes.cols()); ret = boxes; - for (int i = 0; i < upright_boxes.rows(); ++i) { + for (const auto i : c10::irange(upright_boxes.rows())) { ret.row(indices[i]) = upright_boxes.row(i); } return ret; diff --git a/caffe2/operators/generate_proposals_op_util_nms.h b/caffe2/operators/generate_proposals_op_util_nms.h index ff68d49251c1e..7fb9c3767cf35 100644 --- a/caffe2/operators/generate_proposals_op_util_nms.h +++ b/caffe2/operators/generate_proposals_op_util_nms.h @@ -247,7 +247,7 @@ int rotated_rect_intersection_pts( // Specical case of rect1 == rect2 bool same = true; - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { if (fabs(pts1[i].x() - pts2[i].x()) > samePointEps || (fabs(pts1[i].y() - pts2[i].y()) > samePointEps)) { same = false; @@ -256,7 +256,7 @@ int rotated_rect_intersection_pts( } if (same) { - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { intersections[i] = pts1[i]; } num = 4; @@ -265,14 +265,14 @@ int rotated_rect_intersection_pts( // Line vector // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { vec1[i] = pts1[(i + 1) % 4] - pts1[i]; vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } // Line test - test all line combos for intersection - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { + for (const auto i : c10::irange(4)) { + for (const auto j : c10::irange(4)) { // Solve for 2x2 Ax=b // This takes care of parallel lines @@ -298,7 +298,7 @@ int rotated_rect_intersection_pts( const auto& DA = vec2[3]; auto ABdotAB = AB.squaredNorm(); auto ADdotAD = DA.squaredNorm(); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { // assume ABCD is the rectangle, and P is the point to be judged // P is inside ABCD iff. P's projection on AB lies within AB // and P's projection on AD lies within AD @@ -321,7 +321,7 @@ int rotated_rect_intersection_pts( const auto& DA = vec1[3]; auto ABdotAB = AB.squaredNorm(); auto ADdotAD = DA.squaredNorm(); - for (int i = 0; i < 4; i++) { + for (const auto i : c10::irange(4)) { auto AP = pts2[i] - pts1[0]; auto APdotAB = AP.dot(AB); @@ -351,7 +351,7 @@ int convex_hull_graham( // if more than 1 points have the same minimum y, // pick the one with the mimimum x. int t = 0; - for (int i = 1; i < num_in; i++) { + for (const auto i : c10::irange(1, num_in)) { if (p[i].y() < p[t].y() || (p[i].y() == p[t].y() && p[i].x() < p[t].x())) { t = i; } @@ -360,7 +360,7 @@ int convex_hull_graham( // Step 2: // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { + for (const auto i : c10::irange(num_in)) { q[i] = p[i] - s; } @@ -415,8 +415,7 @@ int convex_hull_graham( // But if we're only interested in getting the area/perimeter of the shape // We can simply return. 
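// [Editor's illustration -- not part of the patch.] Loops that start above
// zero, such as the irange(1, num_in) conversion in convex_hull_graham
// above, use the two-argument overload c10::irange(begin, end), which
// yields begin, begin+1, ..., end-1. A minimal sketch (hypothetical names):
#include <c10/util/irange.h>
int argmin_after_first(const float* v, int n) {
  int best = 0;
  for (const auto i : c10::irange(1, n)) { // i runs 1 .. n-1
    if (v[i] < v[best]) {
      best = i;
    }
  }
  return best;
}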
if (!shift_to_zero) { - for (int i = 0; i < m; i++) - q[i] += s; + for (const auto i : c10::irange(m)) q[i] += s; } return m; @@ -518,8 +517,8 @@ Eigen::ArrayXXf bbox_overlaps_rotated( const auto& query_boxes_areas = query_boxes.col(2) * query_boxes.col(3); Eigen::ArrayXXf overlaps(boxes.rows(), query_boxes.rows()); - for (int i = 0; i < boxes.rows(); ++i) { - for (int j = 0; j < query_boxes.rows(); ++j) { + for (const auto i : c10::irange(boxes.rows())) { + for (const auto j : c10::irange(query_boxes.rows())) { auto inter = bbox_intersection_rotated(boxes.row(i), query_boxes.row(j)); overlaps(i, j) = (inter == 0.0) ? 0.0 @@ -554,7 +553,7 @@ std::vector nms_cpu_rotated( EArrX areas = widths * heights; std::vector rotated_rects(proposals.rows()); - for (int i = 0; i < proposals.rows(); ++i) { + for (const auto i : c10::irange(proposals.rows())) { rotated_rects[i] = bbox_to_rotated_rect(proposals.row(i)); } @@ -616,7 +615,7 @@ std::vector soft_nms_cpu_rotated( EArrX areas = widths * heights; std::vector rotated_rects(proposals.rows()); - for (int i = 0; i < proposals.rows(); ++i) { + for (const auto i : c10::irange(proposals.rows())) { rotated_rects[i] = bbox_to_rotated_rect(proposals.row(i)); } diff --git a/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h b/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h index 13a504017c12d..5cab269dc407e 100644 --- a/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h +++ b/caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.h @@ -1,5 +1,6 @@ #pragma once +#include <c10/util/irange.h> #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -60,8 +61,7 @@ class GivenTensorByteStringToUInt8FillOp final : public FillerOp { {static_cast(str.size())}, at::dtype().device(CPU)); uint8_t* values_data = values_.template mutable_data(); - // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < str.size(); i++) { + for (const auto i : c10::irange(str.size())) { values_data[i] = static_cast(str[i]); } } diff --git a/caffe2/operators/given_tensor_fill_op.h b/caffe2/operators/given_tensor_fill_op.h index 9b975e910b107..8a20aa813d612 100644 --- a/caffe2/operators/given_tensor_fill_op.h +++ b/caffe2/operators/given_tensor_fill_op.h @@ -68,7 +68,7 @@ class GivenTensorFillOp final : public FillerOp { at::dtype().device(CPU)); Type* values_data = values_.template mutable_data(); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < source_values.size(); i++) { + for (const auto i : c10::irange(source_values.size())) { values_data[i] = static_cast(source_values[i]); } body_ = &GivenTensorFillOp::FillWithType; diff --git a/caffe2/operators/gru_unit_op.h b/caffe2/operators/gru_unit_op.h index 721b882797e08..2bf1476e9b5cc 100644 --- a/caffe2/operators/gru_unit_op.h +++ b/caffe2/operators/gru_unit_op.h @@ -4,6 +4,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include <c10/util/irange.h> namespace caffe2 { namespace detail { @@ -29,10 +30,10 @@ void GRUUnit( bool drop_states, T* H, Context* /*context*/) { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const bool valid = seqLengths == nullptr || t < seqLengths[n]; - for (int d = 0; d < D; ++d) { + for (const auto d : c10::irange(D)) { if (!valid) { if (drop_states) { H[d] = 0; @@ -68,10 +69,10 @@ void GRUUnitGradient( T* H_prev_diff, T* X_diff, Context* /*context*/) { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const bool valid = seqLengths ==
nullptr || t < seqLengths[n]; - for (int d = 0; d < D; ++d) { + for (const auto d : c10::irange(D)) { T* h_prev_diff = H_prev_diff + d; T* reset_diff = X_diff + 0 * D + d; T* update_diff = X_diff + 1 * D + d; diff --git a/caffe2/operators/h_softmax_op.h b/caffe2/operators/h_softmax_op.h index 395f8f651b54b..3943e87477202 100644 --- a/caffe2/operators/h_softmax_op.h +++ b/caffe2/operators/h_softmax_op.h @@ -2,6 +2,7 @@ #define CAFFE2_OPERATORS_H_SOFTMAX_OP_H_ #include +#include #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -51,7 +52,7 @@ class HSoftmaxOpBase : public Operator { int M, std::unordered_map& hierarchy) const { int size = 0; - for (int label = 0; label < M; ++label) { + for (const auto label : c10::irange(M)) { int word_id = labels[label]; const auto& path = hierarchy[word_id]; size += std::accumulate( diff --git a/caffe2/operators/histogram_op.h b/caffe2/operators/histogram_op.h index 2900089839357..25d8d602dbbdc 100644 --- a/caffe2/operators/histogram_op.h +++ b/caffe2/operators/histogram_op.h @@ -1,8 +1,10 @@ #pragma once +#include "caffe2/core/operator.h" +#include "c10/util/irange.h" + #include #include -#include "caffe2/core/operator.h" namespace caffe2 { @@ -19,7 +21,7 @@ class HistogramOp final : public Operator { 2, "Number of bin edges must be greater than or equal to 2."); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 1; i < bin_edges_.size(); i++) { + for (const auto i : c10::irange(1, bin_edges_.size())) { CAFFE_ENFORCE_GT( bin_edges_[i], bin_edges_[i - 1], @@ -41,11 +43,11 @@ class HistogramOp final : public Operator { math::Set( bin_edges_.size() - 1, 0, histogram_data, &context_); - for (int input_idx = 0; input_idx < InputSize(); input_idx++) { + for (const auto input_idx : c10::irange(InputSize())) { const auto& x = Input(input_idx); const int64_t N = x.numel(); const auto* x_data = x.template data(); - for (int64_t data_idx = 0; data_idx < N; data_idx++) { + for (const auto data_idx : c10::irange(N)) { const auto bisection_it = std::upper_bound( bin_edges_.begin(), bin_edges_.end(), x_data[data_idx]); const int bisection_idx = bisection_it - bin_edges_.begin(); @@ -67,7 +69,7 @@ class HistogramOp final : public Operator { void CheckInputs() { const auto& input_zero = Input(0); - for (int i = 1; i < InputSize(); i++) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), diff --git a/caffe2/operators/im2col_op.h b/caffe2/operators/im2col_op.h index 5bb07ea41f43d..a7880e0e62334 100644 --- a/caffe2/operators/im2col_op.h +++ b/caffe2/operators/im2col_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -84,7 +85,7 @@ class Im2ColOp final : public Operator { const size_t dx = X.numel() / N; const size_t dy = Y->numel() / N; - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Im2Col( @@ -114,7 +115,7 @@ class Im2ColOp final : public Operator { const size_t dx = X.numel() / N; const size_t dy = Y->numel() / N; - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Im2Col( @@ -230,7 +231,7 @@ class Col2ImOp final : public Operator { // could template-specialize this, but 
it's test code... switch (order_) { case StorageOrder::NCHW: { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Col2Im( @@ -253,7 +254,7 @@ class Col2ImOp final : public Operator { } }; break; case StorageOrder::NHWC: { - for (int n = 0; n < N; ++n) { + for (const auto n : c10::irange(N)) { const auto* xdata = X.template data() + (n * dx); auto* ydata = Y->template mutable_data() + (n * dy); math::Col2Im( diff --git a/caffe2/operators/index_hash_ops.h b/caffe2/operators/index_hash_ops.h index df3eeb6a86714..c26331c381c73 100644 --- a/caffe2/operators/index_hash_ops.h +++ b/caffe2/operators/index_hash_ops.h @@ -2,6 +2,7 @@ #define CAFFE2_OPERATORS_INDEX_HASH_OPS_H_ #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" @@ -42,7 +43,7 @@ class IndexHashOp : public Operator { auto* indices_data = indices.template data(); auto* hashed_indices_data = hashed_indices->template mutable_data(); - for (auto i = 0; i < N; i++) { + for (const auto i : c10::irange(N)) { hashed_indices_data[i] = hash(indices_data[i]); } diff --git a/caffe2/operators/index_ops.h b/caffe2/operators/index_ops.h index e0c00286781e9..4a72bed7a661e 100644 --- a/caffe2/operators/index_ops.h +++ b/caffe2/operators/index_ops.h @@ -9,6 +9,7 @@ #include "caffe2/core/blob_serialization.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" +#include "c10/util/irange.h" namespace caffe2 { namespace { @@ -64,7 +65,7 @@ struct Index : IndexBase { } std::lock_guard lock(dictMutex_); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < numKeys; ++i) { + for (const auto i : c10::irange(numKeys)) { auto it = dict_.find(keys[i]); if (it != dict_.end()) { values[i] = it->second; @@ -84,7 +85,7 @@ struct Index : IndexBase { numKeys <= maxElements_, "Cannot load index: Tensor is larger than max_elements."); decltype(dict_) dict; - for (auto i = 0U; i < numKeys; ++i) { + for (const auto i : c10::irange(0U, numKeys)) { CAFFE_ENFORCE( dict.insert({keys[i], i + 1}).second, "Repeated elements found: cannot load into dictionary."); @@ -111,7 +112,7 @@ struct Index : IndexBase { private: void FrozenGet(const T* keys, int64_tValue* values, size_t numKeys) { - for (auto i = 0U; i < numKeys; ++i) { + for (const auto i : c10::irange(0U, numKeys)) { auto it = dict_.find(keys[i]); values[i] = it != dict_.end() ? 
it->second : 0; } diff --git a/caffe2/operators/inference_lstm_op.h b/caffe2/operators/inference_lstm_op.h index dbbe7d33ce290..2a7a4851ce24f 100644 --- a/caffe2/operators/inference_lstm_op.h +++ b/caffe2/operators/inference_lstm_op.h @@ -7,6 +7,7 @@ #include #include "caffe2/core/blob_serialization.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" #include "caffe2/core/tensor.h" #include "caffe2/utils/eigen_utils.h" @@ -125,7 +126,7 @@ struct FullLSTMLayer : Layer { std::vector step_outputs; auto hidden = copy_ctor(input_hidden); - for (size_t i = 0; i < step_inputs.size(); i++) { + for (const auto i : c10::irange(step_inputs.size())) { hidden = cell_(step_inputs[i], hidden, params); step_outputs.push_back(copy_ctor(std::get<0>(hidden))); } @@ -203,7 +204,7 @@ LayerOutput> apply_layer_stack( auto hidden_it = hiddens.begin(); auto weight_it = weights.begin(); std::vector final_hiddens(num_layers); - for (int64_t l = 0; l < num_layers; ++l) { + for (const auto l : c10::irange(num_layers)) { auto layer_output = layer(layer_input, *(hidden_it++), *(weight_it++)); final_hiddens.at(l) = std::move(layer_output.final_hidden); layer_input = std::move(layer_output.outputs); @@ -225,7 +226,7 @@ std::tuple _lstm_impl( int64_t total_layers = layer_hx.size(); std::vector> hiddens; hiddens.reserve(total_layers); - for (int64_t i = 0; i < total_layers; ++i) { + for (const auto i : c10::irange(total_layers)) { hiddens.emplace_back(std::move(layer_hx[i]), std::move(layer_cx[i])); } LSTMCell cell(context); diff --git a/caffe2/operators/key_split_ops.h b/caffe2/operators/key_split_ops.h index f3eb3cd47b2a6..3656547fa2b2e 100644 --- a/caffe2/operators/key_split_ops.h +++ b/caffe2/operators/key_split_ops.h @@ -1,11 +1,12 @@ #pragma once -#include - +#include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include + namespace caffe2 { template class KeySplitOp : public Operator { @@ -22,26 +23,26 @@ class KeySplitOp : public Operator { bool RunOnDevice() override { auto& keys = Input(0); - int N = keys.numel(); - const T* keys_data = keys.template data(); + const auto N = keys.numel(); + const T *const keys_data = keys.template data(); std::vector counts(categorical_limit_); std::vector eids(categorical_limit_); - for (int k = 0; k < categorical_limit_; k++) { + for (const auto k : c10::irange(categorical_limit_)) { counts[k] = 0; } - for (int i = 0; i < N; i++) { - int k = keys_data[i]; + for (const auto i : c10::irange(N)) { + const auto k = keys_data[i]; CAFFE_ENFORCE_GT(categorical_limit_, k); CAFFE_ENFORCE_GE(k, 0); counts[k]++; } - for (int k = 0; k < categorical_limit_; k++) { - auto* eid = Output(k, {counts[k]}, at::dtype()); + for (const auto k : c10::irange(categorical_limit_)) { + auto *const eid = Output(k, {counts[k]}, at::dtype()); eids[k] = eid->template mutable_data(); counts[k] = 0; } - for (int i = 0; i < N; i++) { - int k = keys_data[i]; + for (const auto i : c10::irange(N)) { + const auto k = keys_data[i]; eids[k][counts[k]++] = i; } return true; diff --git a/caffe2/operators/length_split_op.h b/caffe2/operators/length_split_op.h index a5bee082d77b9..7d65123ca6c6a 100644 --- a/caffe2/operators/length_split_op.h +++ b/caffe2/operators/length_split_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -55,11 +56,11 @@ class LengthsSplitOp final : public Operator { const int32_t* 
Ldata = L.template data(); int32_t* Ydata = Y->template mutable_data(); - for (int i = 0; i < M; i++) { + for (const auto i : c10::irange(M)) { int32_t mod = Ldata[i] % n_split_; int32_t res = mod != 0 ? math::DivUp(Ldata[i], n_split_) : Ldata[i] / n_split_ + 1; - for (int j = 0; j < n_split_; j++) { + for (const auto j : c10::irange(n_split_)) { Ydata[(i * n_split_) + j] = mod-- > 0 ? res : res - 1; } } diff --git a/caffe2/operators/lengths_pad_op.h b/caffe2/operators/lengths_pad_op.h index c0019b6f4ee3f..fed3eb0a2b4d8 100644 --- a/caffe2/operators/lengths_pad_op.h +++ b/caffe2/operators/lengths_pad_op.h @@ -3,6 +3,7 @@ #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -56,7 +57,7 @@ class LengthsPadOp : public Operator { math::Set( output->numel(), static_cast(padding_value_), out_data, &context_); - for (int64_t i = 0; i < lengths_size; ++i) { + for (const auto i : c10::irange(lengths_size)) { auto length = lengths_data[i]; CAFFE_ENFORCE_GE(length, 0); CAFFE_ENFORCE_GE( diff --git a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h index e12c1e9106950..db22264aff317 100644 --- a/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h @@ -121,7 +121,7 @@ class SparseLengthsFused8BitRowwiseOp : public Operator { auto indices_data = indices.template data(); int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { for (int i = 0; i < lengths_data[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); IndexType idx = indices_data[current]; diff --git a/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h b/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h index 58be36fb9bc56..a0dd91b34d94f 100644 --- a/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h +++ b/caffe2/operators/lengths_reducer_fused_nbit_rowwise_ops.h @@ -137,7 +137,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { // Error handling int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { for (int i = 0; i < lengths_data[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); IndexType idx = indices_data[current]; @@ -164,7 +164,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { << "Running slow path because FBGEMM is not available"; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(output_data, 0, block_size * sizeof(float)); if (current + lengths_data[m] > index_size) { return false; @@ -185,7 +185,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { const float scale = weight * scale_bias[0]; const float bias = weight * scale_bias[1]; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { uint8_t quantized = input_data[idx * data.size(1) + j / NUM_ELEM_PER_BYTE]; quantized >>= (j % NUM_ELEM_PER_BYTE) * BIT_RATE; @@ -196,7 +196,7 @@ class SparseLengthsFusedNBitRowwiseOp final : public Operator { } // for each i if (is_mean && lengths_data[m]) { float scale = 1.0f / lengths_data[m]; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { output_data[j] *= scale; } } @@ -284,13 +284,14 @@ class SparseLengthsSumSparseLookupOp final : public Operator { const IndexType compressed_data_size = compressed_indices_mapping.size(0); 
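// [Editor's illustration -- not part of the patch.] c10::irange deduces the
// index type from its bound, so iterating up to a size_t bound such as
// str.size() no longer compares signed against unsigned; that is why the
// given_tensor_byte_string_to_uint8_fill_op.h hunk earlier also drops its
// NOLINTNEXTLINE(clang-diagnostic-sign-compare) line. A minimal sketch:
#include <c10/util/irange.h>
#include <vector>
long sum_elements(const std::vector<long>& v) {
  long total = 0;
  for (const auto i : c10::irange(v.size())) { // i is size_t, matching v.size()
    total += v[i];
  }
  return total;
}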
IndexType current = 0; IndexType current_output = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { const auto current_length = lengths_data[m]; if (current + current_length > index_size) { return false; } int32_t skipped = 0; - for (int i = 0; i < current_length; ++i) { + for (const auto i : c10::irange(current_length)) { + (void)i; // Suppress unused variable warning IndexType compressed_idx = indices_data[current]; if (compressed_idx < 0 || compressed_idx >= compressed_data_size) { return false; @@ -554,7 +555,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { // Error handling int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { for (int i = 0; i < lengths_data[m]; ++i) { CAFFE_ENFORCE_LT(current, index_size); IndexType idx = indices_data[current]; @@ -592,7 +593,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { << "Running slow path because FBGEMM is not available"; int64_t current = 0; - for (int m = 0; m < output_size; ++m) { + for (const auto m : c10::irange(output_size)) { memset(output_data, 0, block_size * sizeof(float)); if (current + lengths_data[m] > index_size) { return false; @@ -632,7 +633,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { bias = weight * reinterpret_cast(scale_bias)[1]; } - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { uint8_t quantized = input_data[idx * data.size(1) + j / NUM_ELEM_PER_BYTE]; quantized >>= (j % NUM_ELEM_PER_BYTE) * BIT_RATE; @@ -643,7 +644,7 @@ class SparseLengthsNBitRowwiseSparseOp final : public Operator { } // for each i if (is_mean && lengths_data[m]) { float scale = 1.0f / lengths_data[m]; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { output_data[j] *= scale; } } diff --git a/caffe2/operators/lengths_reducer_ops.h b/caffe2/operators/lengths_reducer_ops.h index e01da722074cf..83199db8fe66e 100644 --- a/caffe2/operators/lengths_reducer_ops.h +++ b/caffe2/operators/lengths_reducer_ops.h @@ -1,4 +1,6 @@ #pragma once + +#include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/perfkernels/embedding_lookup.h" @@ -192,7 +194,7 @@ class CPUSparseLengthsReductionOp : public Operator { } int64_t current = 0; - for (int m = 0; m < M; ++m) { + for (const auto m : c10::irange(M)) { for (int i = 0; i < lengths[m]; ++i) { CAFFE_ENFORCE_LT( current, @@ -280,7 +282,7 @@ class TTSparseLengthsSumOp final : public Operator { emb_size(this->template GetSingleArgument("emb_size", 64)) { // cumprod of i, used for index slice l_cumprod.push_back(1); - for (size_t i = 1; i < factor_i.size(); ++i) { + for (const auto i : c10::irange(1, factor_i.size())) { l_cumprod.push_back(l_cumprod[i - 1] * factor_i[i - 1]); } } @@ -290,7 +292,7 @@ class TTSparseLengthsSumOp final : public Operator { void Ind2Sub(int64_t* out_factor_index, const int64_t* indices, int len) { // TODO: vectorization auto N = factor_i.size(); - for (int j = 0; j < len; j++) { + for (const auto j : c10::irange(len)) { auto idx = indices[j]; for (int i = N; i > 0; i--) { out_factor_index[j * N + i - 1] = idx / l_cumprod[i - 1]; @@ -307,7 +309,7 @@ class TTSparseLengthsSumOp final : public Operator { int idx) { // implement the functinality index_select(core, 1, ind_slice) auto num_of_elements = ranks[idx] * factor_j[idx] * ranks[idx + 1]; - for (int i = 0; i < bs; i++) { + for (const auto i : c10::irange(bs)) { 
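// [Editor's note -- not part of the patch.] c10::irange expresses only
// ascending half-open ranges, so descending scans, such as the
// "for (int j = idx_size - 1; j >= 0; j--)" loop left untouched in
// find_op.h earlier, stay index-based. A minimal sketch of a loop that
// cannot be converted (hypothetical names):
int last_match(const int* a, int n, int needle) {
  for (int j = n - 1; j >= 0; j--) { // descending: irange cannot express this
    if (a[j] == needle) {
      return j;
    }
  }
  return -1;
}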
memcpy( tgt_slice[i].data(), core + ind_slice[i] * num_of_elements, @@ -345,16 +347,16 @@ class TTSparseLengthsSumOp final : public Operator { // Store the intermediate result in each layer vector Z_ptr(bs); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { Y_ptr[b] = res[b].data(); Z_ptr[b] = int_res[b].data(); } vector ind_slice(bs); int rows = 0; - for (int i = 0; i < x_len; i++) { + for (const auto i : c10::irange(x_len)) { // slice cur - for (int j = 0; j < bs; j++) { + for (const auto j : c10::irange(bs)) { ind_slice[j] = ind[x_len * j + i]; } if (i == 0) { @@ -364,7 +366,7 @@ class TTSparseLengthsSumOp final : public Operator { std::vector> slice( bs, std::vector(ranks[i] * factor_j[i] * ranks[i + 1], 0)); vector X_ptr(bs); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { X_ptr[b] = slice[b].data(); } GetSlice(slice, cores[i], ind_slice, bs, i); @@ -382,7 +384,7 @@ class TTSparseLengthsSumOp final : public Operator { 0.0f, Z_ptr.data(), &context_); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { std::memcpy(Y_ptr[b], Z_ptr[b], (emb_size * max_rank) * sizeof(T)); } rows *= factor_j[i]; @@ -393,7 +395,7 @@ class TTSparseLengthsSumOp final : public Operator { if (i < 2) { auto* core_data = Output(i + 1, shape, at::dtype()); T* out_core = core_data->template mutable_data(); - for (int b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { std::memcpy( out_core + b * rows * ranks[i + 1], Y_ptr[b], @@ -404,7 +406,7 @@ class TTSparseLengthsSumOp final : public Operator { // reduction and store back to output vector cum_lengths(segments); - for (int seg = 0; seg < segments; seg++) { + for (const auto seg : c10::irange(segments)) { cum_lengths[seg] = seg == 0 ? lengths[0] : lengths[seg] + cum_lengths[seg - 1]; } @@ -549,7 +551,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { int64_t* index_out_data = index_out.template mutable_data(); vector> index_slice(bs, vector(3, 0)); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { memcpy(index_slice[b].data(), index_out_data + b * 3, 3 * sizeof(int64_t)); } @@ -563,7 +565,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { // expand the gradient into all indices vector> core2_out_grad(bs, vector(emb_size, 0)); int64_t data_index = 0; - for (int64_t range_index = 0; range_index < num_segments; ++range_index) { + for (const auto range_index : c10::irange(num_segments)) { for (int64_t start = data_index; data_index < start + lengths_data[range_index]; ++data_index) { @@ -582,7 +584,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { bs, vector(core2_shape[1] * core2_shape[2] * core2_shape[3], 0)); const T* core1_out_data = core1_out.template data(); // const T* core1_out_p[bs]; - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core1_out_data + b * core1_out.size(1) * core1_out.size(2); B_ptr[b] = core2_out_grad[b].data(); C_ptr[b] = dCore2_data_slice_grad[b].data(); @@ -609,8 +611,8 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core2_slice( bs, vector(core2_shape[1] * core2_shape[2] * core2_shape[3], 0)); - for (int64_t b = 0; b < bs; b++) { - for (int i = 0; i < num_of_elements; i++) { + for (const auto b : c10::irange(bs)) { + for (const auto i : c10::irange(num_of_elements)) { dCore2_data[index_slice[b][2] * num_of_elements + i] += C_ptr[b][i]; } memcpy( @@ -623,7 +625,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core1_out_grad( bs, 
vector(core1_out_shape[1] * core1_out_shape[2], 0)); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core2_out_grad[b].data(); B_ptr[b] = core2_slice[b].data(); C_ptr[b] = core1_out_grad[b].data(); @@ -650,7 +652,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> dCore1_data_slice_grad( bs, vector(core1_shape[1] * core1_shape[2] * core1_shape[3], 0)); const T* core0_out_data = core0_out.template data(); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core0_out_data + b * core0_out.size(1) * core0_out.size(2); B_ptr[b] = core1_out_grad[b].data(); C_ptr[b] = dCore1_data_slice_grad[b].data(); @@ -676,8 +678,8 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core1_slice( bs, vector(core1_shape[1] * core1_shape[2] * core1_shape[3], 0)); - for (int64_t b = 0; b < bs; b++) { - for (int i = 0; i < num_of_elements; i++) { + for (const auto b : c10::irange(bs)) { + for (const auto i : c10::irange(num_of_elements)) { dCore1_data[index_slice[b][1] * num_of_elements + i] += C_ptr[b][i]; } memcpy( @@ -690,7 +692,7 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { vector> core0_out_grad( bs, vector(core0_out_shape[1] * core0_out_shape[2], 0)); - for (int64_t b = 0; b < bs; b++) { + for (const auto b : c10::irange(bs)) { A_ptr[b] = core1_out_grad[b].data(); B_ptr[b] = core1_slice[b].data(); C_ptr[b] = core0_out_grad[b].data(); @@ -712,8 +714,8 @@ bool TTSparseLengthsSumGradientOp::RunOnDevice() { num_of_elements = core0_shape[1] * core0_shape[2] * core0_shape[3]; - for (int64_t b = 0; b < bs; b++) { - for (int i = 0; i < num_of_elements; i++) { + for (const auto b : c10::irange(bs)) { + for (const auto i : c10::irange(num_of_elements)) { dCore0_data[index_slice[b][0] * num_of_elements + i] += C_ptr[b][i]; } } diff --git a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h index 67d792f4ca23c..acf5d442ca724 100644 --- a/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h +++ b/caffe2/operators/lengths_reducer_rowwise_8bit_ops.h @@ -110,7 +110,7 @@ class FloatToRowwiseQuantized8BitsOp : public Operator { float* scale_bias_data = scale_bias->template mutable_data(); size_t n_blocks = input.size(0); size_t block_size = input.size_from_dim(1); - for (size_t i = 0; i < n_blocks; ++i) { + for (const auto i : c10::irange(n_blocks)) { ConstEigenVectorArrayMap input_row( input_data + i * block_size, block_size); EigenVectorArrayMap output_row( @@ -164,7 +164,7 @@ class Rowwise8BitQuantizedToFloatOp : public Operator { size_t block_size = input.size_from_dim(1); size_t n_blocks = input.size(0); - for (size_t i = 0; i < n_blocks; ++i) { + for (const auto i : c10::irange(n_blocks)) { ConstEigenVectorArrayMap input_row( input_data + i * block_size, block_size); EigenVectorArrayMap output_row( diff --git a/caffe2/operators/load_save_op.h b/caffe2/operators/load_save_op.h index 64a90eb0c442b..78fd0b9d51337 100644 --- a/caffe2/operators/load_save_op.h +++ b/caffe2/operators/load_save_op.h @@ -5,6 +5,8 @@ #include #include + +#include #include #include "caffe2/core/blob_serialization.h" #include "caffe2/core/context.h" @@ -129,13 +131,13 @@ class LoadOp final : public Operator { int total_loaded_blobs = 0; std::unordered_map blob_states; if (InputSize() > 0) { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const db::DBReader& reader = this->template Input(i); extract(i, reader.cursor(), &blob_states, 
&total_loaded_blobs); } } else { // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 0; i < db_names_.size(); ++i) { + for (const auto i : c10::irange(db_names_.size())) { string full_db_name = absolute_path_ ? db_names_[i] : (ws_->RootFolder() + "/" + db_names_[i]); diff --git a/caffe2/operators/locally_connected_op_impl.h b/caffe2/operators/locally_connected_op_impl.h index df05cad403e92..4c6312ab3a5f9 100644 --- a/caffe2/operators/locally_connected_op_impl.h +++ b/caffe2/operators/locally_connected_op_impl.h @@ -45,7 +45,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNCHW() { shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); const std::vector output_image_dims = GetDims(*Y); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } @@ -82,7 +82,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNCHW() { if (InputSize() == 3) { const auto& bias = Input(BIAS); CAFFE_ENFORCE_EQ(bias.dim(), image_ndim + 1); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(bias.dim32(i), output_image_dims[i]); } CAFFE_ENFORCE_EQ(bias.dim32(image_ndim), shape.M); @@ -129,7 +129,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNHWC() { shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(*Y); const std::vector output_image_dims = GetDims(*Y); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } @@ -159,7 +159,7 @@ bool LocallyConnectedOp::RunOnDeviceWithOrderNHWC() { if (InputSize() == 3) { const auto& bias = Input(BIAS); CAFFE_ENFORCE_EQ(bias.dim(), image_ndim + 1); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(bias.dim32(i), output_image_dims[i]); } CAFFE_ENFORCE_EQ(bias.dim32(image_ndim), shape.M); @@ -200,8 +200,9 @@ void LocallyConnectedOp::RunOnDeviceWithOrderNCHWImpl( T* column_buffer_data = column_buffer->template mutable_data(); T* Y_transposed_buffer_data = Y_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { if (kernel_.size() == 2) { math::Im2Col( shape.C / group_, @@ -302,7 +303,7 @@ void LocallyConnectedOp::RunOnDeviceWithOrderNHWCImpl( Y_transposed_buffer->Resize(shape.Y_transposed_dims); T* column_buffer_data = column_buffer->template mutable_data(); T* Y_transposed_buffer_data = Y_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { + for (const auto image_id : c10::irange(shape.N)) { math::Im2Col( shape.C, shape.X_dims[0], @@ -387,7 +388,7 @@ bool LocallyConnectedGradientOp::RunOnDeviceWithOrderNCHW() { shape.input_image_size = GetDimsSize(X); const std::vector output_image_dims = GetDims(dY); shape.output_image_size = GetDimsSize(dY); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } ConvPoolOpBase::ComputePads(input_image_dims); @@ -484,7 +485,7 @@ bool LocallyConnectedGradientOp::RunOnDeviceWithOrderNHWC() { shape.input_image_size = GetDimsSize(X); shape.output_image_size = GetDimsSize(dY); const 
std::vector output_image_dims = GetDims(dY); - for (int i = 0; i < image_ndim; ++i) { + for (const auto i : c10::irange(image_ndim)) { CAFFE_ENFORCE_EQ(output_image_dims[i], filter.dim32(i)); } @@ -568,8 +569,9 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNCHWImpl( T* dY_transposed_buffer_data = dY_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { if (kernel_.size() == 2) { math::Im2Col( shape.C / group_, @@ -681,8 +683,9 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNCHWImpl( column_buffer->template mutable_data(), &context_); const T* const_column_buffer_data = column_buffer->template data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { - for (int group_id = 0; group_id < group_; ++group_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning + for (const auto group_id : c10::irange(group_)) { if (kernel_.size() == 2) { math::Col2Im( shape.C / group_, @@ -743,7 +746,7 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNHWCImpl( T* column_buffer_data = column_buffer->template mutable_data(); T* dY_transposed_buffer_data = dY_transposed_buffer->template mutable_data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { + for (const auto image_id : c10::irange(shape.N)) { math::Im2Col( shape.C, shape.X_dims[0], @@ -835,7 +838,8 @@ void LocallyConnectedGradientOp::RunOnDeviceWithOrderNHWCImpl( column_buffer->template mutable_data(), &context_); const T* const_column_buffer_data = column_buffer->template data(); - for (int image_id = 0; image_id < shape.N; ++image_id) { + for (const auto image_id : c10::irange(shape.N)) { + (void)image_id; // Suppress unused variable warning math::Col2Im( shape.C, shape.X_dims[0], diff --git a/caffe2/operators/lstm_utils.h b/caffe2/operators/lstm_utils.h index 0f564792215b6..cd2da4224ed0c 100644 --- a/caffe2/operators/lstm_utils.h +++ b/caffe2/operators/lstm_utils.h @@ -78,7 +78,7 @@ template static std::vector unpair_vec(std::vector>&& vals) { std::vector result; result.reserve(vals.size() * 2); - for (int64_t i = 0; i < vals.size(); i++) { + for (const auto i : c10::irange(vals.size())) { result.push_back(std::move(vals[i].first)); result.push_back(std::move(vals[i].second)); } @@ -150,7 +150,7 @@ chunk(const Tensor& input, int chunks, int axis, CPUContext* context) { auto split_size = input_channels / chunks; vector output_dims(input.sizes().vec()); int before = 1, after = 1; - for (int i = 0; i < canonical_axis; ++i) { + for (const auto i : c10::irange(canonical_axis)) { before *= input.dim32(i); } for (int i = canonical_axis + 1; i < input.dim(); ++i) { @@ -158,7 +158,8 @@ chunk(const Tensor& input, int chunks, int axis, CPUContext* context) { } size_t input_offset = 0; std::vector outputs; - for (int i = 0; i < chunks; ++i) { + for (const auto i : c10::irange(chunks)) { + (void)i; // Suppress unused variable warning auto axis_dim = split_size; output_dims[canonical_axis] = split_size; Tensor output(output_dims, CPU); @@ -187,7 +188,7 @@ std::vector unbind(const Tensor& input, int axis, CPUContext* context) { newDims.erase(newDims.begin() + axis); // 3 - Reshape chunks to drop the extra dimension - for (int i = 0; i < chunks.size(); i++) { + for (const auto i : 
c10::irange(chunks.size())) { CAFFE_ENFORCE_EQ( chunks[i].sizes()[axis], 1, "Got an unexpected chunk size"); chunks[i].Reshape(newDims); @@ -201,14 +202,14 @@ cat(const std::vector& tensorList, int axis, CPUContext* context) { auto input_zero = copy_ctor(tensorList.at(0)); vector outputDims(input_zero.sizes().vec()); CAFFE_ENFORCE(outputDims.size() > 0); - for (int i = 1; i < tensorList.size(); i++) { + for (const auto i : c10::irange(1, tensorList.size())) { CAFFE_ENFORCE(input_zero.dtype() == tensorList.at(i).dtype()); outputDims[axis] += tensorList.at(i).sizes()[axis]; } auto output_channels = outputDims[axis]; Tensor output(outputDims, CPU); int before = 1, after = 1; - for (int i = 0; i < tensorList.at(0).dim(); ++i) { + for (const auto i : c10::irange(tensorList.at(0).dim())) { if (i == axis) { continue; } @@ -245,7 +246,7 @@ stack(const std::vector& tensorList, int axis, CPUContext* context) { std::vector newDims(tensorList[0].sizes().vec()); std::vector expandedTensorList; newDims.insert(newDims.begin() + axis, 1); - for (int i = 0; i < tensorList.size(); i++) { + for (const auto i : c10::irange(tensorList.size())) { expandedTensorList.emplace_back(tensorList[i].Clone()); expandedTensorList.at(i).Reshape(newDims); } @@ -301,7 +302,7 @@ Tensor transpose(const Tensor& X, int dim0, int dim1, CPUContext* context) { std::swap(axes[dim0], axes[dim1]); const std::vector X_dims = X.sizes().vec(); std::vector Y_dims(ndim); - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { Y_dims[i] = X_dims[axes[i]]; } Tensor Y(Y_dims, CPU); diff --git a/caffe2/operators/map_ops.h b/caffe2/operators/map_ops.h index f870172a69fab..df3364132ed8b 100644 --- a/caffe2/operators/map_ops.h +++ b/caffe2/operators/map_ops.h @@ -1,6 +1,11 @@ #ifndef CAFFE2_OPERATORS_MAP_OPS_H_ #define CAFFE2_OPERATORS_MAP_OPS_H_ +#include "caffe2/core/blob_serialization.h" +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" +#include + #include #include #include @@ -9,10 +14,6 @@ #include #include -#include "caffe2/core/blob_serialization.h" -#include "caffe2/core/context.h" -#include "caffe2/core/operator.h" - namespace caffe2 { template @@ -130,7 +131,7 @@ class KeyValueToMapOp final : public Operator { auto* map_data = this->template Output(MAP); - for (int i = 0; i < key_input.numel(); ++i) { + for (const auto i : c10::irange(key_input.numel())) { map_data->emplace(key_data[i], value_data[i]); } @@ -257,7 +258,7 @@ class MapDeserializer : public BlobDeserializerBase { auto* value_data = value_tensor.data(); auto* map_ptr = blob->template GetMutable(); - for (int i = 0; i < key_tensor.numel(); ++i) { + for (const auto i : c10::irange(key_tensor.numel())) { map_ptr->emplace(key_data[i], value_data[i]); } } diff --git a/caffe2/operators/mean_op.h b/caffe2/operators/mean_op.h index beb0b0440505d..19f2836251141 100644 --- a/caffe2/operators/mean_op.h +++ b/caffe2/operators/mean_op.h @@ -8,6 +8,7 @@ #include "caffe2/core/types.h" #include "caffe2/utils/math.h" #include "caffe2/utils/proto_utils.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -29,7 +30,7 @@ class MeanOp final : public Operator { } // Dimension checking - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { if (output->sizes() != Input(i).sizes()) { CAFFE_THROW( "Check failed: output->sizes() == Input(i).sizes().", @@ -43,7 +44,7 @@ class MeanOp final : public Operator { } T* output_data = output->template mutable_data(); - for (int i = 1; i < InputSize(); ++i) { + for (const auto 
i : c10::irange(1, InputSize())) { math::Add( output->numel(), output_data, @@ -101,7 +102,7 @@ class MeanGradientOp : public Operator { size, scale, dY_data, dX0->template mutable_data(), &context_); // Copy the rest dX - for (int i = 1; i < num_inputs; i++) { + for (const auto i : c10::irange(1, num_inputs)) { auto* cur_dX = Output(i); cur_dX->ResizeLike(dY); cur_dX->CopyFrom(*dX0, true /*async*/); diff --git a/caffe2/operators/merge_id_lists_op.h b/caffe2/operators/merge_id_lists_op.h index e01abbecc486f..6619a57ca2f87 100644 --- a/caffe2/operators/merge_id_lists_op.h +++ b/caffe2/operators/merge_id_lists_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(MergeIdLists); @@ -50,7 +51,7 @@ class MergeIdListsOp : public Operator { // TODO(badri): Use unordered_set if performance is an issue std::set deduped; std::vector offsets(InputSize(), 0); - for (auto sample = 0; sample < batch_size; sample++) { + for (const auto sample : c10::irange(batch_size)) { for (size_t i = 0; i < InputSize(); i += 2) { auto& lengths = Input(i); const auto* lengths_data = lengths.template data(); diff --git a/caffe2/operators/minmax_ops.h b/caffe2/operators/minmax_ops.h index 2191a96fccaeb..4d4e3aa4b5bd0 100644 --- a/caffe2/operators/minmax_ops.h +++ b/caffe2/operators/minmax_ops.h @@ -6,6 +6,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/types.h" #include "caffe2/utils/math.h" +#include namespace caffe2 { @@ -39,7 +40,7 @@ class MaxOp final : public Operator { Y->sizes()); const T* X1_data = X1.template data(); math::Max(N, X0_data, X1_data, Y_data, &context_); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { const auto& Xi = Input(i); CAFFE_ENFORCE_EQ( Xi.sizes(), @@ -87,7 +88,7 @@ class MinOp final : public Operator { Y->sizes()); const T* X1_data = X1.template data(); math::Min(N, X0_data, X1_data, Y_data, &context_); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { const auto& Xi = Input(i); CAFFE_ENFORCE_EQ( Xi.sizes(), diff --git a/caffe2/operators/moments_op.h b/caffe2/operators/moments_op.h index 136c4b10e9b0c..b77b515619fcb 100644 --- a/caffe2/operators/moments_op.h +++ b/caffe2/operators/moments_op.h @@ -1,12 +1,13 @@ #ifndef CAFFE2_OPERATORS_MOMENTS_OP_H_ #define CAFFE2_OPERATORS_MOMENTS_OP_H_ -#include -#include - #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include + +#include +#include namespace caffe2 { @@ -45,7 +46,7 @@ class MomentsOp final : public Operator { std::vector output_dims; output_dims.reserve(ndim); std::size_t cur_axis = 0; - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { if (cur_axis < axes_.size() && i == axes_[cur_axis]) { if (keep_dims_) { output_dims.push_back(1); diff --git a/caffe2/operators/ngram_ops.h b/caffe2/operators/ngram_ops.h index 162338a8ac416..ca2104cf3584e 100644 --- a/caffe2/operators/ngram_ops.h +++ b/caffe2/operators/ngram_ops.h @@ -1,10 +1,11 @@ #pragma once -#include - #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" + +#include namespace caffe2 { template @@ -35,9 +36,9 @@ class NGramFromCategoricalOp : public Operator { } int base = 1; int idx = 0; - for (int k = 0; k < col_num_; k++) { + for (const auto k : c10::irange(col_num_)) { int l = 
categorical_limits_[k]; - for (int m = 0; m < l; m++) { + for (const auto m : c10::irange(l)) { int v = vals_[idx++]; ngram_maps_[k][v] = m * base; } @@ -56,8 +57,8 @@ class NGramFromCategoricalOp : public Operator { math::Set(output->numel(), 0, output_data, &context_); CAFFE_ENFORCE_GT(D, max_col_id_); - for (int i = 0; i < N; i++) { - for (int k = 0; k < col_num_; k++) { + for (const auto i : c10::irange(N)) { + for (const auto k : c10::irange(col_num_)) { int j = col_ids_[k]; int v = round(floats_data[i * D + j]); // for out-of-vocabulary values, we always treat them the same as the diff --git a/caffe2/operators/normalize_op.h b/caffe2/operators/normalize_op.h index ae1bb0f57f33c..1b4be7c8d20fb 100644 --- a/caffe2/operators/normalize_op.h +++ b/caffe2/operators/normalize_op.h @@ -48,7 +48,7 @@ class NormalizeOp final : public Operator { using ConstStridedVec = Eigen::Map, 0, InnerStride>; - for (int i = 0; i < n; ++i) { + for (const auto i : c10::irange(n)) { auto base = (i / sf) * sf * m + (i % sf); ConstStridedVec xVec(xData + base, 1, m, InnerStride(sf)); auto norm = xVec.template lpNorm<2>(); diff --git a/caffe2/operators/numpy_tile_op.h b/caffe2/operators/numpy_tile_op.h index 7fc745afccfc4..f5e39fc93cad9 100644 --- a/caffe2/operators/numpy_tile_op.h +++ b/caffe2/operators/numpy_tile_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -33,7 +34,7 @@ class NumpyTileOp : public Operator { " number of elements as `inputs` has dimensions."); const int64_t* repeats_data = repeats.template data(); // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (size_t i = 0; i < repeats.numel(); ++i) { + for (const auto i : c10::irange(repeats.numel())) { CAFFE_ENFORCE_GE(repeats_data[i], 0); } @@ -45,7 +46,7 @@ class NumpyTileOp : public Operator { Tensor *src = &buffer, *dst = output; src->CopyFrom(input); vector output_dims(input.sizes().vec()); - for (size_t i = 0; i < repeats.numel(); ++i) { + for (const auto i : c10::irange(repeats.numel())) { if (repeats_data[i] == 1) { continue; } @@ -100,8 +101,10 @@ class NumpyTileOp : public Operator { int64_t num_tiles, const char* input_data, char* output_data) { - for (auto i = 0; i < outer_dim; ++i) { - for (auto t = 0; t < num_tiles; ++t) { + for (const auto i : c10::irange(outer_dim)) { + (void)i; // Suppress unused variable warning + for (const auto t : c10::irange(num_tiles)) { + (void)t; // Suppress unused variable warning context_.CopyItemsSameDevice(meta, inner_dim, input_data, output_data); output_data += inner_dim * item_size; } diff --git a/caffe2/operators/onnx_while_op.h b/caffe2/operators/onnx_while_op.h index 85b5d87ef1f30..5596eb8cf6dca 100644 --- a/caffe2/operators/onnx_while_op.h +++ b/caffe2/operators/onnx_while_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/operators/create_scope_op.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -94,7 +95,7 @@ class ONNXWhileOp final : public Operator { "outputs"); // Copy initial loop-carried dependencies - for (int i = 0; i < num_loop_carried_deps; ++i) { + for (const auto i : c10::irange(num_loop_carried_deps)) { scope_->lcd_tensor(i)->CopyFrom(Input(i + num_inputs_before_lcds)); } @@ -126,7 +127,7 @@ class ONNXWhileOp final : public Operator { }; // Allocate scan_outputs for zero-iteration case - for (int i = 0; i < num_scan_outputs; ++i) { + for (const auto i : c10::irange(num_scan_outputs)) { Output(i + 
num_loop_carried_deps)->Resize(0); Output(i + num_loop_carried_deps)->template mutable_data(); } @@ -154,13 +155,13 @@ class ONNXWhileOp final : public Operator { } // Copy forward loop-carried dependencies - for (int i = 0; i < num_loop_carried_deps; ++i) { + for (const auto i : c10::irange(num_loop_carried_deps)) { Blob* b = cur_ws->GetBlob(scope_->net()->external_output()[i + 1]); const Tensor& t = b->template Get(); scope_->lcd_tensor(i)->CopyFrom(t); } // Copy out scan_outputs - for (int i = 0; i < num_scan_outputs; ++i) { + for (const auto i : c10::irange(num_scan_outputs)) { int net_output_idx = i + 1 + num_loop_carried_deps; const Tensor& scan_output = cur_ws->GetBlob(scope_->net()->external_output()[net_output_idx]) @@ -202,7 +203,7 @@ class ONNXWhileOp final : public Operator { } // Copy out final loop-carried dependencies - for (int i = 0; i < num_loop_carried_deps; ++i) { + for (const auto i : c10::irange(num_loop_carried_deps)) { Output(i)->CopyFrom(*scope_->lcd_tensor(i)); } diff --git a/caffe2/operators/op_utils_cudnn.h b/caffe2/operators/op_utils_cudnn.h index 0ea76855b8430..ca5c19e629182 100644 --- a/caffe2/operators/op_utils_cudnn.h +++ b/caffe2/operators/op_utils_cudnn.h @@ -36,7 +36,7 @@ inline void LogCuDNNPerfStats( const ArrayOfcudnnConvolutionAlgoPerf_t& perf_stat, int returned_algo_count) { VLOG(1) << "Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { + for (const auto i : c10::irange(returned_algo_count)) { const auto& stat = perf_stat[i]; VLOG(1) << stat.algo << ": " << stat.status << " " << stat.time << " " << stat.memory; diff --git a/caffe2/operators/operator_fallback_gpu.h b/caffe2/operators/operator_fallback_gpu.h index 72b430f00d361..a728b79b4916f 100644 --- a/caffe2/operators/operator_fallback_gpu.h +++ b/caffe2/operators/operator_fallback_gpu.h @@ -62,7 +62,7 @@ class GPUFallbackOpEx final : public Operator { } bool RunOnDevice() override { - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { if (this->InputIsTensorType(i, CUDA)) { // use sync copy BlobGetMutableTensor(local_input_blobs_[i], CPU)->CopyFrom(Input(i)); @@ -82,7 +82,7 @@ class GPUFallbackOpEx final : public Operator { << ProtoDebugString(this->debug_def()); return false; } - for (int i = 0; i < OutputSize(); ++i) { + for (const auto i : c10::irange(OutputSize())) { if (SkipOutputCopy::Contains(i)) { VLOG(1) << "Copy output: index " << i << " skipped."; continue; diff --git a/caffe2/operators/order_switch_ops.h b/caffe2/operators/order_switch_ops.h index 2dab5f72e8ca6..90c522d82f004 100644 --- a/caffe2/operators/order_switch_ops.h +++ b/caffe2/operators/order_switch_ops.h @@ -1,10 +1,11 @@ #ifndef CAFFE2_OPERATORS_ORDER_SWITCH_OPS_H_ #define CAFFE2_OPERATORS_ORDER_SWITCH_OPS_H_ -#include - #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include + +#include namespace caffe2 { @@ -30,7 +31,7 @@ class NHWC2NCHWOp final : public Operator { Y_dims[0] = N; Y_dims[1] = C; int HxW = 1; - for (int i = 2; i < ndim; ++i) { + for (const auto i : c10::irange(2, ndim)) { Y_dims[i] = X.dim32(i - 1); HxW *= Y_dims[i]; } diff --git a/caffe2/operators/pack_rnn_sequence_op.h b/caffe2/operators/pack_rnn_sequence_op.h index 0dc597d20ee61..9d883f8e957e5 100644 --- a/caffe2/operators/pack_rnn_sequence_op.h +++ b/caffe2/operators/pack_rnn_sequence_op.h @@ -1,11 +1,13 @@ #ifndef CAFFE2_OPERATORS_PACK_RNN_SEQUENCE_OP_H_ #define CAFFE2_OPERATORS_PACK_RNN_SEQUENCE_OP_H_ -#include -#include #include "caffe2/core/context.h" 
#include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" + +#include +#include namespace caffe2 { @@ -69,7 +71,7 @@ class PackRNNSequenceOpBase : public Operator { math::Set(output->numel(), 0, output_data, &context_); int32_t offset = 0; - for (int c = 0; c < cols; c++) { + for (const auto c : c10::irange(cols)) { for (int r = 0; r < lengths_vec[c]; r++) { auto input_offset = Forward ? (offset + r) : (r * cols + c); auto output_offset = Forward ? (r * cols + c) : (offset + r); diff --git a/caffe2/operators/partition_ops.h b/caffe2/operators/partition_ops.h index fa8a27c39605e..9abe18befd2fc 100644 --- a/caffe2/operators/partition_ops.h +++ b/caffe2/operators/partition_ops.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -48,7 +49,7 @@ class GatherByKeyOp : public Operator { CAFFE_ENFORCE_GE(outShape.size(), 1); auto totalSize = in0Shape[0]; auto meta = Input(1).dtype(); - for (int i = 2; i < InputSize(); ++i) { + for (const auto i : c10::irange(2, InputSize())) { const auto& input = Input(i); CAFFE_ENFORCE(meta == input.dtype()); CAFFE_ENFORCE_GE(input.dim(), 1); @@ -66,7 +67,7 @@ class GatherByKeyOp : public Operator { const auto blockSize = outTensor->size_from_dim(1); inputDatas_.resize(numPartitions); - for (int i = 0; i < numPartitions; ++i) { + for (const auto i : c10::irange(numPartitions)) { inputDatas_[i] = static_cast(Input(i + 1).raw_data()); } inStartOffsets_.assign(numPartitions, 0); @@ -127,7 +128,7 @@ class PartitionOpBase : public Operator { int64_t size = main_input.numel(); const Index* data = main_input.template data(); counts_.assign(partitions, 0); - for (int64_t p = 0; p < size; p++) { + for (const auto p : c10::irange(size)) { int shard = moduloPartition(data[p], partitions); ++counts_[shard]; } @@ -136,7 +137,7 @@ class PartitionOpBase : public Operator { block_sizes_.resize(inputSize); metas_.resize(inputSize); out_datas_.resize(OutputSize()); - for (int i = mainInputIndex; i < inputSize; ++i) { + for (const auto i : c10::irange(mainInputIndex, inputSize)) { auto& input = Input(i); if (i > mainInputIndex) { CAFFE_ENFORCE_GE( @@ -145,7 +146,7 @@ class PartitionOpBase : public Operator { "Prefix of extra input's shape must match main input's shape, ", "input: ", i); - for (int j = 0; j < main_input.dim(); ++j) { + for (const auto j : c10::irange(main_input.dim())) { CAFFE_ENFORCE_GE( input.size(j), main_input.size(j), @@ -162,7 +163,7 @@ class PartitionOpBase : public Operator { // shape = partition_size + suffix of input dims vector shape( input.sizes().begin() + main_input.dim() - 1, input.sizes().end()); - for (int j = 0; j < partitions; ++j) { + for (const auto j : c10::irange(partitions)) { int out_idx = i + j * inputSize; auto output = Output(out_idx); shape[0] = counts_[j]; @@ -172,7 +173,7 @@ class PartitionOpBase : public Operator { } counts_.assign(partitions, 0); - for (int64_t p = 0; p < size; p++) { + for (const auto p : c10::irange(size)) { int shard = moduloPartition(data[p], partitions); int64_t idx = counts_[shard]++; @@ -254,7 +255,7 @@ class LengthsPartitionOp : public PartitionOpBase { if (partitions == 1) { // Specialization when partitions == 1 which just becomes a copy. 
- for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { auto& input = Input(i); auto& output = *Output(i); output.ResizeLike(input); @@ -279,14 +280,14 @@ class LengthsPartitionOp : public PartitionOpBase { int64_t elements = length_input.numel(); const int32_t* lengths_data = length_input.template data(); out_length_.resize(partitions); - for (int i = 0; i < partitions; ++i) { + for (const auto i : c10::irange(partitions)) { auto& output = *Output(i * InputSize()); output.Resize(elements); out_length_[i] = output.template mutable_data(); } int total_length = 0; - for (int i = 0; i < elements; ++i) { + for (const auto i : c10::irange(elements)) { total_length += lengths_data[i]; } CAFFE_ENFORCE( @@ -294,8 +295,8 @@ class LengthsPartitionOp : public PartitionOpBase { "Total length is not matching to the number of elements"); int index = 0; - for (int i = 0; i < elements; ++i) { - for (int j = 0; j < partitions; ++j) { + for (const auto i : c10::irange(elements)) { + for (const auto j : c10::irange(partitions)) { out_length_[j][i] = 0; } for (int j = 0; j < lengths_data[i]; ++j, ++index) { diff --git a/caffe2/operators/piecewise_linear_transform_op.h b/caffe2/operators/piecewise_linear_transform_op.h index d1fdc65369706..9dcf0021f1c2e 100644 --- a/caffe2/operators/piecewise_linear_transform_op.h +++ b/caffe2/operators/piecewise_linear_transform_op.h @@ -3,6 +3,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/export_caffe2_op_to_c10.h" +#include #include "caffe2/core/operator.h" C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(PiecewiseLinearTransform); @@ -61,7 +62,7 @@ class PiecewiseLinearTransformOp final : public Operator { const int64_t num_bounds_per_group, const int64_t num_group) { const T* start = bounds; - for (int64_t i = 0; i < num_group; i++) { + for (const auto i : c10::irange(num_group)) { if (!std::is_sorted(start, start + num_bounds_per_group)) { return false; } @@ -153,11 +154,11 @@ class PiecewiseLinearTransformOp final : public Operator { &bounds, &slopes, &intercepts, &num_func_per_group, &num_group); CAFFE_ENFORCE_EQ(num_group, M); - for (int64_t j = 0; j < M; ++j) { + for (const auto j : c10::irange(M)) { const T* bounds_group = bounds + j * (num_func_per_group + 1); const T* slopes_group = slopes + j * num_func_per_group; const T* intercepts_group = intercepts + j * num_func_per_group; - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { Ydata[i * M + j] = PiecewiseLinearTransform( Xdata[i * M + j], bounds_group, @@ -192,12 +193,12 @@ class PiecewiseLinearTransformOp final : public Operator { CAFFE_ENFORCE_EQ(num_group, 1); if (M == 1) { - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { Ydata[i] = PiecewiseLinearTransform( Xdata[i], bounds, slopes, intercepts, num_func_per_group); } } else { - for (int64_t i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { Ydata[i * M + 1] = PiecewiseLinearTransform( Xdata[i * M + 1], bounds, slopes, intercepts, num_func_per_group); Ydata[i * M] = 1.0f - Ydata[i * M + 1]; diff --git a/caffe2/operators/pool_op.h b/caffe2/operators/pool_op.h index 77d0f0659eb14..855d69404e6a9 100644 --- a/caffe2/operators/pool_op.h +++ b/caffe2/operators/pool_op.h @@ -20,12 +20,12 @@ class PoolOp final : public ConvPoolOpBase { explicit PoolOp(Args&&... 
args) : ConvPoolOpBase(std::forward(args)...), functor_(*this) { const int kernel_size = kernel_.size(); - for (int i = 0; i < kernel_size; ++i) { + for (const auto i : c10::irange(kernel_size)) { CAFFE_ENFORCE_EQ( dilation_[i], 1, "Pooling op does not support dilation right now."); } if (!global_pooling_) { - for (int i = 0; i < kernel_size; ++i) { + for (const auto i : c10::irange(kernel_size)) { CAFFE_ENFORCE( pads_[i] < kernel_[i] && pads_[i + kernel_size] < kernel_[i], "Pad should be smaller than kernel."); diff --git a/caffe2/operators/prepend_dim_op.h b/caffe2/operators/prepend_dim_op.h index cf425942a5100..79f1fe778647c 100644 --- a/caffe2/operators/prepend_dim_op.h +++ b/caffe2/operators/prepend_dim_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" +#include namespace caffe2 { @@ -35,7 +36,7 @@ class PrependDimOp : public Operator { actual_new_shape[0] = dim_size_; actual_new_shape[1] = input.size(0) / dim_size_; // NOLINTNEXTLINE(clang-diagnostic-sign-compare) - for (int i = 1; i < input.sizes().size(); ++i) { + for (const auto i : c10::irange(1, input.sizes().size())) { actual_new_shape[i + 1] = input.size(i); } output->Resize(actual_new_shape); diff --git a/caffe2/operators/quant_decode_op.h b/caffe2/operators/quant_decode_op.h index 560f6abd3f6d4..1eeb4f2db8ad2 100644 --- a/caffe2/operators/quant_decode_op.h +++ b/caffe2/operators/quant_decode_op.h @@ -1,6 +1,8 @@ #ifndef QUANT_DECODE_OP_H_ #define QUANT_DECODE_OP_H_ + +#include #include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" @@ -34,7 +36,7 @@ void Decode( } int sz = output->numel(); - for (int i = 0; i < sz; i++) { + for (const auto i : c10::irange(sz)) { DCHECK_LE(*code_ptr, cb_size); *out_ptr++ = cb_ptr[*code_ptr++]; } @@ -116,7 +118,7 @@ class QuantDecodeOp final : public Operator { const auto& codebook = Input(0); CAFFE_ENFORCE(codebook.template IsType(), codebook.dtype().name()); - for (int i = 0; i < OutputSize(); i++) { + for (const auto i : c10::irange(OutputSize())) { auto& ci = Input(i + 1); auto* co = Output(i); @@ -157,7 +159,7 @@ class QuantDecodeGradientOp final : public Operator { auto* gradient_ptr = gradient->template mutable_data(); std::fill(gradient_ptr, gradient_ptr + gradient->numel(), 0); - for (int i = 0; i < num_code_tensors; i++) { + for (const auto i : c10::irange(num_code_tensors)) { auto& codes_i = Input(i + 1); auto& output_gradient_i = Input(i + num_code_tensors + 1); DecodeGeneral(codebook, codes_i, &output_gradient_i, gradient, false); diff --git a/caffe2/operators/quantile_op.h b/caffe2/operators/quantile_op.h index 165addc137c3c..34d2835f9d676 100644 --- a/caffe2/operators/quantile_op.h +++ b/caffe2/operators/quantile_op.h @@ -1,8 +1,10 @@ #pragma once +#include "caffe2/core/operator.h" +#include "c10/util/irange.h" + #include #include -#include "caffe2/core/operator.h" namespace caffe2 { @@ -42,7 +44,7 @@ class QuantileOp final : public Operator { auto& input_zero = Input(0); int64_t numel = input_zero.numel(); - for (int i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { CAFFE_ENFORCE_EQ( Input(i).dtype(), input_zero.dtype(), @@ -116,9 +118,9 @@ class QuantileOp final : public Operator { void GetRangeFromInputs(T* lo, T* hi) { *hi = std::numeric_limits::lowest(); *lo = std::numeric_limits::max(); - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto* input = Input(i).template data(); - for (int j = 0; j < 
Input(i).numel(); j++) { + for (const auto j : c10::irange(Input(i).numel())) { const T val = abs_ ? std::abs(input[j]) : input[j]; if (*hi < val) { *hi = val; @@ -133,9 +135,9 @@ class QuantileOp final : public Operator { template int64_t CountLowerEq(const T& thd) { int64_t count = 0; - for (int i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto* input = Input(i).template data(); - for (int j = 0; j < Input(i).numel(); j++) { + for (const auto j : c10::irange(Input(i).numel())) { const T val = abs_ ? std::abs(input[j]) : input[j]; if (val <= thd) { count++; diff --git a/caffe2/operators/quantized/int8_concat_op.h b/caffe2/operators/quantized/int8_concat_op.h index b501bc128fae2..d0c8d24e9840d 100644 --- a/caffe2/operators/quantized/int8_concat_op.h +++ b/caffe2/operators/quantized/int8_concat_op.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_OPERATORS_INT8_CONCAT_OP_H_ #define CAFFE2_OPERATORS_INT8_CONCAT_OP_H_ +#include #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/core/tensor_int8.h" @@ -46,10 +47,10 @@ class Int8ConcatOp final : public Operator { if (this->template GetSingleArgument("order", "") == "NHWC") { CHECK_EQ(Y_dims.size(), 4); } - for (auto i = 1; i < InputSize(); ++i) { + for (const auto i : c10::irange(1, InputSize())) { const auto& Xi = Inputs()[i]->template Get(); CHECK_EQ(Xi.t.dim(), Y_dims.size()); - for (auto j = 0; j < Y_dims.size(); ++j) { + for (const auto j : c10::irange(Y_dims.size())) { if (j != axis_) { CHECK_EQ(Xi.t.size(j), Y_dims[j]); } @@ -61,7 +62,7 @@ class Int8ConcatOp final : public Operator { int after = X0.t.size_from_dim(axis_ + 1); const auto C_total = Y_dims[axis_]; size_t C_offset = 0; - for (auto i = 0; i < InputSize(); ++i) { + for (const auto i : c10::irange(InputSize())) { const auto& Xi = Inputs()[i]->template Get(); // Copy the NxHxWxC input slice to NxHxWx[C_offset:C_offset + C]. 
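The copy that the comment above describes is easier to see stripped of the operator plumbing. A hedged sketch — hypothetical helper name, plain std::vector buffers in place of Int8TensorCPU, NHWC layout with spatial = N*H*W — of how each input's channel slab lands at a running C_offset in the output:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the Int8ConcatOp inner copy: input i holds
// spatial * channels[i] bytes; its slab occupies columns
// [c_offset, c_offset + channels[i]) of every output row.
void concat_channels_nhwc(
    const std::vector<std::vector<std::uint8_t>>& inputs,
    const std::vector<std::size_t>& channels,
    std::size_t spatial,
    std::vector<std::uint8_t>& out) {
  std::size_t c_total = 0;
  for (const auto c : channels) {
    c_total += c;
  }
  out.assign(spatial * c_total, 0);
  std::size_t c_offset = 0;
  for (std::size_t i = 0; i < inputs.size(); ++i) {
    for (std::size_t s = 0; s < spatial; ++s) {
      std::memcpy(
          &out[s * c_total + c_offset],
          &inputs[i][s * channels[i]],
          channels[i]);
    }
    c_offset += channels[i];
  }
}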
const auto Ci = Xi.t.size(axis_); diff --git a/caffe2/operators/quantized/int8_dequantize_op.h b/caffe2/operators/quantized/int8_dequantize_op.h index eeecf91545424..17866cdad8122 100644 --- a/caffe2/operators/quantized/int8_dequantize_op.h +++ b/caffe2/operators/quantized/int8_dequantize_op.h @@ -5,6 +5,8 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor_int8.h" #include "caffe2/operators/quantized/int8_utils.h" +#include + namespace caffe2 { @@ -18,7 +20,7 @@ void Int8Dequantize( const int64_t N, const float X_scale, const int32_t X_offset) { - for (auto i = 0; i < N; ++i) { + for (const auto i : c10::irange(N)) { out[i] = (static_cast(in[i]) - X_offset) * X_scale; } } diff --git a/caffe2/operators/quantized/int8_given_tensor_fill_op.h b/caffe2/operators/quantized/int8_given_tensor_fill_op.h index bd55c9a548143..8080ca78b344d 100644 --- a/caffe2/operators/quantized/int8_given_tensor_fill_op.h +++ b/caffe2/operators/quantized/int8_given_tensor_fill_op.h @@ -40,7 +40,7 @@ class Int8GivenTensorFillOp final : public Operator { {static_cast(source_values.size())}, at::dtype().device(CPU)); uint8_t* values_data = values_.template mutable_data(); - for (int i = 0; i < source_values.size(); i++) { + for (const auto i : c10::irange(source_values.size())) { values_data[i] = static_cast(source_values[i]); } } @@ -92,7 +92,7 @@ class Int8GivenIntTensorFillOp final : public Operator { {static_cast(source_values.size())}, at::dtype().device(CPU)); auto* values_data = values_.template mutable_data(); - for (int i = 0; i < source_values.size(); i++) { + for (const auto i : c10::irange(source_values.size())) { values_data[i] = static_cast(source_values[i]); } } diff --git a/caffe2/operators/quantized/int8_resize_nearest_op.h b/caffe2/operators/quantized/int8_resize_nearest_op.h index 102cc35fdaaaf..06d625bf06ba4 100644 --- a/caffe2/operators/quantized/int8_resize_nearest_op.h +++ b/caffe2/operators/quantized/int8_resize_nearest_op.h @@ -5,6 +5,7 @@ #include "caffe2/core/operator.h" #include "caffe2/core/tensor_int8.h" #include "caffe2/operators/quantized/int8_utils.h" +#include namespace caffe2 { @@ -54,10 +55,10 @@ class Int8ResizeNearestOp final : public Operator { const uint8_t* Xdata = X.t.data(); uint8_t* Ydata = Y->t.mutable_data(); - for (int n = 0; n < N; ++n) { - for (int y = 0; y < OH; ++y) { + for (const auto n : c10::irange(N)) { + for (const auto y : c10::irange(OH)) { const int in_y = std::min((int)(y / height_scale_), (IH - 1)); - for (int x = 0; x < OW; ++x) { + for (const auto x : c10::irange(OW)) { const int in_x = std::min((int)(x / width_scale_), (IW - 1)); std::memcpy( &Ydata[C * x + C * OW * y + C * OW * OH * n], diff --git a/caffe2/operators/quantized/int8_roi_align_op.h b/caffe2/operators/quantized/int8_roi_align_op.h index 710476e052017..2a722d2dd8fa1 100644 --- a/caffe2/operators/quantized/int8_roi_align_op.h +++ b/caffe2/operators/quantized/int8_roi_align_op.h @@ -9,6 +9,7 @@ #include "caffe2/core/tensor_int8.h" #include "caffe2/operators/quantized/int8_utils.h" #include "caffe2/utils/math.h" +#include namespace caffe2 { @@ -44,13 +45,13 @@ void pre_calc_for_bilinear_interpolate( int pre_calc_index = 0; // boltnn use a smaller multiplier here. Sometimes w will shrink to 0. 
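Both Int8ResizeNearestOp above and the ROI-align precomputation below are loop nests of exactly the shape this codemod targets. A hedged, self-contained version of the nearest-neighbor resize, with C++20 std::views::iota standing in for c10::irange and flat byte buffers standing in for the NHWC tensors:

#include <algorithm>
#include <cstdint>
#include <ranges>
#include <vector>

// Hypothetical free function mirroring the Int8ResizeNearestOp loop nest;
// x is NHWC with n * ih * iw * c bytes, and the scales are
// output-to-input size ratios.
void resize_nearest_nhwc(
    const std::vector<std::uint8_t>& x,
    int n, int ih, int iw, int c,
    float height_scale, float width_scale,
    std::vector<std::uint8_t>& y) {
  const int oh = static_cast<int>(ih * height_scale);
  const int ow = static_cast<int>(iw * width_scale);
  y.resize(static_cast<std::size_t>(n) * oh * ow * c);
  for (const auto b : std::views::iota(0, n)) {
    for (const auto oy : std::views::iota(0, oh)) {
      const int in_y = std::min(static_cast<int>(oy / height_scale), ih - 1);
      for (const auto ox : std::views::iota(0, ow)) {
        const int in_x = std::min(static_cast<int>(ox / width_scale), iw - 1);
        // Copy one C-channel pixel from the nearest source location.
        std::copy_n(
            x.begin() +
                (static_cast<std::size_t>(b) * ih + in_y) * iw * c +
                static_cast<std::size_t>(in_x) * c,
            c,
            y.begin() +
                (static_cast<std::size_t>(b) * oh + oy) * ow * c +
                static_cast<std::size_t>(ox) * c);
      }
    }
  }
}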
const float w_multiplier = 255.0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { - for (int iy = 0; iy < iy_upper; iy++) { + for (const auto ph : c10::irange(pooled_height)) { + for (const auto pw : c10::irange(pooled_width)) { + for (const auto iy : c10::irange(iy_upper)) { const float yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 - for (int ix = 0; ix < ix_upper; ix++) { + for (const auto ix : c10::irange(ix_upper)) { const float xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); @@ -152,7 +153,7 @@ void ROIAlignForward( int n_rois = nthreads / channels / pooled_width / pooled_height; - for (int n = 0; n < n_rois; n++) { + for (const auto n : c10::irange(n_rois)) { int index_n = n * channels * pooled_width * pooled_height; // roi could have 4 or 5 columns @@ -224,19 +225,19 @@ void ROIAlignForward( const uint8_t* offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; int pre_calc_index = 0; - for (int ph = 0; ph < pooled_height; ph++) { - for (int pw = 0; pw < pooled_width; pw++) { + for (const auto ph : c10::irange(pooled_height)) { + for (const auto pw : c10::irange(pooled_width)) { vector acc_buffer(channels, 0); - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - for (int ix = 0; ix < roi_bin_grid_w; ix++) { + for (const auto iy : c10::irange(roi_bin_grid_h)) { + for (const auto ix : c10::irange(roi_bin_grid_w)) { PreCalc pc = pre_calc[pre_calc_index]; const uint8_t* data_1 = offset_bottom_data + channels * pc.pos1; const uint8_t* data_2 = offset_bottom_data + channels * pc.pos2; const uint8_t* data_3 = offset_bottom_data + channels * pc.pos3; const uint8_t* data_4 = offset_bottom_data + channels * pc.pos4; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { acc_buffer[c] += (uint32_t)(pc.w1) * (uint32_t)(data_1[c]); acc_buffer[c] += (uint32_t)(pc.w2) * (uint32_t)(data_2[c]); acc_buffer[c] += (uint32_t)(pc.w3) * (uint32_t)(data_3[c]); @@ -251,7 +252,7 @@ void ROIAlignForward( } int index_nhw = index_n + (ph * pooled_width + pw) * channels; uint8_t* out_ptr = top_data + index_nhw; - for (int c = 0; c < channels; ++c) { + for (const auto c : c10::irange(channels)) { int32_t a_mul = MultiplyByQuantizedMultiplierSmallerThanOne( acc_buffer[c], Y_multiplier, Y_shift) + y_offset; diff --git a/caffe2/operators/quantized/int8_roi_align_op_test.cc b/caffe2/operators/quantized/int8_roi_align_op_test.cc index e00c4aeebe252..66fb4205ef486 100644 --- a/caffe2/operators/quantized/int8_roi_align_op_test.cc +++ b/caffe2/operators/quantized/int8_roi_align_op_test.cc @@ -1,5 +1,6 @@ #include "caffe2/operators/quantized/int8_test_utils.h" #include "caffe2/operators/quantized/int8_utils.h" +#include namespace caffe2 { diff --git a/caffe2/operators/quantized/int8_test_utils.h b/caffe2/operators/quantized/int8_test_utils.h index f4a96be75cd83..a7bc242132418 100644 --- a/caffe2/operators/quantized/int8_test_utils.h +++ b/caffe2/operators/quantized/int8_test_utils.h @@ -77,7 +77,7 @@ inline std::unique_ptr biasdq(const int8::Int8TensorCPU& XQ) { #define EXPECT_TENSOR_EQ(_YA, _YE) \ do { \ EXPECT_TRUE((_YA).sizes() == (_YE).sizes()); \ - for (auto i = 0; i < (_YA).numel(); ++i) { \ + for (const auto i : c10::irange((_YA).numel())) { \ EXPECT_FLOAT_EQ((_YA).data()[i], (_YE).data()[i]); \ } \ } while (0); @@ -85,7 +85,7 @@ inline std::unique_ptr biasdq(const int8::Int8TensorCPU& XQ) { 
#define EXPECT_TENSOR_APPROX_EQ(_YA, _YE, _tol) \ do { \ EXPECT_TRUE((_YA).sizes() == (_YE).sizes()); \ - for (auto i = 0; i < (_YA).numel(); ++i) { \ + for (const auto i : c10::irange((_YA).numel())) { \ EXPECT_NEAR((_YA).data()[i], (_YE).data()[i], (_tol)); \ } \ } while (0); diff --git a/caffe2/operators/reduce_front_back_max_ops.h b/caffe2/operators/reduce_front_back_max_ops.h index 8f064954d9858..f3aa35f061742 100644 --- a/caffe2/operators/reduce_front_back_max_ops.h +++ b/caffe2/operators/reduce_front_back_max_ops.h @@ -5,6 +5,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -35,7 +36,7 @@ class MaxReduceDimsOp final : public Operator { int start_index = FIRSTDIMS ? num_reduce_dims_ : 0; int end_index = FIRSTDIMS ? X.dim() : X.dim() - num_reduce_dims_; - for (int i = start_index; i < end_index; ++i) { + for (const auto i : c10::irange(start_index, end_index)) { output_shape.push_back(X.sizes()[i]); } auto* Y = Output(0, output_shape, at::dtype()); diff --git a/caffe2/operators/reduce_front_back_sum_mean_ops.h b/caffe2/operators/reduce_front_back_sum_mean_ops.h index d99efc335e8c3..84ab168df30c8 100644 --- a/caffe2/operators/reduce_front_back_sum_mean_ops.h +++ b/caffe2/operators/reduce_front_back_sum_mean_ops.h @@ -5,6 +5,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -35,7 +36,7 @@ class SumReduceDimsOp final : public Operator { vector output_shape; int start_index = FIRSTDIMS ? num_reduce_dims_ : 0; int end_index = FIRSTDIMS ? X.dim() : X.dim() - num_reduce_dims_; - for (int i = start_index; i < end_index; ++i) { + for (const auto i : c10::irange(start_index, end_index)) { output_shape.push_back(X.sizes()[i]); } auto* Y = Output(0, output_shape, at::dtype()); diff --git a/caffe2/operators/reduce_ops.h b/caffe2/operators/reduce_ops.h index 4fba06a528b05..cfa3378843f33 100644 --- a/caffe2/operators/reduce_ops.h +++ b/caffe2/operators/reduce_ops.h @@ -1,14 +1,15 @@ #ifndef CAFFE2_OPERATORS_REDUCE_OPS_H_ #define CAFFE2_OPERATORS_REDUCE_OPS_H_ -#include -#include -#include - #include "caffe2/core/context.h" #include "caffe2/core/operator.h" #include "caffe2/core/types.h" #include "caffe2/utils/math.h" +#include + +#include +#include +#include namespace caffe2 { @@ -50,7 +51,7 @@ class ReduceOp final : public Operator { std::vector output_dims; output_dims.reserve(ndim); std::size_t cur_axis = 0; - for (int i = 0; i < ndim; ++i) { + for (const auto i : c10::irange(ndim)) { if (cur_axis < axes_.size() && i == axes_[cur_axis]) { if (keep_dims_) { output_dims.push_back(1); diff --git a/caffe2/operators/reducer_functors.h b/caffe2/operators/reducer_functors.h index 762e8c41a9ffa..0159e030d2637 100644 --- a/caffe2/operators/reducer_functors.h +++ b/caffe2/operators/reducer_functors.h @@ -50,7 +50,7 @@ class SumRangeReducerGradient { const T* /*data_out*/, // unused Context* context) { // do we have some op that does it smartly with minimum number of memcpy? 
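Alongside the plain irange(n) form, the reduce_front_back_max_ops.h and reduce_front_back_sum_mean_ops.h hunks above use the two-argument form, irange(start_index, end_index), which walks the half-open interval [start, end). A hedged equivalent of that output-shape loop — C++20 std::views::iota standing in for c10::irange, and a hypothetical helper name:

#include <cstdint>
#include <ranges>
#include <vector>

// Hypothetical helper mirroring the SumReduceDimsOp / MaxReduceDimsOp
// output-shape computation: keep the dims that are not reduced away.
std::vector<std::int64_t> kept_output_shape(
    const std::vector<std::int64_t>& sizes,
    int num_reduce_dims,
    bool first_dims) {
  const int ndim = static_cast<int>(sizes.size());
  const int start_index = first_dims ? num_reduce_dims : 0;
  const int end_index = first_dims ? ndim : ndim - num_reduce_dims;
  std::vector<std::int64_t> output_shape;
  // Two-argument form: visits start_index, start_index + 1, ..., end_index - 1.
  for (const auto i : std::views::iota(start_index, end_index)) {
    output_shape.push_back(sizes[i]);
  }
  return output_shape;
}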
- for (int64_t i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { context->template CopySameDevice( block_size, segment_grad, data_grad + block_size * i); } @@ -83,13 +83,13 @@ class LogSumExpRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { max_value = std::max(max_value, in[i * block_size + j]); } T scaled_exp_sum = 0; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { scaled_exp_sum += std::exp(in[i * block_size + j] - max_value); } *(out++) = std::log(scaled_exp_sum) + max_value; @@ -109,10 +109,10 @@ class LogSumExpRangeReducerGradient { const T* data_in, // I const T* data_out, // O Context* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); const T offset = *(data_out++); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; data_grad[idx] = out_grad * std::exp(data_in[idx] - offset); } @@ -145,13 +145,13 @@ class LogMeanExpRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { max_value = std::max(max_value, in[i * block_size + j]); } T scaled_exp_sum = 0; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { scaled_exp_sum += std::exp(in[i * block_size + j] - max_value); } scaled_exp_sum /= blocks; @@ -171,10 +171,10 @@ class LogMeanExpRangeReducerGradient { const T* data_in, // I const T* data_out, // O Context* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); const T offset = *(data_out++); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; data_grad[idx] = out_grad * std::exp(data_in[idx] - offset) / blocks; } @@ -207,9 +207,9 @@ class MeanRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T avg_value = 0; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { avg_value += in[i * block_size + j] / blocks; } *(out++) = avg_value; @@ -229,9 +229,9 @@ class MeanRangeReducerGradient { const T* /*data_out*/, // O Context* /*context*/) { const auto in_grad = 1.0 / blocks; - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; data_grad[idx] = out_grad * in_grad; } @@ -266,9 +266,9 @@ class MaxRangeReducer { const T* in, T* out, CPUContext* /*context*/) { - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { T max_value = std::numeric_limits::lowest(); - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { max_value = std::max(max_value, in[i * block_size + j]); } *(out++) = max_value; @@ -289,10 +289,10 @@ class MaxRangeReducerGradient { Context* /*context*/) { std::memset( 
static_cast(data_grad), 0, blocks * block_size * sizeof(T)); - for (int j = 0; j < block_size; ++j) { + for (const auto j : c10::irange(block_size)) { const T out_grad = *(segment_grad++); const T out = data_out[j]; - for (int i = 0; i < blocks; ++i) { + for (const auto i : c10::irange(blocks)) { auto idx = i * block_size + j; if (out == data_in[idx]) { data_grad[idx] = out_grad; @@ -813,7 +813,7 @@ class MaxReducerGradient : public BaseReducerGradient { int64_t /*offset*/, Context* /*context*/, const int /*length*/) { - for (int64_t i = 0; i < meta.block_size; ++i) { + for (const auto i : c10::irange(meta.block_size)) { data_grad[i] = data[i] == forward_output[i] ? s_grad_[i] : 0; } } diff --git a/caffe2/operators/reduction_ops.h b/caffe2/operators/reduction_ops.h index 896ae657a17ee..0268aae6cd89c 100644 --- a/caffe2/operators/reduction_ops.h +++ b/caffe2/operators/reduction_ops.h @@ -6,6 +6,7 @@ #include "caffe2/core/logging.h" #include "caffe2/core/operator.h" #include "caffe2/utils/math.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -164,7 +165,7 @@ class MaxReductionOp : public Operator { &context_); } else { const int input_size = N * M; - for (int i = 0; i < batch_size; ++i) { + for (const auto i : c10::irange(batch_size)) { math::ColwiseMax( M, N, diff --git a/caffe2/operators/remove_data_blocks_op.h b/caffe2/operators/remove_data_blocks_op.h index 5f409bf08dc72..6f4612dd6f4b4 100644 --- a/caffe2/operators/remove_data_blocks_op.h +++ b/caffe2/operators/remove_data_blocks_op.h @@ -6,6 +6,7 @@ #include "caffe2/core/context.h" #include "caffe2/core/operator.h" +#include "c10/util/irange.h" namespace caffe2 { @@ -40,7 +41,7 @@ class RemoveDataBlocksOp final : public Operator { const auto* ind_ptr = indices.template data(); std::vector ind_vec; - for (int64_t i = 0; i < indices_size; i++) { + for (const auto i : c10::irange(indices_size)) { ind_vec.push_back(ind_ptr[i]); } std::sort(ind_vec.begin(), ind_vec.end()); @@ -60,7 +61,7 @@ class RemoveDataBlocksOp final : public Operator { ind_vec.insert(ind_vec.begin(), -1); int64_t ind_vec_size = ind_vec.size(); - for (auto i = 0; i < ind_vec_size; i++) { + for (const auto i : c10::irange(ind_vec_size)) { int64_t interval_start = ind_vec[i] + 1; int64_t interval_end = (i == ind_vec_size - 1) ? outer_size : ind_vec[i + 1];
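One practical payoff of the conversion shows up in the hunks that carry NOLINTNEXTLINE(clang-diagnostic-sign-compare): the old loops compared a signed int index against an unsigned size() bound, while irange deduces the index type from its bound, so a loop over v.size() indexes with the container's own unsigned type. A hedged illustration, again with C++20 std::views::iota in place of c10::irange:

#include <cstddef>
#include <ranges>
#include <vector>

long long sum_all(const std::vector<long long>& v) {
  long long total = 0;
  // i is deduced as std::size_t, the same type as v.size(), so the bound
  // comparison is unsigned/unsigned: no sign-compare warning and no
  // NOLINT suppression needed.
  for (const auto i : std::views::iota(std::size_t{0}, v.size())) {
    total += v[i];
  }
  return total;
}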