diff --git a/.clang-tidy b/.clang-tidy
index 200ba9a2a3516..2bbfebd79e61e 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -49,7 +49,7 @@ modernize-*,
 performance-*,
 readability-container-size-empty,
 '
-HeaderFilterRegex: '^(c10/|torch/csrc/).*$'
+HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
 AnalyzeTemporaryDtors: false
 WarningsAsErrors: '*'
 ...
diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h
index 30077f0ad498d..82db1f8b6517c 100644
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@@ -63,7 +63,7 @@ inline bool are_expandable(IntArrayRef shape1, IntArrayRef shape2) {
   size_t ndim2 = shape2.size();
   size_t ndim = ndim1 < ndim2 ? ndim1 : ndim2;
 
-  for (int64_t i = ndim - 1; i >= 0; --i) {
+  for (int64_t i = static_cast<int64_t>(ndim) - 1; i >= 0; --i) {
     if (shape1[--ndim1] == shape2[--ndim2] || shape1[ndim1] == 1 ||
         shape2[ndim2] == 1) {
       continue;
diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h
index bbb599748d2e2..732e252165ca6 100644
--- a/aten/src/ATen/LegacyBatchedTensorImpl.h
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.h
@@ -113,15 +113,15 @@ inline bool isBatchedTensor(const Tensor& tensor) {
 
 // It is unsafe to call this on a Tensor that is not backed by a
 // BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible.
-inline BatchedTensorImpl* unsafeGetBatchedImpl(Tensor tensor) {
+inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) {
   return static_cast<BatchedTensorImpl*>(tensor.unsafeGetTensorImpl());
 }
 
-inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) {
+inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) {
   if (!isBatchedTensor(tensor)) {
     return nullptr;
   }
-  return unsafeGetBatchedImpl(std::move(tensor));
+  return unsafeGetBatchedImpl(tensor);
 }
 
 // Returns a bitset. If bit i is set, then that means dim i is a batchdim.
diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h
index 2005c89656432..11d7e2f165548 100644
--- a/aten/src/ATen/NestedTensorImpl.h
+++ b/aten/src/ATen/NestedTensorImpl.h
@@ -95,11 +95,12 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
     const auto buffer_size = get_buffer_size();
     auto buffer_tensor_impl = c10::make_intrusive<TensorImpl>(
         c10::TensorImpl::VIEW, Storage(storage_), buffer_key_set_, data_type_);
-    buffer_tensor_impl->set_sizes_contiguous(c10::makeArrayRef(buffer_size));
+    buffer_tensor_impl->set_sizes_contiguous(
+        c10::makeArrayRef(static_cast<int64_t>(buffer_size)));
     return Tensor(buffer_tensor_impl);
   }
 
-  int64_t get_buffer_size() const {
+  size_t get_buffer_size() const {
     return storage_.nbytes() / data_type_.itemsize();
   }
 
@@ -146,6 +147,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
   // to TensorImpl.
   void refresh_dim();
 
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const at::Tensor nested_sizes_, nested_strides_;
   // The starting positions of the underlying tensors in contiguous buffer
   // i.e. the buffer memory offsets to get the underlying tensors
@@ -159,6 +161,7 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl {
   // Some strong enough constraints are:
   // 1. every underlying tensor is contiguous in memory
   //    && nesting in ascending order
+  // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
   const at::Tensor storage_offsets_;
   // NOTE: -1 here means the size is missing
   // Optional to allow it to be computed lazily from nested.
diff --git a/aten/src/ATen/ParallelOpenMP.h b/aten/src/ATen/ParallelOpenMP.h
index f1d0e7fb0a89e..b983571f09a2e 100644
--- a/aten/src/ATen/ParallelOpenMP.h
+++ b/aten/src/ATen/ParallelOpenMP.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <algorithm>
 #include <atomic>
 #include <cstddef>
 #include <exception>
diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h
index de54c1d41dbee..2ee2a8004d360 100644
--- a/aten/src/ATen/SparseCsrTensorImpl.h
+++ b/aten/src/ATen/SparseCsrTensorImpl.h
@@ -115,8 +115,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
     auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
         key_set(), device(), layout_impl(), dtype());
     copy_tensor_metadata(
-        /*src_impl=*/this,
-        /*dest_impl=*/impl.get(),
+        /*src_sparse_impl=*/this,
+        /*dest_sparse_impl=*/impl.get(),
         /*version_counter=*/version_counter,
         /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
     impl->refresh_numel();
@@ -135,9 +135,9 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl {
     auto impl = c10::make_intrusive<SparseCsrTensorImpl>(
         key_set(), device(), layout_impl(), dtype());
     copy_tensor_metadata(
-        /*src_impl=*/this,
-        /*dest_impl=*/impl.get(),
-        /*version_counter=*/std::move(version_counter),
+        /*src_sparse_impl=*/this,
+        /*dest_sparse_impl=*/impl.get(),
+        /*version_counter=*/version_counter,
         /*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
     impl->refresh_numel();
     return impl;
diff --git a/aten/src/ATen/TensorNames.h b/aten/src/ATen/TensorNames.h
index 07bde9521c7b5..4ec3d064867fb 100644
--- a/aten/src/ATen/TensorNames.h
+++ b/aten/src/ATen/TensorNames.h
@@ -28,7 +28,9 @@ namespace at::namedinference {
 struct TORCH_API TensorName {
   explicit TensorName(ArrayRef<Dimname> origin, int origin_idx)
       : origin_(origin),
-        name_(origin[maybe_wrap_dim(origin_idx, origin.size())]),
+        name_(origin[maybe_wrap_dim(
+            origin_idx,
+            static_cast<int64_t>(origin.size()))]),
         origin_idx_(origin_idx) {}
 
   // op_name is only used for error reporting.
diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h
index 0002aaa819750..a1b687ce448d7 100644
--- a/aten/src/ATen/autocast_mode.h
+++ b/aten/src/ATen/autocast_mode.h
@@ -45,7 +45,9 @@ TORCH_API bool is_autocast_cache_enabled();
 TORCH_API void set_autocast_cache_enabled(bool enabled);
 
 namespace {
-bool is_autocast_eligible(const Tensor& tensor, c10::DeviceType device_type) {
+inline bool is_autocast_eligible(
+    const Tensor& tensor,
+    c10::DeviceType device_type) {
   switch (device_type) {
     case c10::DeviceType::CUDA:
       return (tensor.is_cuda() || tensor.is_xla()) &&
diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h
index 110ef5747e5b0..0fb911278a919 100644
--- a/aten/src/ATen/core/List_inl.h
+++ b/aten/src/ATen/core/List_inl.h
@@ -198,7 +198,7 @@ typename List<T>::internal_const_reference_type List<T>::operator[](size_type po
 template<class T>
 typename List<T>::internal_reference_type List<T>::operator[](size_type pos) {
   static_cast<void>(impl_->list.at(pos)); // Throw the exception if it is out of range.
-  return {impl_->list.begin() + pos};
+  return {impl_->list.begin() + static_cast<typename decltype(impl_->list)::difference_type>(pos)};
 }
 
 template<class T>
diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h
index d94f087e75a73..23c9b77159a22 100644
--- a/aten/src/ATen/native/ForeachUtils.h
+++ b/aten/src/ATen/native/ForeachUtils.h
@@ -20,14 +20,14 @@ namespace at::native {
 
 namespace {
 // Check if tensor list has either a boolean tensor or a integer tensor
-bool has_integral_tensor(TensorList tensors, const bool includeBool) {
+inline bool has_integral_tensor(TensorList tensors, const bool includeBool) {
   return std::any_of(
       tensors.begin(), tensors.end(), [&includeBool](const auto& t) {
         return at::isIntegralType(t.scalar_type(), includeBool);
       });
 }
 // check if tensor list has bool tensors
-bool has_bool_tensor(TensorList tensors) {
+inline bool has_bool_tensor(TensorList tensors) {
   return std::any_of(tensors.begin(), tensors.end(), [](const auto& t) -> bool {
     return t.scalar_type() == ScalarType::Bool;
   });
@@ -37,11 +37,11 @@ bool has_bool_tensor(TensorList tensors) {
 // - Tensor lists must be non-empty.
 // - All TensorLists and ScalarLists must have the same number of elements.
 // - Corresponding tensors must have the same size.
-void check_foreach_api_restrictions(TensorList tensors) {
+inline void check_foreach_api_restrictions(TensorList tensors) {
   TORCH_CHECK(!tensors.empty(), "Tensor list must have at least one tensor.");
 }
 
-void check_foreach_api_restrictions(
+inline void check_foreach_api_restrictions(
     TensorList tensors,
     ArrayRef<Scalar> scalars) {
   check_foreach_api_restrictions(tensors);
@@ -50,7 +50,9 @@ void check_foreach_api_restrictions(
       "Tensor list must have same number of elements as scalar list.");
 }
 
-void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) {
+inline void check_foreach_api_restrictions(
+    TensorList tensors1,
+    TensorList tensors2) {
   TORCH_CHECK(!tensors1.empty(), "Tensor list must have at least one tensor.");
   TORCH_CHECK(!tensors2.empty(), "Tensor list must have at least one tensor.");
   TORCH_CHECK(
@@ -61,7 +63,7 @@ void check_foreach_api_restrictions(TensorList tensors1, TensorList tensors2) {
       tensors2.size());
 }
 
-void check_foreach_api_restrictions(
+inline void check_foreach_api_restrictions(
     TensorList tensors1,
     TensorList tensors2,
     TensorList tensors3) {
@@ -82,7 +84,7 @@ void check_foreach_api_restrictions(
       tensors3.size());
 }
 
-void check_foreach_api_restrictions(
+inline void check_foreach_api_restrictions(
     TensorList tensors1,
     TensorList tensors2,
     TensorList tensors3,
@@ -99,7 +101,8 @@ void check_foreach_api_restrictions(
 // Helper function called in check_fast_path_restrictions to check whether all
 // corresponding tensors (aligning in index across the tensorLists) share the
 // same device and dtype.
-bool _check_tensors_share_device_and_dtype(ArrayRef<TensorList> tensorLists) {
+inline bool _check_tensors_share_device_and_dtype(
+    ArrayRef<TensorList> tensorLists) {
   const auto expected_dtype = tensorLists[0][0].dtype();
   const auto expected_device = tensorLists[0][0].device();
 
@@ -122,7 +125,8 @@ bool _check_tensors_share_device_and_dtype(ArrayRef<TensorList> tensorLists) {
 
 // Helper function called in check_fast_path_restrictions to check if
 // corresponding tensors in tensor lists have the same sizes and strides.
-bool _check_tensors_share_sizes_and_strides(ArrayRef<TensorList> tensorLists) {
+inline bool _check_tensors_share_sizes_and_strides(
+    ArrayRef<TensorList> tensorLists) {
   for (const auto i : c10::irange(1, tensorLists.size())) {
     for (const auto j : c10::irange(tensorLists[0].size())) {
       if (tensorLists[0][j].sizes() != tensorLists[i][j].sizes() ||
@@ -140,7 +144,7 @@ bool _check_tensors_share_sizes_and_strides(ArrayRef<TensorList> tensorLists) {
 // function assumes that _check_tensors_share_device_and_dtype has already been
 // called so that all corresponding tensors in tensorLists have the same dtype.
 // Then, it is sufficient to check the type promotion with just one tensorList.
-bool _check_tensors_do_type_promotion_with_scalars(
+inline bool _check_tensors_do_type_promotion_with_scalars(
     TensorList tensorList,
     ArrayRef<Scalar> scalarList = {},
     bool does_op_promote_integer_inputs_to_float = false) {
@@ -176,7 +180,7 @@ bool _check_tensors_do_type_promotion_with_scalars(
 
 // Please, make sure to call check_foreach_api_restrictions before calling this
 // method. There is a set of preconditions that have to be satisfied.
-bool check_fast_path_restrictions(
+inline bool check_fast_path_restrictions(
     ArrayRef<TensorList> tensorLists,
     ArrayRef<Scalar> scalarList = {},
     bool does_op_promote_integer_inputs_to_float = false) {
@@ -188,7 +192,7 @@ bool check_fast_path_restrictions(
       does_op_promote_integer_inputs_to_float);
 }
 
-std::vector<Scalar> convert_tensor_to_scalar_list(
+inline std::vector<Scalar> convert_tensor_to_scalar_list(
     const Tensor& scalarList_,
     int64_t expect_length) {
   std::vector<Scalar> scalarList;
@@ -221,13 +225,13 @@ std::vector<Scalar> convert_tensor_to_scalar_list(
         scalarList_.size(0),
         " instead.");
     for (int64_t i = 0; i < scalarList_.size(0); i++) {
-      scalarList.push_back(c10::Scalar(scalar_data[i]));
+      scalarList.emplace_back(scalar_data[i]);
     }
   });
   return scalarList;
 }
 
-bool can_use_fast_route(
+inline bool can_use_fast_route(
     ArrayRef<TensorList> tensorLists,
     ArrayRef<Scalar> scalarList = {},
     bool does_op_promote_integer_inputs_to_float = false) {
@@ -235,7 +239,7 @@ bool can_use_fast_route(
       tensorLists, scalarList, does_op_promote_integer_inputs_to_float);
 }
 
-bool can_use_fast_route(
+inline bool can_use_fast_route(
     TensorList tensors1,
     TensorList tensors2,
     bool does_op_promote_integer_inputs_to_float = false) {
@@ -253,13 +257,13 @@ using FlatMap = std::unordered_map<
     TensorsAndIndicesT,
     ParamsHash<DeviceDtypeKey>>;
 
-FlatMap _group_tensors_by_first_tensors_device_and_dtype(
+inline FlatMap _group_tensors_by_first_tensors_device_and_dtype(
    const nested_optional_tensorvec_t& nested_tensorlist,
    const bool with_indices) {
  FlatMap grouped_tensors_with_indices;
 
-  TORCH_CHECK(!nested_tensorlist.empty() || nested_tensorlist.size() > 0);
-  TORCH_CHECK(nested_tensorlist[0].size() > 0);
+  TORCH_CHECK(!nested_tensorlist.empty());
+  TORCH_CHECK(!nested_tensorlist[0].empty());
 
   const auto num_lists = nested_tensorlist.size();
   const auto num_tensors = nested_tensorlist[0].size();
diff --git a/aten/src/ATen/native/IndexingUtils.h b/aten/src/ATen/native/IndexingUtils.h
index 3f359ca79e4cb..72b39eb326a0c 100644
--- a/aten/src/ATen/native/IndexingUtils.h
+++ b/aten/src/ATen/native/IndexingUtils.h
@@ -30,7 +30,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, IOptTen
         // The sizes of the ByteTensor mask or bool tensor must match the sizes of the
         // corresponding dimensions in self
         for (const auto j : c10::irange(index.dim())) {
-          int64_t srcIdx = result.size() + j;
+          int64_t srcIdx = static_cast<int64_t>(result.size() + j);
           if (index.size(j) != self.size(srcIdx)) {
             invalid_mask(self, srcIdx, index, j);
           }
@@ -41,7 +41,7 @@ static C10_UNUSED std::vector<Tensor> expandTensors(const Tensor & self, IOptTen
          result.emplace_back(nonzero.select(1, j));
        }
      } else {
-        result.emplace_back(std::move(index));
+        result.emplace_back(index);
      }
    }
  }
@@ -101,7 +101,7 @@ static C10_UNUSED bool hasContiguousSubspace(TensorList tl) {
 // returns
 // tensor.permute([1, 3, 0, 2]), {a, b, nullptr, nullptr}
 static C10_UNUSED std::tuple<Tensor, std::vector<Tensor>>
-transposeToFront(Tensor self, TensorList indices) {
+transposeToFront(const Tensor& self, TensorList indices) {
   std::vector<int64_t> dims;
   std::vector<Tensor> transposedIndices;
   dims.reserve(self.dim());
@@ -121,7 +121,7 @@ transposeToFront(Tensor self, TensorList indices) {
 }
 
 inline std::tuple<Tensor, std::vector<Tensor>, std::vector<int64_t>>
-transposeToFrontAndInvPerm(Tensor self, TensorList indices) {
+transposeToFrontAndInvPerm(const Tensor& self, TensorList indices) {
   std::vector<int64_t> dims;
   std::vector<int64_t> invPerm;
   std::vector<Tensor> transposedIndices;
diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h
index 5dcf2b2a52f65..206899875156b 100644
--- a/aten/src/ATen/native/nested/NestedTensorUtils.h
+++ b/aten/src/ATen/native/nested/NestedTensorUtils.h
@@ -169,13 +169,13 @@ inline std::vector<IntArrayRef> NestedTensor_get_strides(
 inline void check_numel_equals_buffer_size(const at::Tensor& self) {
   auto self_impl = get_nested_tensor_impl(self);
   TORCH_CHECK(
-      self.numel() == self_impl->get_buffer_size(),
+      self.numel() == static_cast<int64_t>(self_impl->get_buffer_size()),
       "Number of elements in nested tensor must match number of elements in buffer.");
 }
 
 inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) {
   TORCH_CHECK(
-      self_ptr->numel() == self_ptr->get_buffer_size(),
+      self_ptr->numel() == static_cast<int64_t>(self_ptr->get_buffer_size()),
       "Number of elements in nested tensor must match number of elements in buffer.");
 }
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/aten/src/ATen/native/utils/ParamsHash.h b/aten/src/ATen/native/utils/ParamsHash.h
index caa8f3cffa51e..6b7894cb8549f 100644
--- a/aten/src/ATen/native/utils/ParamsHash.h
+++ b/aten/src/ATen/native/utils/ParamsHash.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include
 #include <memory>
 #include <mutex>
 
@@ -7,7 +8,8 @@ namespace at::native {
 
 // Hashing machinery for Params
 // Fowler–Noll–Vo hash function
-// see https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
+// see
+// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
 template <typename Params>
 struct ParamsHash {
   // Params must be a POD because we read out its memory
@@ -43,31 +45,35 @@ struct ParamsEqual {
 template <typename T>
 struct ParamsWrapper {
   T pod;
-  static_assert(std::is_standard_layout_v<T>, "ParamsWrapper cannot wrap non-POD data");
+  static_assert(
+      std::is_standard_layout_v<T>,
+      "ParamsWrapper cannot wrap non-POD data");
 
   ParamsWrapper() {
     memset(&(this->pod), 0, sizeof(this->pod));
   }
 
-  ParamsWrapper(const ParamsWrapper &other) {
+  ParamsWrapper(const ParamsWrapper& other) {
     memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
   }
 
-  ParamsWrapper(ParamsWrapper &&other) {
+  ParamsWrapper(ParamsWrapper&& other) noexcept {
     memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
   }
 
-  ParamsWrapper& operator=(const ParamsWrapper &other) {
+  ParamsWrapper& operator=(const ParamsWrapper& other) {
     memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
     return *this;
   }
 
-  ParamsWrapper& operator=(ParamsWrapper &&other) {
+  ParamsWrapper& operator=(ParamsWrapper&& other) noexcept {
     memcpy(&(this->pod), &(other.pod), sizeof(this->pod));
     return *this;
   }
 
-  inline friend bool operator==(const ParamsWrapper &lhs, const ParamsWrapper &rhs) {
+  inline friend bool operator==(
+      const ParamsWrapper& lhs,
+      const ParamsWrapper& rhs) noexcept {
     auto ptr1 = reinterpret_cast<const uint8_t*>(&(lhs.pod));
     auto ptr2 = reinterpret_cast<const uint8_t*>(&(rhs.pod));
     return memcmp(ptr1, ptr2, sizeof(lhs.pod)) == 0;
@@ -80,7 +86,9 @@ template <typename ParamsWrapper>
 struct ParamsWrapperHash {
   // Params must be a POD because we read out its memory
   // contents as char* when hashing
-  static_assert(std::is_standard_layout_v<decltype(ParamsWrapper::pod)>, "ParamsWrapper cannot wrap non-POD data");
+  static_assert(
+      std::is_standard_layout_v<decltype(ParamsWrapper::pod)>,
+      "ParamsWrapper cannot wrap non-POD data");
 
   size_t operator()(const ParamsWrapper& params_wrapper) const {
     auto ptr = reinterpret_cast<const uint8_t*>(&(params_wrapper.pod));
@@ -93,4 +101,4 @@ struct ParamsWrapperHash {
   }
 };
 
-} // at::native
+} // namespace at::native