From 0540217d5af92d31304a0044b1ca58078110ad5a Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Thu, 10 Oct 2024 11:02:16 +0530 Subject: [PATCH 01/15] Enabled path for ARM machines Signed-off-by: Sanket Kale --- cmake/cpu_extension.cmake | 19 +- csrc/cpu/cpu_types.hpp | 6 +- csrc/cpu/cpu_types_arm.hpp | 457 ++++++++++++++++++++++++++++++++++ examples/offline_inference.py | 13 + oneDNN | 1 + requirements-cpu.txt | 7 + 6 files changed, 499 insertions(+), 4 deletions(-) create mode 100644 csrc/cpu/cpu_types_arm.hpp create mode 160000 oneDNN diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 426189481575b..a764df75746c0 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -59,6 +59,7 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) +find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -78,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) else() message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() + elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") + elseif (POWER9_FOUND OR POWER10_FOUND) message(STATUS "PowerPC detected") # Check for PowerPC VSX support @@ -88,8 +91,16 @@ elseif (POWER9_FOUND OR POWER10_FOUND) "-mvsx" "-mcpu=native" "-mtune=native") + +elseif (ASIMD_FOUND) + message(STATUS "ARMv8 architecture detected") + list(APPEND CXX_COMPILE_FLAGS + "-mcpu=native" + "-march=armv8.6-a" + ) + else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+, or ARMv8.") endif() # @@ -124,7 +135,11 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +<<<<<<< HEAD list(APPEND LIBS numa) +======= +list(APPEND LIBS dnnl numa) +>>>>>>> ad80d348 (Enabled path for ARM machines) # # _C extension @@ -159,4 +174,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 0213be09105ed..28db0479748bf 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -1,4 +1,3 @@ - #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP @@ -8,8 +7,11 @@ #elif defined(__POWER9_VECTOR__) //ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__aarch64__) + //arm implementation + #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" #endif -#endif +#endif \ No newline at end of file diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp new file mode 100644 index 0000000000000..fe87c9b1747c9 --- /dev/null +++ b/csrc/cpu/cpu_types_arm.hpp @@ -0,0 +1,457 @@ +#include +#include + +#include + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + float16x8_t reg; + + explicit FP16Vec8(__fp16 v) : reg(vmovq_n_f16(v)) {} //all 8 values set to v + + explicit FP16Vec8(const void *ptr) : reg(vld1q_f16((const __fp16 *)ptr)) {} + + explicit FP16Vec8(float16x8_t data) : reg(data) {} + + explicit FP16Vec8(float32x4x2_t v) : reg(vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))) {} + + FP16Vec8 operator*(const FP16Vec8 &b) const { + return FP16Vec8(vmulq_f16(reg, b.reg)); + } + + FP16Vec8 operator+(const FP16Vec8 &b) const { + return FP16Vec8(vaddq_f16(reg, b.reg)); + } + + FP16Vec8 operator-(const FP16Vec8 &b) const { + return FP16Vec8(vsubq_f16(reg, b.reg)); + } + + FP16Vec8 operator/(const FP16Vec8 &b) const { + return FP16Vec8(vdivq_f16(reg, b.reg)); + } + + void save(void *ptr) const { vst1q_f16((__fp16 *)ptr, reg); } +}; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + bfloat16x8_t reg; + + explicit BF16Vec8(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec8(bfloat16x8_t data) : reg(data) {} + + explicit BF16Vec8(const FP32Vec8 &); + + explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {} + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + bfloat16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {} + + explicit BF16Vec16(const FP32Vec16 &); + + explicit BF16Vec16(float32x4x4_t v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) + }){}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + bfloat16x8x4_t reg; + + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {} + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + + union AliasReg { + float32x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4_t reg; + + // Constructor that initializes all elements with a single float value + explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {} + + // Default constructor initializes all elements to zero + explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {} + + // Constructor to load 
from memory + explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {} + + // Constructor to initialize directly from float32x4_t data + explicit FP32Vec4(float32x4_t data) : reg(data) {} + + // Copy constructor + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + float32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x2_t reg; + + explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {} + + explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + + explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {} + + explicit FP32Vec8(float32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + + explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {} + + explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {} + + explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {} + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float ans = 0; + unroll_loop([&ans, &ar](int i) { ans += ar.values[i]; }); + + return ans; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + + // Extract float32x2_t elements for each float32x4_t + float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])}; + float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])}; + float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])}; + float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])}; + + // Combine the results into float32x4_t vectors + float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1); + float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3); + + // Initialize the float32x4x2_t object + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + + // Extract float32x2_t elements for each float32x4_t + float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])}; + float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])}; + float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])}; + float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])}; + + // Combine the results into float32x4_t vectors + float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1); + float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3); + + // Initialize the float32x4x2_t object + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + + // Extract float32x2_t elements for each float32x4_t + float32x2_t er_vec0 = {erf(ar.values[0]), erf(ar.values[1])}; + float32x2_t er_vec1 = {erf(ar.values[2]), erf(ar.values[3])}; + float32x2_t er_vec2 = {erf(ar.values[4]), erf(ar.values[5])}; + float32x2_t er_vec3 = {erf(ar.values[6]), erf(ar.values[7])}; + + // Combine the results into float32x4_t vectors + float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); + float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); + + // Initialize the float32x4x2_t object + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], 
b.reg.val[1])})); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + float32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x4_t reg; + + explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + + explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + + explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + + explicit FP32Vec16(float32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(bfloat16x8x2_t v) : reg({ + vcvtq_low_f32_bf16(v.val[0]), + vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), + vcvtq_high_f32_bf16(v.val[1]) + }) {} + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; // First half (low 128 bits) + reg.val[1] = data.reg.val[1]; // Second half (high 128 bits) + reg.val[2] = data.reg.val[0]; // Repeating the first half + reg.val[3] = data.reg.val[1]; // Repeating the second half + } + + + explicit FP32Vec16(const BF16Vec16 &v) : reg({ + vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1]) + }) {} + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3]) + })); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3]) + })); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float ans = 0; + unroll_loop([&ans, &ar](int i) { ans += ar.values[i]; }); + + return ans; + } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float ans = 0; + const int start = idx * group_size; + unroll_loop( + [&ans, &start, 
ar](int i) { ans += ar.values[start + i]; }); + + return ans; + } + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + vst1q_f32(ptr + 8, reg.val[2]); + vst1q_f32(ptr + 12, reg.val[3]); + } +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = FP16Vec8; }; + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<__fp16 *>(ptr) = v; +} + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); +} + +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + // Convert BF16 to FP32 for each half of the BF16 vectors + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); + float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); + float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); + float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1])); + + float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0])); + float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0])); + float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1])); + float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1])); + + // Perform FMA on FP32 vectors + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) + }){}; + +inline void prefetch(const void *addr) { + __builtin_prefetch(addr, 0, 1); +} + +template <> +inline void storeFP32(float v, c10::BFloat16 *ptr) { // storeFP32ToT + *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +} + +}; diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 391ac6b9b6b03..dcd8c8eb86d1f 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -5,6 +5,7 @@ from vllm.utils import FlexibleArgumentParser +<<<<<<< HEAD def get_prompts(num_prompts: int): # The default sample prompts. prompts = [ @@ -78,3 +79,15 @@ def main(args): args = parser.parse_args() main(args) +======= +# Create an LLM. +llm = LLM(model="facebook/opt-125m") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
+for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +>>>>>>> 3ff99cf4 (Enabled path for ARM machines) diff --git a/oneDNN b/oneDNN new file mode 160000 index 0000000000000..e2f85e88e93d8 --- /dev/null +++ b/oneDNN @@ -0,0 +1 @@ +Subproject commit e2f85e88e93d87bb17282f03f4d886445668d433 diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 749b03a0603d8..ac20ea15f4aab 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -1,6 +1,13 @@ # Common dependencies -r requirements-common.txt +<<<<<<< HEAD # Dependencies for x86_64 CPUs torch == 2.5.1+cpu; platform_machine != "ppc64le" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +======= +# Dependencies for CPUs +torch==2.4.0+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" +torch==2.4.0; platform_machine == "aarch64" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +>>>>>>> ad80d348 (Enabled path for ARM machines) From 42d7e9d4c44978ea9f1301cb610f41c82104287c Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Thu, 10 Oct 2024 11:16:10 +0530 Subject: [PATCH 02/15] Enabled path for ARM machines Signed-off-by: Sanket Kale --- cmake/cpu_extension.cmake | 9 +++++++++ oneDNN | 1 - 2 files changed, 9 insertions(+), 1 deletion(-) delete mode 160000 oneDNN diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index a764df75746c0..328f618fd2239 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -135,11 +135,20 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") +<<<<<<< HEAD <<<<<<< HEAD list(APPEND LIBS numa) ======= list(APPEND LIBS dnnl numa) >>>>>>> ad80d348 (Enabled path for ARM machines) +======= +list(APPEND LIBS numa) + +# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture. 
+if (AVX2_FOUND OR AVX512_FOUND) + list(APPEND LIBS dnnl) +endif() +>>>>>>> b424c9aa (Enabled path for ARM machines) # # _C extension diff --git a/oneDNN b/oneDNN deleted file mode 160000 index e2f85e88e93d8..0000000000000 --- a/oneDNN +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e2f85e88e93d87bb17282f03f4d886445668d433 From e3c67400442ea7231c7bfb5cf11e4375761e5383 Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Fri, 11 Oct 2024 09:35:08 +0530 Subject: [PATCH 03/15] fixed typos Signed-off-by: Sanket Kale --- cmake/cpu_extension.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 328f618fd2239..d75aa920135c1 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -100,7 +100,7 @@ elseif (ASIMD_FOUND) ) else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+, or ARMv8.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() # @@ -183,4 +183,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") \ No newline at end of file +message(STATUS "Enabling C extension.") From 85df3b5ac6f7b2fda5b89af472ce7c4c5b2ceeac Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Mon, 28 Oct 2024 10:34:11 +0530 Subject: [PATCH 04/15] Added Dockerfile for ARM architecture support Signed-off-by: Sanket Kale --- Dockerfile.arm | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 Dockerfile.arm diff --git a/Dockerfile.arm b/Dockerfile.arm new file mode 100644 index 0000000000000..093ee2209222f --- /dev/null +++ b/Dockerfile.arm @@ -0,0 +1,62 @@ +# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. + +FROM ubuntu:22.04 AS cpu-test-arm + +ENV CCACHE_DIR=/root/.cache/ccache + +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update -y \ + && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores + +# Set LD_PRELOAD for tcmalloc on ARM +ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +WORKDIR /workspace + +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + pip install --upgrade pip && \ + pip install -r requirements-build.txt + +FROM cpu-test-arm AS build + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ + pip install -v -r requirements-cpu.txt + +COPY . . 
+ARG GIT_REPO_CHECK=0
+RUN --mount=type=bind,source=.git,target=.git \
+    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+
+# Disabling AVX512 specific optimizations for ARM
+ARG VLLM_CPU_DISABLE_AVX512="true"
+ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=bind,source=.git,target=.git \
+    VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
+    pip install dist/*.whl && \
+    rm -rf dist
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
\ No newline at end of file

From 56054cbb063c8b693a7e130c591ab2c44f1cb317 Mon Sep 17 00:00:00 2001
From: Sanket Kale
Date: Tue, 5 Nov 2024 10:20:36 +0530
Subject: [PATCH 05/15] Added Documentation for ARM architecture support and updated architecture to native

Signed-off-by: Sanket Kale
---
 cmake/cpu_extension.cmake                     |  2 +-
 .../getting_started/arm-installation.rst      | 50 +++++++++++++++++++
 docs/source/index.rst                         |  4 ++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/getting_started/arm-installation.rst

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index d75aa920135c1..3561cc52e9b33 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -96,7 +96,7 @@ elseif (ASIMD_FOUND)
     message(STATUS "ARMv8 architecture detected")
     list(APPEND CXX_COMPILE_FLAGS
         "-mcpu=native"
-        "-march=armv8.6-a"
+        "-mtune=native"
     )
 
 else()
diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst
new file mode 100644
index 0000000000000..7f7652af60113
--- /dev/null
+++ b/docs/source/getting_started/arm-installation.rst
@@ -0,0 +1,50 @@
+.. _installation_arm:
+
+Installation for ARM CPUs
+=========================
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering:
+
+* CPU backend inference capabilities
+* Relevant runtime environment variables
+* Performance optimization tips
+
+Contents:
+
+1. :ref:`Requirements <arm_backend_requirements>`
+2. :ref:`Quick Start with Dockerfile <arm_backend_quick_start_dockerfile>`
+3. :ref:`Building from Source <build_arm_backend_from_source>`
+
+.. _arm_backend_requirements:
+
+Requirements
+------------
+
+* **Operating System**: Linux or macOS
+* **Docker**: Required if running on macOS
+* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended)
+* **Instruction Set Architecture (ISA)**: NEON support is required
+
+.. _arm_backend_quick_start_dockerfile:
+
+Quick Start with Dockerfile
+---------------------------
+
+You can quickly set up vLLM on ARM using Docker:
+
+.. code-block:: console
+
+   $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g .
+   $ docker run -it \
+                --rm \
+                --network=host \
+                --cpuset-cpus=<cpu-id-list> \
+                --cpuset-mems=<memory-node> \
+                vllm-cpu-env
+
+.. _build_arm_backend_from_source:
+
+Building from Source
+--------------------
+
+To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility.
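A minimal sketch of those from-source steps, mirroring the commands used in Dockerfile.arm earlier in this series; the package list, extra index URL, and exact flags below are illustrative assumptions rather than part of this patch:

.. code-block:: console

   $ sudo apt-get update -y
   $ sudo apt-get install -y gcc-12 g++-12 python3 python3-pip libnuma-dev
   $ pip install --upgrade pip
   $ pip install -r requirements-build.txt
   $ PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu pip install -v -r requirements-cpu.txt
   $ VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && pip install dist/*.whl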
diff --git a/docs/source/index.rst b/docs/source/index.rst index c2afd806c50f9..224a59ed1fc7e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,7 +66,11 @@ Documentation getting_started/amd-installation getting_started/openvino-installation getting_started/cpu-installation +<<<<<<< HEAD getting_started/gaudi-installation +======= + getting_started/arm-installation +>>>>>>> 32a6432f (Added Documenataion for ARM architecture support and updated architecture to native) getting_started/neuron-installation getting_started/tpu-installation getting_started/xpu-installation From 17896d372c7f785fd31a65c5c377484f1772cda0 Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Tue, 5 Nov 2024 10:23:26 +0530 Subject: [PATCH 06/15] Updated documentation Signed-off-by: Sanket Kale --- docs/source/getting_started/arm-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst index 7f7652af60113..4ee817875d535 100644 --- a/docs/source/getting_started/arm-installation.rst +++ b/docs/source/getting_started/arm-installation.rst @@ -23,7 +23,7 @@ Requirements * **Operating System**: Linux or macOS * **Docker**: Required if running on macOS * **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) -* **Instruction Set Architecture (ISA)**: NEON support is required +* **Instruction Set Architecture (ISA)**: NEON support and BF16 extension is required .. _arm_backend_quick_start_dockerfile: From 85a453fa0223947cc521d63aef56547c417e0870 Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Tue, 12 Nov 2024 16:39:22 +0530 Subject: [PATCH 07/15] Added compatibility with macos by creating separate fp32 and bf16 paths Signed-off-by: Sanket Kale --- cmake/cpu_extension.cmake | 19 +++++++++----- csrc/cpu/attention.cpp | 24 ++++++++++------- csrc/cpu/cpu_types_arm.hpp | 54 ++++++++++++++++++++++++++------------ 3 files changed, 63 insertions(+), 34 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 3561cc52e9b33..22b2fe45d4521 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -60,6 +60,7 @@ find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support +find_isa(${CPUINFO} "bf16" BF16_FOUND) # Check for BF16 support if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -93,12 +94,16 @@ elseif (POWER9_FOUND OR POWER10_FOUND) "-mtune=native") elseif (ASIMD_FOUND) - message(STATUS "ARMv8 architecture detected") - list(APPEND CXX_COMPILE_FLAGS - "-mcpu=native" - "-mtune=native" - ) - + message(STATUS "ARMv8 or later architecture detected") + if(BF16_FOUND) + message(STATUS "BF16 extension detected") + set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") + add_compile_definitions(BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") + set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") + endif() + list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) else() message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() @@ -183,4 +188,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e6c03dcb034fd..72f8b6f0fca6d 100644 --- a/csrc/cpu/attention.cpp +++ 
b/csrc/cpu/attention.cpp @@ -51,15 +51,19 @@ struct KernelVecType { using v_load_vec_type = vec_op::BF16Vec16; }; #else -template <> -struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::BF16Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; -}; + #ifndef BF16_SUPPORT + //pass + #else + template <> + struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; + }; + #endif #endif template @@ -779,4 +783,4 @@ void paged_attention_v2( CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) }); -} +} \ No newline at end of file diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index fe87c9b1747c9..44d0fb78896c5 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,14 +1,19 @@ #include -#include +#include #include namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +#ifdef BF16_SUPPORT + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) +#endif #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -76,6 +81,7 @@ struct FP16Vec8 : public Vec { void save(void *ptr) const { vst1q_f16((__fp16 *)ptr, reg); } }; +#ifdef BF16_SUPPORT struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -132,6 +138,7 @@ struct BF16Vec32 : public Vec { void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } }; +#endif struct FP32Vec4 : public Vec { constexpr static int VEC_ELEM_NUM = 4; @@ -177,13 +184,16 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(float32x4x2_t data) : reg(data) {} explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} - + + #ifdef BF16_SUPPORT explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {} explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {} explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {} + #endif + float reduce_sum() const { AliasReg ar; ar.reg = reg; @@ -242,10 +252,10 @@ struct FP32Vec8 : public Vec { ar.reg = reg; // Extract float32x2_t elements for each float32x4_t - float32x2_t er_vec0 = {erf(ar.values[0]), erf(ar.values[1])}; - float32x2_t er_vec1 = {erf(ar.values[2]), erf(ar.values[3])}; - float32x2_t er_vec2 = {erf(ar.values[4]), erf(ar.values[5])}; - float32x2_t er_vec3 = {erf(ar.values[6]), erf(ar.values[7])}; + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; // Combine the results into float32x4_t vectors float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); @@ -257,8 +267,7 @@ struct FP32Vec8 : public Vec { result.val[1] = result1; return FP32Vec8(result); - } - + } FP32Vec8 operator*(const FP32Vec8 &b) const { return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); @@ -301,12 +310,14 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + #ifdef BF16_SUPPORT explicit FP32Vec16(bfloat16x8x2_t v) : reg({ vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1]) }) {} + #endif explicit FP32Vec16(const FP32Vec4 &data) { reg.val[0] = data.reg; @@ -322,16 +333,19 @@ struct FP32Vec16 : public Vec { reg.val[3] = data.reg.val[1]; // Repeating the second half } - + #ifdef BF16_SUPPORT explicit FP32Vec16(const BF16Vec16 &v) : reg({ vcvtq_low_f32_bf16(v.reg.val[0]), vcvtq_high_f32_bf16(v.reg.val[0]), vcvtq_low_f32_bf16(v.reg.val[1]), vcvtq_high_f32_bf16(v.reg.val[1]) }) {} + #endif + #ifdef BF16_SUPPORT explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - + #endif + FP32Vec16 operator+(const FP32Vec16 &b) const { return FP32Vec16(float32x4x4_t({ vaddq_f32(reg.val[0], b.reg.val[0]), @@ -402,9 +416,10 @@ template using vec_t = typename VecType::vec_type; template <> struct VecType { using vec_type = FP32Vec8; }; +#ifdef BF16_SUPPORT template <> struct VecType { using vec_type = FP16Vec8; }; - template <> struct VecType { using vec_type = BF16Vec8; }; +#endif template void storeFP32(float v, T *ptr) { *ptr = v; } @@ -419,6 +434,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], 
a.reg.val[3], b.reg.val[3]); } +#ifdef BF16_SUPPORT inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { // Convert BF16 to FP32 for each half of the BF16 vectors float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); @@ -437,21 +453,25 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); } +#endif +#ifdef BF16_SUPPORT inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; + }){} +#endif inline void prefetch(const void *addr) { __builtin_prefetch(addr, 0, 1); } +#ifdef BF16_SUPPORT template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { // storeFP32ToT *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); } - -}; +#endif +}; \ No newline at end of file From 82e09f8b8cc766e7c5c8d7a3c27449888c19331a Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Wed, 13 Nov 2024 10:10:54 +0530 Subject: [PATCH 08/15] Fixed some missing conditions Signed-off-by: Sanket Kale --- csrc/cpu/attention.cpp | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index 72f8b6f0fca6d..aaa8ec57dfeab 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -51,18 +51,30 @@ struct KernelVecType { using v_load_vec_type = vec_op::BF16Vec16; }; #else - #ifndef BF16_SUPPORT - //pass + #ifdef __aarch64__ + #ifndef BF16_SUPPORT + //pass + #else + template <> + struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; + }; + #endif #else - template <> - struct KernelVecType { - using q_load_vec_type = vec_op::BF16Vec8; - using q_vec_type = vec_op::FP32Vec16; - using k_load_vec_type = vec_op::BF16Vec16; - using k_vec_type = vec_op::FP32Vec16; - using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::BF16Vec16; - }; + template <> + struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; + }; #endif #endif From f7d2182d99110232b25c459b03e72a07bcf0251d Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Wed, 13 Nov 2024 16:22:15 +0530 Subject: [PATCH 09/15] fixed dco test Signed-off-by: Sanket Kale --- csrc/cpu/attention.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index aaa8ec57dfeab..ec982974c7533 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -795,4 +795,4 @@ void paged_attention_v2( CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) }); -} \ No newline at end of file +} From 98538442759185a67b1c213c5c16565ac82a909b Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Thu, 14 Nov 2024 18:02:03 +0530 Subject: [PATCH 10/15] Rebased and resolved merge conflicts Signed-off-by: Sanket Kale --- 
cmake/cpu_extension.cmake | 25 ++++++++++++------------- csrc/cpu/attention.cpp | 4 +++- requirements-cpu.txt | 12 +++--------- 3 files changed, 18 insertions(+), 23 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 22b2fe45d4521..248c04d4b3dd2 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -16,6 +16,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # # Check the compile flags # +<<<<<<< HEAD if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") list(APPEND CXX_COMPILE_FLAGS "-fopenmp" @@ -26,6 +27,17 @@ else() "-mf16c" "-DVLLM_CPU_EXTENSION") endif() +======= + +if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + list(APPEND CXX_COMPILE_FLAGS + "-mf16c" + ) +endif() +list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +>>>>>>> eca86e66 (Rebased and resolved merge conflicts) execute_process(COMMAND cat /proc/cpuinfo RESULT_VARIABLE CPUINFO_RET @@ -140,21 +152,8 @@ endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") -<<<<<<< HEAD -<<<<<<< HEAD -list(APPEND LIBS numa) -======= -list(APPEND LIBS dnnl numa) ->>>>>>> ad80d348 (Enabled path for ARM machines) -======= list(APPEND LIBS numa) -# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture. -if (AVX2_FOUND OR AVX512_FOUND) - list(APPEND LIBS dnnl) -endif() ->>>>>>> b424c9aa (Enabled path for ARM machines) - # # _C extension # diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index ec982974c7533..b5189339f9d48 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -22,6 +22,7 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; +#ifndef __aarch64__ template <> struct KernelVecType { #ifdef __powerpc64__ @@ -39,6 +40,7 @@ struct KernelVecType { using k_vec_type = vec_op::FP32Vec16; using qk_acc_vec_type = vec_op::FP32Vec16; }; +#endif #ifdef __AVX512BF16__ template <> @@ -795,4 +797,4 @@ void paged_attention_v2( CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) }); -} +} \ No newline at end of file diff --git a/requirements-cpu.txt b/requirements-cpu.txt index ac20ea15f4aab..81710ae516ef2 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -1,13 +1,7 @@ # Common dependencies -r requirements-common.txt -<<<<<<< HEAD -# Dependencies for x86_64 CPUs -torch == 2.5.1+cpu; platform_machine != "ppc64le" -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch -======= # Dependencies for CPUs -torch==2.4.0+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" -torch==2.4.0; platform_machine == "aarch64" -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch ->>>>>>> ad80d348 (Enabled path for ARM machines) +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" +torch==2.5.0; platform_machine == "aarch64" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch \ No newline at end of file From 7c941208be570a56694f13152a6911db34b47239 Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Thu, 14 Nov 2024 19:44:11 +0530 Subject: [PATCH 11/15] Updated documentation Signed-off-by: Sanket Kale --- docs/source/getting_started/arm-installation.rst | 4 ++-- docs/source/index.rst | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git 
a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst index 4ee817875d535..b2ad9de82dd47 100644 --- a/docs/source/getting_started/arm-installation.rst +++ b/docs/source/getting_started/arm-installation.rst @@ -9,6 +9,7 @@ vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CP * Relevant runtime environment variables * Performance optimization tips +It currently supports Float32 and BFloat16 datatypes. Contents: 1. :ref:`Requirements ` @@ -21,9 +22,8 @@ Requirements ------------ * **Operating System**: Linux or macOS -* **Docker**: Required if running on macOS * **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) -* **Instruction Set Architecture (ISA)**: NEON support and BF16 extension is required +* **Instruction Set Architecture (ISA)**: NEON support is required .. _arm_backend_quick_start_dockerfile: diff --git a/docs/source/index.rst b/docs/source/index.rst index 224a59ed1fc7e..0692e949f1c77 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -66,11 +66,8 @@ Documentation getting_started/amd-installation getting_started/openvino-installation getting_started/cpu-installation -<<<<<<< HEAD getting_started/gaudi-installation -======= getting_started/arm-installation ->>>>>>> 32a6432f (Added Documenataion for ARM architecture support and updated architecture to native) getting_started/neuron-installation getting_started/tpu-installation getting_started/xpu-installation From 9b2cb73b1162643344f8e7f9ac419a45dab2b07b Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Fri, 15 Nov 2024 15:04:11 +0530 Subject: [PATCH 12/15] Added FP16 compatibility and updated torch version Signed-off-by: Sanket Kale --- csrc/cpu/attention.cpp | 2 - csrc/cpu/cpu_types_arm.hpp | 235 +++++++++++------- .../getting_started/arm-installation.rst | 2 +- requirements-cpu.txt | 2 +- 4 files changed, 141 insertions(+), 100 deletions(-) diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index b5189339f9d48..aaa8ec57dfeab 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -22,7 +22,6 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; -#ifndef __aarch64__ template <> struct KernelVecType { #ifdef __powerpc64__ @@ -40,7 +39,6 @@ struct KernelVecType { using k_vec_type = vec_op::FP32Vec16; using qk_acc_vec_type = vec_op::FP32Vec16; }; -#endif #ifdef __AVX512BF16__ template <> diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index 44d0fb78896c5..811d709f8afe9 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -5,14 +5,15 @@ namespace vec_op { -// FIXME: FP16 is not fully supported in Torch-CPU #ifdef BF16_SUPPORT #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ @@ -30,11 +31,11 @@ namespace vec_op { #define FORCE_INLINE __attribute__((always_inline)) inline namespace { -template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); -} -}; // namespace + template + constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); + }; +}; template >> @@ -43,7 +44,7 @@ constexpr void unroll_loop(F &&f) { } template struct Vec { - constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; struct FP32Vec8; @@ -54,33 +55,54 @@ struct FP16Vec8 : public Vec { float16x8_t reg; - explicit FP16Vec8(__fp16 v) : reg(vmovq_n_f16(v)) {} //all 8 values set to v - - explicit FP16Vec8(const void *ptr) : reg(vld1q_f16((const __fp16 *)ptr)) {} - - explicit FP16Vec8(float16x8_t data) : reg(data) {} - - explicit FP16Vec8(float32x4x2_t v) : reg(vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))) {} - - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(vmulq_f16(reg, b.reg)); - } - - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(vaddq_f16(reg, b.reg)); - } + explicit FP16Vec8(const void *ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(vsubq_f16(reg, b.reg)); - } + explicit FP16Vec8(const FP32Vec8 &); - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(vdivq_f16(reg, b.reg)); + void save(void *ptr) const { + vst1q_f16(static_cast<__fp16 *>(ptr), reg); } +}; - void save(void *ptr) const { vst1q_f16((__fp16 *)ptr, reg); } +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void *ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void *ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void *ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + } + + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + for (int i = 0; i < remainder; ++i) { + reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + } + } + } }; + #ifdef BF16_SUPPORT struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -88,13 +110,13 @@ struct BF16Vec8 : public Vec { bfloat16x8_t reg; explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + : reg(*reinterpret_cast(ptr)) {}; - explicit BF16Vec8(bfloat16x8_t data) : reg(data) {} + explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; explicit BF16Vec8(const FP32Vec8 &); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {} + explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } }; @@ -105,9 +127,9 @@ struct BF16Vec16 : public Vec { bfloat16x8x2_t reg; explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + : reg(*reinterpret_cast(ptr)) {}; - explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {} + explicit 
BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; explicit BF16Vec16(const FP32Vec16 &); @@ -116,7 +138,7 @@ struct BF16Vec16 : public Vec { vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) }){}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; }; struct BF16Vec32 : public Vec { @@ -125,18 +147,18 @@ struct BF16Vec32 : public Vec { bfloat16x8x4_t reg; explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + : reg(*reinterpret_cast(ptr)) {}; - explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {} + explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg - }) {} + }) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; }; #endif @@ -150,20 +172,15 @@ struct FP32Vec4 : public Vec { float32x4_t reg; - // Constructor that initializes all elements with a single float value - explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {} + explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {}; - // Default constructor initializes all elements to zero - explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {} + explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - // Constructor to load from memory - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {} + explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; - // Constructor to initialize directly from float32x4_t data - explicit FP32Vec4(float32x4_t data) : reg(data) {} + explicit FP32Vec4(float32x4_t data) : reg(data) {}; - // Copy constructor - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec { @@ -175,22 +192,28 @@ struct FP32Vec8 : public Vec { float32x4x2_t reg; - explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {}; + + explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; + + explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + + explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {} + explicit FP32Vec8(const FP16Vec8 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float32x4x2_t data) : reg(data) {} + explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} - #ifdef BF16_SUPPORT - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {} - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {} + explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {} + explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; #endif @@ -207,17 +230,14 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; - // Extract float32x2_t elements for each float32x4_t 
     float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])};
     float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])};
     float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])};
     float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])};
 
-    // Combine the results into float32x4_t vectors
     float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1);
     float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3);
 
-    // Initialize the float32x4x2_t object
     float32x4x2_t result;
     result.val[0] = result0;
     result.val[1] = result1;
@@ -229,17 +249,14 @@ struct FP32Vec8 : public Vec {
     AliasReg ar;
     ar.reg = reg;
 
-    // Extract float32x2_t elements for each float32x4_t
     float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])};
     float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])};
     float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])};
     float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])};
 
-    // Combine the results into float32x4_t vectors
     float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1);
     float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3);
 
-    // Initialize the float32x4x2_t object
     float32x4x2_t result;
     result.val[0] = result0;
     result.val[1] = result1;
@@ -251,17 +268,14 @@ struct FP32Vec8 : public Vec {
     AliasReg ar;
     ar.reg = reg;
 
-    // Extract float32x2_t elements for each float32x4_t
     float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))};
     float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))};
     float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))};
     float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))};
 
-    // Combine the results into float32x4_t vectors
     float32x4_t result0 = vcombine_f32(er_vec0, er_vec1);
     float32x4_t result1 = vcombine_f32(er_vec2, er_vec3);
 
-    // Initialize the float32x4x2_t object
     float32x4x2_t result;
     result.val[0] = result0;
     result.val[1] = result1;
@@ -308,15 +322,24 @@ struct FP32Vec16 : public Vec {
 
   explicit FP32Vec16(float32x4x4_t data) : reg(data) {}
 
+  explicit FP32Vec16(const FP32Vec8 &data) {
+    reg.val[0] = data.reg.val[0];
+    reg.val[1] = data.reg.val[1];
+    reg.val[2] = data.reg.val[0];
+    reg.val[3] = data.reg.val[1];
+  }
+
   explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
 
+  explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {}
+
   #ifdef BF16_SUPPORT
   explicit FP32Vec16(bfloat16x8x2_t v) : reg({
     vcvtq_low_f32_bf16(v.val[0]),
     vcvtq_high_f32_bf16(v.val[0]),
     vcvtq_low_f32_bf16(v.val[1]),
     vcvtq_high_f32_bf16(v.val[1])
-  }) {}
+  }) {};
   #endif
 
   explicit FP32Vec16(const FP32Vec4 &data) {
@@ -324,14 +347,7 @@ struct FP32Vec16 : public Vec {
     reg.val[1] = data.reg;
     reg.val[2] = data.reg;
     reg.val[3] = data.reg;
-  }
-
-  explicit FP32Vec16(const FP32Vec8 &data) {
-    reg.val[0] = data.reg.val[0]; // First half (low 128 bits)
-    reg.val[1] = data.reg.val[1]; // Second half (high 128 bits)
-    reg.val[2] = data.reg.val[0]; // Repeating the first half
-    reg.val[3] = data.reg.val[1]; // Repeating the second half
-  }
+  };
 
   #ifdef BF16_SUPPORT
   explicit FP32Vec16(const BF16Vec16 &v) : reg({
@@ -339,20 +355,29 @@ struct FP32Vec16 : public Vec {
     vcvtq_high_f32_bf16(v.reg.val[0]),
     vcvtq_low_f32_bf16(v.reg.val[1]),
     vcvtq_high_f32_bf16(v.reg.val[1])
-  }) {}
-  #endif
+  }) {};
 
-  #ifdef BF16_SUPPORT
-  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {};
   #endif
 
+  explicit FP32Vec16(const FP16Vec16 &v) {
+    reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0]));
+    reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0]));
+    reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1]));
+    reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
+  };
+
+  // #ifdef BF16_SUPPORT
+  // explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+  // #endif
+
   FP32Vec16 operator+(const FP32Vec16 &b) const {
     return FP32Vec16(float32x4x4_t({
       vaddq_f32(reg.val[0], b.reg.val[0]),
       vaddq_f32(reg.val[1], b.reg.val[1]),
       vaddq_f32(reg.val[2], b.reg.val[2]),
       vaddq_f32(reg.val[3], b.reg.val[3])}));
-  }
+  };
 
   FP32Vec16 operator*(const FP32Vec16 &b) const {
     return FP32Vec16(float32x4x4_t({
@@ -360,7 +385,7 @@ struct FP32Vec16 : public Vec {
       vmulq_f32(reg.val[1], b.reg.val[1]),
       vmulq_f32(reg.val[2], b.reg.val[2]),
       vmulq_f32(reg.val[3], b.reg.val[3])}));
-  }
+  };
 
   FP32Vec16 operator-(const FP32Vec16 &b) const {
     return FP32Vec16(float32x4x4_t({
@@ -369,7 +394,7 @@ struct FP32Vec16 : public Vec {
       vsubq_f32(reg.val[2], b.reg.val[2]),
       vsubq_f32(reg.val[3], b.reg.val[3])
     }));
-  }
+  };
 
   FP32Vec16 operator/(const FP32Vec16 &b) const {
     return FP32Vec16(float32x4x4_t({
@@ -378,7 +403,7 @@ struct FP32Vec16 : public Vec {
      vdivq_f32(reg.val[2], b.reg.val[2]),
      vdivq_f32(reg.val[3], b.reg.val[3])
    }));
-  }
+  };
 
   float reduce_sum() const {
     AliasReg ar;
@@ -387,7 +412,7 @@ struct FP32Vec16 : public Vec {
     unroll_loop([&ans, &ar](int i) { ans += ar.values[i]; });
 
     return ans;
-  }
+  };
 
   template float reduce_sub_sum(int idx) {
     static_assert(VEC_ELEM_NUM % group_size == 0);
@@ -400,14 +425,14 @@ struct FP32Vec16 : public Vec {
       [&ans, &start, ar](int i) { ans += ar.values[start + i]; });
 
     return ans;
-  }
+  };
 
   void save(float *ptr) const {
     vst1q_f32(ptr, reg.val[0]);
     vst1q_f32(ptr + 4, reg.val[1]);
     vst1q_f32(ptr + 8, reg.val[2]);
     vst1q_f32(ptr + 12, reg.val[3]);
-  }
+  };
 };
 
 template struct VecType { using vec_type = void; };
@@ -416,8 +441,9 @@ template using vec_t = typename VecType::vec_type;
 
 template <> struct VecType { using vec_type = FP32Vec8; };
 
-#ifdef BF16_SUPPORT
 template <> struct VecType { using vec_type = FP16Vec8; };
+
+#ifdef BF16_SUPPORT
 template <> struct VecType { using vec_type = BF16Vec8; };
 #endif
 
@@ -427,16 +453,34 @@ template <> inline void storeFP32(float v, c10::Half *ptr) {
   *reinterpret_cast<__fp16 *>(ptr) = v;
 }
 
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) {
+  float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]);
+  float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]);
+  float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]);
+  float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]);
+
+  reg.val[0] = vcombine_f16(low_0, high_0);
+  reg.val[1] = vcombine_f16(low_1, high_1);
+};
+
+inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) {
+  float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]);
+  float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]);
+
+  reg = vcombine_f16(lower_half, upper_half);
+};
+
 inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
+
   acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]);
   acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]);
   acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]);
   acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]);
-}
+};
 
 #ifdef BF16_SUPPORT
 inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
-  // Convert BF16 to FP32 for each half of the BF16 vectors
+
   float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
   float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0]));
   float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1]));
@@ -447,12 +491,11 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
   float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1]));
   float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1]));
 
-  // Perform FMA on FP32 vectors
   acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low);
   acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high);
   acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low);
   acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high);
-}
+};
 #endif
 
 #ifdef BF16_SUPPORT
@@ -461,17 +504,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low
 inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({
   vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]),
   vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3])
-  }){}
+  }){};
 #endif
 
 inline void prefetch(const void *addr) {
   __builtin_prefetch(addr, 0, 1);
-}
+};
 
 #ifdef BF16_SUPPORT
 template <>
-inline void storeFP32(float v, c10::BFloat16 *ptr) { // storeFP32ToT
+inline void storeFP32(float v, c10::BFloat16 *ptr) {
   *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v);
-}
+};
 #endif
 };
\ No newline at end of file
diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst
index b2ad9de82dd47..7b457df92c11d 100644
--- a/docs/source/getting_started/arm-installation.rst
+++ b/docs/source/getting_started/arm-installation.rst
@@ -9,7 +9,7 @@ vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CP
 * Relevant runtime environment variables
 * Performance optimization tips
 
-It currently supports Float32 and BFloat16 datatypes.
+ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 
 Contents:
 1. :ref:`Requirements `
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 81710ae516ef2..db8ad9d3a015d 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,5 +3,5 @@
 
 # Dependencies for CPUs
 torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64"
-torch==2.5.0; platform_machine == "aarch64"
+torch==2.5.1; platform_machine == "aarch64"
 torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
\ No newline at end of file

From 64f63d54678789fbe1265bbded8b82b98715c043 Mon Sep 17 00:00:00 2001
From: Sanket Kale
Date: Mon, 18 Nov 2024 12:10:38 +0530
Subject: [PATCH 13/15] Resolved merge conflicts

Signed-off-by: Sanket Kale
---
 examples/offline_inference.py | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index dcd8c8eb86d1f..2c3196ab5caaa 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -5,7 +5,6 @@
 
 from vllm.utils import FlexibleArgumentParser
 
-<<<<<<< HEAD
 def get_prompts(num_prompts: int):
     # The default sample prompts.
     prompts = [
@@ -78,16 +77,4 @@ def main(args):
                         help='top_k for text generation')
 
     args = parser.parse_args()
-    main(args)
-=======
-# Create an LLM.
-llm = LLM(model="facebook/opt-125m")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
->>>>>>> 3ff99cf4 (Enabled path for ARM machines)
+    main(args)
\ No newline at end of file

From dea09ba7fa03ea9d90e37ffab378def44b1c30aa Mon Sep 17 00:00:00 2001
From: Sanket Kale
Date: Mon, 18 Nov 2024 14:51:23 +0530
Subject: [PATCH 14/15] Resolve failed formatting checks

Signed-off-by: Sanket Kale
---
 csrc/cpu/attention.cpp     | 38 +++++++++++++++++++-------------------
 csrc/cpu/cpu_types_arm.hpp | 18 +++++++++---------
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index aaa8ec57dfeab..a2ce4c21b6a50 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -53,28 +53,28 @@ struct KernelVecType {
 #else
 #ifdef __aarch64__
   #ifndef BF16_SUPPORT
-    //pass
+  // pass
   #else
-    template <>
-    struct KernelVecType {
-      using q_load_vec_type = vec_op::BF16Vec8;
-      using q_vec_type = vec_op::FP32Vec16;
-      using k_load_vec_type = vec_op::BF16Vec16;
-      using k_vec_type = vec_op::FP32Vec16;
-      using qk_acc_vec_type = vec_op::FP32Vec16;
-      using v_load_vec_type = vec_op::BF16Vec16;
-    };
+template <>
+struct KernelVecType {
+  using q_load_vec_type = vec_op::BF16Vec8;
+  using q_vec_type = vec_op::FP32Vec16;
+  using k_load_vec_type = vec_op::BF16Vec16;
+  using k_vec_type = vec_op::FP32Vec16;
+  using qk_acc_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
   #endif
 #else
-  template <>
-  struct KernelVecType {
-    using q_load_vec_type = vec_op::BF16Vec8;
-    using q_vec_type = vec_op::FP32Vec16;
-    using k_load_vec_type = vec_op::BF16Vec16;
-    using k_vec_type = vec_op::FP32Vec16;
-    using qk_acc_vec_type = vec_op::FP32Vec16;
-    using v_load_vec_type = vec_op::BF16Vec16;
-  };
+template <>
+struct KernelVecType {
+  using q_load_vec_type = vec_op::BF16Vec8;
+  using q_vec_type = vec_op::FP32Vec16;
+  using k_load_vec_type = vec_op::BF16Vec16;
+  using k_vec_type = vec_op::FP32Vec16;
+  using qk_acc_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::BF16Vec16;
+};
 #endif
 #endif
 
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index 811d709f8afe9..1d29040690702 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -220,10 +220,10 @@ struct FP32Vec8 : public Vec {
   float reduce_sum() const {
     AliasReg ar;
     ar.reg = reg;
-    float ans = 0;
-    unroll_loop([&ans, &ar](int i) { ans += ar.values[i]; });
+    float answer = 0;
+    unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; });
 
-    return ans;
+    return answer;
   }
 
   FP32Vec8 exp() const {
@@ -408,10 +408,10 @@ struct FP32Vec16 : public Vec {
   float reduce_sum() const {
     AliasReg ar;
     ar.reg = reg;
-    float ans = 0;
-    unroll_loop([&ans, &ar](int i) { ans += ar.values[i]; });
+    float answer = 0;
+    unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; });
 
-    return ans;
+    return answer;
   };
 
   template float reduce_sub_sum(int idx) {
@@ -419,12 +419,12 @@ struct FP32Vec16 : public Vec {
     AliasReg ar;
     ar.reg = reg;
-    float ans = 0;
+    float answer = 0;
     const int start = idx * group_size;
     unroll_loop(
-        [&ans, &start, ar](int i) { ans += ar.values[start + i]; });
+        [&answer, &start, ar](int i) { answer += ar.values[start + i]; });
 
-    return ans;
+    return answer;
   };
 
   void save(float *ptr) const {

From f1bf96abe8816a86ec75cf69c63668ccc9238f8d Mon Sep 17 00:00:00 2001
From: Sanket Kale
Date: Wed, 20 Nov 2024 11:12:22 +0530
Subject: [PATCH 15/15] Changed flag name and modified compile flag declaration

Signed-off-by: Sanket Kale
---
 cmake/cpu_extension.cmake  | 21 ++++-----------------
 csrc/cpu/attention.cpp     |  2 +-
 csrc/cpu/cpu_types_arm.hpp | 23 +++++++++--------------
 3 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 248c04d4b3dd2..68f7ca1af05ad 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -16,20 +16,8 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
-<<<<<<< HEAD
-if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
-    list(APPEND CXX_COMPILE_FLAGS
-        "-fopenmp"
-        "-DVLLM_CPU_EXTENSION")
-else()
-    list(APPEND CXX_COMPILE_FLAGS
-        "-fopenmp"
-        "-mf16c"
-        "-DVLLM_CPU_EXTENSION")
-endif()
-=======
-if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
     list(APPEND CXX_COMPILE_FLAGS
         "-mf16c"
     )
@@ -37,7 +25,6 @@ endif()
 list(APPEND CXX_COMPILE_FLAGS
     "-fopenmp"
     "-DVLLM_CPU_EXTENSION")
->>>>>>> eca86e66 (Rebased and resolved merge conflicts)
 
 execute_process(COMMAND cat /proc/cpuinfo
                 RESULT_VARIABLE CPUINFO_RET
@@ -72,7 +59,7 @@ find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
 find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
 find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
 find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
-find_isa(${CPUINFO} "bf16" BF16_FOUND) # Check for BF16 support
+find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
     list(APPEND CXX_COMPILE_FLAGS
@@ -107,10 +94,10 @@ elseif (POWER9_FOUND OR POWER10_FOUND)
 
 elseif (ASIMD_FOUND)
     message(STATUS "ARMv8 or later architecture detected")
-    if(BF16_FOUND)
+    if(ARM_BF16_FOUND)
         message(STATUS "BF16 extension detected")
         set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16")
-        add_compile_definitions(BF16_SUPPORT)
+        add_compile_definitions(ARM_BF16_SUPPORT)
     else()
         message(WARNING "BF16 functionality is not available")
         set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index a2ce4c21b6a50..e21832ba7582f 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -52,7 +52,7 @@ struct KernelVecType {
 };
 #else
 #ifdef __aarch64__
-  #ifndef BF16_SUPPORT
+  #ifndef ARM_BF16_SUPPORT
   // pass
   #else
 template <>
diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp
index 1d29040690702..73e0f8cb2e0fb 100644
--- a/csrc/cpu/cpu_types_arm.hpp
+++ b/csrc/cpu/cpu_types_arm.hpp
@@ -1,11 +1,10 @@
 #include
 #include
-
 #include
 
 namespace vec_op {
 
-#ifdef BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
   AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
@@ -103,7 +102,7 @@ struct FP16Vec16 : public Vec {
 };
 
-#ifdef BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
 struct BF16Vec8 : public Vec {
   constexpr static int VEC_ELEM_NUM = 8;
@@ -209,7 +208,7 @@ struct FP32Vec8 : public Vec {
   explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)),
                                           vcvt_f32_f16(vget_high_f16(v))}) {};
 
-  #ifdef BF16_SUPPORT
+  #ifdef ARM_BF16_SUPPORT
 
   explicit FP32Vec8(bfloat16x8_t v)
       : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {};
@@ -333,7 +332,7 @@ struct FP32Vec16 : public Vec {
 
   explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {}
 
-  #ifdef BF16_SUPPORT
+  #ifdef ARM_BF16_SUPPORT
   explicit FP32Vec16(bfloat16x8x2_t v) : reg({
     vcvtq_low_f32_bf16(v.val[0]),
     vcvtq_high_f32_bf16(v.val[0]),
@@ -349,7 +348,7 @@ struct FP32Vec16 : public Vec {
     reg.val[3] = data.reg;
   };
 
-  #ifdef BF16_SUPPORT
+  #ifdef ARM_BF16_SUPPORT
   explicit FP32Vec16(const BF16Vec16 &v) : reg({
     vcvtq_low_f32_bf16(v.reg.val[0]),
     vcvtq_high_f32_bf16(v.reg.val[0]),
@@ -367,10 +366,6 @@ struct FP32Vec16 : public Vec {
     reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1]));
   };
 
-  // #ifdef BF16_SUPPORT
-  // explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
-  // #endif
-
   FP32Vec16 operator+(const FP32Vec16 &b) const {
     return FP32Vec16(float32x4x4_t({
       vaddq_f32(reg.val[0], b.reg.val[0]),
@@ -443,7 +438,7 @@ template <> struct VecType { using vec_type = FP32Vec8; };
 
 template <> struct VecType { using vec_type = FP16Vec8; };
 
-#ifdef BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
 template <> struct VecType { using vec_type = BF16Vec8; };
 #endif
@@ -478,7 +473,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
   acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]);
 };
 
-#ifdef BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
 inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
 
   float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0]));
@@ -498,7 +493,7 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
 };
 #endif
 
-#ifdef BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
 inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {};
 
 inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({
@@ -511,7 +506,7 @@ inline void prefetch(const void *addr) {
   __builtin_prefetch(addr, 0, 1);
 };
 
-#ifdef BF16_SUPPORT
+#ifdef ARM_BF16_SUPPORT
 
 template <>
 inline void storeFP32(float v, c10::BFloat16 *ptr) {
   *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v);