Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm port #1009

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,9 @@
[submodule "src/3rd_party/simple-websocket-server"]
path = src/3rd_party/simple-websocket-server
url = https://github.com/marian-nmt/Simple-WebSocket-Server
[submodule "src/3rd_party/ruy"]
path = src/3rd_party/ruy
url = https://github.com/google/ruy.git
[submodule "src/3rd_party/simd_utils"]
path = src/3rd_party/simd_utils
url = https://github.com/JishinMaster/simd_utils.git
60 changes: 54 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,22 @@
endif ()

project(marian CXX C)

######### ARCH DETECTION #########
# Architecture detection
include(TargetArch)

target_architecture(CMAKE_TARGET_ARCHITECTURES)
list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len)
if(NOT "${cmake_target_arch_len}" STREQUAL "1")
set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE)
set(CMAKE_TARGET_ARCHITECTURE_CODE "universal")
else()
set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE)
set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}")
endif()
######### ARCH DETECTION #########

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(BUILD_ARCH native CACHE STRING "Compile for this CPU architecture.")
Expand Down Expand Up @@ -80,6 +96,38 @@
set(CMAKE_BUILD_TYPE "Release")
endif()

# ARM bits
if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
# Define that we are using ARM
add_compile_definitions(ARM)
set(ARM ON)
option(USE_RUY "Use Ruy" ON) # For 8 bit code, later on
set(EXT_LIBS ${EXT_LIBS} ruy)

# Apple M1 has Apple Accelerate. Otherwise fallback to RUY
if(APPLE)
option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF)
else(APPLE)
option(USE_RUY_SGEMM "Compile with Ruy SGEMM" ON)
endif(APPLE)

set(USE_SIMD_UTILS ON)

# Some warnings as errors. I don't feel comfortable about the strict aliasing.
set(ARM_WARNINGS "-fno-strict-aliasing -Wno-comment")

set(USE_SIMD_UTILS ON)
# @TODO this assumes ArmV8. We should also look at armv7
add_compile_definitions(ARM FMA SSE) #added for ARM
if(MSVC)
add_compile_options(/flax-vector-conversions)
else(MSVC)
add_compile_options(-flax-vector-conversions)
endif(MSVC)

endif(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")


###############################################################################
# Set compilation flags
if(MSVC)
Expand Down Expand Up @@ -221,7 +269,7 @@
# Clang-10.0.0 complains when CUDA is newer than 10.1
set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version")
endif()
set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA}")
set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${ARM_WARNINGS}")

# These are used in src/CMakeLists.txt on a per-target basis
list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
Expand All @@ -241,19 +289,19 @@
endif(CMAKE_COMPILER_IS_GNUCC)

set(CMAKE_CXX_FLAGS "-std=c++11 -pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_CXX_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG")
set(CMAKE_CXX_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELEASE}")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg")
set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")

# these need to be set separately
set(CMAKE_C_FLAGS "-pthread ${CMAKE_GCC_FLAGS} -fPIC ${DISABLE_GLOBALLY} -march=${BUILD_ARCH} ${INTRINSICS}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -m64 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_RELEASE "-O3 -funroll-loops -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_DEBUG "-O0 -g ${CMAKE_RDYNAMIC_FLAG}")
set(CMAKE_C_FLAGS_SLIM "-O3 -m64 -funroll-loops -DNDEBUG")
set(CMAKE_C_FLAGS_SLIM "-O3 -funroll-loops -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELEASE}")
set(CMAKE_C_FLAGS_PROFILE "${CMAKE_C_FLAGS_RELEASE} -pg")
set(CMAKE_C_FLAGS_PROFGEN "${CMAKE_C_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
Expand Down Expand Up @@ -461,7 +509,7 @@
endif(CUDA_FOUND)

else(COMPILE_CUDA)
message(WARNING "COMPILE_CUDA=off : Building only CPU version")

Check warning on line 512 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU-only

COMPILE_CUDA=off : Building only CPU version
endif(COMPILE_CUDA)

# TODO: make compatible with older CUDA versions
Expand Down Expand Up @@ -511,7 +559,7 @@
###############################################################################
# Find BLAS library
if(COMPILE_CPU)
if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
set(EXT_LIBS ${EXT_LIBS} intgemm) # Enable intgemm when compiling CPU
add_definitions(-DCOMPILE_CPU=1)
endif()
Expand Down Expand Up @@ -580,7 +628,7 @@
endif()

if(DETERMINISTIC)
message(WARNING "Option DETERMINISTIC=ON: Trying to make training as deterministic as possible, may result in slow-down")

Check warning on line 631 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU-only

Option DETERMINISTIC=ON: Trying to make training as deterministic as

Check warning on line 631 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU+CUDA

Option DETERMINISTIC=ON: Trying to make training as deterministic as

Check warning on line 631 in CMakeLists.txt

View workflow job for this annotation

GitHub Actions / Windows CPU+CUDA

Option DETERMINISTIC=ON: Trying to make training as deterministic as
add_definitions(-DDETERMINISTIC=1)
list(APPEND CUDA_NVCC_FLAGS -DDETERMINISTIC=1; )
else()
Expand Down
142 changes: 142 additions & 0 deletions cmake/TargetArch.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO. This file needs to be removed and detecting ARM needs to be done in a different way. The variables that this file sets break the double M1/x86_64 compilation that is supposed to be done on modern OSX.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are the offending variables in this file?

# Based on the Qt 5 processor detection code, so should be very accurate
# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h
# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64)

# Regarding POWER/PowerPC, just as is noted in the Qt source,
# "There are many more known variants/revisions that we do not handle/detect."

set(archdetect_c_code "
#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__)
#if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\
|| defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\
|| defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\
|| defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8)
#error cmake_ARCH armv8
#elif defined(__ARM_ARCH_7__) \\
|| defined(__ARM_ARCH_7A__) \\
|| defined(__ARM_ARCH_7R__) \\
|| defined(__ARM_ARCH_7M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7)
#error cmake_ARCH armv7
#elif defined(__ARM_ARCH_6__) \\
|| defined(__ARM_ARCH_6J__) \\
|| defined(__ARM_ARCH_6T2__) \\
|| defined(__ARM_ARCH_6Z__) \\
|| defined(__ARM_ARCH_6K__) \\
|| defined(__ARM_ARCH_6ZK__) \\
|| defined(__ARM_ARCH_6M__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6)
#error cmake_ARCH armv6
#elif defined(__ARM_ARCH_5TEJ__) \\
|| (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5)
#error cmake_ARCH armv5
#else
#error cmake_ARCH arm
#endif
#elif defined(__i386) || defined(__i386__) || defined(_M_IX86)
#error cmake_ARCH i386
#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64)
#error cmake_ARCH x86_64
#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
#error cmake_ARCH ia64
#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\
|| defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\
|| defined(_M_MPPC) || defined(_M_PPC)
#if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__)
#error cmake_ARCH ppc64
#else
#error cmake_ARCH ppc
#endif
#endif

#error cmake_ARCH unknown
")


# Set ppc_support to TRUE before including this file or ppc and ppc64
# will be treated as invalid architectures since they are no longer supported by Apple

function(target_architecture output_var)
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
# On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set
# First let's normalize the order of the values

# Note that it's not possible to compile PowerPC applications if you are using
# the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we
# disable it by default
# See this page for more information:
# http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4

# Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime.
# On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise.

foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES})
if("${osx_arch}" STREQUAL "ppc" AND ppc_support)
set(osx_arch_ppc TRUE)
elseif("${osx_arch}" STREQUAL "i386")
set(osx_arch_i386 TRUE)
elseif("${osx_arch}" STREQUAL "x86_64")
set(osx_arch_x86_64 TRUE)
elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support)
set(osx_arch_ppc64 TRUE)
else()
message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}")
endif()
endforeach()

# Now add all the architectures in our normalized order
if(osx_arch_ppc)
list(APPEND ARCH ppc)
endif()

if(osx_arch_i386)
list(APPEND ARCH i386)
endif()

if(osx_arch_x86_64)
list(APPEND ARCH x86_64)
endif()

if(osx_arch_ppc64)
list(APPEND ARCH ppc64)
endif()
else()
file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}")

enable_language(C)

# Detect the architecture in a rather creative way...
# This compiles a small C program which is a series of ifdefs that selects a
# particular #error preprocessor directive whose message string contains the
# target architecture. The program will always fail to compile (both because
# file is not a valid C program, and obviously because of the presence of the
# #error preprocessor directives... but by exploiting the preprocessor in this
# way, we can detect the correct target architecture even when cross-compiling,
# since the program itself never needs to be run (only the compiler/preprocessor)
try_run(
run_result_unused
compile_result_unused
"${CMAKE_BINARY_DIR}"
"${CMAKE_BINARY_DIR}/arch.c"
COMPILE_OUTPUT_VARIABLE ARCH
CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES}
)

# Parse the architecture name from the compiler output
string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}")

# Get rid of the value marker leaving just the architecture name
string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}")

# If we are compiling with an unknown architecture this variable should
# already be set to "unknown" but in the case that it's empty (i.e. due
# to a typo in the code), then set it to unknown
if (NOT ARCH)
set(ARCH unknown)
endif()
endif()

set(${output_var} "${ARCH}" PARENT_SCOPE)
endfunction()
12 changes: 11 additions & 1 deletion src/3rd_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,20 @@ add_subdirectory(./faiss)
include_directories(./faiss)

if(COMPILE_CPU)
if(NOT GENERATE_MARIAN_INSTALL_TARGETS)
if(NOT GENERATE_MARIAN_INSTALL_TARGETS AND NOT ARM)
set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
add_subdirectory(./intgemm)
endif()

if(USE_RUY)
set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE)
set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE)
set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE)
set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE)
set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE)
add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL)
add_subdirectory(ruy EXCLUDE_FROM_ALL)
endif(USE_RUY)
endif(COMPILE_CPU)

if(USE_FBGEMM)
Expand Down
4 changes: 4 additions & 0 deletions src/3rd_party/faiss/VectorTransform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@

using namespace faiss;

#ifdef ARM
#include "3rd_party/simd_utils/simd_utils.h"
#endif


extern "C" {

Expand Down
1 change: 1 addition & 0 deletions src/3rd_party/ruy
Submodule ruy added at c04e5e
1 change: 1 addition & 0 deletions src/3rd_party/simd_utils
Submodule simd_utils added at 696036
6 changes: 5 additions & 1 deletion src/common/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
#include <type_traits>

#ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
#include <immintrin.h>
#ifndef ARM
#include <immintrin.h>
#else
#include "3rd_party/simd_utils/simd_utils.h"
#endif
#endif

#ifdef __CUDACC__ // nvcc is compiling this code
Expand Down
5 changes: 4 additions & 1 deletion src/functional/operators.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,11 @@ struct Ops<double> {
// __CUDACC__ is defined when compiling with NVCC regardless of device type
// __CUDA_ARCH__ is defined when compiling device (GPU) code
#ifndef __CUDACC__

#ifndef ARM
#include "3rd_party/sse_mathfun.h"
#else
#include "3rd_party/simd_utils/simd_utils.h" // @TODO this might be dependent on NEON
#endif

namespace marian {
namespace functional {
Expand Down
2 changes: 1 addition & 1 deletion src/tensors/cpu/expression_graph_packable.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
#endif
} else if (isIntgemm(gemmElementType) &&
(pName.find("_W") == pName.length() - 3 || pName.find("_W") == pName.length() - 2 /* || pName.find("Wemb") != std::string::npos*/)) {
#if COMPILE_CPU
#if COMPILE_CPU && !defined(ARM)
using cpu::integer::cols;
using cpu::integer::rows;
auto allocator = New<TensorAllocator>(getBackend());
Expand Down
8 changes: 4 additions & 4 deletions src/tensors/cpu/fbgemm/packed_gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
#include "tensors/tensor_allocator.h"
#include "tensors/tensor_operators.h"

#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#include <cassert>
#include <cstddef>
#include <unordered_map>
//#include <chrono>

#if USE_FBGEMM
#include <emmintrin.h>
#include <immintrin.h>
#include <tmmintrin.h>
#include <xmmintrin.h>
#ifdef _MSC_VER
#pragma warning(disable: 4505) // 'fbgemmAlignedAlloc' in fbgemm.h: unreferenced local function has been removed (missing 'static inline')
#pragma warning(disable: 4251) // 'fbgemm::CompressedSparseColumn::colptr_': class 'std::vector<int,std::allocator<_Ty>>' needs to have dll-interface to be used by clients of class 'fbgemm::CompressedSparseColumn'
Expand Down
Loading
Loading