diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
index b81caa0513691b..4aa5dc39d0f5f5 100755
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@@ -44,11 +44,6 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
   fi
 fi
 
-if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
-  echo "Caffe2 build is ON"
-  export BUILD_CAFFE2=ON
-fi
-
 if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
   export ATEN_THREADING=TBB
   export USE_TBB=1
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79db67e7357b54..f0944733dda87f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,11 +181,7 @@ option(BUILD_BINARY "Build C++ binaries" OFF)
 option(BUILD_DOCS "Build Caffe2 documentation" OFF)
 option(BUILD_CUSTOM_PROTOBUF "Build and use Caffe2's own protobuf under third_party" ON)
 option(BUILD_PYTHON "Build Python binaries" ON)
-option(BUILD_CAFFE2 "Master flag to build Caffe2" OFF)
 option(BUILD_LITE_INTERPRETER "Master flag to build Lite Interpreter" OFF)
-cmake_dependent_option(
-    BUILD_CAFFE2_OPS "Build Caffe2 operators" ON
-    "BUILD_CAFFE2" OFF)
 option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
 cmake_dependent_option(
     CAFFE2_LINK_LOCAL_PROTOBUF "If set, build protobuf inside libcaffe2.so." ON
@@ -635,7 +631,6 @@ if(INTERN_BUILD_MOBILE)
   endif()
   set(BUILD_PYTHON OFF)
   set(BUILD_FUNCTORCH OFF)
-  set(BUILD_CAFFE2_OPS OFF)
   set(USE_DISTRIBUTED OFF)
   set(NO_API ON)
   set(USE_FBGEMM OFF)
@@ -1208,13 +1203,6 @@ else()
     "shared libs.")
 endif()
 
-# ---[ Modules
-# If master flag for buildling Caffe2 is disabled, we also disable the
-# build for Caffe2 related operator modules.
-if(BUILD_CAFFE2)
-  add_subdirectory(modules)
-endif()
-
 # ---[ Binaries
 # Binaries will be built after the Caffe2 main libraries and the modules
 # are built. For the binaries, they will be linked to the Caffe2 main
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e665e3fb8bbf60..a37c5a3b405db6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -667,7 +667,6 @@ only interested in a specific component.
 - Working on a test binary? Run `(cd build && ninja bin/test_binary_name)` to
   rebuild only that test binary (without rerunning cmake). (Replace `ninja`
   with `make` if you don't have ninja installed).
-- Don't need Caffe2? Pass `BUILD_CAFFE2=0` to disable Caffe2 build.
 
 On the initial build, you can also speed things up with the environment
 variables `DEBUG`, `USE_DISTRIBUTED`, `USE_MKLDNN`, `USE_CUDA`, `USE_FLASH_ATTENTION`, `USE_MEM_EFF_ATTENTION`, `BUILD_TEST`, `USE_FBGEMM`, `USE_NNPACK` and `USE_QNNPACK`.
@@ -1196,7 +1195,7 @@ build_with_asan()
   LDFLAGS="-stdlib=libstdc++" \
   CFLAGS="-fsanitize=address -fno-sanitize-recover=all -shared-libasan -pthread" \
   CXX_FLAGS="-pthread" \
-  USE_CUDA=0 USE_OPENMP=0 BUILD_CAFFE2_OPS=0 USE_DISTRIBUTED=0 DEBUG=1 \
+  USE_CUDA=0 USE_OPENMP=0 USE_DISTRIBUTED=0 DEBUG=1 \
   python setup.py develop
 }
diff --git a/README.md b/README.md
index 3ff42586109c38..8e4187e4d97196 100644
--- a/README.md
+++ b/README.md
@@ -379,7 +379,7 @@ You can also pass the `CMAKE_VARS="..."` environment variable to specify additio
 See [setup.py](./setup.py) for the list of available variables.
 
 ```bash
-CMAKE_VARS="BUILD_CAFFE2=ON BUILD_CAFFE2_OPS=ON" make -f docker.Makefile
+make -f docker.Makefile
 ```
 
 ### Building the Documentation
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 583662e6c63d08..9ec458fda45e4c 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -54,7 +54,7 @@ if(NOT BUILD_LITE_INTERPRETER)
 endif()
 EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
 # Exclude TensorImpl_test.cpp if compiling without Caffe2
-if(NOT BUILD_CAFFE2 AND NOT BUILD_LITE_INTERPRETER)
+if(NOT BUILD_LITE_INTERPRETER)
   file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp")
   EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
 endif()
diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index a1a6249414dea8..09579b2367206a 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -123,16 +123,6 @@ list(APPEND ATen_XPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/xpu_generator_test.cpp
 )
 
-# Caffe2 specific tests
-if(BUILD_CAFFE2)
-  list(APPEND ATen_CPU_TEST_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/ExclusivelyOwned_test.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/tensor_interop_test.cpp)
-  list(APPEND ATen_CUDA_TEST_SRCS
-    ${CMAKE_CURRENT_SOURCE_DIR}/cuda_tensor_interop_test.cpp)
-endif()
-
-
 # ---[ Send the lists to the parent scope.
 set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE)
 set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE)
diff --git a/binaries/CMakeLists.txt b/binaries/CMakeLists.txt
index 70b235e43e7d73..273353128baafa 100644
--- a/binaries/CMakeLists.txt
+++ b/binaries/CMakeLists.txt
@@ -7,16 +7,6 @@ if(INTERN_BUILD_MOBILE)
   return()
 endif()
 
-if(BUILD_CAFFE2)
-  caffe2_binary_target("at_launch_benchmark.cc")
-  target_include_directories(at_launch_benchmark PUBLIC
-    ${CMAKE_BINARY_DIR}/aten/src)
-
-  caffe2_binary_target("intra_inter_benchmark.cc")
-  target_include_directories(intra_inter_benchmark PUBLIC
-    ${CMAKE_BINARY_DIR}/aten/src)
-endif()
-
 caffe2_binary_target("parallel_info.cc")
 target_include_directories(parallel_info PUBLIC
   ${CMAKE_BINARY_DIR}/aten/src) # provides "ATen/TypeExtendedInterface.h" to ATen.h
diff --git a/buckbuild.bzl b/buckbuild.bzl
index 89707dd9bc3f07..4c4fc9a89a280d 100644
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@@ -279,7 +279,6 @@ def get_pt_preprocessor_flags():
         "-D_THP_CORE",
         "-DUSE_SCALARS",
         "-DNO_CUDNN_DESTROY_HANDLE",
-        "-DBUILD_CAFFE2",
     ]
 
     if _is_build_mode_dev():
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 13282063d90780..bd2588b5aef356 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -110,21 +110,11 @@ endif()
 add_subdirectory(core)
 add_subdirectory(serialize)
 add_subdirectory(utils)
-if(BUILD_CAFFE2 OR (NOT USE_FBGEMM))
+if(NOT USE_FBGEMM)
   add_subdirectory(perfkernels)
 endif()
 
-# Skip modules that are not used by libtorch mobile yet.
-if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
-  add_subdirectory(core/nomnigraph)
-  if(USE_NVRTC)
-    add_subdirectory(cuda_rtc)
-  endif()
-  if(BUILD_CAFFE2_OPS)
-  endif()
-  add_subdirectory(proto)
-endif()
-if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
+if(NOT INTERN_BUILD_MOBILE)
   add_subdirectory(proto)
 endif()
@@ -585,17 +575,10 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
     ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
   )
 
-  # Disable legacy import of building without Caffe2 support
-  if(BUILD_CAFFE2)
-    list(APPEND TORCH_SRCS
-      ${TORCH_SRC_DIR}/csrc/jit/serialization/import_legacy.cpp
-    )
-  else()
-    set_source_files_properties(
-      ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
-      PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
-    )
-  endif()
+  set_source_files_properties(
+    ${TORCH_SRC_DIR}/csrc/jit/serialization/import.cpp
+    PROPERTIES COMPILE_FLAGS "-DC10_DISABLE_LEGACY_IMPORT"
+  )
   if(USE_DISTRIBUTED)
     append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
     if(NOT WIN32)
@@ -809,11 +792,6 @@ if(HAVE_SOVERSION)
       VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
 endif()
 torch_compile_options(torch_cpu)  # see cmake/public/utils.cmake
-if(BUILD_CAFFE2 AND NOT MSVC)
-  # Caffe2 has too many signed-unsigned violation, but the framework is dead
-  # So no point in fixing those
-  target_compile_options(torch_cpu PRIVATE "-Wno-sign-compare")
-endif()
 
 # Ignore Wdeprecated-XXX errors from third-party libraries
 if(NOT MSVC)
@@ -1921,14 +1899,6 @@ if(BUILD_TEST)
         endif()
       endif()
     endforeach()
   endif()
-
-  # For special tests that explicitly uses dependencies, we add them here
-  if(BUILD_CAFFE2 AND USE_MPI)
-    target_link_libraries(mpi_test MPI::MPI_CXX)
-    if(USE_CUDA)
-      target_link_libraries(mpi_gpu_test MPI::MPI_CXX)
-    endif()
-  endif()
 endif()
 if(MSVC)
@@ -1998,11 +1968,6 @@ if(BUILD_PYTHON)
     set_source_files_properties(${TORCH_SRC_DIR}/../caffe2/operators/box_with_nms_limit_op.cc PROPERTIES COMPILE_FLAGS -Wno-attributes)
   endif()
 
-  # ---[ Python.
-  if(BUILD_CAFFE2)
-    target_compile_definitions(torch PRIVATE BUILD_CAFFE2)
-  endif()
-
   # generated pb files are copied from build/caffe2 to caffe2
   # if we copied them back to build this would create a build cycle
   # consider removing the need for globs
diff --git a/caffe2/__init__.py b/caffe2/__init__.py
index 4096a98283857e..f319e8e2dc15b9 100644
--- a/caffe2/__init__.py
+++ b/caffe2/__init__.py
@@ -2,5 +2,4 @@
 from torch.onnx import _CAFFE2_ATEN_FALLBACK
 
 if not _CAFFE2_ATEN_FALLBACK:
-    warnings.warn("Caffe2 support is not fully enabled in this PyTorch build. "
" - "Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.") + warnings.warn("Caffe2 support is no longer present in PyTorch.") diff --git a/caffe2/core/CMakeLists.txt b/caffe2/core/CMakeLists.txt index f59c0e703edf78..371d2216b50eab 100644 --- a/caffe2/core/CMakeLists.txt +++ b/caffe2/core/CMakeLists.txt @@ -1,68 +1,4 @@ -if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE) - list(APPEND Caffe2_CPU_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/common.cc" - ) - set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE) - return() -endif() - -# ---[ GPU files -# ------[ cuDNN -if(USE_CUDNN) - file(GLOB tmp *_cudnn.cc) - set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -endif() -# ------[ general GPU -file(GLOB tmp *_gpu.cc) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -# ------[ CUDA sources -file(GLOB tmp *.cu) -set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${tmp}) -# exclude test files -file(GLOB tmp *_test.cc) -exclude(Caffe2_GPU_SRCS "${Caffe2_GPU_SRCS}" ${tmp}) - -# ---[ general HIP files -file(GLOB tmp hip/*.cc) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) -# ------[ HIP sources -file(GLOB tmp hip/*.hip) -set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${tmp}) -# exclude test files -file(GLOB tmp hip/*_test.cc) -exclude(Caffe2_HIP_SRCS "${Caffe2_HIP_SRCS}" ${tmp}) - -# ---[ CPU files. -file(GLOB tmp *.cc) -# Manually remove the cudnn files since we might be using USE_CUDNN=OFF -# TODO: when we move to explicit file list, this would not be needed. -file(GLOB tmp_cudnn *_cudnn.cc) -exclude(tmp "${tmp}" ${tmp_cudnn}) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) -# exclude test files and gpu files -file(GLOB tmp *_test.cc) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp}) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS}) -exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_HIP_SRCS}) - -# ---[ GPU test files -file(GLOB tmp *_gpu_test.cc) -set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} ${tmp}) - -# ---[ HIP test files -file(GLOB tmp hip/*_test.cc) -set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} ${tmp}) - -# ---[ CPU test files -file(GLOB tmp *_test.cc) -set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp}) -exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_GPU_TEST_SRCS}) -exclude(Caffe2_CPU_TEST_SRCS "${Caffe2_CPU_TEST_SRCS}" ${Caffe2_HIP_TEST_SRCS}) - -# ---[ Send the lists to the parent scope. 
+list(APPEND Caffe2_CPU_SRCS
+  "${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
+)
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
-set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
-set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
-set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
-set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
-set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
diff --git a/caffe2/core/blob_serialization.cc b/caffe2/core/blob_serialization.cc
index 5cd01ba7cc59c9..fca3e63f72182e 100644
--- a/caffe2/core/blob_serialization.cc
+++ b/caffe2/core/blob_serialization.cc
@@ -1,5 +1,6 @@
 #include "caffe2/core/blob_serialization.h"
 
+#include <limits>
 #include <mutex>
 #include <sstream>
 #include <utility>
@@ -83,8 +84,7 @@ Range<char*> GetMutableTensorDataRange(
     size_t start,
     size_t numElements) {
   CAFFE_ENFORCE(
-      // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-      start + numElements <= tensor.numel(),
+      static_cast<int64_t>(start + numElements) <= tensor.numel(),
       "Requested invalid mutable tensor range [",
       start,
       ", ",
@@ -100,8 +100,7 @@ c10::ArrayRef<char> GetTensorDataRange(
     size_t start,
     size_t numElements) {
   CAFFE_ENFORCE(
-      // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-      start + numElements <= tensor.numel(),
+      static_cast<int64_t>(start + numElements) <= tensor.numel(),
       "Requested invalid tensor range [",
       start,
       ", ",
@@ -390,8 +389,7 @@ void TensorSerializer::SerializeWithOptions(
   // Poorman's IOBound ThreadPool
   SimpleQueue<size_t> chunkQueue;
   auto task = [&]() {
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    size_t chunkStart;
+    size_t chunkStart = std::numeric_limits<size_t>::max();
     while (chunkQueue.Pop(&chunkStart)) {
       processChunk(chunkStart);
     }
@@ -409,8 +407,7 @@ void TensorSerializer::SerializeWithOptions(
   VLOG(1) << "Serializing blob " << name;
   // Serialize whole vector. If vector is empty, it's shape still needs to be
   // serialized in empty proto
-  for (size_t chunkBegin = 0;
-       // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
+  for (int64_t chunkBegin = 0;
        chunkBegin < std::max(tensor.numel(), static_cast<int64_t>(1));
        chunkBegin += chunk_size) {
     VLOG(2) << "Starting a chunk at " << chunkBegin;
@@ -582,8 +579,7 @@ void SerializeTensorData(const SerializeParams<float>& params) {
       BlobSerializationOptions_FloatFormat_FLOAT_BFLOAT16) {
     // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
     std::unique_ptr<float[]> tmp_buffer;
-    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-    const float* src;
+    const float* src = nullptr;
     if (params.context.device() == CPU) {
       src = params.input.data<float>();
     } else {
@@ -653,14 +649,12 @@ void TensorSerializer::Serialize(
     size_t chunkBegin,
     int32_t chunkSize) {
   CAFFE_ENFORCE(
-      // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-      chunkBegin <= input.numel(),
+      static_cast<int64_t>(chunkBegin) <= input.numel(),
       "Chunk begin is out of tensor: ",
       chunkBegin,
      ' ',
      input.numel());
-  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
-  if (chunkBegin + chunkSize > input.numel()) {
+  if (static_cast<int64_t>(chunkBegin + chunkSize) > input.numel()) {
     chunkSize = input.numel() - chunkBegin;
   }
@@ -1029,8 +1023,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
       params.tensor_proto.raw_data().data());
   // If we are on a big-endian machine, byte-swap the serialized data.
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  const fbgemm::bfloat16* src;
+  const fbgemm::bfloat16* src = nullptr;
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
   std::unique_ptr<fbgemm::bfloat16[]> bswap_buffer;
   if (kIsLittleEndian) {
@@ -1045,8 +1038,7 @@ DESERIALIZE_IMPL(float, FMT_BFLOAT16) {
   // bfloat16 to float conversion.
   // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
   std::unique_ptr<float[]> tmp_buffer;
-  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
-  float* dest;
+  float* dest = nullptr;
   if (params.context.device() == CPU) {
     dest = params.dest.data<float>();
   } else {
diff --git a/caffe2/proto/CMakeLists.txt b/caffe2/proto/CMakeLists.txt
index ba6b696dde4ba4..bdbc045afb3d77 100644
--- a/caffe2/proto/CMakeLists.txt
+++ b/caffe2/proto/CMakeLists.txt
@@ -1,8 +1,4 @@
-if(BUILD_CAFFE2)
-  file(GLOB Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.proto")
-else()
-  set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
-endif()
+set(Caffe2_PROTOBUF_FILES "${CMAKE_CURRENT_SOURCE_DIR}/torch.proto;${CMAKE_CURRENT_SOURCE_DIR}/caffe2.proto")
 
 caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES})
diff --git a/caffe2/proto/__init__.py b/caffe2/proto/__init__.py
index ce54a1aee5745b..c40ca97189d1bb 100644
--- a/caffe2/proto/__init__.py
+++ b/caffe2/proto/__init__.py
@@ -14,8 +14,7 @@
 try:
     from caffe2.proto import caffe2_pb2, metanet_pb2, torch_pb2
 except ImportError:
-    warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
-                  'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
+    warnings.warn('Caffe2 support is no longer present in PyTorch.')
     raise
 
 try:
diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py
index 888d286458a3a3..1e44baf28153ff 100644
--- a/caffe2/python/__init__.py
+++ b/caffe2/python/__init__.py
@@ -6,8 +6,7 @@
 try:
     from caffe2.proto import caffe2_pb2
 except ImportError:
-    warnings.warn('Caffe2 support is not enabled in this PyTorch build. '
-                  'Please enable Caffe2 by building PyTorch from source with `BUILD_CAFFE2=1` flag.')
+    warnings.warn('Caffe2 support is no longer present in PyTorch.')
     raise
 
 # TODO: refactor & remove the following alias
diff --git a/caffe2/utils/CMakeLists.txt b/caffe2/utils/CMakeLists.txt
index 181d164d81327f..e168eb595feb24 100644
--- a/caffe2/utils/CMakeLists.txt
+++ b/caffe2/utils/CMakeLists.txt
@@ -1,100 +1,18 @@
-if(NOT BUILD_CAFFE2 OR INTERN_BUILD_MOBILE)
-  list(APPEND Caffe2_CPU_SRCS
-    utils/string_utils.cc
-    utils/threadpool/ThreadPool.cc
-  )
-
-  if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
-    list(APPEND Caffe2_CPU_SRCS
-      utils/threadpool/pthreadpool-cpp.cc
-      utils/threadpool/thread_pool_guard.cpp
-    )
-  endif()
-
-  if(NOT BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
-    list(APPEND Caffe2_CPU_SRCS
-      utils/proto_wrap.cc
-    )
-  endif()
-  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
-  return()
-endif()
-
 list(APPEND Caffe2_CPU_SRCS
-  utils/bench_utils.cc
-  utils/cpuid.cc
-  utils/math/broadcast.cc
-  utils/math/elementwise.cc
-  utils/math/reduce.cc
-  utils/math/transpose.cc
-  utils/math/utils.cc
-  utils/math_cpu.cc
-  utils/murmur_hash3.cc
-  utils/proto_utils.cc
-  utils/proto_wrap.cc
+  utils/string_utils.cc
   utils/threadpool/ThreadPool.cc
-  utils/signal_handler.cc
-  utils/smart_tensor_printer.cc
-  utils/string_utils.cc)
+)
 
-if(USE_PTHREADPOOL)
+if(USE_PTHREADPOOL AND NOT USE_INTERNAL_PTHREADPOOL_IMPL)
   list(APPEND Caffe2_CPU_SRCS
     utils/threadpool/pthreadpool-cpp.cc
-    utils/threadpool/thread_pool_guard.cpp)
-  if(USE_INTERNAL_PTHREADPOOL_IMPL)
-    list(APPEND Caffe2_CPU_SRCS
-      utils/threadpool/pthreadpool.cc
-      utils/threadpool/pthreadpool_impl.cc)
-  endif()
+    utils/threadpool/thread_pool_guard.cpp
+  )
 endif()
 
-set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS}
-  utils/math/broadcast.cu
-  utils/math/elementwise.cu
-  utils/math/reduce.cu
-  utils/math/transpose.cu
-  utils/math_gpu.cu
-  )
-
-set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS}
-  utils/math/hip/broadcast.hip
-  utils/math/hip/elementwise.hip
-  utils/math/hip/reduce.hip
-  utils/math/hip/transpose.hip
-  utils/hip/math_gpu.hip
-  )
-
-set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
-  utils/fixed_divisor_test.cc
-  utils/math_test.cc
-  utils/fatal_signal_asan_no_sig_test.cc
-  utils/simple_queue_test.cc
-  utils/proto_utils_test.cc
-  utils/smart_tensor_printer_test.cc
-  utils/cast_test.cc
-  )
-
-if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
-  set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS}
-    utils/cpuid_test.cc
-  )
+if(NOT INTERN_BUILD_MOBILE)
+  list(APPEND Caffe2_CPU_SRCS
+    utils/proto_wrap.cc
+  )
 endif()
-
-set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS}
-  utils/math_gpu_test.cc
-  )
-
-set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS}
-  utils/hip/math_gpu_test.cc
-  utils/hip/math_blas_gpu_test.cc
-  )
-
-# TODO Once all source files are defined inside the local c10_utils_xxx targets,
-# it should be the job of the parent CMakeLists.txt to decide what to do with the target (i.e. link it to caffe2)
-# instead of us locally adding it to Caffe2_xxx variables.
 set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
-set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
-set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
-set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
-set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
-set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 73dba4061dcedf..177bd31b9c973d 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1685,9 +1685,6 @@ if(NOT INTERN_BUILD_MOBILE)
   if(MKLDNN_FOUND)
     set(AT_MKLDNN_ENABLED 1)
     include_directories(AFTER SYSTEM ${MKLDNN_INCLUDE_DIR})
-    if(BUILD_CAFFE2_OPS)
-      list(APPEND Caffe2_DEPENDENCY_LIBS caffe2::mkldnn)
-    endif(BUILD_CAFFE2_OPS)
   else()
     message(WARNING "MKLDNN could not be found.")
     caffe2_update_option(USE_MKLDNN OFF)
diff --git a/cmake/Summary.cmake b/cmake/Summary.cmake
index 72c1243c24ea97..09af98d0bc0666 100644
--- a/cmake/Summary.cmake
+++ b/cmake/Summary.cmake
@@ -23,8 +23,6 @@ function(caffe2_print_configuration_summary)
   message(STATUS "")
   message(STATUS "  TORCH_VERSION         : ${TORCH_VERSION}")
-  message(STATUS "  BUILD_CAFFE2          : ${BUILD_CAFFE2}")
-  message(STATUS "  BUILD_CAFFE2_OPS      : ${BUILD_CAFFE2_OPS}")
   message(STATUS "  BUILD_STATIC_RUNTIME_BENCHMARK: ${BUILD_STATIC_RUNTIME_BENCHMARK}")
   message(STATUS "  BUILD_BINARY          : ${BUILD_BINARY}")
   message(STATUS "  BUILD_CUSTOM_PROTOBUF : ${BUILD_CUSTOM_PROTOBUF}")
diff --git a/cmake/TorchConfig.cmake.in b/cmake/TorchConfig.cmake.in
index 6d518a14896266..02e313285297be 100644
--- a/cmake/TorchConfig.cmake.in
+++ b/cmake/TorchConfig.cmake.in
@@ -80,9 +80,6 @@ else()
   # shared library.
   # TODO: this list might be incomplete.
   append_torchlib_if_found(c10)
-  if(@BUILD_CAFFE2@)
-    append_torchlib_if_found(Caffe2_perfkernels_avx512 Caffe2_perfkernels_avx2 Caffe2_perfkernels_avx)
-  endif()
 
   if(@USE_NNPACK@)
     append_torchlib_if_found(nnpack)
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
deleted file mode 100644
index 598cac60bdbadf..00000000000000
--- a/modules/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-project(modules CXX C)
-add_subdirectory(detectron)
-add_subdirectory(module_test)
-add_subdirectory(observers)
-
-# Finally, set Caffe2_MODULES to parent scope.
-set(Caffe2_MODULES ${Caffe2_MODULES} PARENT_SCOPE)
diff --git a/modules/detectron/CMakeLists.txt b/modules/detectron/CMakeLists.txt
deleted file mode 100644
index 7c9a2d7ff4f4a3..00000000000000
--- a/modules/detectron/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-file(GLOB Detectron_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cc)
-file(GLOB Detectron_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.cu)
-file(GLOB_RECURSE Detectron_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/*.hip)
-
-if(BUILD_CAFFE2_OPS)
-  # Note(ilijar): Since Detectron ops currently have no
-  # CPU implementation, we only build GPU ops for now.
-  if(USE_CUDA)
-    add_library(
-        caffe2_detectron_ops_gpu SHARED
-        ${Detectron_CPU_SRCS}
-        ${Detectron_GPU_SRCS})
-
-    target_link_libraries(caffe2_detectron_ops_gpu PRIVATE torch)
-    if(USE_OPENMP)
-      target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::openmp)
-    endif()
-
-    if(USE_MKLDNN)
-      target_link_libraries(caffe2_detectron_ops_gpu PRIVATE caffe2::mkldnn)
-    endif()
-    install(TARGETS caffe2_detectron_ops_gpu DESTINATION lib)
-    if(MSVC)
-      install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops_gpu> DESTINATION lib OPTIONAL)
-    endif()
-  elseif(USE_ROCM)
-    hip_include_directories(${Caffe2_HIP_INCLUDES})
-    set_source_files_properties(${Detectron_HIP_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-    HIP_ADD_LIBRARY(
-        caffe2_detectron_ops_hip SHARED
-        ${Detectron_CPU_SRCS}
-        ${Detectron_HIP_SRCS})
-    target_compile_options(caffe2_detectron_ops_hip PRIVATE ${HIP_CXX_FLAGS})
-    if(USE_MKLDNN)
-      target_link_libraries(caffe2_detectron_ops_hip PRIVATE caffe2::mkldnn)
-    endif()
-    target_link_libraries(caffe2_detectron_ops_hip PRIVATE torch)
-    install(TARGETS caffe2_detectron_ops_hip DESTINATION lib)
-  elseif(NOT IOS_PLATFORM)
-    add_library(caffe2_detectron_ops SHARED ${Detectron_CPU_SRCS})
-    if(HAVE_SOVERSION)
-      set_target_properties(caffe2_detectron_ops PROPERTIES
-          VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
-    endif()
-    target_link_libraries(caffe2_detectron_ops PRIVATE torch)
-    if(USE_OPENMP)
-      target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::openmp)
-    endif()
-    if(USE_MKLDNN)
-      target_link_libraries(caffe2_detectron_ops PRIVATE caffe2::mkldnn)
-    endif()
-    install(TARGETS caffe2_detectron_ops DESTINATION lib)
-    if(MSVC)
-      install(FILES $<TARGET_PDB_FILE:caffe2_detectron_ops> DESTINATION lib OPTIONAL)
-    endif()
-  endif()
-endif()
diff --git a/modules/detectron/group_spatial_softmax_op.cc b/modules/detectron/group_spatial_softmax_op.cc
deleted file mode 100644
index 8b1fc052ef39b4..00000000000000
--- a/modules/detectron/group_spatial_softmax_op.cc
+++ /dev/null
@@ -1,83 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "modules/detectron/group_spatial_softmax_op.h"
-
-#include "caffe2/operators/softmax_utils.h"
-
-namespace caffe2 {
-
-REGISTER_CPU_OPERATOR(
-    GroupSpatialSoftmax,
-    GroupSpatialSoftmaxOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(
-    GroupSpatialSoftmaxGradient,
-    GroupSpatialSoftmaxGradientOp<float, CPUContext>);
-
-OPERATOR_SCHEMA(GroupSpatialSoftmax)
-    .NumInputs(1)
-    .NumOutputs(1)
-    .SetDoc(R"DOC(
-RetinaNet specific form of spatial softmax.
-
-The input is assumed to be unnormalized scores (sometimes called 'logits')
-arranged in a 4D tensor with shape (N, C, H, W), where N is the number of
-elements in the batch, H and W are the height and width, and C = num_anchors *
-num_classes defines num_anchors 'groups' of softmax inputs, each of length
-num_classes. The softmax is applied to each group independently.
-
-See: https://arxiv.org/abs/1708.02002 for details.
-)DOC") - .Arg( - "num_classes", - "(int) default 81; number of classes in each softmax group.") - .Input( - 0, - "scores", - "4D tensor of softmax inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors " - "groups of contiguous num_classes softmax inputs.") - .Output( - 0, - "probabilities", - "4D tensor of softmax probabilities with shape (N, C, H, W), where " - "C = num_anchors * num_classes, and softmax was applied to each of the " - "num_anchors groups; within a group the num_classes values sum to 1."); - -OPERATOR_SCHEMA(GroupSpatialSoftmaxGradient) - .NumInputs(2) - .NumOutputs(1) - .Input(0, "scores", "See GroupSpatialSoftmax") - .Input( - 1, - "d_probabilities", - "Gradient of forward output 0 (probabilities).") - .Output(0, "d_scores", "Gradient of forward input 0 (scores)."); - -class GetGroupSpatialSoftmaxGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "GroupSpatialSoftmaxGradient", - "", - vector{O(0), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(GroupSpatialSoftmax, GetGroupSpatialSoftmaxGradient); - -} // namespace caffe2 diff --git a/modules/detectron/group_spatial_softmax_op.cu b/modules/detectron/group_spatial_softmax_op.cu deleted file mode 100644 index 741da27f59d2b4..00000000000000 --- a/modules/detectron/group_spatial_softmax_op.cu +++ /dev/null @@ -1,181 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include <cfloat>
-
-#include "caffe2/core/context_gpu.h"
-#include "modules/detectron/group_spatial_softmax_op.h"
-
-namespace caffe2 {
-
-namespace {
-
-__global__ void GroupSpatialSoftmaxKernel(const int num, const int A, const int W,
-    const int H, const float* Xdata, float* Pdata, const int num_classes) {
-  // Loop through labels (N x A x H x W)
-  CUDA_1D_KERNEL_LOOP(index, num * A * H * W) {
-    int D = num_classes * A;
-    int x = index % W;
-    int y = (index / W) % H;
-    int a = (index / (W * H)) % A;
-    int i = index / W / H / A;
-
-    // Subtract max on each cell for numerical reasons
-    float max_val = -FLT_MAX;
-    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
-      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
-      max_val = max(max_val, Xdata[idx]);
-    }
-    // Exponentiate
-    float expsum = 0.0f;
-    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
-      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
-      float expx = exp(Xdata[idx] - max_val);
-      Pdata[idx] = expx;
-      expsum += expx;
-    }
-
-    // Normalize
-    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
-      int idx = i * (H * W * D) + c * (H * W) + y * W + x;
-      Pdata[idx] /= expsum;
-    }
-
-  }
-}
-
-__global__ void SumProbsKernel(const int N, const int A, const int W,
-    const int H, const float* Ydata, const float* dYdata,
-    float* sum_probs_data, const int num_classes) {
-  CUDA_1D_KERNEL_LOOP(i, N * A * W * H) {
-    int D = num_classes * A;
-    int x = i % W;
-    int y = (i / W) % H;
-    int a = (i / (W * H)) % A;
-    int n = i / (W * H * A);
-
-    sum_probs_data[i] = 0.0;
-    for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) {
-      int idx = n * (H * W * D) + c * (H * W) + y * W + x;
-      sum_probs_data[i] += (Ydata[idx] * dYdata[idx]);
-    }
-  }
-}
-
-__global__ void SubSumKernel(
-    const int N, const int A, const int W, const int H,
-    const float* sum_probs_data, float* dXdata, const int num_classes) {
-  CUDA_1D_KERNEL_LOOP(i, N * (A * num_classes) * W * H) {
-    int D = num_classes * A;
-    int x = i % W;
-    int y = (i / W) % H;
-    int a = ((i / (W * H)) % D) / num_classes;
-    int n = i / W / H / D;
-    int idx = n * (H * W * A) + a * (H * W) + y * W + x;
-    dXdata[i] = (dXdata[i] - sum_probs_data[idx]);
-  }
-}
-
-} // namespace
-
-
-template <>
-bool GroupSpatialSoftmaxOp<float, CUDAContext>::RunOnDevice() {
-  auto& X = Input(0); // Logits
-
-  int N = X.dim32(0);
-  int D = X.dim32(1);
-  int H = X.dim32(2);
-  int W = X.dim32(3);
-  int A = D / num_classes_;
-
-  auto* P = Output(0, X.sizes(), at::dtype<float>()); // Probabilities from softmax
-  TORCH_DCHECK_EQ(X.ndim(), 4);
-
-  const float* Xdata = X.data<float>();
-  float* Pdata = P->mutable_data<float>();
-
-  // Softmax for each x,y location
-  GroupSpatialSoftmaxKernel<<<CAFFE_GET_BLOCKS(N * A * H * W),
-      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
-      N, A, W, H, Xdata, Pdata, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-  return true;
-}
-
-
-template<>
-bool GroupSpatialSoftmaxGradientOp<float, CUDAContext>::RunOnDevice() {
-  auto& Y = Input(0); // Probabilities from softmax
-  auto& dY = Input(1);
-
-
-  TORCH_DCHECK_EQ(Y.ndim(), 4);
-
-  int N = Y.dim32(0);
-  int D = Y.dim32(1);
-  int H = Y.dim32(2);
-  int W = Y.dim32(3);
-  int A = D / num_classes_;
-
-  auto* dX = Output(0, Y.sizes(), at::dtype<float>());
-
-  if (sum_probs_.size() != N * A * H * W) {
-    ReinitializeTensor(&sum_probs_, {N * A * H * W}, at::dtype<float>().device(CUDA));
-  }
-
-  const float* Ydata = Y.data<float>();
-  const float* dYdata = dY.data<float>();
-  float* dXdata = dX->mutable_data<float>();
-
-  float* sum_probs_data = sum_probs_.mutable_data<float>();
-  math::Set<float, CUDAContext>(
-      sum_probs_.size(), 0.0f, sum_probs_data, &context_);
-
-  // Complete math:
-  //   J_ij = h_i (delta_ij - h_j)
-  //   d x_i = sum_j d h_ij = sum_j J_ij * dy_j
-  //        = sum_j h_i (delta_ij - h_j) * dy_j
-  //        = h_i dy_i - (sum_j h_i h_j dy_j)
-  //        = h_i dy_i - h_i sum_j h_j dy_j
-
-  // Step 0: dx = dy
-  context_.Copy<float, CUDAContext, CUDAContext>(Y.size(), dYdata, dXdata);
-
-  // Step 1: s = Sum(dY[j] * Y[j])
-  SumProbsKernel<<<CAFFE_GET_BLOCKS(N * A * H * W),
-      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
-      N, A, W, H, Ydata, dYdata, sum_probs_data, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-  // Step 2: dX[i] = dX[i] - s
-  SubSumKernel<<<CAFFE_GET_BLOCKS(Y.size()),
-      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
-      N, A, W, H, sum_probs_.data<float>(), dXdata, num_classes_);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-  // Step 3: dX[i] = Y[i] * dX[i]
-  math::Mul<float, CUDAContext>(Y.size(), dXdata, Ydata, dXdata, &context_);
-
-  return true;
-}
-
-
-REGISTER_CUDA_OPERATOR(GroupSpatialSoftmax,
-                       GroupSpatialSoftmaxOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(GroupSpatialSoftmaxGradient,
-                       GroupSpatialSoftmaxGradientOp<float, CUDAContext>);
-} // namespace caffe2
diff --git a/modules/detectron/group_spatial_softmax_op.h b/modules/detectron/group_spatial_softmax_op.h
deleted file mode 100644
index b235a47146b581..00000000000000
--- a/modules/detectron/group_spatial_softmax_op.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef GROUP_SPATIAL_SOFTMAX_OP_H_
-#define GROUP_SPATIAL_SOFTMAX_OP_H_
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/utils/math.h"
-
-namespace caffe2 {
-
-template <typename T, class Context>
-class GroupSpatialSoftmaxOp final : public Operator<Context> {
- public:
-  GroupSpatialSoftmaxOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<Context>(operator_def, ws),
-        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
-        order_(StringToStorageOrder(
-            this->template GetSingleArgument<string>("order", "NCHW"))) {
-    CAFFE_ENFORCE_EQ(
-        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
-  }
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-
-  bool RunOnDevice() override {
-    // No CPU implementation for now
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
- protected:
-  int num_classes_;
-  StorageOrder order_;
-};
-
-template <typename T, class Context>
-class GroupSpatialSoftmaxGradientOp final : public Operator<Context> {
- public:
-  GroupSpatialSoftmaxGradientOp(const OperatorDef& def, Workspace* ws)
-      : Operator<Context>(def, ws),
-        num_classes_(this->template GetSingleArgument<int>("num_classes", 81)),
-        order_(StringToStorageOrder(
-            this->template GetSingleArgument<string>("order", "NCHW"))) {
-    CAFFE_ENFORCE_EQ(
-        order_, StorageOrder::NCHW, "Only NCHW order is supported right now.");
-  }
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-
-  bool RunOnDevice() override {
-    // No CPU implementation for now
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
- protected:
-  int num_classes_;
-  StorageOrder order_;
-  Tensor sum_probs_;
-};
-
-} // namespace caffe2
-
-#endif // GROUP_SPATIAL_SOFTMAX_OP_H_
diff --git a/modules/detectron/ps_roi_pool_op.cc b/modules/detectron/ps_roi_pool_op.cc
deleted file mode 100644
index c57b0fc23678ba..00000000000000
--- a/modules/detectron/ps_roi_pool_op.cc
+++ /dev/null
@@ -1,106 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ps_roi_pool_op.h"
-
-namespace caffe2 {
-
-REGISTER_CPU_OPERATOR(PSRoIPool, PSRoIPoolOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(
-    PSRoIPoolGradient,
-    PSRoIPoolGradientOp<float, CPUContext>);
-
-OPERATOR_SCHEMA(PSRoIPool)
-    .NumInputs(2)
-    .NumOutputs(2)
-    .SetDoc(R"DOC(
-Position Sensitive Region of Interest Pooling as used in R-FCN.
-)DOC")
-    .Arg(
-        "spatial_scale",
-        "(float) default 1.0; Spatial scale of the input feature map X "
-        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
-        "w.r.t. the input image.")
-    .Arg(
-        "group_size",
-        "(int) default 1; pooled_h = pooled_w = group_size where pooled_{h,w} "
-        "is the pooled output Y's height and width, respectively.")
-    .Arg(
-        "output_dim",
-        "(int) default 1; number of channels in the pooled output, which might "
-        "be the number of classes is used for classification or 4 if used for "
-        "class agnostic bounding box regression.")
-    .Input(
-        0,
-        "X",
-        "4D position sensitive feature map input of shape (N, C, H, W), where "
-        "C = group_size**2 * output_dim.")
-    .Input(
-        1,
-        "RoIs",
-        "2D input of shape (R, 5) specifying R RoIs with five columns "
-        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
-        "coordinates are in the coordinate system of the input image.")
-    .Output(
-        0,
-        "Y",
-        "4D output of shape (R, output_dim, pooled_h, pooled_w). The r-th "
-        "batch element is a pooled feature map cooresponding to the r-th RoI.")
-    .Output(
-        1,
-        "argmaxes",
-        "4D output of shape (R, output_dim, pooled_h, pooled_w). Same as Y, "
-        "except it records the argmax indices rather than the max pooled "
-        "values.");
-
-OPERATOR_SCHEMA(PSRoIPoolGradient)
-    .NumInputs(4)
-    .NumOutputs(1)
-    .Input(
-        0,
-        "X",
-        "See PSRoIPool.")
-    .Input(
-        1,
-        "RoIs",
-        "See PSRoIPool.")
-    .Input(
-        2,
-        "argmaxes",
-        "See PSRoIPool.")
-    .Input(
-        3,
-        "dY",
-        "Gradient of forward output 0 (Y)")
-    .Output(
-        0,
-        "dX",
-        "Gradient of forward input 0 (X)");
-
-class GetPSRoIPoolGradient : public GradientMakerBase {
-  using GradientMakerBase::GradientMakerBase;
-  vector<OperatorDef> GetGradientDefs() override {
-    return SingleGradientDef(
-        "PSRoIPoolGradient",
-        "",
-        vector<string>{I(0), I(1), O(1), GO(0)},
-        vector<string>{GI(0)});
-  }
-};
-
-REGISTER_GRADIENT(PSRoIPool, GetPSRoIPoolGradient);
-
-} // namespace caffe2
diff --git a/modules/detectron/ps_roi_pool_op.cu b/modules/detectron/ps_roi_pool_op.cu
deleted file mode 100644
index 68e4ec377d6227..00000000000000
--- a/modules/detectron/ps_roi_pool_op.cu
+++ /dev/null
@@ -1,289 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Based on https://github.com/daijifeng001/caffe-rfcn/blob/r-fcn/src/caffe/layers/psroi_pooling_layer.cu
-//
-// ------------------------------------------------------------------
-// R-FCN
-// Copyright (c) 2016 Microsoft
-// Licensed under The MIT License [see r-fcn/LICENSE for details]
-// Written by Yi Li
-// ------------------------------------------------------------------
-//
-// COPYRIGHT
-//
-// All contributions by the University of California:
-// Copyright (c) 2014, 2015, The Regents of the University of California
-// (Regents)
-// All rights reserved.
-//
-// All other contributions:
-// Copyright (c) 2014, 2015, the respective contributors
-// All rights reserved.
-//
-// Caffe uses a shared copyright model: each contributor holds copyright over
-// their contributions to Caffe. The project versioning records all such
-// contribution and copyright details. If a contributor wants to further mark
-// their specific copyright on a particular contribution, they should indicate
-// their copyright solely in the commit message of the change when it is
-// committed.
-//
-// LICENSE
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-// 2. Redistributions in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// CONTRIBUTION AGREEMENT
-//
-// By contributing to the BVLC/caffe repository through pull-request, comment,
-// or otherwise, the contributor releases their content to the
-// license and copyright terms herein.
-
-#include <cfloat>
-
-#include "caffe2/core/context_gpu.h"
-#include "modules/detectron/ps_roi_pool_op.h"
-
-namespace caffe2 {
-
-namespace {
-
-template <typename T>
-inline __device__ T gpu_atomic_add(const T val, T* address);
-
-template <>
-inline __device__
-float gpu_atomic_add<float>(const float val, float* address) {
-  return atomicAdd(address, val);
-}
-
-template <typename T>
-__global__ void PSRoIPoolForward(
-    const int nthreads,
-    const T* bottom_data,
-    const T spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    const T* bottom_rois,
-    const int output_dim,
-    const int group_size,
-    T* top_data,
-    int* mapping_channel) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // The output is in order (n, ctop, ph, pw)
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int ctop = (index / pooled_width / pooled_height) % output_dim;
-    int n = index / pooled_width / pooled_height / output_dim;
-
-    // [start, end) interval for spatial sampling
-    const T* offset_bottom_rois = bottom_rois + n * 5;
-    int roi_batch_ind = offset_bottom_rois[0];
-    T roi_start_w = static_cast<T>(
-        roundf(offset_bottom_rois[1])) * spatial_scale;
-    T roi_start_h = static_cast<T>(
-        roundf(offset_bottom_rois[2])) * spatial_scale;
-    T roi_end_w = static_cast<T>(
-        roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
-    T roi_end_h = static_cast<T>(
-        roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
-
-    // Force too small ROIs to be 1x1
-    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); // avoid 0
-    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
-
-    // Compute w and h at bottom
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    // Add roi offsets and clip to input boundaries
-    int hstart = floor(
-        static_cast<T>(ph) * bin_size_h + roi_start_h);
-    int wstart = floor(
-        static_cast<T>(pw)* bin_size_w + roi_start_w);
-    int hend = ceil(
-        static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
-    int wend = ceil(
-        static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
-
-    hstart = min(max(hstart, 0), height);
-    hend = min(max(hend, 0), height);
-    wstart = min(max(wstart, 0), width);
-    wend = min(max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    int gw = pw;
-    int gh = ph;
-    int c = (ctop * group_size + gh) * group_size + gw;
-
-    const T* offset_bottom_data =
-        bottom_data + (roi_batch_ind * channels + c) * height * width;
-    T out_sum = 0;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int bottom_index = h * width + w;
-        out_sum += offset_bottom_data[bottom_index];
-      }
-    }
-
-    T bin_area = (hend - hstart) * (wend - wstart);
-    top_data[index] = is_empty ? 0. : out_sum / bin_area;
-    mapping_channel[index] = c;
-  }
-}
-
-template <typename T>
-__global__ void PSRoIPoolBackward(
-    const int nthreads,
-    const T* top_diff,
-    const int* mapping_channel,
-    const int num_rois,
-    const T spatial_scale,
-    const int channels,
-    const int height,
-    const int width,
-    const int pooled_height,
-    const int pooled_width,
-    const int output_dim,
-    T* bottom_diff,
-    const T* bottom_rois) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // The output is in order (n, ctop, ph, pw)
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int n = index / pooled_width / pooled_height / output_dim;
-
-    // [start, end) interval for spatial sampling
-    const T* offset_bottom_rois = bottom_rois + n * 5;
-    int roi_batch_ind = offset_bottom_rois[0];
-    T roi_start_w = static_cast<T>(
-        roundf(offset_bottom_rois[1])) * spatial_scale;
-    T roi_start_h = static_cast<T>(
-        roundf(offset_bottom_rois[2])) * spatial_scale;
-    T roi_end_w = static_cast<T>(
-        roundf(offset_bottom_rois[3]) + 1.) * spatial_scale;
-    T roi_end_h = static_cast<T>(
-        roundf(offset_bottom_rois[4]) + 1.) * spatial_scale;
-
-    // Force too small ROIs to be 1x1
-    T roi_width = c10::cuda::compat::max(roi_end_w - roi_start_w, static_cast<T>(0.1)); //avoid 0
-    T roi_height = c10::cuda::compat::max(roi_end_h - roi_start_h, static_cast<T>(0.1));
-
-    // Compute w and h at bottom
-    T bin_size_h = roi_height / static_cast<T>(pooled_height);
-    T bin_size_w = roi_width / static_cast<T>(pooled_width);
-
-    int hstart = floor(
-        static_cast<T>(ph)* bin_size_h + roi_start_h);
-    int wstart = floor(
-        static_cast<T>(pw)* bin_size_w + roi_start_w);
-    int hend = ceil(
-        static_cast<T>(ph + 1) * bin_size_h + roi_start_h);
-    int wend = ceil(
-        static_cast<T>(pw + 1) * bin_size_w + roi_start_w);
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart, 0), height);
-    hend = min(max(hend, 0), height);
-    wstart = min(max(wstart, 0), width);
-    wend = min(max(wend, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    // Compute c at bottom
-    int c = mapping_channel[index];
-    T* offset_bottom_diff =
-        bottom_diff + (roi_batch_ind * channels + c) * height * width;
-    T bin_area = (hend - hstart) * (wend - wstart);
-    T diff_val = is_empty ? 0. : top_diff[index] / bin_area;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int bottom_index = h * width + w;
-        gpu_atomic_add(diff_val, offset_bottom_diff + bottom_index);
-      }
-    }
-  }
-}
-
-} // namespace
-
-template<>
-bool PSRoIPoolOp<float, CUDAContext>::RunOnDevice() {
-  auto& X = Input(0); // Input data to pool
-  auto& R = Input(1); // RoIs
-
-  auto* Y = Output(0, {R.dim32(0), output_dim_, pooled_height_, pooled_width_}, at::dtype<float>()); // PSRoI pooled data
-  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // mapping_channel
-  int output_size = Y->numel();
-  PSRoIPoolForward<float><<<CAFFE_GET_BLOCKS(output_size),
-      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
-      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
-      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(), output_dim_,
-      group_size_, Y->mutable_data<float>(), A->mutable_data<int>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-  return true;
-}
-
-
-template<>
-bool PSRoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
-  auto& X = Input(0); // Input data to pool
-  auto& R = Input(1); // RoIs
-  auto& A = Input(2); // mapping channels
-  auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
-                       // (aka "gradOutput")
-
-  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
input to "forward" op - // (aka "gradInput") - // Must zero-out dX before accumulating gradients - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - PSRoIPoolBackward<<>>( - dY.size(), dY.data(), A.data(), R.dim32(0), spatial_scale_, - X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_, - output_dim_, dX->mutable_data(), R.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - return true; -} - - -REGISTER_CUDA_OPERATOR(PSRoIPool, - PSRoIPoolOp); -REGISTER_CUDA_OPERATOR(PSRoIPoolGradient, - PSRoIPoolGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/ps_roi_pool_op.h b/modules/detectron/ps_roi_pool_op.h deleted file mode 100644 index ecee1dd7041c4e..00000000000000 --- a/modules/detectron/ps_roi_pool_op.h +++ /dev/null @@ -1,93 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef PS_ROI_POOL_OP_H_ -#define PS_ROI_POOL_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class PSRoIPoolOp final : public Operator { - public: - PSRoIPoolOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - group_size_(this->template GetSingleArgument("group_size", 1)), - output_dim_(this->template GetSingleArgument("output_dim", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(group_size_, 0); - pooled_height_ = group_size_; - pooled_width_ = group_size_; - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int group_size_; - int output_dim_; - int pooled_height_; - int pooled_width_; - int channels_; - int height_; - int width_; - }; - -template -class PSRoIPoolGradientOp final : public Operator { - public: - PSRoIPoolGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - spatial_scale_(this->template GetSingleArgument( - "spatial_scale", 1.)), - group_size_(this->template GetSingleArgument("group_size", 1)), - output_dim_(this->template GetSingleArgument("output_dim", 1)) { - TORCH_DCHECK_GT(spatial_scale_, 0); - TORCH_DCHECK_GT(group_size_, 0); - pooled_height_ = group_size_; - pooled_width_ = group_size_; - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float spatial_scale_; - int group_size_; - int output_dim_; - int pooled_height_; - int pooled_width_; - int channels_; - int height_; - int width_; -}; - -} // namespace caffe2 - -#endif // PS_ROI_POOL_OP_H_ diff --git a/modules/detectron/roi_pool_f_op.cc b/modules/detectron/roi_pool_f_op.cc deleted file mode 100644 index 81bf8bb62ed0aa..00000000000000 --- a/modules/detectron/roi_pool_f_op.cc +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Copyright (c) 2016-present, 
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "roi_pool_f_op.h"
-
-namespace caffe2 {
-
-REGISTER_CPU_OPERATOR(RoIPoolF, RoIPoolFOp<float, CPUContext>);
-REGISTER_CPU_OPERATOR(RoIPoolFGradient, RoIPoolFGradientOp<float, CPUContext>);
-
-OPERATOR_SCHEMA(RoIPoolF)
-    .NumInputs(2)
-    .NumOutputs(2)
-    .SetDoc(R"DOC(
-Region of Interest (RoI) pooling operation as used in Fast R-CNN.
-)DOC")
-    .Arg(
-        "spatial_scale",
-        "(float) default 1.0; Spatial scale of the input feature map X "
-        "relative to the input image. E.g., 0.0625 if X has a stride of 16 "
-        "w.r.t. the input image.")
-    .Arg(
-        "pooled_h",
-        "(int) default 1; Pooled output Y's height.")
-    .Arg(
-        "pooled_w",
-        "(int) default 1; Pooled output Y's width.")
-    .Input(
-        0,
-        "X",
-        "4D feature map input of shape (N, C, H, W).")
-    .Input(
-        1,
-        "RoIs",
-        "2D input of shape (R, 5) specifying R RoIs with five columns "
-        "representing: batch index in [0, N - 1], x1, y1, x2, y2. The RoI "
-        "coordinates are in the coordinate system of the input image.")
-    .Output(
-        0,
-        "Y",
-        "4D output of shape (R, C, pooled_h, pooled_w). The r-th batch element "
-        "is a pooled feature map cooresponding to the r-th RoI.")
-    .Output(
-        1,
-        "argmaxes",
-        "4D output of shape (R, C, pooled_h, pooled_w). Same as Y, except it "
-        "records the argmax indices rather than the max pooled values.");
-
-OPERATOR_SCHEMA(RoIPoolFGradient)
-    .NumInputs(4)
-    .NumOutputs(1)
-    .Input(
-        0,
-        "X",
-        "See RoIPoolF.")
-    .Input(
-        1,
-        "RoIs",
-        "See RoIPoolF.")
-    .Input(
-        2,
-        "argmaxes",
-        "See RoIPoolF.")
-    .Input(
-        3,
-        "dY",
-        "Gradient of forward output 0 (Y)")
-    .Output(
-        0,
-        "dX",
-        "Gradient of forward input 0 (X)");
-
-class GetRoIPoolFGradient : public GradientMakerBase {
-  using GradientMakerBase::GradientMakerBase;
-  vector<OperatorDef> GetGradientDefs() override {
-    return SingleGradientDef(
-        "RoIPoolFGradient",
-        "",
-        vector<string>{I(0), I(1), O(1), GO(0)},
-        vector<string>{GI(0)});
-  }
-};
-
-REGISTER_GRADIENT(RoIPoolF, GetRoIPoolFGradient);
-
-} // namespace caffe2
diff --git a/modules/detectron/roi_pool_f_op.cu b/modules/detectron/roi_pool_f_op.cu
deleted file mode 100644
index b261911b95a16a..00000000000000
--- a/modules/detectron/roi_pool_f_op.cu
+++ /dev/null
@@ -1,187 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cfloat>
-
-#include "caffe2/core/context_gpu.h"
-#include "modules/detectron/roi_pool_f_op.h"
-
-namespace caffe2 {
-
-namespace {
-
-template <typename T>
-inline __device__ T gpu_atomic_add(const T val, T* address);
-
-template <>
-inline __device__
-float gpu_atomic_add<float>(const float val, float* address) {
-  return atomicAdd(address, val);
-}
-
-template <typename T>
-__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
-    const T spatial_scale, const int channels, const int height,
-    const int width, const int pooled_height, const int pooled_width,
-    const T* bottom_rois, T* top_data, int* argmax_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
-
-    const T* offset_bottom_rois = bottom_rois + n * 5;
-    int roi_batch_ind = offset_bottom_rois[0];
-    int roi_start_w = roundf(offset_bottom_rois[1] * spatial_scale);
-    int roi_start_h = roundf(offset_bottom_rois[2] * spatial_scale);
-    int roi_end_w = roundf(offset_bottom_rois[3] * spatial_scale);
-    int roi_end_h = roundf(offset_bottom_rois[4] * spatial_scale);
-
-    // Force malformed ROIs to be 1x1
-    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
-    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-    T bin_size_h = static_cast<T>(roi_height)
-                   / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width)
-                   / static_cast<T>(pooled_width);
-
-    int hstart = static_cast<int>(floor(static_cast<T>(ph)
-                                        * bin_size_h));
-    int wstart = static_cast<int>(floor(static_cast<T>(pw)
-                                        * bin_size_w));
-    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
-                                     * bin_size_h));
-    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
-                                     * bin_size_w));
-
-    // Add roi offsets and clip to input boundaries
-    hstart = min(max(hstart + roi_start_h, 0), height);
-    hend = min(max(hend + roi_start_h, 0), height);
-    wstart = min(max(wstart + roi_start_w, 0), width);
-    wend = min(max(wend + roi_start_w, 0), width);
-    bool is_empty = (hend <= hstart) || (wend <= wstart);
-
-    // Define an empty pooling region to be zero
-    T maxval = is_empty ? 0 : -FLT_MAX;
-    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
-    int maxidx = -1;
-    const T* offset_bottom_data =
-        bottom_data + (roi_batch_ind * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        int bottom_index = h * width + w;
-        if (offset_bottom_data[bottom_index] > maxval) {
-          maxval = offset_bottom_data[bottom_index];
-          maxidx = bottom_index;
-        }
-      }
-    }
-    top_data[index] = maxval;
-    argmax_data[index] = maxidx;
-  }
-}
-
-template <typename T>
-__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
-    const int* argmax_data, const int num_rois, const T spatial_scale,
-    const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, T* bottom_diff,
-    const T* bottom_rois) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
-
-    const T* offset_bottom_rois = bottom_rois + n * 5;
-    int roi_batch_ind = offset_bottom_rois[0];
-    int bottom_offset = (roi_batch_ind * channels + c) * height * width;
-    int top_offset = (n * channels + c) * pooled_height * pooled_width;
-    const T* offset_top_diff = top_diff + top_offset;
-    T* offset_bottom_diff = bottom_diff + bottom_offset;
-    const int* offset_argmax_data = argmax_data + top_offset;
-
-    int argmax = offset_argmax_data[ph * pooled_width + pw];
-    if (argmax != -1) {
-      gpu_atomic_add(
-          static_cast<T>(offset_top_diff[ph * pooled_width + pw]),
-          offset_bottom_diff + argmax);
-    }
-  }
-}
-
-} // namespace
-
-template<>
-bool RoIPoolFOp<float, CUDAContext>::RunOnDevice() {
-  auto& X = Input(0); // Input data to pool
-  auto& R = Input(1); // RoIs
-
-  if (R.size() == 0) {
-    // Handle empty rois
-    std::vector<int64_t> sizes = {0, X.dim32(1), pooled_height_, pooled_width_};
-    /* auto* Y = */ Output(0, sizes, at::dtype<float>());
-    /* auto* A = */ Output(1, sizes, at::dtype<int>());
-    return true;
-  }
-
-  auto* Y = Output(0, {R.dim32(0), X.dim32(1), pooled_height_, pooled_width_}, at::dtype<float>()); // RoI pooled data
-  auto* A = Output(1, Y->sizes(), at::dtype<int>()); // argmaxes
-  int output_size = Y->size();
-  RoIPoolFForward<float><<<CAFFE_GET_BLOCKS(output_size),
-      CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
-      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2),
-      X.dim32(3), pooled_height_, pooled_width_, R.data<float>(),
-      Y->mutable_data<float>(), A->mutable_data<int>());
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-  return true;
-}
-
-
-template<>
-bool RoIPoolFGradientOp<float, CUDAContext>::RunOnDevice() {
-  auto& X = Input(0); // Input data to pool
-  auto& R = Input(1); // RoIs
-  auto& A = Input(2); // argmaxes
-  auto& dY = Input(3); // Gradient of net w.r.t. output of "forward" op
-                       // (aka "gradOutput")
-
-  auto* dX = Output(0, X.sizes(), at::dtype<float>()); // Gradient of net w.r.t. input to "forward" op
-  // Must zero-out dX before accumulating gradients
-  math::Set<float, CUDAContext>(
-      dX->size(), 0.f, dX->mutable_data<float>(), &context_);
-  if (dY.size() > 0) {  // Handle possibly empty gradient if there were no rois
-    RoIPoolFBackward<float><<<CAFFE_GET_BLOCKS(dY.size()),
-                              CAFFE_CUDA_NUM_THREADS,
-                              0, context_.cuda_stream()>>>(
-        dY.size(), dY.data<float>(), A.data<int>(), R.dim32(0), spatial_scale_,
-        X.dim32(1), X.dim32(2), X.dim32(3), pooled_height_, pooled_width_,
-        dX->mutable_data<float>(), R.data<float>());
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
-  }
-  return true;
-}
-
-
-REGISTER_CUDA_OPERATOR(RoIPoolF,
-                       RoIPoolFOp<float, CUDAContext>);
-REGISTER_CUDA_OPERATOR(RoIPoolFGradient,
-                       RoIPoolFGradientOp<float, CUDAContext>);
-} // namespace caffe2
diff --git a/modules/detectron/roi_pool_f_op.h b/modules/detectron/roi_pool_f_op.h
deleted file mode 100644
index 604c5606a203e0..00000000000000
--- a/modules/detectron/roi_pool_f_op.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ROI_POOL_F_OP_H_
-#define ROI_POOL_F_OP_H_
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/utils/math.h"
-
-namespace caffe2 {
-
-template <typename T, class Context>
-class RoIPoolFOp final : public Operator<Context> {
- public:
-  RoIPoolFOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<Context>(operator_def, ws),
-        spatial_scale_(this->template GetSingleArgument<float>(
-              "spatial_scale", 1.)),
-        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
-        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
-    TORCH_DCHECK_GT(spatial_scale_, 0);
-    TORCH_DCHECK_GT(pooled_height_, 0);
-    TORCH_DCHECK_GT(pooled_width_, 0);
-  }
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-
-  bool RunOnDevice() override {
-    // No CPU implementation for now
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
- protected:
-  float spatial_scale_;
-  int pooled_height_;
-  int pooled_width_;
-};
-
-template <typename T, class Context>
-class RoIPoolFGradientOp final : public Operator<Context> {
- public:
-  RoIPoolFGradientOp(const OperatorDef& def, Workspace* ws)
-      : Operator<Context>(def, ws),
-        spatial_scale_(this->template GetSingleArgument<float>(
-              "spatial_scale", 1.)),
-        pooled_height_(this->template GetSingleArgument<int>("pooled_h", 1)),
-        pooled_width_(this->template GetSingleArgument<int>("pooled_w", 1)) {
-    TORCH_DCHECK_GT(spatial_scale_, 0);
-    TORCH_DCHECK_GT(pooled_height_, 0);
-    TORCH_DCHECK_GT(pooled_width_, 0);
-  }
-  USE_OPERATOR_CONTEXT_FUNCTIONS;
-
-  bool RunOnDevice() override {
-    // No CPU implementation for now
-    CAFFE_NOT_IMPLEMENTED;
-  }
-
- protected:
-  float spatial_scale_;
-  int pooled_height_;
-  int pooled_width_;
-};
-
-} // namespace caffe2
-
-#endif // ROI_POOL_F_OP_H_
diff --git a/modules/detectron/sample_as_op.cc b/modules/detectron/sample_as_op.cc
deleted file mode 100644
index d22cfb8194e60e..00000000000000
--- a/modules/detectron/sample_as_op.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
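For readers skimming the removal: the deleted RoIPoolF kernels implement plain max pooling over adaptively sized ROI bins, recording an argmax per output cell so the backward pass can scatter gradients with an atomic add. Below is a minimal host-side C++ sketch of that forward rule for a single ROI channel; `RoiMaxPool` is a hypothetical helper, not part of the deleted code, and it assumes ROI coordinates have already been multiplied by `spatial_scale` and rounded to integers, as the kernel does.

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>

// CPU sketch of the RoIPoolF forward rule for one H x W channel plane.
// (x1, y1, x2, y2) is the ROI in scaled, rounded feature-map coordinates.
void RoiMaxPool(const std::vector<float>& feat, int H, int W,
                int x1, int y1, int x2, int y2,
                int pooled_h, int pooled_w,
                std::vector<float>* out, std::vector<int>* argmax) {
  // Force malformed ROIs to be at least 1x1, as the kernel does.
  int roi_w = std::max(x2 - x1 + 1, 1);
  int roi_h = std::max(y2 - y1 + 1, 1);
  float bin_h = static_cast<float>(roi_h) / pooled_h;
  float bin_w = static_cast<float>(roi_w) / pooled_w;
  out->assign(pooled_h * pooled_w, 0.f);
  argmax->assign(pooled_h * pooled_w, -1);
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      // Bin boundaries, offset by the ROI origin and clipped to the plane.
      int hs = std::min(std::max(static_cast<int>(std::floor(ph * bin_h)) + y1, 0), H);
      int he = std::min(std::max(static_cast<int>(std::ceil((ph + 1) * bin_h)) + y1, 0), H);
      int ws = std::min(std::max(static_cast<int>(std::floor(pw * bin_w)) + x1, 0), W);
      int we = std::min(std::max(static_cast<int>(std::ceil((pw + 1) * bin_w)) + x1, 0), W);
      float maxval = (he <= hs || we <= ws) ? 0.f : -FLT_MAX;  // empty bins pool to 0
      int maxidx = -1;
      for (int h = hs; h < he; ++h) {
        for (int w = ws; w < we; ++w) {
          if (feat[h * W + w] > maxval) { maxval = feat[h * W + w]; maxidx = h * W + w; }
        }
      }
      (*out)[ph * pooled_w + pw] = maxval;
      (*argmax)[ph * pooled_w + pw] = maxidx;
    }
  }
}
```

Empty bins keep argmax -1, which is exactly what makes the deleted backward kernel a no-op for them.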
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sample_as_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SampleAs, SampleAsOp); -REGISTER_CPU_OPERATOR(SampleAsGradient, SampleAsGradientOp); - -OPERATOR_SCHEMA(SampleAs) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Select the batch elements from input tensor X where the corresponding input -label value is > 0. -)DOC") - .Input( - 0, - "X", - "Tensor of at least 1D shape (N, ...).") - .Input( - 1, - "labels", - "Tensor of type int with 1D shape (N, ).") - .Output( - 0, - "Y", - "Tensor with number of dims matching X, but with the length of dim 0 " - "equal to the number of non-zero elements in labels. The batch items " - "from X corresponding to the non-zero elements in labels are copied " - "into Y."); - -OPERATOR_SCHEMA(SampleAsGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "X", - "See SampleAs.") - .Input( - 1, - "labels", - "See SampleAs." - ) - .Input( - 2, - "dY", - "Gradient of forward output 0 (Y).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetSampleAsGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SampleAsGradient", - "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SampleAs, GetSampleAsGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sample_as_op.cu b/modules/detectron/sample_as_op.cu deleted file mode 100644 index a58604de2b0d03..00000000000000 --- a/modules/detectron/sample_as_op.cu +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* SampleAs by Kaiming He for Mask R-CNN -X.dim32(0) = L.dim32(0) -Y's output samples are the samples of X for which L > 0. -*/ -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sample_as_op.h" - -#include - -namespace caffe2 { - -template <> -bool SampleAsOp::RunOnDevice() { - auto& X = Input(0); // Input data to be sliced - auto& L = Input(1); // Target data that provide the identity - - CAFFE_ENFORCE( - X.dim32(0) == L.dim32(0), - "X.dim32(0) must be equal to L.dim32(0)", - "(", - X.dim32(0), - " vs. 
", - L.dim32(0), - ")"); - - // copy L to CPU: - std::vector labels(L.dim32(0)); - context_.CopyBytes( - L.dim32(0) * sizeof(int), L.data(), &labels[0]); - // Make sure that the copy is finished - context_.FinishDeviceComputation(); - - int count = 0; - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - count++; - } - } - assert(count > 0); - - // resize Y - vector out_shape(X.sizes().vec()); - out_shape[0] = count; - auto* Y = Output(0, out_shape, at::dtype()); // Sliced data (Y.dim32(0) = num of (L > 0)) - - const int len = X.size() / X.dim32(0); - - float* output = Y->mutable_data(); - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - context_.CopyBytes( - len * sizeof(float), X.data() + i * len, output); - output += len; - } // if - } // i - - return true; -} - -template <> -bool SampleAsGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& L = Input(1); - auto& dY = Input(2); - - - auto* dX = Output(0, X.sizes(), at::dtype()); - - // copy L to CPU: - std::vector labels(L.dim32(0)); - context_.CopyBytes( - L.dim32(0) * sizeof(int), L.data(), &labels[0]); - // Make sure that the copy is finished - context_.FinishDeviceComputation(); - - // zero-out dX - math::Set( - dX->size(), 0.f, dX->mutable_data(), &context_); - - const int len = X.size() / X.dim32(0); - - const float* input = dY.data(); - for (int i = 0; i < L.dim32(0); i++) { - if (labels[i] > 0) { - context_.CopyBytes( - len * sizeof(float), input, dX->mutable_data() + i * len); - input += len; - } // if - } // i - - return true; -} - -REGISTER_CUDA_OPERATOR(SampleAs, SampleAsOp); -REGISTER_CUDA_OPERATOR( - SampleAsGradient, - SampleAsGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sample_as_op.h b/modules/detectron/sample_as_op.h deleted file mode 100644 index 70d2214e1c8cf0..00000000000000 --- a/modules/detectron/sample_as_op.h +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SAMPLE_AS_OP_H_ -#define SAMPLE_AS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SampleAsOp final : public Operator { - public: - SampleAsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } -}; - -template -class SampleAsGradientOp final : public Operator { - public: - SampleAsGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } -}; - -} // namespace caffe2 - -#endif // SAMPLE_AS_OP_H_ diff --git a/modules/detectron/select_smooth_l1_loss_op.cc b/modules/detectron/select_smooth_l1_loss_op.cc deleted file mode 100644 index 7f1441032acf69..00000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "select_smooth_l1_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - SelectSmoothL1Loss, - SelectSmoothL1LossOp); -REGISTER_CPU_OPERATOR( - SelectSmoothL1LossGradient, - SelectSmoothL1LossGradientOp); - -OPERATOR_SCHEMA(SelectSmoothL1Loss) - .NumInputs(4) - .NumOutputs(1) - .SetDoc(R"DOC( -RetinaNet specific op for computing Smooth L1 Loss at select locations in a 4D -tensor that encodes bounding box regression predictions. -)DOC") - .Arg( - "beta", - "(float) default 1.0; L2 to L1 transition point.") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Input( - 0, - "Y_hat", - "4D tensor of bounding box regression predictions with shape " - "(N, 4 * num_bbox_classes * num_anchors, H, W).") - .Input( - 1, - "Y", - "2D tensor of labels shape (M, 4) for 4 contiguous channels starting " - "at each of the M locations selected by the locations input.") - .Input( - 2, - "locations", - "2D tensor of shape (M, 4) that identifies M 'select' locations " - "encoded by the four columns: (n, c, y, x). 
The loss is computed on the " - "four contiguous channel locations [c, c + 3] (inclusive).") - .Input( - 3, - "normalizer", - "Scalar; the loss is divided by max(1, normalizer).") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SelectSmoothL1LossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input( - 0, - "Y_hat", - "See SelectSmoothL1Loss.") - .Input( - 1, - "Y", - "See SelectSmoothL1Loss.") - .Input( - 2, - "locations", - "See SelectSmoothL1Loss.") - .Input( - 3, - "normalizer", - "See SelectSmoothL1Loss.") - .Input( - 4, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "d_Y_hat", - "Gradient of forward input 0 (Y_hat)."); - -class GetSelectSmoothL1LossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SelectSmoothL1LossGradient", - "", - vector{I(0), I(1), I(2), I(3), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SelectSmoothL1Loss, GetSelectSmoothL1LossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/select_smooth_l1_loss_op.cu b/modules/detectron/select_smooth_l1_loss_op.cu deleted file mode 100644 index 72f1d563b4c92f..00000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.cu +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
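To make the schema above concrete: for each of the M selected locations (n, c, y, x), the op applies smooth L1 to the four contiguous channels [c, c + 3] against the matching row of Y, then divides by max(1, normalizer). A sketch under that reading (`SelectSmoothL1At` is a hypothetical name for one location's contribution):

```cpp
#include <algorithm>
#include <cmath>

// Smooth L1 on a single residual, as defined in the schema above.
inline float SmoothL1(float x, float beta) {
  float ax = std::fabs(x);
  return ax < beta ? 0.5f * x * x / beta : ax - 0.5f * beta;
}

// One selected location's loss: y_hat indexes an (N, D, H, W) tensor,
// targets holds the 4 values for this location, S is the normalizer input.
float SelectSmoothL1At(const float* y_hat, int D, int H, int W,
                       int n, int c, int y, int x,
                       const float* targets, float S, float beta) {
  float loss = 0.f;
  for (int j = 0; j < 4; ++j) {
    int idx = n * D * H * W + (c + j) * H * W + y * W + x;
    loss += SmoothL1(y_hat[idx] - targets[j], beta);
  }
  return loss / std::max(S, 1.0f);
}
```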
- */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/select_smooth_l1_loss_op.h" - -namespace caffe2 { - -namespace { -__global__ void SelectSmoothL1Kernel( - const int D, const int H, const int W, - const int M, const float* Y_hat, const float* Y, const float* L, float* out, - const float* S, const float beta) { - // f(x) = 0.5 * x^2 / beta if |x| < beta - // |x| - 0.5 * beta otherwise - CUDA_1D_KERNEL_LOOP(i, M) { - int n = L[i * 4]; - int c = L[i * 4 + 1]; - int y = L[i * 4 + 2]; - int x = L[i * 4 + 3]; - - for (int j = 0; j < 4; j++){ - // Y_hat: N x (A * CLS * 4) x H x W - int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x; - float y_hat = Y_hat[ind]; - float y = Y[i * 4 + j]; - float val = y_hat - y; - float abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[ind] = (0.5 * val * val / beta) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } else { - out[ind] = (abs_val - 0.5 * beta) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } - } - } -} - - -__global__ void SelectSmoothL1GradientKernel( - const int D, const int H, const int W, - const int M, - const float* Y_hat, - const float* Y, - const float* L, - float* out, - const float* d_loss_data, - float norm, - const float* S, - float beta) { - // f'(x) = x / beta if |x| < beta - // = sign(x) otherwise - // We also scale by norm * d_loss in this kernel for convenience - CUDA_1D_KERNEL_LOOP(i, M) { - int n = L[i * 4]; - int c = L[i * 4 + 1]; - int y = L[i * 4 + 2]; - int x = L[i * 4 + 3]; - float d_loss = *d_loss_data; - - for (int j = 0; j < 4; j++) { - int ind = n * (D * H * W) + (c + j) * (H * W) + y * W + x; - float y_hat = Y_hat[ind]; - float y = Y[i * 4 + j]; - float val = y_hat - y; - float abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[ind] = norm * d_loss * val / beta / c10::cuda::compat::max(S[0], static_cast(1.0)); - } else { - out[ind] = norm * d_loss * ((float(0) < val) - (val < float(0))) / c10::cuda::compat::max(S[0], static_cast(1.0)); - } - } - } -} -} // namespace - - -template<> -bool SelectSmoothL1LossOp::RunOnDevice() { - // bbox targets predictions, for example: N x (A * 4) H x W in cls-agnostic case - auto& Y_hat = Input(0); - // true targets: for example: M x 4 where M is the #fg boxes per fpn level - auto& Y = Input(1); - // locations of fg boxes: M x 4 - auto& L = Input(2); - // total number of fg boxes across all FPN levels: scalar - auto& S = Input(3); - - - auto* avg_loss = Output(0, vector(), at::dtype()); - if (Y.size() == 0){ - math::Set( - 1, static_cast(0), avg_loss->mutable_data(), &context_); - return true; - } - - int N = Y_hat.dim32(0); - int D = Y_hat.dim32(1); - int H = Y_hat.dim32(2); - int W = Y_hat.dim32(3); - - int M = Y.dim32(0); - - // initialization - buff_.ResizeLike(Y_hat); - math::Set( - 1, static_cast(0), avg_loss->mutable_data(), &context_); - math::Set( - buff_.size(), 0.0, buff_.mutable_data(), &context_); - - // Element-wise smooth l1 loss - // l := SelectSmoothL1((y_hat - y)) - SelectSmoothL1Kernel<<>>( - D, H, W, - M, Y_hat.data(), Y.data(), - L.data(), buff_.mutable_data(), - S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Sum of all losses - // al := sum_i l_i - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - buff_.size(), buff_.data(), avg_loss_data, &context_); - - // Average of input batch size - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - return true; -} - -template<> -bool SelectSmoothL1LossGradientOp::RunOnDevice() { - auto& Y_hat = Input(0); - 
auto& Y = Input(1); - auto& L = Input(2); - auto& S = Input(3); - // Below is gradient of net w.r.t. avg_loss ("gradOutput"), should be all 1's - auto& d_avg_loss = Input(4); - - auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype()); // gradient of net w.r.t. Y_hat ("gradInput") - math::Set( - d_Y_hat->size(), 0.0, d_Y_hat->mutable_data(), &context_); - if (Y.size() == 0){ - return true; - } - - int N = Y_hat.dim32(0); - int D = Y_hat.dim32(1); - int H = Y_hat.dim32(2); - int W = Y_hat.dim32(3); - - int M = Y.dim32(0); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := (y_hat - y) - // d_Y_hat := d_avg_loss * SelectSmoothL1'((y_hat - y)) - - SelectSmoothL1GradientKernel<<size()), - CAFFE_CUDA_NUM_THREADS, - 0, context_.cuda_stream()>>>( - D, H, W, M, Y_hat.data(), Y.data(), - L.data(), d_Y_hat->mutable_data(), - d_avg_loss.data(), scale_, S.data(), beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - - -REGISTER_CUDA_OPERATOR(SelectSmoothL1Loss, - SelectSmoothL1LossOp); -REGISTER_CUDA_OPERATOR(SelectSmoothL1LossGradient, - SelectSmoothL1LossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/select_smooth_l1_loss_op.h b/modules/detectron/select_smooth_l1_loss_op.h deleted file mode 100644 index b5a3badfde716e..00000000000000 --- a/modules/detectron/select_smooth_l1_loss_op.h +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef SELECT_SMOOTH_L1_LOSS_OP_H_ -#define SELECT_SMOOTH_L1_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SelectSmoothL1LossOp final : public Operator { - public: - SelectSmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - int dim_; // dimension for 1 anchor prediction - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -template -class SelectSmoothL1LossGradientOp final : public Operator { - public: - SelectSmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - int dim_; // dimension for 1 anchor prediction - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -} // namespace caffe2 - -#endif // SELECT_SMOOTH_L1_LOSS_OP_H_ diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cc b/modules/detectron/sigmoid_cross_entropy_loss_op.cc deleted file mode 100644 index f45ff40174bbc8..00000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cc +++ /dev/null @@ -1,96 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sigmoid_cross_entropy_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR( - SigmoidCrossEntropyLoss, - SigmoidCrossEntropyLossOp); -REGISTER_CPU_OPERATOR( - SigmoidCrossEntropyLossGradient, - SigmoidCrossEntropyLossGradientOp); - -OPERATOR_SCHEMA(SigmoidCrossEntropyLoss) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Compute sigmoid activations followed by averaged binary cross entropy loss. The -target values may be in {-1, 0, 1}, where -1 indicates that the corresponding -sample should be ignored and {0, 1} correspond to the binary classes 0 and 1. By -default the loss is divided by the number of targets > -1 and then multiplied by -the `scale` op argument. The divisive normalization may be disable by setting -the op argument `normalize` to 0 (the multiplication by `scale` still takes -effect). 
- -This op fuses sigmoid and cross entropy for numerical stability in both forward -and gradient computation. -)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg( - "normalize", - "(int) default 1; if true, divide the loss by the number of targets > " - "-1.") - .Input( - 0, - "X", - "Tensor of predicted logits (shape must be at least 1D).") - .Input( - 1, - "targets", - "Tensor of targets of type int and same shape as logits X.") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SigmoidCrossEntropyLossGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "X", - "See SigmoidCrossEntropyLoss.") - .Input( - 1, - "targets", - "See SigmoidCrossEntropyLoss.") - .Input( - 2, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetSigmoidCrossEntropyLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SigmoidCrossEntropyLossGradient", - "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SigmoidCrossEntropyLoss, GetSigmoidCrossEntropyLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.cu b/modules/detectron/sigmoid_cross_entropy_loss_op.cu deleted file mode 100644 index bb86560fcb01fe..00000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.cu +++ /dev/null @@ -1,190 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sigmoid_cross_entropy_loss_op.h" - -namespace caffe2 { - -namespace { -__global__ void ElementwiseMaxKernel(const int n, float* data, const float a) { - CUDA_1D_KERNEL_LOOP(index, n) { - data[index] = (data[index] > a) ? data[index] : a; - } -} - -__global__ void SigmoidCrossEntropyLossKernel( - const int n, - const float* logits, - const int* targets, - float* losses, - float* counts) { - CUDA_1D_KERNEL_LOOP(index, n) { - if (targets[index] == -1) { - losses[index] = 0.; - counts[index] = 0.; - } else { - losses[index] = - -1. * logits[index] * (targets[index] - (logits[index] >= 0)) + - logf( - 1 + - expf(logits[index] - 2 * logits[index] * (logits[index] >= 0))); - counts[index] = 1.; - } - } -} - -__global__ void SigmoidCrossEntropyLossGradientKernel( - const int n, - const float* logits, - const int* targets, - float* d_logits, - float* counts) { - CUDA_1D_KERNEL_LOOP(index, n) { - if (targets[index] == -1) { - d_logits[index] = 0.; - counts[index] = 0.; - } else { - d_logits[index] = 1. / (1. + expf(-logits[index])) - targets[index]; - counts[index] = 1.; - } - } -} -} // namespace - -template <> -bool SigmoidCrossEntropyLossOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - - - CAFFE_ENFORCE( - X.size() == T.size(), - "Logit and target must have the same size", - "(", - X.size(), - " vs. 
", - T.size(), - ")"); - auto* avg_loss = Output(0, vector(), at::dtype()); - counts_.ResizeLike(X); - losses_.ResizeLike(X); - ReinitializeTensor(&normalizer_, vector(), at::dtype().device(CUDA)); - SigmoidCrossEntropyLossKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - X.size(), - X.data(), - T.data(), - losses_.mutable_data(), - counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - if (normalize_) { - float* normalizer_data = normalizer_.mutable_data(); - math::Sum( - counts_.size(), counts_.data(), normalizer_data, &context_); - // Prevent division by zero is all counts are zero - ElementwiseMaxKernel<<< - CAFFE_GET_BLOCKS(normalizer_.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Div( - 1, avg_loss_data, normalizer_data, avg_loss_data, &context_); - } - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - -template <> -bool SigmoidCrossEntropyLossGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - auto& d_avg_loss = Input(2); - - - auto* dX = Output(0, X.sizes(), at::dtype()); - counts_.ResizeLike(X); - ReinitializeTensor(&normalizer_, vector(), at::dtype().device(CUDA)); - SigmoidCrossEntropyLossGradientKernel<<< - CAFFE_GET_BLOCKS(X.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - X.size(), - X.data(), - T.data(), - dX->mutable_data(), - counts_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - if (normalize_) { - float* normalizer_data = normalizer_.mutable_data(); - math::Sum( - counts_.size(), counts_.data(), normalizer_data, &context_); - // Prevent division by zero is all counts are zero - ElementwiseMaxKernel<<< - CAFFE_GET_BLOCKS(normalizer_.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>(normalizer_.size(), normalizer_data, 1e-5); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Div( - 1, - d_avg_loss.data(), - normalizer_data, - normalizer_data, - &context_); - math::Scale( - 1, scale_, normalizer_data, normalizer_data, &context_); - math::Scale( - dX->size(), - normalizer_data, - dX->data(), - dX->mutable_data(), - &context_); - } else { - math::Scale( - dX->size(), - scale_, - dX->data(), - dX->mutable_data(), - &context_); - math::Scale( - dX->size(), - d_avg_loss.data(), - dX->data(), - dX->mutable_data(), - &context_); - } - return true; -} - -REGISTER_CUDA_OPERATOR( - SigmoidCrossEntropyLoss, - SigmoidCrossEntropyLossOp); -REGISTER_CUDA_OPERATOR( - SigmoidCrossEntropyLossGradient, - SigmoidCrossEntropyLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_cross_entropy_loss_op.h b/modules/detectron/sigmoid_cross_entropy_loss_op.h deleted file mode 100644 index 680519e9bdea97..00000000000000 --- a/modules/detectron/sigmoid_cross_entropy_loss_op.h +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ -#define SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SigmoidCrossEntropyLossOp final : public Operator { - public: - SigmoidCrossEntropyLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - normalize_(this->template GetSingleArgument("normalize", 1)) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int normalize_; - Tensor losses_{Context::GetDeviceType()}; - Tensor counts_{Context::GetDeviceType()}; - Tensor normalizer_; -}; - -template -class SigmoidCrossEntropyLossGradientOp final : public Operator { - public: - SigmoidCrossEntropyLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - normalize_(this->template GetSingleArgument("normalize", 1)) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE(normalize_ == 0 || normalize_ == 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int normalize_; - Tensor counts_{Context::GetDeviceType()}; - Tensor normalizer_; -}; - -} // namespace caffe2 - -#endif // SIGMOID_CROSS_ENTROPY_LOSS_OP_H_ diff --git a/modules/detectron/sigmoid_focal_loss_op.cc b/modules/detectron/sigmoid_focal_loss_op.cc deleted file mode 100644 index 583e9a0de3283f..00000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.cc +++ /dev/null @@ -1,119 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "sigmoid_focal_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SigmoidFocalLoss, SigmoidFocalLossOp); -REGISTER_CPU_OPERATOR( - SigmoidFocalLossGradient, - SigmoidFocalLossGradientOp); - -OPERATOR_SCHEMA(SigmoidFocalLoss) - .NumInputs(3) - .NumOutputs(1) - .SetDoc(R"DOC( -The binary form of Focal Loss designed for use in RetinaNet-like models. 
-The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes defines num_anchors 'groups' of logits, each of length -num_classes. For the binary form of Focal Loss, num_classes does not include -the background category. (So, for COCO, num_classes = 80, not 81.) - -The binary form of focal loss is: - - FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), - -where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0, -respectively. - -See: https://arxiv.org/abs/1708.02002 for details. -)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg( - "alpha", - "(float) default 0.25; Focal Loss's alpha hyper-parameter.") - .Arg( - "gamma", - "(float) default 1.0; Focal Loss's gamma hyper-parameter.") - .Arg( - "num_classes", - "(int) default 80; number of classes (excluding background).") - .Input( - 0, - "logits", - "4D tensor of sigmoid inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes.") - .Input( - 1, - "labels", - "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is " - "a class label in [0, num_classes - 1] (inclusive). The label " - "identifies the one class that should have a sigmoid target of 1.") - .Input( - 2, - "normalizer", - "Scalar; the loss is normalized by 1 / max(1, normalizer)." - ) - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SigmoidFocalLossGradient) - .NumInputs(4) - .NumOutputs(1) - .Input( - 0, - "logits", - "See SigmoidFocalLoss.") - .Input( - 1, - "labels", - "See SigmoidFocalLoss.") - .Input( - 2, - "normalizer", - "See SigmoidFocalLoss.") - .Input( - 3, - "d_loss", - "Gradient of forward output 0 (loss)") - .Output( - 0, - "d_logits", - "Gradient of forward input 0 (logits)"); - -class GetSigmoidFocalLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - - vector GetGradientDefs() override { - vector blob_names{ - {I(0), I(1), I(2), GO(0)}, - }; - - return SingleGradientDef( - "SigmoidFocalLossGradient", "", blob_names, vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SigmoidFocalLoss, GetSigmoidFocalLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_focal_loss_op.cu b/modules/detectron/sigmoid_focal_loss_op.cu deleted file mode 100644 index e6f2dea21b5df9..00000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
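Restating the binary focal loss the deleted kernel computes at one logit: with p = sigmoid(x), a location whose label t equals this logit's class d + 1 contributes -alpha/Np * (1 - p)^gamma * log(p); any other non-ignored label contributes -(1 - alpha)/Np * p^gamma * log(1 - p), with Np = max(weight_pos, 1). A scalar sketch under that reading (hypothetical helper name; log(1 - p) is written in terms of the logit for stability, as in the kernel):

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>

// Binary focal loss term for one logit x belonging to class (d + 1), given
// the location's target label t (0 = background, -1 = ignore).
float SigmoidFocalLoss(float x, int t, int d, float gamma, float alpha, float Np) {
  float zp = alpha / std::max(Np, 1.0f);         // positive weight
  float zn = (1.0f - alpha) / std::max(Np, 1.0f); // negative weight
  float p = 1.0f / (1.0f + std::exp(-x));
  if (t == d + 1) {  // positive for this class
    return -zp * std::pow(1.0f - p, gamma) * std::log(std::max(p, FLT_MIN));
  }
  if (t != -1) {     // negative (any other non-ignored label)
    // Stable log(1 - p) in terms of the logit.
    float log1mp = -x * (x >= 0) - std::log1p(std::exp(x - 2.0f * x * (x >= 0)));
    return -zn * std::pow(p, gamma) * log1mp;
  }
  return 0.0f;       // ignored location
}
```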
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/sigmoid_focal_loss_op.h" - -namespace caffe2 { - -namespace { - -__global__ void SigmoidFocalLossKernel( - const int N, const int D, const int H, const int W, const float* logits, - const int* targets, const float* weight_pos, - const float gamma, const float alpha, - const int num_classes, float* losses) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - int x = i % W; - int y = (i / W) % H; - int c = (i / (W * H)) % D; // channel, here D is channel dim in input NxDxHxW - int n = i / (W * H * D); // n in NxDxHxW - - int A = D / num_classes; // num_anchors = A - int a = c / num_classes; // current anchor out of A anchors in D = A * num_cls - int d = c % num_classes; // current class - int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; // target - - // check whether the class is true class or not. - // The target classes are in range 1 - 81 and the d is in range 0-80 - // because we predict A*80 dim, so for comparison purpose, compare t and (d+1) - float c1 = (t == (d + 1)); - float c2 = (t != -1 & t != (d + 1)); - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float zn = (1.0 - alpha) / Np; - float zp = alpha / Np; - - // p = 1. / 1. + expf(-x) - float p = 1. / (1. + expf(-logits[i])); - - // (1 - p)**gamma * log(p) where - float term1 = powf((1. - p), gamma) * logf(c10::cuda::compat::max(p, FLT_MIN)); - // p**gamma * log(1 - p) - float term2 = - powf(p, gamma) * - (-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))); - - losses[i] = 0.0; - losses[i] += -c1 * term1 * zp; - losses[i] += -c2 * term2 * zn; - } -} - -__global__ void SigmoidFocalLossGradientKernel( - const int N, const int D, const int H, const int W, const float* logits, - const int* targets, float* dX_data, const float* weight_pos, - const float gamma, const float alpha, const int num_classes, - const float* avg_loss) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - float a_loss = avg_loss[0]; - int x = i % W; - int y = (i / W) % H; - int c = (i / (W * H)) % D; - int n = i / (W * H * D); - - int A = D / num_classes; // num_anchors - int a = c / num_classes; // current anchor - int d = c % num_classes; // current class - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float zn = (1.0 - alpha) / Np; - float zp = alpha / Np; - int t = targets[n * (H * W * A) + a * (H * W) + y * W + x]; - - float c1 = (t == (d + 1)); - float c2 = (t != -1 & t != (d + 1)); - float p = 1. / (1. + expf(-logits[i])); - - // (1-p)**g * (1 - p - g*p*log(p)) - float term1 = - powf((1. - p), gamma) * - (1. - p - (p * gamma * logf(c10::cuda::compat::max(p, FLT_MIN)))); - // (p**g) * (g*(1-p)*log(1-p) - p) - float term2 = - powf(p, gamma) * - ((-1. * logits[i] * (logits[i] >= 0) - - logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) * - (1. 
- p) * gamma - p); - dX_data[i] = 0.0; - dX_data[i] += -c1 * zp * term1; - dX_data[i] += -c2 * zn * term2; - dX_data[i] = dX_data[i] * a_loss; - } -} -} // namespace - -template<> -bool SigmoidFocalLossOp::RunOnDevice() { - // Input logits, for example: N x (A * 80) x H x W in cls-agnostic - auto& X = Input(0); - // Target, for example: N x A x H x W - auto& T = Input(1); - // Number of positive examples: scalar - auto& wp = Input(2); - // output avg Sigmoid focal loss as mentioned in RetinaNet paper - - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - - auto* avg_loss = Output(0, vector(), at::dtype()); - losses_.ResizeLike(X); - float* avg_loss_data = avg_loss->mutable_data(); - - SigmoidFocalLossKernel<<>>( - N, D, H, W, X.data(), T.data(), - wp.data(), gamma_, alpha_, num_classes_, - losses_.mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - - -template<> -bool SigmoidFocalLossGradientOp::RunOnDevice() { - auto& X = Input(0); - auto& T = Input(1); - auto& wp = Input(2); - auto& d_avg_loss = Input(InputSize() - 1); - - - // get input shape - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - - auto* dX = Output(0, X.sizes(), at::dtype()); - - SigmoidFocalLossGradientKernel<<>>( - N, D, H, W, X.data(), T.data(), dX->mutable_data(), - wp.data(), gamma_, alpha_, num_classes_, - d_avg_loss.data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Scale( - dX->size(), - scale_, - dX->data(), - dX->mutable_data(), - &context_); - - return true; -} - - -REGISTER_CUDA_OPERATOR(SigmoidFocalLoss, - SigmoidFocalLossOp); -REGISTER_CUDA_OPERATOR(SigmoidFocalLossGradient, - SigmoidFocalLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/sigmoid_focal_loss_op.h b/modules/detectron/sigmoid_focal_loss_op.h deleted file mode 100644 index 7640e0bc8a430c..00000000000000 --- a/modules/detectron/sigmoid_focal_loss_op.h +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
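For reference, the two kernel terms above are the closed-form derivative of the binary focal loss with respect to the logit: differentiating -zp * (1-p)^gamma * log(p) with p = sigmoid(x) gives -zp * (1-p)^gamma * (1 - p - gamma*p*log(p)), and the negative branch gives -zn * p^gamma * (gamma*(1-p)*log(1-p) - p). A scalar sketch of that derivative (hypothetical helper; the deleted kernel additionally multiplies by the incoming d_loss):

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>

// d/dx of the binary focal loss, in the same two-term form as the kernel.
float SigmoidFocalLossGrad(float x, int t, int d, float gamma, float alpha, float Np) {
  float zp = alpha / std::max(Np, 1.0f);
  float zn = (1.0f - alpha) / std::max(Np, 1.0f);
  float p = 1.0f / (1.0f + std::exp(-x));
  if (t == d + 1) {  // positive term
    return -zp * std::pow(1.0f - p, gamma) *
           (1.0f - p - gamma * p * std::log(std::max(p, FLT_MIN)));
  }
  if (t != -1) {     // negative term, stable log(1 - p) as before
    float log1mp = -x * (x >= 0) - std::log1p(std::exp(x - 2.0f * x * (x >= 0)));
    return -zn * std::pow(p, gamma) * (gamma * (1.0f - p) * log1mp - p);
  }
  return 0.0f;
}
```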
- */ - -#ifndef SIGMOID_FOCAL_LOSS_OP_H_ -#define SIGMOID_FOCAL_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SigmoidFocalLossOp final : public Operator { - public: - SigmoidFocalLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - num_classes_(this->template GetSingleArgument("num_classes", 80)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)) { - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int num_classes_; - float gamma_; - float alpha_; - Tensor losses_{Context::GetDeviceType()}; - Tensor counts_{Context::GetDeviceType()}; -}; - -template -class SigmoidFocalLossGradientOp final : public Operator { - public: - SigmoidFocalLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - num_classes_(this->template GetSingleArgument("num_classes", 80)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)) { - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - int num_classes_; - float gamma_; - float alpha_; - Tensor counts_{Context::GetDeviceType()}; - Tensor weights_{Context::GetDeviceType()}; // unignored weights -}; - -} // namespace caffe2 - -#endif // SIGMOID_FOCAL_LOSS_OP_H_ diff --git a/modules/detectron/smooth_l1_loss_op.cc b/modules/detectron/smooth_l1_loss_op.cc deleted file mode 100644 index 9ea570ac9c1b03..00000000000000 --- a/modules/detectron/smooth_l1_loss_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "smooth_l1_loss_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SmoothL1Loss, SmoothL1LossOp); -REGISTER_CPU_OPERATOR( - SmoothL1LossGradient, - SmoothL1LossGradientOp); - -OPERATOR_SCHEMA(SmoothL1Loss) - .NumInputs(4) - .NumOutputs(1) - .SetDoc(R"DOC( -Smooth L1 Loss is a minor variation of Huber loss in which the point of -transition between L2 loss and L1 loss is adjustable by a hyper-parameter beta: - - SmoothL1(x) = 0.5 * x^2 / beta if |x| < beta - |x| - 0.5 * beta otherwise. - -SmoothL1 is used in Fast R-CNN and descendants as the loss function for bounding -box regression. - -The loss computed by this op has a flexible form: - - scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])). - -The weights alpha_in and alpha_out are called the "inside" and "outside" -weights, respectively. 
The inside weights are typically set to either 0 or 1 to -implement ignoring (when 0) certain samples. The outside weights can be used -to implement a per-sample loss weight. The overall loss is scaled by scale / N, -where N is the number of batch elements in the input predictions. -)DOC") - .Arg( - "beta", - "(float) default 1.0; L2 to L1 transition point.") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Input( - 0, - "Y_hat", - "Tensor of predictions (at least 1D).") - .Input( - 1, - "Y", - "Tensor of labels with the same shape as Y_hat.") - .Input( - 2, - "alpha_in", - "Tensor of inside weights with the same shape as Y.") - .Input( - 3, - "alpha_out", - "Tensor of outside weights with the same shape as Y.") - .Output( - 0, - "loss", - "Scalar loss."); - -OPERATOR_SCHEMA(SmoothL1LossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input( - 0, - "Y_hat", - "See SmoothL1Loss.") - .Input( - 1, - "Y", - "See SmoothL1Loss.") - .Input( - 2, - "alpha_in", - "See SmoothL1Loss.") - .Input( - 3, - "alpha_out", - "See SmoothL1Loss.") - .Input( - 4, - "d_loss", - "Gradient of forward output 0 (loss).") - .Output( - 0, - "d_Y_hat", - "Gradient of forward input 0 (Y_hat)."); - -class GetSmoothL1LossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SmoothL1LossGradient", - "", - vector{I(0), I(1), I(2), I(3), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SmoothL1Loss, GetSmoothL1LossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/smooth_l1_loss_op.cu b/modules/detectron/smooth_l1_loss_op.cu deleted file mode 100644 index ad2d9148c72f0d..00000000000000 --- a/modules/detectron/smooth_l1_loss_op.cu +++ /dev/null @@ -1,185 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
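The flexible form in the schema above reduces to a few lines of host code. A sketch (hypothetical helper) of scale / N * sum_i alpha_out[i] * SmoothL1(alpha_in[i] * (y_hat[i] - y[i])):

```cpp
#include <cmath>
#include <vector>

// Full SmoothL1Loss reduction as described in the deleted schema.
// N is the batch size (dim 0 of the predictions).
float SmoothL1Loss(const std::vector<float>& y_hat, const std::vector<float>& y,
                   const std::vector<float>& alpha_in,
                   const std::vector<float>& alpha_out,
                   int N, float beta, float scale) {
  float total = 0.f;
  for (std::size_t i = 0; i < y.size(); ++i) {
    float d = alpha_in[i] * (y_hat[i] - y[i]);  // inside-weighted residual
    float ad = std::fabs(d);
    total += alpha_out[i] * (ad < beta ? 0.5f * d * d / beta : ad - 0.5f * beta);
  }
  return scale / N * total;
}
```

Setting alpha_in[i] = 0 zeroes both the residual and (via the chain rule) its gradient, which is how ignored regression targets are implemented.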
- */ - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/smooth_l1_loss_op.h" - -namespace caffe2 { - -namespace { -template -__global__ void SmoothL1Kernel( - const int n, const T* in, T* out, T beta) { - // f(x) = 0.5 * x^2 / beta if |x| < beta - // |x| - 0.5 * beta otherwise - CUDA_1D_KERNEL_LOOP(index, n) { - T val = in[index]; - T abs_val = c10::cuda::compat::abs(val); - if (abs_val < beta) { - out[index] = 0.5 * val * val / beta; - } else { - out[index] = abs_val - 0.5 * beta; - } - } -} - -template -__global__ void SmoothL1GradientKernel( - const int n, - const T* in, - T* out, - const T* d_loss_data, - T norm, - T beta) { - // f'(x) = x / beta if |x| < beta - // = sign(x) otherwise - // We also scale by norm * d_loss in this kernel for convenience - CUDA_1D_KERNEL_LOOP(index, n) { - T val = in[index]; - T abs_val = c10::cuda::compat::abs(val); - T d_loss = *d_loss_data; - if (abs_val < beta) { - out[index] = norm * d_loss * val / beta; - } else { - out[index] = norm * d_loss * ((T(0) < val) - (val < T(0))); - } - } -} -} // namespace - -template<> -bool SmoothL1LossOp::RunOnDevice() { - auto& Y_hat = Input(0); - auto& Y = Input(1); - auto& alpha_in = Input(2); - auto& alpha_out = Input(3); - - - int N = Y.dim32(0); - // Require the same number of elements along axis 0 (batch size), but - // otherwise don't care about the shape (just the number of elements) - CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0), - "Y_hat and Y must have the same number of elements along axis 0"); - CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(), - "Y_hat and Y must have the same number of elements"); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - - auto* avg_loss = Output(0, vector(), at::dtype()); - buff_.ResizeLike(Y); - - // Difference - // d := y_hat - y - math::Sub( - Y.size(), Y_hat.data(), Y.data(), - buff_.mutable_data(), &context_); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := alpha_in * (y_hat - y) - math::Mul( - buff_.size(), buff_.data(), alpha_in.data(), - buff_.mutable_data(), &context_); - - // Element-wise smooth l1 loss - // l := SmoothL1(alpha_in * (y_hat - y)) - SmoothL1Kernel - <<>>( - buff_.size(), buff_.data(), buff_.mutable_data(), - beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Element-wise weighted smooth l1 loss (can be used to specify a per-element - // loss weight) - // l := alpha_out * SmoothL1(alpha_in * (y_hat - y)) - math::Mul( - buff_.size(), buff_.data(), alpha_out.data(), - buff_.mutable_data(), &context_); - // Sum of all losses - // al := sum_i l_i - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - buff_.size(), buff_.data(), avg_loss_data, &context_); - // Average of input batch size - // al := 1/N * al - math::Scale( - 1, scale_ / N, avg_loss_data, avg_loss_data, &context_); - return true; -} - -template<> -bool SmoothL1LossGradientOp::RunOnDevice() { - auto& Y_hat = Input(0); - auto& Y = Input(1); - auto& alpha_in = Input(2); - auto& alpha_out = Input(3); - auto& d_avg_loss = Input(4); // gradient of net w.r.t. 
avg_loss ("gradOutput") - // We intentially don't compute gradients for Y, alpha_{in,out} since they - // are not needed (can change in the future if desired) - - int N = Y.dim32(0); - // Require the same number of elements along axis 0 (batch size), but - // otherwise don't care about the shape (just the number of elements) - CAFFE_ENFORCE_EQ(Y_hat.dim32(0), Y.dim32(0), - "Y_hat and Y must have the same number of elements along axis 0"); - CAFFE_ENFORCE_EQ(Y_hat.size(), Y.size(), - "Y_hat and Y must have the same number of elements"); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_in.size()); - CAFFE_ENFORCE_EQ(Y_hat.size(), alpha_out.size()); - CAFFE_ENFORCE_EQ(d_avg_loss.size(), 1); - - auto* d_Y_hat = Output(0, Y_hat.sizes(), at::dtype()); // gradient of net w.r.t. Y_hat ("gradInput") - buff_.ResizeLike(Y); - - // Difference - // d := y_hat - y - math::Sub( - Y.size(), Y_hat.data(), Y.data(), - buff_.mutable_data(), &context_); - // Element-wise weighted difference (can be used to ignore or reweight - // specific components) - // d := alpha_in * (y_hat - y) - math::Mul( - buff_.size(), buff_.data(), alpha_in.data(), - buff_.mutable_data(), &context_); - // d_Y_hat := d_avg_loss / N * SmoothL1'(alpha_in * (y_hat - y)) - SmoothL1GradientKernel - <<>>( - buff_.size(), buff_.data(), d_Y_hat->mutable_data(), - d_avg_loss.data(), scale_ / N, beta_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Element-wise scale by alpha_in and alpha_out - math::Mul( - d_Y_hat->size(), d_Y_hat->data(), alpha_in.data(), - d_Y_hat->mutable_data(), &context_); - math::Mul( - d_Y_hat->size(), d_Y_hat->data(), alpha_out.data(), - d_Y_hat->mutable_data(), &context_); - return true; -} - - -REGISTER_CUDA_OPERATOR(SmoothL1Loss, - SmoothL1LossOp); -REGISTER_CUDA_OPERATOR(SmoothL1LossGradient, - SmoothL1LossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/smooth_l1_loss_op.h b/modules/detectron/smooth_l1_loss_op.h deleted file mode 100644 index 5e5cfd882930e0..00000000000000 --- a/modules/detectron/smooth_l1_loss_op.h +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
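The deleted gradient path is the chain rule through that form: SmoothL1'(d) = d / beta for |d| < beta and sign(d) otherwise, scaled by alpha_in (from the inner multiplication), alpha_out, scale / N, and the incoming d_avg_loss. A per-element sketch (hypothetical helper):

```cpp
#include <cmath>

// Per-element d(loss)/d(y_hat) implemented by the deleted kernel plus the
// two element-wise multiplies that follow it.
float SmoothL1LossGrad(float y_hat, float y, float a_in, float a_out,
                       int N, float beta, float scale, float d_loss) {
  float d = a_in * (y_hat - y);
  // SmoothL1'(d): linear inside the beta band, sign(d) outside it.
  float g = std::fabs(d) < beta ? d / beta
                                : static_cast<float>((0.f < d) - (d < 0.f));
  return scale / N * d_loss * g * a_in * a_out;
}
```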
- */ - -#ifndef SMOOTH_L1_LOSS_OP_H_ -#define SMOOTH_L1_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SmoothL1LossOp final : public Operator { - public: - SmoothL1LossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -template -class SmoothL1LossGradientOp final : public Operator { - public: - SmoothL1LossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - beta_(this->template GetSingleArgument("beta", 1.)), - scale_(this->template GetSingleArgument("scale", 1.)) { - CAFFE_ENFORCE(beta_ > 0); - CAFFE_ENFORCE(scale_ >= 0); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float beta_; // Transition point from L1 to L2 loss - float scale_; // Scale the loss by scale_ - Tensor buff_{Context::GetDeviceType()}; // Buffer for element-wise differences -}; - -} // namespace caffe2 - -#endif // SMOOTH_L1_LOSS_OP_H_ diff --git a/modules/detectron/softmax_focal_loss_op.cc b/modules/detectron/softmax_focal_loss_op.cc deleted file mode 100644 index 7bc44571f7a5e1..00000000000000 --- a/modules/detectron/softmax_focal_loss_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "modules/detectron/softmax_focal_loss_op.h" - -#include "caffe2/operators/softmax_utils.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SoftmaxFocalLoss, SoftmaxFocalLossOp); -REGISTER_CPU_OPERATOR( - SoftmaxFocalLossGradient, - SoftmaxFocalLossGradientOp); - -OPERATOR_SCHEMA(SoftmaxFocalLoss) - .NumInputs(3) - .NumOutputs(2) - .SetDoc(R"DOC( -A multiclass form of Focal Loss designed for use in RetinaNet-like models. -The input is assumed to be unnormalized scores (sometimes called 'logits') -arranged in a 4D tensor with shape (N, C, H, W), where N is the number of -elements in the batch, H and W are the height and width, and C = num_anchors * -num_classes. The softmax is applied num_anchors times along the C axis. - -The softmax version of focal loss is: - - FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t), - -where p_i = exp(s_i) / sum_j exp(s_j), t is the target (ground truth) class, and -s_j is the unnormalized score for class j. - -See: https://arxiv.org/abs/1708.02002 for details. 
-)DOC") - .Arg( - "scale", - "(float) default 1.0; multiply the loss by this scale factor.") - .Arg("alpha", "(float) default 0.25; Focal Loss's alpha hyper-parameter.") - .Arg("gamma", "(float) default 1.0; Focal Loss's gamma hyper-parameter.") - .Arg( - "num_classes", - "(int) default 81; number of classes in each softmax group.") - .Input( - 0, - "scores", - "4D tensor of softmax inputs (called 'scores' or 'logits') with shape " - "(N, C, H, W), where C = num_anchors * num_classes defines num_anchors " - "groups of contiguous num_classes softmax inputs.") - .Input( - 1, - "labels", - "4D tensor of labels with shape (N, num_anchors, H, W). Each entry is " - "a class label in [0, num_classes - 1] (inclusive).") - .Input( - 2, - "normalizer", - "Scalar; the loss is normalized by 1 / max(1, normalizer).") - .Output(0, "loss", "Scalar loss.") - .Output( - 1, - "probabilities", - "4D tensor of softmax probabilities with shape (N, C, H, W), where " - "C = num_anchors * num_classes, and softmax was applied to each of the " - "num_anchors groups; within a group the num_classes values sum to 1."); - -OPERATOR_SCHEMA(SoftmaxFocalLossGradient) - .NumInputs(5) - .NumOutputs(1) - .Input(0, "scores", "See SoftmaxFocalLoss.") - .Input(1, "labels", "See SoftmaxFocalLoss.") - .Input(2, "normalizer", "See SoftmaxFocalLoss.") - .Input( - 3, - "probabilities", - "Output 1 from SoftmaxFocalLoss; See SoftmaxFocalLoss.") - .Input(4, "d_loss", "Gradient of forward output 0 (loss)") - .Output(0, "d_scores", "Gradient of forward input 0 (scores)"); - -class GetSoftmaxFocalLossGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SoftmaxFocalLossGradient", - "", - vector{I(0), I(1), I(2), O(1), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(SoftmaxFocalLoss, GetSoftmaxFocalLossGradient); - -} // namespace caffe2 diff --git a/modules/detectron/softmax_focal_loss_op.cu b/modules/detectron/softmax_focal_loss_op.cu deleted file mode 100644 index 0612ef7edcc8c9..00000000000000 --- a/modules/detectron/softmax_focal_loss_op.cu +++ /dev/null @@ -1,256 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
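As a companion to the schema above, here is a hedged NumPy sketch of the documented forward computation: a softmax over each group of `num_classes` channels per anchor, then FL(p_t) with the alpha weighting used by the CUDA kernels below (z = (1 - alpha)/Np for background, alpha/Np for foreground, Np = max(normalizer, 1)). The function name and the 1e-38 stand-in for FLT_MIN are ours.

```python
import numpy as np

def softmax_focal_loss(scores, labels, normalizer, num_classes=81,
                       alpha=0.25, gamma=1.0, scale=1.0):
    N, D, H, W = scores.shape
    A = D // num_classes                      # anchors per spatial location
    s = scores.reshape(N, A, num_classes, H, W)
    s = s - s.max(axis=2, keepdims=True)      # stabilize, as in SpatialSoftmaxKernel
    p = np.exp(s)
    p /= p.sum(axis=2, keepdims=True)         # softmax within each anchor group

    Np = max(float(normalizer), 1.0)
    z = np.where(labels == 0, (1.0 - alpha) / Np, alpha / Np)  # alpha weighting
    loss = np.zeros(labels.shape)
    valid = labels >= 0                       # label < 0 means "ignore"
    n, a, h, w = np.nonzero(valid)
    pt = p[n, a, labels[valid], h, w]         # probability of the target class
    loss[valid] = -((1.0 - pt) ** gamma) * np.log(np.maximum(pt, 1e-38)) * z[valid]
    return scale * loss.sum(), p.reshape(N, D, H, W)
```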
- */ - -#include - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/softmax_focal_loss_op.h" - -namespace caffe2 { - -namespace { - -__global__ void SpatialSoftmaxKernel(const int N, const int A, - const int H, const int W, const float* Xdata, float* Pdata, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(index, N * A * H * W) { - int D = num_classes * A; - int x = index % W; - int y = (index / W) % H; - int a = (index / (W * H)) % A; - int i = index / W / H / A; - - // Subtract max on each cell for numerical reasons - float max_val = -FLT_MAX; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - max_val = max(max_val, Xdata[idx]); - } - // Exponentiate - float expsum = 0.0f; - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - float expx = exp(Xdata[idx] - max_val); - Pdata[idx] = expx; - expsum += expx; - } - // Normalize - for(int c = a * num_classes; c < (a + 1) * num_classes; ++c) { - int idx = i * (H * W * D) + c * (H * W) + y * W + x; - Pdata[idx] /= expsum; - } - } -} - - -__global__ void SoftmaxFocalLossKernel( - const int N, const int A, const int H, const int W, - const float* Pdata, const int* targets, float* losses, - const float* weight_pos, const float gamma, const float alpha, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * H * W) { - int D = A * num_classes; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - const int label = static_cast(targets[i]); - - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float z = (label == 0) * (1 - alpha) / Np + - (label >= 1) * alpha / Np; - - losses[i] = 0.0; - if (label >= 0) { - int offset = a * num_classes; - int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x; - losses[i] = - -(pow(1.0f - Pdata[idx], gamma) * - log(c10::cuda::compat::max(Pdata[idx], FLT_MIN))) * z; - } - } -} - - -__global__ void SoftmaxFocalLossGradientWeightKernel( - const int N, const int A, const int H, const int W, - const float* Pdata, const int* targets, float* buff, - const float* weight_pos, const float gamma, const float alpha, - const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * A * H * W) { - int D = A * num_classes; - int x = i % W; - int y = (i / W) % H; - int a = (i / (W * H)) % A; - int n = i / (W * H * A); - const int label = static_cast(targets[i]); - float Np = c10::cuda::compat::max(weight_pos[0], static_cast(1.0)); - float z = (label == 0) * (1 - alpha) / Np + - (label >= 1) * alpha / Np; - - buff[i] = 0.0; - if (label >= 0) { - int offset = a * num_classes; - int idx = n * (H * W * D) + (offset + label) * (H * W) + y * W + x; - float onemp = 1. 
- Pdata[idx]; - float p = Pdata[idx]; - buff[i] = - (-pow(onemp, gamma) + - gamma * pow(onemp, gamma - 1) * p * log(c10::cuda::compat::max(p, FLT_MIN))) * z; - } - } -} - - -__global__ void SoftmaxFocalLossGradientKernel( - const int N, const int D, const int H, const int W, - const float* Pdata, const int* targets, const float* buff, - const float* d_loss_data, float* dX, const int num_classes) { - CUDA_1D_KERNEL_LOOP(i, N * D * H * W) { - int A = D / num_classes; - int x = i % W; - int y = (i / W) % H; - int d = (i / (W * H)) % D; - int a = d / num_classes; - int c = d % num_classes; - int n = i / (W * H * D); - float d_loss = *d_loss_data; - - int ind = n * (H * W * A) + a * (H * W) + y * W + x; - const int label = static_cast(targets[ind]); - - float c1 = (label >= 0) * 1.0; - float c2 = (label == c) * 1.0; - dX[i] = 0.0; - dX[i] = c1 * d_loss * buff[ind] * (c2 - Pdata[i]); - } -} - -} // namespace - - -template <> -bool SoftmaxFocalLossOp::RunOnDevice() { - auto& X = Input(0); // Logits - auto& T = Input(1); // Labels - auto& wp = Input(2); // num of foreground - // average loss as output - // softmax probability, going to be re-used in gradient - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - ReinitializeTensor(&losses_, {N * A * H * W}, at::dtype().device(CUDA)); - auto* P = Output(1, {N * D * H * W}, at::dtype()); - auto* avg_loss = Output(0, vector(), at::dtype()); - math::Set( - avg_loss->size(), 0.f, avg_loss->mutable_data(), &context_); - math::Set( - P->size(), 0.f, P->mutable_data(), &context_); - math::Set( - losses_.size(), 0.f, losses_.mutable_data(), &context_); - TORCH_DCHECK_EQ(X.ndim(), 4); - - const float* Xdata = X.data(); - const float* Wdata = wp.data(); - - - // Spatial Softmax Kernel - SpatialSoftmaxKernel - <<>>( - N, A, H, W, Xdata, P->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Compute loss for each x,y location - const int* Tdata = T.data(); - SoftmaxFocalLossKernel - <<>>( - N, A, H, W, P->data(), Tdata, losses_.mutable_data(), - Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // sum the losses - float* avg_loss_data = avg_loss->mutable_data(); - math::Sum( - losses_.size(), losses_.data(), avg_loss_data, &context_); - math::Scale( - 1, scale_, avg_loss_data, avg_loss_data, &context_); - - return true; -} - - -template<> -bool SoftmaxFocalLossGradientOp::RunOnDevice() { - auto& X = Input(0); // Logits - auto& T = Input(1); // Label - auto& wp = Input(2); // num of foreground example - auto& P = Input(3); // Softmax Probability - auto& d_avg_loss = Input(4); - - - int N = X.dim32(0); - int D = X.dim32(1); - int H = X.dim32(2); - int W = X.dim32(3); - int A = D / num_classes_; - - ReinitializeTensor(&buff_, {N * A * H * W}, at::dtype().device(CUDA)); - - auto* dX = Output(0, X.sizes(), at::dtype()); // gradient wrt logits - - const float* Xdata = X.data(); - const int* Tdata = T.data(); - const float* Pdata = P.data(); - const float* Wdata = wp.data(); - - - // Compute the weight for gradients - SoftmaxFocalLossGradientWeightKernel - <<>>( - N, A, H, W, Pdata, Tdata, buff_.mutable_data(), - Wdata, gamma_, alpha_, num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - // Compute the gradient with the weights - const float* Bdata = buff_.data(); - SoftmaxFocalLossGradientKernel - <<>>( - N, D, H, W, Pdata, Tdata, Bdata, d_avg_loss.data(), - dX->mutable_data(), num_classes_); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - math::Scale( - dX->size(), 
- scale_, - dX->data(), - dX->mutable_data(), - &context_); - return true; -} - - -REGISTER_CUDA_OPERATOR(SoftmaxFocalLoss, - SoftmaxFocalLossOp); -REGISTER_CUDA_OPERATOR(SoftmaxFocalLossGradient, - SoftmaxFocalLossGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/softmax_focal_loss_op.h b/modules/detectron/softmax_focal_loss_op.h deleted file mode 100644 index 413c5bd6d7054a..00000000000000 --- a/modules/detectron/softmax_focal_loss_op.h +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SOFTMAX_FOCAL_LOSS_OP_H_ -#define SOFTMAX_FOCAL_LOSS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SoftmaxFocalLossOp final : public Operator { - public: - SoftmaxFocalLossOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - float gamma_; - float alpha_; - int num_classes_; - StorageOrder order_; - Tensor losses_; -}; - -template -class SoftmaxFocalLossGradientOp final : public Operator { - public: - SoftmaxFocalLossGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 1.)), - gamma_(this->template GetSingleArgument("gamma", 1.)), - alpha_(this->template GetSingleArgument("alpha", 0.25)), - num_classes_(this->template GetSingleArgument("num_classes", 81)), - order_(StringToStorageOrder( - this->template GetSingleArgument("order", "NCHW"))) { - CAFFE_ENFORCE(scale_ >= 0); - CAFFE_ENFORCE_EQ( - order_, StorageOrder::NCHW, "Only NCHW order is supported right now."); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - float scale_; - float gamma_; - float alpha_; - int num_classes_; - StorageOrder order_; - Tensor buff_; -}; - -} // namespace caffe2 - -#endif // SOFTMAX_FOCAL_LOSS_OP_H_ diff --git a/modules/detectron/spatial_narrow_as_op.cc b/modules/detectron/spatial_narrow_as_op.cc deleted file mode 100644 index 363aa63a8f1229..00000000000000 --- a/modules/detectron/spatial_narrow_as_op.cc +++ /dev/null @@ -1,79 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. 
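The corresponding backward pass, as the two gradient kernels above express it, in hedged NumPy form (names are ours; the `buff` weight and the `(c == label) - p` factor mirror SoftmaxFocalLossGradientWeightKernel and SoftmaxFocalLossGradientKernel):

```python
import numpy as np

def softmax_focal_loss_grad(P, labels, normalizer, d_loss, num_classes=81,
                            alpha=0.25, gamma=1.0, scale=1.0):
    N, D, H, W = P.shape
    A = D // num_classes
    p = P.reshape(N, A, num_classes, H, W)    # softmax output from the forward pass

    Np = max(float(normalizer), 1.0)
    z = np.where(labels == 0, (1.0 - alpha) / Np, alpha / Np)

    # weight kernel: z * (-(1-pt)^gamma + gamma*(1-pt)^(gamma-1) * pt * log(pt))
    buff = np.zeros(labels.shape)
    valid = labels >= 0
    n, a, h, w = np.nonzero(valid)
    pt = p[n, a, labels[valid], h, w]
    onemp = 1.0 - pt
    buff[valid] = (-(onemp ** gamma) + gamma * onemp ** (gamma - 1.0) * pt
                   * np.log(np.maximum(pt, 1e-38))) * z[valid]

    # gradient kernel: dX = d_loss * buff * ((c == label) - p), zeroed where label < 0
    onehot = (np.arange(num_classes)[None, None, :, None, None]
              == labels[:, :, None, :, :])
    dX = valid[:, :, None, :, :] * d_loss * buff[:, :, None, :, :] * (onehot - p)
    return scale * dX.reshape(N, D, H, W)
```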
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "spatial_narrow_as_op.h" - -namespace caffe2 { - -REGISTER_CPU_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp); -REGISTER_CPU_OPERATOR( - SpatialNarrowAsGradient, - SpatialNarrowAsGradientOp); - -OPERATOR_SCHEMA(SpatialNarrowAs) - .NumInputs(2) - .NumOutputs(1) - .SetDoc(R"DOC( -Reduces ("narrows") the spatial extent of A to that of B by removing rows and -columns from the bottom and right. -)DOC") - .Input( - 0, - "A", - "3D or 4D input of shape (N, H0, W0) or (N, C, H0, W0).") - .Input( - 1, - "B", - "3D or 4D input of shape (N, H1, W1) or (N, C, H1, W1), where H1 <= H0 " - "and W1 <= W0.") - .Output( - 0, - "C", - "Sub window of A containing rows [0, H1 - 1] (inclusive) and columns " - "[0, W1 - 1] (inclusive)."); - -OPERATOR_SCHEMA(SpatialNarrowAsGradient) - .NumInputs(3) - .NumOutputs(1) - .Input( - 0, - "A", - "See SpatialNarrowAs.") - .Input( - 1, - "B", - "See SpatialNarrowAs.") - .Input( - 2, - "dC", - "Gradient of forward output 0 (C).") - .Output( - 0, - "dA", - "Gradient of forward input 0 (A)"); - -class SpatialNarrowAsGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "SpatialNarrowAsGradient", "", - vector{I(0), I(1), GO(0)}, - vector{GI(0)}); - } -}; -REGISTER_GRADIENT(SpatialNarrowAs, SpatialNarrowAsGradient); - -} // namespace caffe2 diff --git a/modules/detectron/spatial_narrow_as_op.cu b/modules/detectron/spatial_narrow_as_op.cu deleted file mode 100644 index ff8b5632e80a85..00000000000000 --- a/modules/detectron/spatial_narrow_as_op.cu +++ /dev/null @@ -1,165 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
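The schema amounts to a bottom/right crop. A minimal sketch, assuming same-rank inputs (the operator also accepts a 3D B against a 4D A):

```python
import numpy as np

def spatial_narrow_as(A, B):
    # keep rows [0, H1) and columns [0, W1), where (H1, W1) are B's spatial dims
    return A[..., :B.shape[-2], :B.shape[-1]]
```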
- */ - -#include "caffe2/core/context_gpu.h" -#include "caffe2/core/operator.h" -#include "modules/detectron/spatial_narrow_as_op.h" - -namespace caffe2 { - -namespace { -template -__global__ void CopyKernel( - const int N, - const int C, - const int in_H, - const int in_W, - const int out_H, - const int out_W, - const T* in_data, - T* out_data) { - CUDA_1D_KERNEL_LOOP(index, N * C * out_H * out_W) { - int w = index % out_W; - int h = (index / out_W) % out_H; - int c = (index / out_W / out_H) % C; - int n = (index / out_W / out_H / C); - int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w; - int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w; - out_data[out_index] = in_data[in_index]; - } -} - -template -__global__ void CopyGradientKernel( - const int N, - const int C, - const int in_H, - const int in_W, - const int out_H, - const int out_W, - const T* in_data, - T* out_data) { - CUDA_1D_KERNEL_LOOP(index, N * C * in_H * in_W) { - int w = index % in_W; - int h = (index / in_W) % in_H; - int c = (index / in_W / in_H) % C; - int n = (index / in_W / in_H / C); - int in_index = n * C * in_H * in_W + c * in_H * in_W + h * in_W + w; - int out_index = n * C * out_H * out_W + c * out_H * out_W + h * out_W + w; - out_data[out_index] = in_data[in_index]; - } -} -} // namespace - - -template <> -bool SpatialNarrowAsOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -template <> -template -bool SpatialNarrowAsOp::DoRunWithType() { - // Narrows input 0 (A) spatially to match input 1 (B) - auto& A = Input(0); - auto& B = Input(1); - - - CAFFE_ENFORCE_EQ(A.dim32(0), B.dim32(0), "Input dim 0 must be equal."); - std::vector sizes; - if (A.ndim() == B.ndim()) { - CAFFE_ENFORCE_EQ(A.dim32(1), B.dim32(1), "Input dim 1 must be equal."); - CAFFE_ENFORCE_GE( - A.dim32(2), B.dim32(2), "Input 0 height must be >= input 1 height."); - CAFFE_ENFORCE_GE( - A.dim32(3), B.dim32(3), "Input 0 width must be >= input 1 width."); - sizes = B.sizes().vec(); - } else { - // For (N, H, W) case - CAFFE_ENFORCE_EQ(A.ndim() - 1, B.ndim(), "Dimension mismatch."); - CAFFE_ENFORCE_GE( - A.dim32(2), B.dim32(1), "Input 0 height must be >= input 1 height."); - CAFFE_ENFORCE_GE( - A.dim32(3), B.dim32(2), "Input 0 width must be >= input 1 width."); - sizes = {A.dim32(0), A.dim32(1), B.dim32(1), B.dim32(2)}; - } - auto* C = Output(0, sizes, at::dtype()); - int out_width = C->dim32(3); - int out_height = C->dim32(2); - int in_width = A.dim32(3); - int in_height = A.dim32(2); - - CopyKernel<<< - CAFFE_GET_BLOCKS(C->size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - C->dim32(0), - C->dim32(1), - in_height, - in_width, - out_height, - out_width, - A.template data(), - C->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -template <> -bool SpatialNarrowAsGradientOp::RunOnDevice() { - return DispatchHelper>::call(this, Input(0)); -} - -template <> -template -bool SpatialNarrowAsGradientOp::DoRunWithType() { - auto& A = Input(0); - auto& B = Input(1); - auto& dC = Input(2); // Gradient of net w.r.t. output of forward op - auto* dA = Output(0, A.sizes(), at::dtype()); // Gradient of net w.r.t. 
input to forward op - - math::Set( - dA->size(), 0.f, dA->template mutable_data(), &context_); - int out_width = dA->dim32(3); - int out_height = dA->dim32(2); - int in_width = dC.dim32(3); - int in_height = dC.dim32(2); - - CopyGradientKernel<<< - CAFFE_GET_BLOCKS(dC.size()), - CAFFE_CUDA_NUM_THREADS, - 0, - context_.cuda_stream()>>>( - dA->dim32(0), - dA->dim32(1), - in_height, - in_width, - out_height, - out_width, - dC.template data(), - dA->template mutable_data()); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -REGISTER_CUDA_OPERATOR(SpatialNarrowAs, SpatialNarrowAsOp); -REGISTER_CUDA_OPERATOR( - SpatialNarrowAsGradient, - SpatialNarrowAsGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/spatial_narrow_as_op.h b/modules/detectron/spatial_narrow_as_op.h deleted file mode 100644 index a1fca861f1c268..00000000000000 --- a/modules/detectron/spatial_narrow_as_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef SPATIAL_NARROW_AS_OP_H_ -#define SPATIAL_NARROW_AS_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class SpatialNarrowAsOp final : public Operator { - public: - SpatialNarrowAsOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_DISPATCH_HELPER; - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - template - bool DoRunWithType(); -}; - -template -class SpatialNarrowAsGradientOp final : public Operator { - public: - SpatialNarrowAsGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws) {} - USE_OPERATOR_CONTEXT_FUNCTIONS; - USE_DISPATCH_HELPER; - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - template - bool DoRunWithType(); -}; - -} // namespace caffe2 - -#endif // SPATIAL_NARROW_AS_OP_H_ diff --git a/modules/detectron/upsample_nearest_op.cc b/modules/detectron/upsample_nearest_op.cc deleted file mode 100644 index 631e17b231f91b..00000000000000 --- a/modules/detectron/upsample_nearest_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
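Its gradient, which CopyGradientKernel above implements as a windowed copy, is a zero-padded scatter. A hedged sketch with a name of our choosing:

```python
import numpy as np

def spatial_narrow_as_grad(A, dC):
    # scatter dC into the kept window of a zero tensor shaped like A
    dA = np.zeros(A.shape, dtype=dC.dtype)
    dA[..., :dC.shape[-2], :dC.shape[-1]] = dC
    return dA
```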
- */ - -#include "upsample_nearest_op.h" -#ifdef USE_MKLDNN -#include "caffe2/ideep/operators/operator_fallback_ideep.h" -#include "caffe2/ideep/utils/ideep_operator.h" -#endif - -namespace caffe2 { -#ifdef USE_MKLDNN -REGISTER_IDEEP_OPERATOR( - UpsampleNearest, - IDEEPFallbackOp>); -#endif - -REGISTER_CPU_OPERATOR(UpsampleNearest, UpsampleNearestOp); -REGISTER_CPU_OPERATOR( - UpsampleNearestGradient, - UpsampleNearestGradientOp); - -OPERATOR_SCHEMA(UpsampleNearest) - .NumInputs(1) - .NumOutputs(1) - .SetDoc(R"DOC( -Nearest neighbor upsampling operation. Implementation taken from THCUNN. -)DOC") - .Arg( - "scale", - "(int) default 2; integer upsampling factor.") - .Input( - 0, - "X", - "4D feature map input of shape (N, C, H, W).") - .Output( - 0, - "Y", - "4D feature map of shape (N, C, scale * H, scale * W); Values are " - "nearest neighbor samples from X."); - -OPERATOR_SCHEMA(UpsampleNearestGradient) - .NumInputs(2) - .NumOutputs(1) - .Input( - 0, - "X", - "See UpsampleNearest.") - .Input( - 1, - "dY", - "Gradient of forward output 0 (Y).") - .Output( - 0, - "dX", - "Gradient of forward input 0 (X)."); - -class GetUpsampleNearestGradient : public GradientMakerBase { - using GradientMakerBase::GradientMakerBase; - vector GetGradientDefs() override { - return SingleGradientDef( - "UpsampleNearestGradient", - "", - vector{I(0), GO(0)}, - vector{GI(0)}); - } -}; - -REGISTER_GRADIENT(UpsampleNearest, GetUpsampleNearestGradient); - -} // namespace caffe2 diff --git a/modules/detectron/upsample_nearest_op.cu b/modules/detectron/upsample_nearest_op.cu deleted file mode 100644 index 0ea32e348c0b37..00000000000000 --- a/modules/detectron/upsample_nearest_op.cu +++ /dev/null @@ -1,223 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Adapted from https://github.com/torch/cunn/blob/master/lib/THCUNN/SpatialUpSamplingNearest.cu - * - * Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) - * Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) - * Copyright (c) 2011-2013 NYU (Clement Farabet) - * Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, - * Leon Bottou, Iain Melvin, Jason Weston) - * Copyright (c) 2006 Idiap Research Institute (Samy Bengio) - * Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, - * Samy Bengio, Johnny Mariethoz) - * - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3.
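The forward semantics documented above reduce to Y[n, c, h, w] = X[n, c, h // scale, w // scale]; in NumPy this is a pair of repeats (a sketch, not the shipped kernel):

```python
import numpy as np

def upsample_nearest(X, scale=2):
    # repeat each element `scale` times along both spatial axes
    return np.repeat(np.repeat(X, scale, axis=-2), scale, axis=-1)
```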
Neither the names of NEC Laboratories American and IDIAP Research - * Institute nor the names of its contributors may be used to endorse or - * promote products derived from this software without specific prior - * written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - -#include "caffe2/core/context_gpu.h" -#include "modules/detectron/upsample_nearest_op.h" - -namespace caffe2 { - -namespace { -__device__ int translate_idx(int ii, int d1, int d2, int d3, int scale_factor) { - int x, y, z, w; - w = ii % d3; - ii = ii/d3; - z = ii % d2; - ii = ii/d2; - y = ii % d1; - ii = ii/d1; - x = ii; - w = w/scale_factor; - z = z/scale_factor; - d2 /= scale_factor; - d3 /= scale_factor; - return (((x*d1+y)*d2)+z)*d3+w; -} - -__device__ int translate_idx_inv( - int ii, int d1, int d2, int d3, int scale_factor, int off_x, int off_y) { - int x, y, z, w; - w = ii % d3; - ii = ii/d3; - z = ii % d2; - ii = ii/d2; - y = ii % d1; - ii = ii/d1; - x = ii; - w = w*scale_factor+off_x; - z = z*scale_factor+off_y; - d2 *= scale_factor; - d3 *= scale_factor; - return (((x*d1+y)*d2)+z)*d3+w; -} - -__global__ void upscale(const float *input, float *output, long no_elements, - int scale_factor, int d1, int d2, int d3) { - long ii = threadIdx.x + blockDim.x * blockIdx.x; - ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; - if (ii >= no_elements) return; - int ipidx = translate_idx(ii, d1, d2, d3, scale_factor); - output[ii]=input[ipidx]; -} - -__global__ void downscale(float *gradInput_data, const float *gradOutput_data, - long no_elements, int scale_factor, int d1, int d2, - int d3) { - long ii = threadIdx.x + blockDim.x * blockIdx.x; - ii += threadIdx.y + blockDim.y * (blockDim.x * gridDim.x) * blockIdx.y; - if (ii >= no_elements) return; - for (int i=0; i < scale_factor; i++){ - for(int j=0; j < scale_factor; j++){ - int ipidx = translate_idx_inv(ii, d1, d2, d3, scale_factor, i, j); - gradInput_data[ii] += gradOutput_data[ipidx]; - } - } -} -} // namespace - -template<> -bool UpsampleNearestOp::RunOnDevice() { - auto& X = Input(0); - auto* Y = Output(0); - - vector out_shape; - for (int i = 0; i < X.ndim(); ++i) { - out_shape.push_back(X.dim32(i)); - } - out_shape[X.ndim() - 1] *= scale_; - out_shape[X.ndim() - 2] *= scale_; - Y->Resize(out_shape); - - int d1; - int d2; - int d3; - if (X.ndim() == 3) { - d1 = Y->dim32(0); - d2 = Y->dim32(1); - d3 = Y->dim32(2); - } else { - d1 = Y->dim32(1); - d2 = Y->dim32(2); - d3 = Y->dim32(3); - } - long no_elements = Y->size(); - - const float *input_data = X.data(); - float *output_data = Y->mutable_data(); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 
we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil( - (float)no_elements / (float)(n_xblocks * nthreads)); - CAFFE_ENFORCE(n_yblocks <= 65535); - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - upscale<<>>( - input_data, output_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - - -template<> -bool UpsampleNearestGradientOp::RunOnDevice() { - auto& X = Input(0); // Original input to "forward" op - auto& dY = Input(1); // Gradient of net w.r.t. output of "forward" op - // (aka "gradOutput") - auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op - // (aka "gradInput") - - dX->ResizeLike(X); - float *gradInput_data = dX->mutable_data(); - const float *gradOutput_data = dY.data(); - - int d1; - int d2; - int d3; - if (dX->ndim() == 3) { - d1 = dX->dim32(0); - d2 = dX->dim32(1); - d3 = dX->dim32(2); - } else { - d1 = dX->dim32(1); - d2 = dX->dim32(2); - d3 = dX->dim32(3); - } - long no_elements = dX->size(); - - // cuda blocks & threads: - long nthreads = 256; - // Max number of blocks: http://en.wikipedia.org/wiki/CUDA - // 65535 for SM 2.x, 2^32 -1 for >= 3.0 - // TODO: When we move to SM 3.5 we should update this - long n_xblocks = min(max((int)ceil((float)no_elements / nthreads), 1), 65535); - long n_yblocks = (long)ceil( - (float)no_elements / (float)(n_xblocks * nthreads)); - CAFFE_ENFORCE(n_yblocks <= 65535); - dim3 blocks(n_xblocks, n_yblocks); - dim3 threads(nthreads); - - math::Set(no_elements, 0.f, gradInput_data, &context_); - downscale<<>>( - gradInput_data, gradOutput_data, no_elements, scale_, d1, d2, d3); - C10_CUDA_KERNEL_LAUNCH_CHECK(); - - return true; -} - -REGISTER_CUDA_OPERATOR(UpsampleNearest, - UpsampleNearestOp); -REGISTER_CUDA_OPERATOR(UpsampleNearestGradient, - UpsampleNearestGradientOp); -} // namespace caffe2 diff --git a/modules/detectron/upsample_nearest_op.h b/modules/detectron/upsample_nearest_op.h deleted file mode 100644 index f850f0381a1e8e..00000000000000 --- a/modules/detectron/upsample_nearest_op.h +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
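The flat-index arithmetic in `translate_idx` above is the easiest part of this file to misread. Below is a Python port with a small self-check against the nearest-neighbor rule; the shapes are illustrative and not taken from the source, and the check covers only the 4-D case.

```python
import numpy as np

def translate_idx(ii, d1, d2, d3, scale):
    # decompose a flat index over (x, d1, d2, d3), divide the two spatial
    # coordinates by `scale`, and re-flatten over the shrunken dims
    w = ii % d3; ii //= d3
    z = ii % d2; ii //= d2
    y = ii % d1; ii //= d1
    x = ii
    return (((x * d1 + y) * (d2 // scale)) + z // scale) * (d3 // scale) + w // scale

# self-check: upscale() writes output[ii] = input[translate_idx(ii, ...)]
s = 2
X = np.arange(2 * 3 * 4 * 5, dtype=np.float32).reshape(2, 3, 4, 5)
Y = np.repeat(np.repeat(X, s, axis=-2), s, axis=-1)       # nearest-neighbor reference
d1, d2, d3 = X.shape[1], X.shape[2] * s, X.shape[3] * s   # 4-D layout in the kernel
assert all(Y.ravel()[i] == X.ravel()[translate_idx(i, d1, d2, d3, s)]
           for i in range(Y.size))
```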
- */ - -#ifndef UPSAMPLE_NEAREST_OP_H_ -#define UPSAMPLE_NEAREST_OP_H_ - -#include "caffe2/core/context.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/utils/math.h" - -namespace caffe2 { - -template -class UpsampleNearestOp final : public Operator { - public: - UpsampleNearestOp(const OperatorDef& operator_def, Workspace* ws) - : Operator(operator_def, ws), - scale_(this->template GetSingleArgument("scale", 2)) { - TORCH_DCHECK_GE(scale_, 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - auto& X = Input(0); - - auto out_shape = X.sizes().vec(); - out_shape[X.dim() - 1] *= scale_; - out_shape[X.dim() - 2] *= scale_; - auto* Y = Output(0, out_shape, at::dtype()); - - int d1; - int d2; - int d3; - if (X.dim() == 3) { - d1 = Y->dim32(0); - d2 = Y->dim32(1); - d3 = Y->dim32(2); - } else { - d1 = Y->dim32(0) * Y->dim32(1); - d2 = Y->dim32(2); - d3 = Y->dim32(3); - } - - const T *input_data = X.template data(); - T *output_data = Y->template mutable_data(); - int scaled_d2 = d2 / scale_; - int scaled_d3 = d3 / scale_; - -#ifdef _OPENMP -#pragma omp parallel for -#endif - for (int i = 0; i < d1; ++i) { - for (int j = 0; j < d2; ++j) { - for (int u = 0; u < d3; ++u) { - int ii = (i * d2 + j) * d3 + u; - int scaled_u = u / scale_; - int scaled_j = j / scale_; - int ipidx = ((i * scaled_d2) + scaled_j) * scaled_d3 + scaled_u; - output_data[ii] = input_data[ipidx]; - } - } - } - - return true; - } - - protected: - int scale_; -}; - -template -class UpsampleNearestGradientOp final : public Operator { - public: - UpsampleNearestGradientOp(const OperatorDef& def, Workspace* ws) - : Operator(def, ws), - scale_(this->template GetSingleArgument("scale", 2)) { - TORCH_DCHECK_GE(scale_, 1); - } - USE_OPERATOR_CONTEXT_FUNCTIONS; - - bool RunOnDevice() override { - // No CPU implementation for now - CAFFE_NOT_IMPLEMENTED; - } - - protected: - int scale_; -}; - -} // namespace caffe2 - -#endif // UPSAMPLE_NEAREST_OP_H_ diff --git a/modules/detectron/upsample_nearest_op_test.py b/modules/detectron/upsample_nearest_op_test.py deleted file mode 100644 index 276d50474d1fe2..00000000000000 --- a/modules/detectron/upsample_nearest_op_test.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, dyndep -from hypothesis import given, settings - - -dyndep.InitOpsLibrary("@/caffe2/modules/detectron:detectron_ops") - - -class TestUpsampleNearestOp(hu.HypothesisTestCase): - @given( - N=st.integers(1, 3), - H=st.integers(10, 300), - W=st.integers(10, 300), - scale=st.integers(1, 3), - **hu.gcs, - ) - @settings(deadline=None, max_examples=20) - def test_upsample_nearest_op(self, N, H, W, scale, gc, dc): - C = 32 - X = np.random.randn(N, C, H, W).astype(np.float32) - op = core.CreateOperator("UpsampleNearest", ["X"], ["Y"], scale=scale) - - def ref(X): - outH = H * scale - outW = W * scale - outH_idxs, outW_idxs = np.meshgrid( - np.arange(outH), np.arange(outW), indexing="ij" - ) - inH_idxs = (outH_idxs / scale).astype(np.int32) - inW_idxs = (outW_idxs / scale).astype(np.int32) - Y = X[:, :, inH_idxs, inW_idxs] - return [Y] - - self.assertReferenceChecks(device_option=gc, op=op, inputs=[X], reference=ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/modules/module_test/CMakeLists.txt b/modules/module_test/CMakeLists.txt deleted file mode 100644 index f72120d535f309..00000000000000 --- 
a/modules/module_test/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -if(NOT CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - # If we are building the standalone module, we set the proper cmake variables. - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - find_package(Caffe2 REQUIRED) - set(BUILD_TEST ON) - option(BUILD_SHARED_LIBS "Build shared libs." ON) -endif() - -if(BUILD_TEST AND NOT BUILD_LITE_INTERPRETER) - add_library( - caffe2_module_test_dynamic - ${CMAKE_CURRENT_SOURCE_DIR}/module_test_dynamic.cc) - - if(HAVE_SOVERSION) - set_target_properties(caffe2_module_test_dynamic PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) - endif() - target_link_libraries(caffe2_module_test_dynamic torch_library) - install(TARGETS caffe2_module_test_dynamic DESTINATION lib) - if(MSVC AND BUILD_SHARED_LIBS) - install(FILES $ DESTINATION lib OPTIONAL) - endif() -endif() diff --git a/modules/module_test/module_test_dynamic.cc b/modules/module_test/module_test_dynamic.cc deleted file mode 100644 index 32596167a3761e..00000000000000 --- a/modules/module_test/module_test_dynamic.cc +++ /dev/null @@ -1,41 +0,0 @@ -/** - * Copyright (c) 2016-present, Facebook, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "caffe2/core/module.h" -#include "caffe2/core/operator.h" - -// An explicitly defined module, testing correctness when we dynamically link a -// module -CAFFE2_MODULE(caffe2_module_test_dynamic, "Dynamic module only used for testing."); - -namespace caffe2 { - -class Caffe2ModuleTestDynamicDummyOp : public OperatorBase { - public: - using OperatorBase::OperatorBase; - bool Run(int /* unused */ /*stream_id*/) override { - return true; - } - virtual string type() { - return "base"; - } -}; - -REGISTER_CPU_OPERATOR( - Caffe2ModuleTestDynamicDummy, Caffe2ModuleTestDynamicDummyOp); -OPERATOR_SCHEMA(Caffe2ModuleTestDynamicDummy); - -} // namespace caffe2 diff --git a/modules/observers/CMakeLists.txt b/modules/observers/CMakeLists.txt deleted file mode 100644 index 050b8a1461e32f..00000000000000 --- a/modules/observers/CMakeLists.txt +++ /dev/null @@ -1,32 +0,0 @@ -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - if(NOT USE_OBSERVERS) - return() - endif() -else() - cmake_minimum_required(VERSION 3.0 FATAL_ERROR) - project(caffe2_observers CXX) - find_package(Caffe2 REQUIRED) - option(BUILD_SHARED_LIBS "Build shared libs." ON) -endif() - -add_library(caffe2_observers - "${CMAKE_CURRENT_SOURCE_DIR}/net_observer_reporter_print.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/observer_config.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/perf_observer.cc" - ) -if(HAVE_SOVERSION) - set_target_properties(caffe2_observers PROPERTIES - VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION}) -endif() -target_link_libraries(caffe2_observers PUBLIC torch_library) -target_include_directories(caffe2_observers PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..) 
-target_compile_options(caffe2_observers PRIVATE "-DCAFFE2_BUILD_OBSERVER_LIB") -install(TARGETS caffe2_observers DESTINATION lib) -caffe2_interface_library(caffe2_observers caffe2_observers_library) -if(MSVC AND BUILD_SHARED_LIBS) - install(FILES $ DESTINATION lib OPTIONAL) -endif() - -if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO) - set(Caffe2_MODULES ${Caffe2_MODULES} caffe2_observers_library PARENT_SCOPE) -endif() diff --git a/modules/observers/macros.h b/modules/observers/macros.h deleted file mode 100644 index e69b055d2a1d55..00000000000000 --- a/modules/observers/macros.h +++ /dev/null @@ -1,7 +0,0 @@ -#include "c10/macros/Macros.h" - -#ifdef CAFFE2_BUILD_OBSERVER_LIB -#define CAFFE2_OBSERVER_API C10_EXPORT -#else -#define CAFFE2_OBSERVER_API C10_IMPORT -#endif diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h deleted file mode 100644 index bfccef64cee2b7..00000000000000 --- a/modules/observers/net_observer_reporter.h +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include - -#include "caffe2/core/common.h" -#include "caffe2/core/net.h" -#include "observers/macros.h" - -namespace caffe2 { - -struct PerformanceInformation { - // Analytic - int64_t flops = 0; - int64_t bytes_written = 0; - int64_t bytes_read = 0; - std::vector tensor_shapes = {}; - std::vector args = {}; - std::string engine = ""; // the engine used - std::string type = ""; // the type of the operator - // Measured - double latency = 0; - double cpuMilliseconds = 0; -}; - -class CAFFE2_OBSERVER_API NetObserverReporter { - public: - virtual ~NetObserverReporter() = default; - - /* - Report the delay metric collected by the observer. - The delays are saved in a map. The key is an identifier associated - with the reported delay. The value is the delay value in float - */ - virtual void report( - NetBase* net, - std::map&) = 0; -}; -} diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc deleted file mode 100644 index dca9cbba44bf1a..00000000000000 --- a/modules/observers/net_observer_reporter_print.cc +++ /dev/null @@ -1,158 +0,0 @@ -#include "observers/net_observer_reporter_print.h" - -#include -#include -#include "caffe2/core/init.h" -#include "observers/observer_config.h" - -#include - -namespace caffe2 { - -const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer "; -static std::string get_op_args(PerformanceInformation p); -static std::string get_tensor_shapes(PerformanceInformation p); -static std::string sanatize(std::string json_s); - -void NetObserverReporterPrint::report( - NetBase* net, - std::map& info) { - // Not allowed to use json library - std::vector> caffe2_perf; - - for (auto& p : info) { - if ((p.first == "NET_DELAY") && (info.size() == 1)) { - // for Net_delay perf - caffe2_perf.push_back({{"type", "NET"}, - {"value", c10::to_string(p.second.latency * 1000)}, - {"unit", "us"}, - {"metric", "latency"}}); - caffe2_perf.push_back({{"type", "NET_"}, - { - "value", - c10::to_string( - p.second.cpuMilliseconds / - p.second.latency * - 100), - }, - {"unit", "percent"}, - {"metric", "cpu_percent"}}); - } else if (p.first != "NET_DELAY") { - // for operator perf - std::string shape_str = get_tensor_shapes(p.second); - std::string args_str = get_op_args(p.second); - std::string type = p.first; - caffe2_perf.push_back({{"type", type}, - {"value", c10::to_string(p.second.latency * 1000)}, - {"unit", "us"}, - {"metric", "latency"}}); - caffe2_perf.push_back({{"type", type}, - { - "value", - 
c10::to_string( - p.second.cpuMilliseconds / - p.second.latency * - 100), - }, - {"unit", "percent"}, - {"metric", "cpu_percent"}}); - if (p.second.flops > 0) { - caffe2_perf.push_back({{"type", type}, - {"value", c10::to_string(p.second.flops)}, - {"unit", "flop"}, - {"metric", "flops"}}); - } - if (shape_str != "") { - caffe2_perf.push_back({{"type", type}, - {"info_string", shape_str}, - {"unit", ""}, - {"metric", "tensor_shapes"}}); - } - if (args_str != "") { - caffe2_perf.push_back({{"type", type}, - {"info_string", args_str}, - {"unit", ""}, - {"metric", "op_args"}}); - } - } - } - - // NOLINTNEXTLINE(modernize-loop-convert) - for (auto it = caffe2_perf.begin(); it != caffe2_perf.end(); it++) { - std::stringstream buffer; - auto entry = *it; - buffer << IDENTIFIER << "{"; - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"type\": \"" << sanatize(entry["type"]) << "\"," - // NOLINTNEXTLINE(modernize-raw-string-literal) - << "\"unit\": \"" << sanatize(entry["unit"]) << "\"," - // NOLINTNEXTLINE(modernize-raw-string-literal) - << "\"metric\": \"" << sanatize(entry["metric"]) << "\","; - if (entry.find("value") != entry.end()) { - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"value\": \"" << sanatize(entry["value"]) << "\""; - } else if (entry.find("info_string") != entry.end()) { - // NOLINTNEXTLINE(modernize-raw-string-literal) - buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\""; - } - buffer << "}"; - LOG(INFO) << buffer.str(); - } -} - -static std::string get_tensor_shapes(PerformanceInformation p) { - std::string shape_str; - std::stringstream shape_stream; - if (!p.tensor_shapes.empty()) { - shape_stream << "["; - for (const auto i : c10::irange(p.tensor_shapes.size())) { - shape_stream << "["; - for (int j = 0; j < p.tensor_shapes[i].dims_size(); j++) { - shape_stream << p.tensor_shapes[i].dims(j) << ", "; - } - shape_stream << "], "; - } - shape_stream << "]"; - shape_str = shape_stream.str(); - } else { - shape_str = ""; - } - return shape_str; -} - -static std::string get_op_args(PerformanceInformation p) { - std::string args_str; - if (!p.args.empty()) { - std::stringstream args; - args << "["; - for (const auto i : c10::irange(p.args.size())) { - args << "{" << p.args[i].name() << ": "; - if (p.args[i].has_i()) { - args << p.args[i].i(); - } else if (p.args[i].has_s()) { - args << p.args[i].s(); - } else if (p.args[i].has_n()) { - args << &p.args[i].n(); - } else if (p.args[i].has_f()) { - args << p.args[i].f(); - } else { - args << "None"; - } - args << "}, "; - } - args << "]"; - args_str = args.str(); - } else { - args_str = ""; - } - return args_str; -} - -static std::string sanatize(std::string json_s) { - // Remove illegal characters from the name that would cause json string to - // become invalid - json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end()); - json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end()); - return json_s; -} -} diff --git a/modules/observers/net_observer_reporter_print.h b/modules/observers/net_observer_reporter_print.h deleted file mode 100644 index 5d4640c24c994b..00000000000000 --- a/modules/observers/net_observer_reporter_print.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "observers/macros.h" -#include "observers/net_observer_reporter.h" - -#include "caffe2/core/common.h" - -namespace caffe2 { - -class CAFFE2_OBSERVER_API NetObserverReporterPrint : public NetObserverReporter { - public: - static const std::string IDENTIFIER; - void 
report(NetBase* net, std::map&) override; -}; - -} // namespace caffe2 diff --git a/modules/observers/observer_config.cc b/modules/observers/observer_config.cc deleted file mode 100644 index c6ba6a2d370c06..00000000000000 --- a/modules/observers/observer_config.cc +++ /dev/null @@ -1,12 +0,0 @@ -#include "observers/observer_config.h" - -namespace caffe2 { - -int ObserverConfig::netInitSampleRate_ = 0; -int ObserverConfig::netFollowupSampleRate_ = 0; -int ObserverConfig::netFollowupSampleCount_ = 0; -int ObserverConfig::operatorNetSampleRatio_ = 0; -int ObserverConfig::skipIters_ = 0; -unique_ptr ObserverConfig::reporter_ = nullptr; -int ObserverConfig::marker_ = -1; -} diff --git a/modules/observers/observer_config.h b/modules/observers/observer_config.h deleted file mode 100644 index cc967263a66b9b..00000000000000 --- a/modules/observers/observer_config.h +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include "observers/macros.h" -#include "observers/net_observer_reporter.h" - -#include "caffe2/core/common.h" - -namespace caffe2 { - -/* - netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 1 : - Log operator metrics in every iteration - netInitSampleRate_ == 1 && operatorNetSampleRatio_ == 0 : - Log net metrics in every iteration - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == 1 : - Log operator metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m). Then repeat - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == 0 : - Log net metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m). Then repeat - netInitSampleRate_ == n && netFollowupSampleRate_ == m && - netFollowupSampleCount == c && operatorNetSampleRatio_ == o : - Log net metrics first at odds of 1 / n. Once first logged, - the following c logs are at odds of 1 / min(n, m); if the random number - is a multiple of o, log operator metrics instead. Then repeat - skipIters_ == n: skip the first n iterations of the net.
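The sampling policy this comment describes is compact enough to restate. A hedged Python sketch of the per-run decision (it mirrors PerfNetObserver::Start further down; the function and parameter names are paraphrased, not from the source):

```python
import random

def choose_log_type(num_runs, visit_count, init_rate, followup_rate,
                    followup_count, operator_ratio, skip_iters):
    # one-in-N coin flip; the followup rate applies once a first sample was taken
    rate = followup_rate if visit_count > 0 else init_rate
    if num_runs < skip_iters or rate <= 0 or random.randrange(rate) != 0:
        return "NONE", visit_count
    visit_count += 1
    if visit_count == followup_count:
        visit_count = 0                     # start over with the init rate
    # occasionally log per-operator metrics instead of whole-net metrics
    if operator_ratio > 0 and random.randrange(operator_ratio) == 0:
        return "OPERATOR_DELAY", visit_count
    return "NET_DELAY", visit_count
```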
-*/ -class CAFFE2_OBSERVER_API ObserverConfig { - public: - static void initSampleRate( - int netInitSampleRate, - int netFollowupSampleRate, - int netFollowupSampleCount, - int operatorNetSampleRatio, - int skipIters) { - CAFFE_ENFORCE(netFollowupSampleRate <= netInitSampleRate); - CAFFE_ENFORCE(netFollowupSampleRate >= 1 || netInitSampleRate == 0); - netInitSampleRate_ = netInitSampleRate; - netFollowupSampleRate_ = netFollowupSampleRate; - netFollowupSampleCount_ = netFollowupSampleCount; - operatorNetSampleRatio_ = operatorNetSampleRatio; - skipIters_ = skipIters; - } - static int getNetInitSampleRate() { - return netInitSampleRate_; - } - static int getNetFollowupSampleRate() { - return netFollowupSampleRate_; - } - static int getNetFollowupSampleCount() { - return netFollowupSampleCount_; - } - static int getOpoeratorNetSampleRatio() { - return operatorNetSampleRatio_; - } - static int getSkipIters() { - return skipIters_; - } - static void setReporter(unique_ptr reporter) { - reporter_ = std::move(reporter); - } - static NetObserverReporter* getReporter() { - CAFFE_ENFORCE(reporter_); - return reporter_.get(); - } - static void setMarker(int marker) { - marker_ = marker; - } - static int getMarker() { - return marker_; - } - - private: - /* The odds of log net metric initially or immediately after reset */ - static int netInitSampleRate_; - - /* The odds of log net metric after log once after start of reset */ - static int netFollowupSampleRate_; - - /* The number of follow up logs to be collected for odds of - netFollowupSampleRate_ */ - static int netFollowupSampleCount_; - - /* The odds to log the operator metric instead of the net metric. - When the operator is logged the net is not logged. */ - static int operatorNetSampleRatio_; - - /* skip the first few iterations */ - static int skipIters_; - - static unique_ptr reporter_; - - /* marker used in identifying the metrics in certain reporters */ - static int marker_; -}; - -} diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc deleted file mode 100644 index cfd6130f7255e3..00000000000000 --- a/modules/observers/perf_observer.cc +++ /dev/null @@ -1,330 +0,0 @@ -#include "observers/perf_observer.h" -#include "observers/observer_config.h" -#ifndef C10_MOBILE -#include "caffe2/core/flags.h" -#include "observers/net_observer_reporter_print.h" -#endif - -#include -// NOLINTNEXTLINE(modernize-deprecated-headers) -#include -#include "caffe2/core/common.h" -#include "caffe2/core/init.h" -#include "caffe2/core/operator.h" - -#if defined(TARGET_OS_MAC) || \ -defined(TARGET_OS_IPHONE) || \ -defined(TARGET_IPHONE_SIMULATOR) -#define _APPLE 1 -#endif - -#ifdef _WIN32 -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -#endif - -#ifdef _APPLE -#include -#include -#include -#endif - -#ifndef C10_MOBILE -C10_DEFINE_int64( - aiBench_netInitSampleRate, - 0, - "One in N sampling rate for net delay"); - -C10_DEFINE_int64( - aiBench_netFollowupSampleRate, - 0, - "One in N sampling rate for net delay"); - -C10_DEFINE_int64( - aiBench_netFollowupSampleCount, - 0, - "control the following c logs"); - -C10_DEFINE_int64( - aiBench_operatorNetSampleRatio, - 0, - "One in N sampling rate for operator delay"); - -C10_DEFINE_int64( - aiBench_skipIters, - 0, - "skip the first N iterations of the net run"); -#endif - -namespace caffe2 { -namespace { - -bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) { - AddGlobalNetObserverCreator([](NetBase* subject) { - return 
std::make_unique(subject); - }); - -#if !defined(C10_MOBILE) - // for aibench usage - caffe2::ObserverConfig::setReporter( - std::make_unique()); - - caffe2::ObserverConfig::initSampleRate( - FLAGS_aiBench_netInitSampleRate, - FLAGS_aiBench_netFollowupSampleRate, - FLAGS_aiBench_netFollowupSampleCount, - FLAGS_aiBench_operatorNetSampleRatio, - FLAGS_aiBench_skipIters); -#endif - - return true; -} -} // namespace - -#ifdef _WIN32 -double getTicksPerMillisecond() { - static LARGE_INTEGER ticks_per_sec; - if (!ticks_per_sec.QuadPart) { - QueryPerformanceFrequency(&ticks_per_sec); - if (!ticks_per_sec.QuadPart) { - return 0.0; - } - } - - return static_cast(ticks_per_sec.QuadPart) / 1000.0; -} -#elif !defined _APPLE -double getClockTimeMilliseconds(clockid_t clk_id) { - int result; - struct timespec tp; - result = clock_gettime(clk_id, &tp); - if (result == -1) { - return 0.0; - } else { - return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0; - } -} -#endif - -double getWallClockTimeMilliseconds() { -#ifdef _WIN32 - double ticks_per_ms = getTicksPerMillisecond(); - if (ticks_per_ms) { - LARGE_INTEGER ticks; - if (QueryPerformanceCounter(&ticks)) { - return static_cast(ticks.QuadPart) / ticks_per_ms; - } - } - - return 0.0; -#elif defined _APPLE - static mach_timebase_info_data_t info; - if (info.denom == 0) { - mach_timebase_info(&info); - } - - uint64_t now = mach_absolute_time(); - now = now * info.numer / info.denom; // convert to nanoseconds - return now / 1000000.0; -#else - return getClockTimeMilliseconds(CLOCK_MONOTONIC); -#endif -} - -double getCpuTimeMilliseconds() { -#ifdef _WIN32 - FILETIME creation_time; - FILETIME exit_time; - FILETIME kernel_time; - FILETIME user_time; - if (GetProcessTimes( - GetCurrentProcess(), - &creation_time, - &exit_time, - &kernel_time, - &user_time)) { - ULARGE_INTEGER kernel; - ULARGE_INTEGER user; - kernel.HighPart = kernel_time.dwHighDateTime; - kernel.LowPart = kernel_time.dwLowDateTime; - user.HighPart = user_time.dwHighDateTime; - user.LowPart = user_time.dwLowDateTime; - return (static_cast(kernel.QuadPart) + - static_cast(user.QuadPart)) / 10000.0; - } - - return 0.0; -#elif defined _APPLE - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) - struct rusage ru; - if (getrusage(RUSAGE_SELF, &ru)) { - return 0.0; - } - - return ru.ru_utime.tv_sec * 1000.0 - + ru.ru_utime.tv_usec / 1000.0 - + ru.ru_stime.tv_sec * 1000.0 - + ru.ru_stime.tv_usec / 1000.0; -#else - return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID); -#endif -} - -REGISTER_CAFFE2_EARLY_INIT_FUNCTION( - registerGlobalPerfNetObserverCreator, - ®isterGlobalPerfNetObserverCreator, - "Caffe2 net global observer creator"); - -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) -PerfNetObserver::PerfNetObserver(NetBase* subject_) - : NetObserver(subject_), numRuns_(0) {} - -// NOLINTNEXTLINE(modernize-use-equals-default) -PerfNetObserver::~PerfNetObserver() {} - -void PerfNetObserver::Start() { - static int visitCount = 0; - // Select whether to log the operator or the net. - // We have one sample rate for the entire app. - int netInitSampleRate = ObserverConfig::getNetInitSampleRate(); - int netFollowupSampleRate = ObserverConfig::getNetFollowupSampleRate(); - int netFollowupSampleCount = ObserverConfig::getNetFollowupSampleCount(); - int operatorNetSampleRatio = ObserverConfig::getOpoeratorNetSampleRatio(); - int skipIters = ObserverConfig::getSkipIters(); - int sampleRate = visitCount > 0 ? 
netFollowupSampleRate : netInitSampleRate; - // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (skipIters <= static_cast(numRuns_) && sampleRate > 0 && rand() % sampleRate == 0) { - visitCount++; - if (visitCount == netFollowupSampleCount) { - visitCount = 0; - } - // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand) - if (operatorNetSampleRatio > 0 && rand() % operatorNetSampleRatio == 0) { - logType_ = PerfNetObserver::OPERATOR_DELAY; - } else { - logType_ = PerfNetObserver::NET_DELAY; - } - } else { - logType_ = PerfNetObserver::NONE; - } - numRuns_++; - - if (logType_ == PerfNetObserver::OPERATOR_DELAY) { - /* Always recreate new operator observers - whenever we measure operator delay */ - const auto& operators = subject_->GetOperators(); - for (auto* op : operators) { - observerMap_[op] = op->AttachObserver( - std::make_unique(op, this)); - } - } - - wallMilliseconds_ = getWallClockTimeMilliseconds(); - cpuMilliseconds_ = getCpuTimeMilliseconds(); -} - -void PerfNetObserver::Stop() { - if (logType_ == PerfNetObserver::NONE) { - return; - } - std::map info; - PerformanceInformation net_perf; - net_perf.cpuMilliseconds = - getCpuTimeMilliseconds() - cpuMilliseconds_; - net_perf.latency = - getWallClockTimeMilliseconds() - wallMilliseconds_; - - if (logType_ == PerfNetObserver::OPERATOR_DELAY) { - const auto& operators = subject_->GetOperators(); - for (unsigned idx = 0; idx < operators.size(); ++idx) { - const auto* op = operators[idx]; - auto name = getObserverName(op, static_cast(idx)); - PerformanceInformation p; - const PerfOperatorObserver* opObserver = - static_cast(observerMap_[op]); - p.latency = opObserver->getWallMilliseconds(); - p.cpuMilliseconds = opObserver->getCpuMilliseconds(); - p.engine = op->engine(); - p.type = op->type(); - p.tensor_shapes = - static_cast(observerMap_[op]) - ->getTensorShapes(); - - if (op->has_debug_def()) { - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto arg : op->debug_def().arg()) { - p.args.emplace_back(arg); - } - } - - info.insert({name, p}); - } - - /* clear all operator delay after use so that we don't spend time - collecting the operator delay info in later runs */ - for (auto* op : operators) { - op->DetachObserver(observerMap_[op]); - } - observerMap_.clear(); - } - info.insert({"NET_DELAY", net_perf}); - ObserverConfig::getReporter()->report(subject_, info); -} - -caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx) - const { - string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE"; - string displayName = - (op->has_debug_def() ? op->debug_def().name().size() - ? op->debug_def().name() - : (op->debug_def().output_size() ?
-
-caffe2::string PerfNetObserver::getObserverName(const OperatorBase* op, int idx)
-    const {
-  string opType = op->has_debug_def() ? op->debug_def().type() : "NO_TYPE";
-  string displayName =
-      (op->has_debug_def() ? op->debug_def().name().size()
-               ? op->debug_def().name()
-               : (op->debug_def().output_size() ? op->debug_def().output(0)
-                                                : "NO_OUTPUT")
-                           : "NO_DEF");
-  caffe2::string name =
-      "ID_" + c10::to_string(idx) + "_" + opType + "_" + displayName;
-  return name;
-}
-
-PerfOperatorObserver::PerfOperatorObserver(
-    OperatorBase* op,
-    PerfNetObserver* netObserver)
-    : ObserverBase<OperatorBase>(op),
-      netObserver_(netObserver),
-      wallMilliseconds_(0),
-      cpuMilliseconds_(0) {
-  CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
-}
-
-// NOLINTNEXTLINE(modernize-use-equals-default)
-PerfOperatorObserver::~PerfOperatorObserver() {}
-
-void PerfOperatorObserver::Start() {
-  wallMilliseconds_ = getWallClockTimeMilliseconds();
-  cpuMilliseconds_ = getCpuTimeMilliseconds();
-}
-
-void PerfOperatorObserver::Stop() {
-  /* Time from the start of the net minus the time spent on all other
-     operators is the time spent on this operator */
-  cpuMilliseconds_ =
-      getCpuTimeMilliseconds() - cpuMilliseconds_;
-  wallMilliseconds_ =
-      getWallClockTimeMilliseconds() - wallMilliseconds_;
-  tensor_shapes_ = subject_->InputTensorShapes();
-}
-
-double PerfOperatorObserver::getWallMilliseconds() const {
-  return wallMilliseconds_;
-}
-
-double PerfOperatorObserver::getCpuMilliseconds() const {
-  return cpuMilliseconds_;
-}
-
-std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
-  return tensor_shapes_;
-}
-
-} // namespace caffe2
diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h
deleted file mode 100644
index 71e1190e840ba1..00000000000000
--- a/modules/observers/perf_observer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#pragma once
-
-#include "caffe2/core/common.h"
-#include "caffe2/core/net.h"
-#include "caffe2/core/observer.h"
-#include "caffe2/core/timer.h"
-#include "observers/macros.h"
-
-#include <unordered_map>
-
-namespace caffe2 {
-
-double getClockTimeMilliseconds();
-
-class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
- public:
-  explicit PerfNetObserver(NetBase* subject_);
-  virtual ~PerfNetObserver();
-
- private:
-  void Start() override;
-  void Stop() override;
-
-  caffe2::string getObserverName(const OperatorBase* op, int idx) const;
-
- private:
-  enum LogType {
-    NONE,
-    OPERATOR_DELAY,
-    NET_DELAY,
-  };
-  LogType logType_;
-  unsigned int numRuns_;
-  std::unordered_map<const OperatorBase*, ObserverBase<OperatorBase>*>
-      observerMap_;
-
-  double wallMilliseconds_;
-  double cpuMilliseconds_;
-};
-
-class PerfOperatorObserver : public ObserverBase<OperatorBase> {
- public:
-  PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
-  virtual ~PerfOperatorObserver();
-
-  double getWallMilliseconds() const;
-  double getCpuMilliseconds() const;
-  std::vector<TensorShape> getTensorShapes() const;
-
- private:
-  void Start() override;
-  void Stop() override;
-
- private:
-  // Observer of the net that owns the corresponding op. We make sure the net
-  // is never destructed while an operator observer is still alive: the
-  // operator observer gets destructed first, then the op, then the net and
-  // its observer. We do this trick in order to get access to the net's name
-  // and other fields without storing them inside the operator observer. Each
-  // field is memory-costly here and a raw pointer is the cheapest solution.
-  PerfNetObserver* netObserver_;
-  double wallMilliseconds_;
-  double cpuMilliseconds_;
-  std::vector<TensorShape> tensor_shapes_;
-};
-} // namespace caffe2
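Taken together, the deleted .cc and header implement a two-level observer pattern: the net-level observer attaches one observer per operator on sampled runs, harvests their timings in `Stop()`, and detaches them so subsequent runs pay no observer cost. A stripped-down Python sketch of that attach/measure/detach cycle (hypothetical names, not the Caffe2 API):

```python
import time

class OperatorObserver:
    """Attached per operator only for sampled runs, then detached."""
    def __init__(self):
        self.wall_ms = 0.0
    def start(self):
        self.wall_ms = time.monotonic() * 1000.0
    def stop(self):
        self.wall_ms = time.monotonic() * 1000.0 - self.wall_ms

class Net:
    def __init__(self, operator_names):
        self.operator_names = operator_names
        self.observer_map = {}
    def attach_observers(self):
        for name in self.operator_names:
            self.observer_map[name] = OperatorObserver()
    def run(self):
        for name in self.operator_names:
            observer = self.observer_map.get(name)
            if observer:
                observer.start()
            # ... the operator's actual work would run here ...
            if observer:
                observer.stop()
    def detach_observers(self):
        # Harvest results, then drop the observers so later runs are free.
        results = {n: ob.wall_ms for n, ob in self.observer_map.items()}
        self.observer_map.clear()
        return results

net = Net(["Conv", "Relu"])
net.attach_observers()
net.run()
print(net.detach_observers())
```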
diff --git a/setup.py b/setup.py
index d774446780b484..ce279884716149 100644
--- a/setup.py
+++ b/setup.py
@@ -88,12 +88,6 @@
 #     disables use of system-wide nccl (we will use our submoduled
 #     copy in third_party/nccl)
 #
-#   BUILD_CAFFE2_OPS=0
-#     disable Caffe2 operators build
-#
-#   BUILD_CAFFE2=0
-#     disable Caffe2 build
-#
 #   USE_IBVERBS
 #     toggle features related to distributed support
 #
diff --git a/torch/csrc/jit/serialization/export.cpp b/torch/csrc/jit/serialization/export.cpp
index c23e3b52bfb1ba..2df70800a8ad80 100644
--- a/torch/csrc/jit/serialization/export.cpp
+++ b/torch/csrc/jit/serialization/export.cpp
@@ -145,26 +145,6 @@ void validateBlock(
             "\n\nDefined at:\n" + getNodeStackTraceString(node))
       }
     } else {
-#ifdef BUILD_CAFFE2
-      // Assuming this is a Caffe2 change as it only modifies an aten op
-      // for operator_export_type == ONNX_ATEN_FALLBACK, which is a common
-      // pattern for Caffe2-specific scenarios.
-      if (node->kind() == aten::expand) {
-        if (operator_export_type ==
-            onnx_torch::OperatorExportTypes::ONNX_ATEN_FALLBACK) {
-          WithInsertPoint guard(node);
-          auto* new_node =
-              b->owningGraph()->insertNode(b->owningGraph()->create(
-                  Symbol(::c10::aten::ATen),
-                  node->inputs(),
-                  node->outputs().size()));
-          for (size_t i = 0; i < node->outputs().size(); ++i) {
-            node->output(i)->replaceAllUsesWith(new_node->output(i));
-          }
-          new_node->s_(Symbol::fromQualString("attr::operator"), "expand");
-        }
-      }
-#endif
       if (node->kind() == prim::PackPadded || node->kind() == prim::PadPacked) {
         if (operator_export_type !=
             onnx_torch::OperatorExportTypes::ONNX_FALLTHROUGH) {
diff --git a/torch/csrc/onnx/init.cpp b/torch/csrc/onnx/init.cpp
index 825ed46e11a500..b8bef342323c57 100644
--- a/torch/csrc/onnx/init.cpp
+++ b/torch/csrc/onnx/init.cpp
@@ -293,10 +293,6 @@ void initONNXBindings(PyObject* module) {
 
   onnx.attr("PRODUCER_VERSION") = py::str(TORCH_VERSION);
 
-#ifdef BUILD_CAFFE2
-  onnx.attr("_CAFFE2_ATEN_FALLBACK") = true;
-#else
   onnx.attr("_CAFFE2_ATEN_FALLBACK") = false;
-#endif
 }
 } // namespace torch::onnx
diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py
index a4dc9a17089c53..f5206d425b4d8a 100644
--- a/torch/onnx/utils.py
+++ b/torch/onnx/utils.py
@@ -350,11 +350,6 @@ def export(
           %3 : Float = onnx::Mul(%2, %0)
           return (%3)
 
-        If PyTorch was built with Caffe2 (i.e. with ``BUILD_CAFFE2=1``), then
-        Caffe2-specific behavior will be enabled, including special support
-        for ops that are produced by the modules described in
-        `Quantization <https://pytorch.org/docs/stable/quantization.html>`_.
-
         .. warning::
           Models exported this way are probably runnable only by Caffe2.
 
@@ -1802,9 +1797,8 @@ def _add_output_to_block(block: _C.Block, value: _C.Value) -> int:
 def _should_aten_fallback(
     name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes
 ):
-    # For BUILD_CAFFE2=0 builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
+    # For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN,
     # an aten::ATen operator is created regardless of symbolics existence
-    # For BUILD_CAFFE2=1, the same applies only if there is no symbolic available
 
     is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version)
     is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN
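With the binding above hard-wired to `False`, the export-time decision no longer depends on the build flavor: the rewritten comment states the single remaining rule. A hedged sketch of what that predicate amounts to (string stand-ins replace the `_C_onnx.OperatorExportTypes` enum, and the `ONNX_ATEN_FALLBACK` branch is an assumption inferred from the deleted comment, since the hunk ends before the function body does):

```python
ONNX_ATEN = "ONNX_ATEN"
ONNX_ATEN_FALLBACK = "ONNX_ATEN_FALLBACK"

def should_aten_fallback(name, has_symbolic, operator_export_type):
    # Only ops in the aten domain are candidates.
    if not name.startswith("aten::"):
        return False
    # ONNX_ATEN: emit an aten::ATen op regardless of symbolics existence.
    if operator_export_type == ONNX_ATEN:
        return True
    # ONNX_ATEN_FALLBACK: fall back only when no symbolic is registered
    # (assumed behavior; the full function is not shown in this hunk).
    if operator_export_type == ONNX_ATEN_FALLBACK:
        return not has_symbolic
    return False

print(should_aten_fallback("aten::gelu", False, ONNX_ATEN_FALLBACK))  # True
```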