[MNN:Sync] Sync Internal 2.8.2
wangzhaode committed Feb 29, 2024
1 parent 5607201 commit 970b63f
Showing 295 changed files with 1,013,758 additions and 3,667 deletions.
5 changes: 4 additions & 1 deletion CMakeLists.txt
@@ -44,6 +44,7 @@ option(MNN_DEBUG_MEMORY "MNN Debug Memory Access" OFF)
option(MNN_DEBUG_TENSOR_SIZE "Enable Tensor Size" OFF)
option(MNN_GPU_TRACE "Enable MNN Gpu Debug" OFF)
option(MNN_SUPPORT_RENDER "Enable MNN Render Ops" OFF)
option(MNN_SUPPORT_TRANSFORMER_FUSE "Enable MNN transformer Fuse Ops" OFF)
option(MNN_PORTABLE_BUILD "Link the static version of third party libraries where possible to improve the portability of built executables" OFF)
option(MNN_SEP_BUILD "Build MNN Backends and expression separately. Only works with MNN_BUILD_SHARED_LIBS=ON" ON)
option(NATIVE_LIBRARY_OUTPUT "Native Library Path" OFF)
@@ -166,7 +167,9 @@ endif()
if(MNN_SUPPORT_RENDER)
add_definitions(-DMNN_SUPPORT_RENDER)
endif()

if(MNN_SUPPORT_TRANSFORMER_FUSE)
add_definitions(-DMNN_SUPPORT_TRANSFORMER_FUSE)
endif()
# debug options
if(MNN_DEBUG_MEMORY)
add_definitions(-DMNN_DEBUG_MEMORY)
6 changes: 6 additions & 0 deletions docker_release.sh
@@ -0,0 +1,6 @@
# using docker run release
docker start mnn_release
docker exec -i -e TEST_ID=$(pwd | awk -F "/" '{print $(NF-1)}') mnn_release bash <<'EOF'
cd ~/yanxing_zhaode/cise/space/$TEST_ID/source && ./release.sh pymnn
exit
EOF
1 change: 1 addition & 0 deletions docs/compile/cmake.md
@@ -81,4 +81,5 @@ MNN builds with CMake; the macros defined in CMake are listed below:
| MNN_VULKAN_IMAGE | Use the Image memory mode when building MNN's Vulkan backend, to support FP16 and GPU acceleration on some mobile devices; defaults to `ON` |
| MNN_LOW_MEMORY | Whether to support low-memory mode; if enabled and a weight-quantized model is run with `low_memory` set, weights are dequantized at compute time; defaults to `OFF` |
| MNN_SUPPORT_RENDER | Whether to support the graphics-rendering ops; defaults to `OFF` |
| MNN_SUPPORT_TRANSFORMER_FUSE | Whether to support the fused Transformer ops; defaults to `OFF` |
| MNN_BUILD_LLM | Whether to build the MNN-based LLM library and demo; defaults to `OFF` |
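
Options such as `MNN_SUPPORT_TRANSFORMER_FUSE` are surfaced to source code as compile definitions of the same name (see the `add_definitions` call in the CMakeLists.txt hunk above); a minimal sketch, with a hypothetical function name, of how code can gate on the switch:

```cpp
#ifdef MNN_SUPPORT_TRANSFORMER_FUSE
// Compiled only when cmake is configured with -DMNN_SUPPORT_TRANSFORMER_FUSE=ON.
void registerTransformerFuseOps();
#endif
```
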
4 changes: 2 additions & 2 deletions docs/faq.md
@@ -246,7 +246,7 @@ The time spent in a GPU-backend copy call has two parts
- On x86 / x64 without VNNI instructions, quantized computation must first widen int8 to int16, then multiply-accumulate into int32, which is inherently slower than multiply-accumulating floats directly into fp32.
- On x64 with VNNI, quantized computation has an sdot instruction and is clearly faster than FP32; build MNN with MNN_AVX512 enabled to use it, typically about 30% faster than AVX512 floating point.
- ARMv7a / ARMv8: quantized computation multiply-accumulates int8 into int16 and then pairwise-adds into int32, slightly faster than floating point (typically around a 30% gain).
- ARMv8.2+: quantized computation has the sdot instruction, but FP32 issue width also doubled relative to earlier architectures; build MNN with MNN_ARM82 to enable sdot so quantized computation is faster, otherwise FP32 is faster; ideally more than twice as fast as FP32 and 20% faster than FP16.
- The ARMv8.2 architecture has the sdot instruction, but FP32 issue width also doubled relative to earlier architectures, and it adds FP16 vector instructions twice as fast as FP32; MNN checks the device architecture to enable sdot / smmla; ideally quantized computation is more than twice as fast as FP32 and 20% faster than FP16.
## Other questions
### How to encrypt an MNN model
@@ -256,4 +256,4 @@ The time spent in a GPU-backend copy call has two parts
2. Run `schema/generate.sh` to regenerate the `flatbuffers` headers;
3. Rebuild the `MNN` library and all tools such as `Convert`;
4. Use the new tools to re-convert the model;
5. Deploy on device with the new model and the new `MNN` library;
5. Deploy on device with the new model and the new `MNN` library;
7 changes: 5 additions & 2 deletions docs/inference/module.md
@@ -45,9 +45,10 @@ rtmgr->setCache(".cachefile");
// Load a model file and create a new Module
const std::string model_file = "/tmp/mymodule.mnn"; // model file with path
// Input names: may be empty; if empty, MNN searches the model for inputs automatically, but with multiple inputs the order is not guaranteed and must be checked through the getInfo interface
// Input names: with multiple inputs, fill them in order; the order must match the input array later passed to onForward
const std::vector<std::string> input_names{"input_1", "input_2", "input_3"};
// Output names: may be empty; if empty, MNN searches the model for outputs automatically, but with multiple outputs the order is not guaranteed and must be checked through the getInfo interface
// Output names: with multiple outputs, fill them in order; this order determines the order of onForward's output array
const std::vector<std::string> output_names{"output_1"};
Module::Config mdconfig; // default module config
@@ -56,6 +57,8 @@ std::unique_ptr<Module> module; // module
module.reset(Module::load(input_names, output_names, model_filename.c_str(), rtMgr, &mdconfig));
```

The input and output names may be left empty; MNN then searches the model for its inputs/outputs and fills them in. With multiple inputs/outputs the order is not guaranteed, so check it through the getInfo interface, as in the sketch below.
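
A minimal sketch of this pattern (hypothetical model path; `Module::load` and `getInfo` as declared in `Module.hpp`):

```cpp
#include <MNN/MNNDefine.h>
#include <MNN/expr/Module.hpp>
using namespace MNN::Express;

void showResolvedNames() {
    // Empty name lists: MNN resolves the model's inputs/outputs itself.
    std::shared_ptr<Module> net(Module::load({}, {}, "mymodule.mnn"));
    if (net == nullptr) {
        return;
    }
    auto info = net->getInfo();
    // The resolved order reported here is the order onForward expects.
    for (const auto& name : info->inputNames) {
        MNN_PRINT("input: %s\n", name.c_str());
    }
    for (const auto& name : info->outputNames) {
        MNN_PRINT("output: %s\n", name.c_str());
    }
}
```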

### Module::Config
A `Module::Config` can be passed when creating a `Module`; it is structured as follows:

56 changes: 56 additions & 0 deletions docs/pymnn/expr.md
@@ -190,6 +190,62 @@ array([0., 1., 2., 3.], dtype=float32)
```python
>>> expr.set_global_executor_config(2, 2, 1)
```

---
### `sync()`
Synchronize the MNN VARP; after the call, this VARP's computation is guaranteed to be complete.

Returns: `None`

Return type: `None`

Example:

```python
>>> mnn_var = expr.placeholder([2,2])
>>> mnn_var.sync()
```

---
### `set_device_ptr(device_ptr, memory_type)`
Set the GPU memory address of an MNN VARP, declaring the memory type (CUDA/OpenCL/OpenGL, etc.) of the given address; only usable when the MNN VARP holds GPU memory:

Parameters:
- `device_ptr:uint64_t` memory pointer address as an integer
- `memory_type:int` e.g. 2 -> CUDA, 3 -> OpenCL; see `MNNForwardType` in include/MNN/MNNForwardType.h

Returns: `None`

Return type: `None`

Example:

```python
>>> torch_tensor = torch.empty([1, 1000], dtype=torch.float16).cuda()
>>> mnn_var = expr.placeholder([2,2])
>>> mnn_var.set_device_ptr(torch_tensor.data_ptr(), 2)
```

---
### `copy_to_device_ptr(device_ptr, memory_type)`
Copy the MNN VARP's GPU memory to the given memory address, declaring the memory type (CUDA/OpenCL/OpenGL, etc.) of that address:

Parameters:
- `device_ptr:uint64_t` memory pointer address as an integer
- `memory_type:int` e.g. 2 -> CUDA, 3 -> OpenCL; see `MNNForwardType` in include/MNN/MNNForwardType.h

Returns: `None`

Return type: `None`

Example:

```python
>>> torch_tensor = torch.empty([1, 1000], dtype=torch.float16).cuda()
>>> mnn_var = expr.placeholder([2,2])
>>> mnn_var.copy_to_device_ptr(torch_tensor.data_ptr(), 2)
```

---
### `sign(x)`
Return the sign of the input value: 1 for positive numbers, -1 for negative numbers
9 changes: 8 additions & 1 deletion express/Executor.cpp
@@ -58,6 +58,10 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig&
mAttr->firstType = std::make_pair(type, numberThread);
info.mode = Backend::Info::DIRECT;
info.numThread = numberThread;
if (MNN_FORWARD_METAL == type) {
// Close metal's defer encoder
info.numThread |= MNN_GPU_RECORD_OP;
}
info.user = (BackendConfig*)&config;
std::shared_ptr<Runtime> bn(creator->onCreate(info));
mRuntimes[mAttr->firstType] = bn;
@@ -257,6 +261,9 @@ void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) {
case Interpreter::STRICT_CHECK_MODEL:
mInside->checkNetBuffer = value > 0;
break;
case Interpreter::MEM_ALLOCATOR_TYPE:
mInside->modes.memoryAllocatorType = value;
break;
default:
break;
}
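
For reference, a minimal usage sketch of the new hint (a sketch only, assuming a CPU schedule; per the `PipelineModule::load` hunk below, the value is stored in `modes.memoryAllocatorType` and forwarded to `Runtime::setAllocatorType` when modules load):

```cpp
#include <MNN/Interpreter.hpp>
#include <MNN/expr/Executor.hpp>
using namespace MNN;
using namespace MNN::Express;

std::shared_ptr<Executor::RuntimeManager> makeRuntime() {
    ScheduleConfig config;
    config.type = MNN_FORWARD_CPU;
    std::shared_ptr<Executor::RuntimeManager> rtMgr(
        Executor::RuntimeManager::createRuntimeManager(config));
    // New in 2.8.2: select the memory allocator implementation; the value is
    // applied to the runtimes when a module is loaded (0 assumed as default).
    rtMgr->setHint(Interpreter::MEM_ALLOCATOR_TYPE, 0);
    return rtMgr;
}
```
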
@@ -538,7 +545,7 @@ void Executor::_makeCache(const std::vector<EXPRP>& expr, bool forceCPU) {
quant->scale = TensorUtils::getDescribe(srcTensor)->quantAttr.get()->scale;
quant->zero = TensorUtils::getDescribe(srcTensor)->quantAttr.get()->zero;
}

TensorUtils::getDescribe(tensor.get())->index = (int)scheduleInfo.allTensors.size();
scheduleInfo.allTensors.emplace_back(tensor);
}
42 changes: 42 additions & 0 deletions express/Expr.cpp
@@ -545,6 +545,48 @@ void Variable::setName(const std::string& name) {
mFrom->setName(name);
}
}

bool Variable::setDevicePtr(const void* devicePtr, int memoryType) {
if (nullptr != mFrom->get()) {
MNN_ERROR("Can't setDevicePtr to no-input op\n");
return false;
}
informDirty();
MNN_ASSERT(TensorUtils::getDescribe(mFrom->inside()->mOutputTensors[0])->quantAttr == nullptr || TensorUtils::getDescribe(mFrom->inside()->mOutputTensors[0])->type == DataType_DT_FLOAT);
mFrom->mInside->mContentDirty = false;
// Clear host address, Don't malloc hostPtr afterwards
Utils::releaseMemoryForHostTensor(mFrom->inside()->mOutputTensors[0]);
return mFrom->inside()->mOutputTensors[0]->setDevicePtr(devicePtr, memoryType);
}

bool Variable::copyToDevicePtr(void* devicePtr, int memoryType) {
if (nullptr != mFrom->get()) {
MNN_ERROR("Can't copyToDevicePtr to no-input op\n");
return false;
}

auto inside = mFrom->inside();
auto originTensor = inside->mOutputTensors[mFromIndex];

auto bn = TensorUtils::getDescribe(originTensor)->getBackend();
if(bn == nullptr) {
MNN_ERROR("Error: Varp copyToDevicePtr can't find backend\n");
return false;
}
if (bn->type() != memoryType) {
MNN_ERROR("Error: VARP backend type ( %d ), is not same as assigned memory type ( %d )\n", bn->type(), memoryType);
return false;
}

MNN::Tensor tempTensor(originTensor->dimensions(), originTensor->getDimensionType());
tempTensor.buffer().device = (uint64_t)devicePtr;

TensorUtils::getDescribe(originTensor)->getBackend()->onCopyBuffer(originTensor, &tempTensor);
// Sync the result
tempTensor.wait(Tensor::MAP_TENSOR_READ, true);
return true;
}

const std::string& Variable::name() const {
return mFrom->outputName(mFromIndex);
}
5 changes: 1 addition & 4 deletions express/Utils.cpp
@@ -116,10 +116,7 @@ bool Utils::allocMemoryForHostTensor(Tensor* dest) {
if (TensorUtils::getDescribe(dest)->memoryType != Tensor::InsideDescribe::MEMORY_HOST) {
return false;
}
auto size = dest->size();
if (0 >= size) {
return false;
}
auto size = dest->usize();
dest->buffer().host = (uint8_t*)MNNMemoryAllocAlign(size, MNN_MEMORY_ALIGN_DEFAULT);
return dest->buffer().host != nullptr;
}
5 changes: 4 additions & 1 deletion express/module/PipelineModule.cpp
@@ -335,7 +335,7 @@ static std::vector<int> _collectNeededOps(const MNN::Net* net, const std::set<in
// 0: use, 1: no use
std::vector<int> opMask(net->oplists()->size());
::memset(opMask.data(), 0, opMask.size() * sizeof(int));

// Set Initial Status
for (auto v : outputIndexes) {
tensorMask[v] = 1;
@@ -638,6 +638,9 @@ Module* PipelineModule::load(const std::vector<std::string>& inputs, const std::
modRuntime.rt.first.begin()->second->setExternalFile(rtMgr->getInside()->mExternalFile);
modRuntime.rt.second->setExternalFile(rtMgr->getInside()->mExternalFile);
}
// set allocator type
modRuntime.rt.first.begin()->second->setAllocatorType(rtMgr->getInside()->modes.memoryAllocatorType);
modRuntime.rt.second->setAllocatorType(rtMgr->getInside()->modes.memoryAllocatorType);
}
auto& rt = modRuntime.rt;
auto firstRt = rt.first[modRuntime.compute.type];
20 changes: 10 additions & 10 deletions include/MNN/ImageProcess.hpp
@@ -44,6 +44,7 @@ enum Wrap { CLAMP_TO_EDGE = 0, ZERO = 1, REPEAT = 2 };
* 2: Sample line and do format convert
* 3: Turn RGBA to float tensor, and do sub and normalize
*/

class MNN_PUBLIC ImageProcess {
public:
struct Inside;
@@ -62,7 +63,6 @@ class MNN_PUBLIC ImageProcess {
/** edge wrapper */
Wrap wrap = CLAMP_TO_EDGE;
};

public:
/**
* @brief create image process with given config for given tensor.
@@ -86,10 +86,10 @@ static ImageProcess* create(const ImageFormat sourceFormat = RGBA, const ImageFormat destFormat = RGBA,
static ImageProcess* create(const ImageFormat sourceFormat = RGBA, const ImageFormat destFormat = RGBA,
const float* means = nullptr, const int meanCount = 0, const float* normals = nullptr,
const int normalCount = 0, const Tensor* dstTensor = nullptr);

~ImageProcess();
static void destroy(ImageProcess* imageProcess);

/**
* @brief get affine transform matrix.
* @return affine transform matrix.
@@ -98,7 +98,7 @@ class MNN_PUBLIC ImageProcess {
return mTransform;
}
void setMatrix(const Matrix& matrix);

/**
* @brief convert source data to given tensor.
* @param source source data.
@@ -109,7 +109,7 @@ class MNN_PUBLIC ImageProcess {
* @return result code.
*/
ErrorCode convert(const uint8_t* source, int iw, int ih, int stride, Tensor* dest);

/**
* @brief convert source data to given tensor.
* @param source source data.
@@ -126,7 +126,7 @@ class MNN_PUBLIC ImageProcess {
*/
ErrorCode convert(const uint8_t* source, int iw, int ih, int stride, void* dest, int ow, int oh, int outputBpp = 0,
int outputStride = 0, halide_type_t type = halide_type_of<float>());

/**
* @brief create tensor with given data.
* @param w image width.
@@ -140,7 +140,7 @@ class MNN_PUBLIC ImageProcess {
return createImageTensor(halide_type_of<T>(), w, h, bpp, p);
}
static Tensor* createImageTensor(halide_type_t type, int w, int h, int bpp, void* p = nullptr);

/**
* @brief set padding value when wrap=ZERO.
* @param value padding value.
@@ -149,14 +149,14 @@ class MNN_PUBLIC ImageProcess {
void setPadding(uint8_t value) {
mPaddingValue = value;
}

/**
* @brief set to draw mode.
* @param void
* @return void.
*/
void setDraw();

/**
* @brief draw color to regions of img.
* @param img the image to draw.
@@ -179,4 +179,4 @@ class MNN_PUBLIC ImageProcess {
} // namespace CV
} // namespace MNN

#endif /* ImageProcess_hpp */
#endif /* MNN_ImageProcess_hpp */
2 changes: 1 addition & 1 deletion include/MNN/MNNDefine.h
@@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \
#define STR(x) STR_IMP(x)
#define MNN_VERSION_MAJOR 2
#define MNN_VERSION_MINOR 8
#define MNN_VERSION_PATCH 1
#define MNN_VERSION_PATCH 2
#define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH)
#endif /* MNNDefine_h */
4 changes: 4 additions & 0 deletions include/MNN/Tensor.hpp
@@ -304,6 +304,10 @@ class MNN_PUBLIC Tensor {
* @param finish wait for command flush or finish
*/
int wait(MapType mtype, bool finish);
/**
* @brief set GPU tensor device ptr, and inform memory type
*/
bool setDevicePtr(const void* devicePtr, int memoryType);
private:
halide_buffer_t mBuffer;
struct InsideDescribe* mDescribe;
5 changes: 4 additions & 1 deletion include/MNN/expr/Expr.hpp
@@ -108,11 +108,14 @@ class MNN_PUBLIC Variable {
Dimensionformat order = NHWC;
INTS dim;
halide_type_t type;
int size;
size_t size;
void syncSize();
};
const std::string& name() const;
void setName(const std::string& name);
bool setDevicePtr(const void* devicePtr, int memoryType);
bool copyToDevicePtr(void* devicePtr, int memoryType);

std::pair<EXPRP, int> expr() const {
return std::make_pair(mFrom, mFromIndex);
}
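
A hedged usage sketch for the two new `Variable` entry points (assumptions: externally owned CUDA buffers, a VARP that already resides on the CUDA backend as `copyToDevicePtr` requires per the Expr.cpp hunk above, and 2 == MNN_FORWARD_CUDA from MNNForwardType.h):

```cpp
#include <MNN/expr/Expr.hpp>
#include <MNN/expr/ExprCreator.hpp>
using namespace MNN::Express;

// Bind an external device buffer as the VARP's storage, then copy the
// VARP's contents out to another device buffer.
bool roundTrip(void* inputDev, void* outputDev) {
    VARP var = _Input({2, 2}, NCHW, halide_type_of<float>());
    if (!var->setDevicePtr(inputDev, 2 /* MNN_FORWARD_CUDA */)) {
        return false;
    }
    // Fails unless the VARP's backend matches the assigned memory type.
    return var->copyToDevicePtr(outputDev, 2 /* MNN_FORWARD_CUDA */);
}
```
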
2 changes: 1 addition & 1 deletion project/android/gradle/wrapper/gradle-wrapper.properties
@@ -3,4 +3,4 @@ distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-4.6-all.zip
distributionUrl=http\://mtl-gradle-mirror.oss-cn-hangzhou.aliyuncs.com/gradle-4.6-all.zip