From 492e770cad64181f35a165fef06529399dd61be1 Mon Sep 17 00:00:00 2001 From: yanxing Date: Wed, 13 Nov 2024 16:02:00 +0800 Subject: [PATCH 1/7] support linux aarch64 pymnn release. --- .github/workflows/pymnn_release.yml | 7 +++++++ pymnn/pip_package/pyproject.toml | 27 ++------------------------- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/.github/workflows/pymnn_release.yml b/.github/workflows/pymnn_release.yml index 4294244c1..fbac1684e 100644 --- a/.github/workflows/pymnn_release.yml +++ b/.github/workflows/pymnn_release.yml @@ -16,6 +16,7 @@ jobs: matrix: include: - { os: ubuntu-latest, arch: x86_64, build: 'cp*-manylinux*' } + - { os: ubuntu-latest, arch: aarch64, build: 'cp*-manylinux*' } - { os: windows-latest, arch: AMD64, build: 'cp*' } - { os: macos-13, arch: x86_64, build: 'cp*' } - { os: macos-14, arch: arm64, build: 'cp*' } @@ -33,6 +34,12 @@ jobs: with: python-version: '3.12' + - name: set up qemu + if: matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64' + uses: docker/setup-qemu-action@v3 + with: + platforms: all + - name: install pipx if: matrix.os == 'macos-14' run: python -m pip install pipx diff --git a/pymnn/pip_package/pyproject.toml b/pymnn/pip_package/pyproject.toml index 8f9732815..0e2a7a18a 100644 --- a/pymnn/pip_package/pyproject.toml +++ b/pymnn/pip_package/pyproject.toml @@ -8,24 +8,6 @@ requires = [ build-backend = "setuptools.build_meta" -[tool.cibuildwheel] -test-skip = [ - "cp36-*", - "*-macosx_arm64", - "*-macosx_x86_64" -] -test-requires = [ - "opencv-python", - "numpy", - "torch" -] -test-command = [ - "cd {project}/pymnn/test", - "ls", - "python unit_test.py", - "cd ../.." -] - [tool.cibuildwheel.macos] archs = ["native"] build = "cp*-macosx_*" @@ -48,7 +30,7 @@ archs = ["native"] repair-wheel-command = [ "export LD_LIBRARY_PATH=$(pwd)/pymnn_build/tools/converter/libtorch/lib:$LD_LIBRARY_PATH", "echo $LD_LIBRARY_PATH", - "auditwheel repair --plat manylinux2014_x86_64 -w {dest_dir} {wheel}" + "auditwheel repair -w {dest_dir} {wheel}" ] [tool.cibuildwheel.windows] @@ -56,9 +38,4 @@ before-all = [ "cd pymnn/pip_package", "python3 build_deps.py", "cd ../.." -] -test-command = [ - "cd /d {project}/pymnn/test", - "python unit_test.py", - "cd ../.." -] +] \ No newline at end of file From 04cdaf1781c8454e2343943f6d6249f29457f70a Mon Sep 17 00:00:00 2001 From: yanxing Date: Thu, 14 Nov 2024 15:28:52 +0800 Subject: [PATCH 2/7] [MNN:CI] update actions/upload-artifact@v4. 
--- .github/workflows/mnn_release.yml | 2 +- .github/workflows/pymnn_release.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mnn_release.yml b/.github/workflows/mnn_release.yml index 867ac3404..afbc0024c 100644 --- a/.github/workflows/mnn_release.yml +++ b/.github/workflows/mnn_release.yml @@ -56,7 +56,7 @@ jobs: - name: package run: 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: path: ${{ env.PACKAGENAME }}.zip diff --git a/.github/workflows/pymnn_release.yml b/.github/workflows/pymnn_release.yml index 090bfce43..e0bb16567 100644 --- a/.github/workflows/pymnn_release.yml +++ b/.github/workflows/pymnn_release.yml @@ -67,7 +67,7 @@ jobs: shell: bash - name: Upload wheels - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: path: wheelhouse/*.whl From deb79255e1bf03e5831a29fb7b7d8d8f973c1f26 Mon Sep 17 00:00:00 2001 From: yanxing Date: Thu, 14 Nov 2024 16:45:40 +0800 Subject: [PATCH 3/7] [MNN:CI] update upload/download-artifact@v4. --- .github/workflows/mnn_release.yml | 23 +++++++++++++++-------- .github/workflows/pymnn_release.yml | 7 ++++--- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/.github/workflows/mnn_release.yml b/.github/workflows/mnn_release.yml index afbc0024c..eb033918e 100644 --- a/.github/workflows/mnn_release.yml +++ b/.github/workflows/mnn_release.yml @@ -3,6 +3,8 @@ on: push: tags: - '*' + workflow_dispatch: + jobs: setup: permissions: @@ -34,8 +36,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: + name: artifact-${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip windows-release: @@ -44,7 +47,7 @@ jobs: env: PACKAGENAME: mnn_${{ needs.setup.outputs.VERSION }}_windows_x64_cpu_opencl steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true @@ -58,6 +61,7 @@ jobs: - name: upload-zip uses: actions/upload-artifact@v4 with: + name: artifact-${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip macos-release: @@ -79,8 +83,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: + name: artifact-${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip android-release: @@ -99,8 +104,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: + name: artifact-${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip ios-release: @@ -123,8 +129,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ios_build/Release-iphoneos/MNN.framework - name: upload-zip - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: + name: artifact-${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip upload-release: @@ -132,9 +139,9 @@ jobs: needs: [linux-release, windows-release, macos-release, android-release, ios-release] runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4.1.7 + - uses: actions/download-artifact@v4 with: - name: artifact + pattern: artifact-* path: assert - name: show file @@ -146,4 +153,4 @@ jobs: with: file: assert/*.zip tags: true - draft: true + draft: true 
\ No newline at end of file diff --git a/.github/workflows/pymnn_release.yml b/.github/workflows/pymnn_release.yml index e0bb16567..c04ddf5ae 100644 --- a/.github/workflows/pymnn_release.yml +++ b/.github/workflows/pymnn_release.yml @@ -69,6 +69,7 @@ jobs: - name: Upload wheels uses: actions/upload-artifact@v4 with: + name: artifact-${{ matrix.arch }}-${{ matrix.arch }} path: wheelhouse/*.whl publish_wheels: @@ -83,12 +84,12 @@ jobs: with: python-version: '3.x' - - uses: actions/download-artifact@v4.1.7 + - uses: actions/download-artifact@v4 with: - name: artifact + pattern: artifact-* path: dist - uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} - skip_existing: true + skip_existing: true \ No newline at end of file From 0222000cbdec0b2e031a4a87db763afaaccea7f1 Mon Sep 17 00:00:00 2001 From: yanxing Date: Thu, 14 Nov 2024 17:26:48 +0800 Subject: [PATCH 4/7] [MNN:CI] mnn release support dev test. --- .github/workflows/mnn_release.yml | 9 ++++++++- .github/workflows/pymnn_release.yml | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mnn_release.yml b/.github/workflows/mnn_release.yml index eb033918e..91c6839b3 100644 --- a/.github/workflows/mnn_release.yml +++ b/.github/workflows/mnn_release.yml @@ -15,7 +15,14 @@ jobs: steps: - name: get-version id: get_version - run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT + run: | + if [[ "${GITHUB_REF}" == refs/tags/* ]]; then + # 提取标签版本号 + echo "VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + else + # 如果不是标签,则设置版本为 'dev' + echo "VERSION=dev" >> $GITHUB_OUTPUT + fi linux-release: needs: [setup] diff --git a/.github/workflows/pymnn_release.yml b/.github/workflows/pymnn_release.yml index c04ddf5ae..f7f6311a1 100644 --- a/.github/workflows/pymnn_release.yml +++ b/.github/workflows/pymnn_release.yml @@ -69,7 +69,7 @@ jobs: - name: Upload wheels uses: actions/upload-artifact@v4 with: - name: artifact-${{ matrix.arch }}-${{ matrix.arch }} + name: artifact-${{ matrix.os }}-${{ matrix.arch }} path: wheelhouse/*.whl publish_wheels: From 17bc7f0f4394c802c3495aae9e7ccef1156be6c9 Mon Sep 17 00:00:00 2001 From: yanxing Date: Thu, 14 Nov 2024 21:50:29 +0800 Subject: [PATCH 5/7] [MNN:CI] add merge-multiple in download-artifact --- .github/workflows/mnn_release.yml | 1 + .github/workflows/pymnn_release.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/mnn_release.yml b/.github/workflows/mnn_release.yml index 91c6839b3..42fcb18fb 100644 --- a/.github/workflows/mnn_release.yml +++ b/.github/workflows/mnn_release.yml @@ -150,6 +150,7 @@ jobs: with: pattern: artifact-* path: assert + merge-multiple: true - name: show file run: ls assert diff --git a/.github/workflows/pymnn_release.yml b/.github/workflows/pymnn_release.yml index f7f6311a1..69cdc1ae3 100644 --- a/.github/workflows/pymnn_release.yml +++ b/.github/workflows/pymnn_release.yml @@ -88,6 +88,7 @@ jobs: with: pattern: artifact-* path: dist + merge-multiple: true - uses: pypa/gh-action-pypi-publish@release/v1 with: From 5b901d9d87acba5fab24dcec5e975f1ad9bc9973 Mon Sep 17 00:00:00 2001 From: xiaying Date: Mon, 18 Nov 2024 14:37:45 +0800 Subject: [PATCH 6/7] MNN:Sync: Sync Internal 3.0.0 --- .gitignore | 1 + CMakeLists.txt | 33 +- README.md | 8 +- README_CN.md | 8 +- docs/compile/cmake.md | 4 +- docs/compile/engine.md | 74 +- docs/contribute/op.md | 52 +- docs/faq.md | 15 +- docs/index.rst | 3 +- docs/inference/session.md | 27 +- docs/tools/compress.md | 97 +- docs/tools/convert.md | 8 +- 
docs/tools/quant.md | 9 +- docs/tools/test.md | 4 +- docs/train/quant.md | 100 - docs/transformers/diffusion.md | 4 +- docs/transformers/models.md | 50 + express/Executor.cpp | 10 + express/Expr.cpp | 1 + express/RuntimeAttr.hpp | 2 + express/Utils.cpp | 15 +- express/module/Module.cpp | 6 +- include/MNN/MNNDefine.h | 12 +- project/harmony/build_64.sh | 3 +- project/harmony/updateTest.sh | 3 +- project/ios/MNN.xcodeproj/project.pbxproj | 36 +- pymnn/CMakeLists.txt | 5 +- .../MNNQuant/test_mnn_offline_quant.py | 201 - pymnn/src/expr.h | 2 +- pymnn/src/llm.h | 34 + schema/current/CaffeOp_generated.h | 26 +- schema/current/MNN_generated.h | 8 +- schema/default/CaffeOp.fbs | 1 + schema/default/MNN.fbs | 2 +- source/backend/arm82/Arm82Backend.cpp | 1 + source/backend/arm82/Arm82Functions.cpp | 242 +- source/backend/arm82/Arm82Functions.hpp | 1 + .../arm64/low_memory/MNNDynamicQuantFP16.S | 284 +- ...MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S | 2 +- ...GemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S | 66 +- ...MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S | 2 +- ...GemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S | 14 +- .../asm/arm64/low_memory/MNNQuantScaleFP16.S | 30 +- source/backend/coreml/CMakeLists.txt | 19 +- .../backend/coreml/backend/CoreMLBackend.cpp | 93 +- .../backend/coreml/backend/CoreMLBackend.hpp | 6 +- .../backend/coreml/backend/CoreMLExecutor.h | 3 +- .../backend/coreml/backend/CoreMLExecutor.mm | 165 +- .../coreml/backend/CoreMLExecutorWrapper.h | 2 +- .../coreml/backend/CoreMLExecutorWrapper.mm | 12 +- .../coreml/backend/CoreMLOPRegister.cpp | 10 +- .../backend/coreml/backend/CoreMLRaster.metal | 39 - .../coreml/execution/CoreMLActivation.cpp | 40 +- .../backend/coreml/execution/CoreMLBinary.cpp | 25 +- .../coreml/execution/CoreMLConvolution.cpp | 85 +- .../coreml/execution/CoreMLConvolution.hpp | 2 +- .../backend/coreml/execution/CoreMLMatMul.cpp | 57 + .../backend/coreml/execution/CoreMLMatMul.hpp | 25 + .../backend/coreml/execution/CoreMLRelu6.cpp | 36 + .../backend/coreml/execution/CoreMLRelu6.hpp | 28 + .../coreml/execution/coreMLLayerNorm.hpp | 2 +- source/backend/cpu/CPUCast.cpp | 4 +- .../backend/cpu/CPUConvolutionDepthwise.cpp | 9 + source/backend/cpu/CPUDeconvolution.cpp | 70 +- source/backend/cpu/CPUDeconvolution.hpp | 51 +- source/backend/cpu/CPUFloatToInt8.cpp | 7 +- source/backend/cpu/CPUInt8ToFloat.cpp | 21 +- source/backend/cpu/CPUInt8ToFloat.hpp | 2 +- source/backend/cpu/CPURNNSequenceGRU.cpp | 96 +- source/backend/cpu/CPURNNSequenceGRU.hpp | 2 +- source/backend/cpu/CPUUnique.cpp | 50 +- source/backend/cpu/OneDNNConvInt8.cpp | 2 +- .../arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S | 6 +- .../MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S | 8 +- .../MNNGemmInt8AddBiasScale_16x4_w4_Unit.S | 4 +- .../cpu/arm/arm32/MNNInt8ScaleToFloat.S | 35 +- .../arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S | 23 +- .../MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S | 15 +- .../MNNGemmInt8AddBiasScale_ARMV82_Unit.S | 7 +- .../MNNGemmInt8AddBiasScale_ARMV86_Unit.S | 2 +- .../cpu/arm/arm64/MNNInt8ScaleToFloat.S | 31 +- .../arm/arm64/MNNPackC4Int8ForMatMulA_ARM82.S | 6 +- .../arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S | 5 +- .../MNNGemmInt8AddBiasScale_16x4_w4_Unit.S | 8 +- .../MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S | 6 +- .../MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S | 2 +- .../arm/arm64/low_memory/MNNQuantScaleFP32.S | 8 + .../backend/cpu/compute/CommonOptFunction.cpp | 23 +- .../cpu/compute/ConvInt8TiledExecutor.cpp | 64 +- .../backend/cpu/compute/ConvInt8Winograd.cpp | 4 +- 
.../cpu/compute/ConvolutionFloatFactory.cpp | 7 +- .../compute/ConvolutionPackFreeWinograd.cpp | 7 +- .../backend/cpu/compute/GemmInt8Executor.cpp | 3 +- .../cpu/compute/IdstConvolutionInt8.cpp | 6 +- .../backend/cpu/compute/Int8FunctionsOpt.cpp | 157 +- source/backend/cpu/compute/Int8FunctionsOpt.h | 4 +- source/backend/cpu/x86_x64/AVX2Backend.cpp | 7 +- source/backend/cpu/x86_x64/AVX2Functions.cpp | 9 +- source/backend/cpu/x86_x64/CMakeLists.txt | 45 +- .../cpu/x86_x64/FunctionDispatcher.cpp | 12 +- .../cpu/x86_x64/avx/FunctionSummary.hpp | 8 +- source/backend/cpu/x86_x64/avx/GemmInt8.cpp | 202 +- .../cpu/x86_x64/avx512/FunctionSummary.hpp | 7 +- .../backend/cpu/x86_x64/avx512/GemmInt8.cpp | 99 +- .../cpu/x86_x64/avx512/GemmInt8_VNNI.cpp | 32 +- .../cpu/x86_x64/avx512/Matmul_4_4_64.inl | 32 +- .../cpu/x86_x64/avxfma/FunctionSummary.hpp | 6 +- .../cpu/x86_x64/sse/FunctionSummary.hpp | 11 +- source/backend/cpu/x86_x64/sse/GemmInt8.cpp | 107 +- source/backend/cuda/CMakeLists.txt | 10 +- source/backend/metal/AllShader.cpp | 283 +- source/backend/metal/MNNMetalContext.h | 1 - source/backend/metal/MNNMetalContext.mm | 15 - source/backend/metal/MetalAttention.mm | 296 +- source/backend/metal/MetalBackend.hpp | 10 +- source/backend/metal/MetalBackend.mm | 4 + source/backend/metal/MetalConvolution1x1.mm | 18 +- .../backend/metal/MetalConvolutionCommon.mm | 8 +- source/backend/metal/MetalLayerNorm.mm | 37 +- source/backend/metal/MetalRaster.mm | 7 + .../metal/shader/MetalConvolution1x1.metal | 117 +- .../backend/metal/shader/MetalLayerNorm.metal | 138 +- .../backend/metal/shader/MetalSoftmax.metal | 47 +- source/backend/opencl/core/OpenCLBackend.cpp | 6 +- .../opencl/core/runtime/OpenCLRuntime.cpp | 21 +- .../opencl/core/runtime/OpenCLWrapper.cpp | 14 +- .../buffer/AttentionBufExecution.cpp | 1308 ++++-- .../buffer/AttentionBufExecution.hpp | 13 +- .../execution/buffer/BinaryBufExecution.cpp | 2 + .../execution/buffer/ConvBufExecution.cpp | 2 + .../buffer/ConvBufLowMemoryExecution.cpp | 7 +- .../execution/buffer/PoolBufExecution.cpp | 4 +- .../execution/buffer/ReluBufExecution.cpp | 3 +- .../execution/buffer/UnaryBufExecution.cpp | 2 + .../opencl/execution/cl/attention_buf.cl | 906 ++-- .../opencl/execution/cl/conv_2d_buf.cl | 2 +- .../opencl/execution/cl/opencl_program.cc | 944 ++-- source/backend/opencl/execution/cl/select.cl | 2 +- .../opencl/execution/cl/self_attention_buf.cl | 36 + .../image/ConvLowMemoryExecution.cpp | 7 +- .../vulkan/buffer/execution/VulkanRaster.cpp | 1 - .../vulkan/image/backend/VulkanBackend.cpp | 16 +- .../vulkan/image/backend/VulkanBackend.hpp | 6 +- .../vulkan/image/compiler/AllShader.cpp | 4058 ++++++++++++----- .../vulkan/image/compiler/VulkanShaderMap.cpp | 6 +- .../vulkan/image/execution/VulkanReduce.cpp | 2 +- .../vulkan/image/execution/VulkanSoftmax.cpp | 162 +- .../vulkan/image/execution/VulkanSoftmax.hpp | 13 +- .../vulkan/image/execution/glsl/macro.json | 6 + .../vulkan/image/execution/glsl/reduce.comp | 118 +- .../execution/glsl/softmaxHeight_NHWC.comp | 47 - .../image/execution/glsl/softmaxImage.comp | 288 ++ .../backend/vulkan/image/shaders/AllShader.h | 12 +- source/core/ConvolutionCommon.cpp | 480 +- source/core/ConvolutionCommon.hpp | 1 + source/core/IDSTDecoder.hpp | 433 -- source/core/IDSTEncoder.hpp | 46 +- source/core/Interpreter.cpp | 3 + source/core/OpCommonUtils.cpp | 45 +- source/core/OpCommonUtils.hpp | 2 + source/core/Pipeline.cpp | 8 +- source/core/Session.cpp | 6 +- source/core/SimdHeader.h | 15 + source/core/Tensor.cpp | 2 +- 
source/core/TensorUtils.cpp | 22 + source/geometry/GeometryBinary.cpp | 2 +- source/geometry/GeometryConvert.cpp | 26 +- source/geometry/GeometryELU.cpp | 35 +- source/geometry/GeometryImageOp.cpp | 129 +- source/math/Vec.hpp | 11 +- source/shape/ShapeCast.cpp | 5 + source/shape/ShapeInterp.cpp | 7 +- source/shape/ShapeRegister.cpp | 2 + source/shape/ShapeTensorArray.cpp | 9 +- source/shape/ShapeUnique.cpp | 14 +- test.sh | 11 +- test/core/BackendTest.cpp | 39 +- test/core/TensorTest.cpp | 6 +- test/expr/ModuleTest.cpp | 59 +- test/op/ConvInt8Test.cpp | 52 +- test/op/ConvolutionTest.cpp | 6 +- test/op/DeconvolutionTest.cpp | 7 +- test/op/LayerNormTest.cpp | 8 +- test/op/RasterTest.cpp | 47 + test/op/ReductionTest.cpp | 82 +- test/op/ResizeTest.cpp | 44 + test/speed/HybridConvSpeedTest.cpp | 16 +- tools/MNNPythonOfflineQuant/ReadMe.txt | 44 - .../calibration_dataset.py | 106 - tools/MNNPythonOfflineQuant/config.yaml | 10 - .../mnn_offline_quant.py | 137 - tools/converter/include/config.hpp | 1 + tools/converter/source/TestConvertResult.cpp | 1 + tools/converter/source/common/cli.cpp | 17 +- tools/converter/source/common/writeFb.cpp | 4 +- tools/converter/source/onnx/CastLikeOnnx.cpp | 26 + tools/converter/source/onnx/UniqueOnnx.cpp | 34 + .../optimizer/merge/ConstantFolding.cpp | 16 +- .../ConvDeQuantizeLinearFuseToConvInt8.cpp | 36 +- .../optimizer/merge/ConvertMatMulToConv2D.cpp | 2 +- .../source/optimizer/merge/FuseTemplateOp.cpp | 2 +- .../onnxextra/OnnxConvolutionMerge.cpp | 123 +- .../onnxextra/OnnxDeQuantizeLinear.cpp | 105 +- .../source/optimizer/onnxextra/OnnxGemm.cpp | 151 + .../onnxextra/OnnxQuantizeLinear.cpp | 43 +- .../postconvert/RemoveInvalidCast.cpp | 54 +- .../optimizer/tflitextra/FullConnect.cpp | 8 +- .../converter/source/tflite/liteConverter.cpp | 15 +- tools/converter/source/torch/CMakeLists.txt | 1 + tools/converter/source/torch/EluTorch.cpp | 35 + tools/cpp/CMakeLists.txt | 2 - tools/cpp/LoRA.cpp | 272 -- tools/cpp/LoRA.hpp | 32 - tools/cpp/ModuleBasic.cpp | 11 +- tools/cpp/backendTest.cpp | 231 +- tools/cpp/testModel.cpp | 3 +- tools/quantization/calibration.cpp | 14 +- tools/script/make_test_for_mnn.py | 26 +- transformers/llm/engine/include/llm/llm.hpp | 16 +- transformers/llm/engine/llm_demo.cpp | 11 + transformers/llm/engine/src/llm.cpp | 178 +- transformers/llm/engine/src/llmconfig.hpp | 4 + transformers/llm/eval/evaluate_perplexity.py | 68 + transformers/llm/export/README.md | 2 +- transformers/llm/export/llmexport.py | 1398 +++++- transformers/llm/export/requirements.txt | 14 + 226 files changed, 11364 insertions(+), 6061 deletions(-) delete mode 100644 docs/train/quant.md create mode 100644 docs/transformers/models.md delete mode 100644 pymnn/examples/MNNQuant/test_mnn_offline_quant.py delete mode 100644 source/backend/coreml/backend/CoreMLRaster.metal create mode 100644 source/backend/coreml/execution/CoreMLMatMul.cpp create mode 100644 source/backend/coreml/execution/CoreMLMatMul.hpp create mode 100644 source/backend/coreml/execution/CoreMLRelu6.cpp create mode 100644 source/backend/coreml/execution/CoreMLRelu6.hpp delete mode 100644 source/backend/vulkan/image/execution/glsl/softmaxHeight_NHWC.comp create mode 100644 source/backend/vulkan/image/execution/glsl/softmaxImage.comp delete mode 100644 source/core/IDSTDecoder.hpp create mode 100644 source/core/SimdHeader.h delete mode 100644 tools/MNNPythonOfflineQuant/ReadMe.txt delete mode 100644 tools/MNNPythonOfflineQuant/calibration_dataset.py delete mode 100644 tools/MNNPythonOfflineQuant/config.yaml 
delete mode 100644 tools/MNNPythonOfflineQuant/mnn_offline_quant.py create mode 100644 tools/converter/source/onnx/CastLikeOnnx.cpp create mode 100644 tools/converter/source/onnx/UniqueOnnx.cpp create mode 100644 tools/converter/source/torch/EluTorch.cpp delete mode 100644 tools/cpp/LoRA.cpp delete mode 100644 tools/cpp/LoRA.hpp create mode 100644 transformers/llm/eval/evaluate_perplexity.py create mode 100644 transformers/llm/export/requirements.txt diff --git a/.gitignore b/.gitignore index a6391ada9..d11586811 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ out/ .gradle .gradle/ build/ +buildvisionOs/ # Signing files .signing/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 9983eae10..e345987d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ option(MNN_SUPPORT_BF16 "Enable MNN's bf16 op" OFF) option(MNN_LOW_MEMORY "Build MNN support low memory for weight quant model." OFF) option(MNN_CPU_WEIGHT_DEQUANT_GEMM "Build MNN CPU weight dequant related gemm kernels." OFF) -IF (OHOS) +IF (OHOS AND MNN_INTERNAL) include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake) export_headers(DIR ${CMAKE_SOURCE_DIR}/include/MNN) IF (MNN_BUILD_OPENCV) @@ -209,6 +209,7 @@ option(MNN_VULKAN "Enable Vulkan" OFF) option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON) option(MNN_KLEIDIAI "Enable KLEIDIAI" OFF) option(MNN_ONEDNN "Enable oneDNN" OFF) +option(MNN_AVX2 "Open AVX2 Compile for x86 if possible" ON) option(MNN_AVX512 "Enable AVX512" OFF) option(MNN_CUDA "Enable CUDA" OFF) option(MNN_TENSORRT "Enable TensorRT" OFF) @@ -312,6 +313,9 @@ IF(MNN_DEBUG_MEMORY) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") endif() +set(MNN_DEPS "") +set(MNN_EXTRA_DEPENDS "") + IF(CMAKE_BUILD_TYPE MATCHES Debug) add_definitions(-DMNN_DEBUG -DDEBUG) if(MSVC) @@ -337,6 +341,13 @@ else() endif() endif() ENDIF(CMAKE_BUILD_TYPE MATCHES Debug) +if(OHOS) + IF(MNN_USE_LOGCAT) + add_definitions(-DMNN_USE_LOGCAT) + add_definitions(-Wno-format-security) + list(APPEND MNN_EXTRA_DEPENDS libhilog_ndk.z.so) + ENDIF() +endif() if(CMAKE_SYSTEM_NAME MATCHES "^Android") IF(MNN_USE_LOGCAT) add_definitions(-DMNN_USE_LOGCAT) @@ -456,8 +467,6 @@ IF(MNN_BUILD_LLM) ENDIF() -set(MNN_DEPS "") -set(MNN_EXTRA_DEPENDS "") # Add Thread dependency find_package(Threads) @@ -505,13 +514,11 @@ if (NOT MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math -fno-rtti -fno-exceptions ") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fstrict-aliasing -ffunction-sections -fdata-sections -ffast-math") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:fast") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:precise") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /fp:precise") endif() # Metal -set(MNN_DEPS "") -set(MNN_EXTRA_DEPENDS "") list(APPEND MNN_DEPS MNN) # Plugin @@ -531,14 +538,10 @@ endif() # CoreML IF(MNN_COREML) add_definitions(-DMNN_COREML_ENABLED=1) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/coreml/) + include(${CMAKE_CURRENT_LIST_DIR}/source/backend/coreml/CMakeLists.txt) - IF(MNN_SEP_BUILD) - list(APPEND MNN_DEPS MNNCoreML) - list(APPEND MNN_EXTRA_DEPENDS MNNCoreML) - ELSE() - list(APPEND MNN_OBJECTS_TO_LINK $) - ENDIF() + list(APPEND MNN_TARGETS MNNCoreML) + list(APPEND MNN_OBJECTS_TO_LINK $) find_library(COREML CoreML) find_library(FOUNDATION Foundation) @@ -639,7 +642,7 @@ ELSE() ENDIF() # Model Internal. Enable MNN internal features such as model authentication and metrics logging. 
-if (MNN_INTERNAL) +if (MNN_INTERNAL AND NOT OHOS) # TODO: support OHOS logging target_compile_options(MNNCore PRIVATE -DMNN_INTERNAL_ENABLED) target_compile_options(MNN_Express PRIVATE -DMNN_INTERNAL_ENABLED) include(${CMAKE_CURRENT_LIST_DIR}/source/internal/logging/CMakeLists.txt) diff --git a/README.md b/README.md index 32b69e3ff..c7d8e0cc8 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,10 @@ ## Intro MNN is a highly efficient and lightweight deep learning framework. It supports inference and training of deep learning models and has industry-leading performance for inference and training on-device. At present, MNN has been integrated into more than 30 apps of Alibaba Inc, such as Taobao, Tmall, Youku, DingTalk, Xianyu, etc., covering more than 70 usage scenarios such as live broadcast, short video capture, search recommendation, product searching by image, interactive marketing, equity distribution, security risk control. In addition, MNN is also used on embedded devices, such as IoT. +[MNN-LLM](https://github.com/alibaba/MNN/tree/master/transformers/llm) is a large language model runtime solution developed based on the MNN engine. The mission of this project is to deploy LLM models locally on everyone's platforms(Mobile Phone/PC/IOT). It supports popular large language models such as Qianwen, Baichuan, Zhipu, LLAMA, and others. [MNN-LLM User guide](https://mnn-docs.readthedocs.io/en/latest/transformers/llm.html) + +[MNN-Diffusion](https://github.com/alibaba/MNN/tree/master/transformers/diffusion) is a stable diffusion model runtime solution developed based on the MNN engine. The mission of this project is to deploy stable diffusion models locally on everyone's platforms. [MNN-Diffusion User guide](https://mnn-docs.readthedocs.io/en/latest/transformers/diffusion.html) + ![architecture](doc/architecture.png) Inside Alibaba, [MNN](https://mp.weixin.qq.com/s/5I1ISpx8lQqvCS8tGd6EJw) works as the basic module of the compute container in the [Walle](https://mp.weixin.qq.com/s/qpeCETty0BqqNJV9CMJafA) System, the first end-to-end, general-purpose, and large-scale production system for device-cloud collaborative machine learning, which has been published in the top system conference OSDI’22. The key design principles of MNN and the extensive benchmark testing results (vs. TensorFlow, TensorFlow Lite, PyTorch, PyTorch Mobile, TVM) can be found in the OSDI paper. The scripts and instructions for benchmark testing are put in the path “/benchmark”. If MNN or the design of Walle helps your research or production use, please cite our OSDI paper as follows: @@ -26,7 +30,9 @@ Inside Alibaba, [MNN](https://mp.weixin.qq.com/s/5I1ISpx8lQqvCS8tGd6EJw) works a ## Documentation and Workbench -MNN's docs are in place in [Yuque docs here](https://www.yuque.com/mnn/en) and [Read the docs](https://mnn-docs.readthedocs.io/en/latest). +MNN's docs are in place in [Read the docs](https://mnn-docs.readthedocs.io/en/latest). + +You can also read docs/README to build docs's html. MNN Workbench could be downloaded from [MNN's homepage](http://www.mnn.zone), which provides pretrained models, visualized training tools, and one-click deployment of models to devices. 
diff --git a/README_CN.md b/README_CN.md index 1d5cbb042..d7095660d 100644 --- a/README_CN.md +++ b/README_CN.md @@ -6,6 +6,10 @@ [MNN](https://github.com/alibaba/MNN)是一个轻量级的深度神经网络引擎,支持深度学习的推理与训练。适用于服务器/个人电脑/手机/嵌入式各类设备。目前,MNN已经在阿里巴巴的手机淘宝、手机天猫、优酷等30多个App中使用,覆盖直播、短视频、搜索推荐、商品图像搜索、互动营销、权益发放、安全风控等场景。 +[MNN-LLM](https://github.com/alibaba/MNN/tree/master/transformers/llm)是基于MNN引擎开发的大语言模型运行方案,解决大语言模型在本地设备的高效部署问题(手机/个人电脑/嵌入式设备)。支持常见的千问/百川/智谱/LLAMA等大语言模型。使用教程:[MNN-LLM使用教程](https://mnn-docs.readthedocs.io/en/latest/transformers/llm.html) + +[MNN-Diffusion](https://github.com/alibaba/MNN/tree/master/transformers/diffusion)是基于MNN引擎开发的Stable Diffusion文生图模型运行方案,解决Stable Diffusion模型在本地设备的高效部署问题。使用教程:[MNN-Diffusion使用教程](https://mnn-docs.readthedocs.io/en/latest/transformers/diffusion.html) + ![架构图](doc/architecture.png) 在阿里巴巴中,[MNN](https://mp.weixin.qq.com/s/5I1ISpx8lQqvCS8tGd6EJw)被用作为[Walle](https://mp.weixin.qq.com/s/qpeCETty0BqqNJV9CMJafA)系统中计算容器的基础模块。Walle是首个端到端、通用型、规模化产业应用的端云协同机器学习系统,发表于操作系统顶会OSDI 2022。Walle的论文中解释了MNN的关键设计理念,并提供了MNN相对于其他深度学习框架(TensorFlow, TensorFlow Lite, PyTorch, PyTorch Mobile, TVM)的benchmark测试结果。相关测试脚本和说明文档被放在“/benchmark”目录下。如果MNN或Walle的设计对你的研究或生产有所助益,欢迎引用我们的OSDI论文: @@ -26,7 +30,9 @@ ## 文档与工作台 MNN文档: - [最新文档(readthedocs)](https://mnn-docs.readthedocs.io/en/latest/index.html) -- [语雀文档](https://www.yuque.com/mnn/cn) + +- 也可阅读 docs/README ,编译本地文档 + [MNN官网](http://www.mnn.zone)上还可以下载MNN团队全新力作MNN工作台,涵盖开箱即用模型、可视化训练等工具,更可以一键部署到多端设备。 diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md index f9927b4f9..9307038ad 100644 --- a/docs/compile/cmake.md +++ b/docs/compile/cmake.md @@ -40,7 +40,8 @@ MNN使用CMake构建项目,CMake中的宏定义列表如下: | MNN_VULKAN | 是否构建`Vulkan`后端,默认为`OFF` | | MNN_ARM82 | 编译ARM架构时,是否构建`Armv8.2`后端,以支持FP16计算,默认为`ON` | | MNN_ONEDNN | 是否使用`oneDNN`,默认为`OFF` | -| MNN_AVX512 | 是否构建`avx512`后端,默认为`OFF` | +| MNN_AVX2 | 在`MNN_USE_SSE`开启的基础上,是否增加AVX2指令的支持,默认为`ON` | +| MNN_AVX512 | 在`MNN_USE_SSE`和`MNN_AVX2`开启的基础上,是否增加`avx512`指令集的支持,默认为`OFF` | | MNN_CUDA | 是否构建`Cuda`后端,默认为`OFF` | | MNN_CUDA_PROFILE | 是否打开CUDA profile工具,默认为`OFF` | | MNN_CUDA_QUANT | 是否打开CUDA 量化文件编译,默认为`OFF` | @@ -85,3 +86,4 @@ MNN使用CMake构建项目,CMake中的宏定义列表如下: | MNN_SUPPORT_TRANSFORMER_FUSE | 是否支持Fuse Transformer相关OP实现,默认为 `OFF` | | MNN_BUILD_LLM | 是否构建基于MNN的llm库和demo,默认为`OFF` | | MNN_BUILD_DIFFUSION | 是否构建基于MNN的diffusion demo,需要打开MNN_BUILD_OPENCV和MNN_IMGCODECS宏使用 默认为`OFF` | +| MNN_KLEIDIAI | 是否集成ARM的klediAI加速库【目前处于实验状态,只能跑对称量化的LLM模型】,默认为`OFF` | diff --git a/docs/compile/engine.md b/docs/compile/engine.md index 200124725..71132c2ce 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -1,17 +1,17 @@ # 主库编译 默认编译产物为:`libMNN.so`,`express/libMNN_Express.so` ## Linux/MacOS -- 环境要求 +### 环境要求 - cmake >= 3.10 - gcc >= 4.9 或者使用 clang -- 相关编译选项 +### 相关编译选项 - `MNN_AVX512` 是否使用AVX512指令,需要gcc9以上版本编译 - `MNN_OPENCL` 是否使用OpenCL后端,针对GPU设备 - `MNN_METAL` 是否使用Metal后端,针对MacOS/iOSGPU设备 - `MNN_VULKAN` 是否使用Vulkan后端,针对GPU设备 - `MNN_CUDA` 是否使用CUDA后端,针对Nivida GPU设备 - 其他编译选项可自行查看 CMakeLists.txt -- 具体步骤 +### 具体步骤 1. 准备工作 (可选,修改 MNN Schema 后需要) ```bash cd /path/to/MNN @@ -22,6 +22,15 @@ ```bash mkdir build && cd build && cmake .. && make -j8 ``` +### Mac M1 上编译 +- Mac M1 较为特殊的一点是作为过渡期间的芯片支持Arm/x64双架构,一般需要额外指定来获取需要的架构 +- 在 cmake 步骤增加 `-DCMAKE_OSX_ARCHITECTURES=arm64` 可以编译出 Arm 架构的库,对应地编译 x64 架构时加 `-DCMAKE_OSX_ARCHITECTURES=x86_64`: + +``` +cd /path/to/MNN +mkdir build && cd build && cmake .. 
-DCMAKE_OSX_ARCHITECTURES=arm64 && make -j8 +``` + ## Windows(非ARM架构) - 环境要求 - Microsoft Visual Studio >= 2017 @@ -87,14 +96,23 @@ mkdir build_64 && cd build_64 && ../build_64.sh ``` ## iOS +可基于脚本编译或者基于xcode工程编译 + - 环境要求 - xcode + - cmake - 相关编译选项 - `MNN_METAL` 是否使用Metal后端,Metal后端可以利用GPU加速 - `MNN_COREML` 是否使用CoreML后端,CoreML后端可以利用ANE硬件加速 - `MNN_ARM82` 是否支持fp16推理,开启该编译选项后,在precision设成Precision_Low时,会在支持的设备(ARMv8.2 及以上架构)上启用低精度(fp16)推理,减少内存占用,提升性能 -- 具体步骤 - - 在macOS下,用Xcode打开project/ios/MNN.xcodeproj,点击编译即可 + +- 基于 xcode 编译:用Xcode打开project/ios/MNN.xcodeproj,点击编译即可,工程中默认打开上述所有编译选项 + +- 基于脚本编译:运行脚本并开启`MNN_ARM82`选项 +``` +sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true" +``` + ## 其他平台交叉编译 由于交叉编译的目标设备及厂商提供的编译环境类型众多,本文恕无法提供手把手教学。 以下是大致流程,请按照具体场景做相应修改。 交叉编译大致上分为以下两个步骤,即获取交叉编译器以及配置CMake进行交叉编译。 @@ -137,3 +155,49 @@ -DCMAKE_CXX_COMPILER=$cross_compile_toolchain/bin/aarch64-linux-gnu-g++ make -j4 ``` + +## Web + +- 可以把 MNN 源代码编译为 WebAssembly 以便在浏览器中使用 + +### 安装 emcc +参考 https://emscripten.org/docs/getting_started/downloads.html ,安装完成后并激活,此时可使用 emcmake + +### 编译(通用) +- 使用 emcmake cmake 替代 cmake ,然后 make 即可: +``` +mkdir build +cd build +emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DMNN_FORBID_MULTI_THREAD=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_USE_SSE=OFF +emmake make MNN -j16 +``` + +编译完成后产出 libMNN.a ,可在后续的 webassembly 程序中链接,链接时一般要添加 -s ALLOW_MEMORY_GROWTH=1 ,避免内存不足后 crash + +### SIMD 支持 + +- 如果确认目标设备支持Web Simd ,在cmake时加上 -msimd128 -msse4.1 ,可以较大提升性能,eg: +``` +mkdir build +cd build +emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TEST=true -DCMAKE_CXX_FLAGS="-msimd128 -msse4.1" -DMNN_FORBID_MULTI_THREAD=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_USE_SSE=ON +emmake make MNN -j16 +``` + +### 测试 +由于Web上文件系统不一致,建议只编译run_test.out运行,其他测试工具需要加上--preload-file {dir} + +- 编译示例 + +``` +mkdir build +cd build +emcmake cmake .. -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_TEST=true -DCMAKE_CXX_FLAGS="-msimd128 -msse4.1 -s ALLOW_MEMORY_GROWTH=1" -DMNN_FORBID_MULTI_THREAD=ON -DMNN_USE_THREAD_POOL=OFF -DMNN_USE_SSE=ON +emmake make -j16 +``` + +- 运行 +``` +node run_test.out.js speed/MatMulBConst //测试性能 +node run_test.out.js //测试功能 +``` diff --git a/docs/contribute/op.md b/docs/contribute/op.md index 7a28397de..c106605f5 100644 --- a/docs/contribute/op.md +++ b/docs/contribute/op.md @@ -335,33 +335,22 @@ REGISTER_METAL_OP_CREATOR(MetalMyCustomOpCreator, OpType_MyCustomOp); 重新运行一下 CMake ,或者手动在Xcode工程中新加文件 ### 添加Vulkan实现 -1. 添加Shader -在`source/backend/vulkan/execution/glsl`目录下添加具体的shader(*.comp)。若输入内存布局为`NC4HW4`,则按`image`实现,否则采用buffer实现。可以参考目录下已有实现。然后,执行`makeshader.py`脚本编译Shader。 - -2. 实现类声明 -在目录`source/backend/vulkan/execution/`下添加`VulkanMyCustomOp.hpp`和`VulkanMyCustomOp.cpp`: -```cpp -class VulkanMyCustomOp : public VulkanBasicExecution { -public: - VulkanMyCustomOp(const Op* op, Backend* bn); - virtual ~VulkanMyCustomOp(); - ErrorCode onEncode(const std::vector& inputs, - const std::vector& outputs, - const VulkanCommandPool::Buffer* cmdBuffer) override; -private: - // GPU Shader所需的参数 - std::shared_ptr mConstBuffer; - // Pipeline - const VulkanPipeline* mPipeline; - // Layout Descriptor Set - std::shared_ptr mDescriptorSet; -}; -``` - -3. 实现 -实现函数`onEncode`,首先需要做内存布局检查:若为`NC4HW4`,则Shader用image实现,否则用buffer。执行完毕返回NO_ERROR。 - -4. 注册实现类 +Vulkan后端当前包含两种张量存储类型:buffer与image。开发者可在编译时通过宏`MNN_VULKAN_IMAGE`自行选择需要的存储类型。当开发者需要为Vulkan后端添加算子时,亦需要考虑选择何种存储类型并在相应目录下进行开发。下以image类型为例,阐述为Vulkan后端添加算子的主要流程。 + +1. 
实现Execution +- 执行脚本`source/backend/vulkan/image/compiler/VulkanCodeGen.py`,该脚本将向`source/backend/vulkan/image/execution`中添加`VulkanMyOp.hpp`与`VulkanMyOp.cpp`的模版代码 +- 实现构造函数 + - 从CPU中读取常量参数,并写入GPU中 + - 创建算子所需的pipeline + - 确定要使用的shader以及Macro + - set descriptorTypes,即确定shader中用到的显存对象的类型 + - 调用getPipeline接口 +- 实现onEncode + - 显存资源申请并更新descriptorSet,将shader中需要读写的显存对象写入descriptorSet + - 添加memoryBarrier + - 把pipeline绑到cmdBuffer与descriptorSet + - command dispatch +- 注册算子并添加创建类 ```cpp class VulkanMyCustomOpCreator : public VulkanBackend::Creator { public: @@ -377,6 +366,15 @@ static bool gResistor = []() { }(); ``` +2. 实现shader及编译 +- 编写Compute Shader文件`myOp.comp`,添加至目录`source/backend/vulkan/image/execution/glsl` +- 将算子中用到的宏加入`source/backend/vulkan/image/execution/glsl/macro.json` +- 执行脚本`source/backend/vulkan/image/compiler/makeshader.py`,该脚本将编译`myOp.comp`,并更新`source/backend/vulkan/image/compiler/AllShader.cpp`、`source/backend/vulkan/image/shaders/AllShader.h`以及`source/backend/vulkan/image/compiler/VulkanShaderMap.cpp` +> MNN Vulkan当前使用glslangValidator(glslang仓库地址:,版本号:12.2.0,commit id:d1517d64cfca91f573af1bf7341dc3a5113349c0)编译所有的compute shader。开发者如需保持自行编译后得到的二进制编译结果与MNN仓库中现有的编译结果一致,需要确保环境中的glslang的版本与MNN所使用的一致。 + + + + ### 添加OpenCL实现 1. 添加Kernel 在`source/backend/opencl/execution/cl`目录添加具体的kernel(*.cl)。目前feature map均使用`image2d`实现。可以参考目录下已有实现。然后执行`opencl_codegen.py`来生成kernel映射。 diff --git a/docs/faq.md b/docs/faq.md index db7241f12..f2abc7f0c 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -3,6 +3,8 @@ - [模型转换后结果与其他框架不一致](faq.html#id8) - [compute shape error](faq.html#compute-shape-error-for-xxx) - [模型转换时有Error信息](faq.html#reshape-error) +- [模型转换加上fp16没有性能提升](faq.html#fp16) +- [如何开启动态量化](faq.html#weightquantbits) - [模型量化后为什么比浮点慢](faq.html#id14) - [输入输出的elementSize与实际有区别](faq.html#tensor-elementsize) - [MNN模型如何加密](faq.html#id18) @@ -112,6 +114,14 @@ opConverter ==> MNN Converter NOT_SUPPORTED_OP: [ ANY_OP_NAME ] ### 模型转换后与原框架结果不一致 先使用MNN中的模型一致性验证脚本进行测试,确定不是调用方法或其他错误,[使用方法](./tools/convert.html#id3) +### 模型转换加上fp16后没有性能提升 +此功能只支持压缩模型数据,在运行时仍然先解压到float32运算。如果希望使用 fp16 加速,打开 `MNN_ARM82` 并在加载模型时设置 precision = low + +### 模型转换加上weightQuantBits后如何进行加速 +可以通过动态量化功能,加载仅权重量化的模型,降低内存占用和提升性能 +1. 打开 `MNN_LOW_MEMORY` 编译宏编译 MNN (支持动态量化功能) +2. 使用 mnn 模型时 memory 设成 low + ## Pymnn ### import MNN 出现 import numpy error 临时解决方案:升级 numpy 版本到 1.20.0 或以上 @@ -169,10 +179,10 @@ const float* outputPtr = output->readMap(); ### Android 设备无法查看日志 Android 系统有两类打印日志的方式: printf 和 logcat. 默认 MNN 的编译脚本使用 printf,这样方便在命令行中调试。集成到 App 上时,用 cmake -DMNN_USE_LOGCAT=ON 将打印日志的方式改成 logcat 即可用 adb logcat 查看 -### + ### 如何增加 opencl so 地址? 
MNN opencl 后端默认采用 dlopen 的方式动态打开设备的 opencl 驱动,相应位置若找不到您设备上的驱动,请修改 **OpenCLWrapper.cpp** -### + ### TensorArray Op 与 Switch / Merge 控制流支持 TensorArray 和控制流支持需要借助 MNN-Express , 请参考 demo/exec/transformerDemo.cpp 的接口使用 @@ -284,6 +294,7 @@ GPU 后端调用 copy 的时间包含两个部分 - x64 + vnni 指令,量化计算有 sdot 指令,明显快于 FP32 ,编译 MNN 时需要开启 MNN_AVX512 以支持这个指令,一般相比 AVX512 的浮点运算快 30% - ARM v7a / ARMv8 :量化计算采用 int8 乘加到 int16,再双加到 int32 的方式,计算效率略快于浮点(一般 30% 左右提升)。 - ARMv8.2 架构有 sdot 指令,但同时 FP32 相对之前架构发射数也提升了一倍,也支持了比 FP32 快一倍的 FP16 向量计算指令,MNN 会检查设备架构以开启 sdot / smmla ,理想情况下量化计算性能比 FP32 快1倍以上,比 FP16 快 20%。 + - ARMv8.6 架构有 smmla 指令,理想情况下量化计算性能比 FP32 快3倍以上,比 FP16 快1倍以上,比 BF16 快 20%。 ## 其他问题 ### MNN模型如何加密 diff --git a/docs/index.rst b/docs/index.rst index 174a53cf3..0e9264da3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -58,7 +58,6 @@ train/expr train/data train/optim - train/quant train/finetune train/distl @@ -69,6 +68,7 @@ transformers/diffusion transformers/llm + transformers/models .. toctree:: :maxdepth: 1 @@ -78,7 +78,6 @@ tools/convert tools/test tools/benchmark - tools/quant tools/compress tools/visual tools/python diff --git a/docs/inference/session.md b/docs/inference/session.md index 02cd651a3..2e5e0b6fc 100644 --- a/docs/inference/session.md +++ b/docs/inference/session.md @@ -270,7 +270,16 @@ const std::map& getSessionInputAll(const Session* session) 在只有一个输入tensor时,可以在调用`getSessionInput`时传入NULL以获取tensor。 -### 拷贝数据 +### 【推荐】映射填充数据 +**映射输入Tensor的内存,部分后端可以免数据拷贝** +```cpp +auto input = interpreter->getSessionInput(session, NULL); +void* host = input->map(MNN::Tensor::MAP_TENSOR_WRITE, input->getDimensionType()); +// fill host memory data +input->unmap(MNN::Tensor::MAP_TENSOR_WRITE, input->getDimensionType(), host); +``` + +### 【不推荐】拷贝填充数据 NCHW示例,适用 ONNX / Caffe / Torchscripts 转换而来的模型: ```cpp auto inputTensor = interpreter->getSessionInput(session, NULL); @@ -293,7 +302,7 @@ delete nhwcTensor; 通过这类拷贝数据的方式,用户只需要关注自己创建的tensor的数据布局,`copyFromHostTensor`会负责处理数据布局上的转换(如需)和后端间的数据拷贝(如需)。 -### 直接填充数据 +### 【不推荐】直接填充数据 ```cpp auto inputTensor = interpreter->getSessionInput(session, NULL); inputTensor->host()[0] = 1.f; @@ -549,8 +558,16 @@ const std::map& getSessionOutputAll(const Session* session **注意:当`Session`析构之后使用`getSessionOutput`获取的`Tensor`将不可用** -### 拷贝数据 -**不熟悉MNN源码的用户,必须使用这种方式获取输出!!!** +### 【推荐】映射输出数据 +**映射输出Tensor的内存数据,部分后端可以免数据拷贝** +```cpp +auto outputTensor = net->getSessionOutput(session, NULL); +void* host = outputTensor->map(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType()); +// use host memory by yourself +outputTensor->unmap(MNN::Tensor::MAP_TENSOR_READ, outputTensor->getDimensionType(), host); +``` +### 【不推荐】拷贝输出数据 +**采用纯内存拷贝的方式,拷贝需要花费时间** NCHW (适用于 Caffe / TorchScript / Onnx 转换而来的模型)示例: ```cpp auto outputTensor = interpreter->getSessionOutput(session, NULL); @@ -577,7 +594,7 @@ delete nhwcTensor; -### 直接读取数据 +### 【不推荐】直接读取数据 **由于绝大多数用户都不熟悉MNN底层数据布局,所以不要使用这种方式!!!** ```cpp auto outputTensor = interpreter->getSessionOutput(session, NULL); diff --git a/docs/tools/compress.md b/docs/tools/compress.md index 0d1e20628..b98a0371a 100644 --- a/docs/tools/compress.md +++ b/docs/tools/compress.md @@ -1,11 +1,13 @@ -# 模型压缩工具箱 +# 模型压缩 / 模型量化 ## 介绍 ### 是什么? -MNN模型压缩工具箱提供了包括低秩分解、剪枝、量化等模型压缩算法的实现,并且MNN进一步实现了其中一些需要软件特殊实现的算法(如稀疏计算和量化)的底层计算过程,因此,此工具箱需要配合MNN推理框架来使用。 -具体来说,MNN压缩工具箱包含两个组成部分: -1. **MNN框架自身提供的压缩工具**(输入MNN模型,输出MNN模型) -2. 
**mnncompress**(基于主流训练框架TF/Pytorch的模型压缩工具)。 +MNN模型压缩工具提供了包括低秩分解、剪枝、量化等模型压缩算法的实现,并且MNN进一步实现了其中一些需要软件特殊实现的算法(如稀疏计算和量化)的底层计算过程,因此,此工具箱需要配合MNN推理框架来使用。 +具体来说,MNN压缩工具/量化工具包含三个部分,使用复杂度逐步上升: +1. **模型转换工具中的压缩功能**(只实现权值量化,在模型转换过程中增加参数即可实现) +2. **离线量化工具**(实现权值量化及特征量化,需要少量测试数据) +3. **mnncompress**(基于主流训练框架TF/Pytorch的模型压缩工具,需要训练数据和对应的训练框架环境)。 + ### 有什么? 目前提供的能力如下表所示: @@ -26,64 +28,79 @@ MNN模型压缩工具箱提供了包括低秩分解、剪枝、量化等模型 | 训练量化 | 将float卷积转换为int8卷积计算,需要进行训练,可提高量化模型精度,降低存储量到原始模型的四分之一,降低内存,加速计算(某些模型可能会比float模型慢,因为float的优化方法和int8不同) | LSQ,OAQ,WAQ | | 直接权值量化 | 仅将模型中的权值进行量化,计算时还原为float进行计算,因此仅减少模型存储量,计算速度和float相同,可以在模型转换时一键完成,8bit量化情况下,精度基本不变,模型大小减小到原来的1/4 | 对称量化,非对称量化 | | 训练权值量化 | 特点同直接权值量化,但通过mnncompress压缩算法插件实现,因而可以提供更低比特的权值量化,以减少更多的存储量,并提高权值量化之后模型的精度,例如4bit量化情况下,模型大小减小到原来的1/8 | 对称量化 | -| FP16 | 将FP32计算转换为FP16计算,可在模型转换时一键完成,模型大小减小为原来的1/2,精度基本无损,并提高计算速度(需要硬件支持FP16计算) | - | +| FP16 | 将FP32计算转换为FP16计算,可在模型转换时一键完成,模型大小减小为原来的1/2,精度基本无损 | - | ### 怎么用? -1. 如果只想使用离线压缩方法,可以将模型转换为MNN模型之后使用对应的工具进行压缩。这类压缩算法不需要进行训练finetune,所以通常运行得很快。 -2. 如果离线压缩方法的精度不满足要求,且能够进行训练finetune的话,可以使用**mnncompress**中提供的压缩算法插件将原始模型进行压缩,得到压缩之后的模型和压缩信息描述文件,然后将这两个文件输入到MNN模型转换工具得到最终的MNN压缩模型。需要训练的压缩算法可以提供更好的精度,但需要一定的时间进行finetune训练,此finetune训练需要的时间一般比模型从0开始训练要少很多。 -3. 这些算法中有些是可以叠加使用的,以取得更好的压缩效果。推荐使用pipeline(**其中方框中的算法均为可选,叠加压缩算法若精度不好,可选择使用**): +1. 使用模型转换工具中的压缩功能无需额外数据,只要在模型转换时加对应参数即可,开启动态量化功能后也可以对卷积等计算量大的算子实现量化加速。 +2. 使用离线量化可以使大部分算子支持量化加速,这个可以将模型转换为MNN模型之后使用离线量化工具进行压缩,需要少量测试数据,但不需要进行训练finetune,通常运行得很快。 +3. 如果离线压缩方法的精度不满足要求,且能够进行训练finetune的话,可以使用**mnncompress**中提供的压缩算法插件将原始模型进行压缩,得到压缩之后的模型和压缩信息描述文件,然后将这两个文件输入到MNN模型转换工具得到最终的MNN压缩模型。需要训练的压缩算法可以提供更好的精度,但需要一定的时间进行finetune训练,此finetune训练需要的时间一般比模型从0开始训练要少很多。 +4. 这些算法中有些是可以叠加使用的,以取得更好的压缩效果。推荐使用pipeline(**其中方框中的算法均为可选,叠加压缩算法若精度不好,可选择使用**): ![](../_static/images/tools/mnncompress.jpg) -## MNN框架自身提供的压缩工具 -### 使用方法 -MNN框架压缩工具是基于离线量化工具和MNN转换工具来实现压缩功能的,这两个工具均提供c++版本和python版本,安装方式如下: +## 使用模型转换工具的压缩功能 + +### 模型转换工具安装 - c++工具安装 - 需要源码编译MNN转换工具 `MNNConvert` 和量化工具 `quantized.out` + 源码编译MNN转换工具 `MNNConvert` ```bash cd build - cmake .. -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_QUANTOOLS=ON - make -j 8 + cmake .. 
-DMNN_BUILD_CONVERTER=ON + make -j8 ``` - python工具安装 ```bash - # 外部版本MNN,外网安装方式 pip install MNN - # 外部版本MNN,集团内安装方式 - pip install --index-url https://pypi.antfin-inc.com/simple/ -U MNN - # 内部版本MNN - pip install --index-url https://pypi.antfin-inc.com/simple/ -U MNN-Internal # 安装之后,命令行中将有如下工具: mnn:显示MNN命令行工具 mnnconvert:转换器 MNNConvert 的预编译工具,功能同 MNNConvert mnnquant:量化工具 quantized.out 的预编译工具,功能同 quantized.out ``` -### MNN离线量化工具 -#### 原理 -将float卷积转换为int8卷积进行计算(仅量化卷积,建议将FC转为1*1卷积实现),同时会通过MNN几何计算机制将量化信息在网络中进行传播,以支持尽可能多的算子的量化计算。模型大小减少为原始模型的1/4,并减少内存,提高推理速度(某些模型可能量化之后变慢,因为float的计算可以使用winograd、strassen等优化算法,而离线量化的int8计算并没有这些优化,如果要使用int8量化的特殊优化,如OAQ、WAQ等,需要使用mnncompress)。 -#### 单输入、图片输入模型的量化 -这类模型可以使用 `quantized.out`(或`mnnquant`)进行量化,使用文档在:[quantized.out](quant.md),[mnnquant.md](python.html#mnnquant) -#### 通用模型的量化 -通用模型量化工具可以支持任意输入和任意输入类型的模型的量化,基于MNN python包,使用文档在:[MNNPythonOfflineQuant](https://github.com/alibaba/MNN/tree/master/tools/MNNPythonOfflineQuant) - -**注意:**`calibration_dataset.py`中`__getitem__`返回为一个输入sample,其形状不应该包含batch维度,在量化时我们会根据工具命令行中传入的batch参数,stack出一个batch的数据,但我们默认batch维度在第一维,所以,如果你的某个输入的batch维不在第一维,你需要在你对应的输入之前加一个transpose。 -### MNN权值量化工具 -#### 原理 -仅将模型中卷积的float权值量化为int8存储,推理时反量化还原为float权值进行计算。因此,其推理速度和float模型一致,但是模型大小可以减小到原来的1/4,可以通过模型转换工具一键完成,比较方便。推荐float模型性能够用,仅需要减少模型大小的场景使用。 -#### 使用方法 -使用`MNNConvert`(c++)或者`mnnconvert`(python包中自带)进行转换,转换命令行中加上下述选项即可: + +### 权值量化 +- 仅将模型中卷积的float权值量化为int8存储,在不开启动态量化功能的情况下,推理时反量化还原为float权值进行计算。因此,其推理速度和float模型一致,但是模型大小可以减小到原来的1/4,可以通过模型转换工具一键完成,比较方便,推荐优先使用。 +- 使用`MNNConvert`(c++)或者`mnnconvert`(python包中自带)进行转换,转换命令行中加上下述选项即可: ```bash ---weightQuantBits 8 [--weightQuantAsymmetric](可选) +--weightQuantBits 8 [--weightQuantAsymmetric](可选) [--weightQuantBlock 128](可选) ``` `--weightQuantAsymmetric` 选项是指使用非对称量化方法,精度要比默认的对称量化精度好一些。 -### MNN FP16压缩工具 -#### 原理 -将模型中FP32权值转换为FP16存储,并在支持的设备上开启FP16推理,可以获得推理加速,并且速度减少到原来的1/2。可以在模型转换时一键完成,使用方便。 -#### 使用方法 -使用`MNNConvert`(c++)或者`mnnconvert`(python包中自带)进行转换,转换命令行中加上下述选项即可: +`--weightQuantBlock 128` 表示以128为单位进行量化,如不设置则以输入通道数为单位进行量化。如果牺牲一些存储大小来提升量化精度,可以增加这个设置,理论上越小精度越高,但建议不要低于32。 +- 动态量化 +可以通过如下方式打开MNN运行时的动态量化支持,使权值量化后的模型中卷积等核心算子使用量化计算,降低内存并提升性能 +1. 打开 MNN_LOW_MEMORY 编译宏编译 MNN (支持动态量化功能) +2. 使用 mnn 模型时 memory mode 设成 low + +### FP16压缩 +- 将模型中FP32权值转换为FP16存储,并在支持的设备上开启FP16推理,可以获得推理加速,并且速度减少到原来的1/2。可以在模型转换时一键完成,使用方便。 +- 使用`MNNConvert`(c++)或者`mnnconvert`(python包中自带)进行转换,转换命令行中加上下述选项即可: ```bash --fp16 ``` + +## 离线量化工具 +### 离线量化工具安装 +- c++工具安装 + + 需要源码编译量化工具 `quantized.out` + ```bash + cd build + cmake .. 
-DMNN_BUILD_QUANTOOLS=ON + make -j8 + ``` +- python工具安装 + ```bash + pip install MNN + # 安装之后,命令行中将有如下工具: + mnn:显示MNN命令行工具 + mnnconvert:转换器 MNNConvert 的预编译工具,功能同 MNNConvert + mnnquant:量化工具 quantized.out 的预编译工具,功能同 quantized.out + ``` + +### 离线量化原理 + +将float卷积转换为int8卷积进行计算(仅量化卷积,建议将FC转为1*1卷积实现),同时会通过MNN几何计算机制将量化信息在网络中进行传播,以支持尽可能多的算子的量化计算。模型大小减少为原始模型的1/4,并减少内存,提高推理速度(某些模型可能量化之后变慢,因为float的计算可以使用winograd、strassen等优化算法,而离线量化的int8计算并没有这些优化,如果要使用int8量化的特殊优化,如OAQ、WAQ等,需要使用mnncompress)。 +可以使用 `quantized.out`(或`mnnquant`)进行量化,使用文档在:[quantized.out](quant.md),[mnnquant.md](python.html#mnnquant) + ## mnncompress ### 使用方法 #### 安装 diff --git a/docs/tools/convert.md b/docs/tools/convert.md index b815405bf..bc869abab 100644 --- a/docs/tools/convert.md +++ b/docs/tools/convert.md @@ -31,7 +31,7 @@ Usage: --MNNModel arg 转换之后保存的MNN模型文件名, ex: *.mnn --fp16 将conv/matmul/LSTM的float32参数保存为float16, - 模型将减小一半,精度基本无损 + 模型将减小一半,精度基本无损,运行速度和float32模型一致 --bizCode arg MNN模型Flag, ex: MNN @@ -41,7 +41,7 @@ Usage: --weightQuantBits arg arg=2~8,此功能仅对conv/matmul/LSTM的float32权值进行量化, 仅优化模型大小,加载模型后会解码为float32,量化位宽可选2~8, - 运行速度和float32模型一致。8bit时精度基本无损,模型大小减小4倍 + 不开启动态量化的情况下,运行速度和float32模型一致。8bit时精度基本无损,模型大小减小4倍 default: 0,即不进行权值量化 --weightQuantAsymmetric 与weightQuantBits结合使用,决定是否用非对称量化,默认为`true` @@ -77,7 +77,9 @@ Usage: --detectSparseSpeedUp arg 可选值:{0, 1}, 默认为1, 会检测权重是否使用稀疏化加速 - --saveExternalData 将权重,常量等数据存储在额外文件中,默认为`false` + --saveExternalData 将权重,常量等数据存储在额外文件中,默认为0,也就是`false` + + --useGeluApproximation 在进行Gelu算子合并时,使用Gelu的近似算法,默认为1 ,也就是`true` ``` diff --git a/docs/tools/quant.md b/docs/tools/quant.md index 0e4e733c9..f61291797 100644 --- a/docs/tools/quant.md +++ b/docs/tools/quant.md @@ -1,9 +1,8 @@ -# 单输入模型离线量化工具 +# 离线量化工具(输入少量数据量化) `./quantized.out origin.mnn quan.mnn imageInputConfig.json` MNN quantized.out工具已支持通用(任意输入个数、维度、类型)模型离线量化, 但这里的多输入模型仅仅支持非图片输入类模型。 -MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查看[文档](https://mnn-docs.readthedocs.io/en/latest/tools/compress.html)选择使用 ## 参数 - 第一个参数为原始模型文件路径,即待量化的浮点模 @@ -31,7 +30,7 @@ MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查 |--------------------|------| | KL | 使用KL散度进行特征量化系数的校正,一般需要100 ~ 1000张图片(若发现精度损失严重,可以适当增减样本数量,特别是检测/对齐等回归任务模型,样本建议适当减少) | | ADMM | 使用ADMM(Alternating Direction Method of Multipliers)方法进行特征量化系数的校正,一般需要一个batch的数据 | -| EMA | 使用指数滑动平均来计算特征量化参数,这个方法会对特征进行非对称量化,精度可能比上面两种更好。这个方法也是[MNNPythonOfflineQuant](https://github.com/alibaba/MNN/tree/master/tools/MNNPythonOfflineQuant)的底层方法,建议使用这个方法量化时,保留你pb或onnx模型中的BatchNorm,并使用 --forTraining 将你的模型转到MNN,然后基于此带BatchNorm的模型使用EMA方法量化。另外,使用这个方法时batch size应设置为和训练时差不多最好。 | +| EMA | 使用指数滑动平均来计算特征量化参数,这个方法会对特征进行非对称量化,精度可能比上面两种更好。使用这个方法时batch size应设置为和训练时差不多最好。| | weight_quantize_method | 说明 | |--------------------|------| @@ -39,10 +38,12 @@ MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查 | ADMM | 使用ADMM方法进行权值量化 | ## 多输入模型的参数设置的特别说明(MNN现阶段仅支持输入数据类型是非图片的多输入模型) + | 需要特别指定的参数 | 设置值 | |--------------------|------| | input_type | `str`:输入数据的类型,"sequence" | -| path | `str`:存放校正特征量化系数的输入数据目录 |, +| path | `str`:存放校正特征量化系数的输入数据目录 | + 例如在quant.json文件中 "path": "/home/data/inputs_dir/",你所构造的矫正数据集有两个,分别存放在input_0和input_1子目录下,即"/home/data/inputs_dir/input_0"和"/home/data/inputs_dir/input_1".由GetMNNInfo工具可以得到模型的输入输出名称,例如该模型的输入有三个:data0, data1, data2,输出有两个:out1, out2. 那么在input_0和input_1子目录下分别有六个文件:data0.txt, data1.txt, data2.txt, out1.txt, out2.txt, input.json. 
其中的五个文件名要和模型的输入输出名对应,最后一个input.json文件则描述的是输入名和对应的shape内容: ```json { diff --git a/docs/tools/test.md b/docs/tools/test.md index 02c2d3df0..5613ff121 100644 --- a/docs/tools/test.md +++ b/docs/tools/test.md @@ -32,7 +32,7 @@ Model Version: < 2.0.0 - `runMask:int` 是否输出推理中间结果,0为不输出,1为只输出每个算子的输出结果({op_name}.txt);2为输出每个算子的输入(Input_{op_name}.txt)和输出({op_name}.txt)结果; 默认输出当前目录的output目录下(使用工具之前要自己建好output目录); 16为开启自动选择后端;32为针对Winograd算法开启内存优化模式,开启后会降低模型(如果含有Winograd Convolution算子)运行时的内存但可能会导致算子的性能损失。可选,默认为`0` - `forwardType:int` 执行推理的计算设备,有效值为:0(CPU)、1(Metal)、2(CUDA)、3(OpenCL)、6(OpenGL),7(Vulkan) ,9 (TensorRT),可选,默认为`0` - `numberThread:int` 线程数仅对CPU有效,可选,默认为`4` -- `precision_memory:int` 测试精度与内存模式,precision_memory % 16 为精度,有效输入为:0(Normal), 1(High), 2(Low), 3(Low_BF16),可选,默认为`2` ; precision_memory / 16 为内存设置,默认为 0 (memory_normal) 。例如测试 memory 为 low (2) ,precision 为 1 (high) 时,设置 precision_memory = 9 (2 * 4 + 1) +- `precision_memory:int` 测试精度与内存模式,precision_memory % 4 为精度,有效输入为:0(Normal), 1(High), 2(Low), 3(Low_BF16),可选,默认为`2` ; (precision_memory / 4) % 4 为内存设置,默认为 0 (memory_normal) 。例如测试 memory 为 low (2) ,precision 为 1 (high) 时,设置 precision_memory = 9 (2 * 4 + 1) - `inputSize:str` 输入tensor的大小,输入格式为:`1x3x224x224`,可选,默认使用模型默认输入 @@ -480,7 +480,7 @@ GPU 内存输入测试用例 - `testmode:int` 默认为 0 ,测试输入GPU内存的类型,0 (OpenCL Buffer) 、 1(OpenGL Texture) - `forwardType:int` 执行推理的计算设备,有效值为:0(CPU)、1(Metal)、2(CUDA)、3(OpenCL)、6(OpenGL),7(Vulkan) ,9 (TensorRT),可选,默认为`0` - `numberThread:int` GPU的线程数,可选,默认为`1` -- `precision_memory:int` 测试精度与内存模式,precision_memory % 16 为精度,有效输入为:0(Normal), 1(High), 2(Low), 3(Low_BF16),可选,默认为`2` ; precision_memory / 16 为内存设置,默认为 0 (memory_normal) 。例如测试 memory 为 2(low) ,precision 为 1 (high) 时,设置 precision_memory = 9 (2 * 4 + 1) +- `precision_memory:int` 测试精度与内存模式,precision_memory % 4 为精度,有效输入为:0(Normal), 1(High), 2(Low), 3(Low_BF16),可选,默认为`0` ; (precision_memory / 4) % 4 为内存设置,默认为 0 (memory_normal) 。 (precision_memory / 16) % 4 为功耗设置,默认为0(power_normal)。例如测试 memory 为 2(low) ,precision 为 1 (high) 时,设置 precision_memory = 9 (2 * 4 + 1) ## 在Android中使用测试工具 diff --git a/docs/train/quant.md b/docs/train/quant.md deleted file mode 100644 index 70ef17122..000000000 --- a/docs/train/quant.md +++ /dev/null @@ -1,100 +0,0 @@ -# 训练量化 -## 什么是训练量化 -与离线量化不同,训练量化需要在训练中模拟量化操作的影响,并通过训练使得模型学习并适应量化操作所带来的误差,从而提高量化的精度。因此训练量化也称为Quantization-aware Training(QAT),意指训练中已经意识到此模型将会转换成量化模型。 - -## 如何在MNN中使用训练量化 -已经通过其他训练框架如TensorFlow、Pytorch等训练得到一个float模型,此时可以通过先将此float模型通过MNNConverter转换为MNN统一的模型格式,然后使用MNN提供的离线量化工具直接量化得到一个全int8推理模型。如果此模型的精度不满足要求,则可以通过训练量化来提高量化模型的精度。 - -使用步骤: -1. 首先通过其他训练框架训练得到原始float模型; -2. 编译MNNConverter模型转换工具; -3. 使用MNNConverter将float模型转成MNN统一格式模型,因为要进行再训练,建议保留BN,Dropout等训练过程中会使用到的算子,这可以通过MNNConverter的 --forTraining 选项实现; -4. 参考MNN_ROOT/tools/train/source/demo/mobilenetV2Train.cpp 中的 MobilenetV2TrainQuant demo来实现训练量化的功能,下面以MobilenetV2的训练量化为例,来看一下如何读取并将模型转换成训练量化模型 -5. 
观察准确率变化,代码保存下来的模型即为量化推理模型 -```cpp -// mobilenetV2Train.cpp -// 读取转换得到的MNN float模型 -auto varMap = Variable::loadMap(argv[1]); -if (varMap.empty()) { - MNN_ERROR("Can not load model %s\n", argv[1]); - return 0; -} -// 指定量化比特数 -int bits = 8; -if (argc > 6) { - std::istringstream is(argv[6]); - is >> bits; -} -if (1 > bits || bits > 8) { - MNN_ERROR("bits must be 2-8, use 8 default\n"); - bits = 8; -} -// 获得模型的输入和输出 -auto inputOutputs = Variable::getInputAndOutput(varMap); -auto inputs = Variable::mapToSequence(inputOutputs.first); -auto outputs = Variable::mapToSequence(inputOutputs.second); - -// 扫描整个模型,并将inference模型转换成可训练模型,此时得到的模型是可训练的float模型 -std::shared_ptr model(PipelineModule::extract(inputs, outputs, true)); -// 将上面得到的模型转换成训练量化模型,此处指定量化bit数 -PipelineModule::turnQuantize(model.get(), bits); -// 进行训练,观察训练结果,保存得到的模型即是量化模型 -MobilenetV2Utils::train(model, 1001, 1, trainImagesFolder, trainImagesTxt, testImagesFolder, testImagesTxt); -``` -## MNN训练量化原理 -MNN训练量化的基本原理如下图所示 -![image.png](https://cdn.nlark.com/yuque/0/2020/png/405909/1582775538889-77cfe824-3f07-4456-a99e-b529ce888243.png#height=523&id=t2nNB&name=image.png&originHeight=1456&originWidth=1078&originalType=binary&size=590394&status=done&style=none&width=387) -以int8量化为例,首先要理解全int8推理的整个过程,全int8推理,即feature要量化为int8,weight和bias也要量化为int8,输出结果可以是float或者是int8,视该卷积模块的后面一个op的情况而定。而训练量化的本质就是在训练的过程中去模拟量化操作的影响,借由训练来使得模型学习并适应这种影响,以此来提高最后量化模型的准确率。 -因此在两种 FakeQuant 模块中,我们的主要计算为 -![image.png](https://cdn.nlark.com/yuque/0/2020/png/405909/1582775538909-a701341d-ced6-48ad-9df3-d90b7d1cca36.png#height=538&id=thJFB&name=image.png&originHeight=1076&originWidth=632&originalType=binary&size=203698&status=done&style=none&width=316) -对于权值和特征的fake-quant基本都和上图一致,不一样的是对于特征由于其范围是随输入动态变化的,而最终int8模型中必须固定一个对于输入特征的scale值,所以,我们对每一此前向计算出来的scale进行了累积更新,例如使用滑动平均,或者直接取每一次的最大值。对于权值的scale,则没有进行平均,因为每一次更新之后的权值都是学习之后的较好的结果,没有状态保留。 -此外,对于特征,我们提供了分通道(PerChannel)或者不分通道(PerTensor)的scale统计方法,可根据效果选择使用。对于权值,我们则使用分通道的量化方法,效果较好。 - -上述是在训练中的training阶段的计算过程,在test阶段,我们会将BatchNorm合进权值,使用训练过程得到的特征scale和此时权值的scale(每次重新计算得到)对特征和权值进行量化,并真实调用MNN中的 _FloatToInt8 和 _Int8ToFloat 来进行推理,以保证测试得到的结果和最后转换得到的全int8推理模型的结果一致。 - -最后保存模型的时候会自动保存test阶段的模型,并去掉一些冗余的算子,所以直接保存出来即是全int8推理模型。 - -## 训练量化结果 -目前我们在Lenet,MobilenetV2,以及内部的一些人脸模型上进行了测试,均取得了不错的效果,下面给出MobilenetV2的一些详细数据 - -| | 准确率 / 模型大小 | -| --- | --- | -| 原始float模型 | 72.324% / 13M | -| MNN训练量化int8模型 | 72.456% / 3.5M | -| TF训练量化int8模型 | 71.1% / 3.5M (原始 71.8% / 13M) | - - -上述数据是使用batchsize为32,训练100次迭代得到的,即仅使用到了3200张图片进行训练量化,在ImageNet验证集5万张图片上进行测试得到。可以看到int8量化模型的准确率甚至比float还要高一点,而模型大小下降了73%,同时还可以得到推理速度上的增益。 - -【注】此处使用到的float模型为TensorFlow官方提供的模型,但官方给出的准确率数据是71.8%,我们测出来比他们要高一点,原因是因为我们使用的预处理代码上有细微差别所致。 - -## 使用训练量化的一些建议 - -1. 模型转换时保留BatchNorm和Dropout等训练中会用到的算子,这些算子对训练量化也有帮助 -2. 要使用原始模型接近收敛阶段的训练参数,训练参数不对,将导致训练量化不稳定 -3. 学习率要调到比较小 -4. 
我们仅对卷积层实现了训练量化,因此如果用MNN从零开始搭建模型,后期接训练量化,或者Finetune之后想继续训练量化,那么需要用卷积层来实现全连接层即可对全连接层也进行训练量化。示例代码如下 -```cpp -// 用卷积层实现输入1280,输出为4的全连接层 -NN::ConvOption option; -option.channel = {1280, 4}; -mLastConv = std::shared_ptr(NN::Conv(option)); -``` - -## 训练量化的配置选项 -详见 MNN_ROOT/tools/train/source/module/PipelineModule.hpp -```cpp -// 特征scale的计算方法 -enum FeatureScaleStatMethod { - PerTensor = 0, // 对特征不分通道进行量化 - PerChannel = 1 // 对特征分通道进行量化,deprecated -}; -// 特征scale的更新方法 -enum ScaleUpdateMethod { - Maximum = 0, // 使用每一次计算得到的scale的最大值 - MovingAverage = 1 // 使用滑动平均来更新 -}; -// 指定训练量化的bit数,特征scale的计算方法,特征scale的更新方法, -void toTrainQuant(const int bits = 8, NN::FeatureScaleStatMethod featureScaleStatMethod = NN::PerTensor, - NN::ScaleUpdateMethod scaleUpdateMethod = NN::MovingAverage); -``` \ No newline at end of file diff --git a/docs/transformers/diffusion.md b/docs/transformers/diffusion.md index 5c6d341fb..b8367fd23 100644 --- a/docs/transformers/diffusion.md +++ b/docs/transformers/diffusion.md @@ -2,9 +2,9 @@ ## 模型支持与下载 -1. runwayml/stable-diffusion-v1-5 +1. stable-diffusion-v1-5 ``` -https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main +https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main ``` 2. chilloutmix ``` diff --git a/docs/transformers/models.md b/docs/transformers/models.md new file mode 100644 index 000000000..5587b41a5 --- /dev/null +++ b/docs/transformers/models.md @@ -0,0 +1,50 @@ +# 模型下载 + +## 大语言模型 + +| Model | ModelScope | Hugging Face | +| -------- | ----------- | ------------ | +| [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen-VL-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen-VL-Chat-MNN) | +| [Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Baichuan2-7B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Baichuan2-7B-Chat-MNN) | +| [bge-large-zh](https://modelscope.cn/models/AI-ModelScope/bge-large-zh/summary) | [Q4_1](https://modelscope.cn/models/MNN/bge-large-zh-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/bge-large-zh-MNN) | +| [chatglm-6b](https://modelscope.cn/models/ZhipuAI/ChatGLM-6B/summary) | [Q4_1](https://modelscope.cn/models/MNN/chatglm-6b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/chatglm-6b-MNN) | +| [chatglm2-6b](https://modelscope.cn/models/ZhipuAI/chatglm2-6b/summary) | [Q4_1](https://modelscope.cn/models/MNN/chatglm2-6b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/chatglm2-6b-MNN) | +| [chatglm3-6b](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) | [Q4_1](https://modelscope.cn/models/MNN/chatglm3-6b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/chatglm3-6b-MNN) | +| [codegeex2-6b](https://modelscope.cn/models/MNN/codegeex2-6b-MNN/summary) | [Q4_1](https://modelscope.cn/models/MNN/codegeex2-6b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/codegeex2-6b-MNN) | +| [deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/deepseek-llm-7b-chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/deepseek-llm-7b-chat-MNN) | +| [gemma-2-2b-it](https://modelscope.cn/models/llm-research/gemma-2-2b-it) | [Q4_1](https://modelscope.cn/models/MNN/gemma-2-2b-it-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/gemma-2-2b-it-MNN) | +| [glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/glm-4-9b-chat-MNN) | 
[Q4_1](https://huggingface.co/taobao-mnn/glm-4-9b-chat-MNN) | +| [gte_sentence-embedding_multilingual-base](https://modelscope.cn/models/iic/gte_sentence-embedding_multilingual-base/summary) | [Q4_1](https://modelscope.cn/models/MNN/gte_sentence-embedding_multilingual-base-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/gte_sentence-embedding_multilingual-base-MNN) | +| [internlm-chat-7b](https://modelscope.cn/models/AI-ModelScope/internlm-chat-7b/summary) | [Q4_1](https://modelscope.cn/models/MNN/internlm-chat-7b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/internlm-chat-7b-MNN) | +| [Llama-2-7b-chat](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary) | [Q4_1](https://modelscope.cn/models/MNN/Llama-2-7b-chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Llama-2-7b-chat-MNN) | +| [Llama-3-8B-Instruct](https://modelscope.cn/models/modelscope/Meta-Llama-3-8B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Llama-3-8B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Llama-3-8B-Instruct-MNN) | +| [Llama-3.2-1B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-1B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Llama-3.2-1B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Llama-3.2-1B-Instruct-MNN) | +| [Llama-3.2-3B-Instruct](https://modelscope.cn/models/LLM-Research/Llama-3.2-3B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Llama-3.2-3B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Llama-3.2-3B-Instruct-MNN) | +| [OpenELM-1_1B-Instruct](https://huggingface.co/apple/OpenELM-1_1B-Instruct) | [Q4_1](https://modelscope.cn/models/MNN/OpenELM-1_1B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/OpenELM-1_1B-Instruct-MNN) | +| [OpenELM-270M-Instruct](https://huggingface.co/apple/OpenELM-270M-Instruct) | [Q4_1](https://modelscope.cn/models/MNN/OpenELM-270M-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/OpenELM-270M-Instruct-MNN) | +| [OpenELM-3B-Instruct](https://huggingface.co/apple/OpenELM-3B-Instruct) | [Q8_1](https://modelscope.cn/models/MNN/OpenELM-3B-Instruct-MNN) | [Q8_1](https://huggingface.co/taobao-mnn/OpenELM-3B-Instruct-MNN) | +| [OpenELM-450M-Instruct](https://huggingface.co/apple/OpenELM-450M-Instruct) | [Q4_1](https://modelscope.cn/models/MNN/OpenELM-450M-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/OpenELM-450M-Instruct-MNN) | +| [phi-2](https://modelscope.cn/models/mengzhao/phi-2/summary) | [Q4_1](https://modelscope.cn/models/MNN/phi-2-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/phi-2-MNN) | +| [qwen/Qwen-1_8B-Chat](https://modelscope.cn/models/qwen/Qwen-1_8B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen-1_8B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen-1_8B-Chat-MNN) | +| [Qwen-7B-Chat](https://modelscope.cn/models/qwen/Qwen-7B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen-7B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen-7B-Chat-MNN) | +| [Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen1.5-0.5B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen1.5-0.5B-Chat-MNN) | +| [Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen1.5-1.8B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen1.5-1.8B-Chat-MNN) | +| [Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary) | 
[Q4_1](https://modelscope.cn/models/MNN/Qwen1.5-4B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen1.5-4B-Chat-MNN) | +| [Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen1.5-7B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen1.5-7B-Chat-MNN) | +| [Qwen2-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2-0.5B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2-0.5B-Instruct-MNN) | +| [Qwen2-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2-1.5B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2-1.5B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2-1.5B-Instruct-MNN) | +| [Qwen2-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-7B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2-7B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2-7B-Instruct-MNN) | +| [Qwen2-VL-2B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2-VL-2B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2-VL-2B-Instruct-MNN) | +| [Qwen2-VL-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2-VL-7B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2-VL-7B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2-VL-7B-Instruct-MNN) | +| [Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-0.5B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-0.5B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-0.5B-Instruct-MNN) | +| [Qwen2.5-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-1.5B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-1.5B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-1.5B-Instruct-MNN) | +| [Qwen2.5-3B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-3B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-3B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-3B-Instruct-MNN) | +| [Qwen2.5-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-7B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-7B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-7B-Instruct-MNN) | +| [Qwen2.5-Coder-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-1.5B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-Coder-1.5B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-Coder-1.5B-Instruct-MNN) | +| [Qwen2.5-Coder-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Coder-7B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-Coder-7B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-Coder-7B-Instruct-MNN) | +| [Qwen2.5-Math-1.5B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-1.5B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-Math-1.5B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-Math-1.5B-Instruct-MNN) | +| [Qwen2.5-Math-7B-Instruct](https://modelscope.cn/models/qwen/Qwen2.5-Math-7B-Instruct/summary) | [Q4_1](https://modelscope.cn/models/MNN/Qwen2.5-Math-7B-Instruct-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Qwen2.5-Math-7B-Instruct-MNN) | +| [reader-lm-0.5b](https://huggingface.co/jinaai/reader-lm-0.5b) | [Q4_1](https://modelscope.cn/models/MNN/reader-lm-0.5b-MNN) | 
[Q4_1](https://huggingface.co/taobao-mnn/reader-lm-0.5b-MNN) | +| [reader-lm-1.5b](https://huggingface.co/jinaai/reader-lm-1.5b) | [Q4_1](https://modelscope.cn/models/MNN/reader-lm-1.5b-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/reader-lm-1.5b-MNN) | +| [TinyLlama-1.1B-Chat-v1.0](https://modelscope.cn/models/AI-ModelScope/TinyLlama-1.1B-Chat-v1.0/summary) | [Q4_1](https://modelscope.cn/models/MNN/TinyLlama-1.1B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/TinyLlama-1.1B-Chat-MNN) | +| [Yi-6B-Chat](https://modelscope.cn/models/01ai/Yi-6B-Chat/summary) | [Q4_1](https://modelscope.cn/models/MNN/Yi-6B-Chat-MNN) | [Q4_1](https://huggingface.co/taobao-mnn/Yi-6B-Chat-MNN) | \ No newline at end of file diff --git a/express/Executor.cpp b/express/Executor.cpp index bb54a393e..4b48c0d90 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -41,6 +41,11 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& } MNN_ASSERT(nullptr != rt); mAttr->firstType = type; + // Cache threadnumber and config + mAttr->numThread = numberThread; + mAttr->config = config; + // Remove sharedContext because it's not used for create backend + mAttr->config.sharedContext = nullptr; } int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) { @@ -219,6 +224,11 @@ void Executor::RuntimeManager::setMode(Interpreter::SessionMode mode) { } void Executor::RuntimeManager::setHint(Interpreter::HintMode mode, int value) { mInside->modes.setHint(mode, value); + auto current = ExecutorScope::Current(); + auto rt = current->getRuntime(); + for (auto& iter : rt.first) { + iter.second->setRuntimeHint(mInside->modes.runtimeHint); + } } void Executor::RuntimeManager::setExternalPath(std::string path, int type) { mInside->modes.setExternalPath(path, type); diff --git a/express/Expr.cpp b/express/Expr.cpp index be8b01bfa..3ac528491 100644 --- a/express/Expr.cpp +++ b/express/Expr.cpp @@ -91,6 +91,7 @@ bool VARP::fix(VARP::InputType type) const { newVARP->expr().first->inside()->mHoldBackend = pipelineInfo.first.cache.second; } Variable::replace(VARP(mContent), newVARP); + inputTensor->wait(MNN::Tensor::MAP_TENSOR_READ, true); return true; } diff --git a/express/RuntimeAttr.hpp b/express/RuntimeAttr.hpp index 21fd54fa0..2c19f8b37 100644 --- a/express/RuntimeAttr.hpp +++ b/express/RuntimeAttr.hpp @@ -25,6 +25,8 @@ struct RuntimeAttr { struct ExecutorAttr { std::shared_ptr constantBackend; MNNForwardType firstType; + int numThread = 1; + BackendConfig config; std::string externalFile; }; }; diff --git a/express/Utils.cpp b/express/Utils.cpp index 3826d5679..957e11a6d 100644 --- a/express/Utils.cpp +++ b/express/Utils.cpp @@ -13,6 +13,7 @@ #include #include "MNN_generated.h" #include "core/TensorUtils.hpp" +#include "core/OpCommonUtils.hpp" #include "core/Session.hpp" #include "core/MNNMemoryUtils.h" #include "core/Backend.hpp" @@ -61,19 +62,7 @@ int Utils::convertFormat(Dimensionformat format) { } DataType Utils::convertDataType(halide_type_t type) { - if (type.code == halide_type_float) { - return DataType_DT_FLOAT; - } - if (type.code == halide_type_uint && type.bits == 8) { - return DataType_DT_UINT8; - } - if (type.code == halide_type_int && type.bits == 8) { - return DataType_DT_INT8; - } - if (type.code == halide_type_int && type.bits == 32) { - return DataType_DT_INT32; - } - return DataType_DT_INVALID; + return OpCommonUtils::convertDataType(type); } halide_type_t Utils::revertDataType(DataType dataType) { CONVERT(DataType_DT_FLOAT, halide_type_of(), dataType); diff --git 
a/express/module/Module.cpp b/express/module/Module.cpp index d1dea03dc..e3372db0c 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -32,8 +32,10 @@ static MNN::Express::Executor::RuntimeManager* _createDefaultRuntimeManager(cons sche_config.backendConfig = config->backend->config; } else { auto exe = ExecutorScope::Current(); - sche_config.type = exe->getAttr()->firstType; - sche_config.numThread = 1; + auto attr = exe->getAttr(); + sche_config.type = attr->firstType; + sche_config.numThread = attr->numThread; + sche_config.backendConfig = &attr->config; } return Executor::RuntimeManager::createRuntimeManager(sche_config); } diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index 33bc515fd..8f30cd682 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -20,9 +20,15 @@ #endif #ifdef MNN_USE_LOGCAT +#if defined(__OHOS__) +#include +#define MNN_ERROR(format, ...) {char logtmp[4096]; snprintf(logtmp, 4096, format, ##__VA_ARGS__); OH_LOG_Print(LOG_APP, LOG_ERROR, LOG_DOMAIN, "MNNJNI", (const char*)logtmp);} +#define MNN_PRINT(format, ...) {char logtmp[4096]; snprintf(logtmp, 4096, format, ##__VA_ARGS__); OH_LOG_Print(LOG_APP, LOG_DEBUG, LOG_DOMAIN, "MNNJNI", (const char*)logtmp);} +#else #include #define MNN_ERROR(format, ...) __android_log_print(ANDROID_LOG_ERROR, "MNNJNI", format, ##__VA_ARGS__) #define MNN_PRINT(format, ...) __android_log_print(ANDROID_LOG_INFO, "MNNJNI", format, ##__VA_ARGS__) +#endif #elif defined MNN_BUILD_FOR_IOS // on iOS, stderr prints to XCode debug area and syslog prints Console. You need both. #include @@ -67,8 +73,8 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #endif #define STR_IMP(x) #x #define STR(x) STR_IMP(x) -#define MNN_VERSION_MAJOR 2 -#define MNN_VERSION_MINOR 9 -#define MNN_VERSION_PATCH 6 +#define MNN_VERSION_MAJOR 3 +#define MNN_VERSION_MINOR 0 +#define MNN_VERSION_PATCH 0 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/project/harmony/build_64.sh b/project/harmony/build_64.sh index fefd02c60..2207eebcd 100755 --- a/project/harmony/build_64.sh +++ b/project/harmony/build_64.sh @@ -4,13 +4,12 @@ cmake ../../../ \ -DCMAKE_BUILD_TYPE=Release \ -DOHOS_ARCH="arm64-v8a" \ -DOHOS_STL=c++_static \ --DMNN_USE_LOGCAT=false \ +-DMNN_USE_LOGCAT=true \ -DMNN_BUILD_BENCHMARK=ON \ -DMNN_USE_SSE=OFF \ -DMNN_SUPPORT_BF16=OFF \ -DMNN_BUILD_TEST=ON \ -DOHOS_PLATFORM_LEVEL=9 \ --DMNN_BUILD_FOR_ANDROID_COMMAND=true \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. 
$1 $2 $3 make -j4 diff --git a/project/harmony/updateTest.sh b/project/harmony/updateTest.sh index 5a0be5c6f..43358d89e 100755 --- a/project/harmony/updateTest.sh +++ b/project/harmony/updateTest.sh @@ -1,5 +1,6 @@ #!/bin/bash -DIR=yanxing +DIR=MNN +hdc shell mkdir /data/local/tmp/MNN make -j16 hdc file send ./libMNN.so /data/local/tmp/$DIR/libMNN.so diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index c8afc9f93..6aafd2121 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -727,7 +727,7 @@ 952298B22B4D39050043978B /* MetalLoop.mm in Sources */ = {isa = PBXBuildFile; fileRef = 952298B12B4D39050043978B /* MetalLoop.mm */; }; 952298B42B4D39260043978B /* MetalArgMax.mm in Sources */ = {isa = PBXBuildFile; fileRef = 952298B32B4D39250043978B /* MetalArgMax.mm */; }; 952298B72B4D4CC80043978B /* CoreMLLayerNorm.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 952298B52B4D4CC80043978B /* CoreMLLayerNorm.cpp */; }; - 952298B82B4D4CC80043978B /* coreMLLayerNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 952298B62B4D4CC80043978B /* coreMLLayerNorm.hpp */; }; + 952298B82B4D4CC80043978B /* CoreMLLayerNorm.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 952298B62B4D4CC80043978B /* CoreMLLayerNorm.hpp */; }; 95278CE72B9F0999009E9B29 /* CPUDynamicQuant.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 95278CE52B9F0999009E9B29 /* CPUDynamicQuant.hpp */; }; 95278CE82B9F0999009E9B29 /* CPUDynamicQuant.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 95278CE62B9F0999009E9B29 /* CPUDynamicQuant.cpp */; }; 95278CEA2B9F09C0009E9B29 /* ShapeDynamicQuant.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 95278CE92B9F09C0009E9B29 /* ShapeDynamicQuant.cpp */; }; @@ -796,6 +796,10 @@ CEA49AA92AFD010900971CB7 /* MetalExecution.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA49AA72AFD010900971CB7 /* MetalExecution.hpp */; }; CEA82BDB2A15F8AD002CBC95 /* IdstConvolutionInt8.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */; }; CEA82BDC2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */; }; + CED81F8F2CC23C8A00666B48 /* CoreMLRelu6.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CED81F8E2CC23C8A00666B48 /* CoreMLRelu6.cpp */; }; + CED81F902CC23C8A00666B48 /* CoreMLRelu6.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CED81F8D2CC23C8A00666B48 /* CoreMLRelu6.hpp */; }; + CED81F932CC23FE800666B48 /* CoreMLMatMul.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CED81F922CC23FE800666B48 /* CoreMLMatMul.cpp */; }; + CED81F942CC23FE800666B48 /* CoreMLMatMul.hpp in Headers */ = {isa = PBXBuildFile; fileRef = CED81F912CC23FE800666B48 /* CoreMLMatMul.hpp */; }; CEDB20EB2846D07100AE9DC4 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */; }; CEDB20F42846D07100AE9DC4 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F22846D07100AE9DC4 /* Main.storyboard */; }; CEDB20F62846D07200AE9DC4 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = CEDB20F52846D07200AE9DC4 /* Assets.xcassets */; }; @@ -1580,7 +1584,7 @@ 952298B12B4D39050043978B /* MetalLoop.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalLoop.mm; sourceTree = ""; }; 952298B32B4D39250043978B /* MetalArgMax.mm */ = {isa = PBXFileReference; 
fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MetalArgMax.mm; sourceTree = ""; }; 952298B52B4D4CC80043978B /* CoreMLLayerNorm.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CoreMLLayerNorm.cpp; sourceTree = ""; }; - 952298B62B4D4CC80043978B /* coreMLLayerNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = coreMLLayerNorm.hpp; sourceTree = ""; }; + 952298B62B4D4CC80043978B /* CoreMLLayerNorm.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CoreMLLayerNorm.hpp; sourceTree = ""; }; 95278CE52B9F0999009E9B29 /* CPUDynamicQuant.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = CPUDynamicQuant.hpp; sourceTree = ""; }; 95278CE62B9F0999009E9B29 /* CPUDynamicQuant.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CPUDynamicQuant.cpp; sourceTree = ""; }; 95278CE92B9F09C0009E9B29 /* ShapeDynamicQuant.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeDynamicQuant.cpp; sourceTree = ""; }; @@ -1649,6 +1653,10 @@ CEA49AA72AFD010900971CB7 /* MetalExecution.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalExecution.hpp; sourceTree = ""; }; CEA82BD92A15F8AD002CBC95 /* IdstConvolutionInt8.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = IdstConvolutionInt8.cpp; sourceTree = ""; }; CEA82BDA2A15F8AD002CBC95 /* IdstConvolutionInt8.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = IdstConvolutionInt8.hpp; sourceTree = ""; }; + CED81F8D2CC23C8A00666B48 /* CoreMLRelu6.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = CoreMLRelu6.hpp; sourceTree = ""; }; + CED81F8E2CC23C8A00666B48 /* CoreMLRelu6.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CoreMLRelu6.cpp; sourceTree = ""; }; + CED81F912CC23FE800666B48 /* CoreMLMatMul.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = CoreMLMatMul.hpp; sourceTree = ""; }; + CED81F922CC23FE800666B48 /* CoreMLMatMul.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; path = CoreMLMatMul.cpp; sourceTree = ""; }; CEDB20E72846D07100AE9DC4 /* demo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = demo.app; sourceTree = BUILT_PRODUCTS_DIR; }; CEDB20E92846D07100AE9DC4 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; CEDB20EA2846D07100AE9DC4 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -2364,8 +2372,12 @@ 4D9A933A26255BDA00F9B43C /* execution */ = { isa = PBXGroup; children = ( + CED81F912CC23FE800666B48 /* CoreMLMatMul.hpp */, + CED81F922CC23FE800666B48 /* CoreMLMatMul.cpp */, + CED81F8D2CC23C8A00666B48 /* CoreMLRelu6.hpp */, + CED81F8E2CC23C8A00666B48 /* CoreMLRelu6.cpp */, 952298B52B4D4CC80043978B /* CoreMLLayerNorm.cpp */, - 952298B62B4D4CC80043978B /* coreMLLayerNorm.hpp */, + 952298B62B4D4CC80043978B /* CoreMLLayerNorm.hpp */, 4DF63F2E2660D9D100590730 /* CoreMLInterp.hpp */, 4DF63F2C2660D9CB00590730 /* CoreMLInterp.cpp */, 4D9A933B26255BDA00F9B43C /* CoreMLReduction.cpp */, @@ -3009,6 +3021,7 @@ 
92FF037823AA0B5A00AC97F6 /* CPUROIPooling.hpp in Headers */, 4D9A935626255BDA00F9B43C /* Model.pb-c.h in Headers */, 48747D6D245D9E33000B9709 /* ConvertUtils.hpp in Headers */, + CED81F902CC23C8A00666B48 /* CoreMLRelu6.hpp in Headers */, 4838EA832611C00B0027232C /* MetalGridSample.hpp in Headers */, 92FF038723AA0B5A00AC97F6 /* CPUTensorConvert.hpp in Headers */, 92FF036E23AA0B5A00AC97F6 /* CPUQuantizedSoftmax.hpp in Headers */, @@ -3018,7 +3031,7 @@ 489D7A9B2550FDC900AD896A /* MetalDeconvolution.hpp in Headers */, 4D9A935726255BDA00F9B43C /* protobuf-c.h in Headers */, 489D7A982550FDC900AD896A /* MNNMetalContext.h in Headers */, - 952298B82B4D4CC80043978B /* coreMLLayerNorm.hpp in Headers */, + 952298B82B4D4CC80043978B /* CoreMLLayerNorm.hpp in Headers */, 92FF029323AA0B5A00AC97F6 /* CPURange.hpp in Headers */, CEEDB5542C7475A100FED0DC /* MNNFileUtils.h in Headers */, 4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */, @@ -3141,6 +3154,7 @@ 48C84B6C250F709E00EE7666 /* SizeComputer.hpp in Headers */, 92FF035023AA0B5A00AC97F6 /* CPUOneHot.hpp in Headers */, 92FF039123AA0B5A00AC97F6 /* CPUBackend.hpp in Headers */, + CED81F942CC23FE800666B48 /* CoreMLMatMul.hpp in Headers */, 489D7AA52550FDC900AD896A /* MetalInterp.hpp in Headers */, 486E1A9A24F5078D00C16006 /* CPURandomUniform.hpp in Headers */, 92FF038C23AA0B5A00AC97F6 /* CPUEltwise.hpp in Headers */, @@ -3411,6 +3425,7 @@ 92FF02B023AA0B5A00AC97F6 /* CPUDequantize.cpp in Sources */, 92FF04C223AA0BFB00AC97F6 /* Pipeline.cpp in Sources */, 92FF04C423AA0BFB00AC97F6 /* Session.cpp in Sources */, + CED81F932CC23FE800666B48 /* CoreMLMatMul.cpp in Sources */, 952298B72B4D4CC80043978B /* CoreMLLayerNorm.cpp in Sources */, 4D9A936826255BDA00F9B43C /* CoreMLCommonExecution.cpp in Sources */, 92FF02D123AA0B5A00AC97F6 /* MNNMaxFloat.S in Sources */, @@ -3640,6 +3655,7 @@ CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */, 92FF042823AA0B7100AC97F6 /* ShapeInterp.cpp in Sources */, 92FF02D623AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, + CED81F8F2CC23C8A00666B48 /* CoreMLRelu6.cpp in Sources */, 48FB9DCA24A848D0008E1A2D /* MNNAxByClampBroadcastC4.S in Sources */, 489D7A832550FDC900AD896A /* MetalMatMul.mm in Sources */, 482BFBD028351BA1009210E4 /* AllShader.cpp in Sources */, @@ -4148,7 +4164,7 @@ METAL_LIBRARY_FILE_BASE = mnn; ONLY_ACTIVE_ARCH = YES; OTHER_CFLAGS = ""; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PROVISIONING_PROFILE_SPECIFIER = ""; "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = ""; @@ -4210,7 +4226,7 @@ MACH_O_TYPE = staticlib; METAL_LIBRARY_FILE_BASE = mnn; OTHER_CFLAGS = ""; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; PROVISIONING_PROFILE_SPECIFIER = ""; "PROVISIONING_PROFILE_SPECIFIER[sdk=macosx*]" = ""; @@ -4244,7 +4260,7 @@ IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk; PRODUCT_NAME = "$(TARGET_NAME)"; TARGETED_DEVICE_FAMILY = "1,2"; }; @@ -4271,7 +4287,7 @@ IPHONEOS_DEPLOYMENT_TARGET = 9.0; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; OTHER_CPLUSPLUSFLAGS = "$(OTHER_CFLAGS)"; - PRODUCT_BUNDLE_IDENTIFIER = 
com.taobao.mnn.abcdedddddd; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk; PRODUCT_NAME = "$(TARGET_NAME)"; TARGETED_DEVICE_FAMILY = "1,2"; }; @@ -4303,7 +4319,7 @@ MARKETING_VERSION = 1.0; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; TARGETED_DEVICE_FAMILY = "1,2"; @@ -4335,7 +4351,7 @@ LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; MARKETING_VERSION = 1.0; MTL_FAST_MATH = YES; - PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vj; + PRODUCT_BUNDLE_IDENTIFIER = com.taobao.mnn.abcde3vjk; PRODUCT_NAME = "$(TARGET_NAME)"; SWIFT_EMIT_LOC_STRINGS = YES; TARGETED_DEVICE_FAMILY = "1,2"; diff --git a/pymnn/CMakeLists.txt b/pymnn/CMakeLists.txt index 0813adb64..7e3694407 100644 --- a/pymnn/CMakeLists.txt +++ b/pymnn/CMakeLists.txt @@ -16,8 +16,9 @@ option(PYMNN_TRAIN_API "MNN train API be exposed" OFF) option(PYMNN_INTERNAL_SERVING "Internal use only." OFF) option(PYMNN_OPENCV_API "MNN OpenCV API be exposed" ON) option(PYMNN_IMGCODECS "MNN IMGCODECS API be exposed" OFF) +option(PYMNN_OHOS_INTERNAL "compile for harmony internal." OFF) -if (OHOS) +if (PYMNN_OHOS_INTERNAL) include($ENV{NODE_PATH}/@ali/tcpkg/tcpkg.cmake) endif() @@ -189,7 +190,7 @@ if(WIN32 OR APPLE OR CMAKE_SYSTEM_NAME MATCHES "^Linux") else() target_include_directories(mnnpybridge PRIVATE ${MNN_DIR}/pymnn/src ${MNN_DIR}/pymnn/android/src/main/c/include) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${MNN_DIR}/pymnn/android/src/main/jniLibs/${ANDROID_ABI}) - if (OHOS) + if (PYMNN_OHOS_INTERNAL) target_link_libraries(mnnpybridge PRIVATE tcpkg::mnn) if(PYMNN_USE_ALINNPYTHON) target_link_libraries(mnnpybridge PRIVATE tcpkg::alinnpython) diff --git a/pymnn/examples/MNNQuant/test_mnn_offline_quant.py b/pymnn/examples/MNNQuant/test_mnn_offline_quant.py deleted file mode 100644 index ae16d5dc7..000000000 --- a/pymnn/examples/MNNQuant/test_mnn_offline_quant.py +++ /dev/null @@ -1,201 +0,0 @@ -from __future__ import print_function -import time -import argparse -import numpy as np -import tqdm -import os -import MNN -from PIL import Image - -nn = MNN.nn -F = MNN.expr -F.lazy_eval(True) - - -# adapted from pycaffe -def load_image(filename, color=True): - """ - Load an image converting from grayscale or alpha as needed. - - Parameters - ---------- - filename : string - color : boolean - flag for color format. True (default) loads as RGB while False - loads as intensity (if image is already grayscale). - - Returns - ------- - image : an image with type np.float32 in range [0, 1] - of size (H x W x 3) in RGB or - of size (H x W x 1) in grayscale. 
- """ - img = Image.open(filename) - img = np.array(img) - if img.ndim == 2: - img = img[:, :, np.newaxis] - if color: - img = np.tile(img, (1, 1, 3)) - elif img.shape[2] == 4: - img = img[:, :, :3] - return img - - -def center_crop(image_data, crop_factor): - height, width, channels = image_data.shape - - h_size = int(height * crop_factor) - h_start = int((height - h_size) / 2) - h_end = h_start + h_size - - w_size = int(width * crop_factor) - w_start = int((width - w_size) / 2) - w_end = w_start + w_size - - cropped_image = image_data[h_start:h_end, w_start:w_end, :] - - return cropped_image - - -def resize_image(image, shape): - im = Image.fromarray(image) - im = im.resize(shape) - resized_image = np.array(im) - - return resized_image - - -class CalibrationDataset(MNN.data.Dataset): - ''' - This is demo for Imagenet calibration dataset. like pytorch, you need to overload __getiterm__ and __len__ methods - __getiterm__ should return a sample in F.const, and you should not use batch dimension here - __len__ should return the number of total samples in the calibration dataset - ''' - def __init__(self, image_folder): - super(CalibrationDataset, self).__init__() - self.image_folder = image_folder - self.image_list = os.listdir(image_folder)[0:64] - - def __getitem__(self, index): - image_name = os.path.join(self.image_folder, self.image_list[index].split(' ')[0]) - - - # preprocess your data here, the following code are for tensorflow mobilenets - image_data = load_image(image_name) - image_data = center_crop(image_data, 0.875) - image_data = resize_image(image_data, (224, 224)) - image_data = (image_data - 127.5) / 127.5 - - - # after preprocessing the data, convert it to MNN data structure - dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC, F.dtype.float) - - ''' - first list for inputs, and may have many inputs, so it's a list - if your model have more than one inputs, add the preprocessed MNN const data to the input list - - second list for targets, also, there may be more than one targets - for calibration dataset, we don't need labels, so leave it blank - - Note that, the input order in the first list should be the same in your 'config.yaml' file. - ''' - - return [dv], [] - - def __len__(self): - # size of the dataset - return len(self.image_list) - - -def get_mnn_format(format_str): - fmt = str.lower(format_str) - if fmt == 'nchw': - return F.NCHW - elif fmt == 'nhwc': - return F.NHWC - elif fmt == 'nc4hw4': - return F.NC4HW4 - else: - raise ValueError("unknown format:", format_str) - -def quant_func(net, dataloader, opt): - net.train(True) - dataloader.reset() - - t0 = time.time() - for i in tqdm.trange(dataloader.iter_number): - example = dataloader.next() - input_data = example[0] - predicts = net.forward(input_data) - # fake update - opt.step(F.const([0.0], [])) - for predict in predicts: - predict.read() - - t1 = time.time() - cost = t1 - t0 - print("Epoch cost: %.3f s." 
% cost) - - return cost - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--mnn_model", type=str, required=True,\ - help="original float MNN model file") - parser.add_argument("--quant_imgs", type=str, required=True, \ - help="path of quant images") - parser.add_argument("--quant_model", type=str, required=True, \ - help="name of quantized model to save") - parser.add_argument("--batch_size", type=int, required=False, default=32,\ - help="calibration batch size") - - args = parser.parse_args() - - mnn_model = args.mnn_model - quant_imgs = args.quant_imgs - quant_model = args.quant_model - batch_size = args.batch_size - - calibration_dataset = CalibrationDataset(image_folder=quant_imgs) - - dataloader = MNN.data.DataLoader(calibration_dataset, batch_size=batch_size, shuffle=True) - - m = F.load_as_dict(mnn_model) - - inputs_outputs = F.get_inputs_and_outputs(m) - for key in inputs_outputs[0].keys(): - print('input names:\t', key) - for key in inputs_outputs[1].keys(): - print('output names:\t', key) - - # set inputs and outputs - inputs = [m['input']] - outputs = [m['MobilenetV2/Predictions/Reshape_1']] - input_placeholders = [] - for i in range(len(inputs)): - shape = [1, 3, 224, 224] - fmt = 'nchw' - nnn_format = get_mnn_format(fmt) - placeholder = F.placeholder(shape, nnn_format) - placeholder.name = 'input' - input_placeholders.append(placeholder) - - net = nn.load_module(inputs, outputs, True) - - # no use optimizer - opt = MNN.optim.SGD(net, 0.01, 0.9, 0.0005) - - nn.compress.train_quant(net, quant_bits=8) - - used_time = quant_func(net, dataloader, opt) - - # save model - net.train(False) - predicts = net.forward(input_placeholders) - print("quantized model save to " + quant_model) - F.save(predicts, quant_model) - - -if __name__ == "__main__": - main() diff --git a/pymnn/src/expr.h b/pymnn/src/expr.h index dbcb317da..b31cfa5ac 100644 --- a/pymnn/src/expr.h +++ b/pymnn/src/expr.h @@ -137,7 +137,7 @@ static PyMethodDef PyMNNVar_methods[] = { {"set_device_ptr", (PyCFunction)PyMNNVar_set_device_ptr, METH_VARARGS, "set_device_ptr data"}, {"copy_to_device_ptr", (PyCFunction)PyMNNVar_copy_to_device_ptr, METH_VARARGS, "copy_to_device_ptr data"}, - + {NULL} /* Sentinel */ }; static PyObject* PyMNNVar_add(PyObject*, PyObject*); diff --git a/pymnn/src/llm.h b/pymnn/src/llm.h index fc4e885e2..0d363fe98 100644 --- a/pymnn/src/llm.h +++ b/pymnn/src/llm.h @@ -24,6 +24,18 @@ static PyObject* PyMNNLLM_load(LLM *self, PyObject *args) { Py_RETURN_NONE; } +static PyObject* PyMNNLLM_forward(LLM *self, PyObject *args) { + PyObject *input_ids = nullptr; + if (!PyArg_ParseTuple(args, "O", &input_ids) && isInts(input_ids)) { + Py_RETURN_NONE; + } + auto logits = getVar(); + self->llm->generate_init(); + *(logits->var) = self->llm->forward(toInts(input_ids)); + self->llm->reset(); + return (PyObject *)logits; +} + static PyObject* PyMNNLLM_generate(LLM *self, PyObject *args) { PyObject *input_ids = nullptr; if (!PyArg_ParseTuple(args, "O", &input_ids) && isInts(input_ids)) { @@ -44,10 +56,32 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) { return string2Object(res); } +static PyObject* PyMNNLLM_tokenizer_encode(LLM *self, PyObject *args) { + const char* prompt = NULL; + int use_template = 0; + if (!PyArg_ParseTuple(args, "s|p", &prompt, &use_template)) { + Py_RETURN_NONE; + } + auto ids = self->llm->tokenizer_encode(prompt, use_template); + return toPyObj(ids); +} + +static PyObject* PyMNNLLM_tokenizer_decode(LLM *self, PyObject *args) { + PyObject *id = nullptr; 
+ if (!PyArg_ParseTuple(args, "O", &id) && isInt(id)) { + Py_RETURN_NONE; + } + auto query = self->llm->tokenizer_decode(toInt(id)); + return string2Object(query); +} + static PyMethodDef PyMNNLLM_methods[] = { {"load", (PyCFunction)PyMNNLLM_load, METH_VARARGS, "load model."}, + {"forward", (PyCFunction)PyMNNLLM_forward, METH_VARARGS, "forward `logits` by `input_ids`."}, {"generate", (PyCFunction)PyMNNLLM_generate, METH_VARARGS, "generate `output_ids` by `input_ids`."}, {"response", (PyCFunction)PyMNNLLM_response, METH_VARARGS, "response `query` without hsitory."}, + {"tokenizer_encode", (PyCFunction)PyMNNLLM_tokenizer_encode, METH_VARARGS, "tokenizer encode."}, + {"tokenizer_decode", (PyCFunction)PyMNNLLM_tokenizer_decode, METH_VARARGS, "tokenizer decode."}, {NULL} /* Sentinel */ }; diff --git a/schema/current/CaffeOp_generated.h b/schema/current/CaffeOp_generated.h index 715e96fc2..09f9632e8 100644 --- a/schema/current/CaffeOp_generated.h +++ b/schema/current/CaffeOp_generated.h @@ -1140,6 +1140,7 @@ struct QuantizedFloatParamT : public flatbuffers::NativeTable { int8_t clampMax; std::vector winogradAttr; DataType outputDataType; + std::vector floatzeros; QuantizedFloatParamT() : method(QuantizeAlgo_DEFAULT), nbits(8), @@ -1192,6 +1193,9 @@ struct QuantizedFloatParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table DataType outputDataType() const { return static_cast(GetField(26, 6)); } + const flatbuffers::Vector *floatzeros() const { + return GetPointer *>(28); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, 4) && @@ -1211,6 +1215,8 @@ struct QuantizedFloatParam FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table VerifyOffset(verifier, 24) && verifier.VerifyVector(winogradAttr()) && VerifyField(verifier, 26) && + VerifyOffset(verifier, 28) && + verifier.VerifyVector(floatzeros()) && verifier.EndTable(); } QuantizedFloatParamT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -1257,6 +1263,9 @@ struct QuantizedFloatParamBuilder { void add_outputDataType(DataType outputDataType) { fbb_.AddElement(26, static_cast(outputDataType), 6); } + void add_floatzeros(flatbuffers::Offset> floatzeros) { + fbb_.AddOffset(28, floatzeros); + } explicit QuantizedFloatParamBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -1282,8 +1291,10 @@ inline flatbuffers::Offset CreateQuantizedFloatParam( int8_t clampMin = -128, int8_t clampMax = 127, flatbuffers::Offset> winogradAttr = 0, - DataType outputDataType = DataType_DT_INT8) { + DataType outputDataType = DataType_DT_INT8, + flatbuffers::Offset> floatzeros = 0) { QuantizedFloatParamBuilder builder_(_fbb); + builder_.add_floatzeros(floatzeros); builder_.add_outputDataType(outputDataType); builder_.add_winogradAttr(winogradAttr); builder_.add_nbits(nbits); @@ -4500,6 +4511,7 @@ inline void QuantizedFloatParam::UnPackTo(QuantizedFloatParamT *_o, const flatbu { auto _e = clampMax(); _o->clampMax = _e; }; { auto _e = winogradAttr(); if (_e) { _o->winogradAttr.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->winogradAttr[_i] = _e->Get(_i); } } }; { auto _e = outputDataType(); _o->outputDataType = _e; }; + { auto _e = floatzeros(); if (_e) { _o->floatzeros.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->floatzeros[_i] = _e->Get(_i); } } }; } inline flatbuffers::Offset QuantizedFloatParam::Pack(flatbuffers::FlatBufferBuilder &_fbb, const 
QuantizedFloatParamT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -4522,6 +4534,7 @@ inline flatbuffers::Offset CreateQuantizedFloatParam(flatbu auto _clampMax = _o->clampMax; auto _winogradAttr = _o->winogradAttr.size() ? _fbb.CreateVector(_o->winogradAttr) : 0; auto _outputDataType = _o->outputDataType; + auto _floatzeros = _o->floatzeros.size() ? _fbb.CreateVector(_o->floatzeros) : 0; return MNN::CreateQuantizedFloatParam( _fbb, _weight, @@ -4535,7 +4548,8 @@ inline flatbuffers::Offset CreateQuantizedFloatParam(flatbu _clampMin, _clampMax, _winogradAttr, - _outputDataType); + _outputDataType, + _floatzeros); } inline Convolution2DT *Convolution2D::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -6004,7 +6018,8 @@ inline const flatbuffers::TypeTable *QuantizedFloatParamTypeTable() { { flatbuffers::ET_CHAR, 0, -1 }, { flatbuffers::ET_CHAR, 0, -1 }, { flatbuffers::ET_INT, 1, -1 }, - { flatbuffers::ET_INT, 0, 1 } + { flatbuffers::ET_INT, 0, 1 }, + { flatbuffers::ET_FLOAT, 1, -1 } }; static const flatbuffers::TypeFunction type_refs[] = { QuantizeAlgoTypeTable, @@ -6022,10 +6037,11 @@ inline const flatbuffers::TypeTable *QuantizedFloatParamTypeTable() { "clampMin", "clampMax", "winogradAttr", - "outputDataType" + "outputDataType", + "floatzeros" }; static const flatbuffers::TypeTable tt = { - flatbuffers::ST_TABLE, 12, type_codes, type_refs, nullptr, names + flatbuffers::ST_TABLE, 13, type_codes, type_refs, nullptr, names }; return &tt; } diff --git a/schema/current/MNN_generated.h b/schema/current/MNN_generated.h index 14ae9e4e9..bb4f48a44 100644 --- a/schema/current/MNN_generated.h +++ b/schema/current/MNN_generated.h @@ -193,7 +193,7 @@ enum OpType { OpType_Segment = 89, OpType_Squeeze = 90, OpType_StridedSlice = 91, - OpType_StringJoin = 92, + OpType_CastLike = 92, OpType_StringSplit = 93, OpType_StringToNumber = 94, OpType_TanH = 95, @@ -381,7 +381,7 @@ inline const OpType (&EnumValuesOpType())[182] { OpType_Segment, OpType_Squeeze, OpType_StridedSlice, - OpType_StringJoin, + OpType_CastLike, OpType_StringSplit, OpType_StringToNumber, OpType_TanH, @@ -569,7 +569,7 @@ inline const char * const *EnumNamesOpType() { "Segment", "Squeeze", "StridedSlice", - "StringJoin", + "CastLike", "StringSplit", "StringToNumber", "TanH", @@ -8006,7 +8006,7 @@ inline const flatbuffers::TypeTable *OpTypeTypeTable() { "Segment", "Squeeze", "StridedSlice", - "StringJoin", + "CastLike", "StringSplit", "StringToNumber", "TanH", diff --git a/schema/default/CaffeOp.fbs b/schema/default/CaffeOp.fbs index dfa45bb0e..f87f9d083 100644 --- a/schema/default/CaffeOp.fbs +++ b/schema/default/CaffeOp.fbs @@ -96,6 +96,7 @@ table QuantizedFloatParam{ // binary proto: [originKySize, originKxSize, transKySize, transKxSize, {kyStart, kxStart, unitY, unitX}, {...} ...] 
winogradAttr:[int]; outputDataType:DataType=DT_INT8; + floatzeros: [float]; } table Convolution2D { diff --git a/schema/default/MNN.fbs b/schema/default/MNN.fbs index b5d8b5756..d415bddcb 100644 --- a/schema/default/MNN.fbs +++ b/schema/default/MNN.fbs @@ -107,7 +107,7 @@ enum OpType : int { Segment, Squeeze, StridedSlice, - StringJoin, + CastLike, StringSplit, StringToNumber, TanH, diff --git a/source/backend/arm82/Arm82Backend.cpp b/source/backend/arm82/Arm82Backend.cpp index 377243388..5e3f78ec4 100644 --- a/source/backend/arm82/Arm82Backend.cpp +++ b/source/backend/arm82/Arm82Backend.cpp @@ -42,6 +42,7 @@ bool Arm82Backend::addArm82Creator(OpType t, Arm82Creator* ct) { Arm82Backend::Arm82Backend(const CPURuntime* runtime, BackendConfig::MemoryMode memory) : CPUBackend(runtime, BackendConfig::Precision_Low, memory, MNN_FORWARD_CPU_EXTENSION) { mCoreFunctions = Arm82Functions::get(); + mInt8CoreFunctions = Arm82Functions::getInt8(); } Arm82Backend::~Arm82Backend() { diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp index ea57b4d9e..5e2efa3b9 100644 --- a/source/backend/arm82/Arm82Functions.cpp +++ b/source/backend/arm82/Arm82Functions.cpp @@ -526,7 +526,7 @@ static void _MNNComputeMatMulForE_1_FP16(const float* AF, const float* BF, float Vec sumValue = Vec(0.0f); auto by = B + y * l; for (int x=0; x 0) { FLOAT16 AR[8] = {0, 0, 0, 0, 0, 0, 0, 0}; @@ -544,7 +544,36 @@ static void _MNNComputeMatMulForE_1_FP16(const float* AF, const float* BF, float } else { auto hC4 = h / 8; auto hR = h % 8; - for (int y=tId; y +static void _Arm82MNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + const int pack = 8; + int number = info[0]; + int eReal = info[1]; + int xStride = info[3]; + int xS4 = xStride * pack / sizeof(int32_t); + int PUNIT = pack / LP; + int FLOATPACK = pack / sizeof(int32_t); + int eOutsideStride = info[2] / sizeof(int32_t); + int eDest = EP; + int realDstCount = info[4]; + for (int n=0; n 0) { + int jobsE = realDstCount - eOffset - e; + if (jobsE == 0 || (jobsE < (realDstCount % EP))) { + lastBag = true; + } + } + auto source = (int32_t*)sourceGroup[n]; + auto dest = (int32_t*)(destOrigin + eC * info[2] + eR * LP + lOffset * EP); + //printf("e=%d, l=%d, eOffset=%d, lOffset=%d, eDest=%d\n", e, l, eOffset, lOffset, eDest); + l = l / 4; // Use float instead of int8 * 4 + if (lastBag && e + eR < EP) { + int elast = ALIMAX(eR + e, realDstCount % EP); + dest = (int32_t*)(destOrigin + lOffset * elast + eC * info[2] + eR * LP); + } + int offsetLC = lOffset / 4; + for (int x = 0; x < l; ++x) { + int eRemain = e; + auto xR = x % PUNIT; + auto xC = x / PUNIT; + auto d = dest; + auto s = source + xC * eReal * FLOATPACK + xR; + if (eR > 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi= EP) { + d += (eOutsideStride - eR); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d += (eOutsideStride4LastBag - eR + offsetLC * eFill); + } + s += eS * xS4; + } + while (eRemain > 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yi= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d+= (eOutsideStride4LastBag + offsetLC * eFill); + } + s+= eStep * xS4; + } + if (lastBag && e + eR < EP) { + int efill = ALIMAX(e + eR, realDstCount % EP); 
+ dest += efill; + } else { + dest += eDest; + } + offsetLC++; + } + } +} + +template +static void _ArmBasicMNNPackC4ForMatMul_A_L8(int8_t* destOrigin, int8_t const** sourceGroup, const int32_t* info, const int32_t* el) { + int number = info[0]; + int eReal = info[1]; + int eDest = EP; + int offset = info[3]; + const int LP = 8; + int eOutsideStride = info[2] / sizeof(int64_t); + int realDstCount = info[4]; + for (int n=0; n 0) { + int jobsE = realDstCount - eOffset - e; + if (jobsE == 0 || (jobsE < (realDstCount % EP))) { + lastBag = true; + } + } + auto dest = (int64_t*)(destOrigin + lOffset * eDest + eC * info[2] + eR * LP); + auto source = (int64_t*)sourceGroup[n]; + int lRemain = l / LP; + if (lastBag && e + eR < EP) { + int elast = ALIMIN(ALIMAX(eR + e, realDstCount % EP), EP); + dest = (int64_t*)(destOrigin + lOffset * elast + eC * info[2] + eR * LP); + } + int offsetLC = lOffset / LP; + for (int x = 0; x < lRemain; ++x) { + int eRemain = e; + auto d = dest; + auto s = source; + if (1 == offset) { + if (eR > 0) { + int eStep = ALIMIN(eRemain, eS); + ::memcpy(d, s, eStep * sizeof(int64_t)); + eRemain-=eStep; + if (!lastBag ||eRemain >= EP) { + d += (eOutsideStride - eR); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * offsetLC); + d += (eOutsideStride4LastBag - eR + offsetLC * eFill); + } + s += (eS * offset); + } + while (eRemain > 0) { + int eStep = ALIMIN(eDest, eRemain); + ::memcpy(d, s, eStep * sizeof(int64_t)); + eRemain-=eStep; + if (!lastBag || eRemain >= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * offsetLC); + d+= (eOutsideStride4LastBag + offsetLC * eFill); + } + s+= (eStep * offset); + } + } else { + if (eR > 0) { + int eStep = ALIMIN(eRemain, eS); + for (int yi=0; yi= EP) { + d += (eOutsideStride - eR); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * offsetLC); + d += (eOutsideStride4LastBag - eR + offsetLC * eFill); + } + s += eS * offset; + } + while (eRemain > 0) { + int eStep = ALIMIN(eDest, eRemain); + for (int yi=0; yi= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * offsetLC); + d+= (eOutsideStride4LastBag + offsetLC * eFill); + } + s+= eStep * offset; + } + } + source += eReal; + if (lastBag && e + eR < EP ) { // eR=0;eR>0 + int efill = ALIMAX(e + eR, realDstCount % EP); + dest += efill; + } else { + dest += eDest; + } + offsetLC++; + } + } +} + static CoreFunctions* gInstance = nullptr; +static CoreInt8Functions* gArm82CoreInt8Functions = nullptr; bool Arm82Functions::init() { using Vec = MNN::Math::Vec; auto origin = MNNGetCoreFunctions(); #define FUNC_PTR_ASSIGN(dst, src) dst = (decltype(dst))(src) gInstance = new CoreFunctions; + gArm82CoreInt8Functions = new CoreInt8Functions; + *gArm82CoreInt8Functions = *MNNGetInt8CoreFunctions(); + { + if (origin->supportSDot) { + gArm82CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _Arm82MNNPackC4ForMatMul_A<12, 4>; + } + if (origin->supportI8mm) { + gArm82CoreInt8Functions->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L8<10, 8>; + } + } FUNC_PTR_ASSIGN(gInstance->MNNFp32ToFp8, MNNFp32ToFp8); FUNC_PTR_ASSIGN(gInstance->MNNFp16ToFp8, MNNFp16ToFp8); @@ -674,5 +907,8 @@ bool Arm82Functions::init() { CoreFunctions* 
Arm82Functions::get() { return gInstance; } +CoreInt8Functions* Arm82Functions::getInt8() { + return gArm82CoreInt8Functions; +} }; #endif diff --git a/source/backend/arm82/Arm82Functions.hpp b/source/backend/arm82/Arm82Functions.hpp index 3282af972..3c070684e 100644 --- a/source/backend/arm82/Arm82Functions.hpp +++ b/source/backend/arm82/Arm82Functions.hpp @@ -12,6 +12,7 @@ class Arm82Functions { public: static bool init(); static CoreFunctions* get(); + static CoreInt8Functions* getInt8(); }; }; diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantFP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantFP16.S index 01455850d..34f16eb86 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantFP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNDynamicQuantFP16.S @@ -22,8 +22,6 @@ //void MNNDynamicQuantFP16(const float* src, int8_t* dst, const float* scale, size_t src_depth_quad, size_t realSize, int pack) asm_function MNNDynamicQuantFP16 -// Feature: quant and reorder C8->C4 - // x0: src, x1:dst, x2:scale, x3:src_depth_quad, x4:realSize stp d14, d15, [sp, #(-16 * 4)]! stp d12, d13, [sp, #(16 * 1)] @@ -33,21 +31,191 @@ stp d8, d9, [sp, #(16 * 3)] Start: lsl x6, x4, #3 // dst_step = batch * (2*unit) * sizeof(int8_t) = batch * 8 = batch << 3 lsl x7, x4, #4 // src_step = batch * pack * sizeof(float16) = batch * 8 * 2 = batch << 4 -lsl x8, x4, #2 // 4 * plane -add x11, x1, x8 // second N*4 + +TILE_24: +cmp x4, #24 +blt TILE_16 +mov x9, x0 // src +mov x10, x1 // dst +sub x15, x6, #128 +mov x12, x3 // src_depth_quad +sub x13, x7, #320 // src_step - 320 + +ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 +ld1 {v16.4s, v17.4s}, [x2], #32 +fcvtn v12.4h, v12.4s +fcvtn2 v12.8h, v13.4s +fcvtn v13.4h, v14.4s +fcvtn2 v13.8h, v15.4s +fcvtn v14.4h, v16.4s +fcvtn2 v14.8h, v17.4s + +LoopSz_24: +ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x9], #64 +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x9], #64 +ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x9], #64 +ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x9], #64 +ld1 {v19.8h, v20.8h, v21.8h, v22.8h}, [x9], #64 +ld1 {v23.8h, v24.8h, v25.8h, v26.8h}, [x9], x13 + +// float16_t x = x * quant_scale +fmul v0.8h, v0.8h, v12.h[0] +fmul v1.8h, v1.8h, v12.h[1] +fmul v2.8h, v2.8h, v12.h[2] +fmul v3.8h, v3.8h, v12.h[3] +fmul v4.8h, v4.8h, v12.h[4] +fmul v5.8h, v5.8h, v12.h[5] +fmul v6.8h, v6.8h, v12.h[6] +fmul v7.8h, v7.8h, v12.h[7] +fmul v8.8h, v8.8h, v13.h[0] +fmul v9.8h, v9.8h, v13.h[1] +fmul v10.8h, v10.8h, v13.h[2] +fmul v11.8h, v11.8h, v13.h[3] +fmul v15.8h, v15.8h, v13.h[4] +fmul v16.8h, v16.8h, v13.h[5] +fmul v17.8h, v17.8h, v13.h[6] +fmul v18.8h, v18.8h, v13.h[7] + +fmul v19.8h, v19.8h, v14.h[0] +fmul v20.8h, v20.8h, v14.h[1] +fmul v21.8h, v21.8h, v14.h[2] +fmul v22.8h, v22.8h, v14.h[3] +fmul v23.8h, v23.8h, v14.h[4] +fmul v24.8h, v24.8h, v14.h[5] +fmul v25.8h, v25.8h, v14.h[6] +fmul v26.8h, v26.8h, v14.h[7] + +// int16_t x = round(x) +Round v0, v1, v2, v3 +Round v4, v5, v6, v7 +Round v8, v9, v10, v11 +Round v15, v16, v17, v18 +Round v19, v20, v21, v22 +Round v23, v24, v25, v26 + +// y = (int8_t)x +sqxtn v27.8b, v0.8h +sqxtn2 v27.16b, v1.8h +sqxtn v28.8b, v2.8h +sqxtn2 v28.16b, v3.8h +sqxtn v29.8b, v4.8h +sqxtn2 v29.16b, v5.8h +sqxtn v30.8b, v6.8h +sqxtn2 v30.16b, v7.8h +sqxtn v0.8b, v8.8h +sqxtn2 v0.16b, v9.8h +sqxtn v1.8b, v10.8h +sqxtn2 v1.16b, v11.8h +sqxtn v2.8b, v15.8h +sqxtn2 v2.16b, v16.8h +sqxtn v3.8b, v17.8h +sqxtn2 v3.16b, v18.8h +sqxtn v4.8b, v19.8h +sqxtn2 v4.16b, v20.8h +sqxtn v5.8b, v21.8h +sqxtn2 v5.16b, v22.8h +sqxtn v6.8b, v23.8h 
+sqxtn2 v6.16b, v24.8h +sqxtn v7.8b, v25.8h +sqxtn2 v7.16b, v26.8h + +st1 {v27.16b, v28.16b, v29.16b, v30.16b}, [x10], #64 +st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10], #64 +st1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x10], x15 + +subs x12, x12, #1 +bne LoopSz_24 + +Tile24End: +sub x4, x4, #24 // batch -= 24 +add x0, x0, #384 // src += 24 * 8 * sizeof(float16_t) +add x1, x1, #192 // dst += 24 * 8 * sizeof(int8_t) +b TILE_24 + +TILE_16: +cmp x4, #16 +blt TILE_12 +mov x9, x0 // src +mov x10, x1 // dst +sub x15, x6, #64 +mov x12, x3 // src_depth_quad +sub x13, x7, #192 // src_step - 192 + +ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x2], #64 +fcvtn v12.4h, v12.4s +fcvtn2 v12.8h, v13.4s +fcvtn v13.4h, v14.4s +fcvtn2 v13.8h, v15.4s + +LoopSz_16: +ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x9], #64 +ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x9], #64 +ld1 {v8.8h, v9.8h, v10.8h, v11.8h}, [x9], #64 +ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x9], x13 + +// float16_t x = x * quant_scale +fmul v0.8h, v0.8h, v12.h[0] +fmul v1.8h, v1.8h, v12.h[1] +fmul v2.8h, v2.8h, v12.h[2] +fmul v3.8h, v3.8h, v12.h[3] +fmul v4.8h, v4.8h, v12.h[4] +fmul v5.8h, v5.8h, v12.h[5] +fmul v6.8h, v6.8h, v12.h[6] +fmul v7.8h, v7.8h, v12.h[7] +fmul v8.8h, v8.8h, v13.h[0] +fmul v9.8h, v9.8h, v13.h[1] +fmul v10.8h, v10.8h, v13.h[2] +fmul v11.8h, v11.8h, v13.h[3] +fmul v15.8h, v15.8h, v13.h[4] +fmul v16.8h, v16.8h, v13.h[5] +fmul v17.8h, v17.8h, v13.h[6] +fmul v18.8h, v18.8h, v13.h[7] + +// int16_t x = round(x) +Round v0, v1, v2, v3 +Round v4, v5, v6, v7 +Round v8, v9, v10, v11 +Round v15, v16, v17, v18 + +// y = (int8_t)x +sqxtn v19.8b, v0.8h +sqxtn2 v19.16b, v1.8h +sqxtn v20.8b, v2.8h +sqxtn2 v20.16b, v3.8h +sqxtn v21.8b, v4.8h +sqxtn2 v21.16b, v5.8h +sqxtn v22.8b, v6.8h +sqxtn2 v22.16b, v7.8h +sqxtn v23.8b, v8.8h +sqxtn2 v23.16b, v9.8h +sqxtn v24.8b, v10.8h +sqxtn2 v24.16b, v11.8h +sqxtn v25.8b, v15.8h +sqxtn2 v25.16b, v16.8h +sqxtn v26.8b, v17.8h +sqxtn2 v26.16b, v18.8h + +st1 {v19.16b, v20.16b, v21.16b, v22.16b}, [x10], #64 +st1 {v23.16b, v24.16b, v25.16b, v26.16b}, [x10], x15 + +subs x12, x12, #1 +bne LoopSz_16 + +Tile16End: +sub x4, x4, #16 // batch -= 16 +add x0, x0, #256 // src += 16 * 8 * sizeof(float16_t) +add x1, x1, #128 // dst += 16 * 8 * sizeof(int8_t) +b TILE_16 TILE_12: cmp x4, #12 blt TILE_10 mov x9, x0 // src mov x10, x1 // dst -mov x15, x11 // second dst +sub x15, x6, #64 mov x12, x3 // src_depth_quad -sub x13, x7, #128 // src_step - 64 +sub x13, x7, #128 // src_step - 128 -// quant_scale: v12, v13, v14 -// ld1 {v12.8h}, [x2], #16 -// ld1 {v13.d}[0], [x2], #8 ld1 {v12.4s, v13.4s, v14.4s}, [x2], #48 fcvtn v12.4h, v12.4s fcvtn2 v12.8h, v13.4s @@ -78,31 +246,21 @@ Round v4, v5, v6, v7 Round v8, v9, v10, v11 // y = (int8_t)x -sqxtn v0.8b, v0.8h -sqxtn2 v0.16b, v1.8h -sqxtn v1.8b, v2.8h -sqxtn2 v1.16b, v3.8h -sqxtn v2.8b, v4.8h -sqxtn2 v2.16b, v5.8h -sqxtn v3.8b, v6.8h -sqxtn2 v3.16b, v7.8h -sqxtn v4.8b, v8.8h -sqxtn2 v4.16b, v9.8h -sqxtn v5.8b, v10.8h -sqxtn2 v5.16b, v11.8h - -uzp1 v6.4s, v0.4s, v1.4s -uzp1 v7.4s, v2.4s, v3.4s -uzp1 v8.4s, v4.4s, v5.4s -uzp2 v9.4s, v0.4s, v1.4s -uzp2 v10.4s, v2.4s, v3.4s -uzp2 v11.4s, v4.4s, v5.4s - -st1 {v6.16b, v7.16b, v8.16b}, [x10], x6 -st1 {v9.16b, v10.16b, v11.16b}, [x15], x6 - -//st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10], #64 -//st1 {v4.16b, v5.16b}, [x10], x14 +sqxtn v14.8b, v0.8h +sqxtn2 v14.16b, v1.8h +sqxtn v15.8b, v2.8h +sqxtn2 v15.16b, v3.8h +sqxtn v16.8b, v4.8h +sqxtn2 v16.16b, v5.8h +sqxtn v17.8b, v6.8h +sqxtn2 v17.16b, v7.8h +sqxtn v18.8b, v8.8h +sqxtn2 v18.16b, v9.8h +sqxtn v19.8b, v10.8h 
+sqxtn2 v19.16b, v11.8h + +st1 {v14.16b, v15.16b, v16.16b, v17.16b}, [x10], #64 +st1 {v18.16b, v19.16b}, [x10], x15 subs x12, x12, #1 bne LoopSz_12 @@ -110,8 +268,7 @@ bne LoopSz_12 Tile12End: sub x4, x4, #12 // batch -= 12 add x0, x0, #192 // src += 12 * 8 * sizeof(float16_t) -add x1, x1, #48 // dst += 12 * 4 * sizeof(int8_t) -add x11, x11, #48 +add x1, x1, #96 // dst += 12 * 8 * sizeof(int8_t) b TILE_12 TILE_10: @@ -119,7 +276,6 @@ cmp x4, #10 blt TILE_8 mov x9, x0 // src mov x10, x1 // dst -mov x15, x11 // second dst mov x12, x3 // src_depth_quad sub x13, x7, #128 // src_step - 128 sub x14, x6, #32 // dst_step - 32 @@ -168,19 +324,9 @@ sqxtn2 v3.16b, v7.8h sqxtn v4.8b, v8.8h sqxtn2 v4.16b, v9.8h -uzp1 v6.4s, v0.4s, v1.4s // 0 1 2 3 -uzp1 v7.4s, v2.4s, v3.4s // 4 5 6 7 -uzp1 v8.4s, v4.4s, v4.4s // 8 9 8 9 -uzp2 v12.4s, v0.4s, v1.4s -uzp2 v13.4s, v2.4s, v3.4s -uzp2 v14.4s, v4.4s, v4.4s -st1 {v6.16b, v7.16b}, [x10], #32 -st1 {v8.d}[0], [x10], x14 -st1 {v12.16b, v13.16b}, [x15], #32 -st1 {v14.d}[0], [x15], x14 +st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10], #64 +st1 {v4.16b}, [x10], x15 -// st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x10], #64 -// st1 {v4.16b}, [x10], x14 subs x12, x12, #1 bne LoopSz_10 @@ -188,8 +334,7 @@ bne LoopSz_10 Tile10End: sub x4, x4, #10 // batch -= 10 add x0, x0, #160 // src += 10 * 8 * sizeof(float16_t) -add x1, x1, #40 // dst += 10 * 4 * sizeof(int8_t) -add x11, x11, #40 +add x1, x1, #80 // dst += 10 * 4 * sizeof(int8_t) b TILE_10 @@ -199,7 +344,6 @@ blt TILE_1 sub x8, x7, #64 // src_step - 64 mov x9, x0 // src mov x10, x1 // dst -mov x15, x11 // second dst mov x12, x3 // src_depth_quad // quant_scale: v8 @@ -236,13 +380,7 @@ sqxtn2 v11.16b, v5.8h sqxtn v12.8b, v6.8h sqxtn2 v12.16b, v7.8h -uzp1 v6.4s, v9.4s, v10.4s // 0 1 2 3 first -uzp1 v7.4s, v11.4s, v12.4s // 4 5 6 7 -uzp2 v14.4s, v9.4s, v10.4s // 0 1 2 3 second -uzp2 v15.4s, v11.4s, v12.4s // 4 5 6 7 -st1 {v6.16b, v7.16b}, [x10], x6 -st1 {v14.16b, v15.16b}, [x15], x6 -//st1 {v9.16b, v10.16b, v11.16b, v12.16b}, [x10], x6 +st1 {v9.16b, v10.16b, v11.16b, v12.16b}, [x10], x6 subs x12, x12, #1 bne LoopSz_8 @@ -250,8 +388,7 @@ bne LoopSz_8 Tile8End: sub x4, x4, #8 // batch -= 8 add x0, x0, #128 // src += 8 * 8 * sizeof(float16_t) -add x1, x1, #32 // dst += 8 * 4 * sizeof(int8_t) -add x11, x11, #32 +add x1, x1, #64 // dst += 8 * 8 * sizeof(int8_t) b TILE_8 TILE_4: @@ -259,7 +396,6 @@ cmp x4, #4 blt TILE_2 mov x9, x0 // src mov x10, x1 // dst -mov x15, x11 // second dst mov x12, x3 // src_depth_quad // quant_scale: v8 @@ -285,11 +421,7 @@ sqxtn2 v4.16b, v1.8h sqxtn v5.8b, v2.8h sqxtn2 v5.16b, v3.8h -uzp1 v6.4s, v4.4s, v5.4s // 0 1 2 3 first -uzp2 v14.4s, v4.4s, v5.4s // 0 1 2 3 second -st1 {v6.16b}, [x10], x6 -st1 {v14.16b}, [x15], x6 -//st1 {v4.16b, v5.16b}, [x10], x6 +st1 {v4.16b, v5.16b}, [x10], x6 subs x12, x12, #1 bne LoopSz_4 @@ -297,8 +429,7 @@ bne LoopSz_4 Tile4End: sub x4, x4, #4 // batch -= 4 add x0, x0, #64 // src += 4 * 8 * sizeof(float16_t) -add x1, x1, #16 // dst += 4 * 4 * sizeof(int8_t) -add x11, x11, #16 +add x1, x1, #32 // dst += 4 * 8 * sizeof(int8_t) b TILE_4 @@ -307,7 +438,6 @@ cmp x4, #2 blt TILE_1 mov x9, x0 // src mov x10, x1 // dst -mov x15, x11 // second dst mov x12, x3 // src_depth_quad // quant_scale: v8 @@ -330,9 +460,7 @@ fcvtas v1.8h, v1.8h sqxtn v2.8b, v0.8h sqxtn2 v2.16b, v1.8h -st1 {v2.d}[0], [x10], x6 -st1 {v2.d}[1], [x15], x6 -//st1 {v2.16b}, [x10], x6 +st1 {v2.16b}, [x10], x6 subs x12, x12, #1 bne LoopSz_2 @@ -340,8 +468,7 @@ bne LoopSz_2 Tile2End: sub x4, x4, #2 // batch -= 2 add x0, 
x0, #32 // src += 2 * 8 * sizeof(float16_t) -add x1, x1, #8 // dst += 2 * 4 * sizeof(int8_t) -add x11, x11, #8 +add x1, x1, #16 // dst += 2 * 8 * sizeof(int8_t) b TILE_2 @@ -350,7 +477,6 @@ cmp x4, #1 blt End mov x9, x0 // src mov x10, x1 // dst -mov x15, x11 // second dst mov x12, x3 // src_depth_quad // quant_scale: v8 @@ -368,8 +494,7 @@ fcvtas v0.8h, v0.8h // y = (int8_t)x sqxtn v0.8b, v0.8h -st1 {v0.s}[0], [x10], x6 -st1 {v0.s}[1], [x15], x6 +st1 {v0.8b}, [x10], x6 subs x12, x12, #1 bne LoopSz_1 @@ -377,8 +502,7 @@ bne LoopSz_1 Tile1End: sub x4, x4, #1 // batch -= 1 add x0, x0, #16 // src += 1 * 8 * sizeof(float16_t) -add x1, x1, #4 // dst += 1 * 4 * sizeof(int8_t) -add x11, x11, #4 +add x1, x1, #8 // dst += 1 * 8 * sizeof(int8_t) b TILE_1 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S index 2a7cf474f..7e876c92e 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S @@ -114,7 +114,7 @@ ldr x23, [x6, #56] // fp32minmax mov x21, #16 // sizeof(float16_t) * PACK Start: lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT -mov x22, #48 // src_steps +lsl x22, x7, #2 // src_steps ldr x27, [x6, #80] // extra scale TILE_12: cmp x7, #12 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S index decf68d84..c4f8282a8 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S @@ -113,7 +113,7 @@ ldr x23, [x6, #56] // fp32minmax mov x21, #16 // sizeof(float16_t) * PACK Start: lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) -mov x22, #48 // src_steps +lsl x22, x7, #2 // src_steps ldr x27, [x6, #80] // extra scale TILE_12: cmp x7, #12 @@ -572,15 +572,71 @@ L8LoopDz_TILE_1: movi v9.16b, #0 mov x28, x12 + cmp x22, #4 + bne L8LoopSz_TILE_1_lu1 + cmp x13, #4 + blt L8LoopSz_TILE_1_lu1 + cmp x13, #8 + blt L8LoopSz_TILE_1_lu4 + L8LoopSz_TILE_1_lu8: + ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x12], #64 // weight: hu=0,1,2,3,pack=0~7 + ld1 {v10.16b, v11.16b, v12.16b, v13.16b}, [x12], #64 + ld1 {v0.4s, v1.4s}, [x11], #32 // src + + sub x13, x13, #8 + // int4->int8 + ushr v14.16b, v3.16b, #4 + and v22.16b, v3.16b, v7.16b + + ushr v15.16b, v4.16b, #4 + and v23.16b, v4.16b, v7.16b + + ushr v18.16b, v5.16b, #4 + and v24.16b, v5.16b, v7.16b + + ushr v21.16b, v6.16b, #4 + and v25.16b, v6.16b, v7.16b + + ushr v16.16b, v10.16b, #4 + and v17.16b, v10.16b, v7.16b + + ushr v19.16b, v11.16b, #4 + and v20.16b, v11.16b, v7.16b + + ushr v26.16b, v12.16b, #4 + and v27.16b, v12.16b, v7.16b + + ushr v28.16b, v13.16b, #4 + and v29.16b, v13.16b, v7.16b + + cmp x13, #8 + //sub x12, x12, x15 + .inst 0x4f80e1c8 // sdot v8.4s, v14.16b, v0.4b[0] + .inst 0x4f80e2c9 // sdot v9.4s, v22.16b, v0.4b[0] + .inst 0x4fa0e1e8 // sdot v8.4s, v15.16b, v0.4b[1] + .inst 0x4fa0e2e9 // sdot v9.4s, v23.16b, v0.4b[1] + .inst 0x4f80ea48 // sdot v8.4s, v18.16b, v0.4b[2] + .inst 0x4f80eb09 // sdot v9.4s, v24.16b, v0.4b[2] + .inst 0x4fa0eaa8 // sdot v8.4s, v21.16b, v0.4b[3] + .inst 0x4fa0eb29 // sdot v9.4s, v25.16b, v0.4b[3] + + .inst 0x4f81e208 // sdot v8.4s, v16.16b, v1.4b[0] + .inst 
0x4f81e229 // sdot v9.4s, v17.16b, v1.4b[0] + .inst 0x4fa1e268 // sdot v8.4s, v19.16b, v1.4b[1] + .inst 0x4fa1e289 // sdot v9.4s, v20.16b, v1.4b[1] + .inst 0x4f81eb48 // sdot v8.4s, v26.16b, v1.4b[2] + .inst 0x4f81eb69 // sdot v9.4s, v27.16b, v1.4b[2] + .inst 0x4fa1eb88 // sdot v8.4s, v28.16b, v1.4b[3] + .inst 0x4fa1eba9 // sdot v9.4s, v29.16b, v1.4b[3] + bge L8LoopSz_TILE_1_lu8 + + cbz x13, L8LoopSzEnd_TILE_1 cmp x13, #4 blt L8LoopSz_TILE_1_lu1 L8LoopSz_TILE_1_lu4: ld1 {v3.16b, v4.16b, v5.16b, v6.16b}, [x12], #64 // weight: hu=0,1,2,3,pack=0~7 - ld1 {v0.s}[0], [x11], x22 // src - ld1 {v0.s}[1], [x11], x22 - ld1 {v0.s}[2], [x11], x22 - ld1 {v0.s}[3], [x11], x22 + ld1 {v0.4s}, [x11], #16 // src sub x13, x13, #4 // int4->int8 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S index 6602d18b9..382a8733f 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S @@ -152,7 +152,7 @@ ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias ldr x14, [x6, #56] // fp32minmax -mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 +lsl x22, x7, #3 // eDest * GEMM_INT8_SRC_UNIT mov x21, #16 // sizeof(float16_t) * UNIT Start: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S index ea01fef1a..ad7b63e83 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S @@ -132,7 +132,7 @@ ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias ldr x14, [x6, #56] // fp32minmax -mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 +lsl x22, x7, #3 // eDest * GEMM_INT8_SRC_UNIT mov x21, #16 // sizeof(float16_t) * UNIT Start: @@ -771,15 +771,15 @@ LoopDz_TILE_1: movi v18.4s, #0 // oc:4,5,4,5 movi v19.4s, #0 // oc:6,7,6,7 - cmp x13, #4 - blt LoopSz1_TILE_1_lu1 +cmp x22, #8 +bne LoopSz1_TILE_1_lu1 +cmp x13, #4 +blt LoopSz1_TILE_1_lu1 + LoopSz1_TILE_1_lu4: ld1 {v5.16b, v6.16b, v7.16b, v8.16b}, [x12], #64 // weight ld1 {v9.16b, v10.16b, v11.16b, v12.16b}, [x12], #64 - ld1 {v0.8b}, [x11], x22 // src - ld1 {v1.8b}, [x11], x22 - ld1 {v2.8b}, [x11], x22 - ld1 {v3.8b}, [x11], x22 + ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x11], #32 // src // int4->int8 ushr v4.16b, v5.16b, #4 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNQuantScaleFP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNQuantScaleFP16.S index 3c2358402..75c740c3f 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNQuantScaleFP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNQuantScaleFP16.S @@ -31,6 +31,8 @@ stp d8, d9, [sp, #(16 * 3)] Start: movi v31.4s, #127 scvtf v31.4s, v31.4s +fcvtn v30.4h, v31.4s +dup v30.2d, v30.d[0] //fcvtn v31.4h, v0.4s //fcvtn2 v31.8h, v0.4s lsl x9, x4, #1 // src_step = batch * sizeof(float16_t) @@ -65,6 +67,10 @@ add x0, x0, #24 // quant_scale = 127 / absmax // dequant_scale = absmax / 127 +fcmle v28.8h, v0.8h, #0 +fcmle v29.4h, v1.4h, #0 +bit v0.16b, v30.16b, v28.16b +bit v1.16b, v30.16b, v29.16b // float16->float32 fcvtl v4.4s, v0.4h fcvtl2 v5.4s, v0.8h @@ -122,6 +128,10 @@ add x0, x0, #20 // quant_scale = 127 / 
absmax // dequant_scale = absmax / 127 +fcmle v28.8h, v0.8h, #0 +fcmle v29.4h, v1.4h, #0 +bit v0.16b, v30.16b, v28.16b +bit v1.16b, v30.16b, v29.16b // float16->float32 fcvtl v4.4s, v0.4h fcvtl2 v5.4s, v0.8h @@ -140,14 +150,6 @@ st1 {v10.d}[0], [x1], #8 st1 {v12.4s, v13.4s}, [x2], #32 st1 {v14.d}[0], [x2], #8 -// fdiv v4.8h, v31.8h, v0.8h -// fdiv v5.8h, v31.8h, v1.8h -// fdiv v6.8h, v0.8h, v31.8h -// fdiv v7.8h, v1.8h, v31.8h -// st1 {v4.8h}, [x1], #16 -// st1 {v5.s}[0], [x1], #4 -// st1 {v6.8h}, [x2], #16 -// st1 {v7.s}[0], [x2], #4 b TILE_10 @@ -176,6 +178,8 @@ sub x4, x4, #8 add x0, x0, #16 // quant_scale = 127 / absmax // dequant_scale = absmax / 127 +fcmle v28.8h, v0.8h, #0 +bit v0.16b, v30.16b, v28.16b // float16->float32 fcvtl v4.4s, v0.4h fcvtl2 v5.4s, v0.8h @@ -189,10 +193,6 @@ fdiv v13.4s, v5.4s, v31.4s st1 {v8.4s, v9.4s}, [x1], #32 st1 {v12.4s, v13.4s}, [x2], #32 -// fdiv v2.8h, v31.8h, v0.8h -// fdiv v3.8h, v0.8h, v31.8h -// st1 {v2.8h}, [x1], #16 -// st1 {v3.8h}, [x2], #16 b TILE_8 @@ -221,6 +221,8 @@ sub x4, x4, #1 add x0, x0, #2 // quant_scale = 127 / absmax // dequant_scale = absmax / 127 +fcmle v28.8h, v0.8h, #0 +bit v0.16b, v30.16b, v28.16b fcvtl v4.4s, v0.4h fdiv v8.4s, v31.4s, v4.4s @@ -229,10 +231,6 @@ fdiv v12.4s, v4.4s, v31.4s st1 {v8.s}[0], [x1], #4 st1 {v12.s}[0], [x2], #4 -// fdiv h2, h31, h0 -// fdiv h3, h0, h31 -// st1 {v2.h}[0], [x1], #2 -// st1 {v3.h}[0], [x2], #2 b TILE_1 diff --git a/source/backend/coreml/CMakeLists.txt b/source/backend/coreml/CMakeLists.txt index c24d01da0..25563fd09 100644 --- a/source/backend/coreml/CMakeLists.txt +++ b/source/backend/coreml/CMakeLists.txt @@ -10,20 +10,6 @@ ELSE() SET(METAL_SDK_PLAT "macosx") ENDIF() -message(STATUS "Compiling CoreML Metal Kernels with ${METAL_SDK_PLAT} SDK") - -message(STATUS "Generating coreml.metallib at ${PROJECT_BINARY_DIR}/coreml.metallib") - -add_custom_command(OUTPUT ${PROJECT_BINARY_DIR}/coreml.metallib - COMMAND xcrun -sdk ${METAL_SDK_PLAT} - metal "${MNN_COREML_METAL_SRCS}" - -o ${PROJECT_BINARY_DIR}/coreml.metallib - COMMAND_EXPAND_LISTS) - -add_custom_target(MNNCoreMLMetalLib DEPENDS - ${PROJECT_BINARY_DIR}/coreml.metallib - COMMENT "Generating coreml.metallib") - # CoreML file(GLOB MNN_COREML_SRCS ${CMAKE_CURRENT_LIST_DIR}/backend/*.cpp @@ -37,10 +23,10 @@ file(GLOB MNN_COREML_SRCS add_library( MNNCoreML - STATIC + OBJECT ${MNN_COREML_SRCS} - ${MNNCoreMLMetalLib} ) +set_property(TARGET MNNCoreML APPEND_STRING PROPERTY COMPILE_FLAGS "-fobjc-arc") target_include_directories(MNNCoreML PRIVATE ${CMAKE_CURRENT_LIST_DIR}/mlmodel/include @@ -48,4 +34,3 @@ target_include_directories(MNNCoreML PRIVATE ${CMAKE_CURRENT_LIST_DIR}/execution ) -add_dependencies(MNNCoreML MNNCoreMLMetalLib) diff --git a/source/backend/coreml/backend/CoreMLBackend.cpp b/source/backend/coreml/backend/CoreMLBackend.cpp index 8342e68dd..c576abfc8 100644 --- a/source/backend/coreml/backend/CoreMLBackend.cpp +++ b/source/backend/coreml/backend/CoreMLBackend.cpp @@ -35,8 +35,9 @@ namespace MNN { CoreMLBackend::CoreMLBackend(const CoreMLRuntime* runtime) : Backend(MNN_FORWARD_NN) { mNPURuntime = runtime; + mInputBuffer.root = BufferAllocator::Allocator::createDefault(); mPrecision = mNPURuntime->mPrecision; - mCoreMLExecutor.reset(new CoreMLExecutorWrapper); + mCoreMLExecutor.reset(new CoreMLExecutorWrapper(mPrecision)); if (mCoreMLModel_ == nullptr) { mCoreMLModel_.reset(new _CoreML__Specification__Model); core_ml__specification__model__init(mCoreMLModel_.get()); @@ -81,20 +82,11 @@ namespace MNN { Backend::MemObj* 
CoreMLBackend::onAcquire(const Tensor* tensor, StorageType storageType) { bool isInputCopy = TensorUtils::getDescribe(tensor)->usage==Tensor::InsideDescribe::Usage::INPUT; bool isOutputCopy = TensorUtils::getDescribe(tensor)->usage==Tensor::InsideDescribe::Usage::OUTPUT; - // using CvPixelBuffer as input and output - if (mPrecision == BackendConfig::Precision_Low) { - const_cast(tensor)->setType(DataType_DT_UINT8); - } if(isInputCopy){ mInputIdxMap.insert(std::make_pair(tensor, mInputIdxMap.size())); } if(isOutputCopy){ mOutputIdxMap.insert(std::make_pair(tensor, mOutputIdxMap.size())); - if (mPrecision == BackendConfig::Precision_Low) { - TensorUtils::getDescribe(tensor)->memoryType = Tensor::InsideDescribe::MEMORY_HOST; - const_cast(tensor->buffer()).host = (uint8_t*)MNNMemoryAllocAlign(tensor->size(), MNN_MEMORY_ALIGN_DEFAULT); - MNN_ASSERT(tensor->buffer().host != nullptr); - } } // Don't need release return new Backend::MemObj; @@ -105,31 +97,81 @@ namespace MNN { } void CoreMLBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const { + if (nullptr == srcTensor->buffer().host || nullptr == dstTensor->buffer().host) { + MNN_ERROR("[MNN-CoreML]: Invalid copy because not valid input / output\n"); + return; + } bool isInputCopy = TensorUtils::getDescribe(dstTensor)->usage==Tensor::InsideDescribe::Usage::INPUT; bool isOutputCopy = TensorUtils::getDescribe(srcTensor)->usage==Tensor::InsideDescribe::Usage::OUTPUT; - bool isConst = TensorUtils::getDescribe(srcTensor)->usage==Tensor::InsideDescribe::Usage::CONSTANT || TensorUtils::getDescribe(dstTensor)->usage==Tensor::InsideDescribe::Usage::CONSTANT; - - if(isConst){ return; } - + if ((isInputCopy || isOutputCopy) && mPrecision == BackendConfig::Precision_Low) { + // TODO: Fix bug for int8 with nc4hw4 + ::memcpy(dstTensor->host(), srcTensor->host(),TensorUtils::getRawSize(srcTensor) * sizeof(uint8_t)); + return; + } if (isInputCopy) { - const auto iter = mInputIdxMap.find(dstTensor); - MNN_ASSERT(iter != mInputIdxMap.end()); - memcpy((void*)&mInputTensors[iter->second], &srcTensor, sizeof(void*)); - } else if (isOutputCopy) { - // MNN_ASSERT(mOutputIdxMap.find(srcTensor) != mOutputIdxMap.end()); - int srcSize = static_cast(TensorUtils::getRawSize(srcTensor) * srcTensor->getType().bytes()); - memcpy(dstTensor->host(), srcTensor->host(), std::min(srcSize, dstTensor->size())); + if (TensorUtils::getDescribe(dstTensor)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + std::unique_ptr tmp(new Tensor(dstTensor, Tensor::CAFFE, false)); + tmp->buffer().host = dstTensor->buffer().host; + MNNCPUCopyBuffer(srcTensor, tmp.get()); + } else { + MNNCPUCopyBuffer(srcTensor, dstTensor); + } + return; + } + if(isOutputCopy) { + if (TensorUtils::getDescribe(srcTensor)->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + std::unique_ptr tmp(new Tensor(srcTensor, Tensor::CAFFE, false)); + tmp->buffer().host = srcTensor->buffer().host; + MNNCPUCopyBuffer(tmp.get(), dstTensor); + } else { + MNNCPUCopyBuffer(srcTensor, dstTensor); + } } } void CoreMLBackend::onResizeBegin() { mCoreMLLayerPtrs.clear(); } + int CoreMLBackend::getBytes(const halide_type_t& type) { + if (type.code == halide_type_float && mPrecision == BackendConfig::Precision_Low) { + return 1; + } + return type.bytes(); + } ErrorCode CoreMLBackend::onResizeEnd() { + bool useImage = mPrecision == BackendConfig::Precision_Low; + size_t allocSize = 0; + for (auto t : mInputIdxMap) { + allocSize += (TensorUtils::getRawSize(t.first) * getBytes(t.first->getType())); + } + if (useImage) { + for 
(auto t : mOutputIdxMap) { + allocSize += (TensorUtils::getRawSize(t.first) * getBytes(t.first->getType())); + } + } + auto code = mInputBuffer.realloc(allocSize, MNN_MEMORY_ALIGN_DEFAULT); + if (NO_ERROR != code) { + return code; + } + allocSize = 0; + auto ptr = mInputBuffer.current.ptr(); + for (auto tt : mInputIdxMap) { + auto t = (Tensor*)tt.first; + t->buffer().host = ptr + allocSize; + allocSize += (TensorUtils::getRawSize(t) * getBytes(t->getType())); + } + for (auto tt : mOutputIdxMap) { + auto t = (Tensor*)tt.first; + t->buffer().host = ptr + allocSize; + allocSize += (TensorUtils::getRawSize(t) * getBytes(t->getType())); + } return buildModel(); } + bool CoreMLBackend::onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) { + return true; + } std::string CoreMLBackend::getTensorName(const Tensor* t) { const auto& iter = mTensorIdxMap.find(t); @@ -196,6 +238,10 @@ namespace MNN { copyName(&(layer->output[i]), std::move(outputs[i])); } } + void* CoreMLBackend::onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) { + return srcTensor->host(); + } + void CoreMLBackend::setIO(CoreML__Specification__FeatureDescription** describe, const Tensor* t) { auto name = getTensorName(t); auto des = create(); @@ -227,7 +273,6 @@ namespace MNN { *describe = des; } ErrorCode CoreMLBackend::buildModel() { - mInputTensors.resize(mInputIdxMap.size()); mCoreMLModel_->description = create(); core_ml__specification__model_description__init(mCoreMLModel_->description); mCoreMLModel_->description->n_input = mInputIdxMap.size(); @@ -270,12 +315,12 @@ namespace MNN { if (mCoreMLModel_->neuralnetwork->n_layers <= 0) { return; } - std::vector> inputs(mInputTensors.size()), outputs(mOutputIdxMap.size()); + std::vector> inputs(mInputIdxMap.size()), outputs(mOutputIdxMap.size()); // get names for (const auto& iter : mInputIdxMap) { auto t = iter.first; auto idx = iter.second; - inputs[idx].first = mInputTensors[idx]; + inputs[idx].first = t; inputs[idx].second = std::to_string(mTensorIdxMap.find(t)->second); } for (const auto& iter : mOutputIdxMap) { diff --git a/source/backend/coreml/backend/CoreMLBackend.hpp b/source/backend/coreml/backend/CoreMLBackend.hpp index b9136690b..66e6b54a1 100644 --- a/source/backend/coreml/backend/CoreMLBackend.hpp +++ b/source/backend/coreml/backend/CoreMLBackend.hpp @@ -19,6 +19,7 @@ #include "MNN_generated.h" #include "Model.pb-c.h" #include "CoreMLExecutorWrapper.h" +#include "core/BufferAllocator.hpp" namespace MNN { class CoreMLRuntime : public Runtime { @@ -49,6 +50,8 @@ namespace MNN { virtual ~CoreMLBackend(); virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op) override; + virtual void* onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) override; + virtual bool onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) override; virtual void onExecuteBegin() const override; virtual void onExecuteEnd() const override; @@ -104,6 +107,7 @@ namespace MNN { void setLayerOutputs(CoreML__Specification__NeuralNetworkLayer* layer, std::vector&& outputs); void copyName(char** ptr, std::string&& name); int getInOutTensorInfo(std::string modelName); + int getBytes(const halide_type_t& type); class Creator { public: @@ -117,12 +121,12 @@ namespace MNN { std::vector mCoreMLLayerPtrs; std::map mTensorIdxMap, mInputIdxMap, mOutputIdxMap; - std::vector mInputTensors; 
std::vector mModelName; std::vector> mInputData, mOutputData; const CoreMLRuntime* mNPURuntime; BackendConfig::PrecisionMode mPrecision; std::unique_ptr mCoreMLExecutor; + SingleBufferWithAllocator mInputBuffer; }; template diff --git a/source/backend/coreml/backend/CoreMLExecutor.h b/source/backend/coreml/backend/CoreMLExecutor.h index 853578db1..cad0a9d29 100644 --- a/source/backend/coreml/backend/CoreMLExecutor.h +++ b/source/backend/coreml/backend/CoreMLExecutor.h @@ -32,11 +32,12 @@ struct Region { - (bool)build:(NSURL*)modelUrl API_AVAILABLE(ios(11)); - (bool)cleanup; +@property int precision; @property MLModel* model API_AVAILABLE(ios(11)); @property NSString* mlModelFilePath; @property NSString* compiledModelFilePath; @property(nonatomic, readonly) int coreMlVersion; -@property __strong id outputFeature API_AVAILABLE(ios(11)); +@property __strong NSMutableArray* outputArray; @end // RasterLayer diff --git a/source/backend/coreml/backend/CoreMLExecutor.mm b/source/backend/coreml/backend/CoreMLExecutor.mm index 7b664a0cd..1bca430e6 100644 --- a/source/backend/coreml/backend/CoreMLExecutor.mm +++ b/source/backend/coreml/backend/CoreMLExecutor.mm @@ -34,39 +34,14 @@ bool isAvailable() { NSURL* temporaryFileURL = [temporaryDirectoryURL URLByAppendingPathComponent:temporaryFilename]; return temporaryFileURL; } -static id rasterPipeline; -id getRasterPipeline() { - if (rasterPipeline == nil) { - id device = MTLCreateSystemDefaultDevice(); -#if TARGET_OS_IOS - NSString *path = [NSBundle.mainBundle pathForResource:@"coreml" ofType:@"metallib"]; -#else - NSString *path = @"coreml.metallib"; -#endif - NSError* error; - id library = path ? [device newLibraryWithFile:path error:&error] : [device newDefaultLibrary]; - if (error) { - printf("[METAL] create library error: %s\n", error.localizedDescription.UTF8String); - return nullptr; - } - id function = [library newFunctionWithName:@"raster_texture"]; - rasterPipeline = [device newComputePipelineStateWithFunction:function error:&error]; - if (error) { - printf("[METAL] create pipeline error: %s\n", error.localizedDescription.UTF8String); - return nullptr; - } - return rasterPipeline; - } - return rasterPipeline; -} } // namespace @interface MultiArrayFeatureProvider : NSObject { - const std::vector>* _inputs; + NSMutableDictionary* _inputs; NSSet* _featureNames; } -- (instancetype)initWithInputs:(const std::vector>*)inputs +- (instancetype)initWithInputs:(const std::vector>*)inputs useImage:(bool)useImage coreMlVersion:(int)coreMlVersion; - (MLFeatureValue*)featureValueForName:(NSString*)featureName API_AVAILABLE(ios(11)); - (NSSet*)featureNames; @@ -77,34 +52,30 @@ - (MLFeatureValue*)featureValueForName:(NSString*)featureName API_AVAILABLE(ios( @implementation MultiArrayFeatureProvider -- (instancetype)initWithInputs:(const std::vector>*)inputs +- (instancetype)initWithInputs:(const std::vector>*)inputs useImage:(bool)useImage coreMlVersion:(int)coreMlVersion { self = [super init]; - _inputs = inputs; + _inputs = [NSMutableDictionary dictionaryWithCapacity:inputs->size()]; _coreMlVersion = coreMlVersion; - for (auto& input : *_inputs) { - if (input.second.empty()) { - return nil; - } - } - return self; -} - -- (NSSet*)featureNames { - if (_featureNames == nil) { - NSMutableArray* names = [[NSMutableArray alloc] init]; - for (auto& input : *_inputs) { - [names addObject:[NSString stringWithCString:input.second.c_str() - encoding:[NSString defaultCStringEncoding]]]; - } - _featureNames = [NSSet setWithArray:names]; - } - return _featureNames; -} 
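The replacement MultiArrayFeatureProvider added just below builds every MLFeatureValue once inside the initializer and then answers -featureValueForName: straight from a dictionary, instead of re-creating the value on every lookup the way the removed methods above did. A rough, self-contained C++ analogue of that caching pattern is sketched here; the class and member names are hypothetical illustrations only and are not part of the patch.

#include <map>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for MLFeatureValue and the tensor memory it wraps.
struct FeatureValue {
    const void* data = nullptr;   // borrowed pointer to the tensor's host memory
    bool valid = false;
};

class CachedFeatureProvider {
public:
    // 'inputs' mirrors the ctor argument in the patch: (tensor host pointer, feature name) pairs.
    explicit CachedFeatureProvider(const std::vector<std::pair<const void*, std::string>>& inputs) {
        for (const auto& in : inputs) {
            // Wrap each input exactly once; a failure here aborts construction,
            // mirroring the early "return nil" in the Objective-C initializer.
            FeatureValue value{in.first, in.first != nullptr};
            if (!value.valid) { mOk = false; return; }
            mValues.emplace(in.second, value);
            mNames.push_back(in.second);
        }
    }
    bool ok() const { return mOk; }
    const std::vector<std::string>& featureNames() const { return mNames; }
    // Pure lookup; no per-call re-creation of the feature value.
    const FeatureValue* featureValueForName(const std::string& name) const {
        auto it = mValues.find(name);
        return it == mValues.end() ? nullptr : &it->second;
    }
private:
    std::map<std::string, FeatureValue> mValues;
    std::vector<std::string> mNames;
    bool mOk = true;
};

Doing the work in the initializer also means an invalid input (for example a pixel buffer or multi-array that cannot be created) is detected before prediction starts, rather than in the middle of a feature lookup.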
- -- (MLFeatureValue*)featureValueForName:(NSString*)featureName { - for (auto& input : *_inputs) { - if ([featureName cStringUsingEncoding:NSUTF8StringEncoding] == input.second) { + _featureNames = nil; + NSMutableArray* names = [[NSMutableArray alloc] init]; + for (auto& input : *inputs) { + MLFeatureValue* value = nil; + auto tensor = input.first; + NSError* error = nil; + NSString* name = [NSString stringWithCString:input.second.c_str() encoding:[NSString defaultCStringEncoding]]; + if (useImage) { + CVPixelBufferRef pixelBuffer = NULL; + OSType pixelFormat = kCVPixelFormatType_OneComponent8; + size_t bytePerRow = tensor->width(); + CVReturn status = CVPixelBufferCreateWithBytes(nil, tensor->width(), tensor->height(), pixelFormat, + tensor->host(), bytePerRow, nil, nil, nil, &pixelBuffer); + if (status != kCVReturnSuccess) { + NSLog(@"Failed to create CVPixelBufferRef for feature %@", name); + return nil; + } + value = [MLFeatureValue featureValueWithPixelBuffer:pixelBuffer]; + } else { auto input_shape = input.first->shape(); NSMutableArray* shape = [NSMutableArray arrayWithCapacity:input_shape.size()]; NSMutableArray* strides = [NSMutableArray arrayWithCapacity:input_shape.size()]; @@ -120,37 +91,30 @@ - (MLFeatureValue*)featureValueForName:(NSString*)featureName { [shape addObject:@(input_shape[i])]; [strides addObject:@(stridesDim[i])]; } - auto tensor = input.first; - if (tensor->getType() == halide_type_of()) { - CVPixelBufferRef pixelBuffer = NULL; - OSType pixelFormat = kCVPixelFormatType_OneComponent8; - size_t bytePerRow = tensor->width(); - CVReturn status = CVPixelBufferCreateWithBytes(nil, tensor->width(), tensor->height(), pixelFormat, - tensor->host(), bytePerRow, nil, nil, nil, &pixelBuffer); - if (status != kCVReturnSuccess) { - NSLog(@"Failed to create CVPixelBufferRef for feature %@", featureName); - return nil; - } - auto* mlFeatureValue = [MLFeatureValue featureValueWithPixelBuffer:pixelBuffer]; - return mlFeatureValue; - } else { - NSError* error = nil; - MLMultiArray* mlArray = [[MLMultiArray alloc] initWithDataPointer:tensor->host() - shape:shape - dataType:MLMultiArrayDataTypeFloat32 - strides:strides - deallocator:(^(void* bytes){})error:&error]; - if (error != nil) { - NSLog(@"Failed to create MLMultiArray for feature %@ error: %@", featureName, [error localizedDescription]); - return nil; - } - auto* mlFeatureValue = [MLFeatureValue featureValueWithMultiArray:mlArray]; - return mlFeatureValue; + MLMultiArray* mlArray = [[MLMultiArray alloc] initWithDataPointer:tensor->host() + shape:shape + dataType:MLMultiArrayDataTypeFloat32 + strides:strides + deallocator:(^(void* bytes){})error:&error]; + if (error != nil) { + NSLog(@"Failed to create MLMultiArray for feature %@ error: %@", name, [error localizedDescription]); + return nil; } + value= [MLFeatureValue featureValueWithMultiArray:mlArray]; } + [names addObject:name]; + [_inputs setValue:value forKey:(name)]; } - NSLog(@"Feature %@ not found", featureName); - return nil; + _featureNames = [NSSet setWithArray:names]; + return self; +} + +- (NSSet*)featureNames { + return _featureNames; +} + +- (MLFeatureValue*)featureValueForName:(NSString*)featureName { + return _inputs[featureName]; } @end @@ -160,16 +124,20 @@ - (bool)invokeWithInputs:(const std::vector(output.first)->buffer().host = (unsigned char*)data.dataPointer; - } + } } inputFeature = nil; } @@ -269,9 +238,6 @@ @implementation RasterLayer - (instancetype)initWithParameterDictionary:(NSDictionary *)parameters error:(NSError * _Nullable *)error { self 
= [super init]; -#ifdef COREML_METAL_RASTER - pipeline = getRasterPipeline(); -#endif return self; } - (void) setRegionSampler @@ -428,31 +394,6 @@ - (BOOL)evaluateOnCPUWithInputs:(NSArray *)inputs return YES; } -// TODO: raster in metal with texture -#ifdef COREML_METAL_RASTER -// execute on gpu -- (BOOL)encodeToCommandBuffer:(id)commandBuffer - inputs:(NSArray> *)inputs - outputs:(NSArray> *)outputs - error:(NSError **)error { - printf("Raster GPU execute\n"); - id outputBuffer = [ outputs[0] buffer]; - NSLog(@"in -> %@", inputs[0]); - NSLog(@"out -> %@", outputs[0]); - id encoder = [commandBuffer computeCommandEncoder]; - [encoder setComputePipelineState:pipeline]; - for (int i = 0; i < inputs.count; i++) { - [encoder setTexture:inputs[i] atIndex:0]; - [encoder setTexture:outputs[0] atIndex:1]; - [encoder setBytes:&samplers[i] length:sizeof(SamplerInfo) atIndex:0]; - std::pair group = [self computeBestGroupAndLocal:samplers[i]]; - [encoder dispatchThreadgroups:group.first threadsPerThreadgroup:group.second]; - } - // [encoder endEncoding]; - return YES; - -} -#endif @end @implementation DumpLayer diff --git a/source/backend/coreml/backend/CoreMLExecutorWrapper.h b/source/backend/coreml/backend/CoreMLExecutorWrapper.h index b448c6fa4..4a722f1f6 100644 --- a/source/backend/coreml/backend/CoreMLExecutorWrapper.h +++ b/source/backend/coreml/backend/CoreMLExecutorWrapper.h @@ -18,7 +18,7 @@ namespace MNN { class CoreMLExecutorWrapper { public: - CoreMLExecutorWrapper(); + CoreMLExecutorWrapper(int precision); ~CoreMLExecutorWrapper(); bool compileModel(CoreML__Specification__Model* model); void invokModel(const std::vector>& inputs, diff --git a/source/backend/coreml/backend/CoreMLExecutorWrapper.mm b/source/backend/coreml/backend/CoreMLExecutorWrapper.mm index 6fd9c81b0..5d3ff7377 100644 --- a/source/backend/coreml/backend/CoreMLExecutorWrapper.mm +++ b/source/backend/coreml/backend/CoreMLExecutorWrapper.mm @@ -20,16 +20,20 @@ return (__bridge CoreMLExecutor*)ptr; } -CoreMLExecutorWrapper::CoreMLExecutorWrapper() { +CoreMLExecutorWrapper::CoreMLExecutorWrapper(int precision) { if (mCoreMLExecutorPtr == nullptr) { mCoreMLExecutorPtr = (__bridge_retained void*)[[CoreMLExecutor alloc] init]; + auto executor = getCoreMLExecutoreRef(mCoreMLExecutorPtr); + executor.precision = precision; } } CoreMLExecutorWrapper::~CoreMLExecutorWrapper() { - auto executor = getCoreMLExecutoreOwn(mCoreMLExecutorPtr); - (void)executor; - mCoreMLExecutorPtr = nullptr; + @autoreleasepool { + auto executor = getCoreMLExecutoreOwn(mCoreMLExecutorPtr); + (void)executor; + executor = nullptr; + } } bool CoreMLExecutorWrapper::compileModel(CoreML__Specification__Model* model) { diff --git a/source/backend/coreml/backend/CoreMLOPRegister.cpp b/source/backend/coreml/backend/CoreMLOPRegister.cpp index 2b19e80af..a1cfccdc7 100644 --- a/source/backend/coreml/backend/CoreMLOPRegister.cpp +++ b/source/backend/coreml/backend/CoreMLOPRegister.cpp @@ -1,5 +1,6 @@ // This file is generated by Shell for ops register namespace MNN { +extern void ___CoreMLRelu6__OpType_ReLU6__(); extern void ___CoreMLReduction__OpType_Reduction__(); extern void ___CoreMLBinary__OpType_BinaryOp__(); extern void ___CoreMLBinary__OpType_Eltwise__(); @@ -7,20 +8,23 @@ extern void ___CoreMLArgMax__OpType_ArgMax__(); extern void ___CoreMLConvolution__OpType_Convolution__(); extern void ___CoreMLConvolution__OpType_ConvolutionDepthwise__(); extern void ___CoreMLConvolution__OpType_Deconvolution__(); +extern void 
___CoreMLConvolution__OpType_DeconvolutionDepthwise__(); extern void ___CoreMLInterp__OpType_Interp__(); extern void ___CoreMLLayerNorm__OpType_LayerNorm__(); extern void ___CoreMLUnary__OpType_UnaryOp__(); +extern void ___CoreMLMatMul__OpType_BatchMatMul__(); +extern void ___CoreMLMatMul__OpType_MatMul__(); extern void ___CoreMLScale__OpType_Scale__(); extern void ___CoreMLPool__OpType_Pooling__(); extern void ___CoreMLRaster__OpType_Raster__(); extern void ___CoreMLActivation__OpType_ReLU__(); -extern void ___CoreMLActivation__OpType_ReLU6__(); extern void ___CoreMLActivation__OpType_ELU__(); extern void ___CoreMLActivation__OpType_PReLU__(); extern void ___CoreMLActivation__OpType_Sigmoid__(); extern void ___CoreMLActivation__OpType_Softmax__(); void registerCoreMLOps() { +___CoreMLRelu6__OpType_ReLU6__(); ___CoreMLReduction__OpType_Reduction__(); ___CoreMLBinary__OpType_BinaryOp__(); ___CoreMLBinary__OpType_Eltwise__(); @@ -28,14 +32,16 @@ ___CoreMLArgMax__OpType_ArgMax__(); ___CoreMLConvolution__OpType_Convolution__(); ___CoreMLConvolution__OpType_ConvolutionDepthwise__(); ___CoreMLConvolution__OpType_Deconvolution__(); +___CoreMLConvolution__OpType_DeconvolutionDepthwise__(); ___CoreMLInterp__OpType_Interp__(); ___CoreMLLayerNorm__OpType_LayerNorm__(); ___CoreMLUnary__OpType_UnaryOp__(); +___CoreMLMatMul__OpType_BatchMatMul__(); +___CoreMLMatMul__OpType_MatMul__(); ___CoreMLScale__OpType_Scale__(); ___CoreMLPool__OpType_Pooling__(); ___CoreMLRaster__OpType_Raster__(); ___CoreMLActivation__OpType_ReLU__(); -___CoreMLActivation__OpType_ReLU6__(); ___CoreMLActivation__OpType_ELU__(); ___CoreMLActivation__OpType_PReLU__(); ___CoreMLActivation__OpType_Sigmoid__(); diff --git a/source/backend/coreml/backend/CoreMLRaster.metal b/source/backend/coreml/backend/CoreMLRaster.metal deleted file mode 100644 index cd9e9fc66..000000000 --- a/source/backend/coreml/backend/CoreMLRaster.metal +++ /dev/null @@ -1,39 +0,0 @@ -// -// CoreMLRaster.metal -// MNN -// -// Created by MNN on 2021/04/26. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#include -using namespace metal; - -struct SamplerInfo { - uint4 stride; //stride[3] + offset - uint4 size; //size[3] + totalSize - uint4 extent; //dstStride[3]+dstOffset - uint4 imageSize; -}; - -kernel void raster_texture(texture2d_array in [[texture(0)]], - texture2d_array out [[texture(1)]], - constant SamplerInfo &info [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) { - uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w; - uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w; - // out[int(dstOffset)] = in[int(srcOffset)]; - // do raster on texture - } -} - -kernel void raster(const device int *in [[buffer(0)]], - device int *out [[buffer(1)]], - constant SamplerInfo &info [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) { - uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w; - uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w; - out[int(dstOffset)] = in[int(srcOffset)]; - } -} diff --git a/source/backend/coreml/execution/CoreMLActivation.cpp b/source/backend/coreml/execution/CoreMLActivation.cpp index 10b8d1958..d6e70bc3c 100644 --- a/source/backend/coreml/execution/CoreMLActivation.cpp +++ b/source/backend/coreml/execution/CoreMLActivation.cpp @@ -35,38 +35,6 @@ ErrorCode CoreMLActivation::onResize(const std::vector &inputs, const core_ml__specification__activation_leaky_re_lu__init(mLayer_->activation->leakyrelu); mLayer_->activation->leakyrelu->alpha = mOp->main_as_Relu()->slope(); break; - case OpType_ReLU6: - { - // relu + threshold - auto reluLayer = mCoreMLBackend->create(); - core_ml__specification__neural_network_layer__init(reluLayer); - mCoreMLBackend->setLayerName(reluLayer, "relu6-relu"); - reluLayer->activation = mCoreMLBackend->create(); - reluLayer->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_RE_LU; - reluLayer->activation->relu = mCoreMLBackend->create(); - core_ml__specification__activation_re_lu__init(reluLayer->activation->relu); - std::string reluOutput = mCoreMLBackend->getTensorName(inputs[0]) + "-relu"; - setLayerInputsAndOutputs(reluLayer, {mCoreMLBackend->getTensorName(inputs[0])}, {reluOutput}); - mCoreMLBackend->addLayer(reluLayer); - - auto thresholdLayer = mCoreMLBackend->create(); - core_ml__specification__neural_network_layer__init(thresholdLayer); - mCoreMLBackend->setLayerName(thresholdLayer, "relu6-threshold"); - thresholdLayer->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_UNARY; - thresholdLayer->unary = mCoreMLBackend->create(); - core_ml__specification__unary_function_layer_params__init(thresholdLayer->unary); - thresholdLayer->unary->type = CORE_ML__SPECIFICATION__UNARY_FUNCTION_LAYER_PARAMS__OPERATION__THRESHOLD; - thresholdLayer->unary->alpha = -6; - thresholdLayer->unary->scale = -1; - inputName = reluOutput + "-threshold"; - setLayerInputsAndOutputs(thresholdLayer, {reluOutput}, {inputName}); - mCoreMLBackend->addLayer(thresholdLayer); - - mLayer_->activation->linear = mCoreMLBackend->create(); - core_ml__specification__activation_linear__init(mLayer_->activation->linear); - mLayer_->activation->linear->alpha = -1; - break; - } case OpType_ELU: 
mLayer_->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_ELU; mLayer_->activation->elu = mCoreMLBackend->create(); @@ -74,6 +42,13 @@ ErrorCode CoreMLActivation::onResize(const std::vector &inputs, const break; case OpType_PReLU: { + if (mOp->main_as_PRelu()->slopeCount() == 1) { + mLayer_->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_LEAKY_RE_LU; + mLayer_->activation->leakyrelu = mCoreMLBackend->create(); + core_ml__specification__activation_leaky_re_lu__init(mLayer_->activation->leakyrelu); + mLayer_->activation->leakyrelu->alpha = mOp->main_as_PRelu()->slope()->data()[0]; + break; + } mLayer_->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_PRE_LU; mLayer_->activation->prelu = mCoreMLBackend->create(); core_ml__specification__activation_pre_lu__init(mLayer_->activation->prelu); @@ -100,7 +75,6 @@ ErrorCode CoreMLActivation::onResize(const std::vector &inputs, const } REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_ReLU) -REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_ReLU6) REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_ELU) REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_PReLU) REGISTER_COREML_OP_CREATOR(CoreMLActivation, OpType_Sigmoid) diff --git a/source/backend/coreml/execution/CoreMLBinary.cpp b/source/backend/coreml/execution/CoreMLBinary.cpp index 9595f52ca..c81340643 100644 --- a/source/backend/coreml/execution/CoreMLBinary.cpp +++ b/source/backend/coreml/execution/CoreMLBinary.cpp @@ -7,6 +7,7 @@ // #include "CoreMLBinary.hpp" +#include "core/TensorUtils.hpp" namespace MNN { @@ -40,21 +41,25 @@ ErrorCode CoreMLBinary::onResize(const std::vector &inputs, const std: bool oneInput = false; float constVal = 0.f; const Tensor* input = nullptr; - if (TensorUtils::getDescribe(inputs[0])->usage == Tensor::InsideDescribe::CONSTANT) { + if (TensorUtils::getDescribe(inputs[0])->usage == Tensor::InsideDescribe::CONSTANT && 1 == TensorUtils::getRawSize(inputs[0])) { constVal = inputs[0]->host()[0]; input = inputs[1]; - } else if (TensorUtils::getDescribe(inputs[1])->usage == Tensor::InsideDescribe::CONSTANT) { + } else if (TensorUtils::getDescribe(inputs[1])->usage == Tensor::InsideDescribe::CONSTANT && 1 == TensorUtils::getRawSize(inputs[1])) { constVal = inputs[1]->host()[0]; input = inputs[0]; } switch (binaryType) { case BinaryOpOperation_ADD: - mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ADD; - mLayer_->add = mCoreMLBackend->create(); - core_ml__specification__add_layer_params__init(mLayer_->add); if (input) { + mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ADD; + mLayer_->add = mCoreMLBackend->create(); + core_ml__specification__add_layer_params__init(mLayer_->add); mLayer_->add->alpha = constVal; oneInput = true; + } else { + mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ADD_BROADCASTABLE; + mLayer_->addbroadcastable = mCoreMLBackend->create(); + core_ml__specification__add_broadcastable_layer_params__init(mLayer_->addbroadcastable); } break; case BinaryOpOperation_SUB: @@ -75,12 +80,16 @@ ErrorCode CoreMLBinary::onResize(const std::vector &inputs, const std: } break; case BinaryOpOperation_MUL: - mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_MULTIPLY; - mLayer_->multiply = mCoreMLBackend->create(); - core_ml__specification__multiply_layer_params__init(mLayer_->multiply); if (input) { + 
mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_MULTIPLY; + mLayer_->multiply = mCoreMLBackend->create(); + core_ml__specification__multiply_layer_params__init(mLayer_->multiply); mLayer_->multiply->alpha = constVal; oneInput = true; + } else { + mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_MULTIPLY_BROADCASTABLE; + mLayer_->multiplybroadcastable = mCoreMLBackend->create<_CoreML__Specification__MultiplyBroadcastableLayerParams>(); + core_ml__specification__multiply_broadcastable_layer_params__init(mLayer_->multiplybroadcastable); } break; case BinaryOpOperation_DIV: diff --git a/source/backend/coreml/execution/CoreMLConvolution.cpp b/source/backend/coreml/execution/CoreMLConvolution.cpp index 7e1a22fb6..722d6cd8c 100644 --- a/source/backend/coreml/execution/CoreMLConvolution.cpp +++ b/source/backend/coreml/execution/CoreMLConvolution.cpp @@ -6,13 +6,15 @@ // Copyright © 2018, Alibaba Group Holding Limited // +#include +#include "core/ConvolutionCommon.hpp" #include "CoreMLConvolution.hpp" namespace MNN { CoreMLConvolution::CoreMLConvolution(MNN::Backend *b, const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) : CoreMLCommonExecution(b, op) { - isDeconv = op->type() == OpType_Deconvolution; + isDeconv = op->type() == OpType_Deconvolution || op->type() == OpType_DeconvolutionDepthwise; initLayer(); } @@ -47,21 +49,17 @@ void CoreMLConvolution::loadWeightBias(const std::vector &inputs) { biasPtr = conv2D->bias()->data(); } -void CoreMLConvolution::addPadLayer(const Tensor * input, const Convolution2DCommon* common) { - MNN_ASSERT(common->padMode() == PadMode_CAFFE); - int top, left, bottom, right; - if (nullptr != common->pads()) { - MNN_ASSERT(common->pads()->size() >= 4); - top = common->pads()->Get(0); - left = common->pads()->Get(1); - bottom = common->pads()->Get(2); - right = common->pads()->Get(3); +void CoreMLConvolution::addPadLayer(const Tensor * input, const Tensor * output, const Convolution2DCommon* common) { + std::pair pads; + if (isDeconv) { + pads = ConvolutionCommon::convolutionTransposePad(input, output, common); } else { - top = common->padY(); - left = common->padX(); - bottom = common->padY(); - right = common->padX(); + pads = ConvolutionCommon::convolutionPad(input, output, common); } + int top = pads.second; + int left = pads.first; + int bottom = pads.second; + int right = pads.first; if (top == 0 && left == 0 && bottom == 0 && right == 0) { return; } @@ -69,32 +67,10 @@ void CoreMLConvolution::addPadLayer(const Tensor * input, const Convolution2DCom isSamePadding = true; return; } - if (!isDeconv && outputWidth == UP_DIV(inputWidth, common->strideX()) && outputHeight == UP_DIV(outputHeight, common->strideY())) { + if (!isDeconv && outputWidth == UP_DIV(inputWidth, common->strideX()) && outputHeight == UP_DIV(inputHeight, common->strideY())) { isSamePadding = true; return; } - if (isDeconv) { - int ky = common->kernelY(); - int kx = common->kernelX(); - int sy = common->strideY(); - int sx = common->strideX(); - int pad_out_height = (outputHeight - ky) / sy + 1; - int pad_out_width = (outputWidth - kx) / sx + 1; - top = (pad_out_height - inputHeight) / 2; - bottom = (pad_out_height - inputHeight) - top; - left = (pad_out_width - inputWidth) / 2; - right = (pad_out_width - inputWidth) - left; - - if (top < 0 || bottom < 0 || left < 0 || right < 0) { - isSamePadding = true; - pad_out_width = outputWidth / sx; - pad_out_height = outputHeight / sy; - bottom = 0; - top = pad_out_height - 
inputHeight; - right = 0; - left = pad_out_width - inputWidth; - } - } std::string layerName = "ConvPadding-" + mConvInputName; auto paddingLayer = mCoreMLBackend->create(); core_ml__specification__neural_network_layer__init(paddingLayer); @@ -132,6 +108,7 @@ ErrorCode CoreMLConvolution::onResize(const std::vector &inputs, const outputWidth = outputs[0]->width(); outputHeight = outputs[0]->height(); loadWeightBias(inputs); + isSamePadding = false; auto conv2D = mOp->main_as_Convolution2D(); auto common = conv2D->common(); auto kernelX = common->kernelX(); @@ -156,6 +133,12 @@ ErrorCode CoreMLConvolution::onResize(const std::vector &inputs, const mLayer_->convolution->dilationfactor = mCoreMLBackend->create(mLayer_->convolution->n_dilationfactor); mLayer_->convolution->dilationfactor[0] = dilateY; mLayer_->convolution->dilationfactor[1] = dilateX; + if (isDeconv) { + mLayer_->convolution->n_outputshape = 2; + mLayer_->convolution->outputshape = mCoreMLBackend->create(2); + mLayer_->convolution->outputshape[0] = outputHeight; + mLayer_->convolution->outputshape[1] = outputWidth; + } switch (padMod) { case PadMode_SAME: mLayer_->convolution->convolution_padding_type_case = CORE_ML__SPECIFICATION__CONVOLUTION_LAYER_PARAMS__CONVOLUTION_PADDING_TYPE_SAME; @@ -168,11 +151,12 @@ ErrorCode CoreMLConvolution::onResize(const std::vector &inputs, const core_ml__specification__valid_padding__init(mLayer_->convolution->valid); break; case PadMode_CAFFE: - addPadLayer(inputs[0], common); + addPadLayer(inputs[0], outputs[0], common); if (isSamePadding){ mLayer_->convolution->convolution_padding_type_case = CORE_ML__SPECIFICATION__CONVOLUTION_LAYER_PARAMS__CONVOLUTION_PADDING_TYPE_SAME; mLayer_->convolution->same = mCoreMLBackend->create(); core_ml__specification__same_padding__init(mLayer_->convolution->same); + mLayer_->convolution->same->asymmetrymode = CORE_ML__SPECIFICATION__SAME_PADDING__SAME_PADDING_MODE__TOP_LEFT_HEAVY; break; } else { mLayer_->convolution->convolution_padding_type_case = CORE_ML__SPECIFICATION__CONVOLUTION_LAYER_PARAMS__CONVOLUTION_PADDING_TYPE_VALID; @@ -183,9 +167,11 @@ ErrorCode CoreMLConvolution::onResize(const std::vector &inputs, const default: break; } - - int inputCount = weightSize / (kernelX * kernelY * outputCount); - mLayer_->convolution->kernelchannels = inputCount; + if (isDeconv) { + mLayer_->convolution->kernelchannels = inputs[0]->channel(); + } else { + mLayer_->convolution->kernelchannels = weightSize / (kernelX * kernelY * outputCount); + } mLayer_->convolution->outputchannels = outputCount; mLayer_->convolution->n_kernelsize = 2; mLayer_->convolution->kernelsize = mCoreMLBackend->create(mLayer_->convolution->n_kernelsize); @@ -214,12 +200,16 @@ ErrorCode CoreMLConvolution::onResize(const std::vector &inputs, const auto reluLayer = mCoreMLBackend->create(); core_ml__specification__neural_network_layer__init(reluLayer); mCoreMLBackend->setLayerName(reluLayer, "ConvRelu"); - reluLayer->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ACTIVATION; - reluLayer->activation = mCoreMLBackend->create(); - core_ml__specification__activation_params__init(reluLayer->activation); - reluLayer->activation->nonlinearity_type_case = CORE_ML__SPECIFICATION__ACTIVATION_PARAMS__NONLINEARITY_TYPE_RE_LU; - reluLayer->activation->relu = mCoreMLBackend->create(); - core_ml__specification__activation_re_lu__init(reluLayer->activation->relu); + reluLayer->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_CLIP; + reluLayer->clip = mCoreMLBackend->create(); 
+ core_ml__specification__clip_layer_params__init(reluLayer->clip); + if (common->relu()) { + reluLayer->clip->minval = 0.0f; + reluLayer->clip->maxval = FLT_MAX; + } else { + reluLayer->clip->minval = 0.0f; + reluLayer->clip->maxval = 6.0f; + } setLayerInputsAndOutputs(reluLayer, {mConvOutputName}, {mCoreMLBackend->getTensorName(outputs[0])}); mCoreMLBackend->addLayer(reluLayer); } @@ -229,4 +219,5 @@ ErrorCode CoreMLConvolution::onResize(const std::vector &inputs, const REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_Convolution) REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_ConvolutionDepthwise) REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_Deconvolution) +REGISTER_COREML_OP_CREATOR(CoreMLConvolution, OpType_DeconvolutionDepthwise) } // namespace MNN diff --git a/source/backend/coreml/execution/CoreMLConvolution.hpp b/source/backend/coreml/execution/CoreMLConvolution.hpp index 180e3f98d..2ed03c14d 100644 --- a/source/backend/coreml/execution/CoreMLConvolution.hpp +++ b/source/backend/coreml/execution/CoreMLConvolution.hpp @@ -22,7 +22,7 @@ class CoreMLConvolution : public CoreMLCommonExecution { virtual ~CoreMLConvolution() = default; private: void loadWeightBias(const std::vector &inputs); - void addPadLayer(const Tensor * input, const Convolution2DCommon* common); + void addPadLayer(const Tensor * input, const Tensor* output, const Convolution2DCommon* common); std::string mConvInputName, mConvOutputName; std::shared_ptr quanCommon; const float *weightPtr, *biasPtr; diff --git a/source/backend/coreml/execution/CoreMLMatMul.cpp b/source/backend/coreml/execution/CoreMLMatMul.cpp new file mode 100644 index 000000000..24c767b8d --- /dev/null +++ b/source/backend/coreml/execution/CoreMLMatMul.cpp @@ -0,0 +1,57 @@ +// +// CoreMLMatMul.cpp +// MNN +// +// Created by MNN on 2021/03/24. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "CoreMLMatMul.hpp" +namespace MNN { + +static void _makeMatMul() { + +} +CoreMLMatMul::CoreMLMatMul(MNN::Backend *b, const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) : CoreMLCommonExecution(b, op) { + initLayer(); +} + +ErrorCode CoreMLMatMul::onResize(const std::vector &inputs, const std::vector &outputs) { + auto outputName = mCoreMLBackend->getTensorName(outputs[0]); + std::string matmulOutput = outputName; + if (inputs.size() > 2) { + // Has Bias + matmulOutput = matmulOutput + "--matmul"; + } + mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_BATCHED_MATMUL; + mLayer_->batchedmatmul = mCoreMLBackend->create(); + core_ml__specification__batched_mat_mul_layer_params__init(mLayer_->batchedmatmul); + if (mOp->main_type() == OpParameter_MatMul) { + mLayer_->batchedmatmul->transposea = mOp->main_as_MatMul()->transposeA(); + mLayer_->batchedmatmul->transposeb = mOp->main_as_MatMul()->transposeB(); + } else if (mOp->main_type() == OpParameter_BatchMatMulParam) { + mLayer_->batchedmatmul->transposea = mOp->main_as_BatchMatMulParam()->adjX(); + mLayer_->batchedmatmul->transposeb = mOp->main_as_BatchMatMulParam()->adjY(); + } + setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0]), mCoreMLBackend->getTensorName(inputs[1])}, {matmulOutput}); + mCoreMLBackend->setLayerName(mLayer_, "MatMul"); + mCoreMLBackend->addLayer(mLayer_); + if (inputs.size() > 2) { + // Add Bias + auto biasLayer = mCoreMLBackend->create(); + core_ml__specification__neural_network_layer__init(biasLayer); + mCoreMLBackend->setLayerName(biasLayer, outputName + "Bias"); + mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_ADD_BROADCASTABLE; + mLayer_->addbroadcastable = mCoreMLBackend->create(); + core_ml__specification__add_broadcastable_layer_params__init(mLayer_->addbroadcastable); + setLayerInputsAndOutputs(biasLayer, {matmulOutput, mCoreMLBackend->getTensorName(inputs[2])}, {outputName}); + mCoreMLBackend->addLayer(biasLayer); + } + return NO_ERROR; +} + + +REGISTER_COREML_OP_CREATOR(CoreMLMatMul, OpType_BatchMatMul) +REGISTER_COREML_OP_CREATOR(CoreMLMatMul, OpType_MatMul) + +} // namespace MNN diff --git a/source/backend/coreml/execution/CoreMLMatMul.hpp b/source/backend/coreml/execution/CoreMLMatMul.hpp new file mode 100644 index 000000000..af32f94ea --- /dev/null +++ b/source/backend/coreml/execution/CoreMLMatMul.hpp @@ -0,0 +1,25 @@ +// +// CoreMLMatMul.hpp +// MNN +// +// Created by MNN on 2024/10/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MNN_COREMLMATMUL_HPP +#define MNN_COREMLMATMUL_HPP + +#include "CoreMLCommonExecution.hpp" +#include "CoreMLBackend.hpp" + +namespace MNN { + +class CoreMLMatMul : public CoreMLCommonExecution { +public: + CoreMLMatMul(Backend *b, const Op *op, const std::vector &inputs, const std::vector &outputs); + ErrorCode onResize(const std::vector &inputs, const std::vector &outputs); + virtual ~CoreMLMatMul() = default; +}; +} // namespace MNN + +#endif // MNN_COREMLMATMUL_HPP diff --git a/source/backend/coreml/execution/CoreMLRelu6.cpp b/source/backend/coreml/execution/CoreMLRelu6.cpp new file mode 100644 index 000000000..43a1623c0 --- /dev/null +++ b/source/backend/coreml/execution/CoreMLRelu6.cpp @@ -0,0 +1,36 @@ +// +// CoreMLRelu6.cpp +// MNN +// +// Created by MNN on 2021/03/31. 
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include "CoreMLRelu6.hpp" + +namespace MNN { + +CoreMLRelu6::CoreMLRelu6(MNN::Backend *b, const MNN::Op *op, const std::vector &inputs, const std::vector &outputs) : CoreMLCommonExecution(b, op) { + if (nullptr != op->main()) { + auto p = op->main_as_Relu6(); + mMinValue = p->minValue(); + mMaxValue = p->maxValue(); + } + initLayer(); +} + +ErrorCode CoreMLRelu6::onResize(const std::vector &inputs, const std::vector &outputs) { + MNN_ASSERT(inputs.size() == 1 && outputs.size() == 1); + mLayer_->layer_case = CORE_ML__SPECIFICATION__NEURAL_NETWORK_LAYER__LAYER_CLIP; + mLayer_->clip = mCoreMLBackend->create<_CoreML__Specification__ClipLayerParams>(); + core_ml__specification__clip_layer_params__init(mLayer_->clip); + mLayer_->clip->maxval = mMaxValue; + mLayer_->clip->minval = mMinValue; + + setLayerInputsAndOutputs(mLayer_, {mCoreMLBackend->getTensorName(inputs[0])}, {mCoreMLBackend->getTensorName(outputs[0])}); + mCoreMLBackend->addLayer(mLayer_); + return NO_ERROR; +} + +REGISTER_COREML_OP_CREATOR(CoreMLRelu6, OpType_ReLU6) +} // namespace MNN diff --git a/source/backend/coreml/execution/CoreMLRelu6.hpp b/source/backend/coreml/execution/CoreMLRelu6.hpp new file mode 100644 index 000000000..95ade8a75 --- /dev/null +++ b/source/backend/coreml/execution/CoreMLRelu6.hpp @@ -0,0 +1,28 @@ +// +// CoreMLRelu6.hpp +// MNN +// +// Created by MNN on 2024/10/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef MNN_COREMLRelu6_HPP +#define MNN_COREMLRelu6_HPP + +#include "CoreMLCommonExecution.hpp" +#include "CoreMLBackend.hpp" + +namespace MNN { + +class CoreMLRelu6 : public CoreMLCommonExecution { +public: + CoreMLRelu6(Backend *b, const Op *op, const std::vector &inputs, const std::vector &outputs); + ErrorCode onResize(const std::vector &inputs, const std::vector &outputs); + virtual ~CoreMLRelu6() = default; +private: + float mMinValue = 0.0f; + float mMaxValue = 6.0f; +}; +} // namespace MNN + +#endif // MNN_COREMLRelu6_HPP diff --git a/source/backend/coreml/execution/coreMLLayerNorm.hpp b/source/backend/coreml/execution/coreMLLayerNorm.hpp index 73f2a1e85..599e176ea 100644 --- a/source/backend/coreml/execution/coreMLLayerNorm.hpp +++ b/source/backend/coreml/execution/coreMLLayerNorm.hpp @@ -22,4 +22,4 @@ class CoreMLLayerNorm : public CoreMLCommonExecution { }; } // namespace MNN -#endif // MNN_COREMLLAYERNORM_HPP \ No newline at end of file +#endif // MNN_COREMLLAYERNORM_HPP diff --git a/source/backend/cpu/CPUCast.cpp b/source/backend/cpu/CPUCast.cpp index 1bc72dbb1..ad05b159a 100644 --- a/source/backend/cpu/CPUCast.cpp +++ b/source/backend/cpu/CPUCast.cpp @@ -33,12 +33,12 @@ ErrorCode CPUCastCreator::cast(const void* inputRaw, void* outputRaw, ConvertTyp } if (type == INT8_TO_FlOAT) { std::vector scales(pack, scale); - bn->int8Functions()->MNNInt8ScaleToFloat((float*)(outputRaw), (int8_t*)(inputRaw), scales.data(), c4Size, zero); + bn->int8Functions()->MNNInt8ScaleToFloat((float*)(outputRaw), (int8_t*)(inputRaw), &scale, c4Size, &zero, 0); if (remain > 0) { std::vector tempDst(pack); std::vector tempSrc(pack); ::memcpy(tempSrc.data(), (int8_t*)(inputRaw) + c4Size * pack, remain * sizeof(int8_t)); - bn->int8Functions()->MNNInt8ScaleToFloat(tempDst.data(), tempSrc.data(), scales.data(), 1, zero); + bn->int8Functions()->MNNInt8ScaleToFloat(tempDst.data(), tempSrc.data(), &scale, 1, &zero, 0); ::memcpy(static_cast(outputRaw) + c4Size * pack, tempDst.data(), remain * sizeof(float)); } return NO_ERROR; diff --git 
a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp index 6d6e2df96..168b4193b 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp @@ -175,6 +175,14 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect divides[0] = 0; static_cast(backend())->computeDivideSizes(total, divides.data()+1); mNumber = numberThread; + for (int i=1; i 0); auto postData = getPostParameters(); if (static_cast(backend())->functions()->bytes < 4) { static_cast(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2); @@ -196,6 +204,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect src_y_step = paddedWidth * unit; } mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) { + MNN_ASSERT(divides[tId] < divides[tId+1]); const auto inputPadPtr = mInputPad->host() + mInputPad->stride(0) * tId * bytes; ::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes); auto biasP = inputs[2]->host(); diff --git a/source/backend/cpu/CPUDeconvolution.cpp b/source/backend/cpu/CPUDeconvolution.cpp index 6a75b3c61..bdef005cb 100644 --- a/source/backend/cpu/CPUDeconvolution.cpp +++ b/source/backend/cpu/CPUDeconvolution.cpp @@ -260,6 +260,55 @@ ErrorCode CPUDeconvolution::onResize(const std::vector &inputs, const return NO_ERROR; } +CPUDeconvolutionOrigin::CPUDeconvolutionOrigin(const Tensor *input, Tensor *weight, const Op *convOp, Backend *b, bool ModeInt8) : CPUDeconvolutionBasic(input, convOp, b) { + if (ModeInt8) { + const auto weightDataPtr = weight->host(); + auto conv2d = convOp->main_as_Convolution2D(); + auto common = conv2d->common(); + auto pack = static_cast(b)->functions()->pack; + mResource = CPUConvolution::makeResourceInt8(backend(), convOp, pack); + CPUConvolution::MutableResourceInt8 mutableResource(mResource, b); + auto core = static_cast(b)->int8Functions(); + auto gemmKernel = core->Int8GemmKernel; + int UNIT, SRC_UNIT, DST_XUNIT; + core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); + const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY(); + const int ocDiv4 = UP_DIV(common->outputCount(), pack) * kEleCnt; + const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT); + const int ocDivUnit = UP_DIV(common->outputCount(), UNIT); + const int oc4 = ocDiv4 / kEleCnt; + const int bias_elesize = ocDiv4 * pack; + // set offset if use SSE. 
+ auto inputQuant = TensorUtils::getQuantInfo(input); + auto inputZeroPoint = inputQuant[1]; + std::vector _bias(bias_elesize, inputZeroPoint); +#ifdef MNN_USE_SSE + int actBits = conv2d->symmetricQuan()->nbits(); + if (actBits <= 7) { + gemmKernel = core->Int8GemmKernelFast; + } + for (int a = 0; a < kEleCnt; ++a){ + for (int oz = 0; oz < ocDivUnit * UNIT; ++oz) { + int offset = inputZeroPoint, oz4 = oz / UNIT, ozRemain = oz % UNIT; + for (int sz = 0; sz < icDiv4 * SRC_UNIT; ++sz) { + int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT; + int index = (((a * oc4 + oz4) * icDiv4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain; + auto weightInt8Data = weightDataPtr[index]; + offset += weightInt8Data * (-128); + } + if (oz < oc4 * pack) { + _bias[a * oc4 * pack + oz] = offset; + } + } + } +#else + if(conv2d->symmetricQuan() && conv2d->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){ + gemmKernel = core->Int8GemmKernelFast; + } +#endif + mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, convOp, gemmKernel, _bias)); + } +} ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, const std::vector& outputs) { CPUDeconvolutionBasic::onResize(inputs, outputs); @@ -340,9 +389,12 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c } auto threadNumber = ((CPUBackend*)backend())->threadNumber(); std::vector scales(core->pack * src_height * src_width * batch, scale); - auto outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes); - if (outputFp32Ptr.invalid()) { - return OUT_OF_MEMORY; + MemChunk outputFp32Ptr; + if (outi8) { + outputFp32Ptr = allocator->alloc(batch * src_height * src_width * ocC4 * core->pack * bytes); + if (outputFp32Ptr.invalid()) { + return OUT_OF_MEMORY; + } } mPostFunctions.emplace_back(std::make_pair([ocC4, width, height, kh, kw, padY, padX, dilateY, dilateX, strideY, @@ -397,15 +449,9 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c } } }, threadNumber)); - /* - if (TensorUtils::getDescribe(tempInput.get())->mem->chunk().offset() != TensorUtils::getDescribe(input)->mem->chunk().offset()) { - backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC); + if (outi8) { + allocator->free(outputFp32Ptr); } - if (tempInput->host() != inputPtr) { - backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC); - } - */ - allocator->free(outputFp32Ptr); if (needReleaseTempInput) { backend()->onReleaseBuffer(tempInput.get(), Backend::DYNAMIC); } @@ -416,7 +462,7 @@ ErrorCode CPUDeconvolutionOrigin::onResize(const std::vector& inputs, c ErrorCode CPUDeconvolutionOrigin::onExecute(const std::vector& inputs, const std::vector& outputs) { auto inputPtr = inputs[0]->host(); auto outputPtr = outputs[0]->host(); - if (CPUBackend::getDataType(inputs[0]) == DataType_DT_INT8 || inputs[0]->getType().bytes() == 1) { + if (mDeconvInt8Exe.get() != nullptr) { mDeconvInt8Exe->onExecute({inputs[0], inputs[1]}, {mTempOutput.get()}); } else { diff --git a/source/backend/cpu/CPUDeconvolution.hpp b/source/backend/cpu/CPUDeconvolution.hpp index ed932e0b4..82f7168d4 100644 --- a/source/backend/cpu/CPUDeconvolution.hpp +++ b/source/backend/cpu/CPUDeconvolution.hpp @@ -38,56 +38,7 @@ class CPUDeconvolutionCommon : public CPUDeconvolutionBasic { class CPUDeconvolutionOrigin : public CPUDeconvolutionBasic { public: - CPUDeconvolutionOrigin(const Tensor *input, Tensor *weight, const Op *convOp, Backend *b, bool ModeInt8) - : CPUDeconvolutionBasic(input, convOp, b){ - if (ModeInt8) { - const auto 
weightDataPtr = weight->host(); - auto conv2d = convOp->main_as_Convolution2D(); - auto common = conv2d->common(); - auto pack = static_cast(b)->functions()->pack; - mResource = CPUConvolution::makeResourceInt8(backend(), convOp, pack); - CPUConvolution::MutableResourceInt8 mutableResource(mResource, b); - auto core = static_cast(b)->int8Functions(); - auto gemmKernel = core->Int8GemmKernel; - int UNIT, SRC_UNIT, DST_XUNIT; - core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); - const auto kEleCnt = mCommon->kernelX() * mCommon->kernelY(); - const int ocDiv4 = UP_DIV(common->outputCount(), pack) * kEleCnt; - const int icDiv4 = UP_DIV(common->inputCount(), SRC_UNIT); - const int ocDivUnit = UP_DIV(common->outputCount(), UNIT); - const int oc4 = ocDiv4 / kEleCnt; - const int bias_elesize = ocDiv4 * pack; - // set offset if use SSE. - auto inputQuant = TensorUtils::getQuantInfo(input); - auto inputZeroPoint = inputQuant[1]; - std::vector _bias(bias_elesize, inputZeroPoint); -#ifdef MNN_USE_SSE - int actBits = conv2d->symmetricQuan()->nbits(); - if (actBits <= 7) { - gemmKernel = core->Int8GemmKernelFast; - } - for (int a = 0; a < kEleCnt; ++a){ - for (int oz = 0; oz < ocDivUnit * UNIT; ++oz) { - int offset = inputZeroPoint, oz4 = oz / UNIT, ozRemain = oz % UNIT; - for (int sz = 0; sz < icDiv4 * SRC_UNIT; ++sz) { - int sz4 = sz / SRC_UNIT, szRemain = sz % SRC_UNIT; - int index = (((a * oc4 + oz4) * icDiv4 + sz4) * UNIT + ozRemain) * SRC_UNIT + szRemain; - auto weightInt8Data = weightDataPtr[index]; - offset += weightInt8Data * (-128); - } - if (oz < oc4 * pack) { - _bias[a * oc4 * pack + oz] = offset; - } - } - } -#else - if(conv2d->symmetricQuan() && conv2d->symmetricQuan()->method() == QuantizeAlgo_OVERFLOW_AWARE){ - gemmKernel = core->Int8GemmKernelFast; - } -#endif - mDeconvInt8Exe.reset(new GemmInt8Executor(b, mResource, convOp, gemmKernel, _bias)); - } - } + CPUDeconvolutionOrigin(const Tensor *input, Tensor *weight, const Op *convOp, Backend *b, bool ModeInt8); virtual ~CPUDeconvolutionOrigin() = default; virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; diff --git a/source/backend/cpu/CPUFloatToInt8.cpp b/source/backend/cpu/CPUFloatToInt8.cpp index 7770377c6..306f1053d 100644 --- a/source/backend/cpu/CPUFloatToInt8.cpp +++ b/source/backend/cpu/CPUFloatToInt8.cpp @@ -35,8 +35,11 @@ CPUFloatToInt8::CPUFloatToInt8(Backend* backend, const MNN::Op* param) : Executi memset(mScales->host(), 0, UP_DIV(scaleLen, pack) * pack * sizeof(float)); memcpy(mScales->host(), scale->tensorScale()->data(), scaleLen * sizeof(float)); } - - mZeroPoint = static_cast(scale->zeroPoint()); + if (scale->floatzeros()) { + mZeroPoint = scale->floatzeros()->data()[0]; + } else { + mZeroPoint = static_cast(scale->zeroPoint()); + } mClampMin = scale->clampMin(); mClampMax = scale->clampMax(); } diff --git a/source/backend/cpu/CPUInt8ToFloat.cpp b/source/backend/cpu/CPUInt8ToFloat.cpp index a2d13f958..4dc9720e8 100644 --- a/source/backend/cpu/CPUInt8ToFloat.cpp +++ b/source/backend/cpu/CPUInt8ToFloat.cpp @@ -21,7 +21,8 @@ CPUInt8ToFloat::CPUInt8ToFloat(Backend* backend, const MNN::Op* param) : Executi const int scaleLen = scale->tensorScale()->size(); auto pack = static_cast(backend)->functions()->pack; mScales.reset(Tensor::createDevice({UP_DIV(scaleLen, pack) * pack})); - mValid = backend->onAcquireBuffer(mScales.get(), Backend::STATIC); + 
mZeroPoint.reset(Tensor::createDevice({UP_DIV(scaleLen, pack) * pack})); + mValid = backend->onAcquireBuffer(mScales.get(), Backend::STATIC) && backend->onAcquireBuffer(mZeroPoint.get(), Backend::STATIC); if (!mValid) { return; } @@ -29,12 +30,24 @@ CPUInt8ToFloat::CPUInt8ToFloat(Backend* backend, const MNN::Op* param) : Executi mSingle = true; for (int i = 0; i < pack; ++i) { mScales->host()[i] = scale->tensorScale()->data()[0]; + if (scale->floatzeros()) { + mZeroPoint->host()[i] = scale->floatzeros()->data()[0]; + } } } else { memset(mScales->host(), 0, UP_DIV(scaleLen, pack) * pack * sizeof(float)); memcpy(mScales->host(), scale->tensorScale()->data(), scaleLen * sizeof(float)); + memset(mZeroPoint->host(), 0, UP_DIV(scaleLen, pack) * pack * sizeof(float)); + if (scale->floatzeros()) { + memcpy(mZeroPoint->host(), scale->floatzeros()->data(), scale->floatzeros()->size() * sizeof(float)); + } + } + if (!scale->floatzeros()) { + for (int i = 0;i < ROUND_UP(scaleLen, pack); ++i) { + mZeroPoint->host()[i] = static_cast(scale->zeroPoint()); + } } - mZeroPoint = scale->zeroPoint(); + } CPUInt8ToFloat::~CPUInt8ToFloat() { backend()->onReleaseBuffer(mScales.get(), Backend::STATIC); @@ -48,6 +61,7 @@ ErrorCode CPUInt8ToFloat::onExecute(const std::vector& inputs, const st const auto inputDataPtr = input->host(); auto outputDataPtr = output->host(); const auto scaleDataPtr = mScales->host(); + const auto zeroDataPtr = mZeroPoint->host(); const int channels = input->channel(); int icDiv4 = UP_DIV(channels, pack); const int batch = input->batch(); @@ -67,8 +81,9 @@ ErrorCode CPUInt8ToFloat::onExecute(const std::vector& inputs, const st int z = tId % icDiv4; const auto srcChannelPtr = inputDataPtr + tId * oc4Stride * pack; const auto scaleChannelPtr = scaleDataPtr + z * pack; + const auto zeroChannelPtr = zeroDataPtr + z * pack; auto dstChannlePtr = outputDataPtr + tId * oc4Stride * pack; - int8F->MNNInt8ScaleToFloat(dstChannlePtr, srcChannelPtr, scaleChannelPtr, oc4Stride, mZeroPoint); + int8F->MNNInt8ScaleToFloat(dstChannlePtr, srcChannelPtr, scaleChannelPtr, oc4Stride, zeroChannelPtr, 3); } MNN_CONCURRENCY_END(); diff --git a/source/backend/cpu/CPUInt8ToFloat.hpp b/source/backend/cpu/CPUInt8ToFloat.hpp index e2d31f4e1..c082a3e66 100644 --- a/source/backend/cpu/CPUInt8ToFloat.hpp +++ b/source/backend/cpu/CPUInt8ToFloat.hpp @@ -24,7 +24,7 @@ class CPUInt8ToFloat : public Execution { std::shared_ptr mScales; bool mSingle = false; - int8_t mZeroPoint; + std::shared_ptr mZeroPoint; }; } // namespace MNN diff --git a/source/backend/cpu/CPURNNSequenceGRU.cpp b/source/backend/cpu/CPURNNSequenceGRU.cpp index 815e0bcb8..8486e2908 100644 --- a/source/backend/cpu/CPURNNSequenceGRU.cpp +++ b/source/backend/cpu/CPURNNSequenceGRU.cpp @@ -11,77 +11,74 @@ #include "backend/cpu/CPUBackend.hpp" #include "backend/cpu/compute/ConvOpt.h" #include "backend/cpu/compute/CommonOptFunction.h" -#include "math/Matrix.hpp" #include "core/TensorUtils.hpp" namespace MNN { -static inline void ArrayProduct(float* C, float* A, float* B, const int length) { - MNNMatrixProdCommon(C, A, B, length, 0, 0, 0, 1); - return; -} - // implement GRU cell function // Ref: tensorflow/python/ops/rnn_cell_impl.py -void CPURNNSequenceGRU::runRNNStep(const float* input, const int inputLength, const bool linearBeforeReset, +void CPURNNSequenceGRU::runRNNStep(const uint8_t* input, const int inputLength, const bool linearBeforeReset, std::shared_ptr& hiddenState, const int numUnits, Tensor* gateWeight, Tensor* gateBias, Tensor* candidateWeight, 
Tensor* candidateBias, Tensor* recurrentBias, std::shared_ptr& inputAndState, std::shared_ptr& gate, std::shared_ptr& resetHt) { auto bn = static_cast(backend()); + auto mulFunction = bn->functions()->MNNSelectBinaryFunctionForFloat(BinaryOpOperation_MUL); + auto addFunction = bn->functions()->MNNSelectBinaryFunctionForFloat(BinaryOpOperation_ADD); + auto subFunction = bn->functions()->MNNSelectBinaryFunctionForFloat(BinaryOpOperation_SUB); + auto tanhFunction = bn->functions()->MNNSelectUnaryFunctionForFloat(UnaryOpOperation_TANH, bn->precisionMode()); + auto bytes = bn->functions()->bytes; + auto sigmoidFunc = bn->functions()->MNNSelectUnaryFunctionForFloat(UnaryOpOperation_SIGMOID, bn->precisionMode()); // gate is (z_t, r_t) - auto inputAndStatePtr = inputAndState->host(); - auto hiddenStatePtr = hiddenState->host(); - ::memcpy(inputAndStatePtr, input, inputLength * sizeof(float)); - ::memcpy(inputAndStatePtr + inputLength, hiddenStatePtr, numUnits * sizeof(float)); + auto inputAndStatePtr = inputAndState->host(); + auto hiddenStatePtr = hiddenState->host(); + ::memcpy(inputAndStatePtr, input, inputLength * bytes); + ::memcpy(inputAndStatePtr + inputLength * bytes, hiddenStatePtr, numUnits * bytes); inputAndState->setLength(1, inputLength + numUnits); // // [x_t, h_t-1] * [W_zr, R_zr]: (1, inputLength + numUnits) X (inputLength + numUnits, 2 * numUnits) mMatMulIU2U->execute(inputAndState->host(), gateWeight->host(), gate->host(), gateBias->host()); recurrentBias->setLength(1, 2 * numUnits); - Math::Matrix::add(gate.get(), gate.get(), recurrentBias); + addFunction(gate->host(), gate->host(), recurrentBias->host(), 2*numUnits, -1); // (1, 2*numUnits) const int gateSize = gate->elementSize(); - auto gatePtr = gate->host(); - auto core = bn->functions(); - auto sigmoidFunc = core->MNNSelectUnaryFunctionForFloat(UnaryOpOperation_SIGMOID, bn->precisionMode()); + auto gatePtr = gate->host(); sigmoidFunc(gatePtr, gatePtr, gateSize); // reset gate, // r_t is the second segment - auto rtPtr = gatePtr + numUnits; + auto rtPtr = gatePtr + numUnits * bytes; if (linearBeforeReset) { // calculate Rt (.) 
(Ht_1 * Rh + Rbh) - auto recurrentHiddenBiasPtr = recurrentBias->host() + 2 * numUnits; - auto rhWeightPtr = candidateWeight->host() + inputLength * numUnits; - mMatMulU2U->execute(hiddenState->host(), rhWeightPtr, resetHt->host(), recurrentHiddenBiasPtr); - ArrayProduct(resetHt->host(), rtPtr, resetHt->host(), numUnits); + auto recurrentHiddenBiasPtr = recurrentBias->host() + 2 * numUnits * bytes; + auto rhWeightPtr = candidateWeight->host() + inputLength * numUnits * bytes; + mMatMulU2U->execute(hiddenState->host(), (float*)rhWeightPtr, resetHt->host(), (float*)recurrentHiddenBiasPtr); + mulFunction(resetHt->host(), rtPtr, resetHt->host(), numUnits, -1); // calculate Xt * Wh - mMatMulI2U->execute(input, candidateWeight->host(), inputAndStatePtr + inputLength + numUnits, nullptr); + mMatMulI2U->execute((float*)input, candidateWeight->host(), (float*)(inputAndStatePtr + (inputLength + numUnits) * bytes), nullptr); // sum 3 parts - Math::Matrix::add(resetHt->host(), resetHt->host(), inputAndStatePtr + inputLength + numUnits, numUnits); - Math::Matrix::add(rtPtr, resetHt->host(), candidateBias->host(), numUnits); + addFunction(resetHt->host(), resetHt->host(), inputAndStatePtr + (inputLength + numUnits) * bytes, numUnits, -1); + addFunction(rtPtr, resetHt->host(), candidateBias->host(), numUnits, -1); } else { // r_t: (1, numUnits) - auto resetGatePtr = inputAndStatePtr + inputLength; + auto resetGatePtr = inputAndStatePtr + inputLength * bytes; // h_t1(1, numUnits) = r_t(1, numUnits) * h_t-1_(1, numUnits) - ArrayProduct(resetGatePtr, rtPtr, hiddenStatePtr, numUnits); + mulFunction(resetGatePtr, rtPtr, hiddenStatePtr, numUnits, -1); // deal with recurrent bias and linear_before_reset parameter - auto recurrentBiasAddedPtr = inputAndStatePtr + inputLength + numUnits; - auto recurrentHiddenBiasPtr = recurrentBias->host() + 2 * numUnits; - Math::Matrix::add(recurrentBiasAddedPtr, recurrentHiddenBiasPtr, candidateBias->host(), numUnits); + auto recurrentBiasAddedPtr = inputAndStatePtr + (inputLength + numUnits) * bytes; + auto recurrentHiddenBiasPtr = recurrentBias->host() + 2 * numUnits * bytes; + addFunction(recurrentBiasAddedPtr, recurrentHiddenBiasPtr, candidateBias->host(), numUnits, -1); mMatMulI2U->execute(inputAndState->host(), candidateWeight->host(), resetHt->host(), nullptr); // reuse r_t memory as h_t' - Math::Matrix::add(rtPtr, resetHt->host(), recurrentBiasAddedPtr, numUnits); + addFunction(rtPtr, resetHt->host(), recurrentBiasAddedPtr, numUnits, -1); } - - for (int i = 0; i < numUnits; ++i) { - hiddenStatePtr[i] = - (1 - gatePtr[i]) * tanhf(rtPtr[i]) + gatePtr[i] * hiddenStatePtr[i]; - } - + // h = (1-g)*t+g*h = t + g*(h-t) + tanhFunction(resetHt->host(), rtPtr, numUnits); + subFunction(hiddenStatePtr, hiddenStatePtr, resetHt->host(), numUnits, -1); + mulFunction(hiddenStatePtr, hiddenStatePtr, gatePtr, numUnits, -1); + addFunction(hiddenStatePtr, hiddenStatePtr, resetHt->host(), numUnits, -1); inputAndState->setLength(1, inputLength + 2 * numUnits); } @@ -162,6 +159,7 @@ ErrorCode CPURNNSequenceGRU::onExecute(const std::vector& inputs, const auto fwCandidateBias = inputs[4]; auto fwRecurrentBias = inputs[5]; auto cpuBn = static_cast(backend()); + auto bytes = cpuBn->functions()->bytes; // fwGateWeight->printShape();// mFwGateWeight // fwGateBias->printShape();// mFwGateBias @@ -170,15 +168,15 @@ ErrorCode CPURNNSequenceGRU::onExecute(const std::vector& inputs, const // fwRecurrentBias->printShape();// mFwRecurrentBias // firstly set the hidden state to zero - float* const 
hiddenStatePtr = mHiddenState->host(); - const int hiddenStateDataSize = mHiddenState->size(); + auto const hiddenStatePtr = mHiddenState->host(); + const int hiddenStateDataSize = mHiddenState->elementSize() * bytes; auto input = inputs[0]; // shape :(seq_length, batch_size, input_size) auto output = outputs[0]; // shape :(seq_length, num_directions, batch_size, hidden_size) - float* const inputPtr = input->host(); - float* const outputPtr = output->host(); + auto const inputPtr = input->host(); + auto const outputPtr = output->host(); - float* outputYhPtr = mKeepAllOutputs && outputSize > 1 ? outputs[1]->host() : outputs[0]->host(); + auto outputYhPtr = mKeepAllOutputs && outputSize > 1 ? outputs[1]->host() : outputs[0]->host(); const int batchSize = input->length(1); const int SequenceStride = input->stride(0); const int inputSequenceLength = input->length(0); @@ -194,24 +192,24 @@ ErrorCode CPURNNSequenceGRU::onExecute(const std::vector& inputs, const for (int i = 0; i < inputSequenceLength; ++i) { const int inputOffset = i * SequenceStride + b * inputCodeLength; - runRNNStep(inputPtr + inputOffset, inputCodeLength, mlinearBeforeReset, mHiddenState, mNumUnits, fwGateWeight, fwGateBias, + runRNNStep(inputPtr + inputOffset * bytes, inputCodeLength, mlinearBeforeReset, mHiddenState, mNumUnits, fwGateWeight, fwGateBias, fwCandidateWeight, fwCandidateBias, fwRecurrentBias, mInputAndState, mGate, mResetHt); if (mKeepAllOutputs) { - ::memcpy(outputPtr + i * output->stride(0) + b * mNumUnits, hiddenStatePtr, hiddenStateDataSize); + ::memcpy(outputPtr + (i * output->stride(0) + b * mNumUnits) * bytes, hiddenStatePtr, hiddenStateDataSize); } } if ((mKeepAllOutputs && outputSize > 1) || !mKeepAllOutputs) { ::memcpy(outputYhPtr, hiddenStatePtr, hiddenStateDataSize); - outputYhPtr += mNumUnits; + outputYhPtr += mNumUnits * bytes; } } // backward rnn if (mIsBidirectionalRNN) { - float* outputYhPtr = mKeepAllOutputs && outputSize > 1 ? outputs[1]->host() : outputs[0]->host(); - outputYhPtr += batchSize * mNumUnits; + auto outputYhPtr = mKeepAllOutputs && outputSize > 1 ? 
outputs[1]->host() : outputs[0]->host(); + outputYhPtr += batchSize * mNumUnits * bytes; // todo: modify the inputOffset MNN_ASSERT(11 <= inputs.size()); auto bwGateWeight = inputs[6]; @@ -221,7 +219,7 @@ ErrorCode CPURNNSequenceGRU::onExecute(const std::vector& inputs, const auto bwRecurrentBias = inputs[10]; auto outputBw = outputs[0]; - float* const outputBwPtr = outputBw->host(); + auto const outputBwPtr = outputBw->host(); for (int b = 0; b < batchSize; ++b) { if (inputSize > 1 + forwardParamNumber * 2) { @@ -233,16 +231,16 @@ ErrorCode CPURNNSequenceGRU::onExecute(const std::vector& inputs, const for (int i = inputSequenceLength - 1; i >= 0; i--) { const int inputOffset = i * SequenceStride + b * inputCodeLength; - runRNNStep(inputPtr + inputOffset, inputCodeLength, mlinearBeforeReset, mHiddenState, mNumUnits, bwGateWeight, bwGateBias, + runRNNStep(inputPtr + inputOffset * bytes, inputCodeLength, mlinearBeforeReset, mHiddenState, mNumUnits, bwGateWeight, bwGateBias, bwCandidateWeight, bwCandidateBias, bwRecurrentBias, mInputAndState, mGate, mResetHt); if (mKeepAllOutputs) { - ::memcpy(outputBwPtr + i * outputBw->stride(0) + (batchSize + b) * mNumUnits, + ::memcpy(outputBwPtr + (i * outputBw->stride(0) + (batchSize + b) * mNumUnits) * bytes, hiddenStatePtr, hiddenStateDataSize); } } if ((mKeepAllOutputs && outputSize > 1) || !mKeepAllOutputs) { ::memcpy(outputYhPtr, hiddenStatePtr, hiddenStateDataSize); - outputYhPtr += mNumUnits; + outputYhPtr += mNumUnits * bytes; } } } diff --git a/source/backend/cpu/CPURNNSequenceGRU.hpp b/source/backend/cpu/CPURNNSequenceGRU.hpp index c2b56816c..0987d1305 100644 --- a/source/backend/cpu/CPURNNSequenceGRU.hpp +++ b/source/backend/cpu/CPURNNSequenceGRU.hpp @@ -21,7 +21,7 @@ class CPURNNSequenceGRU : public Execution { virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; private: - void runRNNStep(const float* input, const int inputLength, const bool linearBeforeReset, + void runRNNStep(const uint8_t* input, const int inputLength, const bool linearBeforeReset, std::shared_ptr& hiddenState, const int numUnits, Tensor* gateWeight, Tensor* gateBias, Tensor* candidateWeight, Tensor* candidateBias, Tensor* recurrentBias, std::shared_ptr& inputAndState, std::shared_ptr& gate, diff --git a/source/backend/cpu/CPUUnique.cpp b/source/backend/cpu/CPUUnique.cpp index d1c3d52e6..defd22a10 100644 --- a/source/backend/cpu/CPUUnique.cpp +++ b/source/backend/cpu/CPUUnique.cpp @@ -12,30 +12,50 @@ namespace MNN { ErrorCode CPUUnique::onExecute(const std::vector &inputs, const std::vector &outputs) { auto input = inputs[0]; - if (input->getType().code != halide_type_int) { - return NOT_SUPPORT; - } auto output = outputs[0]; auto outputPtr = output->host(); int outputSize = 0; - std::unordered_map idx_map; auto eleSize = input->elementSize(); - for (int i = 0; i < eleSize; ++i) { - auto value = input->host()[i]; - if (idx_map.find(value) == idx_map.end()) { - outputPtr[outputSize] = value; - idx_map[value] = outputSize++; + if (outputs.size() <= 2) { + std::unordered_map idx_map; + for (int i = 0; i < eleSize; ++i) { + auto value = input->host()[i]; + if (idx_map.find(value) == idx_map.end()) { + outputPtr[outputSize] = value; + idx_map[value] = outputSize++; + } } - } - outputSize = 0; - if (outputs.size() > 1) { + outputSize = 0; + if (outputs.size() > 1) { + auto outIdx = outputs[1]->host(); + for (int i = 0; i < eleSize; ++i) { + auto value = input->host()[i]; + if (idx_map.find(value) == idx_map.end()) { + outIdx[outputSize] = 
idx_map[value]; + outputSize++; + } + } + } + } else { + MNN_ASSERT(4 == outputs.size()); auto outIdx = outputs[1]->host(); + auto reverseIdx = outputs[2]->host(); + auto count = outputs[3]->host(); + ::memset(count, 0, outputs[3]->usize()); + std::unordered_map idx_map; for (int i = 0; i < eleSize; ++i) { auto value = input->host()[i]; - if (idx_map.find(value) == idx_map.end()) { - outIdx[outputSize] = idx_map[value]; - outputSize++; + auto iter = idx_map.find(value); + int pos; + if (iter == idx_map.end()) { + outputPtr[outputSize] = value; + outIdx[outputSize] = i; + pos = outputSize; + idx_map[value] = outputSize++; + } else { + pos = iter->second; } + reverseIdx[i] = pos; } } return NO_ERROR; diff --git a/source/backend/cpu/OneDNNConvInt8.cpp b/source/backend/cpu/OneDNNConvInt8.cpp index db8a5ec43..e01bd197f 100644 --- a/source/backend/cpu/OneDNNConvInt8.cpp +++ b/source/backend/cpu/OneDNNConvInt8.cpp @@ -68,7 +68,7 @@ Execution* OneDNNConvInt8::create(Backend* backend, const MNN::Convolution2D* co } std::shared_ptr quanCommon; if (convParam->quanParameter() != nullptr) { - quanCommon = ConvolutionCommon::load(convParam, backend(), false); + quanCommon = ConvolutionCommon::load(convParam, backend, false); weightSrc = quanCommon->weight.get(); } auto user_weights = memory(user_weights_md, eng, (int8_t*)weightSrc); diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S index 9c37ae75d..9ac9a4e28 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -172,9 +172,6 @@ L2LoopDz: vpadd.s32 d18, d24, d26 vpadd.s32 d19, d28, d30 - // vaddq.s32 q0, q8, q4 // add bias - // vaddq.s32 q1, q9, q4 - vcvt.f32.s32 q0, q8 vcvt.f32.s32 q1, q9 @@ -295,7 +292,6 @@ L1LoopDz: vmlal.s8 q0, d5, d13 vmlal.s8 q1, d5, d15 vpaddl.s16 q10, q0 - add r1, r1, #16 vpaddl.s16 q11, q1 beq L1LoopSzEnd @@ -316,7 +312,7 @@ L1LoopDz: vmull.s8 q1, d4, d14 vmlal.s8 q0, d5, d13 vmlal.s8 q1, d5, d15 - add r1, r1, #16 + vpadal.s16 q10, q0 vpadal.s16 q11, q1 diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S index 8d9d0ef63..1299c213c 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S @@ -147,9 +147,6 @@ L2LoopDz: vpadd.s32 d18, d24, d25 vpadd.s32 d19, d26, d27 - //vaddq.s32 q0, q8, q14 // add bias - //vaddq.s32 q1, q9, q14 - vcvt.f32.s32 q0, q8 vcvt.f32.s32 q1, q9 vmulq.f32 q0, q0, q15 // mul scale @@ -210,7 +207,6 @@ L1LoopDz: vmull.s8 q8, d0, d4 vld1.8 {q4,q5}, [r2]! 
vmull.s8 q9, d0, d6 - add r1, r1, #16 vmull.s8 q10, d0, d8 subs r12, r3, #1 vmull.s8 q11, d0, d10 @@ -230,7 +226,7 @@ L1LoopDz: vmlal.s8 q8, d0, d4 vmlal.s8 q9, d0, d6 - add r1, r1, #16 + vmlal.s8 q10, d0, d8 vmlal.s8 q11, d0, d10 @@ -262,8 +258,6 @@ L1LoopDz: vpadd.s32 d16, d20, d21 vpadd.s32 d17, d22, d23 - //vaddq.s32 q0, q8, q14 - vcvt.f32.s32 q0, q8 vmulq.f32 q0, q0, q15 diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index f3cdc98f9..def8a8336 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -280,7 +280,7 @@ L1LoopDz: vmlal.s8 q0, d5, d13 vmlal.s8 q1, d5, d15 vpaddl.s16 q10, q0 - add r1, r1, #16 + vpaddl.s16 q11, q1 beq L1LoopSzEnd @@ -307,7 +307,7 @@ L1LoopDz: vmull.s8 q1, d4, d14 vmlal.s8 q0, d5, d13 vmlal.s8 q1, d5, d15 - add r1, r1, #16 + vpadal.s16 q10, q0 vpadal.s16 q11, q1 diff --git a/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S b/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S index c17f748c0..02522c694 100644 --- a/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S +++ b/source/backend/cpu/arm/arm32/MNNInt8ScaleToFloat.S @@ -16,20 +16,41 @@ asm_function MNNInt8ScaleToFloat -// void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) - +// void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, const float* zeroPoint, ssize_t quanParamVec) +// Auto Load: r0: dst*, r1: src*, r2: scale*, r3: size, +// Load from sp: r4: zeroPoint, r5: quanParamVec push {lr} + +vld1.32 {d30[0]}, [r2] // scale +vdup.32 q15, d30[0] + ldr r12, [sp, #4] -vdup.s32 q13, r12 -vcvt.f32.s32 q13, q13 +vld1.32 {d26[0]},[r12] // zero +vdup.32 q13, d26[0] -vpush {q4-q7} +ldr lr, [sp, #8] // quanParamVec +cmp lr, #0 +beq COMPUTE -// Auto Load: -// r0: dst*, r1: src*, r2: scale*, r3: size, r4: zeroPoint +cmp lr, #3 +bne LOAD_VEC_ZERO +vld1.32 {q15}, [r2] +vld1.32 {q13}, [r12] +b COMPUTE +LOAD_VEC_ZERO: +cmp lr, #2 +bne LOAD_VEC_SCALE +vld1.32 {q13}, [r12] +b COMPUTE + +LOAD_VEC_SCALE: vld1.32 {q15}, [r2] +COMPUTE: +vpush {q4-q7} + + L4: cmp r3, #4 blt L1 diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S index b2cf3b215..ded4c3eae 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -120,7 +120,7 @@ ldr x21, [x15, #64] // blockNum ldr x23, [x15, #80] // extraScale lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t) add x20, x19, #4 - +lsl x24, x8, #4 // eDest * SRC_UNIT Start: cmp x8, #3 beq L3Dz @@ -367,8 +367,7 @@ L3LoopDz: mov x8, x1 mov x22, x2 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 - ld1 {v4.16b, v5.16b, v6.16b}, [x1], #48 - add x1, x1, #16 + ld1 {v4.16b, v5.16b, v6.16b}, [x1], x24 smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -418,7 +417,7 @@ L3LoopDz: beq L3ComputeSum L3LoopSz: - ld1 {v4.16b, v5.16b, v6.16b}, [x1], #48 + ld1 {v4.16b, v5.16b, v6.16b}, [x1], x24 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 smull v8.8h, v0.8b, v4.8b @@ -454,7 +453,6 @@ L3LoopDz: smull v11.8h, v3.8b, v6.8b subs x9, x9, #1 - add x1, x1, #16 smlal2 v8.8h, v0.16b, v6.16b smlal2 v9.8h, v1.16b, v6.16b @@ -571,7 +569,7 @@ L2LoopDz: mov x8, x1 mov x22, x2 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 - 
ld1 {v4.16b, v5.16b}, [x1], #32 + ld1 {v4.16b, v5.16b}, [x1], x24 smull v8.8h, v0.8b, v4.8b @@ -582,7 +580,7 @@ L2LoopDz: smull v13.8h, v1.8b, v5.8b smull v14.8h, v2.8b, v5.8b smull v15.8h, v3.8b, v5.8b - add x1, x1, #32 + smlal2 v8.8h, v0.16b, v4.16b smlal2 v9.8h, v1.16b, v4.16b smlal2 v10.8h, v2.16b, v4.16b @@ -606,7 +604,7 @@ L2LoopDz: beq L2ComputeSum L2LoopSz: - ld1 {v4.16b, v5.16b}, [x1], #32 + ld1 {v4.16b, v5.16b}, [x1], x24 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 smull v8.8h, v0.8b, v4.8b @@ -622,7 +620,7 @@ L2LoopDz: smlal2 v9.8h, v1.16b, v4.16b smlal2 v10.8h, v2.16b, v4.16b smlal2 v11.8h, v3.16b, v4.16b - add x1, x1, #32 + subs x9, x9, #1 smlal2 v12.8h, v0.16b, v5.16b smlal2 v13.8h, v1.16b, v5.16b @@ -727,8 +725,7 @@ L1LoopDz: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 dup v16.4s, wzr dup v17.4s, wzr - ld1 {v4.16b}, [x1], #16 - add x1, x1, #48 + ld1 {v4.16b}, [x1], x24 smull v8.8h, v0.8b, v4.8b dup v18.4s, wzr @@ -745,7 +742,7 @@ L1LoopDz: L1LoopSz: sadalp v16.4s, v8.8h - ld1 {v4.16b}, [x1], #16 + ld1 {v4.16b}, [x1], x24 sadalp v17.4s, v9.8h sadalp v18.4s, v10.8h sadalp v19.4s, v11.8h @@ -755,7 +752,6 @@ L1LoopDz: sadalp v23.4s, v15.8h ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 - add x1, x1, #48 smull v8.8h, v0.8b, v4.8b smull v9.8h, v1.8b, v4.8b @@ -776,7 +772,6 @@ L1LoopDz: sadalp v18.4s, v10.8h sadalp v19.4s, v11.8h - //ld1 {v0.4s}, [x10], #16 addp v4.4s, v16.4s, v17.4s addp v5.4s, v18.4s, v19.4s diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S index 16b2837b7..eafc65837 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S @@ -228,11 +228,6 @@ L4LoopDz: addp v14.4s, v20.4s, v21.4s addp v15.4s, v22.4s, v23.4s - //add v16.4s, v12.4s, v0.4s - //add v17.4s, v13.4s, v0.4s - //add v18.4s, v14.4s, v0.4s - //add v19.4s, v15.4s, v0.4s - L4Quan: ld1 {v1.4s}, [x7], #16 // scale ld1 {v2.4s}, [x19] // x kernel sum @@ -329,7 +324,7 @@ L3LoopDz: smull v23.8h, v3.8b, v5.8b smull v24.8h, v0.8b, v6.8b smull v25.8h, v1.8b, v6.8b - add x1, x1, #16 + // add x1, x1, #16 smull v26.8h, v2.8b, v6.8b smull v27.8h, v3.8b, v6.8b subs x9, x9, #1 @@ -357,7 +352,7 @@ L3LoopDz: ld1 {v2.16b}, [x2], #16 smlal v16.8h, v0.8b, v4.8b - add x1, x1, #16 + smlal v17.8h, v1.8b, v4.8b ld1 {v3.16b}, [x2], #16 smlal v18.8h, v2.8b, v4.8b @@ -490,7 +485,7 @@ L2LoopDz: smull v21.8h, v1.8b, v5.8b smull v22.8h, v2.8b, v5.8b smull v23.8h, v3.8b, v5.8b - add x1, x1, #32 + subs x9, x9, #1 beq L2LoopSzEnd @@ -511,7 +506,6 @@ L2LoopDz: ld1 {v2.16b}, [x2], #16 smlal v16.8h, v0.8b, v4.8b - add x1, x1, #32 smlal v17.8h, v1.8b, v4.8b ld1 {v3.16b}, [x2], #16 smlal v18.8h, v2.8b, v4.8b @@ -611,7 +605,7 @@ L1LoopDz: smull v17.8h, v1.8b, v4.8b smull v18.8h, v2.8b, v4.8b smull v19.8h, v3.8b, v4.8b - add x1, x1, #48 + subs x9, x3, #1 beq L1LoopSzEnd @@ -627,7 +621,6 @@ L1LoopDz: ld1 {v2.16b}, [x2], #16 smlal v16.8h, v0.8b, v4.8b - add x1, x1, #48 smlal v17.8h, v1.8b, v4.8b ld1 {v3.16b}, [x2], #16 smlal v18.8h, v2.8b, v4.8b diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S index c5203dde4..50fe4d54f 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S @@ -138,7 +138,7 @@ cbnz w28, Start mov x21, #16 // sizeof(float) * 
pack ldr x23, [x6, #56] // fp32minmax Start: -mov x22, #48 // src_steps +lsl x22, x7, #2 // eDest * SRC_UNIT TILE_12: cmp x7, #12 @@ -483,7 +483,6 @@ TILE_8: cmp x5, #2 blt L4LoopDz_TILE_8 L8LoopDz_TILE_8: - //ld1 {v0.4s, v1.4s}, [x20], #32 // bias mov x11, x1 mov x13, x3 mov x27, x12 @@ -640,7 +639,6 @@ L8LoopDz_TILE_8: cbz x14, Tile8End L4LoopDz_TILE_8: - //ld1 {v0.4s}, [x20], #16 // bias mov x11, x1 mov x13, x3 @@ -868,7 +866,6 @@ L8LoopDz_TILE_4: cbz x14, Tile4End L4LoopDz_TILE_4: - //ld1 {v0.4s}, [x20], #16 // bias mov x11, x1 mov x13, x3 SET_BIAS v8, v9, v10, v11 @@ -962,7 +959,6 @@ TILE_1: cmp x5, #2 blt L4LoopDz_TILE_1 L8LoopDz_TILE_1: - //ld1 {v0.4s, v1.4s}, [x20], #32 // bias mov x11, x1 mov x13, x3 mov x27, x12 @@ -1056,7 +1052,6 @@ L8LoopDz_TILE_1: cbz x14, Tile1End L4LoopDz_TILE_1: - //ld1 {v0.4s}, [x20], #16 // bias mov x11, x1 mov x13, x3 movi v8.16b, #0 diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S index 621f7a84b..1765a00fe 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S @@ -148,7 +148,7 @@ mov x21, #16 // sizeof(float) * pack ldr x14, [x6, #56] // float32 maxmin ptr Start: -mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 +lsl x22, x7, #3 // eDest * GEMM_INT8_SRC_UNIT TILE_10: cmp x7, #10 diff --git a/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S b/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S index 15f778528..684564b17 100644 --- a/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S +++ b/source/backend/cpu/arm/arm64/MNNInt8ScaleToFloat.S @@ -16,22 +16,35 @@ asm_function MNNInt8ScaleToFloat // void MNNInt8ScaleToFloat(float* dst, -// const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) +// const int8_t* src, const float* scale, size_t size, const float* zeroPoint, ssize_t quanParamVec) // Auto Load: -// x0: dst*, x1: src*, x2: scale*, x3: size, x4: zeroPoint +// x0: dst*, x1: src*, x2: scale*, x3: size, x4: zeroPoint, x5: quanParamVec -// copy zero point -mov v28.s[0], w4 -mov v28.s[1], w4 -mov v28.s[2], w4 -mov v28.s[3], w4 -scvtf v28.4s, v28.4s +ld1r {v28.4s}, [x4] // zero +ld1r {v16.4s}, [x2] // scale +cbz x5, COMPUTE +cmp x5, #3 +bne LOAD_VEC_ZERO +ld1 {v28.4s}, [x4] +ld1 {v16.4s}, [x2] +b COMPUTE + +LOAD_VEC_ZERO: +cmp x5, #2 +bne LOAD_VEC_SCALE +ld1 {v28.4s}, [x4] +b COMPUTE + +LOAD_VEC_SCALE: +ld1 {v16.4s}, [x2] + +COMPUTE: cmp x3, #0 beq End -ld1 {v16.4s}, [x2] + L4: cmp x3, #4 diff --git a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM82.S b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM82.S index db81f8a03..c62416972 100644 --- a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM82.S +++ b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM82.S @@ -34,8 +34,6 @@ ldr x8, [x4, #32] // blockNum ldr x5, [x4, #40] // oneScale ldr x4, [x4, #0] // kernelCountUnitDouble -//ldr x8, [sp, #0] // blockNum - stp d14, d15, [sp, #(-16 * 4)]! 
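// d8-d15 are callee-saved under the AArch64 procedure call standard, so they are spilled here before the kernel clobbers them.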
stp d12, d13, [sp, #(16 * 1)] stp d10, d11, [sp, #(16 * 2)] @@ -43,7 +41,6 @@ stp d8, d9, [sp, #(16 * 3)] movi v31.16b, #1 ld1r {v30.4s}, [x2] // Dequant scale -mov x6, #48 // EP*LP sdiv x4, x4, x8 // src_depth_quad per block TILE_12: @@ -103,6 +100,7 @@ Remain: // remain realDstCount < EP cbz x3, End /* x11: Remain dstCount step for each block */ lsl x11, x3, #2 +lsl x6, x3, #2 // x6=eDest * LP TILE_2: // realDstCount >= 1 cmp x3, #2 @@ -199,4 +197,4 @@ ldp d10, d11, [sp, #(16 * 2)] ldp d12, d13, [sp, #(16 * 1)] ldp d14, d15, [sp], #(16 * 4) ret -#endif \ No newline at end of file +#endif diff --git a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S index dde601bfc..b402cdd9c 100644 --- a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S +++ b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S @@ -39,13 +39,13 @@ stp d8, d9, [sp, #(16 * 3)] movi v31.16b, #1 ld1r {v30.4s}, [x2] // dequant scale -mov x8, #80 // EP*LP sdiv x5, x5, x6 // src_depth_quad_per_block START: lsl x11, x3, #2 cmp x3, #1 +mov x8, #8 // for LLM decode, otherwise update in Remain beq TILE_1 TILE_10: // realDstCount >= EP(10) @@ -114,7 +114,8 @@ Remain: // remain realDstCount < EP cbz x3, End lsl x11, x3, #2 -/* For remain dstCount, each E's block step is x11. */ +lsl x8, x3, #3 // x8: eDest*LP +/* For remain dstCount, each E's block step is x11. */ TILE_8: // realDstCount >= 8 cmp x3, #8 blt TILE_4 diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index 01d574fa8..eed75e93d 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -344,7 +344,6 @@ L3LoopDz: mov x22, x2 ld1 {v10.16b, v11.16b}, [x2], #32 ld1 {v4.16b, v5.16b, v6.16b}, [x1], #48 - add x1, x1, #16 // int4->int8 movi v8.16b, #15 ushr v0.16b, v10.16b, #4 @@ -442,7 +441,6 @@ L3LoopDz: smull v11.8h, v3.8b, v6.8b subs x9, x9, #1 - add x1, x1, #16 smlal2 v8.8h, v0.16b, v6.16b smlal2 v9.8h, v1.16b, v6.16b @@ -544,7 +542,7 @@ L2LoopDz: smull v13.8h, v1.8b, v5.8b smull v14.8h, v2.8b, v5.8b smull v15.8h, v3.8b, v5.8b - add x1, x1, #32 + smlal2 v8.8h, v0.16b, v4.16b smlal2 v9.8h, v1.16b, v4.16b smlal2 v10.8h, v2.16b, v4.16b @@ -590,7 +588,7 @@ L2LoopDz: smlal2 v9.8h, v1.16b, v4.16b smlal2 v10.8h, v2.16b, v4.16b smlal2 v11.8h, v3.16b, v4.16b - add x1, x1, #32 + subs x9, x9, #1 smlal2 v12.8h, v0.16b, v5.16b smlal2 v13.8h, v1.16b, v5.16b @@ -680,7 +678,6 @@ L1LoopDz: dup v16.4s, wzr dup v17.4s, wzr ld1 {v4.16b}, [x1], #16 - add x1, x1, #48 smull v8.8h, v0.8b, v4.8b dup v18.4s, wzr @@ -707,7 +704,6 @@ L1LoopDz: sadalp v23.4s, v15.8h ld1 {v10.16b, v11.16b}, [x2], #32 - add x1, x1, #48 // int4->int8 movi v8.16b, #15 ushr v0.16b, v10.16b, #4 diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S index 4e94c454d..21ebb04f6 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S @@ -133,7 +133,7 @@ ldr x24, [x6, #80] // extraScale mov x21, #16 // sizeof(float) * pack ldr x23, [x6, #56] // fp32minmax Start: -mov x22, #48 // src_steps +lsl x22, x7, #2 // eDest * SRC_UNIT TILE_12: cmp x7, #12 @@ -823,8 
+823,8 @@ L8LoopDz_TILE_1: movi v8.16b, #0 movi v9.16b, #0 - cmp x13, #4 - blt L8LoopSz_TILE_1_lu1 + //cmp x13, #4 + b L8LoopSz_TILE_1_lu1 //lsl x22, x22, #2 L8LoopSz_TILE_1_lu4: diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S index d6b2c53e2..7b207a024 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S @@ -123,7 +123,7 @@ ldr x14, [x6, #56] // float32 maxmin ptr ldr x23, [x6, #80] // extra scale Start: -mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 +lsl x22, x7, #3// eDest * GEMM_INT8_SRC_UNIT TILE_10: cmp x7, #10 diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNQuantScaleFP32.S b/source/backend/cpu/arm/arm64/low_memory/MNNQuantScaleFP32.S index 85a1caeea..a1a3523b7 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNQuantScaleFP32.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNQuantScaleFP32.S @@ -45,6 +45,10 @@ bne LoopSz_8 Tile8End: sub x4, x4, #8 +fcmle v28.4s, v1.4s, #0 +fcmle v29.4s, v2.4s, #0 +bit v1.16b, v31.16b, v28.16b +bit v2.16b, v31.16b, v29.16b add x0, x0, #32 fdiv v5.4s, v31.4s, v1.4s fdiv v6.4s, v31.4s, v2.4s @@ -80,6 +84,8 @@ sub x4, x4, #4 add x0, x0, #16 // quant_scale = 127 / absmax // dequant_scale = absmax / 127 +fcmle v28.4s, v1.4s, #0 +bit v1.16b, v31.16b, v28.16b fdiv v2.4s, v31.4s, v1.4s fdiv v3.4s, v1.4s, v31.4s st1 {v2.4s}, [x1], #16 @@ -113,6 +119,8 @@ sub x4, x4, #1 add x0, x0, #4 // quant_scale = 127 / absmax // dequant_scale = absmax / 127 +fcmle v28.4s, v1.4s, #0 +bit v1.16b, v31.16b, v28.16b fdiv s2, s31, s1 fdiv s3, s1, s31 st1 {v2.s}[0], [x1], #4 diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index f5f1af06a..c77790561 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -215,8 +215,13 @@ void MNNQuantScaleFP32(float* absmax, float* quant_scale, float* dequant_scale, for (int t = 0; t < thread; ++t) { absVal = std::max(absVal, absmaxPtr[t * batch]); } - quant_scale[i] = 127.0f / absVal; - dequant_scale[i] = absVal / 127.0f; + if (absVal < 1e-7) { + quant_scale[i] = 1.f; + dequant_scale[i] = 1.f; + } else { + quant_scale[i] = 127.0f / absVal; + dequant_scale[i] = absVal / 127.0f; + } } } void MNNQuantSumFP32(float* sum, const float* dequant_scale, size_t thread, size_t batch) { @@ -287,7 +292,7 @@ static void MNNSumByAxisLForMatmul_A(float* dest, int8_t* source, const float* s for (int k = 0; k < blockNum; ++k) { // const auto src_x = srcInt8 + w * LP; - const auto src_x = srcInt8 + k * (EP * LP * blockSizeQuad); + const auto src_x = srcInt8 + k * (step * LP * blockSizeQuad); for (int w = 0; w < step; ++w) { float dequantScale = scale[0]; if (oneScale == 0) { @@ -296,7 +301,7 @@ static void MNNSumByAxisLForMatmul_A(float* dest, int8_t* source, const float* s int sumint32 = 0; const auto src_y = src_x + w * LP; for (int j = 0; j < blockSizeQuad; ++j) { - const auto src_z = src_y + j * (EP * LP); + const auto src_z = src_y + j * (step * LP); for (int i = 0; i < LP; ++i) { sumint32 += src_z[i]; } @@ -2762,7 +2767,7 @@ void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const floa Vec4 sumValue = Vec4(0.0f); auto by = B + y * l; for (int x=0; xoutputCount(), ic = common->inputCount(), kernelCount = common->kernelX() 
* common->kernelY(); std::vector shape; int pack = gcore->pack; - if (gcore->bytes == 2 && gcore->pack == 8) { - pack = 4; - } if (SRC_UNIT > pack) { MNN_ASSERT(SRC_UNIT % pack == 0); shape = {UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack), UNIT, SRC_UNIT}; @@ -178,8 +175,10 @@ static void GetResourceInt8(std::shared_ptr resour } auto alphaPtr = resource->mOriginScale->host(); auto biasPtr = reinterpret_cast(reinterpret_cast(alphaPtr) + scaleSize * bytes); - ::memset(alphaPtr, 1, scaleSize * bytes); - ::memset(biasPtr, 0, scaleSize * bytes); + if (outputCount % core->pack != 0) { + ::memset(alphaPtr, 0, scaleSize * bytes); + ::memset(biasPtr, 0, scaleSize * bytes); + } auto quanInfoPtr = quantCommon->alpha.get(); int h = quantCommon->alpha.size(); if (quantCommon->asymmetric) { @@ -444,18 +443,10 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st auto wZero = resourceInt8->mWeightQuantZero->host(); // has packed to outputUp4 auto wScale = resourceInt8->mOriginScale->host(); int h = ocUp4; - if (core->bytes == 2) { - std::unique_ptr tmp(new int16_t[h]); - core->MNNFp32ToLowp(wScale, tmp.get(), h); - for (int i=0; i< h; ++i) { - reinterpret_cast(alphaPtr)[i] = tmp[i]; - reinterpret_cast(biasPtr)[i] = (-1.f) * wZero[i] * tmp[i]; - } - } else { - for (int i=0; i< h; ++i) { - alphaPtr[i] = wScale[i]; - biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; - } + MNN_ASSERT(4 == core->bytes); + for (int i=0; i< h; ++i) { + alphaPtr[i] = wScale[i]; + biasPtr[i] = (-1.f) * wZero[i] * wScale[i]; } resourceInt8->mOriginScale = scaleBias; @@ -582,11 +573,8 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input } // A axisSum kernel mSumByAxisLFunc = gcore->MNNSumByAxisLForMatmul_A; - if (gcore->bytes == 2 && gcore->pack == 8) { // use fp16 - ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core, 4); - } else { - ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core); - } + ConvolutionTiledExecutor::setIm2ColParameter(mIm2ColParamter, mCommon, inputs[0], outputs[0], mPadX, mPadY, gcore, core); + int ocUp4 = ROUND_UP(outputs[0]->channel(), gcore->pack); int alphaSize = mResourceInt8->mOriginScale->size() / (sizeof(float) * 2); mBlockNum = alphaSize / ocUp4; @@ -864,6 +852,15 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu } /* Dynamic quant */ + if (mCommon->padX() > 0 || mCommon->padY() > 0) { // Ensure "0.0f" included in range. 
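            // Padded im2col positions are filled with zeros, so the dynamic-quant range must contain 0.0f;
            // otherwise the padded values could not be represented by the chosen zero point.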
+ if (minVal > 0.f) { + minVal = 0.f; + } else if (maxVal < 0.f){ + maxVal = 0.f; + } else { + // + } + } float range = maxVal - minVal; if (fabs(range) < 1e-7) { zeropoint = maxVal; @@ -875,12 +872,22 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu zeropoint = roundf(-minVal * 255.f / range) - 128.0f; } auto sizeDiv = UP_DIV(inputsize, PackUnit); - int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih; - if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4 - mQuantAndReorderFunc(floatptr, int8ptr, inputPlane, &quantscale, -128, 127, &zeropoint, UP_DIV(input->channel(), PackUnit), 4 * inputPlane); + + threadNeed = mThreadNums; + inputSizeCount = UP_DIV(sizeDiv, mThreadNums); + if (inputSizeCount < 9) { + threadNeed = 1; + inputSizeCount = sizeDiv; } else { - mQuantFunc(floatptr, int8ptr, sizeDiv, &quantscale, -128, 127, &zeropoint, 0); + threadNeed = ALIMIN(UP_DIV(sizeDiv, inputSizeCount), mThreadNums); + inputSizeCount = UP_DIV(sizeDiv, threadNeed); } + MNN_CONCURRENCY_BEGIN(tId, threadNeed) { + auto perThreadWorkCount = ALIMIN(inputSizeCount, sizeDiv - tId * inputSizeCount); + auto inptr_ = (float*)(((int8_t*)floatptr) + tId * inputSizeCount * PackUnit * gcore->bytes); + mQuantFunc(inptr_ , int8ptr + tId * inputSizeCount * PackUnit, perThreadWorkCount, &quantscale, -128, 127, &zeropoint, 0); + } + MNN_CONCURRENCY_END(); /* bias float */ #ifdef MNN_USE_SSE @@ -1078,7 +1085,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4; quanParam.scale = (float*)(scaleFloatTid + k * ocUp4); - mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step); + mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * step * SRC_UNIT, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step); } ptrX += (step * mBlockNum); realDstCount-=step; @@ -1092,7 +1099,8 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu const int threads = static_cast(backend())->threadNumber(); if (!mSplitByOc) { MNN_CONCURRENCY_BEGIN(tId, threads) { - ThreadFunction((int)tId, mDivides[tId], mDivides[tId + 1], 1, 0); + ThreadFunction((int)tId, mDivides[tId], mDivides[tId + 1], 1, 0); + } MNN_CONCURRENCY_END(); } else { diff --git a/source/backend/cpu/compute/ConvInt8Winograd.cpp b/source/backend/cpu/compute/ConvInt8Winograd.cpp index a460c1db8..23f31d8c8 100644 --- a/source/backend/cpu/compute/ConvInt8Winograd.cpp +++ b/source/backend/cpu/compute/ConvInt8Winograd.cpp @@ -323,7 +323,7 @@ ErrorCode ConvInt8Winograd::onExecute(const std::vector &inputs, const std::vector scale(pack, inputQuant[0]); int size = bn->getTensorSize(mInputFloat.get()); - core->MNNInt8ScaleToFloat(mInputFloat->host(), inputs[0]->host(), scale.data(), size / pack, inputQuant[1]); + core->MNNInt8ScaleToFloat(mInputFloat->host(), inputs[0]->host(), &inputQuant[0], size / pack, &inputQuant[1], 0); std::vector tmp_outputs; for (auto& unit : mUnits) { unit.input->buffer().host = TensorUtils::getDescribeOrigin(unit.input.get())->mem->chunk().ptr(); @@ -557,7 +557,7 @@ ErrorCode ConvInt8Winograd::WinoExecution::onExecute(const std::vector quanParam.extraScale = nullptr; quanParam.bias = nullptr; quanParam.blockNum = 1; - gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, 
_weightInt8Ptr, mTempInputBuffer->length(2), xC * pack * sizeof(float), dc_4, &quanParam, xC); + gemmFunc((int8_t*)_dstFloatPtr, _srcInt8Ptr, _weightInt8Ptr, mTempInputBuffer->length(2), xC * pack * sizeof(float), dc_4, &quanParam, DST_XUNIT); } #ifndef MNN_WINO_TRANFORM_TEST_CLOSE { diff --git a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp index d09a3f6fd..d8c6e6cd9 100644 --- a/source/backend/cpu/compute/ConvolutionFloatFactory.cpp +++ b/source/backend/cpu/compute/ConvolutionFloatFactory.cpp @@ -56,13 +56,18 @@ static Execution* _createUnit(const Tensor* input, const Tensor* output, Backend return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo); } } +#ifndef MNN_LOW_MEMORY + if (cpuBackend->memoryMode() == BackendConfig::Memory_Low) { + return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo); + } +#endif if (fastWay && cpuBackend->functions()->matmulBytes == 0) { return new Convolution1x1Strassen(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo); } if (originWeightSize == 0) { return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, weightQuantInfo); } - if (!ConvolutionWinogradBridge::canUseWinograd(common)) { + if (cpuBackend->getRuntime()->hint().winogradMemoryUsed == 0 || (!ConvolutionWinogradBridge::canUseWinograd(common))) { return new DenseConvolutionTiledExecutor(common, backend, originWeight, originWeightSize, bias, biasSize, nullptr); } PerfConfig convPerfconfig = DenseConvolutionTiledExecutor::bestTileConvolutionConfig(common, input, output, cpuBackend->threadNumber(), backend); diff --git a/source/backend/cpu/compute/ConvolutionPackFreeWinograd.cpp b/source/backend/cpu/compute/ConvolutionPackFreeWinograd.cpp index a4ccae1db..98480c6d9 100644 --- a/source/backend/cpu/compute/ConvolutionPackFreeWinograd.cpp +++ b/source/backend/cpu/compute/ConvolutionPackFreeWinograd.cpp @@ -645,6 +645,11 @@ WinogradConfig ConvolutionPackFreeWinograd::updateBestWinogradUnit(const Convolu auto oc4 = UP_DIV(oc, pack); int ePackMax, hPack, lPack; core->MNNGetMatMulPackMode(&ePackMax, &lPack, &hPack); + auto winogradMemoryLevel = static_cast(b)->getRuntime()->hint().winogradMemoryUsed; + int unitMaxLimit = CONVOLUTION_WINOGRAD_MAX_UNIT; + if (winogradMemoryLevel != 3) { + unitMaxLimit = CONVOLUTION_WINOGRAD_MIN_UNIT; + } WinogradConfig bestConfig(0, false, 0, 0, 0, std::numeric_limits().max()); auto kernelSize = common->kernelY(); @@ -659,7 +664,7 @@ WinogradConfig ConvolutionPackFreeWinograd::updateBestWinogradUnit(const Convolu for (int ePack = ePackUnit; ePack <= ePackUnit; ePack += ePackUnit) { int unit2 = UP_DIV(batch * ow * oh, ePack); int maxUnit = (int)::sqrtf((float)unit2); - maxUnit = std::min(maxUnit, CONVOLUTION_WINOGRAD_MAX_UNIT); + maxUnit = std::min(maxUnit, unitMaxLimit); maxUnit = std::max(maxUnit, CONVOLUTION_WINOGRAD_MIN_UNIT); std::set supportSu{4, 6, 8}; diff --git a/source/backend/cpu/compute/GemmInt8Executor.cpp b/source/backend/cpu/compute/GemmInt8Executor.cpp index e314f9fcf..032e97c26 100644 --- a/source/backend/cpu/compute/GemmInt8Executor.cpp +++ b/source/backend/cpu/compute/GemmInt8Executor.cpp @@ -200,11 +200,10 @@ ErrorCode GemmInt8Executor::onExecute(const std::vector &inputs, const auto threadFunction = [&](int tId) { auto colAddr = im2colPtr + tId * mInputCol->stride(0); auto 
col_buffer_size = mInputCol->stride(0); - int32_t info[6]; + int32_t info[5]; info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih * batch; info[2] = DST_XUNIT; info[3] = mIm2ColParamter.strideX; - info[5] = mIm2ColParamter.kernelCountUnit; float paramsf[1]; paramsf[0] = dequantScale; auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first); diff --git a/source/backend/cpu/compute/IdstConvolutionInt8.cpp b/source/backend/cpu/compute/IdstConvolutionInt8.cpp index 20ce94af3..8046ecd15 100644 --- a/source/backend/cpu/compute/IdstConvolutionInt8.cpp +++ b/source/backend/cpu/compute/IdstConvolutionInt8.cpp @@ -154,6 +154,9 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, con int PackUnit = static_cast(backend())->functions()->pack; auto gemmKernel = coreInt->Int8GemmKernel; + if (SRC_UNIT > PackUnit) { + memset(mTempBuffer.host(), 0, mTempBuffer.size()); + } // AUTOTIME; auto input = inputs[0]; @@ -210,7 +213,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, con auto srcPtr = (int8_t const **)(mBlitInfo.ptr() + tId * mBlitInfoStride.first); auto el = (int32_t *)(srcPtr + mBlitInfoStride.second); - int32_t info[4]; + int32_t info[5]; info[1] = mIm2ColParamter.iw * mIm2ColParamter.ih; info[2] = DST_XUNIT; info[3] = mIm2ColParamter.strideX; @@ -225,6 +228,7 @@ ErrorCode IdstConvolutionInt8::onExecute(const std::vector& inputs, con ::memset(colAddr, zeroPoint, col_buffer_size); } info[0] = number; + info[4] = realDstCount; if (number > 0) { blitProc(colAddr, srcPtr, info, el); } diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index 5bed95103..7dd218564 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -1440,7 +1440,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = src_x + sz * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT; + const auto src_z = src_x + sz * realCount * GEMM_INT8_SRC_UNIT; for (int j = 0; j < GEMM_INT8_UNIT; ++j) { const auto weight_j = weight_sz + j * GEMM_INT8_SRC_UNIT; @@ -1506,7 +1506,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = (uint8_t*)weight_dz + weight_step_Y * sz; - const auto src_z = src_x + sz * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT; + const auto src_z = src_x + sz * realCount * GEMM_INT8_SRC_UNIT; int w8[64]; // 64=GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT for (int k = 0; k < 32; ++k) { @@ -1671,12 +1671,20 @@ void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* } } -void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint) { +void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, const float* zeroPoint, ssize_t quantParamVec) { + float scale_[4] = {scale[0], scale[0], scale[0], scale[0]}; + float zero_[4] = {zeroPoint[0], zeroPoint[0], zeroPoint[0], zeroPoint[0]}; + if (quantParamVec & 1) { + ::memcpy(scale_, scale, 4 * sizeof(float)); + } + if (quantParamVec >> 1) { + ::memcpy(zero_, zeroPoint, 4 * sizeof(float)); + } for (int i = 0; i < size; ++i) { const auto srcStart = src + i * 4; auto dstStart = dst + i * 4; for (int j = 0; j < 4; ++j) { - dstStart[j] = static_cast(srcStart[j] - zeroPoint) * scale[j]; + 
dstStart[j] = static_cast(srcStart[j] - zero_[j]) * scale_[j]; } } } @@ -2033,22 +2041,39 @@ static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sou int eDest = EP; int offset = info[3]; const int LUNIT = LP / sizeof(float); + int realDstCount = info[4]; for (int n=0; n 0) { + int jobsE = realDstCount - eOffset - e; + if (jobsE == 0 || (jobsE < (realDstCount % EP))) { + lastBag = true; + } + } auto dest = (int32_t*)(destOrigin + lC * eDest * LP + lR + eC * info[2] + eR * LP); auto source = (int32_t*)sourceGroup[n]; int lRemain = l / 4; - int lR4 = lR / LUNIT; + int lR4 = lR / 4; int lS = LUNIT - lR4; - int eS = eDest - eR; + + if (lastBag && e + eR < EP) { + int elast = ALIMAX(eR + e, realDstCount % EP); + dest = (int32_t*)(destOrigin + lC * elast * LP + lR + eC * info[2] + eR * LP); + } // Step for start + int offsetLC = lC * LUNIT + lR / 4; + if (lR4 > 0) { int step = ALIMIN(lS, lRemain); for (int x=0; x= EP) { + d += (eOutsideStride - eR * LUNIT); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - ((offsetLC / LUNIT) * EP * LUNIT); + d += (eOutsideStride4LastBag - eR * LUNIT + (offsetLC / LUNIT) * eFill * LUNIT); + } s += eS * offset; } while (eRemain > 0) { @@ -2070,14 +2101,29 @@ static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sou d[yi * LUNIT] = s[yi * offset]; } eRemain-=eStep; - d+= eOutsideStride; + if (!lastBag || eRemain >= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - ((offsetLC / LUNIT) * EP * LUNIT); + d+= (eOutsideStride4LastBag + (offsetLC / LUNIT) * eFill * LUNIT); + } s+= eStep * offset; } + offsetLC++; } lRemain -= step; - dest += step; + if (lastBag && e + eR < EP) { + int eFill = ALIMAX(realDstCount % EP, e + eR); + int nextLP = (eFill * LP - lR) / sizeof(int32_t); + dest += nextLP; + } else { + int nextLP = (eDest * LP - lR) / sizeof(int32_t); + dest += nextLP; + } source += eReal * step; } + while (lRemain > 0) { int step = ALIMIN(lRemain, LUNIT); for (int x=0; x= EP) { + d += (eOutsideStride - eR * LUNIT); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - ((offsetLC / LUNIT) * EP * LUNIT); + d += (eOutsideStride4LastBag - eR * LUNIT + (offsetLC / LUNIT) * eFill * LUNIT); + } s += eS * offset; } while (eRemain > 0) { @@ -2099,12 +2151,25 @@ static void _ArmBasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sou d[yi * LUNIT] = s[yi * offset]; } eRemain-=eStep; - d+= eOutsideStride; + if (!lastBag || eRemain >= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - ((offsetLC / LUNIT) * EP * LUNIT); + d+= (eOutsideStride4LastBag + (offsetLC / LUNIT) * eFill * LUNIT); + } s+= eStep * offset; } + offsetLC++; } + lRemain -= step; - dest += eDest * LUNIT; + if (lastBag && e + eR < EP) { + int efill = ALIMAX(e + eR, realDstCount % EP); + dest += efill * LUNIT; + } else { + dest += eDest * LUNIT; + } source += eReal * step; } } @@ -2136,17 +2201,33 @@ static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** int offset = info[3]; const int LP = 4; int eOutsideStride = info[2] / sizeof(float); + int kernelCountUnit = eOutsideStride; + int realDstCount = info[4]; for (int n=0; n 0) { + int jobsE = realDstCount - eOffset - e; + if (jobsE == 0 || 
(jobsE < (realDstCount % EP))) { + lastBag = true; + } + } + auto dest = (int32_t*)(destOrigin + lOffset * eDest + eC * info[2] + eR * LP); auto source = (int32_t*)sourceGroup[n]; int lRemain = l / sizeof(float); + if (lastBag && e + eR < EP) { + int elast = ALIMIN(ALIMAX(eR + e, realDstCount % EP), EP); + dest = (int32_t*)(destOrigin + lOffset * elast + eC * info[2] + eR * LP); + } + int offsetLC = lOffset / 4; for (int x=0; x= EP) { + d += (eOutsideStride - eR); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d += (eOutsideStride4LastBag - eR + offsetLC * eFill); + } s += eS * offset; } while (eRemain > 0) { int eStep = ALIMIN(eDest, eRemain); ::memcpy(d, s, eStep * sizeof(int32_t)); eRemain-=eStep; - d+= eOutsideStride; + if (!lastBag || eRemain >= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d+= (eOutsideStride4LastBag + offsetLC * eFill); + } s+= eStep * offset; } } else { @@ -2173,7 +2266,13 @@ static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** d[yi] = s[yi * offset]; } eRemain-=eStep; - d += (eOutsideStride - eR); + if (!lastBag ||eRemain >= EP) { + d += (eOutsideStride - eR); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d += (eOutsideStride4LastBag - eR + offsetLC * eFill); + } s += eS * offset; } while (eRemain > 0) { @@ -2182,12 +2281,24 @@ static void _ArmBasicMNNPackC4ForMatMul_A_L4(int8_t* destOrigin, int8_t const** d[yi] = s[yi * offset]; } eRemain-=eStep; - d+= eOutsideStride; + if (!lastBag || eRemain >= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d+= (eOutsideStride4LastBag + offsetLC * eFill); + } s+= eStep * offset; } } - dest += eDest; source += eReal; + if (lastBag && e + eR < EP ) { // eR=0;eR>0 + int efill = ALIMAX(e + eR, realDstCount % EP); + dest += efill; + } else { + dest += eDest; + } + offsetLC++; } } } @@ -2237,7 +2348,7 @@ void MNNCoreInt8FunctionInit() { gCoreFunc->Int8GemmKernelFast = MNNGemmInt8AddBiasScale_ARMV82_Unit; gCoreFunc->MNNGetGemmUnit = MNNGetGemmUnitSdot; // Im2Col - gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 4>; + gCoreFunc->MNNPackC4Int8ForMatMul_A = _ArmBasicMNNPackC4ForMatMul_A_L4<12, 8>; // ConvDepthwise gCoreFunc->ConvDepthwise3x3LineInt8_ARM82 = MNNLineDepthWiseInt8AddBiasScale_ARMV82_Unit3X3; core->MNNSumByAxisLForMatmul_A = MNNSumByAxisLForMatmul_A_ARM82; diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.h b/source/backend/cpu/compute/Int8FunctionsOpt.h index 6860c0643..eb405e6e8 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.h +++ b/source/backend/cpu/compute/Int8FunctionsOpt.h @@ -62,7 +62,7 @@ struct QuanPrePostParameters{ }; void MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); -void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); +void MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, const float* zeroPoint, ssize_t quanParamVec); 
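// quanParamVec: 0 = scalar scale and zero point; bit 0 set = per-pack scale vector; bit 1 set = per-pack zero-point vector (3 = both), matching the reference implementation in Int8FunctionsOpt.cpp above.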
void MNNInt8FunctionInit(); void MNNPackedSparseQuantMatMulEpx1(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); void MNNPackedSparseQuantMatMulEpx4(int8_t* C, const int8_t* A, const int8_t* B, const size_t* sparseQuantParam, const QuanPostTreatParameters* post, unsigned int* NNZMap, int* dataOffsetMap); @@ -104,7 +104,7 @@ struct CoreInt8Functions { void(*DynamicQuanInput_ARM82)(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec) = nullptr; void (*DynamicQuanInputAndReorder_ARM82)(const float* src, int8_t* dst, size_t planeSize, const float* scale, ssize_t aMin, ssize_t aMax, const float* zeroPoint, size_t ocQuad, size_t offset) = nullptr; void(*MNNFloat2Int8)(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); - void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); + void(*MNNInt8ScaleToFloat)(float* dst, const int8_t* src, const float* scale, size_t size, const float* zeroPoint, ssize_t quanParamVec); void(*MNNScaleAndAddBias)(float* dst, const float* src, const float* bias, const float* alpha, size_t planeNumber, size_t biasNumber); diff --git a/source/backend/cpu/x86_x64/AVX2Backend.cpp b/source/backend/cpu/x86_x64/AVX2Backend.cpp index ed263e366..65567295d 100644 --- a/source/backend/cpu/x86_x64/AVX2Backend.cpp +++ b/source/backend/cpu/x86_x64/AVX2Backend.cpp @@ -7,12 +7,7 @@ // #include -#if defined(_MSC_VER) -#include -#else -#include -#endif - +#include "core/SimdHeader.h" #include "AVX2Functions.hpp" #include "AVX2Backend.hpp" #include "core/BufferAllocator.hpp" diff --git a/source/backend/cpu/x86_x64/AVX2Functions.cpp b/source/backend/cpu/x86_x64/AVX2Functions.cpp index 3bafc7573..ad105f6ba 100644 --- a/source/backend/cpu/x86_x64/AVX2Functions.cpp +++ b/source/backend/cpu/x86_x64/AVX2Functions.cpp @@ -22,6 +22,12 @@ static void _MNNGetMatMulPackMode(int* eP, int *lP, int* hP) { *hP = ghP; } +#ifndef MNN_USE_AVX +bool AVX2Functions::init(int cpuFlags) { + return false; +} +#else + bool AVX2Functions::init(int cpuFlags) { gAVX2CoreFunctions = new CoreFunctions; auto coreFunction = gAVX2CoreFunctions; @@ -99,11 +105,12 @@ bool AVX2Functions::init(int cpuFlags) { #endif return true; } +#endif + CoreFunctions* AVX2Functions::get() { return gAVX2CoreFunctions; } CoreInt8Functions* AVX2Functions::getInt8() { return gAVX2CoreInt8Functions; } - }; diff --git a/source/backend/cpu/x86_x64/CMakeLists.txt b/source/backend/cpu/x86_x64/CMakeLists.txt index d9b462266..42d26b472 100644 --- a/source/backend/cpu/x86_x64/CMakeLists.txt +++ b/source/backend/cpu/x86_x64/CMakeLists.txt @@ -5,6 +5,12 @@ IF(MSVC AND (DEFINED ENV{MNN_ASSEMBLER}) AND "${CMAKE_SIZEOF_VOID_P}" STREQUAL " set(WIN_USE_ASM ON) ENDIF() message(STATUS "WIN_USE_ASM: ${WIN_USE_ASM}") +if (EMSCRIPTEN) + set(MNN_AVX2 OFF) +endif() +if (NOT MNN_AVX2) +set(MNN_AVX512 OFF) +endif() function (process_asm TARGET_NAME FILE_SRCS) if(NOT MSVC) return() @@ -32,7 +38,7 @@ function (process_asm TARGET_NAME FILE_SRCS) set(EXTRA_OBJS ${EXTRA_OBJS} PARENT_SCOPE) endfunction() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64)|(i686)|(x86)") message(STATUS 
"${CMAKE_SYSTEM_PROCESSOR}: Open SSE") target_compile_options(MNNCPU PRIVATE -DMNN_USE_SSE) option(MNN_AVX512_VNNI "Enable AVX512 VNNI" ON) @@ -55,6 +61,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) if (MNN_AVX512_VNNI) target_compile_options(MNNAVX512 PRIVATE -DMNN_AVX512_VNNI) add_library(MNNAVX512_VNNI OBJECT ${MNNAVX512_VNNI_SRC}) + target_compile_options(MNNAVX512_VNNI PRIVATE -DMNN_USE_SSE) target_compile_options(MNNAVX512_VNNI PRIVATE -DMNN_AVX512_VNNI) if (MSVC) target_compile_options(MNNAVX512 PRIVATE /arch:AVX512) @@ -68,27 +75,29 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) process_asm(MNNAVXFMA MNN_AVXFMA_SRC) process_asm(MNNSSE MNN_SSE_SRC) add_library(MNNX8664 OBJECT ${MNN_X8664_SRC}) - add_library(MNNAVX OBJECT ${MNN_AVX_SRC}) - add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC}) add_library(MNNSSE OBJECT ${MNN_SSE_SRC}) target_compile_options(MNNX8664 PRIVATE -DMNN_USE_SSE) target_compile_options(MNNSSE PRIVATE -DMNN_USE_SSE) - target_compile_options(MNNAVX PRIVATE -DMNN_USE_SSE) - target_compile_options(MNNAVXFMA PRIVATE -DMNN_USE_SSE) + if (MNN_AVX2) + target_compile_options(MNNX8664 PRIVATE -DMNN_USE_AVX) + add_library(MNNAVX OBJECT ${MNN_AVX_SRC}) + add_library(MNNAVXFMA OBJECT ${MNN_AVXFMA_SRC}) + target_compile_options(MNNAVX PRIVATE -DMNN_USE_SSE) + target_compile_options(MNNAVXFMA PRIVATE -DMNN_USE_SSE) + endif() if(MSVC) - target_compile_options(MNNAVX PRIVATE /arch:AVX) - target_compile_options(MNNAVXFMA PRIVATE /arch:AVX2) + if (MNN_AVX2) + target_compile_options(MNNAVX PRIVATE /arch:AVX) + target_compile_options(MNNAVXFMA PRIVATE /arch:AVX2) + endif() else() target_compile_options(MNNSSE PRIVATE -msse4.1) - target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM) - target_compile_options(MNNAVXFMA PRIVATE -mavx2 -mfma -DMNN_X86_USE_ASM) - endif() - if (MNN_SUPPORT_BF16) - target_compile_options(MNNAVXFMA PRIVATE -DMNN_SUPPORT_BF16) - if (MNN_SSE_USE_FP16_INSTEAD) - target_compile_options(MNNAVXFMA PRIVATE -DMNN_SSE_USE_FP16_INSTEAD -mf16c) + if (MNN_AVX2) + target_compile_options(MNNAVX PRIVATE -mavx2 -DMNN_X86_USE_ASM) + target_compile_options(MNNAVXFMA PRIVATE -mavx2 -mfma -DMNN_X86_USE_ASM) endif() endif() + if (MNN_LOW_MEMORY) target_compile_options(MNNX8664 PRIVATE -DMNN_LOW_MEMORY) target_compile_options(MNNSSE PRIVATE -DMNN_LOW_MEMORY) @@ -101,8 +110,12 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(X86_64)|(x64)|(X64)|(amd64)|(AMD64) target_compile_options(MNNAVX PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) target_compile_options(MNNAVXFMA PRIVATE -DMNN_CPU_WEIGHT_DEQUANT_GEMM) endif() - list(APPEND MNN_OBJECTS_TO_LINK $ $ $ $) - if (MSVC AND WIN_USE_ASM) + list(APPEND MNN_OBJECTS_TO_LINK $ $) + if (MNN_AVX2) + list(APPEND MNN_OBJECTS_TO_LINK $ $) + endif() + + if (MSVC AND WIN_USE_ASM AND MNN_AVX2) target_compile_options(MNNAVX PRIVATE -DMNN_X86_USE_ASM) target_compile_options(MNNAVXFMA PRIVATE -DMNN_X86_USE_ASM) list(APPEND MNN_OBJECTS_TO_LINK ${EXTRA_OBJS}) diff --git a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp index 54effc2cb..21c8bd408 100644 --- a/source/backend/cpu/x86_x64/FunctionDispatcher.cpp +++ b/source/backend/cpu/x86_x64/FunctionDispatcher.cpp @@ -17,11 +17,6 @@ #include "cpu_id.h" #include "sse/FunctionSummary.hpp" // https://stackoverflow.com/a/11230437 -#if defined(_MSC_VER) -#include -#else -#include -#endif struct FunctionGroup { int tileNumber = 8; @@ -45,6 +40,11 @@ void 
_SSEMNNGetMatMulPackMode(int* eP, int *lP, int* hP) { } void MNNFunctionInit() { auto cpuFlags = libyuv::InitCpuFlags(); +#ifdef __EMSCRIPTEN__ + // TODO: Find better way + cpuFlags |= libyuv::kCpuHasSSE41; + cpuFlags |= libyuv::kCpuHasSSSE3; +#endif auto coreFunction = MNN::MNNGetCoreFunctions(); if (cpuFlags & libyuv::kCpuHasSSSE3) { coreFunction->MNNGetMatMulPackMode = _SSEMNNGetMatMulPackMode; @@ -65,6 +65,7 @@ void MNNFunctionInit() { // Dynamic Quant coreFunction->MNNCountMaxMinValue = _SSE_MNNComputeScaleZeroScalar; } +#ifdef MNN_USE_AVX if (cpuFlags & libyuv::kCpuHasAVX2) { MNN::AVX2Functions::init(cpuFlags); gFunc.MNNExpC8 = _AVX_MNNExpC8; @@ -76,6 +77,7 @@ void MNNFunctionInit() { } gFunc.MNNNorm = _AVX_MNNNorm; } +#endif _SSE_ImageProcessInit(coreFunction, cpuFlags); } diff --git a/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp b/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp index c21411b48..fbc7ae104 100644 --- a/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/avx/FunctionSummary.hpp @@ -6,11 +6,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#if defined(_MSC_VER) -#include -#else -#include -#endif +#include "core/SimdHeader.h" #include #include @@ -56,7 +52,7 @@ void _AVX_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, con void _AVX_MNNExpC8(float* dest, const float* source, float* offset, const float* parameters, size_t countC8); void _AVX_MNNSoftmax(float* dest, const float* source, size_t size); void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minV, ssize_t maxV, const float* zeroPoint, ssize_t quanParamVec); -void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, ssize_t zeroPoint); +void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, const float* zeroPoint, ssize_t quanParamVec); void _AVX_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dstO, const int8_t* srcO, const int8_t* weightO, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder); void _AVX_MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); void _AVX_MNNPackC4ForMatMul_A_BF16(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); diff --git a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp index ed4226b89..5a1d26af1 100644 --- a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp @@ -117,6 +117,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const } } } + auto oneValue = _mm256_set1_epi16(1); //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { @@ -130,40 +131,26 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); __m256i D03 = _mm256_set1_epi32(0); - __m256i D10 = _mm256_set1_epi32(0); - __m256i D11 = _mm256_set1_epi32(0); - __m256i D12 = _mm256_set1_epi32(0); - __m256i D13 = _mm256_set1_epi32(0); for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * weight_step_Y; - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z 
= src_x + sz * GEMMINT8_AVX2_L * realDst; LOAD_INT4_TO_INT8; - auto W0 = _mm256_cvtepi8_epi16(w_0); - auto W1 = _mm256_cvtepi8_epi16(w_1); - - auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); - auto s2 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 2)); - auto s3 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 3)); - auto S0 = _mm256_cvtepu8_epi16(s0); - auto S1 = _mm256_cvtepu8_epi16(s1); - auto S2 = _mm256_cvtepu8_epi16(s2); - auto S3 = _mm256_cvtepu8_epi16(s3); + auto w0 = _mm256_set_m128i(w_1, w_0); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 2)); + auto s3 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 3)); - COMPUTE(0, 0); - COMPUTE(1, 0); - COMPUTE(0, 1); - COMPUTE(1, 1); - COMPUTE(0, 2); - COMPUTE(1, 2); - COMPUTE(0, 3); - COMPUTE(1, 3); + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s2, w0), oneValue)); + D03 = _mm256_add_epi32(D03, _mm256_madd_epi16(_mm256_maddubs_epi16(s3, w0), oneValue)); } - auto D0 = NORMAL_HADD(D00, D10); - auto D1 = NORMAL_HADD(D01, D11); - auto D2 = NORMAL_HADD(D02, D12); - auto D3 = NORMAL_HADD(D03, D13); + auto D0 = D00; + auto D1 = D01; + auto D2 = D02; + auto D3 = D03; auto scaleValue = _mm256_loadu_ps(scale_dz); auto weightBiasValue = _mm256_loadu_ps((float*)weightBias_dz); @@ -251,35 +238,23 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const __m256i D01 = _mm256_set1_epi32(0); __m256i D02 = _mm256_set1_epi32(0); - __m256i D10 = _mm256_set1_epi32(0); - __m256i D11 = _mm256_set1_epi32(0); - __m256i D12 = _mm256_set1_epi32(0); - for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * weight_step_Y; - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; LOAD_INT4_TO_INT8; - auto W0 = _mm256_cvtepi8_epi16(w_0); - auto W1 = _mm256_cvtepi8_epi16(w_1); - - auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); - auto s2 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 2)); - auto S0 = _mm256_cvtepu8_epi16(s0); - auto S1 = _mm256_cvtepu8_epi16(s1); - auto S2 = _mm256_cvtepu8_epi16(s2); + auto w0 = _mm256_set_m128i(w_1, w_0); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); + auto s2 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 2)); - COMPUTE(0, 0); - COMPUTE(1, 0); - COMPUTE(0, 1); - COMPUTE(1, 1); - COMPUTE(0, 2); - COMPUTE(1, 2); + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); + D02 = _mm256_add_epi32(D02, _mm256_madd_epi16(_mm256_maddubs_epi16(s2, w0), oneValue)); } - auto D0 = NORMAL_HADD(D00, D10); - auto D1 = NORMAL_HADD(D01, D11); - auto D2 = NORMAL_HADD(D02, D12); + auto D0 = D00; + auto D1 = D01; + auto D2 = D02; auto scaleValue = _mm256_loadu_ps(scale_dz); auto weightBiasValue = _mm256_loadu_ps((float*)weightBias_dz); @@ -358,23 
+333,17 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * weight_step_Y; - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; LOAD_INT4_TO_INT8; - auto W0 = _mm256_cvtepi8_epi16(w_0); - auto W1 = _mm256_cvtepi8_epi16(w_1); - - auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); - auto s1 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 1)); - auto S0 = _mm256_cvtepu8_epi16(s0); - auto S1 = _mm256_cvtepu8_epi16(s1); + auto w0 = _mm256_set_m128i(w_1, w_0); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); + auto s1 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 1)); - COMPUTE(0, 0); - COMPUTE(1, 0); - COMPUTE(0, 1); - COMPUTE(1, 1); + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); + D01 = _mm256_add_epi32(D01, _mm256_madd_epi16(_mm256_maddubs_epi16(s1, w0), oneValue)); } - auto D0 = NORMAL_HADD(D00, D10); - auto D1 = NORMAL_HADD(D01, D11); + auto D0 = D00; + auto D1 = D01; auto scaleValue = _mm256_loadu_ps(scale_dz); auto weightBiasValue = _mm256_loadu_ps((float*)weightBias_dz); @@ -438,18 +407,14 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * weight_step_Y; - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; LOAD_INT4_TO_INT8; - auto W0 = _mm256_cvtepi8_epi16(w_0); - auto W1 = _mm256_cvtepi8_epi16(w_1); - - auto s0 = _mm_castps_si128(_mm_broadcast_ss((float*)src_z + 0)); - auto S0 = _mm256_cvtepu8_epi16(s0); + auto w0 = _mm256_set_m128i(w_1, w_0); + auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); - COMPUTE(0, 0); - COMPUTE(1, 0); + D00 = _mm256_add_epi32(D00, _mm256_madd_epi16(_mm256_maddubs_epi16(s0, w0), oneValue)); } - auto D0 = NORMAL_HADD(D00, D10); + auto D0 = D00; auto scaleValue = _mm256_loadu_ps(scale_dz); auto weightBiasValue = _mm256_loadu_ps((float*)weightBias_dz); @@ -569,7 +534,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = mm_loadu_si128(weight_sz + 16 * 0); auto w1 = mm_loadu_si128(weight_sz + 16 * 1); auto W0 = _mm256_cvtepi8_epi16(w0); @@ -697,7 +662,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = mm_loadu_si128(weight_sz + 16 * 0); auto w1 = mm_loadu_si128(weight_sz + 16 * 1); auto W0 = _mm256_cvtepi8_epi16(w0); @@ -803,7 +768,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = 
mm_loadu_si128(weight_sz + 16 * 0); auto w1 = mm_loadu_si128(weight_sz + 16 * 1); auto W0 = _mm256_cvtepi8_epi16(w0); @@ -888,7 +853,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = mm_loadu_si128(weight_sz + 16 * 0); auto w1 = mm_loadu_si128(weight_sz + 16 * 1); auto W0 = _mm256_cvtepi8_epi16(w0); @@ -994,7 +959,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); @@ -1080,7 +1045,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); @@ -1152,7 +1117,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); @@ -1206,7 +1171,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit_Fast(int8_t* dst, const int8_t* src, for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + sz * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); - const auto src_z = src_x + sz * GEMMINT8_AVX2_L * GEMMINT8_AVX2_E; + const auto src_z = src_x + sz * GEMMINT8_AVX2_L * realDst; auto w0 = _mm256_loadu_si256((__m256i*)weight_sz); auto s0 = _mm256_castps_si256(_mm256_broadcast_ss((float*)src_z + 0)); @@ -1353,12 +1318,18 @@ void _AVX_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl } } -void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, ssize_t zeroPoint) { +void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, const float* zeroPoint, ssize_t quanParamVec) { auto sizeC4 = sizeQuad / 4; auto sizeRemain = sizeQuad % 4; auto zero = _mm256_set1_epi32(0); - auto scaleValue = _mm256_loadu_ps(scale); - auto zeroPointValue = _mm256_set1_epi32(zeroPoint + 128); + auto scaleValue = _mm256_set1_ps(scale[0]); + auto zeroPointValue = _mm256_set1_ps(zeroPoint[0] + 128.f); + if (quanParamVec & 1) { + scaleValue = _mm256_loadu_ps(scale); + } + if (quanParamVec >> 1) { + zeroPointValue = _mm256_add_ps(_mm256_loadu_ps(zeroPoint), _mm256_set1_ps(128.f)); + } for (int i = 0; i < sizeC4; ++i) { auto s = _mm256_castps_si256(_mm256_loadu_ps((const float*)(src))); auto s0_16 = _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(s, zero), 
0XD8); @@ -1367,14 +1338,14 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm256_unpacklo_epi16(s1_16, zero); auto s2_32 = _mm256_unpackhi_epi16(s0_16, zero); auto s3_32 = _mm256_unpackhi_epi16(s1_16, zero); - s0_32 = _mm256_sub_epi32(s0_32, zeroPointValue); - s1_32 = _mm256_sub_epi32(s1_32, zeroPointValue); - s2_32 = _mm256_sub_epi32(s2_32, zeroPointValue); - s3_32 = _mm256_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm256_cvtepi32_ps(s0_32); auto s1_f = _mm256_cvtepi32_ps(s1_32); auto s2_f = _mm256_cvtepi32_ps(s2_32); auto s3_f = _mm256_cvtepi32_ps(s3_32); + s0_f = _mm256_sub_ps(s0_f, zeroPointValue); + s1_f = _mm256_sub_ps(s1_f, zeroPointValue); + s2_f = _mm256_sub_ps(s2_f, zeroPointValue); + s3_f = _mm256_sub_ps(s3_f, zeroPointValue); _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(s0_f, scaleValue)); _mm256_storeu_ps(dst + 8 * 1, _mm256_mul_ps(s1_f, scaleValue)); _mm256_storeu_ps(dst + 8 * 2, _mm256_mul_ps(s2_f, scaleValue)); @@ -1392,14 +1363,14 @@ void _AVX_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm256_unpacklo_epi16(s1_16, zero); auto s2_32 = _mm256_unpackhi_epi16(s0_16, zero); auto s3_32 = _mm256_unpackhi_epi16(s1_16, zero); - s0_32 = _mm256_sub_epi32(s0_32, zeroPointValue); - s1_32 = _mm256_sub_epi32(s1_32, zeroPointValue); - s2_32 = _mm256_sub_epi32(s2_32, zeroPointValue); - s3_32 = _mm256_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm256_cvtepi32_ps(s0_32); auto s1_f = _mm256_cvtepi32_ps(s1_32); auto s2_f = _mm256_cvtepi32_ps(s2_32); auto s3_f = _mm256_cvtepi32_ps(s3_32); + s0_f = _mm256_sub_ps(s0_f, zeroPointValue); + s1_f = _mm256_sub_ps(s1_f, zeroPointValue); + s2_f = _mm256_sub_ps(s2_f, zeroPointValue); + s3_f = _mm256_sub_ps(s3_f, zeroPointValue); switch (sizeRemain) { case 3: _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(s0_f, scaleValue)); @@ -1436,23 +1407,37 @@ static void _AVXMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGr const int EP = GEMMINT8_AVX2_E; int eDest = EP; const int LP = GEMMINT8_AVX2_L; + int realDstCount = info[4]; for (int n=0; n 0) { + int jobsE = realDstCount - eOffset - e; + if (jobsE == 0 || (jobsE < (realDstCount % EP))) { + lastBag = true; + } + } auto source = (int32_t*)sourceGroup[n]; auto dest = (int32_t*)(destOrigin + eC * info[2] + eR * LP + lOffset * EP); //printf("e=%d, l=%d, eOffset=%d, lOffset=%d, eDest=%d\n", e, l, eOffset, lOffset, eDest); l = l / 4; // Use float instead of int8 * 4 - int eS = eDest - eR; + if (lastBag && e + eR < EP) { + int elast = ALIMAX(eR + e, realDstCount % EP); + dest = (int32_t*)(destOrigin + lOffset * elast + eC * info[2] + eR * LP); + } + int offsetLC = lOffset / 4; for (int x = 0; x < l; ++x) { int eRemain = e; auto xR = x % PUNIT; auto xC = x / PUNIT; - auto d = dest + x * eDest; + auto d = dest; auto s = source + xC * eReal * FLOATPACK + xR; if (eR > 0) { int eStep = ALIMIN(eRemain, eS); @@ -1460,7 +1445,13 @@ static void _AVXMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGr d[yi] = s[yi * xS4]; } eRemain-=eStep; - d += (eOutsideStride - eR); + if (!lastBag ||eRemain >= EP) { + d += (eOutsideStride - eR); + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d += (eOutsideStride4LastBag - eR + offsetLC * eFill); + } s += eS * xS4; } while (eRemain > 0) { @@ -1469,9 +1460,22 @@ static void _AVXMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** sourceGr d[yi] = s[yi * 
xS4]; } eRemain-=eStep; - d+= eOutsideStride; + if (!lastBag || eRemain >= EP) { + d+= eOutsideStride; + } else { + int eFill = ALIMAX(eRemain, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * 4 * offsetLC / sizeof(float)); + d+= (eOutsideStride4LastBag + offsetLC * eFill); + } s+= eStep * xS4; } + if (lastBag && e + eR < EP) { + int efill = ALIMAX(e + eR, realDstCount % EP); + dest += efill; + } else { + dest += eDest; + } + offsetLC++; } } } diff --git a/source/backend/cpu/x86_x64/avx512/FunctionSummary.hpp b/source/backend/cpu/x86_x64/avx512/FunctionSummary.hpp index 9c6968840..d11cf0c37 100644 --- a/source/backend/cpu/x86_x64/avx512/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/avx512/FunctionSummary.hpp @@ -6,11 +6,8 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#if defined(_MSC_VER) -#include -#else -#include -#endif +#include "core/SimdHeader.h" + #include #include "backend/cpu/compute/Int8FunctionsOpt.h" #include "backend/cpu/compute/CommonOptFunction.h" diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp index fd80b6dc8..c9f9168f1 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8.cpp @@ -34,46 +34,68 @@ static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** const int EP = GEMMINT8_AVX512_E; int eDest = EP; const int LP = 4; + int realDstCount = info[4]; for (int n=0; n 0) { + int jobsE = realDstCount - eOffset - e; + if (jobsE == 0 || (jobsE < (realDstCount % EP))) { + willReachBag = true; + } + } auto source = (float*)sourceGroup[n]; auto dest = (float*)(destOrigin + eC * info[2] + eR * LP + lOffset * EP); l = l / 4; // Use float instead of int8 * 4 + if (willReachBag && e + eR < EP) { // The first bag to fill has e_filled 0) { int eStep = ALIMIN(e, eS); for (int y = 0; y < eStep; ++y) { for (int x = 0; x < l; ++x) { auto xR = x % 4; auto xC = x / 4; - dest[x * eDest + y] = source[xC * eReal * 4 + y * xS4 + xR]; + dest[x * elast + y] = source[xC * eReal * 4 + y * xS4 + xR]; } } e-= eStep; - dest += (eOutsideStride - eR); + if (!willReachBag || e >= EP) { + dest += (eOutsideStride - eR); + } else { // The bag to fill: e_filled < EP + int e_tofill = ALIMAX(e, realDstCount % EP); // maybe padding>0 + eOutsideStride4LastBag = eOutsideStride - (EP * offsetLC_); + dest += (eOutsideStride4LastBag - eR + offsetLC_ * e_tofill); + } source += eStep * xS4; } if (e <=0 ) { continue; } - const int pack = GEMMINT8_AVX512_E; - auto ePack = e / pack; - auto lC4 = l / 4; + + auto ePack = e / EP; + auto lC4 = l / LP; auto lDiv = UP_DIV(l, 4); - auto eRemain = ePack * pack; + auto eRemain = ePack * EP; auto lRemain = lC4 * 4; auto lRes = l - lRemain; for (int y = 0; y < ePack; ++y) { auto dstY = dest + y * eOutsideStride; - auto srcY = source + y * pack * xS4; + auto srcY = source + y * EP * xS4; for (int x = 0; x < lC4; ++x) { auto srcX = srcY + x * 4 * eReal; - auto dstX = dstY + x * pack * 4; + auto dstX = dstY + x * EP * 4; auto s00 = _mm_loadu_ps(srcX + 0 * xS4); auto s01 = _mm_loadu_ps(srcX + 1 * xS4); auto s02 = _mm_loadu_ps(srcX + 2 * xS4); @@ -92,8 +114,9 @@ static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** if (lRes == 0) { continue; } - auto srcX = srcY + lC4 * 4 * eReal; - auto dstX = dstY + lC4 * eDest * 4; + auto srcX = srcY + lC4 * LP * eReal; + auto dstX = dstY + lC4 * EP * LP; + auto s00 = _mm_loadu_ps(srcX + 0 * xS4); auto s01 = _mm_loadu_ps(srcX + 1 * 
xS4); auto s02 = _mm_loadu_ps(srcX + 2 * xS4); @@ -113,14 +136,21 @@ static void _AVX512BasicMNNPackC4ForMatMul_A(int8_t* destOrigin, int8_t const** } // Down { - auto eLast = e - eRemain; + auto eLast = e - eRemain; // e - ePack * EP auto lastDest = dest + ePack * eOutsideStride; + int eFill = EP; + if (eLast > 0 && willReachBag) { + eFill = ALIMAX((realDstCount % EP), eLast); + if (ePack > 0) { + lastDest = dest + ePack * eOutsideStride - offsetLC_ * (EP - eFill); + } + } for (int y = eRemain; y < e; ++y) { auto yR = y - eRemain; for (int x = 0; x < l; ++x) { auto xR = x % 4; auto xC = x / 4; - lastDest[x * eDest + yR] = source[xC * eReal * 4 + y * 4 * xStride + xR]; + lastDest[x * eFill + yR] = source[xC * eReal * 4 + y * 4 * xStride + xR]; } } } @@ -205,7 +235,8 @@ void _AVX512_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const auto zero = _mm256_set1_epi32(0); auto minValue = _mm256_set1_ps(minV); auto maxValue = _mm256_set1_ps(maxV); - auto zeroPointValue = _mm256_set1_ps(zeroPoint[0]); + auto zeroPointValue0 = _mm256_set1_ps(zeroPoint[0]); + auto zeroPointValue1 = zeroPointValue0; auto offset = _mm256_set1_epi32(128); auto plus = _mm256_set1_ps(0.5f); auto minus = _mm256_set1_ps(-0.5f); @@ -216,7 +247,8 @@ void _AVX512_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const scaleValue1 = _mm256_loadu_ps(scalep + 8); } if (quanParamVec >> 1) { - zeroPointValue = _mm256_loadu_ps(zeroPoint); + zeroPointValue0 = _mm256_loadu_ps(zeroPoint); + zeroPointValue1 = _mm256_loadu_ps(zeroPoint + 8); } for (int i = 0; i < sizeQuad; ++i) { @@ -224,8 +256,8 @@ void _AVX512_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const auto f1 = _mm256_loadu_ps(src + PACK_UNIT * i + 8); f0 = _mm256_mul_ps(f0, scaleValue0); f1 = _mm256_mul_ps(f1, scaleValue1); - f0 = _mm256_add_ps(f0, zeroPointValue); - f1 = _mm256_add_ps(f1, zeroPointValue); + f0 = _mm256_add_ps(f0, zeroPointValue0); + f1 = _mm256_add_ps(f1, zeroPointValue1); f0 = _mm256_min_ps(f0, maxValue); f1 = _mm256_min_ps(f1, maxValue); f0 = _mm256_max_ps(f0, minValue); @@ -259,13 +291,24 @@ void _AVX512_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const } } -void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, ssize_t zeroPoint) { +void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, const float* zeroPoint, ssize_t quanParamVec) { auto sizeC4 = sizeQuad / 2; auto sizeRemain = sizeQuad % 2; auto zero = _mm256_set1_epi32(0); - auto scaleValue0 = _mm256_loadu_ps(scale); - auto scaleValue1 = _mm256_loadu_ps(scale + 8); - auto zeroPointValue = _mm256_set1_epi32(zeroPoint + 128); + + auto scaleValue0 = _mm256_set1_ps(scale[0]); + auto scaleValue1 = scaleValue0; + if (quanParamVec & 1) { + scaleValue0 = _mm256_loadu_ps(scale); + scaleValue1 = _mm256_loadu_ps(scale + 8); + } + auto zeroPointValue0 = _mm256_set1_ps(zeroPoint[0]) + _mm256_set1_ps(128.f); + auto zeroPointValue1 = zeroPointValue0; + if (quanParamVec >> 1) { + zeroPointValue0 = _mm256_loadu_ps(zeroPoint) + _mm256_set1_ps(128.f); + zeroPointValue1 = _mm256_loadu_ps(zeroPoint + 8) + _mm256_set1_ps(128.f); + } + for (int i = 0; i < sizeC4; ++i) { auto s = _mm256_castps_si256(_mm256_loadu_ps((const float*)(src))); auto s0_16 = _mm256_permute4x64_epi64(_mm256_unpacklo_epi8(s, zero), 0XD8); @@ -274,14 +317,14 @@ void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* sca auto s1_32 = _mm256_unpacklo_epi16(s1_16, zero); auto s2_32 
= _mm256_unpackhi_epi16(s0_16, zero); auto s3_32 = _mm256_unpackhi_epi16(s1_16, zero); - s0_32 = _mm256_sub_epi32(s0_32, zeroPointValue); - s1_32 = _mm256_sub_epi32(s1_32, zeroPointValue); - s2_32 = _mm256_sub_epi32(s2_32, zeroPointValue); - s3_32 = _mm256_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm256_cvtepi32_ps(s0_32); auto s1_f = _mm256_cvtepi32_ps(s1_32); auto s2_f = _mm256_cvtepi32_ps(s2_32); auto s3_f = _mm256_cvtepi32_ps(s3_32); + s0_f = _mm256_sub_ps(s0_f, zeroPointValue0); + s1_f = _mm256_sub_ps(s1_f, zeroPointValue1); + s2_f = _mm256_sub_ps(s2_f, zeroPointValue0); + s3_f = _mm256_sub_ps(s3_f, zeroPointValue1); _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(s0_f, scaleValue0)); _mm256_storeu_ps(dst + 8 * 1, _mm256_mul_ps(s1_f, scaleValue1)); _mm256_storeu_ps(dst + 8 * 2, _mm256_mul_ps(s2_f, scaleValue0)); @@ -297,10 +340,10 @@ void _AVX512_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* sca auto s1_32 = _mm256_unpacklo_epi16(s1_16, zero); auto s2_32 = _mm256_unpackhi_epi16(s0_16, zero); auto s3_32 = _mm256_unpackhi_epi16(s1_16, zero); - s0_32 = _mm256_sub_epi32(s0_32, zeroPointValue); - s1_32 = _mm256_sub_epi32(s1_32, zeroPointValue); auto s0_f = _mm256_cvtepi32_ps(s0_32); auto s1_f = _mm256_cvtepi32_ps(s1_32); + s0_f = _mm256_sub_ps(s0_f, zeroPointValue0); + s1_f = _mm256_sub_ps(s1_f, zeroPointValue1); _mm256_storeu_ps(dst + 8 * 0, _mm256_mul_ps(s0_f, scaleValue0)); _mm256_storeu_ps(dst + 8 * 1, _mm256_mul_ps(s1_f, scaleValue1)); } diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp index 5d73ffc50..f9ae80d28 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp @@ -199,7 +199,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -474,7 +474,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -592,7 +592,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -829,7 +829,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < 
src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -934,7 +934,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -1132,7 +1132,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -1218,7 +1218,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -1380,7 +1380,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -1538,7 +1538,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight @@ -1794,7 +1794,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte // 256xint4_t->256xint8_t auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t @@ -1908,7 +1908,7 @@ 
void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -2130,7 +2130,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t @@ -2231,7 +2231,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -2418,7 +2418,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t @@ -2501,7 +2501,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -2656,7 +2656,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t diff --git a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl index 44e9bc36f..203b58b40 100644 --- a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl +++ b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl @@ -191,7 +191,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; 
sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -466,7 +466,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -584,7 +584,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -821,7 +821,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -926,7 +926,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = _mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -1124,7 +1124,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -1210,7 +1210,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto w1 = 
_mm512_loadu_si512(weight_sz + 1 * PACK_UNIT * GEMMINT8_AVX512_L); auto w2 = _mm512_loadu_si512(weight_sz + 2 * PACK_UNIT * GEMMINT8_AVX512_L); @@ -1372,7 +1372,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0 = _mm512_loadu_si512(weight_sz); auto s0 = AVX512_BROADCAST_INT32(src_z + 0); @@ -1532,7 +1532,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -1787,7 +1787,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t @@ -1900,7 +1900,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -2122,7 +2122,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t @@ -2223,7 +2223,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -2410,7 +2410,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * 
GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte // 256xint4_t->256xint8_t auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t @@ -2494,7 +2494,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); // int4->int8: total count=4*64(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) // Load 4*64 int4 weight auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte @@ -2649,7 +2649,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = (const float*)(src_x + sz * GEMMINT8_AVX512_E * GEMMINT8_AVX512_L); + const auto src_z = (const float*)(src_x + sz * realDst * GEMMINT8_AVX512_L); auto w0_int4_64 = _mm512_loadu_si512(weight_sz); // 128xint4_t=64 byte auto w0 = _mm512_and_si512(mask, _mm512_srli_epi16(w0_int4_64, 4)); // 64xint8_t diff --git a/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp b/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp index 9d911dbb1..43d890fc8 100644 --- a/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/avxfma/FunctionSummary.hpp @@ -6,11 +6,7 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#if defined(_MSC_VER) -#include -#else -#include -#endif +#include "core/SimdHeader.h" #include #include diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index a132b48b9..5f8653066 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -6,11 +6,8 @@ // Copyright © 2018, Alibaba Group Holding Limited // -#if defined(_MSC_VER) -#include -#else -#include -#endif +#include "core/SimdHeader.h" + #include #include #include "backend/cpu/compute/Int8FunctionsOpt.h" @@ -75,7 +72,7 @@ void _SSE_MNNExpC8(float* dest, const float* source, float* offset, const float* void _SSE_MNNPackForMatMul_B(float* dest, const float* source, size_t h, size_t l, bool transpose); void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const float* scalep, ssize_t minValue, ssize_t maxValue, const float* zeroPoint, ssize_t quanParamVec); -void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, ssize_t zeroPoint); +void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t size, const float* zeroPoint, ssize_t quanParamVec); void _SSE_MNNLineDepthWiseInt8AddBiasScaleUnit(int8_t* dst, const int8_t* src, const int8_t* weight, const QuanPostTreatParameters* parameters, size_t width, size_t src_w_step, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, int8_t* idxOrder=nullptr); void _SSE_MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); @@ -106,4 +103,4 @@ void _SSE_MNNSampleBilinear(const unsigned char* source, unsigned char* dest, MN size_t iw, size_t ih, size_t yStride, size_t bpp); // Dynamic Quant -void _SSE_MNNComputeScaleZeroScalar(float* source, float* minVal, float* maxVal, size_t size); \ No 
newline at end of file +void _SSE_MNNComputeScaleZeroScalar(float* source, float* minVal, float* maxVal, size_t size); diff --git a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp index d20f3dc23..f2218b80e 100644 --- a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp @@ -23,6 +23,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons __m128 plus = _mm_set1_ps(0.5f); __m128 minus = _mm_set1_ps(-0.5f); __m128 fp32min, fp32max; + __m128i s0, s1, s2, s3; + __m128i d00, d01, d02, d03, d10, d11, d12, d13, d20, d21, d22, d23, d30, d31, d32, d33; if (0 == post->useInt8 && post->fp32minmax) { fp32min = _mm_set1_ps((post->fp32minmax)[0]); fp32max = _mm_set1_ps((post->fp32minmax)[1]); @@ -103,64 +105,61 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT) * sz; - const auto src_z = src_x + sz * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT; + const auto src_z = src_x + sz * realDst * GEMM_INT8_SRC_UNIT; auto w0 = _mm_loadu_si128((__m128i*)(weight_sz + GEMM_INT8_SRC_UNIT * 0)); auto w1 = _mm_loadu_si128((__m128i*)(weight_sz + GEMM_INT8_SRC_UNIT * 1)); auto w2 = _mm_loadu_si128((__m128i*)(weight_sz + GEMM_INT8_SRC_UNIT * 2)); auto w3 = _mm_loadu_si128((__m128i*)(weight_sz + GEMM_INT8_SRC_UNIT * 3)); - auto s0 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 0)); - auto s1 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 1)); - auto s2 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 2)); - auto s3 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 3)); - -//#define COMPUTE(i, j)\ -//auto d##i##j = _mm_maddubs_epi16(s##i, w##j);\ -//d##i##j = _mm_madd_epi16(d##i##j, oneValue);\ - #define COMPUTE(i, j)\ auto W##i##j##0 = _mm_srai_epi16(_mm_unpacklo_epi8(zero, w##j), 8);\ auto W##i##j##1 = _mm_srai_epi16(_mm_unpackhi_epi8(zero, w##j), 8);\ auto S##i##j##0 = _mm_unpacklo_epi8(s##i, zero);\ auto S##i##j##1 = _mm_unpackhi_epi8(s##i, zero);\ -auto d##i##j = _mm_add_epi32(_mm_madd_epi16(S##i##j##0, W##i##j##0), _mm_madd_epi16(S##i##j##1, W##i##j##1));\ +d##i##j = _mm_add_epi32(_mm_madd_epi16(S##i##j##0, W##i##j##0), _mm_madd_epi16(S##i##j##1, W##i##j##1));\ + s0 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 0)); COMPUTE(0, 0); COMPUTE(0, 1); COMPUTE(0, 2); COMPUTE(0, 3); - COMPUTE(1, 0); - COMPUTE(1, 1); - COMPUTE(1, 2); - COMPUTE(1, 3); - COMPUTE(2, 0); - COMPUTE(2, 1); - COMPUTE(2, 2); - COMPUTE(2, 3); - COMPUTE(3, 0); - COMPUTE(3, 1); - COMPUTE(3, 2); - COMPUTE(3, 3); - d0 = _mm_add_epi32(d0, d00); d1 = _mm_add_epi32(d1, d01); d2 = _mm_add_epi32(d2, d02); d3 = _mm_add_epi32(d3, d03); - - e0 = _mm_add_epi32(e0, d10); - e1 = _mm_add_epi32(e1, d11); - e2 = _mm_add_epi32(e2, d12); - e3 = _mm_add_epi32(e3, d13); - - D0 = _mm_add_epi32(D0, d20); - D1 = _mm_add_epi32(D1, d21); - D2 = _mm_add_epi32(D2, d22); - D3 = _mm_add_epi32(D3, d23); - - E0 = _mm_add_epi32(E0, d30); - E1 = _mm_add_epi32(E1, d31); - E2 = _mm_add_epi32(E2, d32); - E3 = _mm_add_epi32(E3, d33); + if (realDst > 1) { + s1 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 1)); + COMPUTE(1, 0); + COMPUTE(1, 1); + COMPUTE(1, 2); + COMPUTE(1, 3); + e0 = _mm_add_epi32(e0, d10); + e1 = _mm_add_epi32(e1, d11); + e2 = _mm_add_epi32(e2, d12); + e3 = _mm_add_epi32(e3, d13); + } + if (realDst > 2) { + s2 = _mm_loadu_si128((__m128i*)(src_z + 
GEMM_INT8_SRC_UNIT * 2)); + COMPUTE(2, 0); + COMPUTE(2, 1); + COMPUTE(2, 2); + COMPUTE(2, 3); + D0 = _mm_add_epi32(D0, d20); + D1 = _mm_add_epi32(D1, d21); + D2 = _mm_add_epi32(D2, d22); + D3 = _mm_add_epi32(D3, d23); + } + if (realDst > 3) { + s3 = _mm_loadu_si128((__m128i*)(src_z + GEMM_INT8_SRC_UNIT * 3)); + COMPUTE(3, 0); + COMPUTE(3, 1); + COMPUTE(3, 2); + COMPUTE(3, 3); + E0 = _mm_add_epi32(E0, d30); + E1 = _mm_add_epi32(E1, d31); + E2 = _mm_add_epi32(E2, d32); + E3 = _mm_add_epi32(E3, d33); + } } d0 = _mm_hadd_epi32(d0, d1); d1 = _mm_hadd_epi32(d2, d3); @@ -399,7 +398,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const for (int sz = 0; sz < src_depth_quad; ++sz) { const auto weight_sz = weight_dz + weight_step_Y * sz; - const auto src_z = src_x + sz * GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT; + const auto src_z = src_x + sz * realDst * GEMM_INT8_SRC_UNIT; LOAD_INT4_TO_INT8; @@ -605,12 +604,18 @@ void _SSE_MNNFloat2Int8(const float* src, int8_t* dst, size_t sizeQuad, const fl } } -void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, ssize_t zeroPoint) { +void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, size_t sizeQuad, const float* zeroPoint, ssize_t quanParamVec) { auto sizeC4 = sizeQuad / 4; auto sizeRemain = sizeQuad % 4; __m128i zero = _mm_set1_epi32(0); - __m128 scaleValue = _mm_loadu_ps(scale); - __m128i zeroPointValue = _mm_set1_epi32(zeroPoint + 128); + auto scaleValue = _mm_set1_ps(scale[0]); + auto zeroPointValue = _mm_set1_ps(zeroPoint[0] + 128.f); + if (quanParamVec & 1) { + scaleValue = _mm_loadu_ps(scale); + } + if (quanParamVec >> 1) { + zeroPointValue = _mm_add_ps(_mm_loadu_ps(zeroPoint), _mm_set1_ps(128.f)); + } for (int i = 0; i < sizeC4; ++i) { auto s = _mm_castps_si128(_mm_loadu_ps((const float*)(src))); auto s0_16 = _mm_unpacklo_epi8(s, zero); @@ -619,14 +624,14 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm_unpackhi_epi16(s0_16, zero); auto s2_32 = _mm_unpacklo_epi16(s1_16, zero); auto s3_32 = _mm_unpackhi_epi16(s1_16, zero); - s0_32 = _mm_sub_epi32(s0_32, zeroPointValue); - s1_32 = _mm_sub_epi32(s1_32, zeroPointValue); - s2_32 = _mm_sub_epi32(s2_32, zeroPointValue); - s3_32 = _mm_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm_cvtepi32_ps(s0_32); auto s1_f = _mm_cvtepi32_ps(s1_32); auto s2_f = _mm_cvtepi32_ps(s2_32); auto s3_f = _mm_cvtepi32_ps(s3_32); + s0_f = _mm_sub_ps(s0_f, zeroPointValue); + s1_f = _mm_sub_ps(s1_f, zeroPointValue); + s2_f = _mm_sub_ps(s2_f, zeroPointValue); + s3_f = _mm_sub_ps(s3_f, zeroPointValue); _mm_storeu_ps(dst + 4 * 0, _mm_mul_ps(s0_f, scaleValue)); _mm_storeu_ps(dst + 4 * 1, _mm_mul_ps(s1_f, scaleValue)); _mm_storeu_ps(dst + 4 * 2, _mm_mul_ps(s2_f, scaleValue)); @@ -644,14 +649,14 @@ void _SSE_MNNInt8ScaleToFloat(float* dst, const int8_t* src, const float* scale, auto s1_32 = _mm_unpackhi_epi16(s0_16, zero); auto s2_32 = _mm_unpacklo_epi16(s1_16, zero); auto s3_32 = _mm_unpackhi_epi16(s1_16, zero); - s0_32 = _mm_sub_epi32(s0_32, zeroPointValue); - s1_32 = _mm_sub_epi32(s1_32, zeroPointValue); - s2_32 = _mm_sub_epi32(s2_32, zeroPointValue); - s3_32 = _mm_sub_epi32(s3_32, zeroPointValue); auto s0_f = _mm_cvtepi32_ps(s0_32); auto s1_f = _mm_cvtepi32_ps(s1_32); auto s2_f = _mm_cvtepi32_ps(s2_32); auto s3_f = _mm_cvtepi32_ps(s3_32); + s0_f = _mm_sub_ps(s0_f, zeroPointValue); + s1_f = _mm_sub_ps(s1_f, zeroPointValue); + s2_f = _mm_sub_ps(s2_f, zeroPointValue); + s3_f = 
_mm_sub_ps(s3_f, zeroPointValue); switch (sizeRemain) { case 3: _mm_storeu_ps(dst + 4 * 0, _mm_mul_ps(s0_f, scaleValue)); diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt index 3c95095df..646dd405f 100644 --- a/source/backend/cuda/CMakeLists.txt +++ b/source/backend/cuda/CMakeLists.txt @@ -25,8 +25,11 @@ if(CUDA_FOUND) list(LENGTH CUDA_ARCH_FLAGS_readable_code arch_count) # Current Supported Arch List + IF (MSVC) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -allow-unsupported-compiler") + ENDIF() IF (${arch_count} EQUAL 1) - set(support_archs 60 61 62 70 72 75 80 86) + set(support_archs 60 61 62 70 72 75 80 86 89) list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index) IF (${list_index} EQUAL -1) message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!") @@ -55,6 +58,11 @@ if(CUDA_FOUND) add_definitions(-DMNN_CUDA_ENABLE_SM80 -DMNN_CUDA_ENABLE_SM86) ENDIF() + IF ((CUDA_VERSION VERSION_GREATER "12.2") OR (CUDA_VERSION VERSION_EQUAL "12.2")) + set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_89,code=sm_89") + add_definitions(-DMNN_CUDA_ENABLE_SM89 -DMNN_CUDA_ENABLE_SM89) + ENDIF() + # Limit minimum cuda version for each archs IF (${arch_count} EQUAL 1) diff --git a/source/backend/metal/AllShader.cpp b/source/backend/metal/AllShader.cpp index 16c50153f..f820b4722 100644 --- a/source/backend/metal/AllShader.cpp +++ b/source/backend/metal/AllShader.cpp @@ -574,6 +574,38 @@ const char* shader_MetalSoftmax_metal = "static inline float4 softmax_filter(float4 V,int z,int limit) {\n" " return select(0,V,z*4+int4(0,1,2,3)= s.inside_size || (int)gid.y >= s.outside_size) return;\n" +" \n" +" auto axis_off=gid.y*s.axis_length*s.inside_size+gid.x;\n" +" auto axis_in=in+axis_off;\n" +" auto axis_out=out+axis_off;\n" +" \n" +" // get max\n" +" float max1=-INFINITY;\n" +" for (int i=tiisg; i= cst.inside || (int)gid.y >= cst.outside) {\n" +" return;\n" +" }\n" +" auto in_data=in+gid.y*cst.inside;\n" +" auto out_data=out+gid.y*cst.inside;\n" +" float mean;\n" +" float sum=0.0f;\n" +" float square_sum=0.0f;\n" +" \n" +" for(int i=tiisg; i= cst.inside/4 || (int)gid.y >= cst.outside) {\n" +" return;\n" +" }\n" +" auto in_data=in+gid.y*cst.inside/4;\n" +" auto out_data=out+gid.y*cst.inside/4;\n" +" float mean;\n" +" float sum=0.0f;\n" +" float square_sum=0.0f;\n" +" \n" +" for(int i=tiisg; i= cst.inside || (int)gid.y >= cst.outside) {\n" +" return;\n" +" }\n" +" auto in_data=in+gid.y*cst.inside;\n" +" auto out_data=out+gid.y*cst.inside;\n" +" float square_sum=0.0f;\n" +" \n" +" for(int i=tiisg; i= cst.inside/4 || (int)gid.y >= cst.outside) {\n" +" return;\n" +" }\n" +" int in_idx=gid.x;\n" +" int out_idx=gid.y;\n" " auto in_data=in+out_idx*cst.inside/4;\n" " auto out_data=out+out_idx*cst.inside/4;\n" " float square_sum=0.0f;\n" @@ -1616,7 +1770,7 @@ const char* shader_MetalConvolution1x1_metal = " //if (computeSize>2) {xy_out[2]=activate(M4(result2),cst.activation); }\n" " //if (computeSize>3) {xy_out[3]=activate(M4(result3),cst.activation); }\n" "}\n" -"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n" +"kernel void conv1x1_gemv_g8_w4(const device M4 *in [[buffer(0)]],\n" " device M4 *out [[buffer(1)]],\n" " constant conv1x1_constants& cst [[buffer(2)]],\n" " const device MNN::uchar4x2 *wt [[buffer(3)]],\n" @@ -1625,7 +1779,12 @@ const char* shader_MetalConvolution1x1_metal = " uint3 gid[[threadgroup_position_in_grid]],\n" " uint tiisg[[thread_index_in_simdgroup]],\n" " uint 
sgitg[[simdgroup_index_in_threadgroup]]) {\n" +" // each threadgroup contain 2 simdgroup\n" +" // each simdgroup compute 4 data\n" " int uz=gid.x*2+sgitg;\n" +" if(uz >= cst.output_slice) {\n" +" return;\n" +" }\n" " int rx=gid.y;\n" " auto xy_wt=wt+uz*cst.input_slice;\n" " auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n" @@ -1633,43 +1792,105 @@ const char* shader_MetalConvolution1x1_metal = " auto biasValue=FLOAT4(biasTerms[uz]);\n" " FLOAT4 result0=FLOAT4(0);\n" " int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n" -" for (int bi=0; bi> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" " FLOAT4 res=w4*scale[i]+dequant_bias[i];\n" " w_dequant[i]=res;\n" " }\n" " result0 += FLOAT4(in40*w_dequant);\n" " \n" -"// FLOAT4x4 w_dequant;\n" -"// for (int i=0; i<4; ++i) {\n" -"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" -"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n" -"// w_dequant[i]=w4;\n" -"// }\n" -"//\n" -"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n" -"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n" -" }\n" -" }\n" -" FLOAT4 res;\n" -" res.x=simd_sum(result0.x);\n" -" res.y=simd_sum(result0.y);\n" -" res.z=simd_sum(result0.z);\n" -" res.w=simd_sum(result0.w);\n" +" }\n" +" }\n" +" FLOAT4 res=simd_sum(result0);\n" " /* true */\n" " if (tiisg == 0) {\n" " xy_out[0]=activate(M4(res+biasValue),cst.activation);\n" " }\n" "}\n" +"kernel void conv1x1_gemv_g16_w4(const device M4 *in [[buffer(0)]],\n" +" device M4 *out [[buffer(1)]],\n" +" constant conv1x1_constants& cst [[buffer(2)]],\n" +" const device MNN::uchar4x2 *wt [[buffer(3)]],\n" +" const device M4 *biasTerms [[buffer(4)]],\n" +" const device float4 *dequantScale [[buffer(5)]],\n" +" uint3 gid[[threadgroup_position_in_grid]],\n" +" uint tiisg[[thread_index_in_simdgroup]],\n" +" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n" +" // each threadgroup contain 2 simdgroup\n" +" // each simdgroup compute 8 data\n" +" int uz=2*(gid.x*2+sgitg);\n" +" if(uz >= cst.output_slice) {\n" +" return;\n" +" }\n" +" \n" +" auto xy_wt=wt+uz*cst.input_slice;\n" +" auto xy_in0=in;\n" +" auto xy_out=out+(int)gid.z*cst.output_size+uz;\n" +" auto biasValue0=FLOAT4(biasTerms[uz]);\n" +" auto biasValue1=FLOAT4(biasTerms[uz+1]);\n" +" FLOAT4 result0=FLOAT4(0);\n" +" FLOAT4 result1=FLOAT4(0);\n" +" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n" +" \n" +" int middle_step=min(SIMD_GROUP_WIDTH,block);\n" +" int outer_step=SIMD_GROUP_WIDTH/middle_step;\n" +" int middle_index=(tiisg) % middle_step;\n" +" int outer_index=(tiisg)/middle_step;\n" +" \n" +" for (int bi= outer_index; bi> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" +" FLOAT4 res=w4*scale0[i]+dequant_bias0[i];\n" +" w_dequant[i]=res;\n" +" }\n" +" result0 += FLOAT4(in40*w_dequant);\n" +" w_int4=xy_wt[cst.input_slice+z];\n" +" for (int i=0; i<4; i += 1) {\n" +" FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" +" FLOAT4 res=w4*scale1[i]+dequant_bias1[i];\n" +" w_dequant[i]=res;\n" +" }\n" +" \n" +" result1 += FLOAT4(in40*w_dequant);\n" +" \n" +" }\n" +" }\n" +" FLOAT4 res0=simd_sum(result0);\n" +" FLOAT4 res1=simd_sum(result1);\n" +" /* true */\n" +" if (tiisg == 0) {\n" +" xy_out[0]=activate(M4(res0+biasValue0),cst.activation);\n" +" 
xy_out[1]=activate(M4(res1+biasValue1),cst.activation);\n" +" }\n" +"}\n" "kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n" " device M4 *out [[buffer(1)]],\n" " constant conv1x1_constants& cst [[buffer(2)]],\n" diff --git a/source/backend/metal/MNNMetalContext.h b/source/backend/metal/MNNMetalContext.h index ca5a589d1..af197f14f 100644 --- a/source/backend/metal/MNNMetalContext.h +++ b/source/backend/metal/MNNMetalContext.h @@ -34,7 +34,6 @@ typedef enum { @property (strong, nonatomic, readonly) id device; /** max memory length cound be used in threadgroup */ @property (assign, nonatomic, readonly) BOOL isIphone; -@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable; /** * @brief alloc temp buffer on device diff --git a/source/backend/metal/MNNMetalContext.mm b/source/backend/metal/MNNMetalContext.mm index b271c1243..2fe8a7199 100644 --- a/source/backend/metal/MNNMetalContext.mm +++ b/source/backend/metal/MNNMetalContext.mm @@ -79,20 +79,6 @@ static void createLibrary(id device, NSMutableDictionary= param.query_seq_len || y >= param.head_num || z >= param.key_seq_len) { + return; + } + int group = param.group; + + int key_seq_len = param.key_seq_len; + int head_num = param.head_num; + int head_dim = param.head_dim; + int yr = y % param.group; + + const int offset = head_num * head_dim; + const int offset_head = y * head_dim; + const int offset_head_kv = (y / param.group) * head_dim; + const device T* A_offset = input0 + x * offset + offset_head; + device T* Pastkey_offset = past_key + z * offset / group + offset_head_kv; + float Vscale = (float)param.scale; + const device T *B_offset = input1 + offset_head_kv; float out = 0.0; + +#ifdef SIMD_GROUP_REDUCE + if (z == key_seq_len - 1) { + for(int i = tiisg; i < head_dim; i+=SIMD_GROUP_WIDTH){ + float A = (float)(A_offset[i]); + float B = (float)(B_offset[i]); + out += B * A; + if (yr == 0) { + Pastkey_offset[i] = (T)B; + } + } + } else { + for(int i = tiisg; i < head_dim; i+=SIMD_GROUP_WIDTH){ + float A = A_offset[i]; + float B = (float)Pastkey_offset[i]; + + out += A * B; + } + } + out = simd_sum(out); + if(tiisg == 0) { + out *= Vscale; + output[y * key_seq_len + z] = (T)out; + } +#else if (z == key_seq_len - 1) { - for(int i = 0; i < head_dim; ++i){ + for(int i = 0; i < head_dim; i++){ float A = (float)(A_offset[i]); float B = (float)(B_offset[i]); out += B * A; @@ -96,7 +161,7 @@ kernel void main0(const device T* input0 [[buffer(0)]], } } } else { - for(int i = 0; i < head_dim; ++i){ + for(int i = 0; i < head_dim; i++){ float A = A_offset[i]; float B = (float)Pastkey_offset[i]; @@ -123,7 +188,8 @@ kernel void main0(const device T* input0 [[buffer(0)]], int head_dim; float scale; }; -kernel void main0(const device T* input0 [[buffer(0)]], +#define SIMD_GROUP_WIDTH 32 +kernel void prefill(const device T* input0 [[buffer(0)]], const device T* input1 [[buffer(1)]], device T* output [[buffer(2)]], device T* past_value [[buffer(3)]], @@ -144,7 +210,7 @@ kernel void main0(const device T* input0 [[buffer(0)]], int head_dim = param.head_dim; const int stride = head_num * head_dim / group; const int offset_head = yin * head_dim + z; -#ifdef FOR_PREFILL + device const T *A_offset = input0 + (y * qk_seq_len + x) * value_seq_len; device const T *B_offset = input1 + offset_head; device T *Pastvalue_offset = past_value + offset_head; @@ -159,13 +225,59 @@ kernel void main0(const device T* input0 [[buffer(0)]], } } output[ x * stride * group + (y * head_dim + z)] = out; +} + +kernel void decode(const device T* input0 
[[buffer(0)]], + const device T* input1 [[buffer(1)]], + device T* output [[buffer(2)]], + device T* past_value [[buffer(3)]], + constant Param& param [[buffer(4)]], +#ifdef SIMD_GROUP_REDUCE + uint3 gid[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]] #else + uint3 gid[[thread_position_in_grid]] +#endif +) { + const int x = gid.x; // query_seq_len + const int y = gid.y; // head_num + const int z = gid.z; // head_dim + if (x >= param.query_seq_len || y >= param.head_num || z >= param.head_dim) { + return; + } + int group = param.group; + int yin = y / param.group; + int yr = y % param.group; + + int value_seq_len = param.key_seq_len; + int head_num = param.head_num; + int head_dim = param.head_dim; + const int stride = head_num * head_dim / group; + const int offset_head = yin * head_dim + z; + device const T *A_offset = input0 + y * value_seq_len; device const T *B_offset = input1 + offset_head; device T *Pastvalue_offset = past_value + offset_head; float out = 0; - for(int i = 0; i < value_seq_len - 1; ++i){ +#ifdef SIMD_GROUP_REDUCE + for(int i = tiisg; i < value_seq_len - 1; i+=SIMD_GROUP_WIDTH){ + float A = (float)A_offset[i]; + float B = (float)Pastvalue_offset[i * stride]; + + out += A * B; + } + out = simd_sum(out); + if(tiisg == 0) { + out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0]; + if (yr == 0) { + Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0]; + } + output[(y * head_dim + z)] = (T)out; + } +#else + for(int i = 0; i < value_seq_len - 1; i++){ float A = (float)A_offset[i]; float B = (float)Pastvalue_offset[i * stride]; @@ -177,7 +289,6 @@ kernel void main0(const device T* input0 [[buffer(0)]], } output[(y * head_dim + z)] = (T)out; #endif - } )metal"; @@ -292,68 +403,6 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { auto mtbn = static_cast(backend()); auto context = (__bridge MNNMetalContext *)mtbn->context(); auto shape = query->shape(); - if (nil == mKernel_softmax) { - // Init Kernel - bool float_mask = (mask->getType() == halide_type_of()); - auto rt = mtbn->runtime(); - std::string T = "float"; - if (mtbn->useFp16InsteadFp32()) { - T = "half"; - } - std::vector qkKeys = { - {"matmul_qk_div_mask", T} - }; - std::vector qkvKeys = { - {"matmul_qkv", T} - }; - std::vector qkPrefillKeys = { - {"matmul_qk_div_mask", T, "FOR_PREFILL"} - }; - if (float_mask) { - qkPrefillKeys.emplace_back("FLOAT_MASK"); - } - std::vector qkvPrefillKeys = { - {"matmul_qkv", T, "FOR_PREFILL"} - }; - std::vector> keys = { - qkKeys, - qkvKeys, - qkPrefillKeys, - qkvPrefillKeys - }; - std::vector sources = { - gMatMulDivMask, - gMatMulQKV, - gMatMulDivMask, - gMatMulQKV, - }; - std::vector> pipelines(keys.size()); - for (int i=0; ifindPipeline(keys[i]); - if (nil == pipeline) { - // Rebuild Pipeline - MTLCompileOptions *option = [[MTLCompileOptions alloc] init]; - auto dic = [NSMutableDictionary dictionaryWithCapacity:0]; - [dic setValue:@(keys[i][1].c_str()) forKey:@"T"]; - for (int j=2; jmakeComputePipelineWithSourceOption(sources[i], "main0", option); - rt->insertPipeline(keys[i], pipeline); - } - pipelines[i] = pipeline; - } - mKernel_qk = pipelines[0]; - mKernel_qkv = pipelines[1]; - mKernelPrefill_qk = pipelines[2]; - mKernelPrefill_qkv = pipelines[3]; - MNN_ASSERT(nil != mKernel_qk); - MNN_ASSERT(nil != mKernel_qkv); - MNN_ASSERT(nil != mKernelPrefill_qk); - MNN_ASSERT(nil != mKernelPrefill_qkv); - mKernel_softmax = [context pipelineWithName:@"softmax_plane" 
fp16:mtbn->useFp16InsteadFp32()]; - } int seq_len = shape[1]; mNumHead = shape[2]; mHeadDim = shape[3]; @@ -367,6 +416,87 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { mCache->mKv_seq_len = mCache->mPastLength + 1; } mKvNumHead = key->shape()[2]; + + auto rt = (MetalRuntime*)mtbn->runtime(); + bool supportSimdReduce = rt->supportSimdGroupReduce(); + // decode and thread number not too large + bool qkSimdReduce = supportSimdReduce && seq_len == 1 && mCache->mKv_seq_len * mNumHead < mHeadDim * 32; + bool sftmSimdReduce = supportSimdReduce; + bool qkvSimdReduce = supportSimdReduce && seq_len == 1 && mHeadDim * mNumHead < mCache->mKv_seq_len * 32; + + // Init Kernel + bool float_mask = (mask->getType() == halide_type_of()); + std::string T = "float"; + if (mtbn->useFp16InsteadFp32()) { + T = "half"; + } + std::vector qkKeys = { + {"matmul_qk_div_mask", T} + }; + if(qkSimdReduce) { + qkKeys.emplace_back("SIMD_GROUP_REDUCE"); + } + std::vector qkvKeys = { + {"matmul_qkv", T} + }; + if(qkvSimdReduce) { + qkvKeys.emplace_back("SIMD_GROUP_REDUCE"); + } + std::vector qkPrefillKeys = { + {"matmul_qk_div_mask", T, "FOR_PREFILL"} + }; + if (float_mask) { + qkPrefillKeys.emplace_back("FLOAT_MASK"); + } + std::vector qkvPrefillKeys = { + {"matmul_qkv", T, "FOR_PREFILL"} + }; + std::vector> keys = { + qkKeys, + qkvKeys, + qkPrefillKeys, + qkvPrefillKeys + }; + std::vector sources = { + gMatMulDivMask, + gMatMulQKV, + gMatMulDivMask, + gMatMulQKV, + }; + std::vector> pipelines(keys.size()); + for (int i=0; ifindPipeline(keys[i]); + if (nil == pipeline) { + // Rebuild Pipeline + MTLCompileOptions *option = [[MTLCompileOptions alloc] init]; + auto dic = [NSMutableDictionary dictionaryWithCapacity:0]; + [dic setValue:@(keys[i][1].c_str()) forKey:@"T"]; + for (int j=2; jmakeComputePipelineWithSourceOption(sources[i], "prefill", option); + } else { + pipeline = mtbn->makeComputePipelineWithSourceOption(sources[i], "decode", option); + } + rt->insertPipeline(keys[i], pipeline); + } + pipelines[i] = pipeline; + } + mKernel_qk = pipelines[0]; + mKernel_qkv = pipelines[1]; + mKernelPrefill_qk = pipelines[2]; + mKernelPrefill_qkv = pipelines[3]; + MNN_ASSERT(nil != mKernel_qk); + MNN_ASSERT(nil != mKernel_qkv); + MNN_ASSERT(nil != mKernelPrefill_qk); + MNN_ASSERT(nil != mKernelPrefill_qkv); + if(sftmSimdReduce) { + mKernel_softmax = [context pipelineWithName:@"softmax_plane_sg" fp16:mtbn->useFp16InsteadFp32()]; + } else { + mKernel_softmax = [context pipelineWithName:@"softmax_plane" fp16:mtbn->useFp16InsteadFp32()]; + } int group_size = mNumHead / mKvNumHead; @@ -440,7 +570,13 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { MetalBackend::setTensor(mCache->mPastKey.get(), encoder, 3); MetalBackend::setTensor(mask, encoder, 4); [encoder setBuffer:mParamQKV offset:0 atIndex:5]; - auto gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, mNumHead, mCache->mKv_seq_len)]; + + std::pair gl; + if(qkSimdReduce) { + gl = std::make_pair(MTLSizeMake(seq_len, mNumHead, mCache->mKv_seq_len), MTLSizeMake(32, 1, 1)); + } else { + gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, mNumHead, mCache->mKv_seq_len)]; + } [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second]; } // Run Softmax Kernel @@ -449,7 +585,15 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { MetalBackend::setTensor(mTempQK.get(), encoder, 0); MetalBackend::setTensor(mTempSoftMax.get(), encoder, 1); 
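        // Descriptive note on the dispatch choice below: when the simdgroup-reduce softmax
        // (softmax_plane_sg) is selected, each threadgroup is a single 32-wide simdgroup that
        // reduces one whole softmax axis, so a fixed 32x1x1 threadgroup size is dispatched;
        // otherwise the tuned sizes from computeBestGroupAndLocal are used as before.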
[encoder setBuffer:mParamSoftmax offset:0 atIndex:2]; - auto gl = [context computeBestGroupAndLocal: mKernel_softmax threads:MTLSizeMake(inside, outside, 1)]; + + int thread_group_size = 32; + std::pair gl; + if(sftmSimdReduce) { + gl = std::make_pair(MTLSizeMake(inside, outside, 1), MTLSizeMake(thread_group_size, 1, 1)); + } else { + gl = [context computeBestGroupAndLocal: mKernel_softmax threads:MTLSizeMake(inside, outside, 1)]; + } + [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second]; } // Run QKV Kernel @@ -466,7 +610,12 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { MetalBackend::setTensor(outputs[0], encoder, 2); MetalBackend::setTensor(mCache->mPastValue.get(), encoder, 3); [encoder setBuffer:mParamQKV offset:0 atIndex:4]; - auto gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, mNumHead, mHeadDim)]; + std::pair gl; + if(qkvSimdReduce) { + gl = std::make_pair(MTLSizeMake(seq_len, mNumHead, mHeadDim), MTLSizeMake(32, 1, 1)); + } else { + gl = [context computeBestGroupAndLocal:pipeline threads:MTLSizeMake(seq_len, mNumHead, mHeadDim)]; + } [encoder dispatchThreadgroups:gl.first threadsPerThreadgroup:gl.second]; } // Update status @@ -474,6 +623,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { mCache->mPastLength += 1; mCache->mKv_seq_len = mCache->mPastLength + 1; } +// printf("qk:%d %d %d, softmax:%d %d %d, qkv:%d %d %d\n", seq_len, mNumHead, mCache->mKv_seq_len, inside, outside, 1, seq_len, mNumHead, mHeadDim); return; } diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index dfcc571dc..88996a968 100644 --- a/source/backend/metal/MetalBackend.hpp +++ b/source/backend/metal/MetalBackend.hpp @@ -36,7 +36,12 @@ class MetalRuntime : public Runtime { void *context() const { return mContext; } - + bool supportSimdGroupReduce() { + return mSimdGroupReduce; + } + bool supportSimdGroupMatrix() { + return mSimdGroupMatrix; + } void setGpuMode(const int cl_mode_num); void setCommandQueue(id queue, bool userSync); id getCommandQueue() const { @@ -93,6 +98,9 @@ class MetalRuntime : public Runtime { TunedInfo* mTunedInfo; BackendConfig mDefaultConfig; mutable std::map, id> mCachePipeine; +private: + bool mSimdGroupReduce; + bool mSimdGroupMatrix; }; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index 3d680b65f..d0d5229a2 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -219,6 +219,7 @@ MemChunk chunk() override { Execution *MetalBackend::onCreate(const std::vector &inputs, const std::vector &outputs, const Op *op) { auto map = getCreatorMap(); + auto iter = map->find(op->type()); if (iter == map->end()) { mSupportDeferEncode = false; @@ -967,6 +968,9 @@ static void _execute(id encoder, const MetalBackend::C mContext = context; auto ctx = (__bridge MNNMetalContext *)mContext; std::shared_ptr allocator(new MetalRuntimeAllocator([ctx device])); + mSimdGroupReduce = [[ctx device] supportsFamily:MTLGPUFamilyApple7]; + mSimdGroupReduce |= [[ctx device] supportsFamily:MTLGPUFamilyMetal3]; + mSimdGroupMatrix = [[ctx device] supportsFamily:MTLGPUFamilyApple7]; mStatic.reset(new EagerBufferAllocator(allocator)); mDynamic.resize(METAL_SEPERATE_MAX_COUNT); for (auto& buf : mDynamic) { diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm index 35e65118d..2a5065615 100644 --- a/source/backend/metal/MetalConvolution1x1.mm 
+++ b/source/backend/metal/MetalConvolution1x1.mm @@ -87,11 +87,19 @@ std::string name = "conv1x1_g1z4_w8"; mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()]; if (mDequantBits == 4) { - if(context.isSimdGroupAvailable && ob * ow * oh == 1) { - mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()]; - name = "conv1x1_g1z4_m1w4"; - mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1)); - + if(rt->supportSimdGroupReduce() && ob * ow * oh == 1) { + // unrool c for avoid memory exceed + if(oc > 16384 && oc_4 % 2 == 0) { + mPipeline = [context pipelineWithName:@"conv1x1_gemv_g16_w4" fp16:backend->useFp16InsteadFp32()]; + name = "conv1x1_gemv_g16_w4"; +// MNN_PRINT("g16 ic: %d oc: %d\n", input->channel(), oc); + mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 16), 1, 1), MTLSizeMake(64, 1, 1)); + } else { + mPipeline = [context pipelineWithName:@"conv1x1_gemv_g8_w4" fp16:backend->useFp16InsteadFp32()]; + name = "conv1x1_gemv_g8_w4"; +// MNN_PRINT("g8 ic: %d oc: %d\n", input->channel(), oc); + mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(64, 1, 1)); + } return NO_ERROR; } else { mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()]; diff --git a/source/backend/metal/MetalConvolutionCommon.mm b/source/backend/metal/MetalConvolutionCommon.mm index 318c138eb..f47464209 100644 --- a/source/backend/metal/MetalConvolutionCommon.mm +++ b/source/backend/metal/MetalConvolutionCommon.mm @@ -156,15 +156,15 @@ void weightInBlock(int group, int oc, int ic, int kh, int kw, const FType *src, } // param auto size = qnt ? MAX(qnt->weight.size(), qnt->weightFloat.size()) : conv->weight()->size(); - if (loadWeightInt8 && qnt->canUseInt4) { - size *= 2; - } auto common = conv->common(); auto kw = common->kernelX(); auto kh = common->kernelY(); auto group = common->group(); auto oc = common->outputCount(); - auto ic = size / kw / kh / (oc / group); + int ic = common->inputCount(); + if (0 == ic) { + ic = size / kw / kh / (oc / group); + } // convert if (loadWeightInt8 && qnt->weight.get() != nullptr) { diff --git a/source/backend/metal/MetalLayerNorm.mm b/source/backend/metal/MetalLayerNorm.mm index 7eaf586f0..ac096193c 100755 --- a/source/backend/metal/MetalLayerNorm.mm +++ b/source/backend/metal/MetalLayerNorm.mm @@ -75,22 +75,33 @@ ((float *)mShapeBuffer.contents)[2] = mEps; ((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_; - - bool parallel = (mInside > 32) && ((mInside & 3) == 0); - if(RMSNorm){ - mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()]; - }else{ - mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4" : @"layernorm_x1" fp16:backend->useFp16InsteadFp32()]; - } - auto inside = parallel ? 
mInside/4 : mInside; - mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)]; - if(context.isSimdGroupAvailable) { - if(mOutside == 1 && RMSNorm && parallel) { - mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()]; - mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1)); + if(((MetalRuntime *)backend->runtime())->supportSimdGroupReduce()) { + if(RMSNorm) { + if(parallel) { + mPipeline = [context pipelineWithName:@"layernorm_x4_rms_sg" fp16:backend->useFp16InsteadFp32()]; + mThreads = std::make_pair(MTLSizeMake(inside, mOutside, 1), MTLSizeMake(32, 1, 1)); + } else { + mPipeline = [context pipelineWithName:@"layernorm_x1_rms_sg" fp16:backend->useFp16InsteadFp32()]; + mThreads = std::make_pair(MTLSizeMake(inside, mOutside, 1), MTLSizeMake(32, 1, 1)); + } + } else { + if(parallel) { + mPipeline = [context pipelineWithName:@"layernorm_x4_sg" fp16:backend->useFp16InsteadFp32()]; + mThreads = std::make_pair(MTLSizeMake(inside, mOutside, 1), MTLSizeMake(32, 1, 1)); + } else { + mPipeline = [context pipelineWithName:@"layernorm_x1_sg" fp16:backend->useFp16InsteadFp32()]; + mThreads = std::make_pair(MTLSizeMake(inside, mOutside, 1), MTLSizeMake(32, 1, 1)); + } + } + } else { + if(RMSNorm){ + mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()]; + }else{ + mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4" : @"layernorm_x1" fp16:backend->useFp16InsteadFp32()]; } + mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)]; } return NO_ERROR; } diff --git a/source/backend/metal/MetalRaster.mm b/source/backend/metal/MetalRaster.mm index 788f13087..27dbfb2ff 100644 --- a/source/backend/metal/MetalRaster.mm +++ b/source/backend/metal/MetalRaster.mm @@ -284,6 +284,10 @@ kernel void main0(device int4 *out [[buffer(0)]], if (nil != mZeroCopy) { mtbn->returnConstBuffer(mZeroCopy); } + auto bufferAlloc = mtbn->getStaticBufferPool(); + for(auto& iter : mTempInputCopy) { + bufferAlloc->free(iter.second.blit); + } } struct MemsetInfo { int value[4]; @@ -320,6 +324,9 @@ kernel void main0(device int4 *out [[buffer(0)]], } } + for (auto& iter : mTempInputCopy) { + bufferAlloc->free(iter.second.blit); + } mTempInputCopy.clear(); mOutputPtr = output; #ifndef MNN_METAL_FORBID_RASTER_C4 diff --git a/source/backend/metal/shader/MetalConvolution1x1.metal b/source/backend/metal/shader/MetalConvolution1x1.metal index 80e4d7fb6..584f30000 100644 --- a/source/backend/metal/shader/MetalConvolution1x1.metal +++ b/source/backend/metal/shader/MetalConvolution1x1.metal @@ -167,7 +167,7 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]], //if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); } } -kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]], +kernel void conv1x1_gemv_g8_w4(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant conv1x1_constants& cst [[buffer(2)]], const device MNN::uchar4x2 *wt [[buffer(3)]], @@ -176,7 +176,12 @@ kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]], uint3 gid[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { + // each threadgroup contain 2 simdgroup + // each simdgroup compute 4 data int uz = gid.x * 2 + 
sgitg; + if(uz >= cst.output_slice) { + return; + } int rx = gid.y; auto xy_wt = wt + uz * cst.input_slice; @@ -186,17 +191,24 @@ kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]], FLOAT4 result0 = FLOAT4(0); int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; - for (int bi=0; bi> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); FLOAT4 res = w4 * scale[i] + dequant_bias[i]; w_dequant[i] = res; @@ -204,28 +216,95 @@ kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]], result0 += FLOAT4(in40 * w_dequant); -// FLOAT4x4 w_dequant; -// for (int i = 0; i < 4; ++i) { -// FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); -// FLOAT4 res = w4 * scale[i] + dequant_bias[i]; -// w_dequant[i] = w4; -// } -// -// FLOAT4 temp = FLOAT4(in40 * w_dequant); -// result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias; } } - FLOAT4 res; - res.x = simd_sum(result0.x); - res.y = simd_sum(result0.y); - res.z = simd_sum(result0.z); - res.w = simd_sum(result0.w); + + FLOAT4 res = simd_sum(result0); /* true */ if (tiisg == 0) { xy_out[0] = activate(ftype4(res + biasValue), cst.activation); } } + + + +kernel void conv1x1_gemv_g16_w4(const device ftype4 *in [[buffer(0)]], + device ftype4 *out [[buffer(1)]], + constant conv1x1_constants& cst [[buffer(2)]], + const device MNN::uchar4x2 *wt [[buffer(3)]], + const device ftype4 *biasTerms [[buffer(4)]], + const device float4 *dequantScale [[buffer(5)]], + uint3 gid[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + // each threadgroup contain 2 simdgroup + // each simdgroup compute 8 data + int uz = 2 * (gid.x * 2 + sgitg); + if(uz >= cst.output_slice) { + return; + } + + auto xy_wt = wt + uz * cst.input_slice; + auto xy_in0 = in; + auto xy_out = out + (int)gid.z * cst.output_size + uz; + auto biasValue0 = FLOAT4(biasTerms[uz]); + auto biasValue1 = FLOAT4(biasTerms[uz + 1]); + + FLOAT4 result0 = FLOAT4(0); + FLOAT4 result1 = FLOAT4(0); + + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + + int middle_step = min(SIMD_GROUP_WIDTH, block); + int outer_step = SIMD_GROUP_WIDTH / middle_step; + int middle_index = (tiisg) % middle_step; + int outer_index = (tiisg) / middle_step; + + for (int bi= outer_index; bi> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); + FLOAT4 res = w4 * scale0[i] + dequant_bias0[i]; + w_dequant[i] = res; + } + result0 += FLOAT4(in40 * w_dequant); + + w_int4 = xy_wt[cst.input_slice + z]; + for (int i = 0; i < 4; i += 1) { + FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); + FLOAT4 res = w4 * scale1[i] + dequant_bias1[i]; + w_dequant[i] = res; + } + + result1 += FLOAT4(in40 * w_dequant); + + } + } + + FLOAT4 res0 = simd_sum(result0); + FLOAT4 res1 = simd_sum(result1); + + /* true */ + if (tiisg == 0) { + xy_out[0] = activate(ftype4(res0 + biasValue0), cst.activation); + xy_out[1] = activate(ftype4(res1 + biasValue1), cst.activation); + + } +} + kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant conv1x1_constants& cst [[buffer(2)]], diff --git a/source/backend/metal/shader/MetalLayerNorm.metal 
b/source/backend/metal/shader/MetalLayerNorm.metal index bad927112..3c02d204e 100644 --- a/source/backend/metal/shader/MetalLayerNorm.metal +++ b/source/backend/metal/shader/MetalLayerNorm.metal @@ -40,6 +40,47 @@ kernel void layernorm_x1(const device ftype *in [[buffer(0)]], } } +kernel void layernorm_x1_sg(const device ftype *in [[buffer(0)]], + device ftype *out [[buffer(1)]], + constant layernorm_constants& cst [[buffer(2)]], + const device float *gamma [[buffer(3)]], + const device float *beta [[buffer(4)]], + uint3 gid [[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + if ((int)gid.x >= cst.inside || (int)gid.y >= cst.outside) { + return; + } + auto in_data = in + gid.y * cst.inside; + auto out_data = out + gid.y * cst.inside; + + float mean; + float sum = 0.0f; + float square_sum = 0.0f; + + for(int i = tiisg; i < cst.inside; i+=SIMD_GROUP_WIDTH) { + sum += in_data[i]; + } + sum = simd_sum(sum); + mean = sum / cst.inside; + + for(int i = tiisg; i < cst.inside; i+=SIMD_GROUP_WIDTH) { + float dis = (in_data[i] - mean); + square_sum += dis * dis; + } + square_sum = simd_sum(square_sum); + + if(tiisg == 0) { + float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps); + + float norm = var * ((float)in_data[gid.x] - mean); + if(cst.has_gamma_beta) { + out_data[gid.x] = (ftype)(norm * gamma[gid.x] + beta[gid.x]); + } else { + out_data[gid.x] = (ftype)(norm); + } + } +} kernel void layernorm_x4(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], @@ -85,6 +126,56 @@ kernel void layernorm_x4(const device ftype4 *in [[buffer(0)]], } } +kernel void layernorm_x4_sg(const device ftype4 *in [[buffer(0)]], + device ftype4 *out [[buffer(1)]], + constant layernorm_constants& cst [[buffer(2)]], + const device float4 *gamma [[buffer(3)]], + const device float4 *beta [[buffer(4)]], + uint3 gid [[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + if ((int)gid.x >= cst.inside/4 || (int)gid.y >= cst.outside) { + return; + } + auto in_data = in + gid.y * cst.inside/4; + auto out_data = out + gid.y * cst.inside/4; + + float mean; + float sum = 0.0f; + float square_sum = 0.0f; + + for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) { + sum += in_data[i].x; + sum += in_data[i].y; + sum += in_data[i].z; + sum += in_data[i].w; + } + sum = simd_sum(sum); + mean = sum / cst.inside; + + for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) { + float dis = (in_data[i].x - mean); + square_sum += dis * dis; + dis = (in_data[i].y - mean); + square_sum += dis * dis; + dis = (in_data[i].z - mean); + square_sum += dis * dis; + dis = (in_data[i].w - mean); + square_sum += dis * dis; + } + square_sum = simd_sum(square_sum); + + if(tiisg == 0) { + float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps); + + float4 norm = var * ((float4)in_data[gid.x] - mean); + if(cst.has_gamma_beta) { + out_data[gid.x] = (ftype4)(norm * gamma[gid.x] + beta[gid.x]); + } else { + out_data[gid.x] = (ftype4)(norm); + } + } +} kernel void layernorm_x1_rms(const device ftype *in [[buffer(0)]], device ftype *out [[buffer(1)]], @@ -114,6 +205,41 @@ kernel void layernorm_x1_rms(const device ftype *in [[buffer(0)]], } } +kernel void layernorm_x1_rms_sg(const device ftype *in [[buffer(0)]], + device ftype *out [[buffer(1)]], + constant layernorm_constants& cst [[buffer(2)]], + const device float *gamma [[buffer(3)]], + const device float *beta [[buffer(4)]], + uint3 
gid [[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + if ((int)gid.x >= cst.inside || (int)gid.y >= cst.outside) { + return; + } + auto in_data = in + gid.y * cst.inside; + auto out_data = out + gid.y * cst.inside; + + float square_sum = 0.0f; + + for(int i = tiisg; i < cst.inside; i+=SIMD_GROUP_WIDTH) { + float dis = in_data[i]; + square_sum += dis * dis; + } + + square_sum = simd_sum(square_sum); + + if(tiisg == 0) { + float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps); + + float norm = var * ((float)in_data[gid.x]); + if(cst.has_gamma_beta) { + out_data[gid.x] = (ftype)(norm * gamma[gid.x] + beta[gid.x]); + } else { + out_data[gid.x] = (ftype)(norm); + } + } +} + kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant layernorm_constants& cst [[buffer(2)]], @@ -148,18 +274,20 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]], } } -kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]], +kernel void layernorm_x4_rms_sg(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant layernorm_constants& cst [[buffer(2)]], const device float4 *gamma [[buffer(3)]], const device float4 *beta [[buffer(4)]], - uint gid [[threadgroup_position_in_grid]], + uint3 gid [[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { + if ((int)gid.x >= cst.inside/4 || (int)gid.y >= cst.outside) { + return; + } - int total_idx = (gid * 4 + sgitg); - int in_idx = total_idx % (cst.inside/4); - int out_idx = total_idx / (cst.inside/4); + int in_idx = gid.x; + int out_idx = gid.y; auto in_data = in + out_idx * cst.inside/4; auto out_data = out + out_idx * cst.inside/4; diff --git a/source/backend/metal/shader/MetalSoftmax.metal b/source/backend/metal/shader/MetalSoftmax.metal index 4b7affc04..7bed37c00 100644 --- a/source/backend/metal/shader/MetalSoftmax.metal +++ b/source/backend/metal/shader/MetalSoftmax.metal @@ -17,6 +17,42 @@ static inline float4 softmax_filter(float4 value, int z, int limit) { return select(0, value, z * 4 + int4(0, 1, 2, 3) < limit); } + +kernel void softmax_plane_sg(const device ftype *in [[buffer(0)]], + device ftype *out [[buffer(1)]], + constant softmax_shape& s [[buffer(2)]], + uint2 gid[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]] + ) { + // threadgroup contain one simdgroup + // simdgroup compute axis data + if ((int)gid.x >= s.inside_size || (int)gid.y >= s.outside_size) return; + + auto axis_off = gid.y * s.axis_length * s.inside_size + gid.x; + auto axis_in = in + axis_off; + auto axis_out = out + axis_off; + + // get max + float max1 = -INFINITY; + for (int i = tiisg; i < s.axis_length; i+=SIMD_GROUP_WIDTH) { + max1 = max(max1, float(axis_in[i * s.inside_size])); + } + max1 = simd_max(max1); + + // get sum + float sum1 = 0; + for (int i = tiisg; i < s.axis_length; i+=SIMD_GROUP_WIDTH) { + sum1 += exp(float(axis_in[i * s.inside_size]) - float(max1)); + } + sum1 = simd_sum(sum1); + + // output + for (int i = tiisg; i < s.axis_length; i+=SIMD_GROUP_WIDTH) { + axis_out[i * s.inside_size] = ftype(exp(float(axis_in[i * s.inside_size]) - float(max1)) / sum1); + } +} + kernel void softmax_plane(const device ftype *in [[buffer(0)]], device ftype *out [[buffer(1)]], constant softmax_shape& s [[buffer(2)]], @@ -28,23 +64,24 @@ kernel void 
softmax_plane(const device ftype *in [[buffer(0)]], auto axis_out = out + axis_off; // get max - auto max1 = axis_in[0]; - for (int i = 1; i < s.axis_length; i++) { - max1 = max(max1, axis_in[i * s.inside_size]); + float max1 = -INFINITY; + for (int i = 0; i < s.axis_length; i++) { + max1 = max(max1, float(axis_in[i * s.inside_size])); } // get sum float sum1 = 0; for (int i = 0; i < s.axis_length; i++) { - sum1 += float(exp(axis_in[i * s.inside_size] - max1)); + sum1 += float(exp(float(axis_in[i * s.inside_size]) - float(max1))); } // output for (int i = 0; i < s.axis_length; i++) { - axis_out[i * s.inside_size] = ftype(exp(float(axis_in[i * s.inside_size] - max1)) / sum1); + axis_out[i * s.inside_size] = ftype(exp(float(axis_in[i * s.inside_size]) - float(max1)) / sum1); } } + kernel void softmax_on_reorder(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant softmax_shape& s [[buffer(2)]], diff --git a/source/backend/opencl/core/OpenCLBackend.cpp b/source/backend/opencl/core/OpenCLBackend.cpp index 67e0a1a81..23cb43b2e 100644 --- a/source/backend/opencl/core/OpenCLBackend.cpp +++ b/source/backend/opencl/core/OpenCLBackend.cpp @@ -685,7 +685,7 @@ void CLRuntime::convertFromDevice(const Tensor* srcTensor, const Tensor* dstTens } #ifdef MNN_SUPPORT_INTEL_SUBGROUP int cPack = TensorUtils::getTensorChannelPack(srcTensor); - if (cPack == 16) { + if (cPack == 16 && mOpenCLRuntime->isSupportedIntelSubgroup()) { switch (data_format) { case MNN_DATA_FORMAT_NHWC: OpenCL::convertNC4HW4OrNC16HW16BufferToNCHWOrNHWCBuffer(srcTensor, const_cast(dstTensor), @@ -803,7 +803,7 @@ void CLRuntime::convertToDevice(const Tensor* srcTensor, const Tensor* dstTensor } #ifdef MNN_SUPPORT_INTEL_SUBGROUP int cPack = TensorUtils::getTensorChannelPack(dstTensor); - if (cPack == 16) { + if (cPack == 16 && mOpenCLRuntime->isSupportedIntelSubgroup()) { if (MNN_DATA_FORMAT_NHWC == data_format) { OpenCL::converNCHWOrNHWCBufferToNC4HW4OrNC16HW16Buffer(srcTensor, const_cast(dstTensor), "nhwc_buffer_to_nc16hw16_buffer", mOpenCLRuntime.get(), true, false, svmFlag); } else if (MNN_DATA_FORMAT_NCHW == data_format) { @@ -855,7 +855,7 @@ void OpenCLBackend::copyToDevice(const Tensor* srcTensor, const Tensor* dstTenso auto memType = srcTensor->buffer().flags; void* hostPtr = srcTensor->host(); // 1*1*1*1 don't need convert - if(srcTensor->getType().code == halide_type_float && mOpenCLRuntime->isSupportedFP16() && 1 == shape[0] * shape[1] * shape[2] * shape[3]){ + if(BUFFER == mOpenCLRuntime->getGpuMemType() && srcTensor->getType().code == halide_type_float && mOpenCLRuntime->isSupportedFP16() && 1 == shape[0] * shape[1] * shape[2] * shape[3]){ needSize /= 2; void *tmpPtr = malloc(needSize); ((half_float::half*)tmpPtr)[0] = (half_float::half)(((float*)hostPtr)[0]); diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index 2b45559c4..40d7a4bce 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -17,7 +17,8 @@ //#define MNN_OPEN_TIME_TRACE #include #include "CLCache_generated.h" -#include "backend/opencl/execution/cl/opencl_source_map.hpp" +#include "backend/opencl/execution/cl/opencl_source_map.hpp" +//#define ARM_OPENCL_PRINTF_DEBUG using namespace CLCache; namespace MNN { @@ -30,6 +31,13 @@ bool OpenCLRuntime::getDeviceSupportsExtension(const cl::Device &device, const c return (pos != std::string::npos); } +#ifdef ARM_OPENCL_PRINTF_DEBUG +static void 
callback(const char *buffer, size_t length, size_t final, void *user_data) +{ + fwrite(buffer, 1, length, stdout); +} +#endif + OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const int cl_mode, int platformSize, int platformId, int deviceId, void *contextPtr, void *glShared) { #ifdef LOG_VERBOSE MNN_PRINT("start OpenCLRuntime !\n"); @@ -182,7 +190,18 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const mContext = std::shared_ptr(new cl::Context(std::vector({*mFirstGPUDevicePtr}), context_properties.data(), nullptr, nullptr, &res)); mIsDeviceSupportedLowPower = true; }else{ + #ifdef ARM_OPENCL_PRINTF_DEBUG + cl_context_properties context_properties[] = + { + CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[platformId](), + CL_PRINTF_CALLBACK_ARM, (cl_context_properties)callback, + CL_PRINTF_BUFFERSIZE_ARM, 0x1000, + 0 + }; + mContext = std::shared_ptr(new cl::Context(std::vector({*mFirstGPUDevicePtr}), context_properties, nullptr, nullptr, &res)); + #else mContext = std::shared_ptr(new cl::Context(std::vector({*mFirstGPUDevicePtr}), nullptr, nullptr, nullptr, &res)); + #endif } MNN_CHECK_CL_SUCCESS(res, "context"); diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index 3e288c1f5..a83bfaa8f 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -29,19 +29,17 @@ bool OpenCLSymbols::LoadOpenCLLibrary() { #if defined(__APPLE__) || defined(__MACOSX) "libOpenCL.so", "/System/Library/Frameworks/OpenCL.framework/OpenCL" - #elif defined(__ANDROID__) - "libOpenCL.so", - "libGLES_mali.so", - "libmali.so", - "libOpenCL-pixel.so", - /* #elif defined(__OHOS__) + "/vendor/lib64/chipsetsdk/libhvgr_v200.so", "/vendor/lib64/chipsetsdk/libGLES_mali.so", "/system/lib64/libGLES_mali.so", "libGLES_mali.so", - "/vendor/lib64/chipsetsdk/libhvgr_v200.so", "/vendor/lib64/chipsetsdk/libEGI_imp1.so", - */ + #elif defined(__ANDROID__) + "libOpenCL.so", + "libGLES_mali.so", + "libmali.so", + "libOpenCL-pixel.so", #if defined(__aarch64__) // Qualcomm Adreno "/system/vendor/lib64/libOpenCL.so", diff --git a/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp b/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp index 2302714cc..6a435ca7d 100644 --- a/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/AttentionBufExecution.cpp @@ -18,7 +18,7 @@ KVCacheCLManager::KVCacheCLManager(Backend *backend, bool kv_cahce) : mKVCache(k } void KVCacheCLManager::allocKVCache() { - if (!mKVCache || mPastLength < mMaxLength) { + if (!mKVCache) { return; } if(mOpenCLBackend->getOpenCLRuntime()->isSupportedFP16()){ @@ -36,38 +36,51 @@ bool KVCacheCLManager::reallocKVCache() { if (!mKVCache || mPastLength < mMaxLength) { return false; } - size_t old_size = mKvNumHead * UP_DIV(mMaxLength, 4) * mHeadDim * 4 * mByte; + size_t old_maxlen = ROUND_UP(mMaxLength, 4); mMaxLength = mPastLength + mExpandChunk; + size_t new_maxlen = ROUND_UP(mMaxLength, 4); size_t buffer_size = UP_DIV(mMaxLength, 4) * mKvNumHead * mHeadDim * 4 * mByte; // past_key: [1, numhead, headdim, maxlen] auto new_key = new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, buffer_size); // past_value: [1, numhead, maxlen, headdim] auto new_value = new cl::Buffer(mOpenCLBackend->getOpenCLRuntime()->context(), CL_MEM_READ_WRITE | 
CL_MEM_ALLOC_HOST_PTR, buffer_size); - // copy cl_int res; - auto new_key_ptr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*new_key, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); - auto key_ptr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*mPastKey.get(), true, CL_MAP_READ, 0, old_size, nullptr, nullptr, &res); - if(new_key_ptr != nullptr && key_ptr != nullptr && res == CL_SUCCESS){ - ::memcpy(new_key_ptr, key_ptr, old_size); - }else{ - MNN_ERROR("Map error key_ptr == nullptr \n"); - MNN_ASSERT(false); + // copy key + { + size_t old_maxlen_size = old_maxlen * mByte; + size_t new_maxlen_size = new_maxlen * mByte; + char *new_key_ptr = (char*)mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*new_key, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); + char *key_ptr = (char*)mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*mPastKey.get(), true, CL_MAP_READ, 0, old_size, nullptr, nullptr, &res); + if(new_key_ptr != nullptr && key_ptr != nullptr && res == CL_SUCCESS){ + for(int i = 0; i < mKvNumHead * mHeadDim; ++i){ + ::memcpy(new_key_ptr + i * new_maxlen_size, key_ptr + i * old_maxlen_size, old_maxlen_size); + } + }else{ + MNN_ERROR("Map error key_ptr == nullptr \n"); + MNN_ASSERT(false); + } + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*new_key, new_key_ptr); + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*mPastKey.get(), key_ptr); } - mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*new_key, new_key_ptr); - mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*mPastKey.get(), key_ptr); - auto new_value_ptr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*new_value, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); - auto value_ptr = mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*mPastValue.get(), true, CL_MAP_READ, 0, old_size, nullptr, nullptr, &res); - if(new_value_ptr != nullptr && value_ptr != nullptr && res == CL_SUCCESS){ - ::memcpy(new_value_ptr, value_ptr, old_size); - }else{ - MNN_ERROR("Map error value_ptr == nullptr \n"); - MNN_ASSERT(false); + // copy value + { + char *new_value_ptr = (char*)mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*new_value, true, CL_MAP_WRITE, 0, buffer_size, nullptr, nullptr, &res); + char *value_ptr = (char*)mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueMapBuffer(*mPastValue.get(), true, CL_MAP_READ, 0, old_size, nullptr, nullptr, &res); + if(new_value_ptr != nullptr && value_ptr != nullptr && res == CL_SUCCESS){ + for(int i = 0; i < mKvNumHead; ++i){ + for(int j = 0; j < old_maxlen; ++j){ + ::memcpy(new_value_ptr + (i * new_maxlen + j) * mHeadDim * mByte, value_ptr + (i * old_maxlen + j) * mHeadDim * mByte, mHeadDim * mByte); + } + } + }else{ + MNN_ERROR("Map error value_ptr == nullptr \n"); + MNN_ASSERT(false); + } + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*new_value, new_value_ptr); + mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*mPastValue.get(), value_ptr); } - mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*new_value, new_value_ptr); - mOpenCLBackend->getOpenCLRuntime()->commandQueue().enqueueUnmapMemObject(*mPastValue.get(), value_ptr); - mPastKey.reset(new_key); mPastValue.reset(new_value); return true; @@ -82,44 +95,477 @@ int 
AttentionBufExecution::getLocalSize(int size, int maxGroupSize){ } void AttentionBufExecution::reallocKVCache() { - int maxLength = mKVCacheCLManager->maxLength(); + mMax_len = ROUND_UP(mKVCacheCLManager->maxLength(), 4); int numHead = mKVCacheCLManager->numHead(); - mTempQK.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); - mTempSoftMax.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); + mTempQK.reset(Tensor::createDevice({UP_DIV(mMax_len, 4) * numHead * 4})); + mTempSoftMax.reset(Tensor::createDevice({UP_DIV(mMax_len, 4) * numHead * 4})); mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::STATIC); mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::STATIC); // reset memory for args if(mOpenCLBackend->isUseRecordQueue()){ - mQkUpdateInfo.update_kernel_args[1].arg_value = &openCLBuffer(mTempQK.get())(); - mQkUpdateInfo.update_kernel_args[2].arg_value = &(*(mKVCacheCLManager->key()))(); + mRgUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->key()))(); + mQkUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->key()))(); + mQkUpdateInfo.update_kernel_args[2].arg_value = &openCLBuffer(mTempQK.get())(); mSoftMaxUpdateInfo.update_kernel_args[0].arg_value = &openCLBuffer(mTempQK.get())(); mSoftMaxUpdateInfo.update_kernel_args[1].arg_value = &openCLBuffer(mTempSoftMax.get())(); + mRgVUpdateInfo.update_kernel_args[0].arg_value = &(*(mKVCacheCLManager->value()))(); mQkvUpdateInfo.update_kernel_args[0].arg_value = &openCLBuffer(mTempSoftMax.get())(); mQkvUpdateInfo.update_kernel_args[1].arg_value = &(*(mKVCacheCLManager->value()))(); }else{ cl_int ret = CL_SUCCESS; - ret |= mKernel_qk->get().setArg(5, openCLBuffer(mTempQK.get())); - ret |= mKernel_qk->get().setArg(6, *mKVCacheCLManager->key()); + ret |= mKernel_rearrange->get().setArg(4, *mKVCacheCLManager->key()); + ret |= mKernel_rearrange->get().setArg(6, mMax_len); + ret |= mKernel_qk->get().setArg(3, *mKVCacheCLManager->key()); + ret |= mKernel_qk->get().setArg(4, openCLBuffer(mTempQK.get())); + ret |= mKernel_qk->get().setArg(7, mMax_len); ret |= mKernel_softmax->get().setArg(3, openCLBuffer(mTempQK.get())); ret |= mKernel_softmax->get().setArg(4, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_qkv->get().setArg(3, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_qkv->get().setArg(6, *mKVCacheCLManager->value()); + ret |= mKernel_rearrangeV->get().setArg(4, *mKVCacheCLManager->value()); + ret |= mKernel_rearrangeV->get().setArg(6, mMax_len); + ret |= mKernel_qkv->get().setArg(2, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_qkv->get().setArg(3, *mKVCacheCLManager->value()); + ret |= mKernel_qkv->get().setArg(6, mMax_len); MNN_CHECK_CL_SUCCESS(ret, "reset memory arg for AttentionBufExecution"); } mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::STATIC); mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::STATIC); } +ErrorCode AttentionBufExecution::longPrefillResize(const std::vector &inputs, const std::vector &outputs){ + + auto query = inputs[0]; + auto key = inputs[1]; + auto value = inputs[2]; + auto mask = inputs[3]; + auto runtime = mOpenCLBackend->getOpenCLRuntime(); + auto shape = query->shape(); + + int batch = shape[0]; + int seq_len = shape[1]; + int numHead = shape[2]; + int kvNumHead = key->shape()[2]; + int headDim = shape[3]; + int group_size = numHead / kvNumHead; + float scale = 1.0 / sqrt(headDim); + mKVCacheCLManager->setArgs(seq_len, numHead, kvNumHead, headDim); + mKVCacheCLManager->allocKVCache(); + mKv_seq_len = 
mKVCacheCLManager->kvLength(); + int max_len = ROUND_UP(mKVCacheCLManager->maxLength(), 4); + mAlignQ = 128; + mAlignKV = 128; + mAlignHDK = 4; + mAlignHDN = 128; + + mTempQ.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(headDim, mAlignHDK) * batch * numHead})); + mTempK.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignKV) * ROUND_UP(headDim, mAlignHDK) * batch * numHead})); + mTempV.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignKV) * ROUND_UP(headDim, mAlignHDN) * batch * numHead})); + if(mIsAddMask) { + mTempMask.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch})); + } else { + mTempMask.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch})); + } + mTempQK.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch * numHead})); + mTempSoftMax.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch * numHead})); + mTempQKV.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(headDim, mAlignHDN) * batch * numHead})); + + + mOpenCLBackend->onAcquireBuffer(mTempQ.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempK.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempV.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempMask.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); + + mOpenCLBackend->onReleaseBuffer(mTempQ.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempK.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempMask.get(), Backend::DYNAMIC); + + mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + + mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + + mOpenCLBackend->onAcquireBuffer(mTempQKV.get(), Backend::DYNAMIC); + + mOpenCLBackend->onReleaseBuffer(mTempV.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempQKV.get(), Backend::DYNAMIC); + + // query: [batch, seqLenQ, headNum, headDim] -> mTempQ: [batch*headNum, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenQ, mAlignQ)] + // key: [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] -> mTempK: [batch*headNum/group, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenKV, mAlignKV)] + // value: [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] -> mTempV: [batch*headNum/group, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(headDim, mAlignHDK] + // key & value -> pastKey & pastValue (copy) + { + std::set buildOption; + if((headDim % 4) != 0){ + buildOption.emplace("-DHEADDIM_LEAVE"); + } + if((seq_len % 4) != 0){ + buildOption.emplace("-DSEQLEN_LEAVE"); + } + + int seq_len_pack_q = ROUND_UP(seq_len, mAlignQ); + int seq_len_pack_kv = ROUND_UP(mKv_seq_len, mAlignKV); + + int head_dim_pack_qk = ROUND_UP(headDim, mAlignHDK); + int head_dim_pack_v = ROUND_UP(headDim, mAlignHDN); + + int tile[4] = {mAlignQ, mAlignKV, mAlignHDK, mAlignHDN}; + int shape[4] = {seq_len, mKv_seq_len, numHead, headDim}; + int param[4] = {group_size, batch, max_len, 0}; + mKernel_rearrange = runtime->buildKernel("attention_buf", "rearrange_qkv", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrange)); + + mGlobalWorkSizeRearrg = {static_cast(ALIMAX(UP_DIV(seq_len_pack_q, 4), UP_DIV(seq_len_pack_kv, 4))), \ + static_cast(ALIMAX(UP_DIV(head_dim_pack_qk, 4), UP_DIV(head_dim_pack_v, 4))), 
\ + static_cast(batch*numHead)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[0]); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[1]); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[2]); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(query)); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(key)); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(value)); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempQ.get())); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempK.get())); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempV.get())); + ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->key()); + ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= mKernel_rearrange->get().setArg(index++, tile); + ret |= mKernel_rearrange->get().setArg(index++, shape); + ret |= mKernel_rearrange->get().setArg(index++, param); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_qkv"); + mLocalWorkSizeRearrg = localWS3DDefault(mGlobalWorkSizeRearrg, maxWorkGroupSize, runtime, "rearrange_qkv", mKernel_rearrange).first; + mGlobalWorkSizeRearrg[0] = ROUND_UP(mGlobalWorkSizeRearrg[0], std::max((uint32_t)1, mLocalWorkSizeRearrg[0])); + mGlobalWorkSizeRearrg[1] = ROUND_UP(mGlobalWorkSizeRearrg[1], std::max((uint32_t)1, mLocalWorkSizeRearrg[1])); + mGlobalWorkSizeRearrg[2] = ROUND_UP(mGlobalWorkSizeRearrg[2], std::max((uint32_t)1, mLocalWorkSizeRearrg[2])); + mOpenCLBackend->recordKernel3d(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg); + } + + // mask rearaange + { + std::set buildOption; + + int seq_len_pack_q = ROUND_UP(seq_len, mAlignQ); + int seq_len_pack_kv = ROUND_UP(mKv_seq_len, mAlignKV); + int shape[4] = {seq_len, mKv_seq_len, mAlignQ, mAlignKV}; + + mKernel_mask = runtime->buildKernel("attention_buf", "rearrange_mask", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_mask)); + + mGlobalWorkSizeMask = {static_cast(UP_DIV(seq_len_pack_q, 4)), \ + static_cast(UP_DIV(seq_len_pack_kv, 4)), \ + static_cast(batch)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[0]); + ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[1]); + ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[2]); + ret |= mKernel_mask->get().setArg(index++, openCLBuffer(mask)); + ret |= mKernel_mask->get().setArg(index++, openCLBuffer(mTempMask.get())); + ret |= mKernel_mask->get().setArg(index++, shape); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_mask"); + mLocalWorkSizeMask = localWS3DDefault(mGlobalWorkSizeMask, maxWorkGroupSize, runtime, "rearrange_mask", mKernel_mask).first; + mGlobalWorkSizeMask[0] = ROUND_UP(mGlobalWorkSizeMask[0], std::max((uint32_t)1, mLocalWorkSizeMask[0])); + mGlobalWorkSizeMask[1] = ROUND_UP(mGlobalWorkSizeMask[1], std::max((uint32_t)1, mLocalWorkSizeMask[1])); + mGlobalWorkSizeMask[2] = ROUND_UP(mGlobalWorkSizeMask[2], std::max((uint32_t)1, mLocalWorkSizeMask[2])); + mOpenCLBackend->recordKernel3d(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask); + } + + { + // Q : [batch*headNum, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenQ, mAlignQ)] -> [B, K, M] + // K : [batch*headNum/group, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenKV, mAlignKV)] -> [B, K, N] + // QV: 
[Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] -> [B, M, N] + int loop = batch * numHead; + int e_pack = ROUND_UP(seq_len, mAlignQ); + int h_pack = ROUND_UP(mKv_seq_len, mAlignKV); + int l_pack = ROUND_UP(headDim, mAlignHDK); + + std::set buildOptions; + + int biasType = 5;// int value mask + if(mIsAddMask) { + biasType = 2; + } + uint32_t layout = 14; // 10 means mix-precision, 4 means layput + auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)loop, (uint32_t)(biasType + 10*(group_size-1))}, {openCLBuffer(mTempQ.get()), openCLBuffer(mTempK.get()), openCLBuffer(mTempQK.get()), openCLBuffer(mTempMask.get())}, mOpenCLBackend->getOpenCLRuntime()); + + int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; + buildOptions.emplace("-DKWG=" + std::to_string(KWG)); + buildOptions.emplace("-DKWI=" + std::to_string(KWI)); + buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); + buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); + buildOptions.emplace("-DMWG=" + std::to_string(MWG)); + buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); + buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); + buildOptions.emplace("-DNWG=" + std::to_string(NWG)); + buildOptions.emplace("-DSA=" + std::to_string(SA)); + buildOptions.emplace("-DSB=" + std::to_string(SB)); + buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); + buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); + buildOptions.emplace("-DVWM=" + std::to_string(VWM)); + buildOptions.emplace("-DVWN=" + std::to_string(VWN)); + if(layout >= 4) { + buildOptions.emplace("-DOUTPUTMN"); + } + + int tileM = MWG; + int tileN = NWG; + int localM = MDIMC; + int localN = NDIMC; + + if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { + buildOptions.emplace("-DUSE_CL_MAD=1"); + buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); + } + buildOptions.emplace("-DONLY_HAVE_ALPHA"); + buildOptions.emplace("-DBIAS_TYPE=" + std::to_string(biasType)); + + buildOptions.emplace("-DPRECISION_COMPUTE=float -DCONVERT_PRECISION_COMPUTE=convert_float"); + buildOptions.emplace("-DPRECISION_COMPUTE2=float2 -DCONVERT_PRECISION_COMPUTE2=convert_float2"); + buildOptions.emplace("-DPRECISION_COMPUTE4=float4 -DCONVERT_PRECISION_COMPUTE4=convert_float4"); + buildOptions.emplace("-DPRECISION_COMPUTE8=float8 -DCONVERT_PRECISION_COMPUTE8=convert_float8"); + buildOptions.emplace("-DPRECISION_COMPUTE16=float16 -DCONVERT_PRECISION_COMPUTE16=convert_float16"); + + mKernel_qk = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); + + int out_per_thread_m = tileM / localM; + int out_per_thread_n = tileN / localN; + + mGlobalWorkSizeQk = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(loop)}; + mLocalWorkSizeQk = {static_cast(localM), static_cast(localN), 1}; + + float alpha = scale; + float beta = 0.0f; + int batch_offset_a = e_pack * l_pack; + int batch_offset_b = h_pack * l_pack; + int batch_offset_c = e_pack * h_pack; + + int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; + int stride[4] = {e_pack, h_pack, h_pack, h_pack}; + int group[4] = {1, group_size, 1, numHead}; + + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_qk->get().setArg(idx++, static_cast(e_pack)); + ret |= 
mKernel_qk->get().setArg(idx++, static_cast(h_pack)); + ret |= mKernel_qk->get().setArg(idx++, static_cast(l_pack)); + ret |= mKernel_qk->get().setArg(idx++, alpha); + ret |= mKernel_qk->get().setArg(idx++, beta); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempQ.get())); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempK.get())); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempMask.get())); + ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempQK.get())); + ret |= mKernel_qk->get().setArg(idx++, batch_offset); + ret |= mKernel_qk->get().setArg(idx++, stride); + ret |= mKernel_qk->get().setArg(idx++, group); + MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qk Kernel"); + mOpenCLBackend->recordKernel3d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk); + } + + // softmax + { + // QV: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] + // Sotmax: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] + // axis : 2 (last dim) + int softmaxShape[4]; + softmaxShape[0] = batch*numHead; + softmaxShape[1] = ROUND_UP(seq_len, mAlignQ); + softmaxShape[2] = ROUND_UP(mKv_seq_len, mAlignKV); + + auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(256)); + int localSize = getLocalSize(softmaxShape[2], MaxLocalSize); + if(localSize < 4){ + localSize = 1; + } + + std::set buildOption; + buildOption.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); + + mKernel_softmax = runtime->buildKernel("self_attention_buf", "softmax_inside", buildOption, inputs[0], outputs[0]); + mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(softmaxShape[1]), static_cast(softmaxShape[0])}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[0]); + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[1]); + ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[2]); + ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_softmax->get().setArg(index++, mKv_seq_len); + ret |= mKernel_softmax->get().setArg(index++, softmaxShape); + MNN_CHECK_CL_SUCCESS(ret, "setArg Attention softmax"); + + mLocalWorkSizeSoftMax = {static_cast(localSize), 1, 1}; + mOpenCLBackend->recordKernel3d(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax); + } + { + // Sotmax: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] + // Trans: [Batch * numHead, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(seqLenQ, mAlignQ)] + int loop = batch * numHead; + int transDimW = ROUND_UP(seq_len, mAlignQ); + int transDimH = ROUND_UP(mKv_seq_len, mAlignKV); + + std::set buildOptions; + mKernel_trans = runtime->buildKernel("self_attention_buf", "trans_3d_buf", buildOptions, inputs[0], outputs[0]); + uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mKernel_trans)); + + mGlobalWorkSizeTrans = {(uint32_t)transDimW/8, (uint32_t)transDimH/8, (uint32_t)(loop)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[0]); + ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[1]); + ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[2]); + ret |= mKernel_trans->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= 
mKernel_trans->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_trans->get().setArg(index++, loop); + ret |= mKernel_trans->get().setArg(index++, transDimW); + ret |= mKernel_trans->get().setArg(index++, transDimH); + MNN_CHECK_CL_SUCCESS(ret, "setArg Attention transpose"); + mLocalWorkSizeTrans = localWS3DDefault(mGlobalWorkSizeTrans, maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), "trans_3d_buf", mKernel_trans).first; + + mGlobalWorkSizeTrans[0] = ROUND_UP(mGlobalWorkSizeTrans[0], std::max((uint32_t)1, mLocalWorkSizeTrans[0])); + mGlobalWorkSizeTrans[1] = ROUND_UP(mGlobalWorkSizeTrans[1], std::max((uint32_t)1, mLocalWorkSizeTrans[1])); + mGlobalWorkSizeTrans[2] = ROUND_UP(mGlobalWorkSizeTrans[2], std::max((uint32_t)1, mLocalWorkSizeTrans[2])); + + mOpenCLBackend->recordKernel3d(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans); + } + + // qk * value + { + // Trans: [Batch * numHead, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(seqLenQ, mAlignQ)] -> [B, K, M] + // V : [Batch * numHead / group, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(headDim, mAlignHDN)] -> [B, K, N] + // QKV : [Batch * numHead, ROUND_UP(headDim, mAlignHDN), ROUND_UP(seqLenQ, mAlignQ)] -> [B, N, M] + + int loop = batch * numHead; + int e_pack = ROUND_UP(seq_len, mAlignQ); + int l_pack = ROUND_UP(mKv_seq_len, mAlignKV); + int h_pack = ROUND_UP(headDim, mAlignHDN); + + std::set buildOptions; + + uint32_t layout = 0; + auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)loop, (uint32_t)0}, {openCLBuffer(mTempQK.get()), openCLBuffer(mTempV.get()), openCLBuffer(mTempQKV.get())}, mOpenCLBackend->getOpenCLRuntime()); + + int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; + buildOptions.emplace("-DKWG=" + std::to_string(KWG)); + buildOptions.emplace("-DKWI=" + std::to_string(KWI)); + buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); + buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); + buildOptions.emplace("-DMWG=" + std::to_string(MWG)); + buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); + buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); + buildOptions.emplace("-DNWG=" + std::to_string(NWG)); + buildOptions.emplace("-DSA=" + std::to_string(SA)); + buildOptions.emplace("-DSB=" + std::to_string(SB)); + buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); + buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); + buildOptions.emplace("-DVWM=" + std::to_string(VWM)); + buildOptions.emplace("-DVWN=" + std::to_string(VWN)); + if(layout >= 4) { + buildOptions.emplace("-DOUTPUTMN"); + } + + int tileM = MWG; + int tileN = NWG; + int localM = MDIMC; + int localN = NDIMC; + + if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { + buildOptions.emplace("-DUSE_CL_MAD=1"); + buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); + } + + mKernel_qkv = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); + + int out_per_thread_m = tileM / localM; + int out_per_thread_n = tileN / localN; + + mGlobalWorkSizeQkv = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(loop)}; + mLocalWorkSizeQkv = {static_cast(localM), static_cast(localN), 1}; + + float alpha = 1.0f; + float beta = 0.0f; + int batch_offset_a = e_pack * l_pack; + int batch_offset_b = h_pack * l_pack; + int 
batch_offset_c = e_pack * h_pack; + int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; + int stride[4] = {e_pack, h_pack, e_pack, h_pack}; + int group[4] = {1, group_size, 1, numHead}; + + int idx = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_qkv->get().setArg(idx++, static_cast(e_pack)); + ret |= mKernel_qkv->get().setArg(idx++, static_cast(h_pack)); + ret |= mKernel_qkv->get().setArg(idx++, static_cast(l_pack)); + ret |= mKernel_qkv->get().setArg(idx++, alpha); + ret |= mKernel_qkv->get().setArg(idx++, beta); + ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempQK.get())); + ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempV.get())); + ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempQKV.get())); + ret |= mKernel_qkv->get().setArg(idx++, batch_offset); + ret |= mKernel_qkv->get().setArg(idx++, stride); + ret |= mKernel_qkv->get().setArg(idx++, group); + MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qkv Kernel"); + mOpenCLBackend->recordKernel3d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv); + } + + // transpose to output + { + // QKV : [Batch * numHead, ROUND_UP(headDim, mAlignHDN), ROUND_UP(seqLenQ, mAlignQ)] -> [B, N, M] + // output: [batch, seqLenQ/4, headNum, headDim, seqLenQ_4] + std::set buildOption; + + mKernel_clip = runtime->buildKernel("attention_buf", "qkv_transpose_output", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_clip)); + + mGlobalWorkSizeClip = {static_cast(UP_DIV(seq_len, 4)), static_cast(UP_DIV(headDim, 4)), static_cast(batch*numHead)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[0]); + ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[1]); + ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[2]); + ret |= mKernel_clip->get().setArg(index++, openCLBuffer(mTempQKV.get())); + ret |= mKernel_clip->get().setArg(index++, openCLBuffer(outputs[0])); + ret |= mKernel_clip->get().setArg(index++, mAlignQ); + ret |= mKernel_clip->get().setArg(index++, mAlignHDN); + ret |= mKernel_clip->get().setArg(index++, seq_len); + ret |= mKernel_clip->get().setArg(index++, numHead); + ret |= mKernel_clip->get().setArg(index++, headDim); + + mLocalWorkSizeClip = localWS3DDefault(mGlobalWorkSizeClip, maxWorkGroupSize, runtime, "qkv_transpose_output", mKernel_clip).first; + mGlobalWorkSizeClip[0] = ROUND_UP(mGlobalWorkSizeClip[0], std::max((uint32_t)1, mLocalWorkSizeClip[0])); + mGlobalWorkSizeClip[1] = ROUND_UP(mGlobalWorkSizeClip[1], std::max((uint32_t)1, mLocalWorkSizeClip[1])); + mGlobalWorkSizeClip[2] = ROUND_UP(mGlobalWorkSizeClip[2], std::max((uint32_t)1, mLocalWorkSizeClip[2])); + + MNN_CHECK_CL_SUCCESS(ret, "setArg qkv_transpose_output"); + mOpenCLBackend->recordKernel3d(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip); + } + mOpenCLBackend->endRecord(mRecording); + + return NO_ERROR; +} ErrorCode AttentionBufExecution::onResize(const std::vector &inputs, const std::vector &outputs) { mOpenCLBackend->startRecord(mRecording); //clear update arg vector, if prefill and decode use the same one mOpRecordUpdateInfo.clear(); + mRgUpdateInfo.update_kernel_args.clear(); + mRgUpdateInfo.update_global_size.clear(); + mRgUpdateInfo.update_local_size.clear(); mQkUpdateInfo.update_kernel_args.clear(); mQkUpdateInfo.update_global_size.clear(); mQkUpdateInfo.update_local_size.clear(); mSoftMaxUpdateInfo.update_kernel_args.clear(); 
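    // These per-kernel *UpdateInfo structs feed the recorded-kernel replay path:
    // each update_kernel_args entry is a {record slot, kernel arg index, size,
    // pointer} tuple (field meanings inferred from the push_back calls later in
    // this function). They let values that change on every decode step, such as
    // mKv_seq_len, mMax_len and the cl_mem handles handed out by mKVCacheCLManager
    // after a KV-cache reallocation, be refreshed in place instead of re-recording
    // the kernels. A minimal sketch of registering one entry, assuming that layout:
    //   mRgUpdateInfo.update_kernel_args.push_back(
    //       {0 /*record slot*/, 5 /*arg index*/, sizeof(mKv_seq_len), &mKv_seq_len});
    //   mOpRecordUpdateInfo.emplace_back(&mRgUpdateInfo);
    // The vectors are cleared here because prefill and decode may reuse the same
    // execution object, so entries recorded for the previous shape must not leak in.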
mSoftMaxUpdateInfo.update_global_size.clear(); mSoftMaxUpdateInfo.update_local_size.clear(); + mRgVUpdateInfo.update_kernel_args.clear(); + mRgVUpdateInfo.update_global_size.clear(); + mRgVUpdateInfo.update_local_size.clear(); mQkvUpdateInfo.update_kernel_args.clear(); mQkvUpdateInfo.update_global_size.clear(); mQkvUpdateInfo.update_local_size.clear(); @@ -128,6 +574,9 @@ ErrorCode AttentionBufExecution::onResize(const std::vector &inputs, c auto key = inputs[1]; auto value = inputs[2]; auto mask = inputs[3]; + auto mask_shape = mask->shape(); + int mask_seqlen = mask_shape[2]; + int mask_kvlen = mask_shape[3]; auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto shape = query->shape(); @@ -139,270 +588,154 @@ ErrorCode AttentionBufExecution::onResize(const std::vector &inputs, c int group_size = numHead / kvNumHead; float scale = 1.0 / sqrt(headDim); mIsDecode = seq_len == 1; + mIsFirstPrefill = (!mIsDecode) && (mask_kvlen == mask_seqlen); mIsAddMask = (mask->getType() == halide_type_of()); mLongPrefill = false; + if(seq_len > 512 && mIsFirstPrefill) { + mLongPrefill = true; + return longPrefillResize(inputs, outputs); + } if(false == mIsDecode){ - mKVCacheCLManager->setArgs(seq_len, numHead, kvNumHead, headDim); - mKVCacheCLManager->allocKVCache(); - - if(seq_len > 512) { - mLongPrefill = true; - mAlignQ = 128; - mAlignKV = 128; - mAlignHDK = 4; - mAlignHDN = 128; - - mTempQ.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(headDim, mAlignHDK) * batch * numHead})); - mTempK.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignKV) * ROUND_UP(headDim, mAlignHDK) * batch * numHead})); - mTempV.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignKV) * ROUND_UP(headDim, mAlignHDN) * batch * numHead})); - if(mIsAddMask) { - mTempMask.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch})); - } else { - mTempMask.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch})); - } - mTempQK.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch * numHead})); - mTempSoftMax.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(seq_len, mAlignKV) * batch * numHead})); - mTempQKV.reset(Tensor::createDevice({ROUND_UP(seq_len, mAlignQ) * ROUND_UP(headDim, mAlignHDN) * batch * numHead})); - - } else { - mTempQK.reset(Tensor::createDevice({UP_DIV(seq_len, 4) * seq_len * numHead * 4})); - mTempSoftMax.reset(Tensor::createDevice({UP_DIV(seq_len, 4) * seq_len * numHead * 4})); + mKVCacheCLManager->setArgs(mask_kvlen, numHead, kvNumHead, headDim); + if(mIsFirstPrefill){ + mKVCacheCLManager->allocKVCache(); + } else{ + mKVCacheCLManager->reallocKVCache(); } - mKv_seq_len = mKVCacheCLManager->kvLength(); + mTempQ.reset(Tensor::createDevice({ROUND_UP(seq_len, 4) * ROUND_UP(headDim, 4) * numHead})); + mTempQK.reset(Tensor::createDevice({ROUND_UP(seq_len, 4) * mask_kvlen * numHead})); + mTempSoftMax.reset(Tensor::createDevice({ROUND_UP(seq_len, 4) * mask_kvlen * numHead})); + + mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + mOpenCLBackend->onAcquireBuffer(mTempQ.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempQ.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); } else { mKv_seq_len = mKVCacheCLManager->kvLength() + 1; int maxLength = 
mKVCacheCLManager->maxLength(); mTempQK.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); mTempSoftMax.reset(Tensor::createDevice({UP_DIV(maxLength, 4) * numHead * 4})); - } - - if(mLongPrefill) { - mOpenCLBackend->onAcquireBuffer(mTempQ.get(), Backend::DYNAMIC); - mOpenCLBackend->onAcquireBuffer(mTempK.get(), Backend::DYNAMIC); - mOpenCLBackend->onAcquireBuffer(mTempV.get(), Backend::DYNAMIC); - mOpenCLBackend->onAcquireBuffer(mTempMask.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); - - mOpenCLBackend->onReleaseBuffer(mTempQ.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mTempK.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mTempMask.get(), Backend::DYNAMIC); - mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); - - mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); - - mOpenCLBackend->onAcquireBuffer(mTempQKV.get(), Backend::DYNAMIC); - - mOpenCLBackend->onReleaseBuffer(mTempV.get(), Backend::DYNAMIC); mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mTempQKV.get(), Backend::DYNAMIC); - - } else { - mOpenCLBackend->onAcquireBuffer(mTempQK.get(), Backend::DYNAMIC); - mOpenCLBackend->onAcquireBuffer(mTempSoftMax.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); } - - - if(mLongPrefill) { - // query: [batch, seqLenQ, headNum, headDim] -> mTempQ: [batch*headNum, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenQ, mAlignQ)] - // key: [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] -> mTempK: [batch*headNum/group, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenKV, mAlignKV)] - // value: [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] -> mTempV: [batch*headNum/group, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(headDim, mAlignHDK] - // key & value -> pastKey & pastValue (copy) + mMax_len = ROUND_UP(mKVCacheCLManager->maxLength(), 4); + if(false == mIsDecode){ + int past_len = mIsFirstPrefill ? 
0 : mask_kvlen - mask_seqlen; { + // rearrange query std::set buildOption; - if((headDim % 4) != 0){ - buildOption.emplace("-DHEADDIM_LEAVE"); - } - if((seq_len % 4) != 0){ - buildOption.emplace("-DSEQLEN_LEAVE"); - } - - int seq_len_pack_q = ROUND_UP(seq_len, mAlignQ); - int seq_len_pack_kv = ROUND_UP(mKv_seq_len, mAlignKV); - int head_dim_pack_qk = ROUND_UP(headDim, mAlignHDK); - int head_dim_pack_v = ROUND_UP(headDim, mAlignHDN); - - int tile[4] = {mAlignQ, mAlignKV, mAlignHDK, mAlignHDN}; - int shape[4] = {seq_len, mKv_seq_len, numHead, headDim}; - int param[4] = {group_size, batch, 0, 0}; - mKernel_rearrange = runtime->buildKernel("attention_buf", "rearrange_qkv", buildOption, inputs[0], outputs[0]); + mKernel_rearrangeQ = runtime->buildKernel("attention_buf", "rearrange_q", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrangeQ)); + + mGlobalWorkSizeRearrgQ = {static_cast(UP_DIV(seq_len, 4)), \ + static_cast(UP_DIV(headDim, 4)), \ + static_cast(numHead)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_rearrangeQ->get().setArg(index++, mGlobalWorkSizeRearrgQ[0]); + ret |= mKernel_rearrangeQ->get().setArg(index++, mGlobalWorkSizeRearrgQ[1]); + ret |= mKernel_rearrangeQ->get().setArg(index++, mGlobalWorkSizeRearrgQ[2]); + ret |= mKernel_rearrangeQ->get().setArg(index++, openCLBuffer(query)); + ret |= mKernel_rearrangeQ->get().setArg(index++, openCLBuffer(mTempQ.get())); + ret |= mKernel_rearrangeQ->get().setArg(index++, seq_len); + ret |= mKernel_rearrangeQ->get().setArg(index++, headDim); + ret |= mKernel_rearrangeQ->get().setArg(index++, numHead); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_q"); + mLocalWorkSizeRearrgQ = localWS3DDefault(mGlobalWorkSizeRearrgQ, maxWorkGroupSize, runtime, "rearrange_q", mKernel_rearrangeQ).first; + mGlobalWorkSizeRearrgQ[0] = ROUND_UP(mGlobalWorkSizeRearrgQ[0], std::max((uint32_t)1, mLocalWorkSizeRearrgQ[0])); + mGlobalWorkSizeRearrgQ[1] = ROUND_UP(mGlobalWorkSizeRearrgQ[1], std::max((uint32_t)1, mLocalWorkSizeRearrgQ[1])); + mGlobalWorkSizeRearrgQ[2] = ROUND_UP(mGlobalWorkSizeRearrgQ[2], std::max((uint32_t)1, mLocalWorkSizeRearrgQ[2])); + mOpenCLBackend->recordKernel3d(mKernel_rearrangeQ, mGlobalWorkSizeRearrgQ, mLocalWorkSizeRearrgQ); + } + { + // rearrange key + std::set buildOption; + + buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); + mKernel_rearrange = runtime->buildKernel("attention_buf", "rearrange_k", buildOption, inputs[0], outputs[0]); auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrange)); - mGlobalWorkSizeRearrg = {static_cast(ALIMAX(UP_DIV(seq_len_pack_q, 4), UP_DIV(seq_len_pack_kv, 4))), \ - static_cast(ALIMAX(UP_DIV(head_dim_pack_qk, 4), UP_DIV(head_dim_pack_v, 4))), \ - static_cast(batch*numHead)}; + mGlobalWorkSizeRearrg = {static_cast(UP_DIV(seq_len, 4)), \ + static_cast(UP_DIV(headDim, 4)), \ + static_cast(kvNumHead)}; uint32_t index = 0; cl_int ret = CL_SUCCESS; ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[0]); ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[1]); ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[2]); - ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(query)); ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(key)); - ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(value)); - ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempQ.get())); - ret |= 
mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempK.get())); - ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(mTempV.get())); ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->key()); - ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->value()); - ret |= mKernel_rearrange->get().setArg(index++, tile); - ret |= mKernel_rearrange->get().setArg(index++, shape); - ret |= mKernel_rearrange->get().setArg(index++, param); + ret |= mKernel_rearrange->get().setArg(index++, past_len); + ret |= mKernel_rearrange->get().setArg(index++, mMax_len); + ret |= mKernel_rearrange->get().setArg(index++, seq_len); + ret |= mKernel_rearrange->get().setArg(index++, kvNumHead); + ret |= mKernel_rearrange->get().setArg(index++, numHead); + ret |= mKernel_rearrange->get().setArg(index++, headDim); - MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_qkv"); - mLocalWorkSizeRearrg = localWS3DDefault(mGlobalWorkSizeRearrg, maxWorkGroupSize, runtime, "rearrange_qkv", mKernel_rearrange).first; + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_k"); + mLocalWorkSizeRearrg = localWS3DDefault(mGlobalWorkSizeRearrg, maxWorkGroupSize, runtime, "rearrange_k", mKernel_rearrange).first; mGlobalWorkSizeRearrg[0] = ROUND_UP(mGlobalWorkSizeRearrg[0], std::max((uint32_t)1, mLocalWorkSizeRearrg[0])); mGlobalWorkSizeRearrg[1] = ROUND_UP(mGlobalWorkSizeRearrg[1], std::max((uint32_t)1, mLocalWorkSizeRearrg[1])); mGlobalWorkSizeRearrg[2] = ROUND_UP(mGlobalWorkSizeRearrg[2], std::max((uint32_t)1, mLocalWorkSizeRearrg[2])); mOpenCLBackend->recordKernel3d(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg); } - - // mask rearaange { + // matmul qk std::set buildOption; - - int seq_len_pack_q = ROUND_UP(seq_len, mAlignQ); - int seq_len_pack_kv = ROUND_UP(mKv_seq_len, mAlignKV); - int shape[4] = {seq_len, mKv_seq_len, mAlignQ, mAlignKV}; - - mKernel_mask = runtime->buildKernel("attention_buf", "rearrange_mask", buildOption, inputs[0], outputs[0]); - auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_mask)); + if(mask->getType() == halide_type_of()){ + buildOption.emplace("-DADD_MASK"); + } + buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); + mKernel_qk = runtime->buildKernel("attention_buf", "matmul_qk_div_mask_prefill", buildOption, inputs[0], outputs[0]); + mGlobalWorkSizeQk = {static_cast(UP_DIV(seq_len, 4)), static_cast(UP_DIV(mask_kvlen, 4)), static_cast(numHead)}; + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qk)); - mGlobalWorkSizeMask = {static_cast(UP_DIV(seq_len_pack_q, 4)), \ - static_cast(UP_DIV(seq_len_pack_kv, 4)), \ - static_cast(batch)}; - uint32_t index = 0; cl_int ret = CL_SUCCESS; - ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[0]); - ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[1]); - ret |= mKernel_mask->get().setArg(index++, mGlobalWorkSizeMask[2]); - ret |= mKernel_mask->get().setArg(index++, openCLBuffer(mask)); - ret |= mKernel_mask->get().setArg(index++, openCLBuffer(mTempMask.get())); - ret |= mKernel_mask->get().setArg(index++, shape); - - MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_mask"); - mLocalWorkSizeMask = localWS3DDefault(mGlobalWorkSizeMask, maxWorkGroupSize, runtime, "rearrange_mask", mKernel_mask).first; - mGlobalWorkSizeMask[0] = ROUND_UP(mGlobalWorkSizeMask[0], std::max((uint32_t)1, mLocalWorkSizeMask[0])); - mGlobalWorkSizeMask[1] = ROUND_UP(mGlobalWorkSizeMask[1], std::max((uint32_t)1, mLocalWorkSizeMask[1])); - 
mGlobalWorkSizeMask[2] = ROUND_UP(mGlobalWorkSizeMask[2], std::max((uint32_t)1, mLocalWorkSizeMask[2])); - mOpenCLBackend->recordKernel3d(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask); - } - - { - // Q : [batch*headNum, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenQ, mAlignQ)] -> [B, K, M] - // K : [batch*headNum/group, ROUND_UP(headDim, mAlignHDK), ROUND_UP(seqLenKV, mAlignKV)] -> [B, K, N] - // QV: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] -> [B, M, N] - int loop = batch * numHead; - int e_pack = ROUND_UP(seq_len, mAlignQ); - int h_pack = ROUND_UP(mKv_seq_len, mAlignKV); - int l_pack = ROUND_UP(headDim, mAlignHDK); - - std::set buildOptions; - - int biasType = 5;// int value mask - if(mIsAddMask) { - biasType = 2; - } - uint32_t layout = 14; // 10 means mix-precision, 4 means layput - auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)loop, (uint32_t)(biasType + 10*(group_size-1))}, {openCLBuffer(mTempQ.get()), openCLBuffer(mTempK.get()), openCLBuffer(mTempQK.get()), openCLBuffer(mTempMask.get())}, mOpenCLBackend->getOpenCLRuntime()); - - int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; - buildOptions.emplace("-DKWG=" + std::to_string(KWG)); - buildOptions.emplace("-DKWI=" + std::to_string(KWI)); - buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); - buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); - buildOptions.emplace("-DMWG=" + std::to_string(MWG)); - buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); - buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); - buildOptions.emplace("-DNWG=" + std::to_string(NWG)); - buildOptions.emplace("-DSA=" + std::to_string(SA)); - buildOptions.emplace("-DSB=" + std::to_string(SB)); - buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); - buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); - buildOptions.emplace("-DVWM=" + std::to_string(VWM)); - buildOptions.emplace("-DVWN=" + std::to_string(VWN)); - if(layout >= 4) { - buildOptions.emplace("-DOUTPUTMN"); - } - - int tileM = MWG; - int tileN = NWG; - int localM = MDIMC; - int localN = NDIMC; - - if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { - buildOptions.emplace("-DUSE_CL_MAD=1"); - buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); - } - buildOptions.emplace("-DONLY_HAVE_ALPHA"); - buildOptions.emplace("-DBIAS_TYPE=" + std::to_string(biasType)); - - buildOptions.emplace("-DPRECISION_COMPUTE=float -DCONVERT_PRECISION_COMPUTE=convert_float"); - buildOptions.emplace("-DPRECISION_COMPUTE2=float2 -DCONVERT_PRECISION_COMPUTE2=convert_float2"); - buildOptions.emplace("-DPRECISION_COMPUTE4=float4 -DCONVERT_PRECISION_COMPUTE4=convert_float4"); - buildOptions.emplace("-DPRECISION_COMPUTE8=float8 -DCONVERT_PRECISION_COMPUTE8=convert_float8"); - buildOptions.emplace("-DPRECISION_COMPUTE16=float16 -DCONVERT_PRECISION_COMPUTE16=convert_float16"); - - mKernel_qk = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); - - int out_per_thread_m = tileM / localM; - int out_per_thread_n = tileN / localN; - - mGlobalWorkSizeQk = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(loop)}; - mLocalWorkSizeQk = {static_cast(localM), static_cast(localN), 1}; - - float alpha = scale; - float beta = 0.0f; - int batch_offset_a = 
e_pack * l_pack; - int batch_offset_b = h_pack * l_pack; - int batch_offset_c = e_pack * h_pack; - - int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; - int stride[4] = {e_pack, h_pack, h_pack, h_pack}; - int group[4] = {1, group_size, 1, numHead}; + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[0]); + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[1]); + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[2]); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mTempQ.get())); + ret |= mKernel_qk->get().setArg(index++, *mKVCacheCLManager->key()); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mask)); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mTempQK.get())); + ret |= mKernel_qk->get().setArg(index++, scale); + ret |= mKernel_qk->get().setArg(index++, seq_len); + ret |= mKernel_qk->get().setArg(index++, mask_kvlen); + ret |= mKernel_qk->get().setArg(index++, mMax_len); + ret |= mKernel_qk->get().setArg(index++, numHead); + ret |= mKernel_qk->get().setArg(index++, headDim); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qk_div_mask_prefill"); - int idx = 0; - cl_int ret = CL_SUCCESS; - ret |= mKernel_qk->get().setArg(idx++, static_cast(e_pack)); - ret |= mKernel_qk->get().setArg(idx++, static_cast(h_pack)); - ret |= mKernel_qk->get().setArg(idx++, static_cast(l_pack)); - ret |= mKernel_qk->get().setArg(idx++, alpha); - ret |= mKernel_qk->get().setArg(idx++, beta); - ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempQ.get())); - ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempK.get())); - ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempMask.get())); - ret |= mKernel_qk->get().setArg(idx++, openCLBuffer(mTempQK.get())); - ret |= mKernel_qk->get().setArg(idx++, batch_offset); - ret |= mKernel_qk->get().setArg(idx++, stride); - ret |= mKernel_qk->get().setArg(idx++, group); - MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qk Kernel"); + mLocalWorkSizeQk = localWS3DDefault(mGlobalWorkSizeQk, maxWorkGroupSize, runtime, "matmul_qk_div_mask_prefill", mKernel_qk).first; + mGlobalWorkSizeQk[0] = ROUND_UP(mGlobalWorkSizeQk[0], std::max((uint32_t)1, mLocalWorkSizeQk[0])); + mGlobalWorkSizeQk[1] = ROUND_UP(mGlobalWorkSizeQk[1], std::max((uint32_t)1, mLocalWorkSizeQk[1])); + mGlobalWorkSizeQk[2] = ROUND_UP(mGlobalWorkSizeQk[2], std::max((uint32_t)1, mLocalWorkSizeQk[2])); mOpenCLBackend->recordKernel3d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk); } - - // softmax { - // QV: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] - // Sotmax: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] - // axis : 2 (last dim) - int softmaxShape[4]; - softmaxShape[0] = batch*numHead; - softmaxShape[1] = ROUND_UP(seq_len, mAlignQ); - softmaxShape[2] = ROUND_UP(mKv_seq_len, mAlignKV); - - auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(256)); - int localSize = getLocalSize(softmaxShape[2], MaxLocalSize); + // softmax + int inside = ROUND_UP(seq_len, 4); + int outside = numHead; + int localSize = getLocalSize(mask_kvlen, 128); if(localSize < 4){ localSize = 1; } std::set buildOption; buildOption.emplace("-DSOFTMAX_LOCAL_SIZE=" + std::to_string(localSize)); - - mKernel_softmax = runtime->buildKernel("self_attention_buf", "softmax_inside", buildOption, inputs[0], outputs[0]); - mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(softmaxShape[1]), static_cast(softmaxShape[0])}; + 
mKernel_softmax = runtime->buildKernel("softmax_buf", "softmax_v4_buf", buildOption); + mGlobalWorkSizeSoftMax = {static_cast(localSize), static_cast(UP_DIV(inside, 4)), static_cast(outside)}; + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_softmax)); uint32_t index = 0; cl_int ret = CL_SUCCESS; @@ -411,217 +744,163 @@ ErrorCode AttentionBufExecution::onResize(const std::vector &inputs, c ret |= mKernel_softmax->get().setArg(index++, mGlobalWorkSizeSoftMax[2]); ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempQK.get())); ret |= mKernel_softmax->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_softmax->get().setArg(index++, mKv_seq_len); - ret |= mKernel_softmax->get().setArg(index++, softmaxShape); - MNN_CHECK_CL_SUCCESS(ret, "setArg Attention softmax"); + ret |= mKernel_softmax->get().setArg(index++, inside); + ret |= mKernel_softmax->get().setArg(index++, outside); + ret |= mKernel_softmax->get().setArg(index++, mask_kvlen); + MNN_CHECK_CL_SUCCESS(ret, "setArg softmax"); mLocalWorkSizeSoftMax = {static_cast(localSize), 1, 1}; + if(localSize == 1){ + mLocalWorkSizeSoftMax = localWS3DDefault(mGlobalWorkSizeSoftMax, maxWorkGroupSize, runtime, "softmax", mKernel_softmax).first; + } + mGlobalWorkSizeSoftMax[0] = ROUND_UP(mGlobalWorkSizeSoftMax[0], std::max((uint32_t)1, mLocalWorkSizeSoftMax[0])); + mGlobalWorkSizeSoftMax[1] = ROUND_UP(mGlobalWorkSizeSoftMax[1], std::max((uint32_t)1, mLocalWorkSizeSoftMax[1])); + mGlobalWorkSizeSoftMax[2] = ROUND_UP(mGlobalWorkSizeSoftMax[2], std::max((uint32_t)1, mLocalWorkSizeSoftMax[2])); mOpenCLBackend->recordKernel3d(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax); } { - // Sotmax: [Batch * numHead, ROUND_UP(seqLenQ, mAlignQ), ROUND_UP(seqLenKV, mAlignKV)] - // Trans: [Batch * numHead, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(seqLenQ, mAlignQ)] - int loop = batch * numHead; - int transDimW = ROUND_UP(seq_len, mAlignQ); - int transDimH = ROUND_UP(mKv_seq_len, mAlignKV); + // rearrange value + std::set buildOption; - std::set buildOptions; - mKernel_trans = runtime->buildKernel("self_attention_buf", "trans_3d_buf", buildOptions, inputs[0], outputs[0]); - uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(mKernel_trans)); - - mGlobalWorkSizeTrans = {(uint32_t)transDimW/8, (uint32_t)transDimH/8, (uint32_t)(loop)}; + buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); + mKernel_rearrangeV = runtime->buildKernel("attention_buf", "rearrange_v", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrangeV)); + mGlobalWorkSizeRearrgV = {static_cast(UP_DIV(headDim, 4)), \ + static_cast(UP_DIV(seq_len, 4)), \ + static_cast(kvNumHead)}; + uint32_t index = 0; cl_int ret = CL_SUCCESS; - ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[0]); - ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[1]); - ret |= mKernel_trans->get().setArg(index++, mGlobalWorkSizeTrans[2]); - ret |= mKernel_trans->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); - ret |= mKernel_trans->get().setArg(index++, openCLBuffer(mTempQK.get())); - ret |= mKernel_trans->get().setArg(index++, loop); - ret |= mKernel_trans->get().setArg(index++, transDimW); - ret |= mKernel_trans->get().setArg(index++, transDimH); - MNN_CHECK_CL_SUCCESS(ret, "setArg Attention transpose"); - mLocalWorkSizeTrans = localWS3DDefault(mGlobalWorkSizeTrans, maxWorkGroupSize, 
mOpenCLBackend->getOpenCLRuntime(), "trans_3d_buf", mKernel_trans).first; - - mGlobalWorkSizeTrans[0] = ROUND_UP(mGlobalWorkSizeTrans[0], std::max((uint32_t)1, mLocalWorkSizeTrans[0])); - mGlobalWorkSizeTrans[1] = ROUND_UP(mGlobalWorkSizeTrans[1], std::max((uint32_t)1, mLocalWorkSizeTrans[1])); - mGlobalWorkSizeTrans[2] = ROUND_UP(mGlobalWorkSizeTrans[2], std::max((uint32_t)1, mLocalWorkSizeTrans[2])); + ret |= mKernel_rearrangeV->get().setArg(index++, mGlobalWorkSizeRearrgV[0]); + ret |= mKernel_rearrangeV->get().setArg(index++, mGlobalWorkSizeRearrgV[1]); + ret |= mKernel_rearrangeV->get().setArg(index++, mGlobalWorkSizeRearrgV[2]); + ret |= mKernel_rearrangeV->get().setArg(index++, openCLBuffer(value)); + ret |= mKernel_rearrangeV->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= mKernel_rearrangeV->get().setArg(index++, past_len); + ret |= mKernel_rearrangeV->get().setArg(index++, mMax_len); + ret |= mKernel_rearrangeV->get().setArg(index++, seq_len); + ret |= mKernel_rearrangeV->get().setArg(index++, kvNumHead); + ret |= mKernel_rearrangeV->get().setArg(index++, headDim); - mOpenCLBackend->recordKernel3d(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans); + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_v"); + mLocalWorkSizeRearrgV = localWS3DDefault(mGlobalWorkSizeRearrgV, maxWorkGroupSize, runtime, "rearrange_v", mKernel_rearrangeV).first; + mGlobalWorkSizeRearrgV[0] = ROUND_UP(mGlobalWorkSizeRearrgV[0], std::max((uint32_t)1, mLocalWorkSizeRearrgV[0])); + mGlobalWorkSizeRearrgV[1] = ROUND_UP(mGlobalWorkSizeRearrgV[1], std::max((uint32_t)1, mLocalWorkSizeRearrgV[1])); + mGlobalWorkSizeRearrgV[2] = ROUND_UP(mGlobalWorkSizeRearrgV[2], std::max((uint32_t)1, mLocalWorkSizeRearrgV[2])); + mOpenCLBackend->recordKernel3d(mKernel_rearrangeV, mGlobalWorkSizeRearrgV, mLocalWorkSizeRearrgV); } - // qk * value { - // Trans: [Batch * numHead, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(seqLenQ, mAlignQ)] -> [B, K, M] - // V : [Batch * numHead / group, ROUND_UP(seqLenKV, mAlignKV), ROUND_UP(headDim, mAlignHDN)] -> [B, K, N] - // QKV : [Batch * numHead, ROUND_UP(headDim, mAlignHDN), ROUND_UP(seqLenQ, mAlignQ)] -> [B, N, M] - - int loop = batch * numHead; - int e_pack = ROUND_UP(seq_len, mAlignQ); - int l_pack = ROUND_UP(mKv_seq_len, mAlignKV); - int h_pack = ROUND_UP(headDim, mAlignHDN); - - std::set buildOptions; - - uint32_t layout = 0; - auto param = getGemmParams({(uint32_t)e_pack, (uint32_t)h_pack, (uint32_t)l_pack, layout, (uint32_t)loop, (uint32_t)0}, {openCLBuffer(mTempQK.get()), openCLBuffer(mTempV.get()), openCLBuffer(mTempQKV.get())}, mOpenCLBackend->getOpenCLRuntime()); - - int KWG=param[0], KWI=param[1], MDIMA=param[2], MDIMC=param[3], MWG=param[4], NDIMB=param[5], NDIMC=param[6], NWG=param[7], SA=param[8], SB=param[9], STRM=param[10], STRN=param[11], VWM=param[12], VWN=param[13]; - buildOptions.emplace("-DKWG=" + std::to_string(KWG)); - buildOptions.emplace("-DKWI=" + std::to_string(KWI)); - buildOptions.emplace("-DMDIMA=" + std::to_string(MDIMA)); - buildOptions.emplace("-DMDIMC=" + std::to_string(MDIMC)); - buildOptions.emplace("-DMWG=" + std::to_string(MWG)); - buildOptions.emplace("-DNDIMB=" + std::to_string(NDIMB)); - buildOptions.emplace("-DNDIMC=" + std::to_string(NDIMC)); - buildOptions.emplace("-DNWG=" + std::to_string(NWG)); - buildOptions.emplace("-DSA=" + std::to_string(SA)); - buildOptions.emplace("-DSB=" + std::to_string(SB)); - buildOptions.emplace("-DSTRM=" + std::to_string(STRM)); - buildOptions.emplace("-DSTRN=" + std::to_string(STRN)); - 
buildOptions.emplace("-DVWM=" + std::to_string(VWM)); - buildOptions.emplace("-DVWN=" + std::to_string(VWN)); - if(layout >= 4) { - buildOptions.emplace("-DOUTPUTMN"); - } - - int tileM = MWG; - int tileN = NWG; - int localM = MDIMC; - int localN = NDIMC; - - if(mOpenCLBackend->getOpenCLRuntime()->getGpuType() == GpuType::ADRENO) { - buildOptions.emplace("-DUSE_CL_MAD=1"); - buildOptions.emplace("-DRELAX_WORKGROUP_SIZE=1"); - } - - mKernel_qkv = mOpenCLBackend->getOpenCLRuntime()->buildKernel("matmul_params_buf", "XgemmBatched", buildOptions); - - int out_per_thread_m = tileM / localM; - int out_per_thread_n = tileN / localN; - - mGlobalWorkSizeQkv = {static_cast(e_pack/out_per_thread_m), static_cast(h_pack/out_per_thread_n), static_cast(loop)}; - mLocalWorkSizeQkv = {static_cast(localM), static_cast(localN), 1}; - - float alpha = 1.0f; - float beta = 0.0f; - int batch_offset_a = e_pack * l_pack; - int batch_offset_b = h_pack * l_pack; - int batch_offset_c = e_pack * h_pack; - int batch_offset[4] = {batch_offset_a, batch_offset_b, batch_offset_c, 0}; - int stride[4] = {e_pack, h_pack, e_pack, h_pack}; - int group[4] = {1, group_size, 1, numHead}; + std::set buildOption; + buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); + mKernel_qkv = runtime->buildKernel("attention_buf", "matmul_qkv_prefill", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qkv)); + mGlobalWorkSizeQkv = {static_cast(UP_DIV(headDim, 8)), static_cast(UP_DIV(seq_len, 4)), static_cast(numHead)}; - int idx = 0; + uint32_t index = 0; cl_int ret = CL_SUCCESS; - ret |= mKernel_qkv->get().setArg(idx++, static_cast(e_pack)); - ret |= mKernel_qkv->get().setArg(idx++, static_cast(h_pack)); - ret |= mKernel_qkv->get().setArg(idx++, static_cast(l_pack)); - ret |= mKernel_qkv->get().setArg(idx++, alpha); - ret |= mKernel_qkv->get().setArg(idx++, beta); - ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempQK.get())); - ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempV.get())); - ret |= mKernel_qkv->get().setArg(idx++, openCLBuffer(mTempQKV.get())); - ret |= mKernel_qkv->get().setArg(idx++, batch_offset); - ret |= mKernel_qkv->get().setArg(idx++, stride); - ret |= mKernel_qkv->get().setArg(idx++, group); - MNN_CHECK_CL_SUCCESS(ret, "setArg Self-Attention batchmatmul qkv Kernel"); + ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[0]); + ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[1]); + ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[2]); + ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= mKernel_qkv->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(outputs[0])); + ret |= mKernel_qkv->get().setArg(index++, seq_len); + ret |= mKernel_qkv->get().setArg(index++, mask_kvlen); + ret |= mKernel_qkv->get().setArg(index++, mMax_len); + ret |= mKernel_qkv->get().setArg(index++, numHead); + ret |= mKernel_qkv->get().setArg(index++, kvNumHead); + ret |= mKernel_qkv->get().setArg(index++, headDim); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qkv_prefill"); + + mLocalWorkSizeQkv = localWS3DDefault(mGlobalWorkSizeQkv, maxWorkGroupSize, runtime, "matmul_qkv_prefill", mKernel_qkv).first; + mGlobalWorkSizeQkv[0] = ROUND_UP(mGlobalWorkSizeQkv[0], std::max((uint32_t)1, mLocalWorkSizeQkv[0])); + mGlobalWorkSizeQkv[1] = ROUND_UP(mGlobalWorkSizeQkv[1], std::max((uint32_t)1, mLocalWorkSizeQkv[1])); + 
mGlobalWorkSizeQkv[2] = ROUND_UP(mGlobalWorkSizeQkv[2], std::max((uint32_t)1, mLocalWorkSizeQkv[2])); mOpenCLBackend->recordKernel3d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv); } - - // transpose to output + }else{ { - // QKV : [Batch * numHead, ROUND_UP(headDim, mAlignHDN), ROUND_UP(seqLenQ, mAlignQ)] -> [B, N, M] - // output: [batch, seqLenQ/4, headNum, headDim, seqLenQ_4] + // rearrange key std::set buildOption; - mKernel_clip = runtime->buildKernel("attention_buf", "qkv_transpose_output", buildOption, inputs[0], outputs[0]); - auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_clip)); - - mGlobalWorkSizeClip = {static_cast(UP_DIV(seq_len, 4)), static_cast(UP_DIV(headDim, 4)), static_cast(batch*numHead)}; + mKernel_rearrange = runtime->buildKernel("attention_buf", "rearrange_k", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrange)); + mGlobalWorkSizeRearrg = {static_cast(1), \ + static_cast(UP_DIV(headDim, 4)), \ + static_cast(kvNumHead)}; + uint32_t index = 0; cl_int ret = CL_SUCCESS; - ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[0]); - ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[1]); - ret |= mKernel_clip->get().setArg(index++, mGlobalWorkSizeClip[2]); - ret |= mKernel_clip->get().setArg(index++, openCLBuffer(mTempQKV.get())); - ret |= mKernel_clip->get().setArg(index++, openCLBuffer(outputs[0])); - ret |= mKernel_clip->get().setArg(index++, mAlignQ); - ret |= mKernel_clip->get().setArg(index++, mAlignHDN); - ret |= mKernel_clip->get().setArg(index++, seq_len); - ret |= mKernel_clip->get().setArg(index++, numHead); - ret |= mKernel_clip->get().setArg(index++, headDim); - - mLocalWorkSizeClip = localWS3DDefault(mGlobalWorkSizeClip, maxWorkGroupSize, runtime, "qkv_transpose_output", mKernel_clip).first; - mGlobalWorkSizeClip[0] = ROUND_UP(mGlobalWorkSizeClip[0], std::max((uint32_t)1, mLocalWorkSizeClip[0])); - mGlobalWorkSizeClip[1] = ROUND_UP(mGlobalWorkSizeClip[1], std::max((uint32_t)1, mLocalWorkSizeClip[1])); - mGlobalWorkSizeClip[2] = ROUND_UP(mGlobalWorkSizeClip[2], std::max((uint32_t)1, mLocalWorkSizeClip[2])); - - MNN_CHECK_CL_SUCCESS(ret, "setArg qkv_transpose_output"); - mOpenCLBackend->recordKernel3d(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[0]); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[1]); + ret |= mKernel_rearrange->get().setArg(index++, mGlobalWorkSizeRearrg[2]); + ret |= mKernel_rearrange->get().setArg(index++, openCLBuffer(key)); + ret |= mKernel_rearrange->get().setArg(index++, *mKVCacheCLManager->key()); + ret |= mKernel_rearrange->get().setArg(index++, mKv_seq_len); + ret |= mKernel_rearrange->get().setArg(index++, mMax_len); + ret |= mKernel_rearrange->get().setArg(index++, seq_len); + ret |= mKernel_rearrange->get().setArg(index++, kvNumHead); + ret |= mKernel_rearrange->get().setArg(index++, numHead); + ret |= mKernel_rearrange->get().setArg(index++, headDim); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_k"); + mLocalWorkSizeRearrg = localWS3DDefault(mGlobalWorkSizeRearrg, maxWorkGroupSize, runtime, "rearrange_k", mKernel_rearrange).first; + mGlobalWorkSizeRearrg[0] = ROUND_UP(mGlobalWorkSizeRearrg[0], std::max((uint32_t)1, mLocalWorkSizeRearrg[0])); + mGlobalWorkSizeRearrg[1] = ROUND_UP(mGlobalWorkSizeRearrg[1], std::max((uint32_t)1, mLocalWorkSizeRearrg[1])); + mGlobalWorkSizeRearrg[2] = 
ROUND_UP(mGlobalWorkSizeRearrg[2], std::max((uint32_t)1, mLocalWorkSizeRearrg[2])); + mRgUpdateInfo.update_kernel_args.push_back({0, 4, sizeof(cl_mem), &(*(mKVCacheCLManager->key()))()}); + mRgUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(mKv_seq_len), &mKv_seq_len}); + mRgUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(mMax_len), &mMax_len}); + mOpRecordUpdateInfo.emplace_back(&mRgUpdateInfo); + mOpenCLBackend->recordKernel3d(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, &mRgUpdateInfo); } - - } else { - // query * key -> div -> select { + // matmul qk std::set buildOption; - if(!mIsDecode){ - buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); - } - if((headDim % 4) != 0){ - buildOption.emplace("-DHEADDIM_LEAVE"); - } if(mask->getType() == halide_type_of()){ buildOption.emplace("-DADD_MASK"); } buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); - mKernel_qk = runtime->buildKernel("attention_buf", "matmul_qk_div_mask", buildOption, inputs[0], outputs[0]); - mGlobalWorkSizeQk = {static_cast(UP_DIV(mKv_seq_len, 4)), static_cast(UP_DIV(seq_len, 4)), static_cast(numHead)}; + mKernel_qk = runtime->buildKernel("attention_buf", "matmul_qk_decode", buildOption, inputs[0], outputs[0]); + mGlobalWorkSizeQk = {static_cast(UP_DIV(mKv_seq_len, 4)), static_cast(numHead)}; auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qk)); - mGlobalWorkSizeQk0 = UP_DIV(mKv_seq_len, 4); uint32_t index = 0; cl_int ret = CL_SUCCESS; - ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk0); + ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[0]); ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[1]); - ret |= mKernel_qk->get().setArg(index++, mGlobalWorkSizeQk[2]); ret |= mKernel_qk->get().setArg(index++, openCLBuffer(query)); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(key)); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mTempQK.get())); ret |= mKernel_qk->get().setArg(index++, *mKVCacheCLManager->key()); - ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mask)); + ret |= mKernel_qk->get().setArg(index++, openCLBuffer(mTempQK.get())); ret |= mKernel_qk->get().setArg(index++, scale); - ret |= mKernel_qk->get().setArg(index++, seq_len); ret |= mKernel_qk->get().setArg(index++, mKv_seq_len); + ret |= mKernel_qk->get().setArg(index++, mMax_len); ret |= mKernel_qk->get().setArg(index++, numHead); - ret |= mKernel_qk->get().setArg(index++, kvNumHead); ret |= mKernel_qk->get().setArg(index++, headDim); - MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qk_div_mask"); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qk_decode"); - mLocalWorkSizeQk = localWS3DDefault(mGlobalWorkSizeQk, maxWorkGroupSize, runtime, "matmul_qk_div_mask", mKernel_qk).first; + mLocalWorkSizeQk = localWS2DDefault(mGlobalWorkSizeQk, maxWorkGroupSize, runtime, "matmul_qk_decode", mKernel_qk).first; mGlobalWorkSizeQk[0] = ROUND_UP(mGlobalWorkSizeQk[0], std::max((uint32_t)1, mLocalWorkSizeQk[0])); mGlobalWorkSizeQk[1] = ROUND_UP(mGlobalWorkSizeQk[1], std::max((uint32_t)1, mLocalWorkSizeQk[1])); - mGlobalWorkSizeQk[2] = ROUND_UP(mGlobalWorkSizeQk[2], std::max((uint32_t)1, mLocalWorkSizeQk[2])); mQkUpdateInfo.update_kernel_args.push_back({0, 0, sizeof(mGlobalWorkSizeQk0), &mGlobalWorkSizeQk0}); - mQkUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(cl_mem), &openCLBuffer(mTempQK.get())()}); - mQkUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(cl_mem), &(*(mKVCacheCLManager->key()))()}); - 
mQkUpdateInfo.update_kernel_args.push_back({0, 10, sizeof(mKv_seq_len), &mKv_seq_len}); + mQkUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &(*(mKVCacheCLManager->key()))()}); + mQkUpdateInfo.update_kernel_args.push_back({0, 4, sizeof(cl_mem), &openCLBuffer(mTempQK.get())()}); + mQkUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(mKv_seq_len), &mKv_seq_len}); + mQkUpdateInfo.update_kernel_args.push_back({0, 7, sizeof(mMax_len), &mMax_len}); mQkGlobal_size[0] = mGlobalWorkSizeQk[0]; mQkGlobal_size[1] = mGlobalWorkSizeQk[1]; - mQkGlobal_size[2] = mGlobalWorkSizeQk[2]; mQkUpdateInfo.update_global_size.push_back({0, mQkGlobal_size}); mOpRecordUpdateInfo.emplace_back(&mQkUpdateInfo); - mOpenCLBackend->recordKernel3d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, &mQkUpdateInfo); + mOpenCLBackend->recordKernel2d(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, &mQkUpdateInfo); } - - // softmax { + // softmax int inside = 1; - int outside = numHead * seq_len; - auto MaxLocalSize = std::min(std::min(runtime->getMaxWorkItemSizes()[0], mMaxWorkGroupSize), static_cast(256)); - int localSize = getLocalSize(UP_DIV(mKv_seq_len, 4), MaxLocalSize); + int outside = numHead; + int localSize = getLocalSize(UP_DIV(mKv_seq_len, 4), 128); if(localSize < 4){ localSize = 1; } @@ -657,51 +936,114 @@ ErrorCode AttentionBufExecution::onResize(const std::vector &inputs, c mOpRecordUpdateInfo.emplace_back(&mSoftMaxUpdateInfo); mOpenCLBackend->recordKernel3d(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, &mSoftMaxUpdateInfo); } - + { + // rearrange value + std::set buildOption; + + mKernel_rearrangeV = runtime->buildKernel("attention_buf", "rearrange_v", buildOption, inputs[0], outputs[0]); + auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_rearrangeV)); + + mGlobalWorkSizeRearrgV = {static_cast(UP_DIV(headDim, 4)), \ + static_cast(1), \ + static_cast(kvNumHead)}; + + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= mKernel_rearrangeV->get().setArg(index++, mGlobalWorkSizeRearrgV[0]); + ret |= mKernel_rearrangeV->get().setArg(index++, mGlobalWorkSizeRearrgV[1]); + ret |= mKernel_rearrangeV->get().setArg(index++, mGlobalWorkSizeRearrgV[2]); + ret |= mKernel_rearrangeV->get().setArg(index++, openCLBuffer(value)); + ret |= mKernel_rearrangeV->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= mKernel_rearrangeV->get().setArg(index++, mKv_seq_len); + ret |= mKernel_rearrangeV->get().setArg(index++, mMax_len); + ret |= mKernel_rearrangeV->get().setArg(index++, seq_len); + ret |= mKernel_rearrangeV->get().setArg(index++, kvNumHead); + ret |= mKernel_rearrangeV->get().setArg(index++, headDim); + + MNN_CHECK_CL_SUCCESS(ret, "setArg rearrange_v"); + mLocalWorkSizeRearrgV = localWS3DDefault(mGlobalWorkSizeRearrgV, maxWorkGroupSize, runtime, "rearrange_v", mKernel_rearrangeV).first; + mGlobalWorkSizeRearrgV[0] = ROUND_UP(mGlobalWorkSizeRearrgV[0], std::max((uint32_t)1, mLocalWorkSizeRearrgV[0])); + mGlobalWorkSizeRearrgV[1] = ROUND_UP(mGlobalWorkSizeRearrgV[1], std::max((uint32_t)1, mLocalWorkSizeRearrgV[1])); + mGlobalWorkSizeRearrgV[2] = ROUND_UP(mGlobalWorkSizeRearrgV[2], std::max((uint32_t)1, mLocalWorkSizeRearrgV[2])); + mRgVUpdateInfo.update_kernel_args.push_back({0, 4, sizeof(cl_mem), &(*(mKVCacheCLManager->value()))()}); + mRgVUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(mKv_seq_len), &mKv_seq_len}); + mRgVUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(mMax_len), &mMax_len}); + 
mOpRecordUpdateInfo.emplace_back(&mRgVUpdateInfo); + mOpenCLBackend->recordKernel3d(mKernel_rearrangeV, mGlobalWorkSizeRearrgV, mLocalWorkSizeRearrgV, &mRgVUpdateInfo); + } // qk * value { std::set buildOption; - if(!mIsDecode){ - buildOption.emplace("-DOPENCL_PREFILL_ATTENTION"); - } - if((headDim % 4) != 0){ - buildOption.emplace("-DHEADDIM_LEAVE"); - } buildOption.emplace("-DNUMHEAD_GROUP_SIZE=" + std::to_string(group_size)); - mKernel_qkv = runtime->buildKernel("attention_buf", "matmul_qkv", buildOption, inputs[0], outputs[0]); - auto maxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(mKernel_qkv)); - mGlobalWorkSizeQkv = {static_cast(UP_DIV(headDim, 4)), static_cast(numHead), static_cast(UP_DIV(seq_len, 4))}; + const int total_kernel = 2; + std::string kernelName[total_kernel] = {"matmul_qkv_decode_b4", "matmul_qkv_decode_b8"}; + std::string unroll[total_kernel] = {"-DLOOP_UNROLL_4", "-DLOOP_UNROLL_8"}; + int itemC[total_kernel] = {4, 8}; + int actual_kernel = 2; + std::shared_ptr kernel[total_kernel * total_kernel]; + std::vector globalWorkSize[total_kernel * total_kernel]; + std::vector localWorkSize[total_kernel * total_kernel]; + std::pair min_cost(INT_MAX, 0);//(min_time, min_index) + + for (int i = 0; i < actual_kernel; i++) { + for(int j = 0; j < actual_kernel; j++){ + int knl_idx = i * total_kernel + j; + auto option = buildOption; + option.emplace(unroll[j]); + kernel[knl_idx] = mOpenCLBackend->getOpenCLRuntime()->buildKernel("attention_buf", kernelName[i], option); + uint32_t maxWorkGroupSize = static_cast(mOpenCLBackend->getOpenCLRuntime()->getMaxWorkGroupSize(kernel[knl_idx])); + globalWorkSize[knl_idx] = {static_cast(UP_DIV(headDim, itemC[i])), static_cast(numHead)}; + uint32_t index = 0; + cl_int ret = CL_SUCCESS; + ret |= kernel[knl_idx]->get().setArg(index++, globalWorkSize[knl_idx][0]); + ret |= kernel[knl_idx]->get().setArg(index++, globalWorkSize[knl_idx][1]); + ret |= kernel[knl_idx]->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); + ret |= kernel[knl_idx]->get().setArg(index++, *mKVCacheCLManager->value()); + ret |= kernel[knl_idx]->get().setArg(index++, openCLBuffer(outputs[0])); + ret |= kernel[knl_idx]->get().setArg(index++, mKv_seq_len); + ret |= kernel[knl_idx]->get().setArg(index++, mMax_len); + ret |= kernel[knl_idx]->get().setArg(index++, numHead); + ret |= kernel[knl_idx]->get().setArg(index++, kvNumHead); + ret |= kernel[knl_idx]->get().setArg(index++, headDim); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qkv_decode"); + std::pair, int> retTune; + retTune = localWS2DDefault(globalWorkSize[knl_idx], maxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName[i] + unroll[j], kernel[knl_idx]); + if(min_cost.first > retTune.second) { + min_cost.first = retTune.second; + min_cost.second = knl_idx; + mLocalWorkSizeQkv = {retTune.first[0], retTune.first[1]}; + } + } + } + int min_index = min_cost.second / 2; + int min_index_unroll = min_cost.second % 2; + mGlobalWorkSizeQkv = {globalWorkSize[min_cost.second][0], globalWorkSize[min_cost.second][1]}; + buildOption.emplace(unroll[min_index_unroll]); + mKernel_qkv = runtime->buildKernel("attention_buf", kernelName[min_index], buildOption, inputs[0], outputs[0]); uint32_t index = 0; cl_int ret = CL_SUCCESS; ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[0]); ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[1]); - ret |= mKernel_qkv->get().setArg(index++, mGlobalWorkSizeQkv[2]); ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(mTempSoftMax.get())); - ret |= 
mKernel_qkv->get().setArg(index++, openCLBuffer(value)); - ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(outputs[0])); ret |= mKernel_qkv->get().setArg(index++, *mKVCacheCLManager->value()); - ret |= mKernel_qkv->get().setArg(index++, seq_len); + ret |= mKernel_qkv->get().setArg(index++, openCLBuffer(outputs[0])); ret |= mKernel_qkv->get().setArg(index++, mKv_seq_len); + ret |= mKernel_qkv->get().setArg(index++, mMax_len); ret |= mKernel_qkv->get().setArg(index++, numHead); ret |= mKernel_qkv->get().setArg(index++, kvNumHead); ret |= mKernel_qkv->get().setArg(index++, headDim); - MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qkv"); + MNN_CHECK_CL_SUCCESS(ret, "setArg matmul_qkv_decode"); - mLocalWorkSizeQkv = localWS3DDefault(mGlobalWorkSizeQkv, maxWorkGroupSize, runtime, "matmul_qkv", mKernel_qkv).first; mGlobalWorkSizeQkv[0] = ROUND_UP(mGlobalWorkSizeQkv[0], std::max((uint32_t)1, mLocalWorkSizeQkv[0])); mGlobalWorkSizeQkv[1] = ROUND_UP(mGlobalWorkSizeQkv[1], std::max((uint32_t)1, mLocalWorkSizeQkv[1])); - mGlobalWorkSizeQkv[2] = ROUND_UP(mGlobalWorkSizeQkv[2], std::max((uint32_t)1, mLocalWorkSizeQkv[2])); - - mQkvUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &openCLBuffer(mTempSoftMax.get())()}); - mQkvUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(cl_mem), &(*(mKVCacheCLManager->value()))()}); - mQkvUpdateInfo.update_kernel_args.push_back({0, 8, sizeof(mKv_seq_len), &mKv_seq_len}); + mQkvUpdateInfo.update_kernel_args.push_back({0, 2, sizeof(cl_mem), &openCLBuffer(mTempSoftMax.get())()}); + mQkvUpdateInfo.update_kernel_args.push_back({0, 3, sizeof(cl_mem), &(*(mKVCacheCLManager->value()))()}); + mQkvUpdateInfo.update_kernel_args.push_back({0, 5, sizeof(mKv_seq_len), &mKv_seq_len}); + mQkvUpdateInfo.update_kernel_args.push_back({0, 6, sizeof(mMax_len), &mMax_len}); mOpRecordUpdateInfo.emplace_back(&mQkvUpdateInfo); - mOpenCLBackend->recordKernel3d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, &mQkvUpdateInfo); + mOpenCLBackend->recordKernel2d(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, &mQkvUpdateInfo); } - - mOpenCLBackend->onReleaseBuffer(mTempQK.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mTempSoftMax.get(), Backend::DYNAMIC); } mOpenCLBackend->endRecord(mRecording); @@ -724,44 +1066,49 @@ ErrorCode AttentionBufExecution::onExecute(const std::vector &inputs, } #ifdef ENABLE_OPENCL_TIME_PROFILER if(mLongPrefill) { - cl::Event event0, event1; + cl::Event event0, event1, event2, event3, event4, event5, event6; run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime(), &event0); mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_qkv", event0}); run3DKernelDefault(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask, mOpenCLBackend->getOpenCLRuntime(), &event1); mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_mask", event1}); - } - { - cl::Event event; - run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, - mOpenCLBackend->getOpenCLRuntime(), &event); - - mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qk_div_mask", event}); - } - { - cl::Event event; - run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, - mOpenCLBackend->getOpenCLRuntime(), &event); - - mOpenCLBackend->getOpenCLRuntime()->pushEvent({"softmax", event}); - } - if(mLongPrefill) { - cl::Event event; - run3DKernelDefault(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans, mOpenCLBackend->getOpenCLRuntime(), &event); - - 
mOpenCLBackend->getOpenCLRuntime()->pushEvent({"transpose_softmax", event}); - } - { - cl::Event event; - run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, - mOpenCLBackend->getOpenCLRuntime(), &event); - - mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qkv", event}); - } - if(mLongPrefill) { - cl::Event event; - run3DKernelDefault(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip, mOpenCLBackend->getOpenCLRuntime(), &event); - - mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_output", event}); + run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime(), &event2); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qk_div_mask", event2}); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime(), &event3); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"softmax", event3}); + run3DKernelDefault(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans, mOpenCLBackend->getOpenCLRuntime(), &event4); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"transpose_softmax", event4}); + run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime(), &event5); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qkv", event5}); + run3DKernelDefault(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip, mOpenCLBackend->getOpenCLRuntime(), &event6); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_output", event6}); + } else{ + if(mIsDecode){ + cl::Event event0, event1, event2, event3, event4; + run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime(), &event0); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_k", event0}); + runKernel2D(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime(), &event1); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qk_div_mask", event1}); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime(), &event2); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"softmax", event2}); + run3DKernelDefault(mKernel_rearrangeV, mGlobalWorkSizeRearrgV, mLocalWorkSizeRearrgV, mOpenCLBackend->getOpenCLRuntime(), &event3); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_v", event3}); + runKernel2D(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime(), &event4); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qkv", event4}); + }else{ + cl::Event event0, event1, event2, event3, event4, event5; + run3DKernelDefault(mKernel_rearrangeQ, mGlobalWorkSizeRearrgQ, mLocalWorkSizeRearrgQ, mOpenCLBackend->getOpenCLRuntime(), &event0); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_q", event0}); + run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime(), &event1); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_k", event1}); + run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime(), &event2); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qk_div_mask", event2}); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime(), &event3); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"softmax", event3}); + run3DKernelDefault(mKernel_rearrangeV, mGlobalWorkSizeRearrgV, 
mLocalWorkSizeRearrgV, mOpenCLBackend->getOpenCLRuntime(), &event4); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"rearrange_v", event4}); + run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime(), &event5); + mOpenCLBackend->getOpenCLRuntime()->pushEvent({"matmul_qkv", event5}); + } } #else if(mOpenCLBackend->isUseRecordQueue()){ @@ -775,24 +1122,37 @@ ErrorCode AttentionBufExecution::onExecute(const std::vector &inputs, // decode if(mIsDecode){ cl_int ret = CL_SUCCESS; + ret |= mKernel_rearrange->get().setArg(5, mKv_seq_len); ret |= mKernel_qk->get().setArg(0, mGlobalWorkSizeQk0); - ret |= mKernel_qk->get().setArg(10, mKv_seq_len); + ret |= mKernel_qk->get().setArg(6, mKv_seq_len); ret |= mKernel_softmax->get().setArg(7, mKv_seq_len); - ret |= mKernel_qkv->get().setArg(8, mKv_seq_len); + ret |= mKernel_rearrangeV->get().setArg(5, mKv_seq_len); + ret |= mKernel_qkv->get().setArg(5, mKv_seq_len); MNN_CHECK_CL_SUCCESS(ret, "reset arg for AttentionBufExecution"); } if(mLongPrefill) { run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime()); run3DKernelDefault(mKernel_mask, mGlobalWorkSizeMask, mLocalWorkSizeMask, mOpenCLBackend->getOpenCLRuntime()); - } - run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime()); - run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime()); - if(mLongPrefill) { + run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime()); run3DKernelDefault(mKernel_trans, mGlobalWorkSizeTrans, mLocalWorkSizeTrans, mOpenCLBackend->getOpenCLRuntime()); - } - run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime()); - if(mLongPrefill) { + run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime()); run3DKernelDefault(mKernel_clip, mGlobalWorkSizeClip, mLocalWorkSizeClip, mOpenCLBackend->getOpenCLRuntime()); + } else{ + if(mIsDecode){ + run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime()); + runKernel2D(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_rearrangeV, mGlobalWorkSizeRearrgV, mLocalWorkSizeRearrgV, mOpenCLBackend->getOpenCLRuntime()); + runKernel2D(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime()); + }else{ + run3DKernelDefault(mKernel_rearrangeQ, mGlobalWorkSizeRearrgQ, mLocalWorkSizeRearrgQ, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_rearrange, mGlobalWorkSizeRearrg, mLocalWorkSizeRearrg, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_qk, mGlobalWorkSizeQk, mLocalWorkSizeQk, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_softmax, mGlobalWorkSizeSoftMax, mLocalWorkSizeSoftMax, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_rearrangeV, mGlobalWorkSizeRearrgV, mLocalWorkSizeRearrgV, mOpenCLBackend->getOpenCLRuntime()); + run3DKernelDefault(mKernel_qkv, mGlobalWorkSizeQkv, mLocalWorkSizeQkv, mOpenCLBackend->getOpenCLRuntime()); 
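+                // Non-recorded fallback: the prefill branch launches rearrange_q, rearrange_k,
+                // matmul_qk_div_mask, softmax, rearrange_v and matmul_qkv in that order, while
+                // the decode branch above dispatches the qk and qkv matmuls with runKernel2D,
+                // since only a single query token is processed per decode step.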
+ } } #endif diff --git a/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp b/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp index 1292ace2f..5089f968a 100644 --- a/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp +++ b/source/backend/opencl/execution/buffer/AttentionBufExecution.hpp @@ -61,6 +61,7 @@ class AttentionBufExecution : public CommonExecution { public: AttentionBufExecution(const MNN::Op *op, Backend *backend, bool kv_cache); AttentionBufExecution(std::shared_ptr manager, const MNN::Op *op, Backend *backend); + ErrorCode longPrefillResize(const std::vector &inputs, const std::vector &outputs); virtual ~AttentionBufExecution() = default; virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; @@ -72,7 +73,9 @@ class AttentionBufExecution : public CommonExecution { int getLocalSize(int size, int maxGroupSize); void reallocKVCache(); bool mIsDecode = false; + bool mIsFirstPrefill = true; int mKv_seq_len = 0; + int mMax_len = 0; std::shared_ptr mKernel_qk; std::shared_ptr mKernel_softmax; std::shared_ptr mKernel_qkv; @@ -84,17 +87,25 @@ class AttentionBufExecution : public CommonExecution { std::vector mLocalWorkSizeQkv{1, 1, 1, 1}; uint32_t mMaxWorkGroupSize; OpenCLBackend *mOpenCLBackend; + RecordUpdateInfo mRgUpdateInfo; RecordUpdateInfo mQkUpdateInfo; RecordUpdateInfo mSoftMaxUpdateInfo; + RecordUpdateInfo mRgVUpdateInfo; RecordUpdateInfo mQkvUpdateInfo; int mGlobalWorkSizeQk0 = 0; - size_t mQkGlobal_size[3]; + size_t mQkGlobal_size[2]; std::vector mOpRecordUpdateInfo; std::shared_ptr mKVCacheCLManager; std::shared_ptr mTempQK, mTempSoftMax; private: int mAlignQ, mAlignKV, mAlignHDK, mAlignHDN; bool mLongPrefill = false; + std::shared_ptr mKernel_rearrangeQ; + std::vector mGlobalWorkSizeRearrgQ{1, 1, 1}; + std::vector mLocalWorkSizeRearrgQ{1, 1, 1, 1}; + std::shared_ptr mKernel_rearrangeV; + std::vector mGlobalWorkSizeRearrgV{1, 1, 1}; + std::vector mLocalWorkSizeRearrgV{1, 1, 1, 1}; std::shared_ptr mKernel_rearrange; std::vector mGlobalWorkSizeRearrg{1, 1, 1}; std::vector mLocalWorkSizeRearrg{1, 1, 1, 1}; diff --git a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp index 94db4128e..3ff1de41f 100644 --- a/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/BinaryBufExecution.cpp @@ -329,6 +329,7 @@ class BinaryBufCreator : public OpenCLBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { +#ifdef MNN_SUPPORT_INTEL_SUBGROUP for (int i = 0; i < inputs.size(); ++i) { int channel = inputs[i]->channel(); if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup() @@ -336,6 +337,7 @@ class BinaryBufCreator : public OpenCLBackend::Creator { TensorUtils::setTensorChannelPack(inputs[i], 16); } } +#endif /* MNN_SUPPORT_INTEL_SUBGROUP */ if (op->type() == OpType_Eltwise) { switch (op->main_as_Eltwise()->type()) { case EltwiseType_SUM: diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp index 185a25294..db3fb2d38 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp @@ -801,6 +801,7 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator { } if 
(ConvBufWinograd::valid(conv2D->common(), inputs[0], outputs[0], static_cast(backend)->getOpenCLRuntime()->getGpuType() == INTEL)) { +#ifdef MNN_SUPPORT_INTEL_SUBGROUP if(static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup()){ std::vector inputShape = tensorShapeFormat(input); std::vector outputShape = tensorShapeFormat(output); @@ -810,6 +811,7 @@ class ConvolutionBufCreator : public OpenCLBackend::Creator { TensorUtils::setTensorPad(input, padding.first, pad_right, 0, 0); TensorUtils::setTensorChannelPack(input, 16); } +#endif /* MNN_SUPPORT_INTEL_SUBGROUP */ return new ConvBufWinograd(op, backend); } #ifdef MNN_SUPPORT_INTEL_SUBGROUP diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp index 3c2a02b9d..1ce568cbd 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp @@ -18,14 +18,17 @@ void ConvBufLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptrmInputChannel = quanCommon->weight.size() / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); // set mResource->mNumQuantBit if(quanCommon->canUseInt4){ mResource->mNumQuantBit = 4; - mResource->mInputChannel = (quanCommon->weight.size() * 2) / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); }else{ mResource->mNumQuantBit = 8; } + if (mOp->main_as_Convolution2D()->common()->inputCount() > 0) { + mResource->mInputChannel = mOp->main_as_Convolution2D()->common()->inputCount(); + } else { + mResource->mInputChannel = quanCommon->weight.size() / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); + } // src of alpha in CPU float * dequantAlpha = quanCommon->alpha.get(); int totalCount = quanCommon->alpha.size(); diff --git a/source/backend/opencl/execution/buffer/PoolBufExecution.cpp b/source/backend/opencl/execution/buffer/PoolBufExecution.cpp index 66e29d1b7..07cca36f5 100644 --- a/source/backend/opencl/execution/buffer/PoolBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/PoolBufExecution.cpp @@ -327,13 +327,15 @@ class PoolBufCreator : public OpenCLBackend::Creator { virtual ~PoolBufCreator() = default; virtual Execution *onCreate(const std::vector &inputs, const std::vector &outputs, const MNN::Op *op, Backend *backend) const override { +#ifdef MNN_SUPPORT_INTEL_SUBGROUP for (int i = 0; i < inputs.size(); ++i) { int channel = inputs[i]->channel(); if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup()) { TensorUtils::setTensorChannelPack(inputs[i], 16); } } - return new PoolBufExecution(inputs, op, backend); +#endif /* MNN_SUPPORT_INTEL_SUBGROUP */ + return new PoolBufExecution(inputs, op, backend); } }; diff --git a/source/backend/opencl/execution/buffer/ReluBufExecution.cpp b/source/backend/opencl/execution/buffer/ReluBufExecution.cpp index b268f8dc4..cbf310824 100644 --- a/source/backend/opencl/execution/buffer/ReluBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ReluBufExecution.cpp @@ -229,13 +229,14 @@ class ReluBufCreator : public OpenCLBackend::Creator { // So we use ternary operation (A ? B: C) instead of function call with comma // (e.g, fmax(in,(float4)(0))), when there is a Radeon GPU. 
bool isRadeonGpu = (static_cast(backend)->getOpenCLRuntime()->getGpuType() == RADEON); +#ifdef MNN_SUPPORT_INTEL_SUBGROUP for (int i = 0; i < inputs.size(); ++i) { int channel = inputs[i]->channel(); if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup()) { TensorUtils::setTensorChannelPack(inputs[i], 16); } } - +#endif /* MNN_SUPPORT_INTEL_SUBGROUP */ if (op->type() == OpType_ReLU6) { char storage[256]; float minValue = 0.0f; diff --git a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp index 56a1c9027..71b83ba78 100644 --- a/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/UnaryBufExecution.cpp @@ -179,6 +179,7 @@ class UnaryBufCreator : public OpenCLBackend::Creator { public: virtual Execution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { +#ifdef MNN_SUPPORT_INTEL_SUBGROUP for (int i = 0; i < inputs.size(); ++i) { int channel = inputs[i]->channel(); if (channel >= 16 && static_cast(backend)->getOpenCLRuntime()->isSupportedIntelSubgroup() @@ -186,6 +187,7 @@ class UnaryBufCreator : public OpenCLBackend::Creator { TensorUtils::setTensorChannelPack(inputs[i], 16); } } +#endif /* MNN_SUPPORT_INTEL_SUBGROUP */ if (op->type() == OpType_UnaryOp) { switch (op->main_as_UnaryOp()->opType()) { case UnaryOpOperation_ABS: diff --git a/source/backend/opencl/execution/cl/attention_buf.cl b/source/backend/opencl/execution/cl/attention_buf.cl index 074956902..5b7366018 100644 --- a/source/backend/opencl/execution/cl/attention_buf.cl +++ b/source/backend/opencl/execution/cl/attention_buf.cl @@ -10,6 +10,14 @@ return; \ } +#define GLOBAL_SIZE_2_DIMS \ + __private const int global_size_dim0, __private const int global_size_dim1, + +#define DEAL_NON_UNIFORM_DIM2(input1, input2) \ + if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { \ + return; \ + } + #define DEAL_OUTER_SEQLEN_NOT_ALIGN(length) \ if(4 * sl + 3 >= length) {\ temp_3 = (FLOAT4)0;\ @@ -50,11 +58,11 @@ __kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS __global FLOAT *output_q, // [batch*headNum, ROUND_UP(headDim, mTileHDK), ROUND_UP(seqLenQ, mTileQ)] __global FLOAT *output_k, // [batch*headNum/group, ROUND_UP(headDim, mTileHDK), ROUND_UP(seqLenKV, mTileKV)] __global FLOAT *output_v, // [batch*headNum/group, ROUND_UP(seqLenKV, mTileKV), ROUND_UP(headDim, mTileHDN)] - __global FLOAT *past_k, // [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] - __global FLOAT *past_v, // [batch, seqLenKV/4, headNum/group, headDim, seqLenKV_4] + __global FLOAT *past_k, // [batch, headNum/group, headDim, seqLenKV_4] + __global FLOAT *past_v, // [batch, headNum/group, seqLenKV_4, headDim] __private const int4 tile, // [mTileQ, mTileKV, mTileHDK, mTileHDN] __private const int4 shape,// [seqLenQ, seqLenKV, headNum, headDim] - __private const int4 param // [group, batch] + __private const int4 param // [group, batch, max_len, past_len] ) { const int sl = get_global_id(0); // seqLen/4 : max(seqLenPackQ/4, seqLenPackKV/4) const int hd = get_global_id(1); // headDim/4 : max(headDimPackQK/4, headDimPackV/4) @@ -67,6 +75,7 @@ __kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS const int headDim = shape.w; const int group = param.x; const int batch = param.y; + const int maxLenKV = param.z; const int b = z % batch; const int hn = z / batch; @@ -112,7 +121,8 @@ __kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS const int headDimPackV 
= ((headDim + tile.w - 1) / tile.w) * tile.w; const int seqLenKV_4 = (seqLenKV + 3) / 4; const int in_offset_kv = (((b * seqLenKV + sl*4) * headNum/group + hn) * headDim + 4 * hd); - + const int past_offset_k = (((b * headNum/group + hn) * headDim + hd * 4) * maxLenKV + sl*4); + const int past_offset_v = (((b * headNum/group + hn) * maxLenKV + sl*4) * headDim + 4 * hd); if(sl * 4 < seqLenPackKV && hd * 4 < headDimPackQK) { const int out_offset_k = (((b * headNum/group + hn) * headDimPackQK + hd * 4) * seqLenPackKV + sl * 4); @@ -132,22 +142,20 @@ __kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS #ifdef SEQLEN_LEAVE DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenKV) #endif - vstore4((FLOAT4)(temp_0.s0, temp_1.s0, temp_2.s0, temp_3.s0), 0, output_k + out_offset_k); - vstore4((FLOAT4)(temp_0.s1, temp_1.s1, temp_2.s1, temp_3.s1), 0, output_k + out_offset_k + seqLenPackKV); - vstore4((FLOAT4)(temp_0.s2, temp_1.s2, temp_2.s2, temp_3.s2), 0, output_k + out_offset_k + 2 * seqLenPackKV); - vstore4((FLOAT4)(temp_0.s3, temp_1.s3, temp_2.s3, temp_3.s3), 0, output_k + out_offset_k + 3 * seqLenPackKV); + FLOAT4 key0 = (FLOAT4)(temp_0.s0, temp_1.s0, temp_2.s0, temp_3.s0); + FLOAT4 key1 = (FLOAT4)(temp_0.s1, temp_1.s1, temp_2.s1, temp_3.s1); + FLOAT4 key2 = (FLOAT4)(temp_0.s2, temp_1.s2, temp_2.s2, temp_3.s2); + FLOAT4 key3 = (FLOAT4)(temp_0.s3, temp_1.s3, temp_2.s3, temp_3.s3); + vstore4(key0, 0, output_k + out_offset_k); + vstore4(key1, 0, output_k + out_offset_k + seqLenPackKV); + vstore4(key2, 0, output_k + out_offset_k + 2 * seqLenPackKV); + vstore4(key3, 0, output_k + out_offset_k + 3 * seqLenPackKV); // pastK - vstore4(temp_0, 0, past_k + in_offset_kv); - if(sl * 4 + 1 < seqLenKV) { - vstore4(temp_1, 0, past_k + in_offset_kv + headNum*headDim/group); - } - if(sl * 4 + 2 < seqLenKV) { - vstore4(temp_2, 0, past_k + in_offset_kv + 2*headNum*headDim/group); - } - if(sl * 4 + 3 < seqLenKV) { - vstore4(temp_3, 0, past_k + in_offset_kv + 3*headNum*headDim/group); - } + vstore4(key0, 0, past_k + past_offset_k); + vstore4(key1, 0, past_k + past_offset_k + maxLenKV); + vstore4(key2, 0, past_k + past_offset_k + 2*maxLenKV); + vstore4(key3, 0, past_k + past_offset_k + 3*maxLenKV); } } @@ -177,16 +185,10 @@ __kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS vstore4(temp_3, 0, output_v + out_offset_v + 3 * headDimPackV); // pastV - vstore4(temp_0, 0, past_v + in_offset_kv); - if(sl * 4 + 1 < seqLenKV) { - vstore4(temp_1, 0, past_v + in_offset_kv + headNum*headDim/group); - } - if(sl * 4 + 2 < seqLenKV) { - vstore4(temp_2, 0, past_v + in_offset_kv + 2*headNum*headDim/group); - } - if(sl * 4 + 3 < seqLenKV) { - vstore4(temp_3, 0, past_v + in_offset_kv + 3*headNum*headDim/group); - } + vstore4(temp_0, 0, past_v + past_offset_v); + vstore4(temp_1, 0, past_v + past_offset_v + headDim); + vstore4(temp_2, 0, past_v + past_offset_v + 2*headDim); + vstore4(temp_3, 0, past_v + past_offset_v + 3*headDim); } } @@ -296,499 +298,483 @@ __kernel void qkv_transpose_output(GLOBAL_SIZE_3_DIMS #define NUMHEAD_GROUP_SIZE 1 #endif -__kernel void matmul_qk_div_mask(GLOBAL_SIZE_3_DIMS - __global const FLOAT *input0, // query [1 query_seq_len head_num head_dim] - __global const FLOAT *input1, // key [1 key_seq_len head_num head_dim] - __global FLOAT *output, // prefill [1 head_num query_seq_len key_seq_len] decode[1 head_num key_seq_len/4 4] - __global FLOAT *past_key, // [1 max_length head_num head_dim] +__kernel void rearrange_q(GLOBAL_SIZE_3_DIMS + __global const FLOAT *query, // [1 query_seq_len head_num head_dim] + __global FLOAT *query_tmp, // [1 
head_num head_dim key_seq_len4] + __private const int seq_len, + __private const int head_dim, + __private const int head_num) { + + const int x = get_global_id(0); // query_seq_len + const int y = get_global_id(1); // head_dim + const int z = get_global_id(2); // head_num + DEAL_NON_UNIFORM_DIM3(x, y, z); + + const int x4 = x << 2; + const int y4 = y << 2; + const int seq_len4 = (seq_len + 3) / 4 * 4;; + const int stride = head_num * head_dim; + int query_offset = (x4 * head_num + z) * head_dim + y4; + FLOAT4 query_vec0 = vload4(0, query + query_offset); query_offset += stride; + FLOAT4 query_vec1 = (x4 + 1 >= seq_len) ? (FLOAT4)0 : vload4(0, query + query_offset); query_offset += stride; + FLOAT4 query_vec2 = (x4 + 2 >= seq_len) ? (FLOAT4)0 : vload4(0, query + query_offset); query_offset += stride; + FLOAT4 query_vec3 = (x4 + 3 >= seq_len) ? (FLOAT4)0 : vload4(0, query + query_offset); + + const int queryout_offset = (z * head_dim + y4) * seq_len4 + x4; + vstore4((FLOAT4)(query_vec0.s0, query_vec1.s0, query_vec2.s0, query_vec3.s0), 0, query_tmp + queryout_offset); + vstore4((FLOAT4)(query_vec0.s1, query_vec1.s1, query_vec2.s1, query_vec3.s1), 0, query_tmp + queryout_offset + seq_len4); + vstore4((FLOAT4)(query_vec0.s2, query_vec1.s2, query_vec2.s2, query_vec3.s2), 0, query_tmp + queryout_offset + seq_len4 + seq_len4); + vstore4((FLOAT4)(query_vec0.s3, query_vec1.s3, query_vec2.s3, query_vec3.s3), 0, query_tmp + queryout_offset + seq_len4 + seq_len4 + seq_len4); +} + +__kernel void rearrange_k(GLOBAL_SIZE_3_DIMS + __global const FLOAT *key, // [1 key_seq_len kv_head_num head_dim] + __global FLOAT *past_key, // [1 kv_head_num head_dim max_length] + __private const int past_len, // prefill = 0, decode = past_key len + __private const int max_len, + __private const int seq_len, + __private const int kv_head_num, + __private const int head_num, + __private const int head_dim) { + + const int x = get_global_id(0); // seq_len decode = 1 + const int y = get_global_id(1); // head_dim + const int z = get_global_id(2); // kv_head_num + DEAL_NON_UNIFORM_DIM3(x, y, z); + + const int y4 = y << 2; + +#ifdef OPENCL_PREFILL_ATTENTION + const int x4 = x << 2; + const int stride = kv_head_num * head_dim; + int key_offset = (x4 * kv_head_num + z) * head_dim + y4; + FLOAT4 key_vec0 = vload4(0, key + key_offset); key_offset += stride; + FLOAT4 key_vec1 = (x4 + 1 >= seq_len) ? (FLOAT4)0 : vload4(0, key + key_offset); key_offset += stride; + FLOAT4 key_vec2 = (x4 + 2 >= seq_len) ? (FLOAT4)0 : vload4(0, key + key_offset); key_offset += stride; + FLOAT4 key_vec3 = (x4 + 3 >= seq_len) ? 
(FLOAT4)0 : vload4(0, key + key_offset); + const int output_offset = (z * head_dim + y4) * max_len + past_len + x4; + vstore4((FLOAT4)(key_vec0.s0, key_vec1.s0, key_vec2.s0, key_vec3.s0), 0, past_key + output_offset); + vstore4((FLOAT4)(key_vec0.s1, key_vec1.s1, key_vec2.s1, key_vec3.s1), 0, past_key + output_offset + max_len); + vstore4((FLOAT4)(key_vec0.s2, key_vec1.s2, key_vec2.s2, key_vec3.s2), 0, past_key + output_offset + max_len + max_len); + vstore4((FLOAT4)(key_vec0.s3, key_vec1.s3, key_vec2.s3, key_vec3.s3), 0, past_key + output_offset + max_len + max_len + max_len); +#else + FLOAT4 key_vec = vload4(0, key + z * head_dim + y4); + const int output_offset = (z * head_dim + y4) * max_len + past_len - 1; + past_key[output_offset] = key_vec.s0; + past_key[output_offset + max_len] = key_vec.s1; + past_key[output_offset + max_len + max_len] = key_vec.s2; + past_key[output_offset + max_len + max_len + max_len] = key_vec.s3; +#endif +} + +__kernel void rearrange_v(GLOBAL_SIZE_3_DIMS + __global const FLOAT *value, // [1 value_seq_len kv_head_num head_dim] + __global FLOAT *past_value, // [1 kv_head_num max_length head_dim] + __private const int past_len, + __private const int max_len, + __private const int seq_len, + __private const int kv_head_num, + __private const int head_dim) { + + const int x = get_global_id(0); // head_dim + const int y = get_global_id(1); // seq_len decode = 1 + const int z = get_global_id(2); // kv_head_num + DEAL_NON_UNIFORM_DIM3(x, y, z); + + const int x4 = x << 2; + +#ifdef OPENCL_PREFILL_ATTENTION + const int y4 = y << 2; + const int stride = kv_head_num * head_dim; + int value_offset = (y4 * kv_head_num + z) * head_dim + x4; + FLOAT4 value_vec0 = vload4(0, value + value_offset); value_offset += stride; + FLOAT4 value_vec1 = (y4 + 1 >= seq_len) ? (FLOAT4)0 : vload4(0, value + value_offset); value_offset += stride; + FLOAT4 value_vec2 = (y4 + 2 >= seq_len) ? (FLOAT4)0 : vload4(0, value + value_offset); value_offset += stride; + FLOAT4 value_vec3 = (y4 + 3 >= seq_len) ? 
(FLOAT4)0 : vload4(0, value + value_offset); + const int output_offset = (z * max_len + past_len + y4) * head_dim + x4; + vstore4(value_vec0, 0, past_value + output_offset); + vstore4(value_vec1, 0, past_value + output_offset + head_dim); + vstore4(value_vec2, 0, past_value + output_offset + head_dim + head_dim); + vstore4(value_vec3, 0, past_value + output_offset + head_dim + head_dim + head_dim); +#else + FLOAT4 value_vec = vload4(0, value + z * head_dim + x4); + const int output_offset = (z * max_len + past_len - 1) * head_dim + x4; + vstore4(value_vec, 0, past_value + output_offset); +#endif +} + +__kernel void matmul_qk_div_mask_prefill(GLOBAL_SIZE_3_DIMS + __global const FLOAT *query, // [1 head_num head_dim query_seq_len] + __global const FLOAT *past_key, // [1 head_num head_dim max_length] #ifdef ADD_MASK __global const FLOAT* mask, #else __global const int* mask, // [1 1 query_seq_len key_seq_len] #endif + __global FLOAT *qk, // [1 head_num key_seq_len query_seq_len] __private const float scale, __private const int query_seq_len, __private const int key_seq_len, + __private const int max_len, __private const int head_num, - __private const int kv_head_num, __private const int head_dim) { - const int x = get_global_id(0); // key_seq_len - const int y = get_global_id(1); // query_seq_len for prefill 1 for decode + const int x = get_global_id(0); // query_seq_len + const int y = get_global_id(1); // key_seq_len const int z = get_global_id(2); // head_num DEAL_NON_UNIFORM_DIM3(x, y, z); + const int x4 = x << 2; + const int y4 = y << 2; - int x4 = x << 2; - int y4 = y << 2; - int zin = z / NUMHEAD_GROUP_SIZE; - __global const FLOAT *A_offset = input0 + (y4 * head_num + z) * head_dim; - __global FLOAT *Pastkey_offset = past_key + (x4 * kv_head_num + zin) * head_dim; - int strideA = head_num * head_dim; - int strideB = kv_head_num * head_dim; -#ifdef OPENCL_PREFILL_ATTENTION - __global const FLOAT *B_offset = input1 + (x4 * kv_head_num + zin) * head_dim; - int output_offset = (z * query_seq_len + y4) * key_seq_len + x4; - float4 out0 = 0; - float4 out1 = 0; - float4 out2 = 0; - float4 out3 = 0; - - bool A1_enable = y4 + 1 < query_seq_len; - bool A2_enable = y4 + 2 < query_seq_len; - bool A3_enable = y4 + 3 < query_seq_len; - - bool B1_enable = x4 + 1 < key_seq_len; - bool B2_enable = x4 + 2 < key_seq_len; - bool B3_enable = x4 + 3 < key_seq_len; - - const int head_dim4 = (head_dim + 3) / 4; - #ifdef HEADDIM_LEAVE - for(int i = 0; i < head_dim4 - 1; ++i){ - float4 A0 = convert_float4(vload4(i, A_offset)); - float4 A1 = A1_enable ? convert_float4(vload4(i, A_offset + strideA)) : (float4)0; - float4 A2 = A2_enable ? convert_float4(vload4(i, A_offset + strideA + strideA)) : (float4)0; - float4 A3 = A3_enable ? convert_float4(vload4(i, A_offset + strideA + strideA + strideA)) : (float4)0; - float4 B0 = convert_float4(vload4(i, B_offset)); - float4 B1 = B1_enable ? convert_float4(vload4(i, B_offset + strideB)) : (float4)0; - float4 B2 = B2_enable ? convert_float4(vload4(i, B_offset + strideB + strideB)) : (float4)0; - float4 B3 = B3_enable ? 
convert_float4(vload4(i, B_offset + strideB + strideB + strideB)) : (float4)0; - - out0.x += dot(A0, B0); - out0.y += dot(A0, B1); - out0.z += dot(A0, B2); - out0.w += dot(A0, B3); - - out1.x += dot(A1, B0); - out1.y += dot(A1, B1); - out1.z += dot(A1, B2); - out1.w += dot(A1, B3); - - out2.x += dot(A2, B0); - out2.y += dot(A2, B1); - out2.z += dot(A2, B2); - out2.w += dot(A2, B3); - - out3.x += dot(A3, B0); - out3.y += dot(A3, B1); - out3.z += dot(A3, B2); - out3.w += dot(A3, B3); - - vstore4(CONVERT_FLOAT4(B0), i, Pastkey_offset); - vstore4(CONVERT_FLOAT4(B1), i, Pastkey_offset + strideB); - vstore4(CONVERT_FLOAT4(B2), i, Pastkey_offset + strideB + strideB); - vstore4(CONVERT_FLOAT4(B3), i, Pastkey_offset + strideB + strideB + strideB); - } - for(int i = (head_dim4 - 1) * 4; i < head_dim; ++i){ - float A0 = A_offset[i]; - float A1 = A1_enable ? A_offset[i + strideA] : 0; - float A2 = A2_enable ? A_offset[i + strideA + strideA] : 0; - float A3 = A3_enable ? A_offset[i + strideA + strideA + strideA] : 0; - float B0 = B_offset[i]; - float B1 = B1_enable ? B_offset[i + strideB] : 0; - float B2 = B2_enable ? B_offset[i + strideB + strideB] : 0; - float B3 = B3_enable ? B_offset[i + strideB + strideB + strideB] : 0; - - out0.x += A0 * B0; - out0.y += A0 * B1; - out0.z += A0 * B2; - out0.w += A0 * B3; - - out1.x += A1 * B0; - out1.y += A1 * B1; - out1.z += A1 * B2; - out1.w += A1 * B3 - - out2.x += A2 * B0; - out2.y += A2 * B1; - out2.z += A2 * B2; - out2.w += A2 * B3; - - out3.x += A3 * B0; - out3.y += A3 * B1; - out3.z += A3 * B2; - out3.w += A3 * B3; - - Pastkey_offset[i] = (FLOAT)B0; - Pastkey_offset[i + strideB] = (FLOAT)B1; - Pastkey_offset[i + strideB + strideB] = (FLOAT)B2; - Pastkey_offset[i + strideB + strideB + strideB] = (FLOAT)B3; - } - #else - for(int i = 0; i < head_dim4; ++i){ - float4 A0 = convert_float4(vload4(i, A_offset)); - float4 A1 = A1_enable ? convert_float4(vload4(i, A_offset + strideA)) : (float4)0; - float4 A2 = A2_enable ? convert_float4(vload4(i, A_offset + strideA + strideA)) : (float4)0; - float4 A3 = A3_enable ? convert_float4(vload4(i, A_offset + strideA + strideA + strideA)) : (float4)0; - float4 B0 = convert_float4(vload4(i, B_offset)); - float4 B1 = B1_enable ? convert_float4(vload4(i, B_offset + strideB)) : (float4)0; - float4 B2 = B2_enable ? convert_float4(vload4(i, B_offset + strideB + strideB)) : (float4)0; - float4 B3 = B3_enable ? 
convert_float4(vload4(i, B_offset + strideB + strideB + strideB)) : (float4)0; - - out0.x += dot(A0, B0); - out0.y += dot(A0, B1); - out0.z += dot(A0, B2); - out0.w += dot(A0, B3); + const int query_seq_len4 = (query_seq_len + 3) / 4 * 4;; + const int query_offset = z * head_dim * query_seq_len4 + x4; + const int past_offset = (z / NUMHEAD_GROUP_SIZE) * head_dim * max_len + y4; + float4 out0 = 0, out1 = 0, out2 = 0, out3 = 0; + + for(int i = 0; i < head_dim / 4; ++i){ + int i4 = i << 2; + float4 query_vec0 = convert_float4(vload4(0, query + query_offset + i4 * query_seq_len4)); + float4 query_vec1 = convert_float4(vload4(0, query + query_offset + (i4 + 1) * query_seq_len4)); + float4 query_vec2 = convert_float4(vload4(0, query + query_offset + (i4 + 2) * query_seq_len4)); + float4 query_vec3 = convert_float4(vload4(0, query + query_offset + (i4 + 3) * query_seq_len4)); - out1.x += dot(A1, B0); - out1.y += dot(A1, B1); - out1.z += dot(A1, B2); - out1.w += dot(A1, B3); + float4 past_vec0 = convert_float4(vload4(0, past_key + past_offset + i4 * max_len)); + float4 past_vec1 = convert_float4(vload4(0, past_key + past_offset + (i4 + 1) * max_len)); + float4 past_vec2 = convert_float4(vload4(0, past_key + past_offset + (i4 + 2) * max_len)); + float4 past_vec3 = convert_float4(vload4(0, past_key + past_offset + (i4 + 3) * max_len)); + + out0 = mad((float4)past_vec0.s0, query_vec0, out0); + out0 = mad((float4)past_vec1.s0, query_vec1, out0); + out0 = mad((float4)past_vec2.s0, query_vec2, out0); + out0 = mad((float4)past_vec3.s0, query_vec3, out0); - out2.x += dot(A2, B0); - out2.y += dot(A2, B1); - out2.z += dot(A2, B2); - out2.w += dot(A2, B3); + out1 = mad((float4)past_vec0.s1, query_vec0, out1); + out1 = mad((float4)past_vec1.s1, query_vec1, out1); + out1 = mad((float4)past_vec2.s1, query_vec2, out1); + out1 = mad((float4)past_vec3.s1, query_vec3, out1); - out3.x += dot(A3, B0); - out3.y += dot(A3, B1); - out3.z += dot(A3, B2); - out3.w += dot(A3, B3); + out2 = mad((float4)past_vec0.s2, query_vec0, out2); + out2 = mad((float4)past_vec1.s2, query_vec1, out2); + out2 = mad((float4)past_vec2.s2, query_vec2, out2); + out2 = mad((float4)past_vec3.s2, query_vec3, out2); - vstore4(CONVERT_FLOAT4(B0), i, Pastkey_offset); - vstore4(CONVERT_FLOAT4(B1), i, Pastkey_offset + strideB); - vstore4(CONVERT_FLOAT4(B2), i, Pastkey_offset + strideB + strideB); - vstore4(CONVERT_FLOAT4(B3), i, Pastkey_offset + strideB + strideB + strideB); + out3 = mad((float4)past_vec0.s3, query_vec0, out3); + out3 = mad((float4)past_vec1.s3, query_vec1, out3); + out3 = mad((float4)past_vec2.s3, query_vec2, out3); + out3 = mad((float4)past_vec3.s3, query_vec3, out3); } - #endif out0 *= (float4)scale; out1 *= (float4)scale; out2 *= (float4)scale; out3 *= (float4)scale; - float4 mask0 = convert_float4(vload4(0, mask + y4 * key_seq_len + x4)); - float4 mask1 = convert_float4(vload4(0, mask + (y4 + 1) * key_seq_len + x4)); - float4 mask2 = convert_float4(vload4(0, mask + (y4 + 2) * key_seq_len + x4)); - float4 mask3 = convert_float4(vload4(0, mask + (y4 + 3) * key_seq_len + x4)); - #ifdef ADD_MASK - out0 += mask0; - out1 += mask1; - out2 += mask2; - out3 += mask3; - #else - out0 = (mask0 == (float4)0) ? (float4)(-FLT_MAX) : out0; - out1 = (mask1 == (float4)0) ? (float4)(-FLT_MAX) : out1; - out2 = (mask2 == (float4)0) ? (float4)(-FLT_MAX) : out2; - out3 = (mask3 == (float4)0) ? 
(float4)(-FLT_MAX) : out3; - #endif - if(B3_enable){ - vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset); - if(!A1_enable) return; - output_offset += key_seq_len; - vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset); - if(!A2_enable) return; - output_offset += key_seq_len; - vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset); - if(!A3_enable) return; - output_offset += key_seq_len; - vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset); - } else if(B2_enable){ - vstore3(CONVERT_FLOAT3((float3)(out0.x, out0.y, out0.z)), 0, output + output_offset); - if(!A1_enable) return; - output_offset += key_seq_len; - vstore3(CONVERT_FLOAT3((float3)(out1.x, out1.y, out1.z)), 0, output + output_offset); - if(!A2_enable) return; - output_offset += key_seq_len; - vstore3(CONVERT_FLOAT3((float3)(out2.x, out2.y, out2.z)), 0, output + output_offset); - if(!A3_enable) return; - output_offset += key_seq_len; - vstore3(CONVERT_FLOAT3((float3)(out3.x, out3.y, out3.z)), 0, output + output_offset); - } else if(B1_enable){ - vstore2(CONVERT_FLOAT2((float2)(out0.x, out0.y)), 0, output + output_offset); - if(!A1_enable) return; - output_offset += key_seq_len; - vstore2(CONVERT_FLOAT2((float2)(out1.x, out1.y)), 0, output + output_offset); - if(!A2_enable) return; - output_offset += key_seq_len; - vstore2(CONVERT_FLOAT2((float2)(out2.x, out2.y)), 0, output + output_offset); - if(!A3_enable) return; - output_offset += key_seq_len; - vstore2(CONVERT_FLOAT2((float2)(out3.x, out3.y)), 0, output + output_offset); - } else { - output[output_offset] = out0.x; - if(!A1_enable) return; - output[output_offset + key_seq_len] = out1.x; - if(!A2_enable) return; - output[output_offset + key_seq_len + key_seq_len] = out2.x; - if(!A3_enable) return; - output[output_offset + key_seq_len + key_seq_len + key_seq_len] = out3.x; - } -#else - float4 out = 0; - const int head_dim4 = (head_dim + 3) / 4; - int key_seq_len4 = (key_seq_len + 3) / 4; - #ifdef HEADDIM_LEAVE - for(int i = 0; i < head_dim4 - 1; ++i){ - float4 A = convert_float4(vload4(i, A_offset)); - float4 B0 = convert_float4(vload4(i, Pastkey_offset)); - float4 B1 = convert_float4(vload4(i, Pastkey_offset + strideB)); - float4 B2 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB)); - float4 B3 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB + strideB)); - - out.x += dot(A, B0); - out.y += dot(A, B1); - out.z += dot(A, B2); - out.w += dot(A, B3); - } - for(int i = (head_dim4 - 1) * 4; i < head_dim; ++i){ - float A = A_offset[i]; - float B0 = Pastkey_offset[i]; - float B1 = Pastkey_offset[i + strideB]; - float B2 = Pastkey_offset[i + strideB + strideB]; - float B3 = Pastkey_offset[i + strideB + strideB]; - out.x += A * B0; - out.y += A * B1; - out.z += A * B2; - out.w += A * B3; - } - #else - for(int i = 0; i < head_dim4; ++i){ - float4 A = convert_float4(vload4(i, A_offset)); - float4 B0 = convert_float4(vload4(i, Pastkey_offset)); - float4 B1 = convert_float4(vload4(i, Pastkey_offset + strideB)); - float4 B2 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB)); - float4 B3 = convert_float4(vload4(i, Pastkey_offset + strideB + strideB + strideB)); - - out.x += dot(A, B0); - out.y += dot(A, B1); - out.z += dot(A, B2); - out.w += dot(A, B3); + { + int mask_offset = x4 * key_seq_len + y4; + float4 mask_tmp0 = convert_float4(vload4(0, mask + mask_offset)); mask_offset += key_seq_len; + float4 mask_tmp1 = (x4 + 1 >= query_seq_len) ? 
(float4)0 : convert_float4(vload4(0, mask + mask_offset)); mask_offset += key_seq_len; + float4 mask_tmp2 = (x4 + 2 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset)); mask_offset += key_seq_len; + float4 mask_tmp3 = (x4 + 3 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0, mask + mask_offset)); + float4 mask0 = (float4)(mask_tmp0.s0, mask_tmp1.s0, mask_tmp2.s0, mask_tmp3.s0); + float4 mask1 = (float4)(mask_tmp0.s1, mask_tmp1.s1, mask_tmp2.s1, mask_tmp3.s1); + float4 mask2 = (float4)(mask_tmp0.s2, mask_tmp1.s2, mask_tmp2.s2, mask_tmp3.s2); + float4 mask3 = (float4)(mask_tmp0.s3, mask_tmp1.s3, mask_tmp2.s3, mask_tmp3.s3); + #ifdef ADD_MASK + out0 += mask0; + out1 += mask1; + out2 += mask2; + out3 += mask3; + #else + out0 = (mask0 == (float4)0) ? (float4)(-FLT_MAX) : out0; + out1 = (mask1 == (float4)0) ? (float4)(-FLT_MAX) : out1; + out2 = (mask2 == (float4)0) ? (float4)(-FLT_MAX) : out2; + out3 = (mask3 == (float4)0) ? (float4)(-FLT_MAX) : out3; + #endif } - #endif - int remain = key_seq_len - x4; - if(x == key_seq_len4 - 1){ - __global const FLOAT *B_offset = input1 + zin * head_dim; - Pastkey_offset += (remain - 1) * strideB; - float tmp = 0; - #ifdef HEADDIM_LEAVE - for(int i = 0; i < head_dim4 - 1; ++i){ - float4 A = convert_float4(vload4(i, A_offset)); - float4 B = convert_float4(vload4(i, B_offset)); + + const int qk_offset = (z * key_seq_len + y4) * query_seq_len4 + x4; + vstore4(CONVERT_FLOAT4(out0), 0, qk + qk_offset); + if(y4 + 1 >= key_seq_len) return; + vstore4(CONVERT_FLOAT4(out1), 0, qk + qk_offset + query_seq_len4); + if(y4 + 2 >= key_seq_len) return; + vstore4(CONVERT_FLOAT4(out2), 0, qk + qk_offset + query_seq_len4 + query_seq_len4); + if(y4 + 3 >= key_seq_len) return; + vstore4(CONVERT_FLOAT4(out3), 0, qk + qk_offset + query_seq_len4 + query_seq_len4 + query_seq_len4); +} + +__kernel void matmul_qk_decode(GLOBAL_SIZE_2_DIMS + __global const FLOAT *query, // key [1 head_num head_dim] + __global const FLOAT *past_key, // [1 head_num head_dim max_length] + __global FLOAT *qk, // [1 head_num key_seq_len 1] + __private const float scale, + __private const int seq_len, + __private const int max_len, + __private const int head_num, + __private const int head_dim) { + + const int x = get_global_id(0); // key_seq_len + const int y = get_global_id(1); // head_num + DEAL_NON_UNIFORM_DIM2(x, y); + const int x4 = x << 2; + + const int query_offset = y * head_dim; + const int past_offset = (y / NUMHEAD_GROUP_SIZE) * head_dim * max_len + x4; + float4 out0 = 0; + + for(int i = 0; i < head_dim / 4; ++i){ + int i4 = i << 2; + float4 query_vec = convert_float4(vload4(0, query + query_offset + i4)); - tmp += dot(A, B); - vstore4(CONVERT_FLOAT4(B), i, Pastkey_offset); - } - for(int i = (head_dim4 - 1) * 4; i < head_dim; ++i){ - float A = A_offset[i]; - float B = B_offset[i]; - tmp += A * B; - Pastkey_offset[i] = B; - } - #else - for(int i = 0; i < head_dim4; ++i){ - float4 A = convert_float4(vload4(i, A_offset)); - float4 B = convert_float4(vload4(i, B_offset)); + float4 past_vec0 = convert_float4(vload4(0, past_key + past_offset + i4 * max_len)); + float4 past_vec1 = convert_float4(vload4(0, past_key + past_offset + (i4 + 1) * max_len)); + float4 past_vec2 = convert_float4(vload4(0, past_key + past_offset + (i4 + 2) * max_len)); + float4 past_vec3 = convert_float4(vload4(0, past_key + past_offset + (i4 + 3) * max_len)); - tmp += dot(A, B); - vstore4(CONVERT_FLOAT4(B), i, Pastkey_offset); - } - #endif - float *out_ptr = (float*)&out; - out_ptr[remain - 1] = 
tmp; + out0 = mad((float4)query_vec.s0, past_vec0, out0); + out0 = mad((float4)query_vec.s1, past_vec1, out0); + out0 = mad((float4)query_vec.s2, past_vec2, out0); + out0 = mad((float4)query_vec.s3, past_vec3, out0); } - out *= (float4)scale; - if(remain >= 4){ - vstore4(CONVERT_FLOAT4(out), 0, output + z * key_seq_len + x4); - } else if (remain >= 3){ - vstore3(CONVERT_FLOAT3((float3)(out.x, out.y, out.z)), 0, output + z * key_seq_len + x4); - } else if (remain >= 2){ - vstore2(CONVERT_FLOAT2((float2)(out.x, out.y)), 0, output + z * key_seq_len + x4); - } else { - output[z * key_seq_len + x4] = out.x; + out0 *= (float4)scale; + const int qk_offset = y * seq_len + x4; + if(x4 + 3 < seq_len){ + vstore4(CONVERT_FLOAT4(out0), 0, qk + qk_offset); + }else { + int remain = seq_len - x4; + if(remain == 3){ + vstore3(CONVERT_FLOAT3((float3)(out0.s012)), 0, qk + qk_offset); + } else if(remain == 2){ + vstore2(CONVERT_FLOAT2((float2)(out0.s01)), 0, qk + qk_offset); + }else if(remain == 1){ + qk[qk_offset] = out0.s0; + } } -#endif } -__kernel void matmul_qkv(GLOBAL_SIZE_3_DIMS - __global const FLOAT *input0, // qk prefill [1 head_num qk_seq_len value_seq_len] decode[1 head_num value_seq_len] - __global const FLOAT *input1, // [1 value_seq_len head_num head_dim] - __global FLOAT *output, // [1 qk_seq_len head_num head_dim] - __global FLOAT *past_value, // [1 value_seq_len head_num head_dim] +__kernel void matmul_qkv_prefill(GLOBAL_SIZE_3_DIMS + __global const FLOAT *qk, // qk prefill [1 head_num qk_seq_len value_seq_len] + __global const FLOAT *past_value, // [1 head_num max_len head_dim] + __global FLOAT *output, // [1 value_seq_len head_num head_dim] __private const int qk_seq_len, __private const int value_seq_len, + __private const int max_len, __private const int head_num, __private const int kv_head_num, __private const int head_dim) { - const int x = get_global_id(0); // head_dim << 2 - const int y = get_global_id(1); // head_num - const int z = get_global_id(2); // prefill qk_seq_len decode 1 + const int x = get_global_id(0); // head_dim + const int y = get_global_id(1); // qk_seq_len + const int z = get_global_id(2); // head_num - const int x4 = x << 2; DEAL_NON_UNIFORM_DIM3(x, y, z); + const int x8 = x << 3; + const int y4 = y << 2; - const int yin = y / NUMHEAD_GROUP_SIZE; -#ifdef OPENCL_PREFILL_ATTENTION - int z4 = z << 2; - int value_seq_len4 = (value_seq_len + 3) / 4; - int loop_end = max(value_seq_len4 - 1, 0); - const int stride = kv_head_num * head_dim; - __global const FLOAT *A_offset = input0 + (y * qk_seq_len + z4) * value_seq_len; - __global const FLOAT *B_offset = input1 + yin * head_dim + x4; - __global FLOAT *Pastvalue_offset = past_value + yin * head_dim + x4; - COMPUTE_FLOAT4 out0 = 0; - COMPUTE_FLOAT4 out1 = 0; - COMPUTE_FLOAT4 out2 = 0; - COMPUTE_FLOAT4 out3 = 0; + const int qk_seq_len4 = (qk_seq_len + 3) / 4 * 4; + const int qk_offset = z * value_seq_len * qk_seq_len4 + y4; + const int past_offset = ((z / NUMHEAD_GROUP_SIZE) * max_len) * head_dim + x8; + const int loop_end = max(value_seq_len / 4 - 1, 0); + COMPUTE_FLOAT8 out0 = 0, out1 = 0, out2 = 0, out3 = 0; for(int i = 0; i < loop_end; ++i){ - int index = i << 2; - COMPUTE_FLOAT4 A0 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset)); - COMPUTE_FLOAT4 A1 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset + value_seq_len)); - COMPUTE_FLOAT4 A2 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset + value_seq_len + value_seq_len)); - COMPUTE_FLOAT4 A3 = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset + value_seq_len + value_seq_len + 
value_seq_len)); - COMPUTE_FLOAT4 B0 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 0) * stride)); - COMPUTE_FLOAT4 B1 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 1) * stride)); - COMPUTE_FLOAT4 B2 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 2) * stride)); - COMPUTE_FLOAT4 B3 = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + (index + 3) * stride)); + int i4 = i << 2; + COMPUTE_FLOAT4 qk_vec0 = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + i4 * qk_seq_len4)); + COMPUTE_FLOAT4 qk_vec1 = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + (i4 + 1) * qk_seq_len4)); + COMPUTE_FLOAT4 qk_vec2 = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + (i4 + 2) * qk_seq_len4)); + COMPUTE_FLOAT4 qk_vec3 = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + (i4 + 3) * qk_seq_len4)); - out0 = mad(B0, (COMPUTE_FLOAT4)A0.x, out0); - out0 = mad(B1, (COMPUTE_FLOAT4)A0.y, out0); - out0 = mad(B2, (COMPUTE_FLOAT4)A0.z, out0); - out0 = mad(B3, (COMPUTE_FLOAT4)A0.w, out0); + COMPUTE_FLOAT8 past_vec0 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + i4 * head_dim)); + COMPUTE_FLOAT8 past_vec1 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i4 + 1) * head_dim)); + COMPUTE_FLOAT8 past_vec2 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i4 + 2) * head_dim)); + COMPUTE_FLOAT8 past_vec3 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i4 + 3) * head_dim)); - out1 = mad(B0, (COMPUTE_FLOAT4)A1.x, out1); - out1 = mad(B1, (COMPUTE_FLOAT4)A1.y, out1); - out1 = mad(B2, (COMPUTE_FLOAT4)A1.z, out1); - out1 = mad(B3, (COMPUTE_FLOAT4)A1.w, out1); + out0 = mad((COMPUTE_FLOAT8)qk_vec0.s0, past_vec0, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec1.s0, past_vec1, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec2.s0, past_vec2, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec3.s0, past_vec3, out0); - out2 = mad(B0, (COMPUTE_FLOAT4)A2.x, out2); - out2 = mad(B1, (COMPUTE_FLOAT4)A2.y, out2); - out2 = mad(B2, (COMPUTE_FLOAT4)A2.z, out2); - out2 = mad(B3, (COMPUTE_FLOAT4)A2.w, out2); + out1 = mad((COMPUTE_FLOAT8)qk_vec0.s1, past_vec0, out1); + out1 = mad((COMPUTE_FLOAT8)qk_vec1.s1, past_vec1, out1); + out1 = mad((COMPUTE_FLOAT8)qk_vec2.s1, past_vec2, out1); + out1 = mad((COMPUTE_FLOAT8)qk_vec3.s1, past_vec3, out1); - out3 = mad(B0, (COMPUTE_FLOAT4)A3.x, out3); - out3 = mad(B1, (COMPUTE_FLOAT4)A3.y, out3); - out3 = mad(B2, (COMPUTE_FLOAT4)A3.z, out3); - out3 = mad(B3, (COMPUTE_FLOAT4)A3.w, out3); - vstore4(CONVERT_FLOAT4(B0), 0, Pastvalue_offset + (index + 0) * stride); - vstore4(CONVERT_FLOAT4(B1), 0, Pastvalue_offset + (index + 1) * stride); - vstore4(CONVERT_FLOAT4(B2), 0, Pastvalue_offset + (index + 2) * stride); - vstore4(CONVERT_FLOAT4(B3), 0, Pastvalue_offset + (index + 3) * stride); + out2 = mad((COMPUTE_FLOAT8)qk_vec0.s2, past_vec0, out2); + out2 = mad((COMPUTE_FLOAT8)qk_vec1.s2, past_vec1, out2); + out2 = mad((COMPUTE_FLOAT8)qk_vec2.s2, past_vec2, out2); + out2 = mad((COMPUTE_FLOAT8)qk_vec3.s2, past_vec3, out2); + + out3 = mad((COMPUTE_FLOAT8)qk_vec0.s3, past_vec0, out3); + out3 = mad((COMPUTE_FLOAT8)qk_vec1.s3, past_vec1, out3); + out3 = mad((COMPUTE_FLOAT8)qk_vec2.s3, past_vec2, out3); + out3 = mad((COMPUTE_FLOAT8)qk_vec3.s3, past_vec3, out3); } - for(int i = loop_end << 2; i < value_seq_len; ++i){ - COMPUTE_FLOAT A0 = A_offset[i]; - COMPUTE_FLOAT A1 = A_offset[i + value_seq_len]; - COMPUTE_FLOAT A2 = A_offset[i + value_seq_len + value_seq_len]; - COMPUTE_FLOAT A3 = A_offset[i + value_seq_len + value_seq_len + value_seq_len]; - COMPUTE_FLOAT4 B = 
CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset + i * stride)); + for(int i = (loop_end << 2); i < value_seq_len; ++i){ + COMPUTE_FLOAT4 qk_vec = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + i * qk_seq_len4)); + COMPUTE_FLOAT8 past_vec = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + i * head_dim)); - out0 = mad(B, (COMPUTE_FLOAT4)A0, out0); - out1 = mad(B, (COMPUTE_FLOAT4)A1, out1); - out2 = mad(B, (COMPUTE_FLOAT4)A2, out2); - out3 = mad(B, (COMPUTE_FLOAT4)A3, out3); - vstore4(CONVERT_FLOAT4(B), 0, Pastvalue_offset + i * stride); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s0, past_vec, out0); + out1 = mad((COMPUTE_FLOAT8)qk_vec.s1, past_vec, out1); + out2 = mad((COMPUTE_FLOAT8)qk_vec.s2, past_vec, out2); + out3 = mad((COMPUTE_FLOAT8)qk_vec.s3, past_vec, out3); } - #ifdef HEADDIM_LEAVE - int remain = head_dim - x4; - int output_offset = (z4 * head_num + y) * head_dim + x4; - if(remain >= 4){ - vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset); - } else if(remain == 3){ - vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out0.x, out0.y, out0.z)), 0, output + output_offset); - } else if(remain == 2){ - vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out0.x, out0.y)), 0, output + output_offset); - } else{ - output[output_offset] = out0.x; + const int output_offset = (y4 * head_num + z) * head_dim + x8; + const int stride = head_num * head_dim; + vstore8(CONVERT_FLOAT8(out0), 0, output + output_offset); + if(y4 + 1 >= qk_seq_len) return; + vstore8(CONVERT_FLOAT8(out1), 0, output + output_offset + stride); + if(y4 + 2 >= qk_seq_len) return; + vstore8(CONVERT_FLOAT8(out2), 0, output + output_offset + stride + stride); + if(y4 + 3 >= qk_seq_len) return; + vstore8(CONVERT_FLOAT8(out3), 0, output + output_offset + stride + stride + stride); +} + + +__kernel void matmul_qkv_decode_b8(GLOBAL_SIZE_2_DIMS + __global const FLOAT *qk, // qk [1 head_num qk_seq_len 1] + __global const FLOAT *past_value, // [1 head_num max_len head_dim] + __global FLOAT *output, // [1 1 head_num head_dim] + __private const int qk_seq_len, + __private const int max_len, + __private const int head_num, + __private const int kv_head_num, + __private const int head_dim) { + + const int x = get_global_id(0); // head_dim + const int y = get_global_id(1); // head_num + + DEAL_NON_UNIFORM_DIM2(x, y); + const int x8 = x << 3; + + const int qk_offset = y * qk_seq_len; + const int past_offset = ((y / NUMHEAD_GROUP_SIZE) * max_len) * head_dim + x8; + COMPUTE_FLOAT8 out0 = 0; + #ifdef LOOP_UNROLL_4 + const int loop_end = max((qk_seq_len + 3) / 4 - 1, 0); + for(int i = 0; i < loop_end; ++i){ + int i4 = i << 2; + COMPUTE_FLOAT4 qk_vec = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + i4)); + + COMPUTE_FLOAT8 past_vec0 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + i4 * head_dim)); + COMPUTE_FLOAT8 past_vec1 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i4 + 1) * head_dim)); + COMPUTE_FLOAT8 past_vec2 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i4 + 2) * head_dim)); + COMPUTE_FLOAT8 past_vec3 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i4 + 3) * head_dim)); + + out0 = mad((COMPUTE_FLOAT8)qk_vec.s0, past_vec0, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s1, past_vec1, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s2, past_vec2, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s3, past_vec3, out0); } - if(z4 + 1 >= qk_seq_len) return; - output_offset += head_num * head_dim; - if(remain >= 4){ - vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset); - } else if(remain == 3){ - 
vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out1.x, out1.y, out1.z)), 0, output + output_offset); - } else if(remain == 2){ - vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out1.x, out1.y)), 0, output + output_offset); - } else{ - output[output_offset] = out1.x; + for(int i = (loop_end << 2); i < qk_seq_len; ++i){ + COMPUTE_FLOAT qk_vec = qk[qk_offset + i]; + COMPUTE_FLOAT8 past_vec = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + i * head_dim)); + out0 = mad((COMPUTE_FLOAT8)qk_vec, past_vec, out0); } - if(z4 + 2 >= qk_seq_len) return; - output_offset += head_num * head_dim; - if(remain >= 4){ - vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset); - } else if(remain == 3){ - vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out2.x, out2.y, out2.z)), 0, output + output_offset); - } else if(remain == 2){ - vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out2.x, out2.y)), 0, output + output_offset); - } else{ - output[output_offset] = out2.x; + #elif (defined LOOP_UNROLL_8) + const int loop_end = max((qk_seq_len + 7) / 8 - 1, 0); + for(int i = 0; i < loop_end; ++i){ + int i8 = i << 3; + COMPUTE_FLOAT8 qk_vec = CONVERT_COMPUTE_FLOAT8(vload8(0, qk + qk_offset + i8)); + + COMPUTE_FLOAT8 past_vec0 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + i8 * head_dim)); + COMPUTE_FLOAT8 past_vec1 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 1) * head_dim)); + COMPUTE_FLOAT8 past_vec2 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 2) * head_dim)); + COMPUTE_FLOAT8 past_vec3 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 3) * head_dim)); + COMPUTE_FLOAT8 past_vec4 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 4) * head_dim)); + COMPUTE_FLOAT8 past_vec5 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 5) * head_dim)); + COMPUTE_FLOAT8 past_vec6 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 6) * head_dim)); + COMPUTE_FLOAT8 past_vec7 = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + (i8 + 7) * head_dim)); + + out0 = mad((COMPUTE_FLOAT8)qk_vec.s0, past_vec0, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s1, past_vec1, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s2, past_vec2, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s3, past_vec3, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s4, past_vec4, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s5, past_vec5, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s6, past_vec6, out0); + out0 = mad((COMPUTE_FLOAT8)qk_vec.s7, past_vec7, out0); } - if(z4 + 3 >= qk_seq_len) return; - output_offset += head_num * head_dim; - if(remain >= 4){ - vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset); - } else if(remain == 3){ - vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out3.x, out3.y, out3.z)), 0, output + output_offset); - } else if(remain == 2){ - vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out3.x, out3.y)), 0, output + output_offset); - } else{ - output[(x * head_num + y) * head_dim + z4] = out3.x; + for(int i = (loop_end << 3); i < qk_seq_len; ++i){ + COMPUTE_FLOAT qk_vec = qk[qk_offset + i]; + COMPUTE_FLOAT8 past_vec = CONVERT_COMPUTE_FLOAT8(vload8(0, past_value + past_offset + i * head_dim)); + out0 = mad((COMPUTE_FLOAT8)qk_vec, past_vec, out0); } - #else - int output_offset = (z4 * head_num + y) * head_dim + x4; - vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset); - if(z4 + 1 >= qk_seq_len) return; - output_offset += head_num * head_dim; - vstore4(CONVERT_FLOAT4(out1), 0, output + output_offset); - if(z4 + 2 >= qk_seq_len) return; - output_offset 
+= head_num * head_dim; - vstore4(CONVERT_FLOAT4(out2), 0, output + output_offset); - if(z4 + 3 >= qk_seq_len) return; - output_offset += head_num * head_dim; - vstore4(CONVERT_FLOAT4(out3), 0, output + output_offset); #endif + + const int output_offset = y * head_dim + x8; + vstore8(CONVERT_FLOAT8(out0), 0, output + output_offset); +} -#else - int value_seq_len4 = (value_seq_len - 1 + 3) / 4; - int loop_end = max(value_seq_len4 - 1, 0); - const int stride = kv_head_num * head_dim; - __global const FLOAT *A_offset = input0 + y * value_seq_len; - __global const FLOAT *B_offset = input1 + yin * head_dim + x4; - __global FLOAT *Pastvalue_offset = past_value + yin * head_dim + x4; - COMPUTE_FLOAT4 out = 0; - - for(int i = 0; i < loop_end; i++){ - int index = i << 2; - COMPUTE_FLOAT4 A = CONVERT_COMPUTE_FLOAT4(vload4(i, A_offset)); - COMPUTE_FLOAT4 B0 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 0) * stride)); - COMPUTE_FLOAT4 B1 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 1) * stride)); - COMPUTE_FLOAT4 B2 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 2) * stride)); - COMPUTE_FLOAT4 B3 = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + (index + 3) * stride)); +__kernel void matmul_qkv_decode_b4(GLOBAL_SIZE_2_DIMS + __global const FLOAT *qk, // qk [1 head_num qk_seq_len 1] + __global const FLOAT *past_value, // [1 head_num max_len head_dim] + __global FLOAT *output, // [1 1 head_num head_dim] + __private const int qk_seq_len, + __private const int max_len, + __private const int head_num, + __private const int kv_head_num, + __private const int head_dim) { + + const int x = get_global_id(0); // head_dim + const int y = get_global_id(1); // head_num + + DEAL_NON_UNIFORM_DIM2(x, y); + const int x4 = x << 2; + + const int qk_offset = y * qk_seq_len; + const int past_offset = ((y / NUMHEAD_GROUP_SIZE) * max_len) * head_dim + x4; + COMPUTE_FLOAT4 out0 = 0; + #ifdef LOOP_UNROLL_4 + const int loop_end = max((qk_seq_len + 3) / 4 - 1, 0); + for(int i = 0; i < loop_end; ++i){ + int i4 = i << 2; + COMPUTE_FLOAT4 qk_vec = CONVERT_COMPUTE_FLOAT4(vload4(0, qk + qk_offset + i4)); - out = mad(B0, (COMPUTE_FLOAT4)A.x, out); - out = mad(B1, (COMPUTE_FLOAT4)A.y, out); - out = mad(B2, (COMPUTE_FLOAT4)A.z, out); - out = mad(B3, (COMPUTE_FLOAT4)A.w, out); + COMPUTE_FLOAT4 past_vec0 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + i4 * head_dim)); + COMPUTE_FLOAT4 past_vec1 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i4 + 1) * head_dim)); + COMPUTE_FLOAT4 past_vec2 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i4 + 2) * head_dim)); + COMPUTE_FLOAT4 past_vec3 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i4 + 3) * head_dim)); + + out0 = mad((COMPUTE_FLOAT4)qk_vec.s0, past_vec0, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s1, past_vec1, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s2, past_vec2, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s3, past_vec3, out0); + } + for(int i = (loop_end << 2); i < qk_seq_len; ++i){ + COMPUTE_FLOAT qk_vec = qk[qk_offset + i]; + COMPUTE_FLOAT4 past_vec = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + i * head_dim)); + out0 = mad((COMPUTE_FLOAT4)qk_vec, past_vec, out0); } - for(int i = loop_end << 2; i < value_seq_len - 1; i++){ - COMPUTE_FLOAT A = A_offset[i]; - COMPUTE_FLOAT4 B = CONVERT_COMPUTE_FLOAT4(vload4(0, Pastvalue_offset + i * stride)); + #elif (defined LOOP_UNROLL_8) + const int loop_end = max((qk_seq_len + 7) / 8 - 1, 0); + for(int i = 0; i < 
loop_end; ++i){ + int i8 = i << 3; + COMPUTE_FLOAT8 qk_vec = CONVERT_COMPUTE_FLOAT8(vload8(0, qk + qk_offset + i8)); + + COMPUTE_FLOAT4 past_vec0 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + i8 * head_dim)); + COMPUTE_FLOAT4 past_vec1 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 1) * head_dim)); + COMPUTE_FLOAT4 past_vec2 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 2) * head_dim)); + COMPUTE_FLOAT4 past_vec3 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 3) * head_dim)); + COMPUTE_FLOAT4 past_vec4 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 4) * head_dim)); + COMPUTE_FLOAT4 past_vec5 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 5) * head_dim)); + COMPUTE_FLOAT4 past_vec6 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 6) * head_dim)); + COMPUTE_FLOAT4 past_vec7 = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + (i8 + 7) * head_dim)); - out = mad(B, (COMPUTE_FLOAT4)A, out); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s0, past_vec0, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s1, past_vec1, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s2, past_vec2, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s3, past_vec3, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s4, past_vec4, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s5, past_vec5, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s6, past_vec6, out0); + out0 = mad((COMPUTE_FLOAT4)qk_vec.s7, past_vec7, out0); } - COMPUTE_FLOAT A = A_offset[value_seq_len - 1]; - COMPUTE_FLOAT4 B = CONVERT_COMPUTE_FLOAT4(vload4(0, B_offset)); - out = mad(B, (COMPUTE_FLOAT4)A, out); - - #ifdef HEADDIM_LEAVE - int remain = head_dim - x4; - if(remain >= 4){ - vstore4(CONVERT_FLOAT4(out), 0, output + y * head_dim + x4); - vstore4(CONVERT_FLOAT4(B), 0, Pastvalue_offset + (value_seq_len - 1) * stride); - } else if(remain == 3){ - vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out.x, out.y, out.z)), 0, output + y * head_dim + x4); - vstore3(CONVERT_FLOAT4((COMPUTE_FLOAT3)(B.x, B.y, B.z)), 0, Pastvalue_offset + (value_seq_len - 1) * stride); - } else if(remain == 2){ - vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out.x, out.y)), 0, output + y * head_dim + x4); - vstore2(CONVERT_FLOAT4((COMPUTE_FLOAT3)(B.x, B.y)), 0, Pastvalue_offset + (value_seq_len - 1) * stride); - } else{ - output[(x * head_num + y) * head_dim + x4] = out.x; - Pastvalue_offset[(value_seq_len - 1) * stride] = B.x; + for(int i = (loop_end << 3); i < qk_seq_len; ++i){ + COMPUTE_FLOAT qk_vec = qk[qk_offset + i]; + COMPUTE_FLOAT4 past_vec = CONVERT_COMPUTE_FLOAT4(vload4(0, past_value + past_offset + i * head_dim)); + out0 = mad((COMPUTE_FLOAT4)qk_vec, past_vec, out0); } - #else - vstore4(CONVERT_FLOAT4(B), 0, Pastvalue_offset + (value_seq_len - 1) * stride); - vstore4(CONVERT_FLOAT4(out), 0, output + y * head_dim + x4); #endif -#endif + const int output_offset = y * head_dim + x4; + vstore4(CONVERT_FLOAT4(out0), 0, output + output_offset); } diff --git a/source/backend/opencl/execution/cl/conv_2d_buf.cl b/source/backend/opencl/execution/cl/conv_2d_buf.cl index 07f8d96fe..d3d34e5f4 100644 --- a/source/backend/opencl/execution/cl/conv_2d_buf.cl +++ b/source/backend/opencl/execution/cl/conv_2d_buf.cl @@ -38,7 +38,7 @@ void conv_2d_1x1_local(__private const int out_w_blocks, COMPUTE_FLOAT4 out0 = (COMPUTE_FLOAT4)0; int offset = out_c_idx*4; - int inp_offset = (((out_b_idx+in_c_block*batch)*out_h + out_h_idx)* out_w + out_w_idx) << 2; + int inp_offset = ((out_b_idx*out_h + out_h_idx)* 
out_w + out_w_idx) << 2; const int inp_add = batch*out_h*out_w*4; for (ushort in_channel_block_idx = lid; in_channel_block_idx < in_c_block; in_channel_block_idx+=CONV_LOCAL_SIZE) { diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index ed7ee0cc6..f8f68df1c 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -1838,7 +1838,7 @@ const char* select = "#else\n" " FLOAT4 in1=RI_F(input1,SAMPLER,(int2)(idx,idy));\n" "#endif\n" -" FLOAT4 out=select(in1,in0,select_vec == (int4)1);\n" +" FLOAT4 out=select(in1,in0,CONVERT_FLOAT4(select_vec) == (FLOAT4)(1));\n" " WI_F(output,(int2)(idx,idy),out);\n" "}\n" ; @@ -2039,6 +2039,7 @@ const char* self_attention_buf = " const int outside=get_global_id(2);\n" " DEAL_NON_UNIFORM_DIM3(inside,axis,outside);\n" " const int offset=(outside*shape.y+axis)*shape.z+0;\n" +"#if SOFTMAX_LOCAL_SIZE >= 4\n" " int lid=get_local_id(0);\n" " float local sum[SOFTMAX_LOCAL_SIZE];\n" " /*Compute Max */\n" @@ -2091,6 +2092,40 @@ const char* self_attention_buf = " #endif\n" " }\n" " }\n" +"#else\n" +" /*Compute Max */\n" +" float maxValue=(float)(-FLT_MAX);\n" +" // clip to seq_len\n" +" for (int i=0; iinside_len){\n" +" for(int i=inside_len; i [N Y X]\n" "__kernel void trans_3d_buf(GLOBAL_SIZE_3_DIMS\n" @@ -11253,6 +11288,8 @@ const char* attention_buf = "#endif\n" "#define GLOBAL_SIZE_3_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,__private const int global_size_dim2,\n" "#define DEAL_NON_UNIFORM_DIM3(input1, input2, input3) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1 || input3 >= global_size_dim2) { "" return; "" }\n" +"#define GLOBAL_SIZE_2_DIMS "" __private const int global_size_dim0,__private const int global_size_dim1,\n" +"#define DEAL_NON_UNIFORM_DIM2(input1, input2) "" if (input1 >= global_size_dim0 || input2 >= global_size_dim1) { "" return; "" }\n" "#define DEAL_OUTER_SEQLEN_NOT_ALIGN(length) "" if(4 * sl + 3 >= length) {"" temp_3 = (FLOAT4)0;"" }"" if(4 * sl + 2 >= length) {"" temp_2 = (FLOAT4)0;"" }"" if(4 * sl + 1 >= length) {"" temp_1 = (FLOAT4)0;"" }\n" "#define DEAL_INNER_HEADDIM_NOT_ALIGN(length) "" if(hd * 4 + 3 >= length) {"" temp_0.w = (FLOAT)0;"" temp_1.w = (FLOAT)0;"" temp_2.w = (FLOAT)0;"" temp_3.w = (FLOAT)0;"" }"" if(hd * 4 + 2 >= length) {"" temp_0.z = (FLOAT)0;"" temp_1.z = (FLOAT)0;"" temp_2.z = (FLOAT)0;"" temp_3.z = (FLOAT)0;"" }"" if(hd * 4 + 1 >= length) {"" temp_0.y = (FLOAT)0;"" temp_1.y = (FLOAT)0;"" temp_2.y = (FLOAT)0;"" temp_3.y = (FLOAT)0;"" }\n" "__kernel void rearrange_qkv(GLOBAL_SIZE_3_DIMS\n" @@ -11262,11 +11299,11 @@ const char* attention_buf = " __global FLOAT *output_q,// [batch*headNum,ROUND_UP(headDim,mTileHDK),ROUND_UP(seqLenQ,mTileQ)]\n" " __global FLOAT *output_k,// [batch*headNum/group,ROUND_UP(headDim,mTileHDK),ROUND_UP(seqLenKV,mTileKV)]\n" " __global FLOAT *output_v,// [batch*headNum/group,ROUND_UP(seqLenKV,mTileKV),ROUND_UP(headDim,mTileHDN)]\n" -" __global FLOAT *past_k,// [batch,seqLenKV/4,headNum/group,headDim,seqLenKV_4]\n" -" __global FLOAT *past_v,// [batch,seqLenKV/4,headNum/group,headDim,seqLenKV_4]\n" +" __global FLOAT *past_k,// [batch,headNum/group,headDim,seqLenKV_4]\n" +" __global FLOAT *past_v,// [batch,headNum/group,seqLenKV_4,headDim]\n" " __private const int4 tile,// [mTileQ,mTileKV,mTileHDK,mTileHDN]\n" " __private const int4 shape,// [seqLenQ,seqLenKV,headNum,headDim]\n" -" __private const int4 param 
// [group,batch]\n" +" __private const int4 param // [group,batch,max_len,past_len]\n" ") {\n" " const int sl=get_global_id(0); // seqLen/4 : max(seqLenPackQ/4,seqLenPackKV/4)\n" " const int hd=get_global_id(1); // headDim/4 : max(headDimPackQK/4,headDimPackV/4)\n" @@ -11279,6 +11316,7 @@ const char* attention_buf = " const int headDim=shape.w;\n" " const int group=param.x;\n" " const int batch=param.y;\n" +" const int maxLenKV=param.z;\n" " const int b=z % batch;\n" " const int hn=z/batch;\n" " \n" @@ -11321,7 +11359,8 @@ const char* attention_buf = " const int headDimPackV=((headDim+tile.w-1)/tile.w)*tile.w;\n" " const int seqLenKV_4=(seqLenKV+3)/4;\n" " const int in_offset_kv=(((b*seqLenKV+sl*4)*headNum/group+hn)*headDim+4*hd);\n" -" \n" +" const int past_offset_k=(((b*headNum/group+hn)*headDim+hd*4)*maxLenKV+sl*4);\n" +" const int past_offset_v=(((b*headNum/group+hn)*maxLenKV+sl*4)*headDim+4*hd);\n" " if(sl*4= seqLenKV || hd*4 >= headDim) {\n" @@ -11340,22 +11379,20 @@ const char* attention_buf = " #ifdef SEQLEN_LEAVE\n" " DEAL_OUTER_SEQLEN_NOT_ALIGN(seqLenKV)\n" " #endif\n" -" vstore4((FLOAT4)(temp_0.s0,temp_1.s0,temp_2.s0,temp_3.s0),0,output_k+out_offset_k);\n" -" vstore4((FLOAT4)(temp_0.s1,temp_1.s1,temp_2.s1,temp_3.s1),0,output_k+out_offset_k+seqLenPackKV);\n" -" vstore4((FLOAT4)(temp_0.s2,temp_1.s2,temp_2.s2,temp_3.s2),0,output_k+out_offset_k+2*seqLenPackKV);\n" -" vstore4((FLOAT4)(temp_0.s3,temp_1.s3,temp_2.s3,temp_3.s3),0,output_k+out_offset_k+3*seqLenPackKV);\n" +" FLOAT4 key0=(FLOAT4)(temp_0.s0,temp_1.s0,temp_2.s0,temp_3.s0);\n" +" FLOAT4 key1=(FLOAT4)(temp_0.s1,temp_1.s1,temp_2.s1,temp_3.s1);\n" +" FLOAT4 key2=(FLOAT4)(temp_0.s2,temp_1.s2,temp_2.s2,temp_3.s2);\n" +" FLOAT4 key3=(FLOAT4)(temp_0.s3,temp_1.s3,temp_2.s3,temp_3.s3);\n" +" vstore4(key0,0,output_k+out_offset_k);\n" +" vstore4(key1,0,output_k+out_offset_k+seqLenPackKV);\n" +" vstore4(key2,0,output_k+out_offset_k+2*seqLenPackKV);\n" +" vstore4(key3,0,output_k+out_offset_k+3*seqLenPackKV);\n" " \n" " // pastK\n" -" vstore4(temp_0,0,past_k+in_offset_kv);\n" -" if(sl*4+1= seq_len) ? (FLOAT4)0 : vload4(0,query+query_offset); query_offset += stride;\n" +" FLOAT4 query_vec2=(x4+2 >= seq_len) ? (FLOAT4)0 : vload4(0,query+query_offset); query_offset += stride;\n" +" FLOAT4 query_vec3=(x4+3 >= seq_len) ? 
(FLOAT4)0 : vload4(0,query+query_offset);\n" +" \n" +" const int queryout_offset=(z*head_dim+y4)*seq_len4+x4;\n" +" vstore4((FLOAT4)(query_vec0.s0,query_vec1.s0,query_vec2.s0,query_vec3.s0),0,query_tmp+queryout_offset);\n" +" vstore4((FLOAT4)(query_vec0.s1,query_vec1.s1,query_vec2.s1,query_vec3.s1),0,query_tmp+queryout_offset+seq_len4);\n" +" vstore4((FLOAT4)(query_vec0.s2,query_vec1.s2,query_vec2.s2,query_vec3.s2),0,query_tmp+queryout_offset+seq_len4+seq_len4);\n" +" vstore4((FLOAT4)(query_vec0.s3,query_vec1.s3,query_vec2.s3,query_vec3.s3),0,query_tmp+queryout_offset+seq_len4+seq_len4+seq_len4);\n" +"}\n" +"__kernel void rearrange_k(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT *key,// [1 key_seq_len kv_head_num head_dim]\n" +" __global FLOAT *past_key,// [1 kv_head_num head_dim max_length]\n" +" __private const int past_len,// prefill=0,decode=past_key len\n" +" __private const int max_len,\n" +" __private const int seq_len,\n" +" __private const int kv_head_num,\n" +" __private const int head_num,\n" +" __private const int head_dim) {\n" +" \n" +" const int x=get_global_id(0); // seq_len decode=1\n" +" const int y=get_global_id(1); // head_dim\n" +" const int z=get_global_id(2); // kv_head_num\n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" \n" +" const int y4=y << 2;\n" +" \n" +"#ifdef OPENCL_PREFILL_ATTENTION\n" +" const int x4=x << 2;\n" +" const int stride=kv_head_num*head_dim;\n" +" int key_offset=(x4*kv_head_num+z)*head_dim+y4;\n" +" FLOAT4 key_vec0=vload4(0,key+key_offset); key_offset += stride;\n" +" FLOAT4 key_vec1=(x4+1 >= seq_len) ? (FLOAT4)0 : vload4(0,key+key_offset); key_offset += stride;\n" +" FLOAT4 key_vec2=(x4+2 >= seq_len) ? (FLOAT4)0 : vload4(0,key+key_offset); key_offset += stride;\n" +" FLOAT4 key_vec3=(x4+3 >= seq_len) ? (FLOAT4)0 : vload4(0,key+key_offset);\n" +" const int output_offset=(z*head_dim+y4)*max_len+past_len+x4;\n" +" vstore4((FLOAT4)(key_vec0.s0,key_vec1.s0,key_vec2.s0,key_vec3.s0),0,past_key+output_offset);\n" +" vstore4((FLOAT4)(key_vec0.s1,key_vec1.s1,key_vec2.s1,key_vec3.s1),0,past_key+output_offset+max_len);\n" +" vstore4((FLOAT4)(key_vec0.s2,key_vec1.s2,key_vec2.s2,key_vec3.s2),0,past_key+output_offset+max_len+max_len);\n" +" vstore4((FLOAT4)(key_vec0.s3,key_vec1.s3,key_vec2.s3,key_vec3.s3),0,past_key+output_offset+max_len+max_len+max_len);\n" +"#else\n" +" FLOAT4 key_vec=vload4(0,key+z*head_dim+y4);\n" +" const int output_offset=(z*head_dim+y4)*max_len+past_len-1;\n" +" past_key[output_offset]=key_vec.s0;\n" +" past_key[output_offset+max_len]=key_vec.s1;\n" +" past_key[output_offset+max_len+max_len]=key_vec.s2;\n" +" past_key[output_offset+max_len+max_len+max_len]=key_vec.s3;\n" +"#endif\n" +"}\n" +"__kernel void rearrange_v(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT *value,// [1 value_seq_len kv_head_num head_dim]\n" +" __global FLOAT *past_value,// [1 kv_head_num max_length head_dim]\n" +" __private const int past_len,\n" +" __private const int max_len,\n" +" __private const int seq_len,\n" +" __private const int kv_head_num,\n" +" __private const int head_dim) {\n" +" \n" +" const int x=get_global_id(0); // head_dim\n" +" const int y=get_global_id(1); // seq_len decode=1\n" +" const int z=get_global_id(2); // kv_head_num\n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" \n" +" const int x4=x << 2;\n" +" \n" +"#ifdef OPENCL_PREFILL_ATTENTION\n" +" const int y4=y << 2;\n" +" const int stride=kv_head_num*head_dim;\n" +" int value_offset=(y4*kv_head_num+z)*head_dim+x4;\n" +" FLOAT4 value_vec0=vload4(0,value+value_offset); value_offset += stride;\n" +" 
FLOAT4 value_vec1=(y4+1 >= seq_len) ? (FLOAT4)0 : vload4(0,value+value_offset); value_offset += stride;\n" +" FLOAT4 value_vec2=(y4+2 >= seq_len) ? (FLOAT4)0 : vload4(0,value+value_offset); value_offset += stride;\n" +" FLOAT4 value_vec3=(y4+3 >= seq_len) ? (FLOAT4)0 : vload4(0,value+value_offset);\n" +" const int output_offset=(z*max_len+past_len+y4)*head_dim+x4;\n" +" vstore4(value_vec0,0,past_value+output_offset);\n" +" vstore4(value_vec1,0,past_value+output_offset+head_dim);\n" +" vstore4(value_vec2,0,past_value+output_offset+head_dim+head_dim);\n" +" vstore4(value_vec3,0,past_value+output_offset+head_dim+head_dim+head_dim);\n" +"#else\n" +" FLOAT4 value_vec=vload4(0,value+z*head_dim+x4);\n" +" const int output_offset=(z*max_len+past_len-1)*head_dim+x4;\n" +" vstore4(value_vec,0,past_value+output_offset);\n" +"#endif\n" +"}\n" +"__kernel void matmul_qk_div_mask_prefill(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT *query,// [1 head_num head_dim query_seq_len]\n" +" __global const FLOAT *past_key,// [1 head_num head_dim max_length]\n" " #ifdef ADD_MASK\n" " __global const FLOAT* mask,\n" " #else\n" " __global const int* mask,// [1 1 query_seq_len key_seq_len]\n" " #endif\n" +" __global FLOAT *qk,// [1 head_num key_seq_len query_seq_len]\n" " __private const float scale,\n" " __private const int query_seq_len,\n" " __private const int key_seq_len,\n" +" __private const int max_len,\n" " __private const int head_num,\n" -" __private const int kv_head_num,\n" " __private const int head_dim) {\n" " \n" -" const int x=get_global_id(0); // key_seq_len\n" -" const int y=get_global_id(1); // query_seq_len for prefill 1 for decode\n" +" const int x=get_global_id(0); // query_seq_len\n" +" const int y=get_global_id(1); // key_seq_len\n" " const int z=get_global_id(2); // head_num\n" " DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" const int x4=x << 2;\n" +" const int y4=y << 2;\n" " \n" -" int x4=x << 2;\n" -" int y4=y << 2;\n" -" int zin=z/NUMHEAD_GROUP_SIZE;\n" -" __global const FLOAT *A_offset=input0+(y4*head_num+z)*head_dim;\n" -" __global FLOAT *Pastkey_offset=past_key+(x4*kv_head_num+zin)*head_dim;\n" -" int strideA=head_num*head_dim;\n" -" int strideB=kv_head_num*head_dim;\n" -"#ifdef OPENCL_PREFILL_ATTENTION\n" -" __global const FLOAT *B_offset=input1+(x4*kv_head_num+zin)*head_dim;\n" -" int output_offset=(z*query_seq_len+y4)*key_seq_len+x4;\n" -" float4 out0=0;\n" -" float4 out1=0;\n" -" float4 out2=0;\n" -" float4 out3=0;\n" -" \n" -" bool A1_enable=y4+1= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset)); mask_offset += key_seq_len;\n" +" float4 mask_tmp2=(x4+2 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset)); mask_offset += key_seq_len;\n" +" float4 mask_tmp3=(x4+3 >= query_seq_len) ? (float4)0 : convert_float4(vload4(0,mask+mask_offset));\n" +" float4 mask0=(float4)(mask_tmp0.s0,mask_tmp1.s0,mask_tmp2.s0,mask_tmp3.s0);\n" +" float4 mask1=(float4)(mask_tmp0.s1,mask_tmp1.s1,mask_tmp2.s1,mask_tmp3.s1);\n" +" float4 mask2=(float4)(mask_tmp0.s2,mask_tmp1.s2,mask_tmp2.s2,mask_tmp3.s2);\n" +" float4 mask3=(float4)(mask_tmp0.s3,mask_tmp1.s3,mask_tmp2.s3,mask_tmp3.s3);\n" " #ifdef ADD_MASK\n" " out0 += mask0;\n" " out1 += mask1;\n" @@ -11667,325 +11711,289 @@ const char* attention_buf = " out2=(mask2 == (float4)0) ? (float4)(-FLT_MAX) : out2;\n" " out3=(mask3 == (float4)0) ? 
(float4)(-FLT_MAX) : out3;\n" " #endif\n" -" if(B3_enable){\n" -" vstore4(CONVERT_FLOAT4(out0),0,output+output_offset);\n" -" if(!A1_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset);\n" -" if(!A2_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset);\n" -" if(!A3_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset);\n" -" } else if(B2_enable){\n" -" vstore3(CONVERT_FLOAT3((float3)(out0.x,out0.y,out0.z)),0,output+output_offset);\n" -" if(!A1_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore3(CONVERT_FLOAT3((float3)(out1.x,out1.y,out1.z)),0,output+output_offset);\n" -" if(!A2_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore3(CONVERT_FLOAT3((float3)(out2.x,out2.y,out2.z)),0,output+output_offset);\n" -" if(!A3_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore3(CONVERT_FLOAT3((float3)(out3.x,out3.y,out3.z)),0,output+output_offset);\n" -" } else if(B1_enable){\n" -" vstore2(CONVERT_FLOAT2((float2)(out0.x,out0.y)),0,output+output_offset);\n" -" if(!A1_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore2(CONVERT_FLOAT2((float2)(out1.x,out1.y)),0,output+output_offset);\n" -" if(!A2_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore2(CONVERT_FLOAT2((float2)(out2.x,out2.y)),0,output+output_offset);\n" -" if(!A3_enable) return;\n" -" output_offset += key_seq_len;\n" -" vstore2(CONVERT_FLOAT2((float2)(out3.x,out3.y)),0,output+output_offset);\n" -" } else {\n" -" output[output_offset]=out0.x;\n" -" if(!A1_enable) return;\n" -" output[output_offset+key_seq_len]=out1.x;\n" -" if(!A2_enable) return;\n" -" output[output_offset+key_seq_len+key_seq_len]=out2.x;\n" -" if(!A3_enable) return;\n" -" output[output_offset+key_seq_len+key_seq_len+key_seq_len]=out3.x;\n" " }\n" -"#else\n" -" float4 out=0;\n" -" const int head_dim4=(head_dim+3)/4;\n" -" int key_seq_len4=(key_seq_len+3)/4;\n" -" #ifdef HEADDIM_LEAVE\n" -" for(int i=0; i= key_seq_len) return;\n" +" vstore4(CONVERT_FLOAT4(out1),0,qk+qk_offset+query_seq_len4);\n" +" if(y4+2 >= key_seq_len) return;\n" +" vstore4(CONVERT_FLOAT4(out2),0,qk+qk_offset+query_seq_len4+query_seq_len4);\n" +" if(y4+3 >= key_seq_len) return;\n" +" vstore4(CONVERT_FLOAT4(out3),0,qk+qk_offset+query_seq_len4+query_seq_len4+query_seq_len4);\n" +"}\n" +"__kernel void matmul_qk_decode(GLOBAL_SIZE_2_DIMS\n" +" __global const FLOAT *query,// key [1 head_num head_dim]\n" +" __global const FLOAT *past_key,// [1 head_num head_dim max_length]\n" +" __global FLOAT *qk,// [1 head_num key_seq_len 1]\n" +" __private const float scale,\n" +" __private const int seq_len,\n" +" __private const int max_len,\n" +" __private const int head_num,\n" +" __private const int head_dim) {\n" +" \n" +" const int x=get_global_id(0); // key_seq_len\n" +" const int y=get_global_id(1); // head_num\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" +" const int x4=x << 2;\n" " \n" -" tmp += dot(A,B);\n" -" vstore4(CONVERT_FLOAT4(B),i,Pastkey_offset);\n" +" const int query_offset=y*head_dim;\n" +" const int past_offset=(y/NUMHEAD_GROUP_SIZE)*head_dim*max_len+x4;\n" +" float4 out0=0;\n" +" \n" +" for(int i=0; i= 4){\n" -" vstore4(CONVERT_FLOAT4(out),0,output+z*key_seq_len+x4);\n" -" } else if (remain >= 3){\n" -" vstore3(CONVERT_FLOAT3((float3)(out.x,out.y,out.z)),0,output+z*key_seq_len+x4);\n" -" } else if (remain >= 2){\n" -" 
vstore2(CONVERT_FLOAT2((float2)(out.x,out.y)),0,output+z*key_seq_len+x4);\n" -" } else {\n" -" output[z*key_seq_len+x4]=out.x;\n" " }\n" -"#endif\n" "}\n" -"__kernel void matmul_qkv(GLOBAL_SIZE_3_DIMS\n" -" __global const FLOAT *input0,// qk prefill [1 head_num qk_seq_len value_seq_len] decode[1 head_num value_seq_len]\n" -" __global const FLOAT *input1,// [1 value_seq_len head_num head_dim]\n" -" __global FLOAT *output,// [1 qk_seq_len head_num head_dim]\n" -" __global FLOAT *past_value,// [1 value_seq_len head_num head_dim]\n" +"__kernel void matmul_qkv_prefill(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT *qk,// qk prefill [1 head_num qk_seq_len value_seq_len]\n" +" __global const FLOAT *past_value,// [1 head_num max_len head_dim]\n" +" __global FLOAT *output,// [1 value_seq_len head_num head_dim]\n" " __private const int qk_seq_len,\n" " __private const int value_seq_len,\n" +" __private const int max_len,\n" " __private const int head_num,\n" " __private const int kv_head_num,\n" " __private const int head_dim) {\n" " \n" -" const int x=get_global_id(0); // head_dim << 2\n" -" const int y=get_global_id(1); // head_num\n" -" const int z=get_global_id(2); // prefill qk_seq_len decode 1\n" +" const int x=get_global_id(0); // head_dim\n" +" const int y=get_global_id(1); // qk_seq_len\n" +" const int z=get_global_id(2); // head_num\n" " \n" -" const int x4=x << 2;\n" " DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" const int x8=x << 3;\n" +" const int y4=y << 2;\n" " \n" -" const int yin=y/NUMHEAD_GROUP_SIZE;\n" -"#ifdef OPENCL_PREFILL_ATTENTION\n" -" int z4=z << 2;\n" -" int value_seq_len4=(value_seq_len+3)/4;\n" -" int loop_end=max(value_seq_len4-1,0);\n" -" const int stride=kv_head_num*head_dim;\n" -" __global const FLOAT *A_offset=input0+(y*qk_seq_len+z4)*value_seq_len;\n" -" __global const FLOAT *B_offset=input1+yin*head_dim+x4;\n" -" __global FLOAT *Pastvalue_offset=past_value+yin*head_dim+x4;\n" -" COMPUTE_FLOAT4 out0=0;\n" -" COMPUTE_FLOAT4 out1=0;\n" -" COMPUTE_FLOAT4 out2=0;\n" -" COMPUTE_FLOAT4 out3=0;\n" +" const int qk_seq_len4=(qk_seq_len+3)/4*4;\n" +" const int qk_offset=z*value_seq_len*qk_seq_len4+y4;\n" +" const int past_offset=((z/NUMHEAD_GROUP_SIZE)*max_len)*head_dim+x8;\n" +" const int loop_end=max(value_seq_len/4-1,0);\n" +" COMPUTE_FLOAT8 out0=0,out1=0,out2=0,out3=0;\n" " \n" " for(int i=0; i= qk_seq_len) return;\n" +" vstore8(CONVERT_FLOAT8(out1),0,output+output_offset+stride);\n" +" if(y4+2 >= qk_seq_len) return;\n" +" vstore8(CONVERT_FLOAT8(out2),0,output+output_offset+stride+stride);\n" +" if(y4+3 >= qk_seq_len) return;\n" +" vstore8(CONVERT_FLOAT8(out3),0,output+output_offset+stride+stride+stride);\n" +"}\n" +"__kernel void matmul_qkv_decode_b8(GLOBAL_SIZE_2_DIMS\n" +" __global const FLOAT *qk,// qk [1 head_num qk_seq_len 1]\n" +" __global const FLOAT *past_value,// [1 head_num max_len head_dim]\n" +" __global FLOAT *output,// [1 1 head_num head_dim]\n" +" __private const int qk_seq_len,\n" +" __private const int max_len,\n" +" __private const int head_num,\n" +" __private const int kv_head_num,\n" +" __private const int head_dim) {\n" " \n" -" #ifdef HEADDIM_LEAVE\n" -" int remain=head_dim-x4;\n" -" int output_offset=(z4*head_num+y)*head_dim+x4;\n" -" if(remain >= 4){\n" -" vstore4(CONVERT_FLOAT4(out0),0,output+output_offset);\n" -" } else if(remain == 3){\n" -" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out0.x,out0.y,out0.z)),0,output+output_offset);\n" -" } else if(remain == 2){\n" -" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out0.x,out0.y)),0,output+output_offset);\n" -" } 
else{\n" -" output[output_offset]=out0.x;\n" -" }\n" -" if(z4+1 >= qk_seq_len) return;\n" -" output_offset += head_num*head_dim;\n" -" if(remain >= 4){\n" -" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset);\n" -" } else if(remain == 3){\n" -" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out1.x,out1.y,out1.z)),0,output+output_offset);\n" -" } else if(remain == 2){\n" -" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out1.x,out1.y)),0,output+output_offset);\n" -" } else{\n" -" output[output_offset]=out1.x;\n" -" }\n" -" if(z4+2 >= qk_seq_len) return;\n" -" output_offset += head_num*head_dim;\n" -" if(remain >= 4){\n" -" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset);\n" -" } else if(remain == 3){\n" -" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out2.x,out2.y,out2.z)),0,output+output_offset);\n" -" } else if(remain == 2){\n" -" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out2.x,out2.y)),0,output+output_offset);\n" -" } else{\n" -" output[output_offset]=out2.x;\n" -" }\n" -" if(z4+3 >= qk_seq_len) return;\n" -" output_offset += head_num*head_dim;\n" -" if(remain >= 4){\n" -" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset);\n" -" } else if(remain == 3){\n" -" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out3.x,out3.y,out3.z)),0,output+output_offset);\n" -" } else if(remain == 2){\n" -" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out3.x,out3.y)),0,output+output_offset);\n" -" } else{\n" -" output[(x*head_num+y)*head_dim+z4]=out3.x;\n" -" }\n" -" #else\n" -" int output_offset=(z4*head_num+y)*head_dim+x4;\n" -" vstore4(CONVERT_FLOAT4(out0),0,output+output_offset);\n" -" if(z4+1 >= qk_seq_len) return;\n" -" output_offset += head_num*head_dim;\n" -" vstore4(CONVERT_FLOAT4(out1),0,output+output_offset);\n" -" if(z4+2 >= qk_seq_len) return;\n" -" output_offset += head_num*head_dim;\n" -" vstore4(CONVERT_FLOAT4(out2),0,output+output_offset);\n" -" if(z4+3 >= qk_seq_len) return;\n" -" output_offset += head_num*head_dim;\n" -" vstore4(CONVERT_FLOAT4(out3),0,output+output_offset);\n" -" #endif\n" -"#else\n" -" int value_seq_len4=(value_seq_len-1+3)/4;\n" -" int loop_end=max(value_seq_len4-1,0);\n" -" const int stride=kv_head_num*head_dim;\n" -" __global const FLOAT *A_offset=input0+y*value_seq_len;\n" -" __global const FLOAT *B_offset=input1+yin*head_dim+x4;\n" -" __global FLOAT *Pastvalue_offset=past_value+yin*head_dim+x4;\n" -" COMPUTE_FLOAT4 out=0;\n" +" const int x=get_global_id(0); // head_dim\n" +" const int y=get_global_id(1); // head_num\n" " \n" -" for(int i=0; i= 4){\n" -" vstore4(CONVERT_FLOAT4(out),0,output+y*head_dim+x4);\n" -" vstore4(CONVERT_FLOAT4(B),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" -" } else if(remain == 3){\n" -" vstore3(CONVERT_FLOAT3((COMPUTE_FLOAT3)(out.x,out.y,out.z)),0,output+y*head_dim+x4);\n" -" vstore3(CONVERT_FLOAT4((COMPUTE_FLOAT3)(B.x,B.y,B.z)),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" -" } else if(remain == 2){\n" -" vstore2(CONVERT_FLOAT2((COMPUTE_FLOAT3)(out.x,out.y)),0,output+y*head_dim+x4);\n" -" vstore2(CONVERT_FLOAT4((COMPUTE_FLOAT3)(B.x,B.y)),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" -" } else{\n" -" output[(x*head_num+y)*head_dim+x4]=out.x;\n" -" Pastvalue_offset[(value_seq_len-1)*stride]=B.x;\n" -" }\n" -" #else\n" -" vstore4(CONVERT_FLOAT4(B),0,Pastvalue_offset+(value_seq_len-1)*stride);\n" -" vstore4(CONVERT_FLOAT4(out),0,output+y*head_dim+x4);\n" -" #endif\n" +" DEAL_NON_UNIFORM_DIM2(x,y);\n" +" const int x4=x << 2;\n" " \n" -"#endif\n" +" const int qk_offset=y*qk_seq_len;\n" +" const int 
past_offset=((y/NUMHEAD_GROUP_SIZE)*max_len)*head_dim+x4;\n" +" COMPUTE_FLOAT4 out0=0;\n" +" #ifdef LOOP_UNROLL_4\n" +" const int loop_end=max((qk_seq_len+3)/4-1,0);\n" +" for(int i=0; i= 4 int lid = get_local_id(0); float local sum[SOFTMAX_LOCAL_SIZE]; @@ -257,6 +258,41 @@ __kernel void softmax_inside(GLOBAL_SIZE_3_DIMS #endif } } +#else + /*Compute Max */ + float maxValue = (float)(-FLT_MAX); + // clip to seq_len + for (int i=0; i inside_len){ + for(int i = inside_len; i < shape.z; i++){ + #ifdef OUTPUT_TRANSPOSE + output[out_offset+ i*shape.y] = (FLOAT)0; + #else + output[offset+ i] = (FLOAT)0; + #endif + } + } +#endif } // [N X Y4 4] -> [N Y X] diff --git a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp index 717bab14a..ec8e9f3e2 100644 --- a/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/image/ConvLowMemoryExecution.cpp @@ -20,14 +20,17 @@ void ConvLowMemoryExecution::getInfoFromOpLowMemory(std::shared_ptrmInputChannel = quanCommon->weight.size() / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); // set mNumQuantBit if(quanCommon->canUseInt4){ mNumQuantBit = 4; - mResource->mInputChannel = (quanCommon->weight.size() * 2) / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); }else{ mNumQuantBit = 8; } + if (mOp->main_as_Convolution2D()->common()->inputCount() > 0) { + mResource->mInputChannel = mOp->main_as_Convolution2D()->common()->inputCount(); + } else { + mResource->mInputChannel = quanCommon->weight.size() / (mResource->mKernelWidth * mResource->mKernelHeight * mResource->mOutputChannel); + } // src of alpha in CPU float * dequantAlpha = quanCommon->alpha.get(); int totalCount = quanCommon->alpha.size(); diff --git a/source/backend/vulkan/buffer/execution/VulkanRaster.cpp b/source/backend/vulkan/buffer/execution/VulkanRaster.cpp index b53f5241d..83434ff2b 100644 --- a/source/backend/vulkan/buffer/execution/VulkanRaster.cpp +++ b/source/backend/vulkan/buffer/execution/VulkanRaster.cpp @@ -132,7 +132,6 @@ ErrorCode VulkanRaster::onEncode(const std::vector &____inputs, const break; } } - fast = false; if (fast) { onEncodeFast(output, output, cmdBuffer, needZero); return NO_ERROR; diff --git a/source/backend/vulkan/image/backend/VulkanBackend.cpp b/source/backend/vulkan/image/backend/VulkanBackend.cpp index 4be0fddb4..39799e3e0 100644 --- a/source/backend/vulkan/image/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/image/backend/VulkanBackend.cpp @@ -298,6 +298,10 @@ void VulkanBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso mHostBuffer->unmap(); auto key = std::make_tuple(TensorUtils::getDescribe(dstTensor), true, format); auto iter = mConverters.find(key); + if (iter != mConverters.end() && std::get<2>(iter->second).lock() == nullptr) { + mConverters.erase(iter); + iter = mConverters.end(); + } if (iter == mConverters.end()) { if (mConverters.size() > MNN_VULKAN_MAX_CACHE_CONVSIZE) { mConverters.clear(); @@ -317,10 +321,10 @@ void VulkanBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso vkTensor->image(i)->barrierRead(convertorBuffer->get()); } convertorBuffer->end(); - mConverters.insert(std::make_pair(key, std::make_pair(converter, convertorBuffer))); + mConverters.insert(std::make_pair(key, std::make_tuple(converter, convertorBuffer, std::weak_ptr(TensorUtils::getDescribeOrigin(dstTensor)->mContent)))); iter = mConverters.find(key); 
} - mCmdBuffers.push_back(iter->second.second->get()); + mCmdBuffers.push_back(std::get<1>(iter->second)->get()); if (TensorUtils::getDescribe(srcTensor)->isMutable == false) { _finish(); } @@ -333,6 +337,10 @@ void VulkanBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso auto key = std::make_tuple(TensorUtils::getDescribe(srcTensor), false, format); auto iter = mConverters.find(key); + if (iter != mConverters.end() && std::get<2>(iter->second).lock() == nullptr) { + mConverters.erase(iter); + iter = mConverters.end(); + } if (iter == mConverters.end()) { if (mConverters.size() > MNN_VULKAN_MAX_CACHE_CONVSIZE) { mConverters.clear(); @@ -345,10 +353,10 @@ void VulkanBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso format, convertorBuffer.get()); convertorBuffer->end(); - mConverters.insert(std::make_pair(key, std::make_pair(converter, convertorBuffer))); + mConverters.insert(std::make_pair(key, std::make_tuple(converter, convertorBuffer, std::weak_ptr(TensorUtils::getDescribeOrigin(srcTensor)->mContent)))); iter = mConverters.find(key); } - mCmdBuffers.push_back(iter->second.second->get()); + mCmdBuffers.push_back(std::get<1>(iter->second)->get()); _finish(); std::shared_ptr tempTensor(new Tensor); TensorUtils::copyShape(srcTensor, tempTensor.get(), true); diff --git a/source/backend/vulkan/image/backend/VulkanBackend.hpp b/source/backend/vulkan/image/backend/VulkanBackend.hpp index ef07c4c32..60949fdae 100644 --- a/source/backend/vulkan/image/backend/VulkanBackend.hpp +++ b/source/backend/vulkan/image/backend/VulkanBackend.hpp @@ -19,6 +19,8 @@ namespace MNN { class VulkanImageConverter; class VulkanBasicExecution; +typedef std::tuple VulkanTensorConvertKey; +typedef std::tuple, std::shared_ptr, std::weak_ptr> VulkanTensorConvertValue; class VulkanBackend : public Backend { public: @@ -94,9 +96,7 @@ class VulkanBackend : public Backend { mutable std::shared_ptr mFence; - mutable std::map, - std::pair, std::shared_ptr>> - mConverters; + mutable std::map mConverters; bool mDirect; const VulkanRuntime* mRuntime; diff --git a/source/backend/vulkan/image/compiler/AllShader.cpp b/source/backend/vulkan/image/compiler/AllShader.cpp index 843d6fe37..3de46fdd4 100644 --- a/source/backend/vulkan/image/compiler/AllShader.cpp +++ b/source/backend/vulkan/image/compiler/AllShader.cpp @@ -373,6 +373,1758 @@ const unsigned char glsl_deconvCol2Im_comp[] = { }; unsigned int glsl_deconvCol2Im_comp_len = 2292; +const unsigned char glsl_softmaxImage_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x7d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x05, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 
0x00, + 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x13, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x08, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x31, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x13, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x2b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, + 0x31, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2c, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x51, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x51, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x31, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x51, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x53, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 
0x00 +}; +unsigned int glsl_softmaxImage_comp_len = 744; + +const unsigned char glsl_softmaxImage_AXIS_N_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x12, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, + 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x7d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x43, 0x34, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x43, 0x4c, 0x65, 0x66, + 0x74, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, + 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, + 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x7d, 
0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xb9, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x81, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x87, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x07, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x3f, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0xff, 0xff, 0x7f, 0xff, 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x79, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x03, 
0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x03, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0xd8, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0xd8, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x7f, 0x01, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x80, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x7f, 0x01, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x80, 0x01, 0x00, 0x00, 0x81, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x76, 0x00, 0x00, 0x00, 0x87, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x9a, 
0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xd4, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, + 0xfb, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x98, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x9d, 0x01, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x9e, 0x01, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x9d, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xa2, 0x01, 0x00, 0x00, + 0x9e, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0xa3, 0x01, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, 0xa2, 0x01, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, + 0xb9, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0xbc, 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, 0x00, + 0xa3, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xc2, 0x00, 0x00, 0x00, + 0xc3, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xc3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xc8, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0xc8, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0xcb, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xca, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xa8, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa9, 0x01, 0x00, 0x00, 0xa8, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xef, 0x01, 0x00, 0x00, 0xa9, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xac, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xad, 0x01, 0x00, 0x00, + 0xac, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf1, 0x01, 0x00, 0x00, + 0xad, 0x01, 0x00, 0x00, 0xef, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xb1, 0x01, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf3, 0x01, 0x00, 0x00, 0xb1, 0x01, 0x00, 0x00, + 0xf1, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xb5, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0xf3, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0xb5, 0x01, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xd4, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfc, 0x01, 0x00, 0x00, + 0x11, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0xdc, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xe7, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xe7, 0x00, 0x00, 0x00, + 0xe2, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe2, 0x00, 0x00, 0x00, 0xb0, 
0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xea, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0xec, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xea, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0xec, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xf3, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0xd9, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xff, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xff, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xff, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, + 0x02, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x07, 0x01, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x07, 0x01, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x12, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x13, 0x01, 0x00, 0x00, + 0x12, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xbe, 0x01, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0xbd, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, + 0x13, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc3, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x01, 0x00, 0x00, + 0xbe, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0xc3, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xc8, 
0x01, 0x00, 0x00, 0xc5, 0x01, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x19, 0x01, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x19, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, + 0x1b, 0x01, 0x00, 0x00, 0xc8, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, + 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x25, 0x01, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x26, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x25, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x28, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x26, 0x01, 0x00, 0x00, + 0x27, 0x01, 0x00, 0x00, 0x28, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x27, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x01, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x28, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x28, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2c, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x2b, 0x01, 0x00, 0x00, 0x27, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x2e, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x2c, 0x01, 0x00, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x2e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x32, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x33, 0x01, 0x00, 0x00, 0x32, 0x01, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xcd, 0x01, 0x00, 0x00, 0x33, 0x01, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xce, 0x01, 0x00, 0x00, 0xcd, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xf5, 0x01, 0x00, 0x00, 0xce, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xd1, 0x01, 0x00, 0x00, 0x33, 0x01, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xd2, 0x01, 0x00, 0x00, + 0xd1, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf7, 0x01, 0x00, 0x00, + 0xd2, 0x01, 0x00, 0x00, 0xf5, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xd5, 0x01, 0x00, 0x00, + 0x33, 0x01, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, 0xd5, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf9, 0x01, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, + 0xf7, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xda, 
0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, + 0x28, 0x01, 0x00, 0x00, 0xda, 0x01, 0x00, 0x00, 0x2d, 0x01, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, + 0xff, 0x01, 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x02, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x02, 0x01, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, + 0xfe, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xff, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xff, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x00, 0x00, 0x55, 0x01, 0x00, 0x00, 0x41, 0x01, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x44, 0x01, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x40, 0x01, 0x00, 0x00, 0x41, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x44, 0x01, 0x00, 0x00, 0x3f, 0x01, 0x00, 0x00, + 0x40, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x3f, 0x01, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x47, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x49, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x47, 0x01, 0x00, 0x00, 0x48, 0x01, 0x00, 0x00, 0x49, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x48, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x4d, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0x4d, 0x01, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x01, 0x00, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x51, 0x01, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x52, 0x01, 0x00, 0x00, 0x51, 0x01, 0x00, 0x00, + 0x4f, 0x01, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0x52, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x49, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x49, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x41, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x41, 0x01, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x40, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x57, 0x01, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x5a, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5a, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x86, 0x01, 0x00, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0xb0, 
0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x62, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x5c, 0x01, 0x00, 0x00, 0x5b, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x62, 0x01, 0x00, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0x5c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x6d, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6e, 0x01, 0x00, 0x00, + 0x6d, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe2, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe3, 0x01, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0xe2, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, + 0x6e, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe8, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xea, 0x01, 0x00, 0x00, + 0xe3, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0xe8, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, 0xea, 0x01, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x74, 0x01, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x76, 0x01, 0x00, 0x00, 0x74, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x77, 0x01, 0x00, 0x00, + 0x76, 0x01, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x7b, 0x01, 0x00, 0x00, 0x77, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x7c, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x7b, 0x01, 0x00, 0x00, + 0x88, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x7e, 0x01, 0x00, 0x00, + 0x7c, 0x01, 0x00, 0x00, 0x57, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x7f, 0x01, 0x00, 0x00, 0x82, 0x01, 0x00, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x63, 0x00, 0x04, 0x00, 0x82, 0x01, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, + 0x7e, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x86, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x5a, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x5c, 0x01, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_softmaxImage_AXIS_N_comp_len = 5016; + +const unsigned char glsl_softmaxImage_AXIS_H_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x12, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, + 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x7d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 
0x00, 0x00, + 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x43, 0x34, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x43, 0x4c, 0x65, 0x66, + 0x74, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, + 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, + 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x81, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x87, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x20, 0x00, 
0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x07, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x3f, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0xff, 0xff, 0x7f, 0xff, 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x79, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x09, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x03, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x20, 0x00, 
0x04, 0x00, + 0xd8, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0xd8, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x7f, 0x01, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x80, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x7f, 0x01, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x80, 0x01, 0x00, 0x00, 0x81, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x76, 0x00, 0x00, 0x00, 0x87, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xd4, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, + 0xfb, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x9b, 0x00, 
0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x98, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x9d, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x9e, 0x01, 0x00, 0x00, + 0xfb, 0x01, 0x00, 0x00, 0x9d, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xa2, 0x01, 0x00, 0x00, + 0x9e, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0xa3, 0x01, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, 0xa2, 0x01, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0xbc, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, 0x00, + 0xa3, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xc2, 0x00, 0x00, 0x00, + 0xc3, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xc3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xc8, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0xc8, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xca, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xa8, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa9, 0x01, 0x00, 0x00, 0xa8, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xef, 0x01, 0x00, 0x00, 0xa9, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xac, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xad, 0x01, 0x00, 0x00, + 0xac, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf1, 0x01, 
0x00, 0x00, + 0xad, 0x01, 0x00, 0x00, 0xef, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xb1, 0x01, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf3, 0x01, 0x00, 0x00, 0xb1, 0x01, 0x00, 0x00, + 0xf1, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xb5, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0xf3, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0xb5, 0x01, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xd4, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfc, 0x01, 0x00, 0x00, + 0x11, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0xdc, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xe7, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xe7, 0x00, 0x00, 0x00, + 0xe2, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe2, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xea, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0xec, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xea, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0xec, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xf3, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0xd9, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, 0xe0, 0x00, 
0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xff, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xff, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xff, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, + 0x02, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x07, 0x01, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x07, 0x01, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xbe, 0x01, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0xbd, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc3, 0x01, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x01, 0x00, 0x00, + 0xbe, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0xc3, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xc8, 0x01, 0x00, 0x00, 0xc5, 0x01, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x19, 0x01, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x19, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, + 0x1b, 0x01, 0x00, 0x00, 0xc8, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, + 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x25, 0x01, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x26, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x25, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x28, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x26, 0x01, 0x00, 0x00, + 0x27, 0x01, 0x00, 0x00, 0x28, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x27, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2a, 0x01, 
0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x01, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x28, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x28, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2c, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x2b, 0x01, 0x00, 0x00, 0x27, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x2e, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x2c, 0x01, 0x00, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x2e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x32, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x33, 0x01, 0x00, 0x00, 0x32, 0x01, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xcd, 0x01, 0x00, 0x00, 0x33, 0x01, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xce, 0x01, 0x00, 0x00, 0xcd, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xf5, 0x01, 0x00, 0x00, 0xce, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xd1, 0x01, 0x00, 0x00, 0x33, 0x01, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xd2, 0x01, 0x00, 0x00, + 0xd1, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf7, 0x01, 0x00, 0x00, + 0xd2, 0x01, 0x00, 0x00, 0xf5, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xd5, 0x01, 0x00, 0x00, + 0x33, 0x01, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, 0xd5, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf9, 0x01, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, + 0xf7, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xda, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, + 0x28, 0x01, 0x00, 0x00, 0xda, 0x01, 0x00, 0x00, 0x2d, 0x01, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, + 0xff, 0x01, 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x02, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x02, 0x01, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, + 0xfe, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xff, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xff, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x00, 0x00, 0x55, 0x01, 0x00, 0x00, 0x41, 0x01, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x44, 0x01, 
0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x40, 0x01, 0x00, 0x00, 0x41, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x44, 0x01, 0x00, 0x00, 0x3f, 0x01, 0x00, 0x00, + 0x40, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x3f, 0x01, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x47, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x49, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x47, 0x01, 0x00, 0x00, 0x48, 0x01, 0x00, 0x00, 0x49, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x48, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x4d, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0x4d, 0x01, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x01, 0x00, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x51, 0x01, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x52, 0x01, 0x00, 0x00, 0x51, 0x01, 0x00, 0x00, + 0x4f, 0x01, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0x52, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x49, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x49, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x41, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x41, 0x01, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x40, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x57, 0x01, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x5a, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5a, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x86, 0x01, 0x00, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x62, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x5c, 0x01, 0x00, 0x00, 0x5b, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x62, 0x01, 0x00, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0x5c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe2, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe3, 0x01, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0xe2, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe8, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xea, 0x01, 0x00, 0x00, + 0xe3, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0xe8, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, 0xea, 0x01, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x74, 0x01, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x76, 0x01, 0x00, 0x00, 0x74, 0x01, 
0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x77, 0x01, 0x00, 0x00, + 0x76, 0x01, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x7b, 0x01, 0x00, 0x00, 0x77, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x7c, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x7b, 0x01, 0x00, 0x00, + 0x88, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x7e, 0x01, 0x00, 0x00, + 0x7c, 0x01, 0x00, 0x00, 0x57, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x7f, 0x01, 0x00, 0x00, 0x82, 0x01, 0x00, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x63, 0x00, 0x04, 0x00, 0x82, 0x01, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, + 0x7e, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x86, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x5a, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x5c, 0x01, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_softmaxImage_AXIS_H_comp_len = 4908; + +const unsigned char glsl_softmaxImage_AXIS_W_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x12, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, + 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x7d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x43, 0x34, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x43, 0x4c, 0x65, 0x66, + 0x74, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, + 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, + 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x75, 0x4f, 0x75, 
0x74, 0x70, 0x75, 0x74, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x81, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x87, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x07, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x80, 
0x3f, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0xff, 0xff, 0x7f, 0xff, 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x79, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x09, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x03, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0xd8, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0xd8, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, 0x7f, 0x01, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x80, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x7f, 0x01, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x80, 0x01, 0x00, 0x00, 0x81, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x76, 0x00, 0x00, 0x00, 0x87, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 
0x00, 0x06, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x7e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xd4, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, + 0xfb, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, + 0x98, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x9d, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x9e, 0x01, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x9d, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xa2, 0x01, 0x00, 0x00, + 0x9e, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0xa3, 0x01, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, 0xa2, 0x01, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0xbc, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, 0x00, + 0xa3, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0xfa, 0x00, 0x04, 0x00, 0xc2, 0x00, 0x00, 0x00, + 0xc3, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xc3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xc8, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0xc8, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xca, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xa8, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa9, 0x01, 0x00, 0x00, 0xa8, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xef, 0x01, 0x00, 0x00, 0xa9, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xac, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xad, 0x01, 0x00, 0x00, + 0xac, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf1, 0x01, 0x00, 0x00, + 0xad, 0x01, 0x00, 0x00, 0xef, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xb1, 0x01, 0x00, 0x00, 0xb0, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf3, 0x01, 0x00, 0x00, 0xb1, 0x01, 0x00, 0x00, + 0xf1, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xb5, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0xf3, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0xb5, 0x01, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xd4, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfc, 0x01, 0x00, 0x00, + 0x11, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0xfb, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0xdc, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 
0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xe7, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xe7, 0x00, 0x00, 0x00, + 0xe2, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe2, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xea, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0xec, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xea, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0xec, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xf3, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0xd9, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe4, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xff, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xff, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xff, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, + 0x02, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x07, 0x01, 0x00, 0x00, 0xfe, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x01, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x07, 0x01, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 
0x00, 0x06, 0x00, 0x00, 0x00, 0xbe, 0x01, 0x00, 0x00, + 0xfe, 0x01, 0x00, 0x00, 0xbd, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc3, 0x01, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x01, 0x00, 0x00, + 0xbe, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0xc3, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xc8, 0x01, 0x00, 0x00, 0xc5, 0x01, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x19, 0x01, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x1b, 0x01, 0x00, 0x00, 0x19, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, + 0x1b, 0x01, 0x00, 0x00, 0xc8, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x20, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x20, 0x01, 0x00, 0x00, + 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x25, 0x01, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x26, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x25, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x28, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x26, 0x01, 0x00, 0x00, + 0x27, 0x01, 0x00, 0x00, 0x28, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x27, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x01, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x28, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x28, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2c, 0x01, 0x00, 0x00, 0x26, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, + 0x2b, 0x01, 0x00, 0x00, 0x27, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x2e, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x2c, 0x01, 0x00, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x2e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2d, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x32, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x33, 0x01, 0x00, 0x00, 0x32, 0x01, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xcd, 0x01, 0x00, 0x00, 0x33, 0x01, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xce, 0x01, 0x00, 0x00, 0xcd, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xf5, 0x01, 0x00, 0x00, 0xce, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xd1, 0x01, 0x00, 0x00, 0x33, 0x01, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xd2, 0x01, 0x00, 0x00, + 0xd1, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 
0x00, 0x17, 0x00, 0x00, 0x00, 0xf7, 0x01, 0x00, 0x00, + 0xd2, 0x01, 0x00, 0x00, 0xf5, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xd5, 0x01, 0x00, 0x00, + 0x33, 0x01, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, 0xd5, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xf9, 0x01, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, + 0xf7, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, + 0x17, 0x00, 0x00, 0x00, 0xda, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x21, 0x01, 0x00, 0x00, + 0x28, 0x01, 0x00, 0x00, 0xda, 0x01, 0x00, 0x00, 0x2d, 0x01, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, + 0xff, 0x01, 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x02, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x02, 0x01, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, + 0xfe, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xff, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xff, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x01, 0x01, 0x00, 0x00, 0x55, 0x01, 0x00, 0x00, 0x41, 0x01, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x44, 0x01, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x40, 0x01, 0x00, 0x00, 0x41, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x44, 0x01, 0x00, 0x00, 0x3f, 0x01, 0x00, 0x00, + 0x40, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x3f, 0x01, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x47, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x49, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x47, 0x01, 0x00, 0x00, 0x48, 0x01, 0x00, 0x00, 0x49, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x48, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x4d, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x00, 0x02, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0x4d, 0x01, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x4f, 0x01, 0x00, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x51, 0x01, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x52, 0x01, 0x00, 0x00, 0x51, 0x01, 0x00, 0x00, + 0x4f, 0x01, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0x52, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x49, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x49, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x41, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x41, 0x01, 0x00, 
0x00, 0xc2, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x40, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x57, 0x01, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x5a, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5a, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x40, 0x01, 0x00, 0x00, 0x86, 0x01, 0x00, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x62, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x5c, 0x01, 0x00, 0x00, 0x5b, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x62, 0x01, 0x00, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0x5c, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x5b, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe2, 0x01, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe3, 0x01, 0x00, 0x00, + 0x01, 0x02, 0x00, 0x00, 0xe2, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe8, 0x01, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xea, 0x01, 0x00, 0x00, + 0xe3, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0xe8, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, 0xea, 0x01, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x74, 0x01, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x76, 0x01, 0x00, 0x00, 0x74, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x77, 0x01, 0x00, 0x00, + 0x76, 0x01, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x7b, 0x01, 0x00, 0x00, 0x77, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x7c, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x7b, 0x01, 0x00, 0x00, + 0x88, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x7e, 0x01, 0x00, 0x00, + 0x7c, 0x01, 0x00, 0x00, 0x57, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x7f, 0x01, 0x00, 0x00, 0x82, 0x01, 0x00, 0x00, 0x81, 0x01, 0x00, 0x00, + 0x63, 0x00, 0x04, 0x00, 0x82, 0x01, 0x00, 0x00, 0xed, 0x01, 0x00, 0x00, + 0x7e, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x86, 0x01, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x5a, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x5c, 0x01, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_softmaxImage_AXIS_W_comp_len = 4908; + +const unsigned char glsl_softmaxImage_AXIS_C_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x37, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, + 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x01, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x7d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x43, 0x34, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x43, 0x4c, 0x65, 0x66, + 0x74, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x4c, 0x6f, 0x63, 0x61, 0x6c, + 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, + 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x9b, 0x01, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x24, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xb8, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9b, 0x01, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9b, 0x01, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, + 0x9b, 0x01, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 
0x47, 0x00, 0x04, 0x00, + 0xa1, 0x01, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x07, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x25, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x80, 0x3f, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0xff, 0xff, 0x7f, 0xff, 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x79, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x09, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1b, 0x00, 0x03, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 
0xb7, 0x00, 0x00, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0xd8, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd7, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0xd8, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0xdc, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x6a, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x19, 0x00, 0x09, 0x00, + 0x99, 0x01, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x9a, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x99, 0x01, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x9a, 0x01, 0x00, 0x00, 0x9b, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xa1, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x79, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x79, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x86, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x86, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x21, 0x02, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0xd4, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 
0xd6, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xa0, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xa0, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xa2, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb2, 0x01, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb3, 0x01, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0xb2, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb7, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0xb7, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0xba, 0x01, 0x00, 0x00, + 0xb3, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xbc, 0x01, 0x00, 0x00, 0xb8, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xbd, 0x01, 0x00, 0x00, 0xba, 0x01, 0x00, 0x00, + 0xbc, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0xb9, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0xbc, 0x00, 0x00, 0x00, 0xbd, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0xaa, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, + 0x20, 0x02, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0xc4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0xc2, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xc3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc7, 0x00, 0x00, 0x00, 0xc6, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0xc7, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0xcb, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0xc9, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcb, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xca, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xc3, 0x01, 0x00, 0x00, 
0xc2, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x09, 0x02, 0x00, 0x00, 0xc3, 0x01, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xc6, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xc7, 0x01, 0x00, 0x00, 0xc6, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x0b, 0x02, 0x00, 0x00, 0xc7, 0x01, 0x00, 0x00, 0x09, 0x02, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xca, 0x01, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xcb, 0x01, 0x00, 0x00, + 0xca, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x0d, 0x02, 0x00, 0x00, + 0xcb, 0x01, 0x00, 0x00, 0x0b, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x08, 0x00, 0x17, 0x00, 0x00, 0x00, 0xcf, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x0d, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xcb, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xcb, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x36, 0x02, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xcf, 0x01, 0x00, 0x00, + 0xca, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xd4, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x21, 0x02, 0x00, 0x00, 0x36, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x20, 0x02, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0xdc, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0xd9, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0x21, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xe1, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x22, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x00, 0x00, 0xe4, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xe7, 0x00, 0x00, 0x00, 0x22, 0x02, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0xe3, 0x00, 0x00, 0x00, + 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0xe7, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xea, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x22, 0x02, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0xec, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0xea, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xf0, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x22, 0x02, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 
0xdc, 0x00, 0x00, 0x00, + 0xf4, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, + 0xf4, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0xf0, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0xec, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0xec, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xe4, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xe4, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, 0x22, 0x02, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0xe1, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0xe3, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0xdc, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0xfb, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x16, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0xfe, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x03, 0x01, 0x00, 0x00, 0xfb, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, 0x05, 0x01, 0x00, 0x00, + 0xfb, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x06, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x03, 0x01, 0x00, 0x00, 0x05, 0x01, 0x00, 0x00, + 0x0c, 0x00, 0x07, 0x00, 0x16, 0x00, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, + 0x06, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x0d, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x0d, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x24, 0x02, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0x47, 0x01, 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x23, 0x02, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0x49, 0x01, 0x00, 0x00, + 0x10, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x15, 0x01, 0x00, 0x00, 0x23, 0x02, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x0f, 0x01, 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x15, 0x01, 0x00, 0x00, + 0x0e, 0x01, 0x00, 0x00, 0x0f, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x0e, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd7, 0x01, 0x00, 0x00, 0x23, 0x02, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd8, 0x01, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0xd7, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xdc, 0x01, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x01, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0xdc, 0x01, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 
0xdf, 0x01, 0x00, 0x00, + 0xd8, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0xe1, 0x01, 0x00, 0x00, 0xdd, 0x01, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0xe2, 0x01, 0x00, 0x00, 0xdf, 0x01, 0x00, 0x00, + 0xe1, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x27, 0x01, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x29, 0x01, 0x00, 0x00, 0x27, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, + 0x29, 0x01, 0x00, 0x00, 0xe2, 0x01, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x2f, 0x01, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, + 0x07, 0x01, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x30, 0x01, 0x00, 0x00, 0x2a, 0x01, 0x00, 0x00, + 0x2f, 0x01, 0x00, 0x00, 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x31, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x30, 0x01, 0x00, 0x00, 0x82, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x35, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0xaa, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x36, 0x01, 0x00, 0x00, + 0x23, 0x02, 0x00, 0x00, 0x35, 0x01, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x38, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x36, 0x01, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, 0x38, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x37, 0x01, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x3a, 0x01, 0x00, 0x00, 0x39, 0x01, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x3b, 0x01, 0x00, 0x00, 0x3a, 0x01, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x38, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x38, 0x01, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x3c, 0x01, 0x00, 0x00, 0x36, 0x01, 0x00, 0x00, + 0x0e, 0x01, 0x00, 0x00, 0x3b, 0x01, 0x00, 0x00, 0x37, 0x01, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x3e, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x3c, 0x01, 0x00, 0x00, 0x3d, 0x01, 0x00, 0x00, + 0x3e, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x3d, 0x01, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x42, 0x01, 0x00, 0x00, + 0x26, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x43, 0x01, 0x00, 0x00, 0x42, 0x01, 0x00, 0x00, + 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, + 0x43, 0x01, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, + 0x16, 0x00, 0x00, 0x00, 0xe8, 0x01, 0x00, 0x00, 0xe7, 0x01, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x16, 0x02, 0x00, 0x00, 0xe8, 0x01, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0xeb, 0x01, 0x00, 0x00, 0x43, 0x01, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xec, 0x01, 0x00, 0x00, 0xeb, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x18, 0x02, 0x00, 0x00, 0xec, 0x01, 0x00, 0x00, 0x16, 0x02, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xae, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xef, 0x01, 0x00, 0x00, 0x43, 0x01, 0x00, 0x00, 
0x4e, 0x00, 0x00, 0x00, + 0xa9, 0x00, 0x06, 0x00, 0x16, 0x00, 0x00, 0x00, 0xf0, 0x01, 0x00, 0x00, + 0xef, 0x01, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, 0x1a, 0x02, 0x00, 0x00, + 0xf0, 0x01, 0x00, 0x00, 0x18, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x08, 0x00, 0x17, 0x00, 0x00, 0x00, 0xf4, 0x01, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x31, 0x01, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x1a, 0x02, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x3e, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x3e, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x2d, 0x02, 0x00, 0x00, + 0x31, 0x01, 0x00, 0x00, 0x38, 0x01, 0x00, 0x00, 0xf4, 0x01, 0x00, 0x00, + 0x3d, 0x01, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x47, 0x01, 0x00, 0x00, 0x24, 0x02, 0x00, 0x00, 0x2d, 0x02, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x10, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x10, 0x01, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x01, 0x00, 0x00, 0x23, 0x02, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x0d, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x0f, 0x01, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0x24, 0x02, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x4e, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4e, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x25, 0x02, 0x00, 0x00, + 0xe0, 0x00, 0x00, 0x00, 0x0f, 0x01, 0x00, 0x00, 0x65, 0x01, 0x00, 0x00, + 0x51, 0x01, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x01, 0x00, 0x00, 0x25, 0x02, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x50, 0x01, 0x00, 0x00, 0x51, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x54, 0x01, 0x00, 0x00, + 0x4f, 0x01, 0x00, 0x00, 0x50, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x57, 0x01, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x25, 0x02, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x59, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x57, 0x01, 0x00, 0x00, 0x58, 0x01, 0x00, 0x00, + 0x59, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x58, 0x01, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x5d, 0x01, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x25, 0x02, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0xdc, 0x00, 0x00, 0x00, 0x5e, 0x01, 0x00, 0x00, 0xd9, 0x00, 0x00, 0x00, + 0x5d, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x5f, 0x01, 0x00, 0x00, 0x5e, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x61, 0x01, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x62, 0x01, 0x00, 0x00, + 0x61, 0x01, 0x00, 0x00, 0x5f, 0x01, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0x62, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x59, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x59, 0x01, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, + 0xde, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x51, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x51, 0x01, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x65, 0x01, 0x00, 0x00, 0x25, 0x02, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x01, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x50, 0x01, 0x00, 0x00, 
0x3d, 0x00, 0x04, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x67, 0x01, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x05, 0x00, 0x16, 0x00, 0x00, 0x00, 0x6b, 0x01, 0x00, 0x00, + 0x67, 0x01, 0x00, 0x00, 0x6a, 0x01, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x70, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x70, 0x01, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x26, 0x02, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x50, 0x01, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, + 0x71, 0x01, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x78, 0x01, 0x00, 0x00, 0x26, 0x02, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x72, 0x01, 0x00, 0x00, 0x71, 0x01, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x78, 0x01, 0x00, 0x00, + 0x71, 0x01, 0x00, 0x00, 0x72, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x71, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xfc, 0x01, 0x00, 0x00, 0x26, 0x02, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xfd, 0x01, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0xfc, 0x01, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x02, 0x02, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x04, 0x02, 0x00, 0x00, + 0xfd, 0x01, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x06, 0x02, 0x00, 0x00, 0x02, 0x02, 0x00, 0x00, 0x50, 0x00, 0x05, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00, 0x04, 0x02, 0x00, 0x00, + 0x06, 0x02, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x8a, 0x01, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x64, 0x00, 0x04, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x8c, 0x01, 0x00, 0x00, 0x8a, 0x01, 0x00, 0x00, + 0x5f, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x8d, 0x01, 0x00, 0x00, + 0x8c, 0x01, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xbb, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x92, 0x01, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, + 0x07, 0x01, 0x00, 0x00, 0x07, 0x01, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x93, 0x01, 0x00, 0x00, 0x8d, 0x01, 0x00, 0x00, + 0x92, 0x01, 0x00, 0x00, 0x0c, 0x00, 0x06, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x94, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x93, 0x01, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, + 0x97, 0x01, 0x00, 0x00, 0x6b, 0x01, 0x00, 0x00, 0x6b, 0x01, 0x00, 0x00, + 0x6b, 0x01, 0x00, 0x00, 0x6b, 0x01, 0x00, 0x00, 0x88, 0x00, 0x05, 0x00, + 0x17, 0x00, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, 0x94, 0x01, 0x00, 0x00, + 0x97, 0x01, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x99, 0x01, 0x00, 0x00, + 0x9c, 0x01, 0x00, 0x00, 0x9b, 0x01, 0x00, 0x00, 0x63, 0x00, 0x04, 0x00, + 0x9c, 0x01, 0x00, 0x00, 0x07, 0x02, 0x00, 0x00, 0x98, 0x01, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xa0, 0x01, 0x00, 0x00, + 0x26, 0x02, 0x00, 0x00, 0x2b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x70, 0x01, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x72, 0x01, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_softmaxImage_AXIS_C_comp_len = 5204; + const unsigned char glsl_convolutionDepthwiseMali_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, 0xec, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, @@ -11986,228 +13738,6 @@ const 
unsigned char glsl_imageTonchw_comp[] = { }; unsigned int glsl_imageTonchw_comp_len = 2988; -const unsigned char glsl_softmaxHeight_NHWC_comp[] = { - 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0xc5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, - 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x75, 0x43, 0x6f, 0x6e, - 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, - 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, - 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x9f, 0x00, 0x00, 0x00, - 0x64, 0x65, 0x73, 0x74, 0x62, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, - 0x06, 0x00, 0x05, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, - 0xa1, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x15, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x15, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x37, 0x00, 0x00, 0x00, - 
0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x39, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x9f, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x03, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0xa1, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0xaf, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x05, 0x00, - 0x15, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x33, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x33, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x36, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x38, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x38, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x33, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, - 0x9e, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, - 0x9f, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0xa0, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, - 0x3b, 0x00, 0x04, 0x00, 0xa0, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0xae, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, 0xae, 0x00, 0x00, 0x00, - 
0xae, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x26, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x27, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x19, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x00, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x3f, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, - 0x3f, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x43, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x33, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x47, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x47, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x33, 0x00, 0x00, 0x00, - 0xc2, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, - 0x61, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x06, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x27, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, - 
0xbd, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, - 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x49, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, 0x00, - 0xbd, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0xbc, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x5c, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, - 0x39, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, - 0x5f, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x33, 0x00, 0x00, 0x00, - 0x61, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0xc2, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x47, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x49, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x67, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x33, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, - 0x68, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, - 0xbe, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, - 0x86, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x10, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0xbe, 0x00, 0x00, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x69, 0x00, 0x00, 0x00, - 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x6e, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x68, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbe, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x79, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, - 0x79, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x43, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x39, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x33, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, - 0x83, 0x00, 0x05, 0x00, 0x33, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, - 0x7f, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x06, 0x00, - 0x33, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, - 0x33, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, - 0x82, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x86, 0x00, 0x00, 0x00, 0xbe, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x67, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x69, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x88, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x88, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x06, 0x00, 0x00, 0x00, 0xbf, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 
0x69, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x10, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, - 0xbf, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, - 0x8a, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x89, 0x00, 0x00, 0x00, - 0x8a, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x89, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, - 0xbf, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0xba, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, - 0x39, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x33, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, - 0xa4, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, 0x33, 0x00, 0x00, 0x00, - 0xa7, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x06, 0x00, 0x33, 0x00, 0x00, 0x00, 0xa8, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0xa7, 0x00, 0x00, 0x00, - 0x88, 0x00, 0x05, 0x00, 0x33, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, - 0xa8, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x43, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, - 0xab, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0xbf, 0x00, 0x00, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x88, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x8a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x28, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 -}; -unsigned int glsl_softmaxHeight_NHWC_comp_len = 2612; - const unsigned char glsl_resizeNearest_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, @@ -12621,1009 +14151,1195 @@ unsigned int glsl_resizeNearest_NEAREST_ROUND_comp_len = 2444; const unsigned char glsl_reduce_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0x7b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xaf, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 
0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, - 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, - 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, - 0x72, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, - 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x08, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x57, 0x6f, 0x72, 0x6b, 0x47, + 0x72, 0x6f, 0x75, 0x70, 0x49, 0x44, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x56, 0x61, + 0x6c, 0x75, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, - 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x55, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x71, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, + 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x74, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 
0x00, 0x54, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x71, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x72, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x72, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x74, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, - 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, - 0x79, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x03, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x29, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x04, 0x00, 0x6a, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 
0x00, 0x24, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x5a, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x78, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x78, 0x00, 0x00, 0x00, - 0x5b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, - 0x38, 0x00, 0x01, 0x00 + 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 
0x3d, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x4a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x71, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x72, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x75, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x18, 0x00, 0x00, 0x00, 0xae, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x53, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x77, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0x77, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x80, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x80, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x78, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x78, 0x00, 0x00, 0x00, + 0xc2, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0xae, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x75, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x77, 0x00, 0x00, 0x00, + 0xaa, 0x00, 0x05, 0x00, 0x53, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x91, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x92, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x6f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x93, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x93, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_reduce_comp_len = 1720; +unsigned int glsl_reduce_comp_len = 1352; const unsigned char glsl_reduce_VMAX_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 
0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, - 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, - 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, - 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, - 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, - 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, - 0x7c, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 
0x16, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x5a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, - 0xf6, 0x00, 0x04, 0x00, 0x5f, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 
0x6c, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, - 0x6e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x5f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x7b, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x7b, 0x00, 0x00, 0x00, - 0x8b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x08, 0x00, 0x21, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x57, 0x6f, 0x72, 0x6b, 0x47, + 0x72, 0x6f, 0x75, 0x70, 0x49, 0x44, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x70, 0x00, 0x00, 0x00, 0x73, 0x68, 0x61, 0x72, + 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0xff, 0xff, 0x7f, 0xff, + 0x15, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 
0x47, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x69, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x79, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7a, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 
0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb3, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x84, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x97, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x97, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_reduce_VMAX_comp_len = 1996; +unsigned int glsl_reduce_VMAX_comp_len = 2524; const unsigned char glsl_reduce_VMIN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 
0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, - 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, - 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, - 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, - 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, - 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, - 0x7c, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 
0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x5a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, - 0xf6, 0x00, 0x04, 0x00, 0x5f, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, - 0x6e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, - 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x5f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x7b, 0x00, 0x00, 0x00, 0x77, 0x00, 
0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x7b, 0x00, 0x00, 0x00, - 0x8b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x08, 0x00, 0x21, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x57, 0x6f, 0x72, 0x6b, 0x47, + 0x72, 0x6f, 0x75, 0x70, 0x49, 0x44, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x70, 0x00, 0x00, 0x00, 0x73, 0x68, 0x61, 0x72, + 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x5a, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0xff, 0xff, 0x7f, 0x7f, + 0x15, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 
0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 
0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x69, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x79, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7a, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x83, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb3, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x84, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x93, 0x00, 
0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x97, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x97, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_reduce_VMIN_comp_len = 1996; +unsigned int glsl_reduce_VMIN_comp_len = 2524; const unsigned char glsl_reduce_MEAN_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0x91, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xbc, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, - 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, - 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, - 0x7a, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, - 0x65, 0x72, 0x00, 0x00, 
0x06, 0x00, 0x05, 0x00, 0x7a, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, - 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x79, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x7a, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x7a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x7c, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x7c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x82, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 
0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x79, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x7a, 0x00, 0x00, 0x00, - 0x79, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x7b, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x7b, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, - 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x82, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x7c, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, - 0x87, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, - 0x2e, 0x00, 0x00, 0x00, 
0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4b, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, - 0x4b, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, - 0x5a, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x5b, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x5d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x17, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, - 0x5b, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x8f, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x73, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, 0x5f, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x6c, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x6d, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x71, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x8f, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x5d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x05, 0x00, 0x59, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x85, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, - 0x90, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, - 0x80, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 + 0x21, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 
0x80, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x08, 0x00, 0x21, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x57, 0x6f, 0x72, 0x6b, 0x47, + 0x72, 0x6f, 0x75, 0x70, 0x49, 0x44, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x70, 0x00, 0x00, 0x00, 0x73, 0x68, 0x61, 0x72, + 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 
0x5c, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa6, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 
0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x06, 0x00, 0x1f, 0x00, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x29, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, + 0x33, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x35, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x35, 0x00, 0x00, 0x00, 0x86, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x35, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 
0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x31, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, + 0xba, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, + 0xba, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x79, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x07, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0xac, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x7f, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0xf6, 0x00, 0x04, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x7a, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x82, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x83, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0x8c, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x74, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x84, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x7b, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 
0x95, 0x00, 0x00, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x67, 0x00, 0x00, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa4, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x97, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_reduce_MEAN_comp_len = 2060; +unsigned int glsl_reduce_MEAN_comp_len = 2580; const unsigned char glsl_reduce_PROD_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, - 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, - 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, - 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x75, 0x00, 
0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, - 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, - 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, - 0x7c, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 
0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x5a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, - 0xf6, 0x00, 0x04, 0x00, 0x5f, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, - 0x85, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x8b, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, - 0x7b, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 + 0x21, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x08, 0x00, 0x21, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x27, 0x00, 
0x00, 0x00, 0x67, 0x6c, 0x5f, 0x57, 0x6f, 0x72, 0x6b, 0x47, + 0x72, 0x6f, 0x75, 0x70, 0x49, 0x44, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x70, 0x00, 0x00, 0x00, 0x73, 0x68, 0x61, 0x72, + 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 
0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, + 0x15, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x20, 0x00, 
0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x69, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, + 0x06, 0x00, 
0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x69, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6d, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x7a, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x83, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, + 0xb3, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x95, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0xf9, 0x00, 
0x02, 0x00, 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x97, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_reduce_PROD_comp_len = 1988; +unsigned int glsl_reduce_PROD_comp_len = 2508; const unsigned char glsl_reduce_SUM_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, - 0x8c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0xb8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x07, 0x00, 0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, - 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, - 0x0d, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, - 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x42, 0x75, - 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, - 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, - 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, - 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, 0x0d, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x48, 0x00, 0x05, 0x00, 0x18, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x23, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x54, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x74, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x47, 0x00, 0x04, 0x00, 0x7d, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x17, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, - 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x16, 0x00, 0x03, 0x00, 0x17, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x06, 0x00, 0x18, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x18, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x19, 0x00, 0x00, 0x00, - 0x1a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x26, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x3a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x55, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x56, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1d, 0x00, 0x03, 0x00, 
0x74, 0x00, 0x00, 0x00, 0x17, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x03, 0x00, 0x75, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, - 0x75, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x76, 0x00, 0x00, 0x00, - 0x77, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, - 0x0a, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, - 0x2c, 0x00, 0x06, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x7d, 0x00, 0x00, 0x00, - 0x7c, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, 0x26, 0x00, 0x00, 0x00, - 0x36, 0x00, 0x05, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0b, 0x00, 0x00, 0x00, - 0x0e, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, - 0x07, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, - 0x1e, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x25, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, - 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, - 0x2d, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x32, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, - 0x34, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x2f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x2e, 0x00, 0x00, 0x00, - 0xf7, 0x00, 0x03, 0x00, 0x38, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0xfa, 0x00, 0x04, 0x00, 0x36, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x37, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x1f, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, - 0x4d, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, - 0x41, 0x00, 0x06, 0x00, 0x59, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, - 0x57, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, - 0x3d, 0x00, 0x04, 0x00, 0x17, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x5a, 0x00, 0x00, 0x00, 
0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, - 0x37, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0x37, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x65, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, - 0xf6, 0x00, 0x04, 0x00, 0x5f, 0x00, 0x00, 0x00, 0x5e, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x65, 0x00, 0x00, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x5f, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, - 0x5e, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, - 0x6b, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, - 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, - 0x51, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, - 0x17, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, - 0x81, 0x00, 0x05, 0x00, 0x17, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, - 0x8b, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x73, 0x00, 0x00, 0x00, 0x8a, 0x00, 0x00, 0x00, - 0x3a, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x5d, 0x00, 0x00, 0x00, - 0xf8, 0x00, 0x02, 0x00, 0x5f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, - 0x59, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, - 0x1b, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, - 0x7b, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, - 0x38, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x38, 0x00, 0x00, 0x00, - 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 + 0x21, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, + 0x02, 0x00, 0x00, 0x00, 0xc2, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x08, 0x00, 0x21, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x06, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x57, 0x6f, 0x72, 0x6b, 0x47, + 0x72, 0x6f, 0x75, 0x70, 0x49, 0x44, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, + 0x66, 0x65, 0x72, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x77, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x06, 0x00, 0x04, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x30, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x70, 0x00, 0x00, 0x00, 0x73, 0x68, 0x61, 0x72, + 0x65, 0x64, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x27, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x2e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1f, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 
0x21, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x20, 0x00, 0x00, 0x00, 0x27, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x06, 0x00, 0x2e, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x2f, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x32, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x5b, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x67, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x04, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x6f, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x1f, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x23, 0x00, 0x00, 0x00, 0x29, 0x00, 0x00, 0x00, + 0x27, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 
0x29, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x32, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x2d, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x04, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x34, 0x00, 0x00, 0x00, 0x89, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x36, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x86, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x2a, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x32, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x42, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x2d, 0x00, 0x00, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x36, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x6d, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x58, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4f, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x62, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0x35, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x62, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x68, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x69, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xac, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0x69, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x6d, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x4e, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x04, 0x00, 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x76, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x07, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 
0x7f, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, 0xf6, 0x00, 0x04, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x7a, 0x00, 0x00, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x7a, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, 0x82, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, + 0x82, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x83, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, + 0xb7, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x81, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, 0x74, 0x00, 0x00, 0x00, + 0xb3, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, + 0xf8, 0x00, 0x02, 0x00, 0x84, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x04, 0x00, + 0x75, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x76, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x7c, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x05, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x79, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x7b, 0x00, 0x00, 0x00, 0xaa, 0x00, 0x05, 0x00, 0x57, 0x00, 0x00, 0x00, + 0x95, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0xf7, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xfa, 0x00, 0x04, 0x00, 0x95, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x67, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0x9e, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x03, 0x00, 0xa1, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, + 0xf9, 0x00, 0x02, 0x00, 0x97, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x97, 0x00, 0x00, 0x00, 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 }; -unsigned int glsl_reduce_SUM_comp_len = 1988; +unsigned int glsl_reduce_SUM_comp_len = 2508; const unsigned char glsl_resizeBilinear_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, diff --git a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp index 915ca987b..7fbe41499 100644 --- a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp +++ b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp @@ -5,6 +5,11 @@ namespace MNN { void VulkanShaderMap::init() { mMaps.insert(std::make_pair("glsl_dwweightcopy_comp", std::make_pair(glsl_dwweightcopy_comp,glsl_dwweightcopy_comp_len))); mMaps.insert(std::make_pair("glsl_deconvCol2Im_comp", 
std::make_pair(glsl_deconvCol2Im_comp,glsl_deconvCol2Im_comp_len))); +mMaps.insert(std::make_pair("glsl_softmaxImage_comp", std::make_pair(glsl_softmaxImage_comp,glsl_softmaxImage_comp_len))); +mMaps.insert(std::make_pair("glsl_softmaxImage_AXIS_N_comp", std::make_pair(glsl_softmaxImage_AXIS_N_comp,glsl_softmaxImage_AXIS_N_comp_len))); +mMaps.insert(std::make_pair("glsl_softmaxImage_AXIS_H_comp", std::make_pair(glsl_softmaxImage_AXIS_H_comp,glsl_softmaxImage_AXIS_H_comp_len))); +mMaps.insert(std::make_pair("glsl_softmaxImage_AXIS_W_comp", std::make_pair(glsl_softmaxImage_AXIS_W_comp,glsl_softmaxImage_AXIS_W_comp_len))); +mMaps.insert(std::make_pair("glsl_softmaxImage_AXIS_C_comp", std::make_pair(glsl_softmaxImage_AXIS_C_comp,glsl_softmaxImage_AXIS_C_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwiseMali_comp", std::make_pair(glsl_convolutionDepthwiseMali_comp,glsl_convolutionDepthwiseMali_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwiseMali_RELU_comp", std::make_pair(glsl_convolutionDepthwiseMali_RELU_comp,glsl_convolutionDepthwiseMali_RELU_comp_len))); mMaps.insert(std::make_pair("glsl_convolutionDepthwiseMali_RELU6_comp", std::make_pair(glsl_convolutionDepthwiseMali_RELU6_comp,glsl_convolutionDepthwiseMali_RELU6_comp_len))); @@ -67,7 +72,6 @@ mMaps.insert(std::make_pair("glsl_blit_comp", std::make_pair(glsl_blit_comp,glsl mMaps.insert(std::make_pair("glsl_blit_image_comp", std::make_pair(glsl_blit_image_comp,glsl_blit_image_comp_len))); mMaps.insert(std::make_pair("glsl_fill_image_comp", std::make_pair(glsl_fill_image_comp,glsl_fill_image_comp_len))); mMaps.insert(std::make_pair("glsl_imageTonchw_comp", std::make_pair(glsl_imageTonchw_comp,glsl_imageTonchw_comp_len))); -mMaps.insert(std::make_pair("glsl_softmaxHeight_NHWC_comp", std::make_pair(glsl_softmaxHeight_NHWC_comp,glsl_softmaxHeight_NHWC_comp_len))); mMaps.insert(std::make_pair("glsl_resizeNearest_comp", std::make_pair(glsl_resizeNearest_comp,glsl_resizeNearest_comp_len))); mMaps.insert(std::make_pair("glsl_resizeNearest_NEAREST_ROUND_comp", std::make_pair(glsl_resizeNearest_NEAREST_ROUND_comp,glsl_resizeNearest_NEAREST_ROUND_comp_len))); mMaps.insert(std::make_pair("glsl_reduce_comp", std::make_pair(glsl_reduce_comp,glsl_reduce_comp_len))); diff --git a/source/backend/vulkan/image/execution/VulkanReduce.cpp b/source/backend/vulkan/image/execution/VulkanReduce.cpp index 1d7256168..96d38d5ec 100644 --- a/source/backend/vulkan/image/execution/VulkanReduce.cpp +++ b/source/backend/vulkan/image/execution/VulkanReduce.cpp @@ -81,7 +81,7 @@ ErrorCode VulkanReduce::onEncode(const std::vector& inputs, const std:: mDescriptorSet->writeBuffer(mConstBuffer->buffer(), 2, mConstBuffer->size()); cmdBuffer->barrierSource(mSource.buffer->buffer(), 0, mSource.buffer->size()); mPipeline->bind(cmdBuffer->get(), mDescriptorSet->get()); - vkCmdDispatch(cmdBuffer->get(), UP_DIV(total, 256), 1, 1); + vkCmdDispatch(cmdBuffer->get(), 1, total, 1); cmdBuffer->barrierSource(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); mOutput.convert->encodeBufferToTensor(mOutput.buffer->buffer(), output, mOutput.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(output), cmdBuffer); { diff --git a/source/backend/vulkan/image/execution/VulkanSoftmax.cpp b/source/backend/vulkan/image/execution/VulkanSoftmax.cpp index 6f5b9f650..7c19595fd 100644 --- a/source/backend/vulkan/image/execution/VulkanSoftmax.cpp +++ b/source/backend/vulkan/image/execution/VulkanSoftmax.cpp @@ -12,25 +12,38 @@ namespace MNN { -struct 
ConstBuffer { - int w; - int h; - int c; +struct SoftmaxConstBuffer { + uint N; + uint H; + uint W; + uint C4; + uint CLeft; }; -VulkanSoftmax::VulkanSoftmax(const Op* op, Backend* bn) : VulkanBasicExecution(bn) { - const auto softmaxParam = op->main_as_Axis(); - mAxis = softmaxParam->axis(); +VulkanSoftmax::VulkanSoftmax(const Op* op, Backend* bn, const uint axisIndex) : VulkanBasicExecution(bn) { + mAxisIndex = axisIndex; auto vkBn = (VulkanBackend*)backend(); - mConstBuffer = std::make_shared(vkBn->getMemoryPool(), false, sizeof(ConstBuffer), nullptr, - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); - std::vector types{VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; - mSoftmaxPipeline = - vkBn->getPipeline("glsl_softmaxHeight_NHWC_comp", types); + std::string shaderName = "glsl_softmaxImage_"; + std::string macro = ""; + std::string suffix = "comp"; + switch (axisIndex) { + case 0: + macro = "AXIS_N_"; break; + case 1: + macro = "AXIS_H_"; break; + case 2: + macro = "AXIS_W_"; break; + case 3: + macro = "AXIS_C_"; break; + } + + std::vector types {VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER}; + + mSoftmaxPipeline = vkBn->getPipeline(shaderName + macro + suffix, types); mDescriptorSet.reset(mSoftmaxPipeline->createSet()); - mSource.convert.reset(new VulkanImageConverter(vkBn)); - mOutput.convert.reset(new VulkanImageConverter(vkBn)); + mSoftmaxConstBuffer = std::make_shared(vkBn->getMemoryPool(), false, sizeof(SoftmaxConstBuffer), nullptr, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT); } VulkanSoftmax::~VulkanSoftmax() { @@ -38,65 +51,44 @@ VulkanSoftmax::~VulkanSoftmax() { ErrorCode VulkanSoftmax::onEncode(const std::vector& inputs, const std::vector& outputs, const VulkanCommandPool::Buffer* cmdBuffer) { + auto vkBn = static_cast(backend()); auto input = inputs[0]; auto output = outputs[0]; + auto inputShapeNHWC = VulkanTensor::tensorShapeFormat(input); + std::vector cpuSoftmaxConstBuffer = {(uint)inputShapeNHWC[0], (uint)inputShapeNHWC[1], (uint)inputShapeNHWC[2], (uint)UP_DIV(inputShapeNHWC[3], 4), (uint)ROUND_UP(inputShapeNHWC[3], 4) - inputShapeNHWC[3]}; - auto inputFormat = TensorUtils::getDescribe(input)->dimensionFormat; - auto axis = mAxis; - if (axis < 0) { - axis = input->dimensions() + axis; - } - auto mVkBackend = (VulkanBackend*)backend(); - int inside = 1; - int outside = 1; - int mid = input->length(axis); - for (int i=0; ilength(i); - } - for (int i=axis+1; idimensions(); ++i) { - inside *= input->length(i); - } - // gpu param - { - auto softmax = reinterpret_cast(mConstBuffer->map()); - ::memset(softmax, 0, sizeof(ConstBuffer)); - softmax->w = inside; - softmax->h = mid; - softmax->c = outside; - mConstBuffer->unmap(); - } - auto vkBn = static_cast(backend()); { - int bufferSize = sizeof(float); - for (int i=0; idimensions(); ++i) { - bufferSize *= input->length(i); - } - mSource.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), - false, bufferSize, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); - } - { - int bufferSize = sizeof(float); - for (int i=0; idimensions(); ++i) { - bufferSize *= output->length(i); - } - mOutput.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSize, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + auto softmaxConst = reinterpret_cast(mSoftmaxConstBuffer->map()); + ::memset(softmaxConst, 0, sizeof(SoftmaxConstBuffer)); + softmaxConst->N = cpuSoftmaxConstBuffer[0]; + softmaxConst->H = 
cpuSoftmaxConstBuffer[1]; + softmaxConst->W = cpuSoftmaxConstBuffer[2]; + softmaxConst->C4 = cpuSoftmaxConstBuffer[3]; + softmaxConst->CLeft = cpuSoftmaxConstBuffer[4]; + mSoftmaxConstBuffer->unmap(); } - // Encode - mSource.convert->encodeTensorToBuffer(input, mSource.buffer->buffer(), mSource.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(input), cmdBuffer); + // N * H * W * C4 + uint numTotal = cpuSoftmaxConstBuffer[0] * cpuSoftmaxConstBuffer[1] * cpuSoftmaxConstBuffer[2] * cpuSoftmaxConstBuffer[3]; + uint numY = numTotal / cpuSoftmaxConstBuffer[mAxisIndex]; + + auto vkOutput = (VulkanTensor*)output->deviceId(); + auto vkInput = (VulkanTensor*)input->deviceId(); + + mDescriptorSet.reset(mSoftmaxPipeline->createSet()); + mDescriptorSet->writeImage(vkOutput->image()->view(), vkBn->getCommonSampler()->get(), + VK_IMAGE_LAYOUT_GENERAL, 0); + mDescriptorSet->writeImage(vkInput->image()->view(), vkBn->getCommonSampler()->get(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, 1); + mDescriptorSet->writeBuffer(mSoftmaxConstBuffer->buffer(), 2, mSoftmaxConstBuffer->size()); + + vkOutput->image()->barrierWrite(cmdBuffer->get()); + vkInput->image()->barrierRead(cmdBuffer->get()); - mDescriptorSet->writeBuffer(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); - mDescriptorSet->writeBuffer(mSource.buffer->buffer(), 1, mSource.buffer->size()); - mDescriptorSet->writeBuffer(mConstBuffer->buffer(), 2, mConstBuffer->size()); - cmdBuffer->barrierSource(mSource.buffer->buffer(), 0, mSource.buffer->size()); mSoftmaxPipeline->bind(cmdBuffer->get(), mDescriptorSet->get()); - vkCmdDispatch(cmdBuffer->get(), UP_DIV(outside, 8), UP_DIV(inside, 8), 1); - cmdBuffer->barrierSource(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); - mOutput.convert->encodeBufferToTensor(mOutput.buffer->buffer(), output, mOutput.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(output), cmdBuffer); - { - mSource.buffer->release(); - mOutput.buffer->release(); - } + + vkCmdDispatch(cmdBuffer->get(), 1, numY, 1); + return NO_ERROR; } @@ -104,7 +96,43 @@ class VulkanSoftmaxCreator : public VulkanBackend::Creator { public: virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* backend) const override { - return new VulkanSoftmax(op, backend); + auto input = inputs[0]; + + uint dimension = input->dimensions(); + if (dimension > 4) { + return nullptr; + } + + // Work out the reduce axis, taking various formats and dimensions into account. 
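+        // The image backend stores tensors in an NHWC-style image with channels packed into C4
+        // blocks, so the tensor's softmax axis is remapped to an index into {N, H, W, C}:
+        // 0 selects AXIS_N, 1 AXIS_H, 2 AXIS_W and 3 AXIS_C in the shader. For example, axis 1
+        // of a 4-D NCHW tensor (the channel axis) maps to index 3 via axisMap below.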
+ MNN_DATA_FORMAT format = VulkanImageConverter::getTensorLinearFormat(input); + int axis = op->main_as_Axis()->axis(); + if (axis < 0) { + axis = input->dimensions() + axis; + } + std::vector axisMap; + + if (dimension == 4) { + if (format == MNN_DATA_FORMAT_NCHW) { + axisMap.assign({0, 3, 1, 2}); + } else { + axisMap.assign({0, 1, 2, 3}); + } + } else if (dimension == 3) { + if (format == MNN_DATA_FORMAT_NCHW) { + axisMap.assign({0, 3, 1}); + } else { + axisMap.assign({0, 1, 3}); + } + } else if (dimension == 2) { + axisMap.assign({0, 3}); + } else if (dimension == 1) { + axisMap.assign({3}); + } else { + return nullptr; + } + uint axisIndex = axisMap[axis]; + + return new VulkanSoftmax(op, backend, axisIndex); } }; diff --git a/source/backend/vulkan/image/execution/VulkanSoftmax.hpp b/source/backend/vulkan/image/execution/VulkanSoftmax.hpp index c92e17b1b..fafd1b34a 100644 --- a/source/backend/vulkan/image/execution/VulkanSoftmax.hpp +++ b/source/backend/vulkan/image/execution/VulkanSoftmax.hpp @@ -16,23 +16,16 @@ namespace MNN { class VulkanSoftmax : public VulkanBasicExecution { public: - VulkanSoftmax(const Op* op, Backend* bn); + VulkanSoftmax(const Op* op, Backend* bn, const uint axisIndex); virtual ~VulkanSoftmax(); ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, const VulkanCommandPool::Buffer* cmdBuffer) override; private: - std::shared_ptr mConstBuffer; + std::shared_ptr mSoftmaxConstBuffer; const VulkanPipeline* mSoftmaxPipeline; std::shared_ptr mDescriptorSet; - int mAxis; - struct ConvertInfo { - const VulkanPipeline* pipeline; - std::shared_ptr convert; - std::shared_ptr buffer; - }; - ConvertInfo mSource; - ConvertInfo mOutput; + uint mAxisIndex; }; } // namespace MNN diff --git a/source/backend/vulkan/image/execution/glsl/macro.json b/source/backend/vulkan/image/execution/glsl/macro.json index bfc289616..bb50284b3 100644 --- a/source/backend/vulkan/image/execution/glsl/macro.json +++ b/source/backend/vulkan/image/execution/glsl/macro.json @@ -117,5 +117,11 @@ "VMAX", "VMIN", "SQUDIFF" + ], + "softmaxImage.comp":[ + "AXIS_N", + "AXIS_H", + "AXIS_W", + "AXIS_C" ] } diff --git a/source/backend/vulkan/image/execution/glsl/reduce.comp b/source/backend/vulkan/image/execution/glsl/reduce.comp index ebb9425c6..4b2bc8e44 100644 --- a/source/backend/vulkan/image/execution/glsl/reduce.comp +++ b/source/backend/vulkan/image/execution/glsl/reduce.comp @@ -1,5 +1,7 @@ -#version 440 core +#version 450 + #define FLOAT float +#define MAX_FLOAT (3.402823466e+38) layout(std430) buffer; layout(set=0, binding=0) writeonly buffer destBuffer{ @@ -17,45 +19,81 @@ layout(set=0, binding=2) uniform constBuffer { float k;//For mean }uConst; -layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; - -void main() -{ - ivec3 posTmp = ivec3(gl_GlobalInvocationID); - ivec2 pos; - pos.x = posTmp.x / uConst.w; - pos.y = posTmp.x % uConst.w; - // x: index in outside, y: index in inside - if(pos.y < uConst.w && pos.x < uConst.c) - { - int h = uConst.h; - int W = uConst.w; - int H = uConst.h; - int C = uConst.c; - int basicOffset = pos.x * uConst.w * uConst.h + pos.y; - FLOAT res = uInput.data[basicOffset]; - for(int i = 1; i < uConst.h; ++i) - { - FLOAT next = uInput.data[basicOffset + i * uConst.w]; -#ifdef VMAX - res = max(res, next); -#endif -#ifdef VMIN - res = min(res, next); -#endif -#ifdef SUM - res = res + next; -#endif -#ifdef PROD - res = res * next; -#endif -#ifdef MEAN - res = res + next; -#endif +shared float sharedValues[128]; + + +FLOAT initValue() { + 
FLOAT result; + #ifdef VMAX + result = -1 * MAX_FLOAT; + #endif + #ifdef VMIN + result = MAX_FLOAT; + #endif + #ifdef SUM + result = 0.0f; + #endif + #ifdef MEAN + result = 0.0f; + #endif + #ifdef PROD + result = 1.0f; + #endif + return result; +} + +FLOAT reduceFunc(FLOAT a, FLOAT b) { + FLOAT result; + #ifdef VMAX + result = max(a, b); + #endif + #ifdef VMIN + result = min(a, b); + #endif + #ifdef SUM + result = a + b; + #endif + #ifdef MEAN + result = a + b; + #endif + #ifdef PROD + result = a * b; + #endif + return result; +} + +layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in; + +void main() { + uint axisIndex = gl_GlobalInvocationID.x; + + uint outputIndex = gl_WorkGroupID.y; + uint insideIndex = outputIndex % uConst.w; + uint outsideIndex = outputIndex / uConst.w; + + uint inputOffsetBase = insideIndex + uConst.w * (uConst.h * outsideIndex); + FLOAT beforeSharedResult = initValue(); + for (uint i = axisIndex; i < uConst.h; i += gl_WorkGroupSize.x) { + beforeSharedResult = reduceFunc(beforeSharedResult, uInput.data[inputOffsetBase + i * uConst.w]); + } + + sharedValues[axisIndex] = beforeSharedResult; + + barrier(); + + for (uint stride = gl_WorkGroupSize.x >> 1; stride > 0; stride = stride >> 1) { + if (axisIndex < stride) { + sharedValues[axisIndex] = reduceFunc(sharedValues[axisIndex + stride], sharedValues[axisIndex]); } -#ifdef MEAN - res = res * uConst.k; -#endif - uOutput.data[posTmp.x] = res; + barrier(); + } + + if (axisIndex == 0) { + FLOAT result = sharedValues[0]; + #ifdef MEAN + uOutput.data[outputIndex] = result * uConst.k; + #else + uOutput.data[outputIndex] = result; + #endif } } diff --git a/source/backend/vulkan/image/execution/glsl/softmaxHeight_NHWC.comp b/source/backend/vulkan/image/execution/glsl/softmaxHeight_NHWC.comp deleted file mode 100644 index 339ffe4ea..000000000 --- a/source/backend/vulkan/image/execution/glsl/softmaxHeight_NHWC.comp +++ /dev/null @@ -1,47 +0,0 @@ -#version 440 core -layout(std430) buffer; -layout(set=0, binding=0) buffer destbuffer{ - float data[]; -}uOutput; - -layout(set=0, binding=1) readonly buffer sourceBuffer{ - float data[]; -}uInput; - -layout(set = 0, binding = 2) uniform constBuffer { - int w;//inside - int h;//axis - int c;//outside -}uConst; - -layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; - -void main() -{ - // input tensor's layout is NHWC - ivec3 pos = ivec3(gl_GlobalInvocationID); - // x: index in outside, y: index in inside - if(pos.y < uConst.w && pos.x < uConst.c) - { - int W = uConst.w; - int H = uConst.h; - int C = uConst.c; - float maxValue = uInput.data[pos.x * H * W + pos.y]; - for(int i = 1; i < H; ++i) - { - int index = i * W + pos.x * H * W + pos.y; - maxValue = max(maxValue, uInput.data[index]); - } - float sum = 0.0; - for(int i = 0; i < H; ++i) - { - int index = i * W + pos.x * H * W + pos.y; - sum += exp(uInput.data[index] - maxValue); - } - for(int i = 0; i < H; ++i) - { - int index = i * W + pos.x * H * W + pos.y; - uOutput.data[index] = exp(uInput.data[index] - maxValue) / sum; - } - } -} diff --git a/source/backend/vulkan/image/execution/glsl/softmaxImage.comp b/source/backend/vulkan/image/execution/glsl/softmaxImage.comp new file mode 100644 index 000000000..178b28fcf --- /dev/null +++ b/source/backend/vulkan/image/execution/glsl/softmaxImage.comp @@ -0,0 +1,288 @@ +#version 450 + +#define UP_DIV(x, y) (((x)+(y)-1)/(y)) +#define LOCAL_SIZE (256) +#define MAX_FLOAT (3.402823466e+38) + +layout(set=0, binding=0) writeonly uniform image2D uOutput; + +layout(set=0, 
binding=1) uniform sampler2D uInput; + +layout(set=0, binding=2) readonly uniform constBuffer { + uint N; + uint H; + uint W; + uint C4; + uint CLeft; +} uConst; + +shared vec4 sharedValues[LOCAL_SIZE]; + +layout(local_size_x = LOCAL_SIZE, local_size_y = 1, local_size_z = 1) in; + +uint calculateNumElePerInvocation() { + uint numElePerInvocation = 0; +#ifdef AXIS_N + numElePerInvocation = UP_DIV(uConst.N, LOCAL_SIZE); +#endif +#ifdef AXIS_H + numElePerInvocation = UP_DIV(uConst.H, LOCAL_SIZE); +#endif +#ifdef AXIS_W + numElePerInvocation = UP_DIV(uConst.W, LOCAL_SIZE); +#endif +#ifdef AXIS_C + numElePerInvocation = UP_DIV(uConst.C4, LOCAL_SIZE); +#endif + return numElePerInvocation; +} + +ivec2 calculatePos(uint indexN, uint indexH, uint indexW, uint indexCOut, uint H, uint W) { + uint x = indexW + indexCOut * W; + uint y = indexH + indexN * H; + return ivec2(int(x), int(y)); +} + +vec4 eleMaskMax(vec4 ele, uint CLeft) { + vec4 mask = vec4(0.0); + mask[3] = (CLeft >= 1) ? 1.0 : 0.0; + mask[2] = (CLeft >= 2) ? 1.0 : 0.0; + mask[1] = (CLeft >= 3) ? 1.0 : 0.0; + return mix(ele, vec4(-MAX_FLOAT), mask); +} + +vec4 eleMaskSum(vec4 ele, uint CLeft) { + vec4 mask = vec4(0.0); + mask[3] = (CLeft >= 1) ? 1.0 : 0.0; + mask[2] = (CLeft >= 2) ? 1.0 : 0.0; + mask[1] = (CLeft >= 3) ? 1.0 : 0.0; + return mix(ele, vec4(0.0), mask); +} + +void main() { + uint numElePerInvocation = calculateNumElePerInvocation(); + uint localIndex = gl_LocalInvocationID.x; + + +// *************************** +// Index calculation starts. +// *************************** +#ifdef AXIS_N + uint indexC4 = gl_GlobalInvocationID.y % uConst.C4; + uint indexHW = gl_GlobalInvocationID.y / uConst.C4; + uint indexW = indexHW % uConst.W; + uint indexH = indexHW / uConst.W; + uint indexNBase = localIndex; +#endif + +#ifdef AXIS_H + uint indexC4 = gl_GlobalInvocationID.y % uConst.C4; + uint indexNW = gl_GlobalInvocationID.y / uConst.C4; + uint indexW = indexNW % uConst.W; + uint indexN = indexNW / uConst.W; + uint indexHBase = localIndex; +#endif + +#ifdef AXIS_W + uint indexC4 = gl_GlobalInvocationID.y % uConst.C4; + uint indexNH = gl_GlobalInvocationID.y / uConst.C4; + uint indexH = indexNH % uConst.H; + uint indexN = indexNH / uConst.H; + uint indexWBase = localIndex; +#endif + +#ifdef AXIS_C + uint indexW = gl_GlobalInvocationID.y % uConst.W; + uint indexNH = gl_GlobalInvocationID.y / uConst.W; + uint indexH = indexNH % uConst.H; + uint indexN = indexNH / uConst.H; + uint indexC4Base = localIndex; +#endif +// *************************** +// Index calculation ends. +// *************************** + + +// *************************** +// Max reduction starts. 
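+// Each invocation strides over the softmax axis with step gl_WorkGroupSize.x, keeping a private
+// per-lane running max; the per-invocation results are then combined in sharedValues[] by the
+// shared-memory tree reduction (the loop that halves `stride`). Padding lanes of the last C4
+// block are masked to -MAX_FLOAT so they never win the max.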
+// *************************** + vec4 maxValue = vec4(-1 * (MAX_FLOAT)); +#ifdef AXIS_N + for (uint indexN = indexNBase; indexN < uConst.N; indexN += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + ele = eleMaskMax(ele, uConst.CLeft); + } + maxValue = max(maxValue, ele); + } +#endif + +#ifdef AXIS_H + for (uint indexH = indexHBase; indexH < uConst.H; indexH += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + ele = eleMaskMax(ele, uConst.CLeft); + } + maxValue = max(maxValue, ele); + } +#endif + +#ifdef AXIS_W + for (uint indexW = indexWBase; indexW < uConst.W; indexW += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + ele = eleMaskMax(ele, uConst.CLeft); + } + maxValue = max(maxValue, ele); + } +#endif + +#ifdef AXIS_C + for (uint indexC4 = indexC4Base; indexC4 < uConst.C4; indexC4 += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + ele = eleMaskMax(ele, uConst.CLeft); + } + maxValue = max(maxValue, ele); + } +#endif + + sharedValues[localIndex] = maxValue; + barrier(); + + for (uint stride = gl_WorkGroupSize.x >> 1; stride > 0; stride = stride >> 1) { + if (localIndex < stride) { + sharedValues[localIndex] = max(sharedValues[localIndex], sharedValues[localIndex + stride]); + } + barrier(); + } + maxValue = sharedValues[0]; + +#ifdef AXIS_C + float maxC = max(max(maxValue[0], maxValue[1]), max(maxValue[2], maxValue[3])); + maxValue[0] = maxC; +#endif +// *************************** +// Max reduction ends. +// *************************** + + +// *************************** +// Sum reduction starts. 
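+// Second pass over the axis: accumulate exp(x - max) per invocation, masking
+// padded channels of the last C4 block to zero via eleMaskSum, then tree-reduce
+// the partial sums in shared memory (AXIS_C sums the four lanes into a single
+// scalar with a dot product).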
+// *************************** + vec4 sumValue = vec4(0.0f); + +#ifdef AXIS_N + for (uint indexN = indexNBase; indexN < uConst.N; indexN += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - maxValue); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + expEle = eleMaskSum(expEle, uConst.CLeft); + } + sumValue += expEle; + } +#endif + +#ifdef AXIS_H + for (uint indexH = indexHBase; indexH < uConst.H; indexH += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - maxValue); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + expEle = eleMaskSum(expEle, uConst.CLeft); + } + sumValue += expEle; + } +#endif + +#ifdef AXIS_W + for (uint indexW = indexWBase; indexW < uConst.W; indexW += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - maxValue); + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + expEle = eleMaskSum(expEle, uConst.CLeft); + } + sumValue += expEle; + } +#endif + +#ifdef AXIS_C + for (uint indexC4 = indexC4Base; indexC4 < uConst.C4; indexC4 += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - vec4(maxValue[0])); // different from other cases + if (indexC4 == (uConst.C4 - 1) && uConst.CLeft > 0) { + expEle = eleMaskSum(expEle, uConst.CLeft); + } + sumValue += expEle; + } +#endif + + sharedValues[localIndex] = sumValue; + barrier(); + + for (uint stride = gl_WorkGroupSize.x >> 1; stride > 0; stride = stride >> 1) { + if (localIndex < stride) { + sharedValues[localIndex] += sharedValues[localIndex + stride]; + } + barrier(); + } + sumValue = sharedValues[0]; + +#ifdef AXIS_C + float sumC = dot(sumValue, vec4(1.0f)); + sumValue[0] = sumC; +#endif +// *************************** +// Sum reduction ends. +// *************************** + + +// *************************** +// Results output starts. 
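+// Final pass: re-read each element and write exp(x - max) / sum to the output
+// image; AXIS_C broadcasts the scalar max and sum across the four channel lanes.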
+// *************************** +#ifdef AXIS_N + for (uint indexN = indexNBase; indexN < uConst.N; indexN += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - maxValue) / sumValue; + imageStore(uOutput, pos, expEle); + } +#endif + +#ifdef AXIS_H + for (uint indexH = indexHBase; indexH < uConst.H; indexH += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - maxValue) / sumValue; + imageStore(uOutput, pos, expEle); + } +#endif + +#ifdef AXIS_W + for (uint indexW = indexWBase; indexW < uConst.W; indexW += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - maxValue) / sumValue; + imageStore(uOutput, pos, expEle); + } +#endif + +#ifdef AXIS_C + for (uint indexC4 = indexC4Base; indexC4 < uConst.C4; indexC4 += gl_WorkGroupSize.x) { + ivec2 pos = calculatePos(indexN, indexH, indexW, indexC4, uConst.H, uConst.W); + vec4 ele = texelFetch(uInput, pos, 0); + vec4 expEle = exp(ele - vec4(maxValue[0])) / vec4(sumValue[0]); // different from other cases + imageStore(uOutput, pos, expEle); + } +#endif +// *************************** +// Results output ends. +// *************************** +} diff --git a/source/backend/vulkan/image/shaders/AllShader.h b/source/backend/vulkan/image/shaders/AllShader.h index 5f52602a3..9e8adad36 100644 --- a/source/backend/vulkan/image/shaders/AllShader.h +++ b/source/backend/vulkan/image/shaders/AllShader.h @@ -4,6 +4,16 @@ extern const unsigned char glsl_dwweightcopy_comp[]; extern unsigned int glsl_dwweightcopy_comp_len; extern const unsigned char glsl_deconvCol2Im_comp[]; extern unsigned int glsl_deconvCol2Im_comp_len; +extern const unsigned char glsl_softmaxImage_comp[]; +extern unsigned int glsl_softmaxImage_comp_len; +extern const unsigned char glsl_softmaxImage_AXIS_N_comp[]; +extern unsigned int glsl_softmaxImage_AXIS_N_comp_len; +extern const unsigned char glsl_softmaxImage_AXIS_H_comp[]; +extern unsigned int glsl_softmaxImage_AXIS_H_comp_len; +extern const unsigned char glsl_softmaxImage_AXIS_W_comp[]; +extern unsigned int glsl_softmaxImage_AXIS_W_comp_len; +extern const unsigned char glsl_softmaxImage_AXIS_C_comp[]; +extern unsigned int glsl_softmaxImage_AXIS_C_comp_len; extern const unsigned char glsl_convolutionDepthwiseMali_comp[]; extern unsigned int glsl_convolutionDepthwiseMali_comp_len; extern const unsigned char glsl_convolutionDepthwiseMali_RELU_comp[]; @@ -128,8 +138,6 @@ extern const unsigned char glsl_fill_image_comp[]; extern unsigned int glsl_fill_image_comp_len; extern const unsigned char glsl_imageTonchw_comp[]; extern unsigned int glsl_imageTonchw_comp_len; -extern const unsigned char glsl_softmaxHeight_NHWC_comp[]; -extern unsigned int glsl_softmaxHeight_NHWC_comp_len; extern const unsigned char glsl_resizeNearest_comp[]; extern unsigned int glsl_resizeNearest_comp_len; extern const unsigned char glsl_resizeNearest_NEAREST_ROUND_comp[]; diff --git a/source/core/ConvolutionCommon.cpp b/source/core/ConvolutionCommon.cpp index 970b17288..a1890e980 100644 --- a/source/core/ConvolutionCommon.cpp +++ b/source/core/ConvolutionCommon.cpp @@ -12,14 +12,451 @@ #include "backend/cpu/CPUBackend.hpp" #include "half.hpp" #include "core/OpCommonUtils.hpp" -#include "core/IDSTDecoder.hpp" namespace MNN { 
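+// Weight-blob decoding helpers, moved here from core/IDSTDecoder.hpp (removed
+// later in this patch).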
+namespace IDSTDecoder { + +static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) { + return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT); +} + +static int ReadBlobDim(BaseLoader* myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) { + uint8_t uSize = 0; + myfile->read((char*)&uSize, 1); + if (uSize > 4) { + printf("Read shape error!\n"); + return 0; + } + int copyLength = uSize; + if (copyLength > shapeBufCnt) { + copyLength = shapeBufCnt; + } + if (useInt32) { + myfile->read((char*)shape, sizeof(unsigned int) * copyLength); + } else { + uint16_t shape_i16[32] = {0}; + myfile->read((char*)shape_i16, sizeof(uint16_t) * copyLength); + for (int i = 0; i < copyLength; ++i) { + shape[i] = shape_i16[i]; + } + } + return copyLength; +} + +static double _log2(double x) { + return log(x) / log(2); +} + +static uint32_t atLestBitsCnt(uint32_t n) { + for (uint32_t i = 0; i < 32; i++) { + int32_t t = n << i; + if (t < 0) + return 32 - i - (((t << 1) == 0) ? 1 : 0); + } + return 0; +} + +static void SplitBufToArray(uint8_t *buf, size_t bufLen, uint8_t *arr, size_t arrLen, size_t iNeedBits) { + unsigned char cMask = (1 << (iNeedBits)) - 1; + unsigned char *tmp = (unsigned char *)buf; + int iOffset = 0; + for (unsigned int i = 0; i < arrLen; i++) { + unsigned char idx = 0; + long uShift = 8 - iNeedBits - iOffset % 8; + if (uShift < 0) { + idx = (tmp[iOffset / 8] << (0 - uShift)) & cMask; + idx |= (tmp[(iOffset / 8) + 1] >> (8 + uShift)) & cMask; + } else { + idx = (tmp[iOffset / 8] >> uShift) & cMask; + } + iOffset += iNeedBits; + if (iOffset % 8 == 0) { + tmp += iOffset / 8; + iOffset = 0; + } + arr[i] = idx; + } +} + +// fixme!!! not efficiency +typedef struct _SIMPLE_SET { + int8_t *UniSet; + uint32_t UniSetSize; + uint32_t CurUniCnt; +} SIMPLE_SET, *PSIMPLE_SET; + +static PSIMPLE_SET CreateSimpleSet(uint32_t maxSize) { + PSIMPLE_SET set = (PSIMPLE_SET)calloc(1, sizeof(SIMPLE_SET)); + if (set == nullptr) + return nullptr; + set->UniSet = (int8_t *)calloc(maxSize, sizeof(int8_t)); + set->UniSetSize = maxSize; + set->CurUniCnt = 0; + return set; +} + +static void SimpleRank(int8_t *data, uint32_t cnt, int up) { + if (up) { + for (uint32_t i = 0; i < cnt; i++) { + for (uint32_t j = i + 1; j < cnt; j++) { + if (data[i] > data[j]) { + int8_t tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } + } + } + } else { + for (uint32_t i = 0; i < cnt; i++) { + for (uint32_t j = i + 1; j < cnt; j++) { + if (data[i] < data[j]) { + int8_t tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } + } + } + } +} + +static void InsertSimpleSet(PSIMPLE_SET set, int8_t value) { + if (set->CurUniCnt >= set->UniSetSize) + return; + for (uint32_t i = 0; i < set->CurUniCnt; i++) { + if (set->UniSet[i] == value) + return; + } + set->UniSet[set->CurUniCnt++] = value; + // SimpleRank(set->UniSet, set->CurUniCnt, 1); +} + +static void DestorySimpleSet(PSIMPLE_SET set) { + if (set->UniSet != nullptr) + free(set->UniSet); + free(set); +} + +typedef struct _SIMPLE_MAP { + int8_t *CharCharMap; + uint32_t CharMapSize; + uint32_t CurMapCnt; +} SIMPLE_MAP, *PSIMPLE_MAP; + +static PSIMPLE_MAP CreateSimpleMap(uint32_t MaxCnt) { + PSIMPLE_MAP map = (PSIMPLE_MAP)calloc(1, sizeof(SIMPLE_MAP)); + if (map == nullptr) + return nullptr; + map->CharMapSize = MaxCnt * sizeof(int8_t); + map->CurMapCnt = 0; + map->CharCharMap = (int8_t *)calloc(1, MaxCnt * 2); + return map; +} + +static void DestroySimpleMap(PSIMPLE_MAP map) { + if (map->CharCharMap) + free(map->CharCharMap); + free(map); +} + +static void 
InsertMap(PSIMPLE_MAP map, int8_t k, int8_t v) { + for (uint32_t i = 0; i < map->CurMapCnt; i++) { + if (map->CharCharMap[i * 2] == k) { + map->CharCharMap[i * 2 + 1] = v; + return; + } + } + if (map->CurMapCnt >= map->CharMapSize) + return; + map->CharCharMap[map->CurMapCnt * 2] = k; + map->CharCharMap[map->CurMapCnt * 2 + 1] = v; + map->CurMapCnt++; +} + +static int8_t FindInMap(PSIMPLE_MAP map, int8_t k, int *found) { + for (uint32_t i = 0; i < map->CurMapCnt; i++) { + if (map->CharCharMap[i * 2] == k) { + if (found != nullptr) + *found = 1; + return map->CharCharMap[i * 2 + 1]; + } + } + if (found != nullptr) + *found = 0; + return 0; +} + +static bool isLinearSample(const std::vector& sample, int bit) { + const int offset = 1 << (bit - 1); + const int size = 1 << bit; + if (sample.size() != size) { + return false; + } + for (int i = 0; i < sample.size(); i++) { + if (static_cast(sample[i]) != i - offset) { + return false; + } + } + return true; +} + +static int8_t *ReadQuanData_c(BaseLoader* s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32, bool forceQuant) { + int8_t *blob = nullptr; + uint8_t *idxBuf = nullptr; + size_t dataCnt = 1; + + do { + // blob shape + unsigned int shape[32] = {0}; + uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32); + if (shapeDim == 0 || shapeDim > 32) + break; + for (uint32_t i = 0; i < shapeDim; i++) + dataCnt *= shape[i]; + + // sample + uint32_t sampleCnt = 0; + s->read((char*)&sampleCnt, 1); + if (sampleCnt == 0) { + sampleCnt = 256; + } + result->weightMap.resize(sampleCnt); + auto samples = result->weightMap.data(); + if (samples == nullptr) + break; + s->read((char*)samples, sampleCnt); + SimpleRank(samples, sampleCnt, 1); + uint32_t idxBitsCnt = atLestBitsCnt(sampleCnt); + idxBitsCnt = idxBitsCnt < 1 ? 
1 : idxBitsCnt; + // index + size_t idxBufSize = ceil(idxBitsCnt * dataCnt * 0.125); + idxBuf = (uint8_t *)MNNMemoryAllocAlignZeroAlign(idxBufSize); + if (nullptr == idxBuf) { + MNN_ERROR("Not enought memory\n"); + break; + } + s->read((char*)idxBuf, idxBufSize); + bool linear = isLinearSample(result->weightMap, idxBitsCnt); + if (linear) { + result->originBits = idxBitsCnt; + } + if (linear && (idxBitsCnt == 4 || idxBitsCnt == 8)) { + if (!forceQuant && idxBitsCnt == 4) { + // back to float, 4bit to 8bit + *len = dataCnt; + blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)UP_DIV(dataCnt, 2) * 2); + for (int i = 0; i < idxBufSize; i++) { + int val = idxBuf[i]; + int x1 = val / 16; + int x2 = val % 16; + blob[2 * i] = x1 - 8; + blob[2 * i + 1] = x2 - 8; + } + } else { + // keep quant + blob = (int8_t*)idxBuf; + idxBuf = nullptr; + if (idxBitsCnt == 4) { + result->canUseInt4 = true; + } else { + for (int i = 0; i < idxBufSize; i++) { + blob[i] = (int)blob[i] - 128; + } + } + *len = idxBufSize; + } + } else { + blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)UP_DIV(dataCnt, 2) * 2); + if (nullptr == blob) { + break; + } + bool success = true; + int offset = (1 << (idxBitsCnt-1)); + do { + if (linear) { + SplitBufToArray(idxBuf, (uint32_t)idxBufSize, (uint8_t*)blob, (uint32_t)dataCnt, (uint32_t)idxBitsCnt); + auto src = (uint8_t*)blob; + auto dst = blob; + for (int i=0; i= sampleCnt) { + MNN_PRINT("iNeedBits is %u\nRead quan weights error with idx:%d\n", idxBitsCnt, (int)idxBytes[i]); + success = false; + break; + } + blob[i] = samples[idxBytes[i]]; + } + MNNMemoryFreeAlign(idxBytes); + } while (false); + + if (!success) { + MNNMemoryFreeAlign(blob); + blob = nullptr; + break; + } + if (len) { + *len = blob ? dataCnt : 0; + } + if (result->originBits <= 4 && forceQuant) { + // Reduce blob to 4 bit + result->canUseInt4 = true; + auto sizeDiv2 = UP_DIV(dataCnt, 2); + auto newBlob = (int8_t *)MNNMemoryAllocAlign((size_t)sizeDiv2, MNN_MEMORY_ALIGN_DEFAULT); + for (int i=0; i __autoReleaseSetWeight(nullptr, [setWeight](void *) { DestorySimpleSet(setWeight); }); + unsigned int nnz; + unsigned char iIdxNeedBits; + int8_t *blob = nullptr; + // 1. weights blob shape(unsigned int32) + int ShapeDim = ReadBlobDim(myfile, shape, 32, useInt32); + size_t Size = sizeof(int8_t); + for (int i = 0; i < ShapeDim; i++) + Size *= shape[i]; + blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)Size); + if (blob == nullptr) + return nullptr; + // 2. nnz + myfile->read((char *)&nnz, 4); + // 3. max_step use # bits () (unsigned char) + myfile->read((char *)&iIdxNeedBits, 1); + // read idx array + // 4. buf for steps ceil(nnz*step need bits/8) + AutoStorage arrIdxBuffer(nnz); + unsigned char *arrIdx = arrIdxBuffer.get(); + if (nullptr == arrIdx) { + return nullptr; + } + { + size_t bufLen = (size_t)(ceil(0.125 * iIdxNeedBits * nnz)); + char *buf = (char *)MNNMemoryAllocAlignZeroAlign(bufLen * sizeof(char)); + if (nullptr == buf) { + return nullptr; + } + myfile->read((char *)buf, bufLen); + SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrIdx, (uint32_t)nnz, (uint32_t)iIdxNeedBits); + MNNMemoryFreeAlign(buf); + } + // 5. Avalable values Count(unsigned char) + myfile->read((char *)&ucMapSize, 1); + if (0 == ucMapSize) { + ucMapSize = 256; + } + result->weightMap.resize(ucMapSize); + // 6. 
valueset(signed char * valueset_size) + for (int i = 0; i < ucMapSize; i++) { + int8_t tmp; + myfile->read((char *)&tmp, 1); + InsertSimpleSet(setWeight, tmp); + result->weightMap[i] = tmp; + } + SimpleRank(setWeight->UniSet, setWeight->CurUniCnt, 1); + // map mapWeight; + PSIMPLE_MAP mapWeight = CreateSimpleMap(256); + if (mapWeight == nullptr) { + return nullptr; + } + std::shared_ptr __autoReleaseMapWeight(nullptr, [mapWeight](void *) { DestroySimpleMap(mapWeight); }); + + for (int i = 0; i < setWeight->CurUniCnt; i++) { + InsertMap(mapWeight, i, setWeight->UniSet[i]); + } + // unsigned char iIdx = 0; + // 7. none zero weights indexes(nnz*ceil(log2(Avalable_values_Count))/8) + AutoStorage arrWeightIdxBuffer(nnz); + unsigned char *arrWeightIdx = arrWeightIdxBuffer.get(); + if (nullptr == arrWeightIdx) { + return nullptr; + } + int iDataNeedBits = (int)ceil(_log2(ucMapSize)); + iDataNeedBits = iDataNeedBits < 1 ? 1 : iDataNeedBits; + { + size_t bufLen = (size_t)(ceil(0.125 * iDataNeedBits * nnz)); + char *buf = (char *)MNNMemoryAllocAlignZeroAlign(bufLen * sizeof(char)); + if (nullptr == buf) { + return nullptr; + } + myfile->read((char *)buf, bufLen); + SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrWeightIdx, (uint32_t)nnz, + (uint32_t)iDataNeedBits); + MNNMemoryFreeAlign(buf); + } + // set blob data with idx and weight idx + { + if (alpha_size == 2 * shape[0]) { + const int min_value = -(1 << (iDataNeedBits - 1)); + auto alphaPtr = alpha_ptr; + int area = Size / shape[0]; + for (int i = 0; i < shape[0]; i++) { + float min = alphaPtr[2*i]; + float scale = alphaPtr[2*i+1]; + int zeroQuant = min_value; + if (scale > 1e-6) { + zeroQuant = round((0.0f - min) / scale) + min_value; + } + memset(blob+area*i, zeroQuant, area * sizeof(signed char)); + } + } else { + memset(blob, 0, Size * sizeof(signed char)); //backward compability with previous symmetric weight quant + } + int iPreIdx = 0; + for (int i = 0; i < nnz; i++) { + iPreIdx += arrIdx[i]; + int found = 0; + int8_t value = FindInMap(mapWeight, arrWeightIdx[i], &found); + if (!found) { + MNN_ERROR("Read quan weights error with idx:%d\n", arrWeightIdx[i]); + MNNMemoryFreeAlign(blob); + return nullptr; + } + blob[iPreIdx] = value; + } + } + *len = Size; + return blob; +} + + +} // namespace IDSTDecoder + std::shared_ptr ConvolutionCommon::load(const Op* op, Backend* backend, bool forceFloat, bool forceInt8) { auto conv = op->main_as_Convolution2D(); auto quan = conv->quanParameter(); - auto result = std::make_shared(); + std::shared_ptr result(new Int8Common); result->quan = quan; size_t buffer_size = 0, alpha_size = 0; const int8_t* buffer_ptr = nullptr; @@ -96,16 +533,6 @@ std::shared_ptr ConvolutionCommon::load(const Op* if (2 == quan->type()) { buffer = IDSTDecoder::ReadSparseQuanData_c(originBuffer.get(), &weightLength, alpha_ptr, alpha_size, result.get(), quan->shapeInt32()); } - /* - if (result->weightMap.size() > 0) { - result->canUseInt4 = true; - for (auto value : result->weightMap) { - if (value < -8 || value > 7) { - result->canUseInt4 = false; - } - } - } - */ // read fp16 data if (3 == quan->type()) { weightLength = buffer_size / sizeof(half_float::half); @@ -150,8 +577,10 @@ std::shared_ptr ConvolutionCommon::load(const Op* // clampMin is minVal in asymmetric quant, clampMin = -(2^(bit)) // and old version clampMin is -128 float clampMin = quan->aMin() == 0 ? 
-128 : quan->aMin(); - for (int o = 0; o < outputCount; ++o) { - result->alpha.get()[2 * o] = result->alpha.get()[2 * o] - clampMin * result->alpha.get()[2 * o + 1]; + if (clampMin < 0) { + for (int o = 0; o < outputCount; ++o) { + result->alpha.get()[2 * o] = result->alpha.get()[2 * o] - clampMin * result->alpha.get()[2 * o + 1]; + } } } if (!quan->has_scaleInt()) { @@ -222,10 +651,10 @@ void ConvolutionCommon::getConvParameters(std::shared_ptr *quanCommo bool ConvolutionCommon::getConvInt8Parameters(const MNN::Op* op, std::shared_ptr& quanCommon, Backend* backend, const int8_t*& weight, int& weightSize, float*& scale, int32_t*& bias, int32_t*& weightQuantZeroPoint) { + // Compability for old quant model auto conv2d = op->main_as_Convolution2D(); int outputCount = conv2d->common()->outputCount(); weightSize = 0; - auto core = static_cast(backend)->functions(); // fix xcode UndefinedBehaviorSanitizer if (conv2d->symmetricQuan() && conv2d->symmetricQuan()->weight() != nullptr) { weight = conv2d->symmetricQuan()->weight()->data(); @@ -260,27 +689,14 @@ bool ConvolutionCommon::getConvInt8Parameters(const MNN::Op* op, std::shared_ptr auto alphaAndBeta = conv2d->quanParameter()->alpha()->data(); int quantCount = conv2d->quanParameter()->alpha()->size(); if (false == weightAsy) { // symmetric quant - if (core->bytes == 2) { - core->MNNFp32ToLowp(quanCommon->alpha.get(), reinterpret_cast(scale), quantCount); - } else { - ::memcpy(scale, conv2d->quanParameter()->alpha()->data(), quantCount * core->bytes); - } + ::memcpy(scale, conv2d->quanParameter()->alpha()->data(), quantCount * sizeof(float)); } else if (true == weightAsy) { // asymmetric // int ocx2 = 2 * outputCount; int scaleSize = quantCount / 2; float clampMin = conv2d->quanParameter()->aMin() == 0 ? -128 : conv2d->quanParameter()->aMin(); - if (core->bytes == 2) { - std::unique_ptr tmp(new int16_t[quantCount]); - core->MNNFp32ToLowp(alphaAndBeta, tmp.get(), quantCount); - for (int i = 0; i < scaleSize; ++i) { - weightQuantZeroPoint[i] = static_cast(roundf((-1) * tmp[2 * i] / tmp[2 * i + 1]) + clampMin); - reinterpret_cast(scale)[i] = tmp[2 * i + 1]; - } - } else { - for (int i = 0; i < scaleSize; ++i) { - weightQuantZeroPoint[i] = static_cast(roundf((-1) * alphaAndBeta[2 * i] / alphaAndBeta[2 * i + 1]) + clampMin); - scale[i] = alphaAndBeta[2 * i + 1]; - } + for (int i = 0; i < scaleSize; ++i) { + weightQuantZeroPoint[i] = static_cast(roundf((-1) * alphaAndBeta[2 * i] / alphaAndBeta[2 * i + 1]) + clampMin); + scale[i] = alphaAndBeta[2 * i + 1]; } } return true; diff --git a/source/core/ConvolutionCommon.hpp b/source/core/ConvolutionCommon.hpp index 7b1bbd5f0..eae3d0072 100644 --- a/source/core/ConvolutionCommon.hpp +++ b/source/core/ConvolutionCommon.hpp @@ -23,6 +23,7 @@ class MNN_PUBLIC ConvolutionCommon : public Execution { std::vector weightMap; bool canUseInt4 = false; Backend* backend = nullptr; + int originBits = 8; }; static std::shared_ptr load(const Op* op, Backend* backend = nullptr, bool forceFloat = false, bool forceInt8 = false); static void getConvParameters(std::shared_ptr *quanCommon, Backend* backend, const MNN::Op *op, const float** originWeight, int* originWeightSize); diff --git a/source/core/IDSTDecoder.hpp b/source/core/IDSTDecoder.hpp deleted file mode 100644 index 757fdbf4d..000000000 --- a/source/core/IDSTDecoder.hpp +++ /dev/null @@ -1,433 +0,0 @@ -// -// IDSTDecoder.hpp -// MNN -// -// Created by MNN on 2024/03/18. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef IDSTDECODER_HPP -#define IDSTDECODER_HPP - -#include -#include -#include "MNN_generated.h" -#include "core/FileLoader.hpp" -#include "core/ConvolutionCommon.hpp" - -using namespace MNN; - -namespace IDSTDecoder { - -static inline void *MNNMemoryAllocAlignZeroAlign(size_t size) { - return MNNMemoryCallocAlign(size, MNN_MEMORY_ALIGN_DEFAULT); -} - -static int ReadBlobDim(BaseLoader* myfile, unsigned int* shape, int shapeBufCnt, bool useInt32) { - uint8_t uSize = 0; - myfile->read((char*)&uSize, 1); - if (uSize > 4) { - printf("Read shape error!\n"); - return 0; - } - int copyLength = uSize; - if (copyLength > shapeBufCnt) { - copyLength = shapeBufCnt; - } - if (useInt32) { - myfile->read((char*)shape, sizeof(unsigned int) * copyLength); - } else { - uint16_t shape_i16[32] = {0}; - myfile->read((char*)shape_i16, sizeof(uint16_t) * copyLength); - for (int i = 0; i < copyLength; ++i) { - shape[i] = shape_i16[i]; - } - } - return copyLength; -} - -static double _log2(double x) { - return log(x) / log(2); -} - -static uint32_t atLestBitsCnt(uint32_t n) { - for (uint32_t i = 0; i < 32; i++) { - int32_t t = n << i; - if (t < 0) - return 32 - i - (((t << 1) == 0) ? 1 : 0); - } - return 0; -} - -static void SplitBufToArray(uint8_t *buf, size_t bufLen, uint8_t *arr, size_t arrLen, size_t iNeedBits) { - unsigned char cMask = (1 << (iNeedBits)) - 1; - unsigned char *tmp = (unsigned char *)buf; - int iOffset = 0; - for (unsigned int i = 0; i < arrLen; i++) { - unsigned char idx = 0; - long uShift = 8 - iNeedBits - iOffset % 8; - if (uShift < 0) { - idx = (tmp[iOffset / 8] << (0 - uShift)) & cMask; - idx |= (tmp[(iOffset / 8) + 1] >> (8 + uShift)) & cMask; - } else { - idx = (tmp[iOffset / 8] >> uShift) & cMask; - } - iOffset += iNeedBits; - if (iOffset % 8 == 0) { - tmp += iOffset / 8; - iOffset = 0; - } - arr[i] = idx; - } -} - -// fixme!!! 
not efficiency -typedef struct _SIMPLE_SET { - int8_t *UniSet; - uint32_t UniSetSize; - uint32_t CurUniCnt; -} SIMPLE_SET, *PSIMPLE_SET; - -static PSIMPLE_SET CreateSimpleSet(uint32_t maxSize) { - PSIMPLE_SET set = (PSIMPLE_SET)calloc(1, sizeof(SIMPLE_SET)); - if (set == nullptr) - return nullptr; - set->UniSet = (int8_t *)calloc(maxSize, sizeof(int8_t)); - set->UniSetSize = maxSize; - set->CurUniCnt = 0; - return set; -} - -static void SimpleRank(int8_t *data, uint32_t cnt, int up) { - if (up) { - for (uint32_t i = 0; i < cnt; i++) { - for (uint32_t j = i + 1; j < cnt; j++) { - if (data[i] > data[j]) { - int8_t tmp = data[i]; - data[i] = data[j]; - data[j] = tmp; - } - } - } - } else { - for (uint32_t i = 0; i < cnt; i++) { - for (uint32_t j = i + 1; j < cnt; j++) { - if (data[i] < data[j]) { - int8_t tmp = data[i]; - data[i] = data[j]; - data[j] = tmp; - } - } - } - } -} - -static void InsertSimpleSet(PSIMPLE_SET set, int8_t value) { - if (set->CurUniCnt >= set->UniSetSize) - return; - for (uint32_t i = 0; i < set->CurUniCnt; i++) { - if (set->UniSet[i] == value) - return; - } - set->UniSet[set->CurUniCnt++] = value; - // SimpleRank(set->UniSet, set->CurUniCnt, 1); -} - -static void DestorySimpleSet(PSIMPLE_SET set) { - if (set->UniSet != nullptr) - free(set->UniSet); - free(set); -} - -typedef struct _SIMPLE_MAP { - int8_t *CharCharMap; - uint32_t CharMapSize; - uint32_t CurMapCnt; -} SIMPLE_MAP, *PSIMPLE_MAP; - -static PSIMPLE_MAP CreateSimpleMap(uint32_t MaxCnt) { - PSIMPLE_MAP map = (PSIMPLE_MAP)calloc(1, sizeof(SIMPLE_MAP)); - if (map == nullptr) - return nullptr; - map->CharMapSize = MaxCnt * sizeof(int8_t); - map->CurMapCnt = 0; - map->CharCharMap = (int8_t *)calloc(1, MaxCnt * 2); - return map; -} - -static void DestroySimpleMap(PSIMPLE_MAP map) { - if (map->CharCharMap) - free(map->CharCharMap); - free(map); -} - -static void InsertMap(PSIMPLE_MAP map, int8_t k, int8_t v) { - for (uint32_t i = 0; i < map->CurMapCnt; i++) { - if (map->CharCharMap[i * 2] == k) { - map->CharCharMap[i * 2 + 1] = v; - return; - } - } - if (map->CurMapCnt >= map->CharMapSize) - return; - map->CharCharMap[map->CurMapCnt * 2] = k; - map->CharCharMap[map->CurMapCnt * 2 + 1] = v; - map->CurMapCnt++; -} - -static int8_t FindInMap(PSIMPLE_MAP map, int8_t k, int *found) { - for (uint32_t i = 0; i < map->CurMapCnt; i++) { - if (map->CharCharMap[i * 2] == k) { - if (found != nullptr) - *found = 1; - return map->CharCharMap[i * 2 + 1]; - } - } - if (found != nullptr) - *found = 0; - return 0; -} - -static bool isLinearSample(const std::vector& sample, int bit) { - const int offset = 1 << (bit - 1); - const int size = 1 << bit; - if (sample.size() != size) { - return false; - } - for (int i = 0; i < sample.size(); i++) { - if (static_cast(sample[i]) != i - offset) { - return false; - } - } - return true; -} - -static int8_t *ReadQuanData_c(BaseLoader* s, size_t* len, ConvolutionCommon::Int8Common* result, bool shapeInt32, bool forceQuant) { - int8_t *blob = nullptr; - uint8_t *idxBuf = nullptr; - uint8_t *idxBytes = nullptr; - size_t dataCnt = 1; - - do { - // blob shape - unsigned int shape[32] = {0}; - uint32_t shapeDim = (uint32_t)ReadBlobDim(s, shape, 32, shapeInt32); - if (shapeDim == 0 || shapeDim > 32) - break; - for (uint32_t i = 0; i < shapeDim; i++) - dataCnt *= shape[i]; - - // sample - uint32_t sampleCnt = 0; - s->read((char*)&sampleCnt, 1); - if (sampleCnt == 0) { - sampleCnt = 256; - } - result->weightMap.resize(sampleCnt); - auto samples = result->weightMap.data(); - if (samples == nullptr) - 
break; - s->read((char*)samples, sampleCnt); - SimpleRank(samples, sampleCnt, 1); - uint32_t idxBitsCnt = atLestBitsCnt(sampleCnt); - idxBitsCnt = idxBitsCnt < 1 ? 1 : idxBitsCnt; - // index - size_t idxBufSize = ceil(idxBitsCnt * dataCnt * 0.125); - idxBuf = (uint8_t *)MNNMemoryAllocAlignZeroAlign(idxBufSize); - if (nullptr == idxBuf) { - MNN_ERROR("Not enought memory\n"); - break; - } - s->read((char*)idxBuf, idxBufSize); - if (idxBitsCnt == 4) { - dataCnt = UP_DIV(dataCnt, 2) * 2; - } - - if (isLinearSample(result->weightMap, idxBitsCnt) && (idxBitsCnt == 4 || idxBitsCnt == 8)) { - if (!forceQuant && idxBitsCnt == 4) { - // back to float, 4bit to 8bit - *len = dataCnt; - blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt); - for (int i = 0; i < idxBufSize; i++) { - int val = idxBuf[i]; - int x1 = val / 16; - int x2 = val % 16; - blob[2 * i] = x1 - 8; - blob[2 * i + 1] = x2 - 8; - } - } else { - // keep quant - blob = (int8_t*)idxBuf; - idxBuf = nullptr; - if (idxBitsCnt == 4) { - result->canUseInt4 = true; - } else { - for (int i = 0; i < idxBufSize; i++) { - blob[i] = (int)blob[i] - 128; - } - } - *len = idxBufSize; - } - } else { - blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)dataCnt); - if (nullptr == blob) { - break; - } - // split index value into bytes - idxBytes = (uint8_t *)MNNMemoryAllocAlignZeroAlign(dataCnt * sizeof(uint8_t)); - if (idxBitsCnt == 0 || nullptr == idxBytes) { - break; - } - SplitBufToArray(idxBuf, (uint32_t)idxBufSize, idxBytes, (uint32_t)dataCnt, (uint32_t)idxBitsCnt); - int i = 0; - for (; i < dataCnt; i++) { - if (idxBytes[i] >= sampleCnt) { - MNN_PRINT("iNeedBits is %u\nRead quan weights error with idx:%d\n", idxBitsCnt, (int)idxBytes[i]); - break; - } - blob[i] = samples[idxBytes[i]]; - } - - if (i < dataCnt) { - MNNMemoryFreeAlign(blob); - blob = nullptr; - break; - } - MNNMemoryFreeAlign(idxBytes); - idxBytes = nullptr; - if (len) - *len = blob ? dataCnt : 0; - } - } while (0); - - if (idxBuf != nullptr) - MNNMemoryFreeAlign(idxBuf); - if (idxBytes != nullptr) - MNNMemoryFreeAlign(idxBytes); - - return blob; -} - -static int8_t *ReadSparseQuanData_c(BaseLoader* myfile, size_t* len, const float* alpha_ptr, size_t alpha_size, ConvolutionCommon::Int8Common* result, bool useInt32) { // MNN_ERROR("sparse:%d\n", 1); - unsigned int shape[32]; - uint32_t ucMapSize = 0; - PSIMPLE_SET setWeight = CreateSimpleSet(256); - if (setWeight == nullptr) { - return nullptr; - } - std::shared_ptr __autoReleaseSetWeight(nullptr, [setWeight](void *) { DestorySimpleSet(setWeight); }); - unsigned int nnz; - unsigned char iIdxNeedBits; - int8_t *blob = nullptr; - // 1. weights blob shape(unsigned int32) - int ShapeDim = ReadBlobDim(myfile, shape, 32, useInt32); - size_t Size = sizeof(int8_t); - for (int i = 0; i < ShapeDim; i++) - Size *= shape[i]; - blob = (int8_t *)MNNMemoryAllocAlignZeroAlign((size_t)Size); - if (blob == nullptr) - return nullptr; - // 2. nnz - myfile->read((char *)&nnz, 4); - // 3. max_step use # bits () (unsigned char) - myfile->read((char *)&iIdxNeedBits, 1); - // read idx array - // 4. 
buf for steps ceil(nnz*step need bits/8) - AutoStorage arrIdxBuffer(nnz); - unsigned char *arrIdx = arrIdxBuffer.get(); - if (nullptr == arrIdx) { - return nullptr; - } - { - size_t bufLen = (size_t)(ceil(0.125 * iIdxNeedBits * nnz)); - char *buf = (char *)MNNMemoryAllocAlignZeroAlign(bufLen * sizeof(char)); - if (nullptr == buf) { - return nullptr; - } - myfile->read((char *)buf, bufLen); - SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrIdx, (uint32_t)nnz, (uint32_t)iIdxNeedBits); - MNNMemoryFreeAlign(buf); - } - // 5. Avalable values Count(unsigned char) - myfile->read((char *)&ucMapSize, 1); - if (0 == ucMapSize) { - ucMapSize = 256; - } - result->weightMap.resize(ucMapSize); - // 6. valueset(signed char * valueset_size) - for (int i = 0; i < ucMapSize; i++) { - int8_t tmp; - myfile->read((char *)&tmp, 1); - InsertSimpleSet(setWeight, tmp); - result->weightMap[i] = tmp; - } - SimpleRank(setWeight->UniSet, setWeight->CurUniCnt, 1); - // map mapWeight; - PSIMPLE_MAP mapWeight = CreateSimpleMap(256); - if (mapWeight == nullptr) { - return nullptr; - } - std::shared_ptr __autoReleaseMapWeight(nullptr, [mapWeight](void *) { DestroySimpleMap(mapWeight); }); - - for (int i = 0; i < setWeight->CurUniCnt; i++) { - InsertMap(mapWeight, i, setWeight->UniSet[i]); - } - // unsigned char iIdx = 0; - // 7. none zero weights indexes(nnz*ceil(log2(Avalable_values_Count))/8) - AutoStorage arrWeightIdxBuffer(nnz); - unsigned char *arrWeightIdx = arrWeightIdxBuffer.get(); - if (nullptr == arrWeightIdx) { - return nullptr; - } - int iDataNeedBits = (int)ceil(_log2(ucMapSize)); - iDataNeedBits = iDataNeedBits < 1 ? 1 : iDataNeedBits; - { - size_t bufLen = (size_t)(ceil(0.125 * iDataNeedBits * nnz)); - char *buf = (char *)MNNMemoryAllocAlignZeroAlign(bufLen * sizeof(char)); - if (nullptr == buf) { - return nullptr; - } - myfile->read((char *)buf, bufLen); - SplitBufToArray((uint8_t *)buf, (uint32_t)bufLen, (uint8_t *)arrWeightIdx, (uint32_t)nnz, - (uint32_t)iDataNeedBits); - MNNMemoryFreeAlign(buf); - } - // set blob data with idx and weight idx - { - if (alpha_size == 2 * shape[0]) { - const int min_value = -(1 << (iDataNeedBits - 1)); - auto alphaPtr = alpha_ptr; - int area = Size / shape[0]; - for (int i = 0; i < shape[0]; i++) { - float min = alphaPtr[2*i]; - float scale = alphaPtr[2*i+1]; - int zeroQuant = min_value; - if (scale > 1e-6) { - zeroQuant = round((0.0f - min) / scale) + min_value; - } - memset(blob+area*i, zeroQuant, area * sizeof(signed char)); - } - } else { - memset(blob, 0, Size * sizeof(signed char)); //backward compability with previous symmetric weight quant - } - int iPreIdx = 0; - for (int i = 0; i < nnz; i++) { - iPreIdx += arrIdx[i]; - int found = 0; - int8_t value = FindInMap(mapWeight, arrWeightIdx[i], &found); - if (!found) { - MNN_ERROR("Read quan weights error with idx:%d\n", arrWeightIdx[i]); - MNNMemoryFreeAlign(blob); - return nullptr; - } - blob[iPreIdx] = value; - } - } - *len = Size; - return blob; -} - - -} // namespace IDSTDecoder - -#endif // IDSTDECODER_HPP diff --git a/source/core/IDSTEncoder.hpp b/source/core/IDSTEncoder.hpp index 4c72f2d29..0982e55da 100644 --- a/source/core/IDSTEncoder.hpp +++ b/source/core/IDSTEncoder.hpp @@ -432,31 +432,35 @@ static std::unique_ptr encode(const float* weight, const std::vector< bool shapeUseInt32 = false; std::unique_ptr idst(new IDSTQuanT); std::ostringstream outputStringStreamCQ; - WriteCQBlobs(outputStringStreamCQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); - auto 
cqStr = outputStringStreamCQ.str(); - if (detectSparse) { - std::ostringstream outputStringStreamSQ; - bool sparseValid = WriteSparseQuanBlobs(outputStringStreamSQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); - auto sqStr = outputStringStreamSQ.str(); - int int8Size = kernelNum * kernelSize; - if (quantWeightPtr && (int8Size <= cqStr.size() && int8Size <= sqStr.size())) { - idst->type = 4; - idst->aMax = kernelNum; - idst->buffer.resize(int8Size); - ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); - } else if (cqStr.size() <= sqStr.size() || (!sparseValid)) { + if (quantWeightPtr && nullptr == weight) { + auto int8Size = kernelNum * kernelSize; + // Use Quanted weight + idst->type = 4; + idst->aMax = kernelNum; + idst->buffer.resize(int8Size); + ::memcpy(idst->buffer.data(), quantWeightPtr, int8Size); + } else { + WriteCQBlobs(outputStringStreamCQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); + auto cqStr = outputStringStreamCQ.str(); + if (detectSparse) { + std::ostringstream outputStringStreamSQ; + bool sparseValid = WriteSparseQuanBlobs(outputStringStreamSQ, weight, scale.data(), kernelSize, kernelNum, asymmetricQuantFlag, shapeUseInt32, bits); + auto sqStr = outputStringStreamSQ.str(); + int int8Size = kernelNum * kernelSize; + if (cqStr.size() <= sqStr.size() || (!sparseValid)) { + idst->type = 1; + idst->buffer.resize(cqStr.size()); + ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); + } else { + idst->type = 2; + idst->buffer.resize(sqStr.size()); + ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); + } + } else { idst->type = 1; idst->buffer.resize(cqStr.size()); ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); - } else { - idst->type = 2; - idst->buffer.resize(sqStr.size()); - ::memcpy(idst->buffer.data(), sqStr.data(), sqStr.size()); } - } else { - idst->type = 1; - idst->buffer.resize(cqStr.size()); - ::memcpy(idst->buffer.data(), cqStr.data(), cqStr.size()); } idst->shapeInt32 = shapeUseInt32; idst->alpha.resize(scale.size()); diff --git a/source/core/Interpreter.cpp b/source/core/Interpreter.cpp index 6620e0045..c8280ceef 100644 --- a/source/core/Interpreter.cpp +++ b/source/core/Interpreter.cpp @@ -226,6 +226,9 @@ Interpreter::Interpreter(Content* net) { } Interpreter::~Interpreter() { + for (auto iter = mNet->sessions.begin(); iter != mNet->sessions.end(); iter++) { + updateCacheFile((*iter).get()); + } { // If the session is running, we must not delete session std::unique_lock _l(mNet->lock); diff --git a/source/core/OpCommonUtils.cpp b/source/core/OpCommonUtils.cpp index 4a62fb4db..0197fb965 100644 --- a/source/core/OpCommonUtils.cpp +++ b/source/core/OpCommonUtils.cpp @@ -109,11 +109,11 @@ static std::tuple _computeStride(const std::tuple& return std::make_tuple(inside, axis, outside); } -static bool _checkFuseValid(const OpCommonUtils::SPLITS& srcTup, const OpCommonUtils::SPLITS& srcSplits, bool swapnc, bool swapcw, bool srcAllLengthValid) { +static bool _checkFuseValid(const OpCommonUtils::SPLITS& srcTup, const OpCommonUtils::SPLITS& srcSplits, bool swapnc, bool swapcw, const std::tuple& valid) { auto srcFused = _computeAxisFused(srcTup); if (swapnc) { // cw can't be fused if n > 1, because layout is c, n, w - if (std::get<1>(srcFused) && srcAllLengthValid) { + if (std::get<1>(srcFused) && std::get<2>(valid)) { return false; } if (std::get<0>(srcFused)) { @@ -124,7 +124,7 @@ static bool _checkFuseValid(const OpCommonUtils::SPLITS& srcTup, const 
OpCommonU } } else if (swapcw) { // nc can't be fused if w > 1 - if (std::get<0>(srcFused) && srcAllLengthValid) { + if (std::get<0>(srcFused) && std::get<0>(valid)) { return false; } if (std::get<1>(srcFused)) { @@ -135,7 +135,15 @@ static bool _checkFuseValid(const OpCommonUtils::SPLITS& srcTup, const OpCommonU } } else { // nw can't be fused if c > 1 - if (std::get<2>(srcFused) && srcAllLengthValid) { + if (std::get<2>(srcFused) && std::get<1>(valid)) { + return false; + } + // nc can't be fused if w > 1 + if (std::get<0>(srcFused) && std::get<0>(valid)) { + return false; + } + // cw can't be fused if n > 1, because layout is c, n, w + if (std::get<1>(srcFused) && std::get<2>(valid)) { return false; } } @@ -151,10 +159,15 @@ bool OpCommonUtils::canBlitFast(const Tensor::InsideDescribe::Region& region, co if (dstCOffset % pack != 0) { return false; } - bool srcAllLengthValid = std::get<0>(srcSplits) > 1 && std::get<1>(srcSplits) > 1 && std::get<2>(srcSplits) > 1; - bool dstAllLengthValid = std::get<0>(dstSplits) > 1 && std::get<1>(dstSplits) > 1 && std::get<2>(dstSplits) > 1; + auto wValid = std::get<0>(srcSplits) > 1 || std::get<0>(dstSplits) > 1; + auto cValid = std::get<1>(srcSplits) > 1 || std::get<1>(dstSplits) > 1; + auto nValid = std::get<2>(srcSplits) > 1 || std::get<2>(dstSplits) > 1; + auto valid = std::make_tuple(wValid, cValid, nValid); // Check Dst stride for (int i = 0; i < 3; ++i) { + if (region.size[i] <= 1) { + continue; + } int dstStride = (region.size[i] - 1) * region.dst.stride[i]; auto srcStride = region.src.stride[i] * (region.size[i] - 1); auto dstTup = _split(dstStride, std::get<1>(dstSplits), std::get<0>(dstSplits)); @@ -162,10 +175,10 @@ bool OpCommonUtils::canBlitFast(const Tensor::InsideDescribe::Region& region, co if (std::get<1>(dstTup) != std::get<1>(srcTup)) { return false; } - if (!_checkFuseValid(srcTup, srcSplits, swapnc, swapcw, srcAllLengthValid)) { + if (!_checkFuseValid(srcTup, srcSplits, swapnc, swapcw, valid)) { return false; } - if (!_checkFuseValid(dstTup, dstSplits, swapnc, swapcw, dstAllLengthValid)) { + if (!_checkFuseValid(dstTup, dstSplits, swapnc, swapcw, valid)) { return false; } } @@ -571,6 +584,7 @@ bool OpCommonUtils::opCompabilityForLowp(const Op* op, int bytes) { case OpType_GridSample: case OpType_ROIPooling: case OpType_ROIAlign: + case OpType_RNNSequenceGRU: case OpType_DynamicQuant: case OpType_Attention: case OpType_LayerNorm: @@ -879,4 +893,19 @@ bool OpCommonUtils::computeMatMulSize(bool transposeA, bool transposeB, const Te } +DataType OpCommonUtils::convertDataType(halide_type_t type) { + if (type.code == halide_type_float) { + return DataType_DT_FLOAT; + } + if (type.code == halide_type_uint && type.bits == 8) { + return DataType_DT_UINT8; + } + if (type.code == halide_type_int && type.bits == 8) { + return DataType_DT_INT8; + } + if (type.code == halide_type_int && type.bits == 32) { + return DataType_DT_INT32; + } + return DataType_DT_INVALID; +} } // namespace MNN diff --git a/source/core/OpCommonUtils.hpp b/source/core/OpCommonUtils.hpp index 7c35bcaa8..a51c901d0 100644 --- a/source/core/OpCommonUtils.hpp +++ b/source/core/OpCommonUtils.hpp @@ -63,6 +63,8 @@ class MNN_PUBLIC OpCommonUtils { static bool computeMatMulSize(bool transposeA, bool transposeB, const Tensor* A, const Tensor* B, int& e, int& l, int& h); static Execution* createExecutionWithExternal(Backend* backend, const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, FileLoader* externalFile, std::shared_ptr& tmpstore); + static DataType 
convertDataType(halide_type_t type); + }; } // namespace MNN diff --git a/source/core/Pipeline.cpp b/source/core/Pipeline.cpp index 0108aa5e6..8bb123a41 100644 --- a/source/core/Pipeline.cpp +++ b/source/core/Pipeline.cpp @@ -27,12 +27,10 @@ static bool _supportQuant(const Op* op, const std::vector& inputs, cons switch (otype) { case OpType_Convolution: case OpType_ConvolutionDepthwise: - if (op->main_as_Convolution2D() && op->main_as_Convolution2D()->weight() != nullptr) { + case OpType_Deconvolution: + if (inputs.size() > 1) { return false; - } else { - return true; } - case OpType_Deconvolution: if (op->main_as_Convolution2D() && op->main_as_Convolution2D()->weight() != nullptr) { return false; } else { @@ -1138,7 +1136,7 @@ ErrorCode Pipeline::allocMemory(bool firstMalloc, bool forbidReplace) { if (!mRuntime->hasAsyncWork()) { _pushTuningTask(std::move(initInfos)); } - mBackend.reset(mCpuRuntime->onCreate(nullptr)); + mBackend.reset(mCpuRuntime->onCreate(nullptr, mBackupBackend.get())); } } { diff --git a/source/core/Session.cpp b/source/core/Session.cpp index b8354d53a..1d32a8194 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -135,7 +135,11 @@ Session::Session(Schedule::ScheduleInfo&& info, const ModeGroup& mode, RuntimeIn attr.autoSetOpType = mode.backendMode == Interpreter::Session_Backend_Auto; auto rt = mRuntime.first.find(iter.first.info.type)->second.get(); auto cpuRuntime = mRuntime.second; - std::shared_ptr newPipeline(new Pipeline( mInfo.externalWeightPath, std::move(iter), mode.inputMode == Interpreter::Session_Input_Inside, mode.outputMode == Interpreter::Session_Output_User, attr, rt, cpuRuntime.get(), mMode.geometryMask)); + auto geoMask = mMode.geometryMask; + if (rt->onGetCompilerType() != Runtime::Compiler_Loop) { + geoMask = 0; + } + std::shared_ptr newPipeline(new Pipeline( mInfo.externalWeightPath, std::move(iter), mode.inputMode == Interpreter::Session_Input_Inside, mode.outputMode == Interpreter::Session_Output_User, attr, rt, cpuRuntime.get(), geoMask)); mPipelines.emplace_back(std::move(newPipeline)); } mCallBackMode = mode.callBackMode; diff --git a/source/core/SimdHeader.h b/source/core/SimdHeader.h new file mode 100644 index 000000000..befe78aab --- /dev/null +++ b/source/core/SimdHeader.h @@ -0,0 +1,15 @@ +#ifndef SIMDHEADER_HPP +#define SIMDHEADER_HPP +#ifdef MNN_USE_NEON +#include +#endif +#ifdef MNN_USE_SSE +#if defined(_MSC_VER) +#include +#elif defined(__EMSCRIPTEN__) +#include +#else +#include +#endif +#endif +#endif diff --git a/source/core/Tensor.cpp b/source/core/Tensor.cpp index 28d5076a8..b8395412b 100644 --- a/source/core/Tensor.cpp +++ b/source/core/Tensor.cpp @@ -77,7 +77,7 @@ Tensor::Tensor(const Tensor* tensor, DimensionType type, bool allocMemory) { // format mapping auto originType = tensor->getDimensionType(); - if (originType != type && buffer.dimensions >= 4) { + if (originType != type && buffer.dimensions >= 3) { std::vector axisMap; // NCHW -> NHWC if (originType == CAFFE) { diff --git a/source/core/TensorUtils.cpp b/source/core/TensorUtils.cpp index e0e9517b8..01398fb34 100644 --- a/source/core/TensorUtils.cpp +++ b/source/core/TensorUtils.cpp @@ -798,8 +798,30 @@ TensorUtils::FuseWrap::~ FuseWrap() { bool TensorUtils::FuseWrap::match(const Tensor::InsideDescribe::Region& srcReg, const Tensor::InsideDescribe::Region& dstReg) { return mStatus->match(srcReg, dstReg); } +#ifdef MNN_DEBUG_BLIT +static std::string _printRegion(const Tensor::InsideDescribe::Region& reg) { + char info[2048]; + sprintf(info, "size: %d, 
%d, %d; src: %d, %d, %d, %d; dst: %d, %d, %d, %d", reg.size[0], reg.size[1], reg.size[2], reg.src.offset, reg.src.stride[0], reg.src.stride[1], reg.src.stride[2], reg.dst.offset, reg.dst.stride[0], reg.dst.stride[1], reg.dst.stride[2]); + info[2047] = 0; + return std::string(info); +} +#endif + void TensorUtils::FuseWrap::apply(const Tensor::InsideDescribe::Region& srcReg, Tensor::InsideDescribe::Region& dstReg) { +#ifdef MNN_DEBUG_BLIT + { + auto src = _printRegion(srcReg); + auto dst = _printRegion(dstReg); + MNN_PRINT("Fuse:\n %s \n %s\n To: \n", src.c_str(), dst.c_str()); + } +#endif mStatus->apply(srcReg, dstReg); +#ifdef MNN_DEBUG_BLIT + { + auto dst = _printRegion(dstReg); + MNN_PRINT("%s\n", dst.c_str()); + } +#endif } void TensorUtils::adjustTensorForCompability(Tensor* newTensor) { diff --git a/source/geometry/GeometryBinary.cpp b/source/geometry/GeometryBinary.cpp index 1e6610c11..c7f52eec3 100644 --- a/source/geometry/GeometryBinary.cpp +++ b/source/geometry/GeometryBinary.cpp @@ -99,7 +99,7 @@ class GeometryBinary : public GeometryComputer { auto outFormat = TensorUtils::getDescribe(output)->dimensionFormat; MNN_ASSERT(0 != inputL1 && 0 != inputL0 && 0 != outputSize); //MNN_PRINT("On compute geometry: %d - %d - %d\n", inputL0, inputL1, outputSize); - if (1 == inputL0 || 1 == inputL1) { + if (1 == inputL0 || 1 == inputL1 || context.forwardType() == MNN_FORWARD_NN) { // Can directly compute std::shared_ptr cmdP(new Command); auto& cmd = *cmdP; diff --git a/source/geometry/GeometryConvert.cpp b/source/geometry/GeometryConvert.cpp index 7d696a261..2f936af25 100644 --- a/source/geometry/GeometryConvert.cpp +++ b/source/geometry/GeometryConvert.cpp @@ -7,21 +7,41 @@ // #include "ConvertUtils.hpp" -#include "geometry/GeometryComputer.hpp" +#include "core/OpCommonUtils.hpp" #include "core/TensorUtils.hpp" +#include "GeometryComputer.hpp" +#include "GeometryComputerUtils.hpp" namespace MNN { class GeometryConvert : public GeometryComputer { public: virtual bool onCompute(const Op* op, const std::vector& inputs, const std::vector& outputs, Context& context, CommandBuffer& buffer) const override { + if (op->type() == OpType_ConvertTensor) { + auto input = inputs[0]; + auto output = outputs[0]; + return ConvertUtils::compute(input, output, buffer); + } + MNN_ASSERT(op->type() == OpType_CastLike); auto input = inputs[0]; + auto type = OpCommonUtils::convertDataType(inputs[1]->getType()); auto output = outputs[0]; - return ConvertUtils::compute(input, output, buffer); + flatbuffers::FlatBufferBuilder builder; + CastParamBuilder builder_(builder); + builder_.add_dstT(type); + auto mainOffset = builder_.Finish().Union(); + OpBuilder opB(builder); + opB.add_type(OpType_Cast); + opB.add_main(mainOffset); + opB.add_main_type(OpParameter_CastParam); + builder.Finish(opB.Finish()); + auto cmd = GeometryComputerUtils::makeCommand(builder, {input}, outputs); + buffer.command.emplace_back(cmd); + return true; } }; static void _create() { std::shared_ptr comp(new GeometryConvert); - GeometryComputer::registerGeometryComputer(comp, {OpType_ConvertTensor}); + GeometryComputer::registerGeometryComputer(comp, {OpType_ConvertTensor, OpType_CastLike}); } REGISTER_GEOMETRY(GeometryConvert, _create); diff --git a/source/geometry/GeometryELU.cpp b/source/geometry/GeometryELU.cpp index eb3f0ab49..865d3a8fe 100644 --- a/source/geometry/GeometryELU.cpp +++ b/source/geometry/GeometryELU.cpp @@ -29,20 +29,10 @@ class GeometryELU : public GeometryComputer { std::shared_ptr expValue(new Tensor); { 
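+            // EXPM1 computes exp(x) - 1 directly, so the separate subtract-one
+            // step that followed is removed below.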
initTensor(expValue, input); - auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_EXP, input, expValue.get()); + auto cmd = GeometryComputerUtils::makeUnary(UnaryOpOperation_EXPM1, input, expValue.get()); res.extras.emplace_back(expValue); res.command.emplace_back(std::move(cmd)); } - // sub - std::shared_ptr subValue(new Tensor); - { - auto oneConst = context.allocConst(op, {}, halide_type_of()); - oneConst->host()[0] = 1.0; - initTensor(subValue, input); - auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_SUB, expValue.get(), oneConst.get(), subValue.get()); - res.extras.emplace_back(subValue); - res.command.emplace_back(std::move(cmd)); - } // mul std::shared_ptr mulValue(new Tensor); { @@ -56,7 +46,7 @@ class GeometryELU : public GeometryComputer { } alphaConst->host()[0] = alpha; initTensor(mulValue, input); - auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_MUL, subValue.get(), alphaConst.get(), mulValue.get()); + auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_MUL, expValue.get(), alphaConst.get(), mulValue.get()); res.extras.emplace_back(mulValue); res.command.emplace_back(std::move(cmd)); } @@ -72,17 +62,16 @@ class GeometryELU : public GeometryComputer { res.extras.emplace_back(compValue); res.command.emplace_back(std::move(cmd)); } - std::shared_ptr scaleValue(new Tensor); - { - if (op->type() == OpType_Selu) { - auto scaleConst = context.allocConst(op, {}, halide_type_of()); - float scale = op->main_as_Selu()->scale(); - scaleConst->host()[0] = scale; - initTensor(scaleValue, input); - auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_MUL, input, scaleConst.get(), scaleValue.get()); - res.extras.emplace_back(scaleValue); - res.command.emplace_back(std::move(cmd)); - } + std::shared_ptr scaleValue; + if (op->type() == OpType_Selu) { + scaleValue.reset(new Tensor); + auto scaleConst = context.allocConst(op, {}, halide_type_of()); + float scale = op->main_as_Selu()->scale(); + scaleConst->host()[0] = scale; + initTensor(scaleValue, input); + auto cmd = GeometryComputerUtils::makeBinary(BinaryOpOperation_MUL, input, scaleConst.get(), scaleValue.get()); + res.extras.emplace_back(scaleValue); + res.command.emplace_back(std::move(cmd)); } // select { diff --git a/source/geometry/GeometryImageOp.cpp b/source/geometry/GeometryImageOp.cpp index 8f679422e..7689c1478 100644 --- a/source/geometry/GeometryImageOp.cpp +++ b/source/geometry/GeometryImageOp.cpp @@ -64,51 +64,96 @@ class GeometryImageOp : public GeometryComputer { flatbuffers::FlatBufferBuilder builder; builder.Finish(makeInterp(builder, &info, 2, op, OpType_Interp)); res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {newInputs[0]}, newOutputs)); - } - else if (OpType_Interp == op->type() && inputs[0]->dimensions() <= 4) { - // Compute cord transform for interp - auto resize = op->main_as_Interp(); - auto inW = inputs[0]->width(); - auto inH = inputs[0]->height(); - auto outW = outputs[0]->width(); - auto outH = outputs[0]->height(); - InterpInfo info; - bool computeScale = true; - if (inputs.size() > 1 && inputs[1]->getType().code == halide_type_float) { - computeScale = false; - info.heightScale = 1.0f / inputs[1]->host()[2]; - if (inputs[0]->dimensions() >= 4) { - info.widthScale = 1.0f / inputs[1]->host()[3]; + } else if (OpType_Interp == op->type()) { + auto tempInput = newInputs[0]; + auto tempOutput = newOutputs[0]; + int offset = 2; + for (int d=0; ddimensions() && d<2; ++d) { + if (tempInput->length(d) != tempOutput->length(d)) { + 
offset = d; + break; } } - const int defaultDepth = 10; - _ConverterInterp(resize, &info, inW, inH, defaultDepth, outW, outH, defaultDepth, computeScale); - flatbuffers::FlatBufferBuilder builder; - builder.Finish(makeInterp(builder, &info, resize->resizeType(), op, OpType_Interp)); - res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {newInputs[0]}, newOutputs)); - } else if (OpType_Interp == op->type() && inputs[0]->dimensions() == 5) { - // Compute cord transform for interp - auto resize = op->main_as_Interp(); - auto inShape = newInputs[0]->shape(); - auto outShape = newOutputs[0]->shape(); - auto inW = inShape[4]; - auto inH = inShape[3]; - auto inD = inShape[2]; - auto outW = outShape[4]; - auto outH = outShape[3]; - auto outD = outShape[2]; - InterpInfo info; - bool computeScale = true; - if (inputs.size() > 1 && inputs[1]->getType().code == halide_type_float) { - computeScale = false; - info.depthScale = 1.0f / inputs[1]->host()[2]; - info.heightScale = 1.0f / inputs[1]->host()[3]; - info.widthScale = 1.0f / inputs[1]->host()[4]; + if (offset < 2) { + int enlargeDim = 2 - offset; + std::shared_ptr flattentInput(new Tensor(enlargeDim + tempInput->dimensions(), Tensor::CAFFE_C4)); + std::shared_ptr flattentOutput(new Tensor(enlargeDim + tempInput->dimensions(), Tensor::CAFFE_C4)); + + if (0 == offset) { + flattentInput->setLength(0, 1); + flattentInput->setLength(1, 1); + flattentOutput->setLength(0, 1); + flattentOutput->setLength(1, 1); + } else { + flattentInput->setLength(0, tempInput->length(0)); + flattentInput->setLength(1, 1); + flattentOutput->setLength(0, tempOutput->length(0)); + flattentOutput->setLength(1, 1); + } + for (int v=offset; vbuffer().dimensions; ++v) { + flattentInput->setLength(v+enlargeDim, tempInput->length(v)); + flattentOutput->setLength(v+enlargeDim, tempOutput->length(v)); + } + TensorUtils::getDescribe(flattentInput.get())->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + TensorUtils::getDescribe(flattentInput.get())->regions = {TensorUtils::makeFullSlice(tempInput)}; + + TensorUtils::getDescribe(tempOutput)->memoryType = Tensor::InsideDescribe::MEMORY_VIRTUAL; + TensorUtils::getDescribe(tempOutput)->regions = {TensorUtils::makeFullSlice(flattentOutput.get())}; + + tempInput = flattentInput.get(); + tempOutput = flattentOutput.get(); + + res.extras.emplace_back(flattentInput); + res.extras.emplace_back(flattentOutput); + } + if (tempInput->dimensions() <= 4) { + // Compute cord transform for interp + auto resize = op->main_as_Interp(); + auto inW = tempInput->width(); + auto inH = tempInput->height(); + auto outW = tempOutput->width(); + auto outH = tempOutput->height(); + InterpInfo info; + bool computeScale = true; + if (inputs.size() > 1 && inputs[1]->getType().code == halide_type_float) { + computeScale = false; + info.heightScale = 1.0f / inputs[1]->host()[offset]; + if (tempInput->dimensions() >= 4) { + info.widthScale = 1.0f / inputs[1]->host()[offset+1]; + } + } + const int defaultDepth = 10; + _ConverterInterp(resize, &info, inW, inH, defaultDepth, outW, outH, defaultDepth, computeScale); + flatbuffers::FlatBufferBuilder builder; + builder.Finish(makeInterp(builder, &info, resize->resizeType(), op, OpType_Interp)); + res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {tempInput}, {tempOutput})); + } else if(tempInput->dimensions() == 5) { + // Compute cord transform for interp + auto resize = op->main_as_Interp(); + auto inShape = tempInput->shape(); + auto outShape = tempOutput->shape(); + auto inW = 
inShape[4]; + auto inH = inShape[3]; + auto inD = inShape[2]; + auto outW = outShape[4]; + auto outH = outShape[3]; + auto outD = outShape[2]; + InterpInfo info; + bool computeScale = true; + if (inputs.size() > 1 && inputs[1]->getType().code == halide_type_float) { + computeScale = false; + info.depthScale = 1.0f / inputs[1]->host()[offset]; + info.heightScale = 1.0f / inputs[1]->host()[offset+1]; + info.widthScale = 1.0f / inputs[1]->host()[offset+2]; + } + _ConverterInterp(resize, &info, inW, inH, inD, outW, outH, outD, computeScale); + flatbuffers::FlatBufferBuilder builder; + builder.Finish(makeInterp(builder, &info, resize->resizeType(), op, OpType_Interp3D)); + res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {tempInput}, {tempOutput})); + } else { + MNN_ERROR("MNN Interp don't support >= 6 dimension Interp\n"); + return false; } - _ConverterInterp(resize, &info, inW, inH, inD, outW, outH, outD, computeScale); - flatbuffers::FlatBufferBuilder builder; - builder.Finish(makeInterp(builder, &info, resize->resizeType(), op, OpType_Interp3D)); - res.command.emplace_back(GeometryComputerUtils::makeCommand(builder, {newInputs[0]}, newOutputs)); } else { std::shared_ptr cmdP(new Command); auto& cmd = *cmdP;; diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index d7636e074..6839ab83b 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -9,19 +9,10 @@ #ifndef Vec_hpp #define Vec_hpp #include "core/Macro.h" +#include "core/SimdHeader.h" #include #include // supply std::max and std::min #include -#ifdef MNN_USE_NEON -#include -#endif -#ifdef MNN_USE_SSE -#if defined(_MSC_VER) -#include -#else -#include -#endif -#endif namespace MNN { namespace Math { diff --git a/source/shape/ShapeCast.cpp b/source/shape/ShapeCast.cpp index cae9cbc89..c566ef98c 100644 --- a/source/shape/ShapeCast.cpp +++ b/source/shape/ShapeCast.cpp @@ -17,6 +17,10 @@ class CastSizeComputer : public SizeComputer { auto output = outputs[0]; auto input = inputs[0]; TensorUtils::copyShape(input, output, true); + if (OpType_CastLike == op->type()) { + output->buffer().type = inputs[1]->buffer().type; + return true; + } if (OpType_FloatToInt8 == op->type()) { output->buffer().type = halide_type_of(); return true; @@ -33,6 +37,7 @@ class CastSizeComputer : public SizeComputer { } }; REGISTER_SHAPE(CastSizeComputer, OpType_Cast); +REGISTER_SHAPE(CastSizeComputer, OpType_CastLike); REGISTER_SHAPE(CastSizeComputer, OpType_FloatToInt8); REGISTER_SHAPE(CastSizeComputer, OpType_Int8ToFloat); } // namespace MNN diff --git a/source/shape/ShapeInterp.cpp b/source/shape/ShapeInterp.cpp index c824740f5..b4d7bab82 100644 --- a/source/shape/ShapeInterp.cpp +++ b/source/shape/ShapeInterp.cpp @@ -36,19 +36,16 @@ class InterpComputer : public SizeComputer { if(shape->length(0) == input.dimensions) { // For Onnx's Resize // Don't support batch / channel resize - for (int i=0; i<2; ++i) { - output.dim[i].extent = input.dim[i].extent; - } if (shape->getType().code == halide_type_int) { // Width / Height auto shapePtr = shape->host(); - for (int i=2; ihost(); - for (int i=2; isetType(op->main_as_TensorArray()->T()); + outputs[0]->buffer().type = inputs[2]->buffer().type; outputs[0]->buffer().dimensions = readElemShape.size(); for (int i = 0; i < readElemShape.size(); i++) { outputs[0]->setLength(i, readElemShape[i]); @@ -149,7 +149,7 @@ class TensorArrayWriteComputer : public SizeComputer { return false; } copyTensorArrayAttribute(inputs[3], outputs[0]); - outputs[0]->setType(op->main_as_TensorArray()->T()); + 
outputs[0]->buffer().type = inputs[2]->buffer().type;
     int writeIndex = inputs[1]->host()[0];
     // update arraySize
     if (!inDes->tensorArrayAttr->isDynamicSize) {
@@ -254,7 +254,7 @@ class TensorArrayScatterComputer : public SizeComputer {
                 outDes->tensorArrayAttr->elemShape[0] = writeElemShape;
             }
         }
-        outputs[0]->setType(op->main_as_TensorArray()->T());
+        outputs[0]->buffer().type = inputs[3]->buffer().type;
         updateTensorArrayDims(outputs[0]);
         MNN_ASSERT(outDes->tensorArrayAttr != nullptr);
         return true;
@@ -340,8 +340,7 @@ class TensorArrayConcatComputer : public SizeComputer {
         copyTensorArrayAttribute(inputs[1], outputs[0]);
         auto tpParam = op->main_as_TensorArray();
         int concatAxis = tpParam->axis(), newAxis = tpParam->new_axis();
-        outputs[0]->setType(op->main_as_TensorArray()->T());
-
+        outputs[0]->buffer().type = inputs[1]->buffer().type;
         const auto& elemShapes = inDes->tensorArrayAttr->elemShape;
         auto outShape = elemShapes[0];
         bool valid = true; // avoid use MNN_ASSERT because it's no-op in release mode
diff --git a/source/shape/ShapeUnique.cpp b/source/shape/ShapeUnique.cpp
index 3488add88..c201a6c03 100644
--- a/source/shape/ShapeUnique.cpp
+++ b/source/shape/ShapeUnique.cpp
@@ -16,7 +16,7 @@ class ShapeUnique : public SizeComputer {
     virtual bool onComputeSize(const MNN::Op* op, const std::vector<Tensor*>& inputs,
                                const std::vector<Tensor*>& outputs) const override {
         MNN_ASSERT(1 == inputs.size());
-        if (inputs[0]->getType().code != halide_type_int) {
+        if (inputs[0]->getType().bytes() != 4) {
             return false;
         }
         auto& ib = inputs[0]->buffer();
@@ -37,6 +37,18 @@
             TensorUtils::copyShape(outputs[0], outputs[1], true);
             outputs[1]->buffer().type = halide_type_of();
         }
+        if (outputs.size() > 2) {
+            outputs[2]->buffer().dimensions = 1;
+            outputs[2]->buffer().dim[0].extent = eleSize;
+            TensorUtils::getDescribe(outputs[2])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
+            outputs[2]->buffer().type = halide_type_of();
+        }
+        if (outputs.size() > 3) {
+            outputs[3]->buffer().dimensions = 1;
+            outputs[3]->buffer().dim[0].extent = (int)values.size();
+            TensorUtils::getDescribe(outputs[3])->dimensionFormat = TensorUtils::getDescribe(inputs[0])->dimensionFormat;
+            outputs[3]->buffer().type = halide_type_of();
+        }
         return true;
     }
 };
diff --git a/test.sh b/test.sh
index 1ad2ab3ab..7204117a8 100755
--- a/test.sh
+++ b/test.sh
@@ -385,16 +385,7 @@ pymnn_test() {
     fi
     # 4. train test
     ./train_test.sh
-    # 5. quant test
-    python3 ../examples/MNNQuant/test_mnn_offline_quant.py \
-        --mnn_model ~/AliNNModel/TestQuant/mobilenet_v2_tfpb_train_withBN.mnn \
-        --quant_imgs ~/AliNNModel/TestQuant/quant_imgs \
-        --quant_model ./quant_model.mnn
-    rm ./quant_model.mnn
-    quant_wrong=$[$? > 0]
-    printf "TEST_NAME_QUANT_TEST: pymnn量化测试\nTEST_CASE_AMOUNT_QUANT_TEST: {\"blocked\":0,\"failed\":%d,\"passed\":%d,\"skipped\":0}\n" \
-        $quant_wrong $[1 - $quant_wrong]
-    # 6. uninstall pymnn
+    # 5. uninstall pymnn
     pip uninstall --yes MNN-Internal
     popd
     popd
diff --git a/test/core/BackendTest.cpp b/test/core/BackendTest.cpp
index c40472ad2..669fd30a3 100644
--- a/test/core/BackendTest.cpp
+++ b/test/core/BackendTest.cpp
@@ -540,6 +540,40 @@ bool nhwc_2_NC4HW4_2_nhwc_inttype(std::shared_ptr<Backend> bn) {
     free(temp);
     return true;
 }
+bool nchwTonhwc3Dim(std::shared_ptr<Backend> bn) {
+    // Test NCHW -> NC4HW4 -> NHWC
+    MNN_PRINT("\n ========= check nchwTonhwc 3dim result ! 
========= \n"); + int batch = 2; + int channel = 12; + int width = 32; + std::shared_ptr hostTensor( + Tensor::create(std::vector{batch, channel, width}, nullptr, Tensor::CAFFE)); + auto elementSize = hostTensor->elementSize(); + auto hostData = hostTensor->host(); + for (int i = 0; i < elementSize; ++i) { + int flagRandom = (rand() % 2 == 0); + float valueRandom = rand() % 255 / 255.f; + hostData[i] = ((flagRandom == 1) ? 1.0 : -1.0) * valueRandom; + } + std::vector tempStorage(hostTensor->elementSize()); + float* temp = tempStorage.data(); + memset(temp, 0.0f, hostTensor->size()); + NCHW2NHWC(hostData, temp, batch, 1, width, channel); + std::shared_ptr deviceTensor(Tensor::createDevice(std::vector{batch, channel, width}, Tensor::CAFFE)); + bn->onAcquireBuffer(deviceTensor.get(), Backend::STATIC); + bn->onCopyBuffer(hostTensor.get(), deviceTensor.get()); + std::shared_ptr hostTensorNHWC(new Tensor( deviceTensor.get(), Tensor::TENSORFLOW)); + bn->onCopyBuffer(deviceTensor.get(), hostTensorNHWC.get()); + auto backendCopyData = hostTensorNHWC->host(); + for (int i = 0; i < elementSize; ++i) { + if (abs(backendCopyData[i] - temp[i]) >= F32_BF16_MAX_LOSS) { //Error of converting from float32 to bf16 is more than 0.001 + MNN_PRINT("Error for bn:%d, %f -> %f. F32_BF16_MAX_LOSS:%f\n", i, temp[i], backendCopyData[i], F32_BF16_MAX_LOSS); + FUNC_PRINT(1); + return false; + } + } + return true; +} bool nchwTonhwc(std::shared_ptr bn) { // Test NHWC -> NC4HW4 -> NHWC MNN_PRINT("\n ========= check nchwTonhwc result ! ========= \n"); @@ -668,10 +702,13 @@ class BackendCopyBufferFloatTest : public MNNTestCase { std::shared_ptr runtime(creator->onCreate(info)); MNN_PRINT("Test %d Backend for %d \n", type, user.precision); std::shared_ptr bn(runtime->onCreate(&user)); - auto res = NC4HW4_2_NC4HW4_float(bn); + bool res = true; + res = NC4HW4_2_NC4HW4_float(bn); FUNC_PRINT(res); res = res && nchwTonhwc(bn); FUNC_PRINT(res); + res = res && nchwTonhwc3Dim(bn); + FUNC_PRINT(res); res = res && nhwc_2_NC4HW4_2_nhwc_float(bn); FUNC_PRINT(res); res = res && NCHW_NC4HW4_NCHW(bn, 3, 16, 17, 19); diff --git a/test/core/TensorTest.cpp b/test/core/TensorTest.cpp index 4ff28733e..be696076c 100644 --- a/test/core/TensorTest.cpp +++ b/test/core/TensorTest.cpp @@ -44,10 +44,10 @@ class TensorTest : public MNNTestCase { MNNTEST_ASSERT(alloc.deviceId() == 0); MNNTEST_ASSERT(alloc.length(0) == 3); - MNNTEST_ASSERT(alloc.length(1) == 5); - MNNTEST_ASSERT(alloc.length(2) == 7); + MNNTEST_ASSERT(alloc.length(1) == 7); + MNNTEST_ASSERT(alloc.length(2) == 5); MNNTEST_ASSERT(alloc.stride(0) == 5 * 7); - MNNTEST_ASSERT(alloc.stride(1) == 7); + MNNTEST_ASSERT(alloc.stride(1) == 5); MNNTEST_ASSERT(alloc.stride(2) == 1); } { diff --git a/test/expr/ModuleTest.cpp b/test/expr/ModuleTest.cpp index 233711fda..60d25f010 100644 --- a/test/expr/ModuleTest.cpp +++ b/test/expr/ModuleTest.cpp @@ -1148,7 +1148,10 @@ MNNTestSuiteRegister(ResizeOptimizationTest, "expr/ResizeOptimizationTest"); class WinogradMemoryTest : public MNNTestCase { public: float memoryUsed(int level) { - auto y = _mobileNetV1Expr(); + auto x = _Input({1, 64, 224, 224}, MNN::Express::NC4HW4, halide_type_of()); + x->setName("Input"); + auto y = _Conv(0.0f, 0.0f, x, {64, 112}, {3, 3}); + y->setName("Prob"); std::unique_ptr net(new NetT); Variable::save({y}, net.get()); y = nullptr; @@ -1181,9 +1184,10 @@ class WinogradMemoryTest : public MNNTestCase { } virtual bool run(int precision) { float mem0 = memoryUsed(0); + float mem1 = memoryUsed(1); float mem3 = memoryUsed(3); - 
printf("level=0,3: %fMb, %fMb\n", mem0,mem3); - if (mem3 < mem0) { + MNN_PRINT("level=0, 1, 3: %fMb, %fMb, %fMb\n", mem0,mem1,mem3); + if (mem3 <= mem1 || mem1 <= mem0) { return false; } return true; @@ -1331,3 +1335,52 @@ class PrearrangeTest : public MNNTestCase { }; MNNTestSuiteRegister(PrearrangeTest, "expr/PrearrangeTest"); +class ExecutorResetLoadModuleTest : public MNNTestCase { +public: + virtual bool run(int precision) { + BackendConfig originConfig; + auto exe = Executor::newExecutor(MNN_FORWARD_CPU, originConfig, 1); + ExecutorScope _s(exe); + // Make Model include convolution in shape compute and content compute + auto x = _Input({1, 3, 24, 24}, NCHW, halide_type_of()); + x->setName("x"); + auto xs = _Convert(_Reshape(_Cast(_Shape(x, NCHW)), {1, 1, 2, 2}), NC4HW4); + xs = _Convert(_Conv(1.0f, 0.0f, xs, {1, 1}, {2, 2}), NCHW); + auto y = _Conv(0.1f, 0.0f, _Convert(x, NC4HW4), {3, 1}, {3, 3}); + y = _Convert(y, NCHW); + y = _ReduceMean(y); + y = y * _Reciprocal(xs); + auto info = y->getInfo(); + y->setName("y"); + auto buffer = Variable::save({y}); + MNN::ScheduleConfig sconfig; + BackendConfig bnConfig; + bnConfig.precision = MNN::BackendConfig::Precision_Low; + bnConfig.memory = MNN::BackendConfig::Memory_Low; + sconfig.backendConfig = &bnConfig; + sconfig.numThread = 4; + exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 4); + std::shared_ptr rtMgr(Executor::RuntimeManager::createRuntimeManager(sconfig)); + Module::Config config; + config.rearrange = false; + std::shared_ptr m0(Module::load({"x"}, {"y"}, (const unsigned char*)buffer.data(), buffer.size(), nullptr, &config), Module::destroy); + config.rearrange = true; + std::shared_ptr m1(Module::load({"x"}, {"y"}, (const unsigned char*)buffer.data(), buffer.size(), rtMgr, &config), Module::destroy); + auto m0Rt = m0->getInfo()->runTimeManager; + auto m1Rt = m1->getInfo()->runTimeManager; + if (nullptr == m0Rt->getBnConfig() || nullptr == m1Rt->getBnConfig()) { + FUNC_PRINT(1); + return false; + } + if (MNN::BackendConfig::Precision_Low != m0Rt->getBnConfig()->precision || MNN::BackendConfig::Memory_Low != m0Rt->getBnConfig()->memory) { + FUNC_PRINT(1); + return false; + } + if (MNN::BackendConfig::Precision_Low != m1Rt->getBnConfig()->precision || MNN::BackendConfig::Memory_Low != m1Rt->getBnConfig()->memory) { + FUNC_PRINT(1); + return false; + } + return true; + } +}; +MNNTestSuiteRegister(ExecutorResetLoadModuleTest, "expr/ExecutorResetLoadModuleTest"); diff --git a/test/op/ConvInt8Test.cpp b/test/op/ConvInt8Test.cpp index 3b9d94856..20075cd1c 100644 --- a/test/op/ConvInt8Test.cpp +++ b/test/op/ConvInt8Test.cpp @@ -258,7 +258,7 @@ class ConvInt8TestCommon : public MNNTestCase { auto error = (int32_t)targetValue - (int32_t)computeResult; if (error * error > 1) { MNN_PRINT("ic=%d, oc=%d, ow=%d, oh=%d, ConvInt8 result No.%d Error: right=%d, error=%d\n", channel[0], channel[1], ow, oh, i, targetValue, computeResult); -#ifdef DEBUG +#ifdef DEBUG x->writeMap(); auto ptr = y->readMap(); FUNC_PRINT_ALL(ptr, p); @@ -290,7 +290,52 @@ class ConvInt8Im2colGemmTest : public ConvInt8TestCommon { public: virtual bool run(int precision) { - return true; + auto backendType = getCurrentType(); + if (backendType != MNN_FORWARD_CPU && backendType != MNN_FORWARD_CPU_EXTENSION) { + // Skip other backend test for conv int8 + return true; + } + std::vector< std::vector> iwih = {{27, 27}, {20, 20}, {11, 11}, {14, 11}, {14, 12}}; + std::vector< std::vector> kxky = {{3, 3}, {5, 5}}; + std::vector< std::vector> icoc = {{3, 64}, {8, 32}, {1, 
32}, {54, 8}}; + std::vector batch = {1, 2, 5}; + std::vector< std::vector> pxpy = {{1, 1}, {0, 0}, {2, 3}}; + std::vector< std::vector> sxsy = {{1, 1}, {2, 2}}; + std::vector< std::vector> dxdy = {{1, 1}, {2, 2}}; + + for (int i0 = 0; i0 < kxky.size(); i0++) { + for (int i1 = 0; i1 < icoc.size(); i1++) { + for (int i2 = 0; i2 < batch.size(); i2++) { + for (int i3 = 0; i3 < pxpy.size(); i3++) { + for (int i4 = 0; i4 < sxsy.size(); i4++) { + for (int i5 = 0; i5 < dxdy.size(); i5++) { + for (int i6 = 3; i6 < iwih.size(); i6++) { + auto res = testKernel(iwih[i6], kxky[i0], icoc[i1], pxpy[i3], sxsy[i4], dxdy[i5], 8, false, 1, batch[i2], MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("kx=%d, ky=%d, iw=%d, ih=%d, overflow=false, bit=8, batch=%d, Conv info: sx=%d, sy=%d, dx=%d, dy=%d, px=%d, py=%d, ic=%d, oc=%d\n", + kxky[i0][0], kxky[i0][1], iwih[i6][0], iwih[i6][1], batch[i2], sxsy[i4][0], sxsy[i4][1], dxdy[i5][0], dxdy[i5][1], pxpy[i3][0], pxpy[i3][1], icoc[i1][0], icoc[i1][1]); + return false; + } + } + } + } + } + } + } + } + + int sx = 1, sy = 1, dx = 1, dy = 1, px = 1, py = 1, ic = 17, oc = 8, kx = 3, ky = 3; // ic=17,54,{14,11},{7,7} + auto res = testKernel({7, 7}, {kx, ky}, {ic, oc}, {px, py}, {sx, sy}, {dx, dy}, 8, false, 1, 1, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("overflow=false, bit=8, batch=%d, Conv info: sx=%d, sy=%d, dx=%d, dy=%d, px=%d, py=%d, ic=%d, oc=%d\n", 1, sx, sy, dx, dy, px, py, ic, oc); + return false; + } + res = testKernel({4, 4}, {1, 3}, {ic, oc}, {px, py}, {sx, sy}, {dx, dy}, 8, false, 1, 1, MNN::SparseAlgo_RANDOM, 1, false); + if (!res) { + MNN_ERROR("overflow=false, bit=8, batch=%d, Conv info: sx=%d, sy=%d, dx=%d, dy=%d, px=%d, py=%d, ic=%d, oc=%d\n", 1, sx, sy, dx, dy, px, py, ic, oc); + return false; + } + std::vector> kernels = { {4, 2}, {1, 5}, {7, 1} }; @@ -566,7 +611,8 @@ class ConvInt8WinogradTestCommon : public MNNTestCase { MNN_ERROR("[ConvInt8WinogradTestCommon] getInfo not match\n"); return false; } - auto yTargetPtr = yTarget->readMap(), yPtr = y->readMap(); + auto yTargetPtr = yTarget->readMap(); + auto yPtr = y->readMap(); if (yTargetPtr == nullptr || yPtr == nullptr) { MNN_ERROR("[ConvInt8WinogradTestCommon] result is nullptr\n"); return false; diff --git a/test/op/ConvolutionTest.cpp b/test/op/ConvolutionTest.cpp index 5f1bb9e11..7ead7d92c 100644 --- a/test/op/ConvolutionTest.cpp +++ b/test/op/ConvolutionTest.cpp @@ -728,7 +728,7 @@ class ConvolutionTest : public ConvolutionType { 1, 3, 10, 17 }; std::vector icSize = { - 1, 3, 10, 17 + 4, 1, 3, 8, 11 }; std::vector isSize = { 1, 7, 9 @@ -883,8 +883,8 @@ class DepthwiseConvolutionTest : public ConvolutionCommonTest { for (int b = 1; b <= 2; b++) { for (int oc = 4; oc <= 16; oc *= 2) { for (int ic = oc; ic <= oc; ic++) { - for (int isw = 1; isw <= 8; ++isw) { - for (int ish = 1; ish <= 8; ++ish) { + for (int isw = 1; isw <= 8; isw+=2) { + for (int ish = 1; ish <= 8; ish*=2) { for (int kw = 1; kw <= 4; kw++) { for (int kh = 1; kh <= 4; kh++) { for (int d = 1; d <= 2; d++) { diff --git a/test/op/DeconvolutionTest.cpp b/test/op/DeconvolutionTest.cpp index c4e46af32..e7443064b 100644 --- a/test/op/DeconvolutionTest.cpp +++ b/test/op/DeconvolutionTest.cpp @@ -12,6 +12,7 @@ #include #include "MNNTestSuite.h" #include "TestUtils.h" +#include "core/IDSTEncoder.hpp" using namespace std; using namespace MNN; using namespace MNN::Express; @@ -60,12 +61,8 @@ VARP _Deconv(std::vector&& weight, std::vector&& bias, std::vecto conv2D->common->kernelY = kernelSize[1]; 
conv2D->common->relu6 = relu6; conv2D->common->relu = relu; - MNN_ASSERT(weight.size() == channel[1] * (channel[0] / group) * kernelSize[0] * kernelSize[1]); + conv2D->quanParameter = IDSTEncoder::encode(nullptr, scale, channel[1], channel[0] * kernelSize[0] * kernelSize[1], false, weight.data(), -128); conv2D->symmetricQuan.reset(new QuantizedFloatParamT); - conv2D->symmetricQuan->weight = std::move(weight); - MNN_ASSERT(bias.size() == channel[1]); - conv2D->quanParameter.reset(new IDSTQuanT); - conv2D->quanParameter->alpha = std::move(scale); conv2D->bias = std::move(bias); return (Variable::create(Expr::create(convOp.get(), {x}))); } diff --git a/test/op/LayerNormTest.cpp b/test/op/LayerNormTest.cpp index fb3020112..b6913edb2 100644 --- a/test/op/LayerNormTest.cpp +++ b/test/op/LayerNormTest.cpp @@ -130,12 +130,12 @@ static bool testKernel (std::vector inputdata, std::vector targetd if (!checkVector(outputPtr, targetdata.data(), size, ratio)) { MNN_ERROR("%s failed: data dimension=[", testName.c_str()); for (int i = 0; i < dimensions.size(); ++i) { - if (i < dimensions.size() - 1) MNN_PRINT("%d, ", dimensions[i]); - else MNN_PRINT("%d], reduce axis=[", dimensions[i]); + if (i < dimensions.size() - 1) {MNN_PRINT("%d, ", dimensions[i]);} + else {MNN_PRINT("%d], reduce axis=[", dimensions[i]);}; } for (int i = 0; i < reduceAxis.size(); ++i) { - if (i < reduceAxis.size() - 1) MNN_PRINT("%d, ", reduceAxis[i]); - else MNN_PRINT("%d]\n", reduceAxis[i]); + if (i < reduceAxis.size() - 1) {MNN_PRINT("%d, ", reduceAxis[i]);} + else {MNN_PRINT("%d]\n", reduceAxis[i]);}; } return false; } diff --git a/test/op/RasterTest.cpp b/test/op/RasterTest.cpp index 72b8baf7a..fb67d701e 100644 --- a/test/op/RasterTest.cpp +++ b/test/op/RasterTest.cpp @@ -297,3 +297,50 @@ class ConcatSliceTest : public MNNTestCase { }; MNNTestSuiteRegister(ConcatSliceTest, "op/concat_slice"); + +class TransposeC4Test : public MNNTestCase { +public: + virtual ~TransposeC4Test() = default; + bool _run(int precision, bool lazy) { + int n = 32; + int c = 32; + auto input = _Input({n, c, 1, 1}, NCHW, halide_type_of()); + auto inputPtr = input->writeMap(); + for (int i=0; i output0(n*c); + { + // Split Compute + auto o0 = _RasterRaw({input}, { + 0, 0, 1, 1, 0, 0, 1, 1, 1,1,n*c, + }, {1, c, 1, n}, halide_type_of(), NC4HW4); + o0.fix(MNN::Express::VARP::CONSTANT); + o0 = _Convert(o0, NCHW); + auto ptr = o0->readMap(); + ::memcpy(output0.data(), ptr, n*c*sizeof(int)); + } + for (int i=0; ilazyEval = true; + ExecutorScope::Current()->setLazyComputeMode(MNN::Express::Executor::LAZY_FULL); + auto res = _run(precision, true); + return res; + } + +}; +MNNTestSuiteRegister(TransposeC4Test, "op/transpose_c4"); diff --git a/test/op/ReductionTest.cpp b/test/op/ReductionTest.cpp index 2e87b458c..c2e7d073a 100644 --- a/test/op/ReductionTest.cpp +++ b/test/op/ReductionTest.cpp @@ -10,6 +10,7 @@ #include #include "MNNTestSuite.h" #include "TestUtils.h" +#include using namespace MNN::Express; class ReduceSumTest : public MNNTestCase { public: @@ -40,33 +41,68 @@ class ReduceSumMultiTest : public MNNTestCase { public: virtual ~ReduceSumMultiTest() = default; virtual bool run(int precision) { - auto input = _Input({4, 10, 1, 4}, NCHW, halide_type_of()); - // set input data - auto inputPtr = input->writeMap(); - auto inputInfo = input->getInfo(); - std::vector inputData(inputInfo->size); - for (int i = 0; i < inputData.size(); ++i) { - inputData[i] = (float)((10.3 - i) * (i + 0.2)); - } - memcpy(inputPtr, inputData.data(), inputData.size() * 
sizeof(float)); - input->unMap(); - auto output = _ReduceSum(input, {0, 2, 3}); - std::vector expectedOutput(10); - auto func = FP32Converter[precision]; - for (int i = 0; i < 10; ++i) { - float sumValue = 0.0f; - for (int j = 0; j < 4; ++j) { - for (int k = 0; k < 4; ++k) { - sumValue = func(func(inputData[i * 4 + k + j * 40]) + sumValue); + { + auto input = _Input({4, 10, 1, 4}, NCHW, halide_type_of()); + // set input data + auto inputPtr = input->writeMap(); + auto inputInfo = input->getInfo(); + std::vector inputData(inputInfo->size); + for (int i = 0; i < inputData.size(); ++i) { + inputData[i] = (float)((10.3 - i) * (i + 0.2)); + } + memcpy(inputPtr, inputData.data(), inputData.size() * sizeof(float)); + input->unMap(); + auto output = _ReduceSum(input, {0, 2, 3}); + std::vector expectedOutput(10); + auto func = FP32Converter[precision]; + for (int i = 0; i < 10; ++i) { + float sumValue = 0.0f; + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 4; ++k) { + sumValue = func(func(inputData[i * 4 + k + j * 40]) + sumValue); + } } + expectedOutput[i] = sumValue; + } + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 1, 0.01)) { + MNN_ERROR("ReduceSumMultiTest test failed!\n"); + return false; } - expectedOutput[i] = sumValue; } - auto gotOutput = output->readMap(); - if (!checkVector(gotOutput, expectedOutput.data(), 1, 0.01)) { - MNN_ERROR("ReduceSumMultiTest test failed!\n"); - return false; + + { + std::mt19937 gen(42); + std::uniform_real_distribution<> dis(0.0, 1.0); + std::vector inputShape = {3136, 16}; + auto input = _Input({inputShape[0], inputShape[1]}, NCHW, halide_type_of()); + // set input data + auto inputPtr = input->writeMap(); + auto inputInfo = input->getInfo(); + std::vector inputData(inputInfo->size); + for (int i = 0; i < inputData.size(); ++i) { + float randomValue = dis(gen); + inputData[i] = randomValue; + } + memcpy(inputPtr, inputData.data(), inputData.size() * sizeof(float)); + input->unMap(); + auto output = _ReduceSum(input, {0}); + std::vector expectedOutput(inputShape[1]); + auto func = FP32Converter[precision]; + for (int i = 0; i < inputShape[1]; ++i) { + float sumValue = 0.0f; + for (int j = 0; j < inputShape[0]; ++j) { + sumValue = func(func(inputData[i + j * inputShape[1]]) + sumValue); + } + expectedOutput[i] = sumValue; + } + auto gotOutput = output->readMap(); + if (!checkVector(gotOutput, expectedOutput.data(), 1, 0.01)) { + MNN_ERROR("ReduceSumMultiTest test failed!\n"); + return false; + } } + return true; } }; diff --git a/test/op/ResizeTest.cpp b/test/op/ResizeTest.cpp index 9a6ac5ef4..a2fd46fd5 100644 --- a/test/op/ResizeTest.cpp +++ b/test/op/ResizeTest.cpp @@ -41,6 +41,49 @@ class ResizeTest : public MNNTestCase { return true; } }; +class Interp2DTest : public MNNTestCase { +public: + virtual ~Interp2DTest() = default; + virtual bool run(int precision) { + auto input = _Input({2, 2}, NCHW); + input->setName("input_tensor"); + // set input data + const float inpudata[] = {-1.0, -2.0, 3.0, 4.0}; + auto inputPtr = input->writeMap(); + memcpy(inputPtr, inpudata, 4 * sizeof(float)); + input->unMap(); + input = _Convert(input, NC4HW4); + + float hScale = 2.0; + float wScale = 2.0; + float scales[] = {hScale, wScale}; + auto scaleVar = _Const((void*)scales, {2}, NCHW); + int outW = int(wScale * 2); + int outH = int(hScale * 2); + + //Interp Type:1 + { + auto output = _Interp({input, scaleVar}, wScale, hScale, outW, outH, 1, false); + output = _Convert(output, NHWC); + const std::vector expectedOutput = 
{-1.0, -1.0, -2.0, -2.0, -1.0, -1.0, -2.0, -2.0, + 3.0, 3.0, 4.0, 4.0, 3.0, 3.0, 4.0, 4.0}; + auto gotOutput = output->readMap(); + + if (!checkVector(gotOutput, expectedOutput.data(), 16, 0.01)) { + MNN_ERROR("Interp2D Type:1 test failed!\n"); + return false; + } + + const std::vector expectedDim = {4, 4}; + auto gotDim = output->getInfo()->dim; + if (!checkVector(gotDim.data(), expectedDim.data(), 2, 0)) { + MNN_ERROR("Interp2D Type:1 test failed!\n"); + return false; + } + } + return true; + } +}; class InterpTest : public MNNTestCase { public: @@ -226,4 +269,5 @@ class InterpInt8Test : public MNNTestCase { MNNTestSuiteRegister(ResizeTest, "op/resize"); MNNTestSuiteRegister(InterpTest, "op/Interp"); +MNNTestSuiteRegister(Interp2DTest, "op/Interp2D"); MNNTestSuiteRegister(InterpInt8Test, "op/InterpInt8"); diff --git a/test/speed/HybridConvSpeedTest.cpp b/test/speed/HybridConvSpeedTest.cpp index fc7a56868..c17ecfebe 100644 --- a/test/speed/HybridConvSpeedTest.cpp +++ b/test/speed/HybridConvSpeedTest.cpp @@ -121,10 +121,10 @@ class HybridConvSpeedInt8Test : public HybridConvSpeedTestCommon { INTS channel0 = {4096, 4096}; // {ic, co} INTS channel1 = {1496, 256}; int batch[3] = {23, 13, 1}; - std::vector blocks = {0, 32, 128}; + std::vector blocks = {32, 128, 0}; std::vector kernels = {1, 1}; - std::vector weightBits = {8, 4}; + std::vector weightBits = {4, 8}; bool lowmemory = true; int batchNum = sizeof(batch) / sizeof(int); bool correct = true; @@ -158,13 +158,13 @@ class HybridConvInt8Test : public HybridConvSpeedTestCommon { INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {1, 1}; // {w, h} int testBatchCount = 5; // std::vector batch(testBatchCount); - std::vector batch = {1, 23, 1479, 38, 29}; + std::vector batch = {1, 23, 149, 38, 29}; std::vector kernels = {1, 1}; bool lowmemory = true; { - std::vector< std::vector> channels = {{7, 9}, {2048, 6144}, {1, 10}, {20, 153}, {9, 18}}; + std::vector< std::vector> channels = {{7, 9}, {2048, 54}, {1, 10}, {20, 153}, {9, 18}}; for (int i = 0; i < channels.size(); ++i) { - for (int n = 0; n < 5; ++n) { + for (int n = 0; n < batch.size(); ++n) { auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], 8, precision); if (!res) { MNN_ERROR("Error: low memory hybridConv when bits=8, n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]); @@ -174,9 +174,9 @@ class HybridConvInt8Test : public HybridConvSpeedTestCommon { } } { - std::vector< std::vector> channels = {{2048, 6144}, {8, 8}, {8, 9}, {8, 16}}; + std::vector< std::vector> channels = {{2048, 54}, {8, 8}, {8, 9}, {8, 16}}; for (int i = 0; i < channels.size(); ++i) { - for (int n = 0; n < 5; ++n) { + for (int n = 0; n < batch.size(); ++n) { auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], 4, precision); if (!res) { MNN_ERROR("Error: low memory hybridConv when bits=4, n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]); @@ -193,7 +193,7 @@ class DenseConvInt8Test : public HybridConvSpeedTestCommon { public: virtual bool run(int precision) { std::vector< std::vector> channels = {{4, 256}, {512, 128}, {1, 8}, {7, 9}}; - INTS strides = {1, 1}, dilate = {1, 3}, pad = {0, 3}, inputShape = {1, 2640}; // {w, h} + INTS strides = {1, 1}, dilate = {1, 3}, pad = {0, 3}, inputShape = {1, 131}; // {w, h} std::vector batch = {1, 13}; std::vector> kernels = {{1, 1}, {1, 3}}; std::vector weightBits = {4, 8}; diff --git 
a/tools/MNNPythonOfflineQuant/ReadMe.txt b/tools/MNNPythonOfflineQuant/ReadMe.txt deleted file mode 100644 index a07773575..000000000 --- a/tools/MNNPythonOfflineQuant/ReadMe.txt +++ /dev/null @@ -1,44 +0,0 @@ -这是用MNN的python接口改造的离线量化工具,适用于如下情况: - 1. 你的模型无法使用MNN离线量化工具tools/quantization进行量化,例如多输入,数据预处理比较复杂 - 2. 你的模型无法使用MNN进行训练量化,受限于MNN的训练能力 - -为了使用这个工具,你需要提供: - 0. 使用 MNNConvert工具加上 --forTraining 将你的模型转换成MNN模型 (这步主要是为了保留模型中的BatchNorm,因此你保存pb或者onnx时不要做BatchNorm融合) - 1. 一个 calibration_dataset.py 文件,里面包含了你的校准数据集的定义 - 2. 一个 config.yaml 文件,里面包含了你模型的输入输出的相关信息 - -可以参考提供的 calibration_dataset.py 和 config.yaml 来实现 - -特别注意校准集中返回输入数据的顺序和config文件中输入的顺序应该是对应的 - -使用方法(batch size可以根据你的模型调整): - python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32 - -使用建议: - 1. 如果你的模型中卷积的激活是prelu的话,使用relu/relu6代替prelu可能会取得更好的量化精度和推理速度,这可能需要重新训练模型 - 2. 如果模型的输入无法固定,请将batch size设置为1,并且calibration_dataset的返回值也使用实际输入值的形状 - - -############################################################################ - - -This is a python version of MNN offline quant tool, use this tool when: - 1. you can not use MNN offline quant tool (tools/quantization) to quantize your model, cases like multi-input, complecated preprocessing - 2. you can not use MNN's quant-aware-training (QAT) tool to quantize your model, because of MNN's limited training features - -in order to use this tool, you need to provide: - 0. use --forTraining flag of MNNConvert to convert your model to MNN (this is mainly for preserving BatchNorm, - so you should NOT fuse BatchNorm when you save pb or onnx model files) - 1. a calibration_dataset.py file, in which you define your calibration dataset - 2. a config.yaml file, in which you provide information of inputs and outputs of your model - -you can refer to the example file to write your own. - -please Note, the order of returned input data in your calibration dataset should be aligned with the order of input your provide in your config.yaml file. - -usage of the tool (you can adjust batch size according to your own model): - python mnn_offline_quant.py --mnn_model origin_float_model.mnn --quant_model quant_model.mnn --batch_size 32 - -usage tips: - 1. if the activation function of conv is prelu in your model, use relu/relu6 instead of prelu may improve precision and inference speed of quantized model. re-training may be required. - 2. if the input shape can not be fixed, your should set batch_size=1, and the shape of returned values of calibration_dataset should be actual input's shape. diff --git a/tools/MNNPythonOfflineQuant/calibration_dataset.py b/tools/MNNPythonOfflineQuant/calibration_dataset.py deleted file mode 100644 index e1f291ef6..000000000 --- a/tools/MNNPythonOfflineQuant/calibration_dataset.py +++ /dev/null @@ -1,106 +0,0 @@ -import numpy as np -import os -from PIL import Image -import MNN -F = MNN.expr - - -# adapted from pycaffe -def load_image(filename, color=True): - """ - Load an image converting from grayscale or alpha as needed. - - Parameters - ---------- - filename : string - color : boolean - flag for color format. True (default) loads as RGB while False - loads as intensity (if image is already grayscale). - - Returns - ------- - image : an image with type np.float32 in range [0, 1] - of size (H x W x 3) in RGB or - of size (H x W x 1) in grayscale. 
- """ - img = Image.open(filename) - img = np.array(img) - if img.ndim == 2: - img = img[:, :, np.newaxis] - if color: - img = np.tile(img, (1, 1, 3)) - elif img.shape[2] == 4: - img = img[:, :, :3] - return img - - -def center_crop(image_data, crop_factor): - height, width, channels = image_data.shape - - h_size = int(height * crop_factor) - h_start = int((height - h_size) / 2) - h_end = h_start + h_size - - w_size = int(width * crop_factor) - w_start = int((width - w_size) / 2) - w_end = w_start + w_size - - cropped_image = image_data[h_start:h_end, w_start:w_end, :] - - return cropped_image - - -def resize_image(image, shape): - im = Image.fromarray(image) - im = im.resize(shape) - resized_image = np.array(im) - - return resized_image - - -class CalibrationDataset(MNN.data.Dataset): - ''' - This is demo for Imagenet calibration dataset. like pytorch, you need to overload __getiterm__ and __len__ methods - __getiterm__ should return a sample in F.const, and you should not use batch dimension here - __len__ should return the number of total samples in the calibration dataset - ''' - def __init__(self, image_folder): - super(CalibrationDataset, self).__init__() - self.image_folder = image_folder - self.image_list = os.listdir(image_folder)[0:1000] - - def __getitem__(self, index): - image_name = os.path.join(self.image_folder, self.image_list[index].split(' ')[0]) - - - # preprocess your data here, the following code are for tensorflow mobilenets - image_data = load_image(image_name) - image_data = center_crop(image_data, 0.875) - image_data = resize_image(image_data, (224, 224)) - image_data = (image_data - 127.5) / 127.5 - - - # after preprocessing the data, convert it to MNN data structure - dv = F.const(image_data.flatten().tolist(), [224, 224, 3], F.data_format.NHWC, F.dtype.float) - - ''' - first list for inputs, and may have many inputs, so it's a list - if your model have more than one inputs, add the preprocessed MNN const data to the input list - - second list for targets, also, there may be more than one targets - for calibration dataset, we don't need labels, so leave it blank - - Note that, the input order in the first list should be the same in your 'config.yaml' file. 
- ''' - - return [dv], [] - - def __len__(self): - # size of the dataset - return len(self.image_list) - - -''' -initialize a CalibrationDataset object, its name should be exactly 'calibration_dataset' -''' -calibration_dataset = CalibrationDataset(image_folder='/data/imagenet_train_images') diff --git a/tools/MNNPythonOfflineQuant/config.yaml b/tools/MNNPythonOfflineQuant/config.yaml deleted file mode 100644 index 77e8c9863..000000000 --- a/tools/MNNPythonOfflineQuant/config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -inputs: - names: - ['input', ] - shapes: - [[1, 3, 224, 224], ] - formats: - ['nchw', ] - -output_names: - ['MobilenetV2/Predictions/Reshape_1', ] diff --git a/tools/MNNPythonOfflineQuant/mnn_offline_quant.py b/tools/MNNPythonOfflineQuant/mnn_offline_quant.py deleted file mode 100644 index edb217516..000000000 --- a/tools/MNNPythonOfflineQuant/mnn_offline_quant.py +++ /dev/null @@ -1,137 +0,0 @@ -from __future__ import print_function -import time -import argparse -import numpy as np -import tqdm -import os -import MNN -import yaml -from calibration_dataset import calibration_dataset -try: - from MNN.tools.utils.log import mnn_logger -except: - mnn_logger = None - -nn = MNN.nn -F = MNN.expr -F.lazy_eval(True) - -def get_mnn_format(format_str): - fmt = str.lower(format_str) - if fmt == 'nchw': - return F.NCHW - elif fmt == 'nhwc': - return F.NHWC - elif fmt == 'nc4hw4': - return F.NC4HW4 - else: - raise ValueError("unknown format:", format_str) - -def quant_func(net, dataloader, opt): - net.train(True) - dataloader.reset() - - t0 = time.time() - for i in tqdm.trange(dataloader.iter_number): - example = dataloader.next() - input_data = example[0] - predicts = net.forward(input_data) - # fake update - opt.step(F.const([0.0], [])) - for predict in predicts: - predict.read() - - t1 = time.time() - cost = t1 - t0 - print("Epoch cost: %.3f s." % cost) - - return cost - - -def main(): - ''' - offline quantization using MNN python api. - - 1. you need to convert your model to mnn model - - 2. you need to provide a calibration dataset by modifying preprocessing steps in - 'calibration_dataset.py' to suit your case. - - 3. you need to provide a config yaml file in which provide input and output information about your model. 
- ''' - - parser = argparse.ArgumentParser() - parser.add_argument("--mnn_model", type=str, required=True,\ - help="original float MNN model file") - parser.add_argument("--quant_model", type=str, required=True, \ - help="name of quantized model to save") - parser.add_argument("--batch_size", type=int, required=False, default=32,\ - help="calibration batch size") - - args = parser.parse_args() - - mnn_model = args.mnn_model - quant_model = args.quant_model - batch_size = args.batch_size - - dataloader = MNN.data.DataLoader(calibration_dataset, batch_size=batch_size, shuffle=True) - - m = F.load_as_dict(mnn_model) - - inputs_outputs = F.get_inputs_and_outputs(m) - for key in inputs_outputs[0].keys(): - print('input names:\t', key) - for key in inputs_outputs[1].keys(): - print('output names:\t', key) - - config_file = "config.yaml" - f = open(config_file) - config = yaml.safe_load(f) - - # get inputs and outputs - inputs = [] - for name in config['inputs']['names']: - inputs.append(m[name]) - - outputs = [] - for name in config['output_names']: - outputs.append(m[name]) - - input_placeholders = [] - for i in range(len(inputs)): - shape = config['inputs']['shapes'][i] - fmt = config['inputs']['formats'][i] - nnn_format = get_mnn_format(fmt) - placeholder = F.placeholder(shape, nnn_format) - placeholder.name = config['inputs']['names'][i] - input_placeholders.append(placeholder) - - net = nn.load_module(inputs, outputs, True) - - # no use optimizer - opt = MNN.optim.SGD(net, 0.01, 0.9, 0.0005) - - nn.compress.train_quant(net, quant_bits=8) - - used_time = quant_func(net, dataloader, opt) - - # save model - net.train(False) - predicts = net.forward(input_placeholders) - print("quantized model save to " + quant_model) - F.save(predicts, quant_model) - - if mnn_logger is not None: - log_dict = {} - log_dict["tool"] = "python_offline_quant" - log_dict["model_guid"] = MNN.get_model_uuid(mnn_model) - src_model_size = os.path.getsize(mnn_model) / 1024.0 / 1024.0 - dst_model_size = os.path.getsize(quant_model) / 1024.0 / 1024.0 - compress_rate = src_model_size / dst_model_size - log_dict["detail"] = {"input_num": len(inputs), "used_time": int(used_time), \ - "src_model_size": src_model_size, "dst_model_size": dst_model_size, "compress_rate": compress_rate} - mnn_logger.put_log(log_dict, "quant") - - -if __name__ == "__main__": - main() diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp index 63e72052d..57956a9b0 100644 --- a/tools/converter/include/config.hpp +++ b/tools/converter/include/config.hpp @@ -52,6 +52,7 @@ class MNN_PUBLIC modelConfig { bool alignDenormalizedValue = true; bool detectSparseSpeedUp = true; bool convertMatmulToConv = true; + bool useGeluApproximation = true; bool transformerFuse = false; bool allowCustomOp = false; std::string customOpLibs = ""; diff --git a/tools/converter/source/TestConvertResult.cpp b/tools/converter/source/TestConvertResult.cpp index 89651d3ee..ea6659801 100644 --- a/tools/converter/source/TestConvertResult.cpp +++ b/tools/converter/source/TestConvertResult.cpp @@ -42,6 +42,7 @@ int main(int argc, char *argv[]) { modelPath.MNNModel = defaultCacheFile; modelPath.keepInputFormat = true; modelPath.saveExternalData = true; + modelPath.useGeluApproximation = false; MNN::Cli::convertModel(modelPath); } return MNN::Cli::testconvert(defaultCacheFile, directName, 0.01f, configFile); diff --git a/tools/converter/source/common/cli.cpp b/tools/converter/source/common/cli.cpp index bc2399b36..c2a1fbae4 100644 --- 
a/tools/converter/source/common/cli.cpp
+++ b/tools/converter/source/common/cli.cpp
@@ -286,6 +286,11 @@
             "save weight to extenal bin file.",
             cxxopts::value()
         )
+        (
+            "useGeluApproximation",
+            "use GELU approximation instead of the exact ERF computation",
+            cxxopts::value()
+        )
         (
             "convertMatmulToConv",
             "if 1, converter matmul with constant input to convolution. default: 1, range: {0, 1}",
             cxxopts::value()
         )
@@ -478,7 +483,9 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
     if (result.count("convertMatmulToConv")) {
         modelPath.convertMatmulToConv = result["convertMatmulToConv"].as();
     }
-
+    if (result.count("useGeluApproximation")) {
+        modelPath.useGeluApproximation = result["useGeluApproximation"].as();
+    }
     if (result.count("testdir")) {
         modelPath.testDir = result["testdir"].as();
     }
@@ -783,6 +790,14 @@ static bool compareOutput(MNN::Express::VARP output, const std::string& directNa
     absMax = MNN::Express::_Maximum(absMax, MNN::Express::_Scalar<float>(0.0001f));
     auto diff = MNN::Express::_Abs(targetValue - output);
     auto outputPtr = output->readMap<float>();
+#define MNN_IS_INF(x) (fabs(x) == INFINITY)
+#define MNN_IS_NAN(x) ((x) != (x))
+    for (int i=0; i<info->size; ++i) {
+        if (MNN_IS_INF(outputPtr[i]) || MNN_IS_NAN(outputPtr[i])) {
+            MNN_ERROR("TESTERROR %s value error:%f\n", name.c_str(), outputPtr[i]);
+            return false;
+        }
+    }
     auto diffAbsMax = MNN::Express::_ReduceMax(diff);
     auto absMaxV = absMax->readMap<float>()[0];
     auto diffAbsMaxV = diffAbsMax->readMap<float>()[0];
diff --git a/tools/converter/source/common/writeFb.cpp b/tools/converter/source/common/writeFb.cpp
index 4132a2770..68e4ce33e 100644
--- a/tools/converter/source/common/writeFb.cpp
+++ b/tools/converter/source/common/writeFb.cpp
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include "MNN_generated.h"
+#include "core/MNNFileUtils.h"
 #include "logkit.h"
 #include "writeFb.hpp"
 #include "CommonUtils.hpp"
@@ -145,8 +146,7 @@ int writeFb(std::unique_ptr<MNN::NetT>& netT, const std::string& MNNModelFile, c
         }
     }
     {
-        std::ofstream erase(".__convert_external_data.bin");
-        erase << "0";
+        MNNRemoveFile(".__convert_external_data.bin");
     }
     std::set<std::string> notSupportOps;
     auto CheckIfNotSupported = [&] (const std::unique_ptr<MNN::OpT>& op) {
diff --git a/tools/converter/source/onnx/CastLikeOnnx.cpp b/tools/converter/source/onnx/CastLikeOnnx.cpp
new file mode 100644
index 000000000..4a8f63547
--- /dev/null
+++ b/tools/converter/source/onnx/CastLikeOnnx.cpp
@@ -0,0 +1,26 @@
+//
+//  CastLikeOnnx.cpp
+//  MNNConverter
+//
+//  Created by MNN on 2024/10/17.
+// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include "onnxOpConverter.hpp" + +DECLARE_OP_CONVERTER(UniqueOnnx); + +MNN::OpType UniqueOnnx::opType() { + return MNN::OpType_Unique; +} + +MNN::OpParameter UniqueOnnx::type() { + return MNN::OpParameter_NONE; +} + +void UniqueOnnx::run(MNN::OpT *dstOp, const onnx::NodeProto *onnxNode, + OnnxScope* scope) { + for (int i = 0; i < onnxNode->attribute_size(); ++i) { + const auto& attributeProto = onnxNode->attribute(i); + const auto& attributeName = attributeProto.name(); + if (attributeName == "axis") { + MNN_ERROR("Don't support onnx Unique with axis\n"); + } + } + return; +} + +REGISTER_CONVERTER(UniqueOnnx, Unique); diff --git a/tools/converter/source/optimizer/merge/ConstantFolding.cpp b/tools/converter/source/optimizer/merge/ConstantFolding.cpp index 2c28c9797..b0a04b36a 100644 --- a/tools/converter/source/optimizer/merge/ConstantFolding.cpp +++ b/tools/converter/source/optimizer/merge/ConstantFolding.cpp @@ -52,8 +52,20 @@ ConstantFolding::ConstantFolding() { if (!output_info) { return false; } - const void* output_data = output->readMap(); - VARP const_var = _Const(output_data, output_info->dim, output_info->order, output_info->type); + VARP const_var; + if (expr->get() && expr->get()->type() == OpType_Int8ToFloat) { + auto yy = expr->inputs()[0]; + int size_ = expr->get()->main_as_QuantizedFloatParam()->tensorScale()->size(); + auto ss = _Const(expr->get()->main_as_QuantizedFloatParam()->tensorScale()->data(), {size_}); + auto zz = _Const(expr->get()->main_as_QuantizedFloatParam()->floatzeros()->data(), {size_}); + auto wf = (_Cast(yy) - zz) * ss; + auto weightDataPtr = wf->readMap(); + const_var = _Const(weightDataPtr, output_info->dim, output_info->order, output_info->type); + } else { + const void* output_data = output->readMap(); + const_var = _Const(output_data, output_info->dim, output_info->order, output_info->type); + } + const_var->setName(expr->name()); EXPRP constant = const_var->expr().first; constant->setName(expr->name()); diff --git a/tools/converter/source/optimizer/merge/ConvDeQuantizeLinearFuseToConvInt8.cpp b/tools/converter/source/optimizer/merge/ConvDeQuantizeLinearFuseToConvInt8.cpp index 83d45af22..8551edd6c 100644 --- a/tools/converter/source/optimizer/merge/ConvDeQuantizeLinearFuseToConvInt8.cpp +++ b/tools/converter/source/optimizer/merge/ConvDeQuantizeLinearFuseToConvInt8.cpp @@ -550,34 +550,24 @@ static auto gRegister = []() { // convInt8->(relu)->quant->cast->dequant->convIn if (nullptr == expr->get()) { return false; } - if (expr->get()->type() == OpType_Const || expr->get()->type() == OpType_TrainableParam) { + if (expr->get()->type() != OpType_Cast) { return false; } - - int inputs_size = static_cast(expr->inputs().size()); - for (int i = 0; i < inputs_size; ++i) { - if (!matchConvInt8ToOther(expr, i) && !matchOtherToOther(expr, i)) { - return false; - } + auto castparam = expr->get()->main_as_CastParam(); + if (castparam->dstT() != MNN::DataType_DT_UINT8) { + return false; + } + auto quantExpr = expr->inputs()[0]->expr().first; + if (quantExpr->get()->type() != OpType_FloatToInt8) { + return false; } return true; }; auto transformXToOther = [](EXPRP expr) { // X->quant->cast->dequant->output_other => X->output_other - int input_size = static_cast(expr->inputs().size()); - std::vector new_inputs(input_size); - for (int i = 0; i < input_size; ++i) { - if (matchConvInt8ToOther(expr, i)) { - VARP input_i = transformConvInt8ToOther(expr, i); - new_inputs[i] = input_i; - } else { - VARP 
input_i = transformOtherToOther(expr, i); - new_inputs[i] = input_i; - } - } - + auto quantExpr = expr->inputs()[0]->expr().first; // generate a new oher op. - std::unique_ptr oldOtherOp(expr->get()->UnPack()); - auto newop_expr = Expr::create(oldOtherOp.get(), new_inputs); + std::unique_ptr oldOtherOp(quantExpr->get()->UnPack()); + auto newop_expr = Expr::create(oldOtherOp.get(), quantExpr->inputs()); Expr::replace(expr, newop_expr); return true; @@ -708,8 +698,8 @@ static auto gRegister = []() { // convInt8->(relu)->quant->cast->dequant->convIn Expr::replace(expr, conv_expr); return true; } - float output_scale = quan_expr->inputs().at(2)->readMap()[0]; - float output_zero = quan_expr->inputs().at(3)->readMap()[0]; + float output_scale = quan_expr->get()->main_as_QuantizedFloatParam()->tensorScale()->data()[0]; + float output_zero = quan_expr->get()->main_as_QuantizedFloatParam()->floatzeros()->data()[0]; // directly return the op output. std::unique_ptr oldOtherOp(X_expr->get()->UnPack()); auto newop_expr = Expr::create(oldOtherOp.get(), X_expr->inputs()); diff --git a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp index 998f68844..56f4ccbf8 100644 --- a/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp +++ b/tools/converter/source/optimizer/merge/ConvertMatMulToConv2D.cpp @@ -418,7 +418,7 @@ ConvertMatMulToConv2D::ConvertMatMulToConv2D() { matmul_expr = matmul_var->expr().first; } } - if (matmul_expr->inputs().size() != 8 && matmul_expr->inputs().size() != 9) { // matmul 8 input: (x,y,x_scale,x_zero,y_scale,y_zero,out_scale,out_zero,bias + if (matmul_expr->inputs().size() != 8 && matmul_expr->inputs().size() != 9) { // matmul 8 input: for MatMulInteger (x,y,x_scale,x_zero,y_scale,y_zero,out_scale,out_zero,bias return false; } if (matmul_var->linkNumber() > 1) { diff --git a/tools/converter/source/optimizer/merge/FuseTemplateOp.cpp b/tools/converter/source/optimizer/merge/FuseTemplateOp.cpp index c6a0c3c65..2d838a69c 100644 --- a/tools/converter/source/optimizer/merge/FuseTemplateOp.cpp +++ b/tools/converter/source/optimizer/merge/FuseTemplateOp.cpp @@ -384,7 +384,7 @@ static auto gRegister = []() { auto transform = [templatesExprs, input](EXPRP expr) { auto config = Global::Get(); auto unaryType = UnaryOpOperation_GELU_STANDARD; - if (config->optimizeLevel == 2) { + if (config->useGeluApproximation) { unaryType = UnaryOpOperation_GELU; } for (auto templateExpr : templatesExprs) { diff --git a/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp b/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp index b122d2fb7..ec43deee9 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxConvolutionMerge.cpp @@ -149,18 +149,9 @@ class OnnxConvolutionTransform : public OnnxExtraManager::Transform { MNN_ERROR("Convolution should know weight shape infromation!\n"); return nullptr; } - INTS weightShape; - if (weightIden) { - auto dim = weight_expr->inputs().at(4)->readMap(); - int dimSize = weight_expr->inputs().at(4)->getInfo()->dim[0]; - for (int k = 0; k < dimSize; ++k) { - weightShape.emplace_back(dim[k]); - } - } else { - weightShape = weight->getInfo()->dim; - } + INTS weightShape = weight->getInfo()->dim; + bool convertToConvint8 = false; - convertToConvint8 = (true == weightIden && true == xIden && weight_expr->inputs().size() == 5); auto op = expr->get(); auto extraParam = 
op->main_as_Extra(); @@ -179,6 +170,10 @@ class OnnxConvolutionTransform : public OnnxExtraManager::Transform { co = weightShape[1]; ci = weightShape[0]; } + if (weightIden) { + co = weightShape[1]; + ci = weightShape[0]; + } int group = 1; int dilation_h = 1; @@ -292,103 +287,23 @@ class OnnxConvolutionTransform : public OnnxExtraManager::Transform { // Fastest limitNumber = 100; } + VARP wf = weight; if ( weight->linkNumber() <= limitNumber && !convertToConvint8) { - weightDataPtr = weight->readMap(); - } - EXPRP reluExpr; - bool hasRelu = false; - if (convertToConvint8) { - // Get output quant info. - auto outputExpr = expr->outputs().front().lock(); - - if (outputExpr->get() && (outputExpr->get()->type() == OpType::OpType_ReLU || outputExpr->get()->type() == OpType_ReLU6)) { - reluExpr = std::move(outputExpr); - outputExpr = reluExpr->outputs().front().lock(); - hasRelu = true; + if (!weightIden) { + weightDataPtr = weight->readMap(); } - auto outputScaleVar = outputExpr->inputs()[1]; - float outputScale = outputScaleVar->readMap()[0]; - int8_t outputZero = 0; - if (outputExpr->inputs().size() > 2) { - if (outputExpr->inputs()[2]->getInfo()->type.code == halide_type_uint) { - outputZero = static_cast(outputExpr->inputs()[2]->readMap()[0] - 128); - } else { - outputZero = static_cast(outputExpr->inputs()[2]->readMap()[0]); - } - - } - // Get weight quant info. - float inputClampMin = -128; - float inputClampMax = 127; - auto weightexpr = weight->expr().first; - auto weightInt8 = weightexpr->inputs()[0]; - auto pw= weightInt8->readMap(); - const size_t weightSize = co * ci * kh * kw; -// std::vector weightData(weightSize); - std::vector weightKenelSum(co); - const int kernelSize = static_cast(weightSize / co); -// for (int cnt = 0; cnt < weightSize; ++cnt) { -// weightData[cnt] = pw[cnt]; -// } - for (int i = 0; i < co; i++) { - int temp = 0; - int offset = i * kernelSize; - for (int j = 0; j < kernelSize; j++) { - temp += int(pw[offset + j]); - } - weightKenelSum[i] = temp; - } - std::vector biasInt32(common->outputCount, 0); - convParam->quanParameter.reset(new IDSTQuanT); - convParam->quanParameter->aMin = -128; - convParam->quanParameter->aMax = co; - convParam->quanParameter->readType = co; - convParam->quanParameter->type = 4; - convParam->quanParameter->buffer.resize(weightSize); - ::memcpy(convParam->quanParameter->buffer.data(), pw, weightSize * sizeof(int8_t)); - convParam->quanParameter->quantScale = 1.0f; - convParam->quanParameter->scaleOut = outputScale; - convParam->symmetricQuan.reset(new QuantizedFloatParamT); - convParam->symmetricQuan->nbits = 8; - - // Get input quant info. 
- auto inputExpr = inputs[0]->expr().first; - //x = inputExpr->inputs()[0]; // for op merge to convint8, so remain int8ToFloat layer for the moment - auto inputScaleVar = inputExpr->inputs()[2]; - auto inputZeroVar = inputExpr->inputs()[3]; - float inputScale = inputScaleVar->readMap()[0]; - int8_t inputZero = static_cast(inputZeroVar->readMap()[0]); - - convParam->quanParameter->scaleIn = inputScale; - convParam->quanParameter->alpha.resize(2 * co); - - // Compute convInt8 scale=(inputScale * weightScale)/outputScale - std::vector scale(co); - auto weightScale = weightexpr->inputs().at(2); - auto ptrscale = weightScale->readMap(); - auto weightZero = weightexpr->inputs().at(3); - auto ptrzero = weightZero->readMap(); - for (int cnt = 0; cnt < co; ++cnt) { - convParam->quanParameter->alpha[2 * cnt + 1] = ptrscale[cnt]; - convParam->quanParameter->alpha[2 * cnt] = (-1)*(ptrzero[cnt] + 128) * ptrscale[cnt]; - } - convParam->bias.resize(co); - if (inputSize > 2) { - auto biasExpr = inputs[2]->expr().first; - auto biasfp32Var = biasExpr->inputs()[1]; - if (biasfp32Var->readMap() == nullptr) { - MNN_ERROR("Convolution bias should be constant\n"); - return nullptr; - } - ::memcpy(convParam->bias.data(), biasfp32Var->readMap(), co * sizeof(float)); + else { + auto yy = weight->expr().first->inputs()[0]; // weight shape: [ic,oc,kh,kw] + auto ss = _Const(weight->expr().first->get()->main_as_QuantizedFloatParam()->tensorScale()->data(), {co}); + auto zz = _Const(weight->expr().first->get()->main_as_QuantizedFloatParam()->floatzeros()->data(), {co}); + wf = (_Cast(_Permute(yy, {0, 2, 3, 1})) - zz) * ss; + wf = _Permute(wf, {3, 0, 1, 2}); + weightDataPtr = wf->readMap(); } - convParam->symmetricQuan->clampMax = 127; - convParam->symmetricQuan->clampMin = -128; - convParam->symmetricQuan->zeroPoint = std::move(inputZero); - convParam->symmetricQuan->outputZeroPoint = std::move(outputZero); } - // Do not return convInt8. 
- if (false == convertToConvint8 && weightDataPtr) { + EXPRP reluExpr; + bool hasRelu = false; + if (weightDataPtr) { if (weight->linkNumber() > 1) { static bool gPrint = false; if (!gPrint) { diff --git a/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp b/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp index d67760d27..856871225 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxDeQuantizeLinear.cpp @@ -13,6 +13,32 @@ namespace MNN { namespace Express { +static VARP _Int8ToFloat(VARP x, VARP scale, VARP zero) { + MNN_ASSERT(scale->getInfo() && zero->getInfo()); + MNN_ASSERT(scale->getInfo()->size == zero->getInfo()->size || zero->getInfo()->size <= 1); + auto size = 1; + if (scale->getInfo()->size > 1) { + size = scale->getInfo()->size; + } + std::unique_ptr op(new OpT); + op->type = OpType_Int8ToFloat; + op->main.type = OpParameter_QuantizedFloatParam; + op->main.value = new QuantizedFloatParamT; + op->main.AsQuantizedFloatParam()->tensorScale.resize(size); + if (scale->readMap()) { + ::memcpy(op->main.AsQuantizedFloatParam()->tensorScale.data(), scale->readMap(), size * sizeof(float)); + } + op->main.AsQuantizedFloatParam()->floatzeros.resize(size); + if (zero->readMap()) { + auto zerosize = 1; + if (zero->getInfo()->size > 1) { + zerosize = zero->getInfo()->size; + } + ::memcpy(op->main.AsQuantizedFloatParam()->floatzeros.data(), zero->readMap(), zerosize * sizeof(float)); + } + return Variable::create(Expr::create(op.get(), {x})); +} + class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { @@ -23,64 +49,71 @@ class OnnxDequantizeLinearTransform : public OnnxExtraManager::Transform { MNN_ERROR("Onnx QuantizeLinear input error: inputs size<2\n"); return nullptr; } + bool int32Dequant = false; auto input = inputs[0]; auto scale = inputs[1]; - - if (nullptr == scale || nullptr == input) { - MNN_ERROR("QuantizeLinear should provide scale and input\n"); - return nullptr; - } auto dataType = halide_type_int; VARP zeropoint = _Const(0.f); if (inputs.size() > 2) { - if (inputs[2]->getInfo() == nullptr) { - MNN_ERROR("DequantizeLinear layer inputs.size>2, but zeroPoint is not const\n"); - } - MNN_ASSERT(inputs[2]->getInfo() != nullptr); - auto zeroDim = inputs[2]->getInfo()->dim; - dataType = static_cast(inputs[2]->getInfo()->type.code); - std::vector fp32Zero(inputs[2]->getInfo()->size); - if (dataType == halide_type_int) { - const int8_t* zeroPtr = inputs[2]->readMap(); - for (int j = 0; j < fp32Zero.size(); ++j) { - fp32Zero[j] = static_cast(zeroPtr[j]); - } - zeropoint = _Const(fp32Zero.data(), zeroDim, inputs[2]->getInfo()->order, halide_type_of()); - } else { - const uint8_t* zeroPtr = inputs[2]->readMap(); - for (int j = 0; j < fp32Zero.size(); ++j) { - fp32Zero[j] = static_cast(zeroPtr[j]) - 128.f; - } - zeropoint = _Const(fp32Zero.data(), zeroDim, inputs[2]->getInfo()->order, halide_type_of()); + if (inputs[2]->getInfo()) { + dataType = static_cast(inputs[2]->getInfo()->type.code); } zeropoint = _Cast(inputs[2]); + } std::vector inputDim = {}; if (input->getInfo()) { inputDim = input->getInfo()->dim; dataType = static_cast(input->getInfo()->type.code); + if (input->getInfo()->type.bits == 32) { + // from onnx document. 
+ auto floatinput = _Cast(input); + auto output = floatinput * scale; + output->expr().first->setName(expr->name()); + return output->expr().first; + } + if (dataType == halide_type_uint && input->readMap()) { + auto floatinput = _Cast(input); + auto output = (floatinput - zeropoint) * scale; + output->expr().first->setName(expr->name()); + return output->expr().first; + } } auto offset = _Const(0.f); if (dataType == halide_type_uint) { offset = _Const(128.f); } - // if (!scale->getInfo()->dim.empty()) { - // zeropoint = _Unsqueeze(zeropoint, {1,2,3}); - // scale = _Unsqueeze(scale, {1, 2, 3}); - // } else { - // scale = _Reshape(scale, {1}); - // zeropoint = _Reshape(zeropoint, {1}); - // } - auto _shape = _Const(inputDim.data(), {static_cast(inputDim.size())}, NHWC, halide_type_of()); - auto output = (_Cast(input) - zeropoint) * scale; std::unique_ptr iden(new MNN::OpT); iden->type = OpType_Int8ToFloat; - auto newExpr = MNN::Express::Expr::create(iden.get(), {input, output, scale, zeropoint - offset, _shape}, 5); - newExpr->setName(expr->name()); - return newExpr; + if (input->getInfo() && input->getInfo()->dim.size() == 4) { // convolution weight + auto shape_ = input->getInfo()->dim; + int size = scale->getInfo()->dim[0]; + // [oc,ic,kx,ky] -> [ic,oc,kx,ky] + auto x = _Permute(input, {1, 0, 2, 3}); + auto y = _Int8ToFloat(x, scale, zeropoint - offset); + y->expr().first->setName(expr->name()); + return y->expr().first; + } + if (scale->readMap() && input->getInfo() && input->getInfo()->type.bits == 8) { // matmul B const + auto newvar = _Int8ToFloat(input, scale, (zeropoint- offset)); + newvar->expr().first->setName(expr->name()); + return newvar->expr().first; + } + + if (scale->readMap() == nullptr) { // dynamic layer's input + auto int8ToFloatvar = _Int8ToFloat(input, _Const(1.0f), _Const(0.f)); + auto output = (int8ToFloatvar - zeropoint) * scale; + output->expr().first->setName(expr->name()); + return output->expr().first; + } + auto newvar = _Int8ToFloat(input, scale, (zeropoint- offset)); + newvar->expr().first->setName(expr->name()); + return newvar->expr().first; + + } }; diff --git a/tools/converter/source/optimizer/onnxextra/OnnxGemm.cpp b/tools/converter/source/optimizer/onnxextra/OnnxGemm.cpp index f5a96cb27..d90434df5 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxGemm.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxGemm.cpp @@ -8,6 +8,7 @@ #include "MNN_generated.h" #include "OnnxExtraManager.hpp" +#include "../merge/MergeHelpers.hpp" namespace MNN { namespace Express { @@ -21,6 +22,25 @@ static VARP _MatMul_Int8(VARP a, VARP b, bool tranposeA, bool tranposeB, VARP sc return (Variable::create(Expr::create(op.get(), {a, b, scaleA, zeroA, scaleB, zeroB, ScaleOut, ScaleZero, bias}))); } +static VARP _ReshapeF(VARP x, VARP shape, MNN::MNN_DATA_FORMAT format) { + MNN_ASSERT(nullptr != x); + std::unique_ptr reshape(new OpT); + reshape->type = OpType_Reshape; + reshape->main.type = OpParameter_Reshape; + reshape->main.value = new ReshapeT; + reshape->main.AsReshape()->dimType = format; + return (Variable::create(Expr::create(reshape.get(), {x, shape}))); +} +static VARP _ConvertF(VARP input, MNN::MNN_DATA_FORMAT format) { + std::unique_ptr convert(new OpT); + convert->type = OpType_ConvertTensor; + convert->main.type = OpParameter_TensorConvertInfo; + convert->main.value = new TensorConvertInfoT; + convert->main.AsTensorConvertInfo()->source = MNN_DATA_FORMAT_NC4HW4; + convert->main.AsTensorConvertInfo()->dest = format; + return 
(Variable::create(Expr::create(convert.get(), {input}))); +} + class OnnxGemmTransform : public OnnxExtraManager::Transform { public: virtual EXPRP onExecute(EXPRP expr) const override { @@ -60,6 +80,137 @@ class OnnxGemmTransform : public OnnxExtraManager::Transform { auto y_expr = Y->expr().first; auto Z = _MatMul(X, Y, transA, transB); if (x_expr->get() && y_expr->get() && x_expr->get()->type() == OpType_Int8ToFloat && y_expr->get()->type() == OpType_Int8ToFloat) { + auto config = Global::Get(); + if (helpers::IsConstant(y_expr)) { + auto matmulOp = expr->get(); + auto weight = Y; + auto input = X; + auto weightInfo = weight->getInfo(); + auto transposeB = matmulOp->main_as_MatMul()->transposeB(); + auto transposeA = matmulOp->main_as_MatMul()->transposeA(); + auto needSqueezeB = false; + auto needSqueezeA = false; + bool inputShapeUnknow = false; + if (input->getInfo() != nullptr) { + if (input->getInfo()->dim.size() <= 1) { + input = _Unsqueeze(input, {0}); + needSqueezeA = true; + } + } else { + inputShapeUnknow = true; + } + if (weightInfo->dim.size() == 1) { + weight = _Unsqueeze(weight, {1}); + needSqueezeB = true; + } + if (!transposeB) { + weight = _Transpose(weight, {1, 0}); + } + if (X->getInfo() && X->getInfo()->dim.size() <= 1) { + X = _Unsqueeze(X, {0}); + needSqueezeA = true; + } + if (needSqueezeA && needSqueezeB) { + MNN_ERROR("Invalid MatMul for one-dimension A and B\n"); + return nullptr; + } + auto format = MNN::MNN_DATA_FORMAT_NCHW; + int oc = weight->getInfo()->dim[0]; + int ic = weight->getInfo()->dim[1]; + + // quan parameters + float inputScale = X->expr().first->get()->main_as_QuantizedFloatParam()->tensorScale()->data()[0]; + float inputZero = X->expr().first->get()->main_as_QuantizedFloatParam()->floatzeros() ->data()[0]; + auto weightScale = Y->expr().first->get()->main_as_QuantizedFloatParam()->tensorScale()->data(); + auto weightZero = Y->expr().first->get()->main_as_QuantizedFloatParam()->floatzeros()->data(); + // conv op + std::unique_ptr conv(new MNN::Convolution2DT); + conv->common.reset(new MNN::Convolution2DCommonT); + conv->common->inputCount = ic; + conv->common->outputCount = oc; + // conv quant parameters + conv->quanParameter.reset(new IDSTQuanT); + conv->quanParameter->scaleIn = inputScale; + conv->quanParameter->type = 4; + conv->quanParameter->aMin = -128; + conv->quanParameter->readType = oc; + conv->quanParameter->quantScale = 1.f; + conv->quanParameter->buffer.resize(Y->getInfo()->size); + ::memcpy(conv->quanParameter->buffer.data(), weight->readMap(), Y->getInfo()->size); + conv->quanParameter->alpha.resize(2 * oc); + for (int i = 0; i < oc; ++i) { + conv->quanParameter->alpha[2 * i] = (-1 * weightZero[i] - 128.f) / weightScale[i]; // minval + conv->quanParameter->alpha[2 * i + 1] = weightScale[i]; + } + // output expr + auto outputExpr = expr->outputs().front().lock(); + auto outputScaleVar = outputExpr->inputs()[1]; + auto outputZero = _Const(0.f); + if (outputExpr->inputs().size() > 2 && outputExpr->inputs()[2]->getInfo()) { + if (outputExpr->inputs()[2]->getInfo()->type.code == halide_type_int) { + outputZero = _Cast(outputExpr->inputs()[2]); + } else { + outputZero = _Cast(outputExpr->inputs()[2]) - _Const(128.f); + } + } + conv->quanParameter->scaleOut = outputScaleVar->readMap()[0]; + conv->symmetricQuan.reset(new QuantizedFloatParamT); + conv->symmetricQuan->nbits = 8; + conv->symmetricQuan->clampMax = 127; + conv->symmetricQuan->clampMin = -128; + conv->symmetricQuan->zeroPoint = static_cast(inputZero); + 
conv->symmetricQuan->outputZeroPoint = static_cast(outputZero->readMap()[0]); + conv->bias.resize(oc); + if (inputs.size() > 2) { + memcpy(conv->bias.data(), inputs[2]->readMap(), oc * sizeof(float)); + } + + std::unique_ptr conv_op(new OpT); + conv_op->type = OpType_Convolution; + conv_op->main.type = OpParameter_Convolution2D; + conv_op->main.value = conv.release(); + + auto rank = _Rank(X); + auto inputShape = _Shape(X, NCHW); + auto inputL = _Unsqueeze(_Scalar(ic), {0}); + inputL.fix(VARP::CONSTANT); + auto outputH = _Unsqueeze(_Scalar(oc), {0}); + outputH.fix(VARP::CONSTANT); + VARP remainBegin; + VARP inputELength; + if (inputShapeUnknow) { + remainBegin = _Minimum(_Scalar(2), rank); + inputELength = remainBegin - _Scalar(1); + } else { + remainBegin = _Scalar(2); + inputELength = _Scalar(1); + } + auto rankRemain = _Unsqueeze(rank - remainBegin, {0}); + VARP inputE; + VARP inputRemain = _Slice(inputShape, _Unsqueeze(_Scalar(0), {0}), rankRemain); + if (transposeA) { + inputE = _Slice(inputShape, _Unsqueeze(rank - _Scalar(1), {0}), _Unsqueeze(_Scalar(1), {0})); + input = _ReshapeF(X, _Concat({_Unsqueeze(_Scalar(-1), {0}), inputL, inputE, _Unsqueeze(_Scalar(1), {0})}, 0), format); + + } else { + inputE = _Slice(inputShape, rankRemain, _Unsqueeze(inputELength, {0})); + input = _ReshapeF(X, _Concat({_Unsqueeze(_Scalar(-1), {0}), inputL, _Unsqueeze(_Scalar(1), {0}), _Unsqueeze(_Scalar(1), {0})}, 0), format); + + } + EXPRP dense_expr = Expr::create(conv_op.get(), {X}, 1); + VARP output = Variable::create(dense_expr); + output->setName(expr->outputName(0) + "__matmul_converted"); + output = _ConvertF(output, format); + VARP reshapeVar = _ReshapeF(output, _Concat({inputRemain, inputE, outputH}, 0), format); + if (needSqueezeA) { + reshapeVar = _Squeeze(reshapeVar, {0}); + } + if (needSqueezeB) { + reshapeVar = _Squeeze(reshapeVar, {1}); + } + reshapeVar->setName(expr->outputName(0)); + return reshapeVar->expr().first; + } // input quant info auto y_int8 = y_expr->inputs().at(0); auto y_scale = y_expr->inputs().at(2); diff --git a/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp b/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp index fae1ffb0c..ec765c434 100644 --- a/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp +++ b/tools/converter/source/optimizer/onnxextra/OnnxQuantizeLinear.cpp @@ -13,6 +13,22 @@ namespace MNN { namespace Express { +static VARP _Float2Int8(VARP x, VARP scale, VARP zero) { + int size = 1; + if (scale->getInfo()->size > 1) { + size = scale->getInfo()->size; + } + std::unique_ptr op(new OpT); + op->type = OpType_FloatToInt8; + op->main.type = OpParameter_QuantizedFloatParam; + op->main.value = new QuantizedFloatParamT; + op->main.AsQuantizedFloatParam()->tensorScale.resize(size); + op->main.AsQuantizedFloatParam()->floatzeros.resize(size); + ::memcpy(op->main.AsQuantizedFloatParam()->tensorScale.data(), scale->readMap(), size * sizeof(float)); + ::memcpy(op->main.AsQuantizedFloatParam()->floatzeros.data(), zero->readMap(), size * sizeof(float)); + return Variable::create(Expr::create(op.get(), {x})); +} + /* Given a float input value x, it quantizes x to corresponding int8 value quant_x using scales and zeroPoint. 
*/ class OnnxQuantizeLinearTransform : public OnnxExtraManager::Transform { public: @@ -26,35 +42,22 @@ class OnnxQuantizeLinearTransform : public OnnxExtraManager::Transform { } auto input = inputs[0]; auto scale = inputs[1]; - - if (nullptr == scale || nullptr == input) { - MNN_ERROR("QuantizeLinear should provide scale and input\n"); - return nullptr; - } auto dataType = halide_type_int; VARP zeropoint = _Const(0.f); auto offset = _Const(0.f); if (inputs.size() > 2) { zeropoint = _Cast(inputs[2]); - dataType = static_cast(inputs[2]->getInfo()->type.code); + if (inputs[2]->getInfo()) { + dataType = static_cast(inputs[2]->getInfo()->type.code); + } } if (dataType == halide_type_uint) { offset = _Const(128.f); } - auto scaleReq = _Reciprocal(scale); - // auto output = _Cast(_Round(_Relu6(_Round(input * scaleReq) + zeropoint, -128.0f, 127.0f))); - auto output = _FloatToInt8(input, scaleReq, -128, 127, static_cast(zeropoint->readMap()[0] - offset->readMap()[0])); - std::unique_ptr iden(new MNN::OpT); - iden->type = OpType_FloatToInt8; - std::vector inputDim = {}; - - if (input->getInfo()) { - inputDim = input->getInfo()->dim; - } - auto _shape = _Const(inputDim.data(), {static_cast(inputDim.size())}, NHWC, halide_type_of()); - auto newExpr = MNN::Express::Expr::create(iden.get(), {input, output, scale, zeropoint - offset, _shape}, 5); - newExpr->setName(expr->name()); - return newExpr; + MNN_ASSERT(scale->readMap() != nullptr); + auto newvar = _Float2Int8(input, _Reciprocal(scale), zeropoint - offset); + newvar->expr().first->setName(expr->name()); + return newvar->expr().first; } }; diff --git a/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp b/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp index 4f4001b00..0c7059c02 100644 --- a/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp +++ b/tools/converter/source/optimizer/postconvert/RemoveInvalidCast.cpp @@ -62,12 +62,6 @@ class RemoveInvalidCast : public PostConverter { for (auto iter = net->oplists.begin(); iter != net->oplists.end(); iter++) { auto& op = *iter; switch (op->type) { - case MNN::OpType_Input: - types[op->outputIndexes[0]] = op->main.AsInput()->dtype; - break; - case MNN::OpType_Cast: - types[op->outputIndexes[0]] = op->main.AsCastParam()->dstT; - break; // Float Op case MNN::OpType_PReLU: case MNN::OpType_Softmax: @@ -76,12 +70,34 @@ class RemoveInvalidCast : public PostConverter { case MNN::OpType_Convolution3D: case MNN::OpType_Deconvolution: case MNN::OpType_DeconvolutionDepthwise: + case MNN::OpType_Interp: + case MNN::OpType_LSTM: + case MNN::OpType_LSTMBlockCell: + case MNN::OpType_GridSample: + case MNN::OpType_RNNSequenceGRU: case MNN::OpType_MatMul: + types[op->inputIndexes[0]] = MNN::DataType_DT_FLOAT; if (op->outputIndexes.size() == 1) { // 4 is integer matmul types[op->outputIndexes[0]] = MNN::DataType_DT_FLOAT; } break; + default: + break; + } + } + for (auto iter = net->oplists.begin(); iter != net->oplists.end(); iter++) { + auto& op = *iter; + switch (op->type) { + case MNN::OpType_Input: + types[op->outputIndexes[0]] = op->main.AsInput()->dtype; + break; + case MNN::OpType_Cast: + types[op->outputIndexes[0]] = op->main.AsCastParam()->dstT; + break; + case MNN::OpType_CastLike: + types[op->outputIndexes[0]] = types[op->inputIndexes[1]]; + break; case MNN::OpType_Const: case MNN::OpType_TrainableParam: types[op->outputIndexes[0]] = op->main.AsBlob()->dataType; @@ -96,12 +112,24 @@ class RemoveInvalidCast : public PostConverter { types[v] = 
types[op->inputIndexes[0]]; } break; + case MNN::OpType_GatherV2: + case MNN::OpType_GatherND: + case MNN::OpType_Reduction: + case MNN::OpType_Range: + types[op->outputIndexes[0]] = types[op->inputIndexes[0]]; + break; case MNN::OpType_Shape: case MNN::OpType_Size: case MNN::OpType_Rank: case MNN::OpType_UnravelIndex: types[op->outputIndexes[0]] = MNN::DataType_DT_INT32; break; + case MNN::OpType_Unique: + types[op->outputIndexes[0]] = types[op->inputIndexes[0]]; + for (int v=1; voutputIndexes.size(); ++v) { + types[op->outputIndexes[v]] = MNN::DataType_DT_INT32; + } + break; case MNN::OpType_RandomUniform: types[op->outputIndexes[0]] = op->main.AsRandomUniform()->type; break; @@ -167,7 +195,7 @@ class RemoveInvalidCast : public PostConverter { const MNN::NetT* const netPtr = net.get(); for (auto iter = net->oplists.begin(); iter != net->oplists.end();) { auto& op = *iter; - if (op->type != MNN::OpType_Cast) { + if (op->type != MNN::OpType_Cast && op->type != MNN::OpType_CastLike) { iter++; continue; } @@ -176,6 +204,18 @@ class RemoveInvalidCast : public PostConverter { continue; } if (types[op->inputIndexes[0]] != types[op->outputIndexes[0]]) { + auto type = types[op->outputIndexes[0]]; + if (op->type == MNN::OpType_CastLike) { + if (type != MNN::DataType_DT_INVALID) { + // Turn Castlike to cast + op->type = MNN::OpType_Cast; + op->inputIndexes = {op->inputIndexes[0]}; + op->main.Reset(); + op->main.value = new CastParamT; + op->main.type = OpParameter_CastParam; + op->main.AsCastParam()->dstT = type; + } + } iter++; continue; } diff --git a/tools/converter/source/optimizer/tflitextra/FullConnect.cpp b/tools/converter/source/optimizer/tflitextra/FullConnect.cpp index da124e525..406bd8ccb 100644 --- a/tools/converter/source/optimizer/tflitextra/FullConnect.cpp +++ b/tools/converter/source/optimizer/tflitextra/FullConnect.cpp @@ -30,12 +30,14 @@ class FCTransform : public TFliteExtraManager::Transform { } } } - MNN_ASSERT(inputs.size() == 3); + MNN_ASSERT(inputs.size() >= 2); auto input = inputs[0]; auto weight = inputs[1]; - auto bias = inputs[2]; input = _Reshape(input, {0, -1}, NHWC); - auto newOutput = _MatMul(input, weight, false, true) + bias; + auto newOutput = _MatMul(input, weight, false, true); + if (inputs.size() == 3) { + newOutput = newOutput + inputs[2]; + } if (activation == tflite::ActivationFunctionType_RELU) { newOutput = _Relu(newOutput); } else if (activation == tflite::ActivationFunctionType_RELU6) { diff --git a/tools/converter/source/tflite/liteConverter.cpp b/tools/converter/source/tflite/liteConverter.cpp index 49d307e28..401d7426e 100644 --- a/tools/converter/source/tflite/liteConverter.cpp +++ b/tools/converter/source/tflite/liteConverter.cpp @@ -305,8 +305,6 @@ int tflite2MNNNet(const std::string inputModel, const std::string bizCode, op->type = creator->opType(quantizedModel); op->main.type = creator->type(quantizedModel); // set default input output index - op->inputIndexes.resize(ops[j]->inputs.size()); - op->outputIndexes.resize(ops[j]->outputs.size()); auto insertQuantinfo = [&](int idx) { if (quantizedModel != 2) { return; @@ -327,12 +325,19 @@ int tflite2MNNNet(const std::string inputModel, const std::string bizCode, tensorDescribe->quantInfo->zero = quant->zero_point[0]; MNNNetT->extraTensorDescribe.emplace_back(std::move(tensorDescribe)); }; + op->inputIndexes.clear(); + op->outputIndexes.clear(); + for (int i = 0; i < ops[j]->inputs.size(); i++) { - op->inputIndexes[i] = ops[j]->inputs[i]; + if (ops[j]->inputs[i] >= 0) { + 
op->inputIndexes.emplace_back(ops[j]->inputs[i]); + } } for (int i = 0; i < ops[j]->outputs.size(); i++) { - op->outputIndexes[i] = ops[j]->outputs[i]; - insertQuantinfo(ops[j]->outputs[i]); + if (ops[j]->outputs[i] >= 0) { + op->outputIndexes.emplace_back(ops[j]->outputs[i]); + insertQuantinfo(ops[j]->outputs[i]); + } } // Run actual conversion creator->run(op, ops[j], tensors, tfliteModelBuffer, tfliteOpSet, quantizedModel); diff --git a/tools/converter/source/torch/CMakeLists.txt b/tools/converter/source/torch/CMakeLists.txt index e8eca41ec..6f52a3e28 100644 --- a/tools/converter/source/torch/CMakeLists.txt +++ b/tools/converter/source/torch/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB TORCH_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp ${CMAKE_CURRENT_LIST_DIR}/*.hpp) add_library(MNNConverterTorch OBJECT ${TORCH_SRC}) +target_compile_options(MNNConverterTorch PRIVATE -std=c++17) IF (CMAKE_SYSTEM_NAME MATCHES "Linux" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") SET(LIB_TORCH_PATH "${CMAKE_CURRENT_BINARY_DIR}/libtorch/share/cmake") diff --git a/tools/converter/source/torch/EluTorch.cpp b/tools/converter/source/torch/EluTorch.cpp new file mode 100644 index 000000000..39d928bad --- /dev/null +++ b/tools/converter/source/torch/EluTorch.cpp @@ -0,0 +1,35 @@ +// +// EluTorch.cpp +// MNNConverter +// +// Created by MNN on 2024/11/08. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#include +#include "torchOpConverter.hpp" + +DECLARE_OP_CONVERTER(EluTorch); + +MNN::OpType EluTorch::opType() { + return MNN::OpType_ELU; +} +MNN::OpParameter EluTorch::type() { + return MNN::OpParameter_ELU; +} +std::vector EluTorch::inputTensorIdx() { + return {0}; +} + +void EluTorch::run(MNN::OpT* dstOp, const torch::jit::Node* node, TorchScope* scope) { + auto param = new MNN::ELUT; + if (node->inputs().size() > 1) { + param->alpha = getValue(node->input(1)); + } else { + param->alpha = 1.0f; + } + dstOp->main.value = param; + return; +} + +REGISTER_CONVERTER(EluTorch, elu); diff --git a/tools/cpp/CMakeLists.txt b/tools/cpp/CMakeLists.txt index c560fb401..2bb120f81 100644 --- a/tools/cpp/CMakeLists.txt +++ b/tools/cpp/CMakeLists.txt @@ -60,8 +60,6 @@ list(APPEND MNN_CPP_TOOLS testTrain.out) add_executable(fuseTest ${CMAKE_CURRENT_LIST_DIR}/fuseTest.cpp) list(APPEND MNN_CPP_TOOLS fuseTest) -add_executable(LoRA ${CMAKE_CURRENT_LIST_DIR}/LoRA.cpp) -list(APPEND MNN_CPP_TOOLS LoRA) foreach(TARGET ${MNN_CPP_TOOLS}) target_link_libraries(${TARGET} ${MNN_DEPS}) diff --git a/tools/cpp/LoRA.cpp b/tools/cpp/LoRA.cpp deleted file mode 100644 index 558e0fa08..000000000 --- a/tools/cpp/LoRA.cpp +++ /dev/null @@ -1,272 +0,0 @@ -// -// LoRA.cpp -// MNN -// -// Created by MNN on 2024/03/15. 
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include "LoRA.hpp" -#include "core/CommonCompute.hpp" -#include "core/MemoryFormater.h" -#include "core/IDSTDecoder.hpp" -#include "core/IDSTEncoder.hpp" -#include "core/ConvolutionCommon.hpp" - -#include -#include -#include - -int SymmetricQuantizeWeight(const float* weight, const int size, int8_t* quantizedWeight, float* scale, - const int channels, float weightClampValue) { - const int channelStride = size / channels; - const int quantizedMaxValue = weightClampValue; - - for (int c = 0; c < channels; ++c) { - const auto weightChannelStart = weight + c * channelStride; - auto quantizedWeightChannelStart = quantizedWeight + c * channelStride; - auto minmaxValue = std::minmax_element(weightChannelStart, weightChannelStart + channelStride); - const float dataAbsMax = std::fmax(std::fabs(*minmaxValue.first), std::fabs(*minmaxValue.second)); - - float scaleDataToInt8 = 1.0f; - if (dataAbsMax == 0) { - scale[c] = 0.0f; - } else { - scale[c] = dataAbsMax / quantizedMaxValue; - scaleDataToInt8 = quantizedMaxValue / dataAbsMax; - } - - for (int i = 0; i < channelStride; ++i) { - const int32_t quantizedInt8Value = static_cast(roundf(weightChannelStart[i] * scaleDataToInt8)); - quantizedWeightChannelStart[i] = - std::min(quantizedMaxValue, std::max(-quantizedMaxValue, quantizedInt8Value)); - } - } - - return 0; -} - -std::unique_ptr LoRA::load_model(const char* name) { - std::ifstream inputFile(name, std::ios::binary); - inputFile.seekg(0, std::ios::end); - const auto size = inputFile.tellg(); - inputFile.seekg(0, std::ios::beg); - - char* buffer = new char[size]; - inputFile.read(buffer, size); - inputFile.close(); - auto net = MNN::UnPackNet(buffer); - delete[] buffer; - MNN_ASSERT(net->oplists.size() > 0); - return net; -} - -static float findAbsMax(const float *weights, const int count) { - float absMax = fabs(weights[0]); - for (int i = 1; i < count; i++) { - float value = fabs(weights[i]); - if (value > absMax) { - absMax = value; - } - } - - return absMax; -} - -static std::vector findMinMax(const float *weights, const int count) { - float min = weights[0]; - float max = weights[0]; - - for (int i = 1; i < count; i++) { - float value = weights[i]; - if (value > max) { - max = value; - } - if (value < min) { - min = value; - } - } - - return {min, max}; -} - - -LoRA::LoRA(const char* origin_model, const char* lora_model) { - mMNNNet = std::move(load_model(origin_model)); - mLoRANet = std::move(load_model(lora_model)); - mExternalFile.reset(new std::fstream(std::string(origin_model) + ".weight", std::ios::in | std::ios::out | std::ios::binary)); - if (mExternalFile->bad()) { - mExternalFile.reset(nullptr); - } -} - -LoRA::~LoRA() { -} - -std::vector split(const std::string& name, char delimiter) { - std::vector tokens; - std::string token; - std::istringstream tokenStream(name); - while (std::getline(tokenStream, token, delimiter)) { - tokens.push_back(token); - } - return tokens; -} - -inline MNN::Express::VARP OpT2Const(MNN::OpT* op) { - return MNN::Express::Variable::create(MNN::Express::Expr::create(op, {}, 1)); -} - -inline MNN::Express::VARP computeLoRA(MNN::OpT *lora_A, MNN::OpT *lora_B) { - auto A = MNN::Express::_Cast(OpT2Const(lora_A), halide_type_of()); - auto B = MNN::Express::_Cast(OpT2Const(lora_B), halide_type_of()); - auto scale = MNN::Express::_Scalar(4.0 * 5); - auto lora = 
MNN::Express::_Multiply(MNN::Express::_MatMul(B, A), scale); - // lora = MNN::Express::_Transpose(lora, {1, 0}); - return lora; -} - -void LoRA::apply_external(MNN::OpT* op, MNN::OpT* lora_A, MNN::OpT* lora_B) { - // lora origin weight - auto result = std::make_shared(); - auto param = op->main.AsConvolution2D(); - int ic = param->common->inputCount; - int oc = param->common->outputCount; - auto buffer_size = param->external[1]; - auto alpha_size = param->external[2]; - result->weight.reset(buffer_size); - result->alpha.reset(alpha_size / sizeof(float)); - mExternalFile->seekg(param->external[0]); - mExternalFile->read(reinterpret_cast(result->weight.get()), buffer_size); - mExternalFile->read(reinterpret_cast(result->alpha.get()), alpha_size); - auto& quan = param->quanParameter; - size_t weightLength = 0; - auto ptr = reinterpret_cast(result->weight.get()); - std::unique_ptr loader(new MemoryLoader(ptr)); - auto new_ptr = IDSTDecoder::ReadQuanData_c(loader.get(), &weightLength, result.get(), quan->shapeInt32, false); - result->weight.set(new_ptr, weightLength); - result->weightFloat.reset(weightLength); - // dequant to float - bool oldType4 = (quan->type == 4 && quan->aMin == 0 && std::abs(quan->quantScale) < 1e-6); - if (quan->readType != 0 || oldType4) { - result->asymmetric = true; - float clampMin = quan->aMin == 0 ? -128 : quan->aMin; - for (int o = 0; o < oc; ++o) { - float min = result->alpha.get()[2 * o]; - float alpha = result->alpha.get()[2 * o + 1]; - min = min - clampMin * alpha; - auto dstW = result->weightFloat.get() + o * ic; - auto srcW = result->weight.get() + o * ic; - for (int v=0; v < ic; ++v) { - dstW[v] = (float)srcW[v] * alpha + min; - } - } - } else { - result->asymmetric = false; - for (int o = 0; o < oc; ++o) { - float alpha = result->alpha.get()[o]; - auto dstW = result->weightFloat.get() + o * ic; - auto srcW = result->weight.get() + o * ic; - for (int v=0; v < ic; ++v) { - dstW[v] = (float)srcW[v] * alpha; - } - } - } - result->weight.release(); - result->alpha.release(); - auto weight = Express::_Const(result->weightFloat.get(), {oc, ic}); - auto lora = computeLoRA(lora_A, lora_B); - result->weightFloat.release(); - weight = Express::_Add(weight, lora); - // weight = Express::_Subtract(weight, lora); - // quant - int bits = 4; - float threshold = (float)(1 << (bits - 1)) - 1.0f; - auto clampMin = quan->aMin; - std::vector scales; - std::vector quantWeights; - if (result->asymmetric) { - scales.resize(oc*2); - for (int o = 0; o < oc; ++o) { - const float* ptr = weight->readMap() + o * ic; - auto minAndMax = findMinMax(ptr, ic); - float min = minAndMax[0]; - float max = minAndMax[1]; - float scale = (max - min) / (threshold - clampMin); - - scales[2*o] = min; - scales[2*o+1] = scale; - /* - for (int ii = 0; ii < partWeightSize; ii++) { - int8_t quantValue = int8_t(std::round((ptr[ii] - min) / scale + clampMin)); - quantWeights.emplace_back(quantValue); - } - */ - } - } - auto res = IDSTEncoder::encode(weight->readMap(), scales, ic, oc, result->asymmetric, /*quantWeights.data()*/nullptr, int(clampMin), bits, false); - mExternalFile->seekp(param->external[0]); - mExternalFile->write(reinterpret_cast(res->buffer.data()), buffer_size); - mExternalFile->write(reinterpret_cast(res->alpha.data()), alpha_size); -} - -void LoRA::apply_lora() { - std::set lora_keys; - std::map> loras; - for (int i = 0; i < mLoRANet->oplists.size(); i+= 2) { - auto& op_A = mLoRANet->oplists[i]; - auto& op_B = mLoRANet->oplists[i + 1]; - auto tokens = split(op_A->name, '/'); - auto layer 
= tokens[4]; - auto key = tokens[6]; - lora_keys.insert(key); - loras[layer + key] = std::make_pair(op_A.get(), op_B.get()); - } - for (auto& op : mMNNNet->oplists) { - if (op->type == MNN::OpType_Convolution) { - bool has_lora = false; - for (auto key : lora_keys) { - if (op->name.find(key) != std::string::npos) { - has_lora = true; - break; - } - } - if (!has_lora) continue; - auto tokens = split(op->name, '/'); - auto layer = split(tokens[1], '.')[1]; - auto key = tokens[3]; - auto lora = loras[layer + key]; - apply_external(op.get(), lora.first, lora.second); - } - } - mExternalFile->flush(); - mExternalFile->close(); -} - -int main(int argc, char *argv[]) { - if (argc < 3) { - MNN_ERROR("Usage: ./LoRA ${origin.mnn} ${lora.mnn}\n"); - return 0; - } - const char* origin_model = argv[1]; - const char* lora_model = argv[2]; - auto lora = std::unique_ptr(new LoRA(origin_model, lora_model)); - auto st = std::chrono::system_clock::now(); - lora->apply_lora(); - auto et = std::chrono::system_clock::now(); - auto lora_during = std::chrono::duration_cast(et - st).count() * 1e-6; - printf("### total time = %.2f s\n", lora_during); - return 0; -} diff --git a/tools/cpp/LoRA.hpp b/tools/cpp/LoRA.hpp deleted file mode 100644 index 2fecd587e..000000000 --- a/tools/cpp/LoRA.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// -// LoRA.hpp -// MNN -// -// Created by MNN on 2024/03/15. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef LORA_HPP -#define LORA_HPP - -#include "MNN_generated.h" - -class LoRA { -public: - LoRA(const char* originalModelFileName, const char* loraModelFileName); - ~LoRA(); - void* getBuffer() const; - const size_t getBufferSize() const; - void apply_lora(); - void revert_lora(); -private: - std::unique_ptr load_model(const char* name); - void apply_external(MNN::OpT* conv, MNN::OpT* lora_A, MNN::OpT* lora_B); -private: - LoRA(); - std::unique_ptr mMNNNet, mLoRANet; - std::unique_ptr mExternalFile; - void packMNNNet(); -}; - -#endif // LORA_HPP diff --git a/tools/cpp/ModuleBasic.cpp b/tools/cpp/ModuleBasic.cpp index 6f0eb2538..adcea4937 100644 --- a/tools/cpp/ModuleBasic.cpp +++ b/tools/cpp/ModuleBasic.cpp @@ -331,9 +331,14 @@ int main(int argc, char *argv[]) { auto inputName = inputNames[i]; // Resize auto shapeIter = inputShape.find(inputName); + auto order = mInfo->inputs[i].order; + if (MNN::Express::Dimensionformat::NC4HW4 == mInfo->inputs[i].order) { + order = MNN::Express::Dimensionformat::NCHW; + } + if (shapeIter != inputShape.end()) { auto s = shapeIter->second; - inputs[i] = _Input(s, mInfo->defaultFormat, mInfo->inputs[i].type); + inputs[i] = _Input(s, order, mInfo->inputs[i].type); } auto info = inputs[i]->getInfo(); if (info->type == halide_type_of()){ @@ -346,7 +351,9 @@ int main(int argc, char *argv[]) { auto temp = _Cast(floatVar, info->type); inputs[i]->input(temp); } - inputs[i] = _Convert(inputs[i], mInfo->inputs[i].order); + if (MNN::Express::Dimensionformat::NC4HW4 == mInfo->inputs[i].order) { + inputs[i] = _Convert(inputs[i], MNN::Express::Dimensionformat::NC4HW4); + } } } #undef LOAD_DATA diff --git a/tools/cpp/backendTest.cpp b/tools/cpp/backendTest.cpp index 18637f989..c77f2c517 100644 --- a/tools/cpp/backendTest.cpp +++ b/tools/cpp/backendTest.cpp @@ -17,11 +17,63 @@ #include #include #include +#include "MNN_generated.h" +#include #include #include #include #include "core/TensorUtils.hpp" +#include "core/Session.hpp" #include "rapidjson/document.h" +typedef std::vector>> OUTPUTCONFIG; + +static OUTPUTCONFIG _getAllOutputs(const MNN::Net* net, const 
MNN::Session* session) { + auto info = session->getPipelineInfo(0); + std::vector>> res; + auto tensorName = net->tensorName(); + auto oplist = net->oplists(); + if (nullptr == oplist || nullptr == tensorName) { + FUNC_PRINT(1); + return res; + } + for (int i=0; itype() == MNN::OpType_Const || op->type() == MNN::OpType_TrainableParam || op->type() == MNN::OpType_Input) { + continue; + } + if (nullptr == op->outputIndexes() || op->outputIndexes()->size() == 0) { + continue; + } + std::vector outputNames(op->outputIndexes()->size()); + for (int v=0; voutputIndexes()->size(); ++v) { + auto index = op->outputIndexes()->data()[v]; + outputNames[v] = tensorName->GetAsString(index)->str(); + } + res.emplace_back(std::make_pair(op->name()->str(), outputNames)); + } + return res; +} +static std::vector _getAllInputs(const MNN::Net* net) { + auto tensorName = net->tensorName(); + auto oplist = net->oplists(); + std::vector res; + if (nullptr == oplist || nullptr == tensorName) { + FUNC_PRINT(1); + return res; + } + for (int i=0; isize(); ++i) { + auto op = oplist->GetAs(i); + if (op->type() == MNN::OpType_Input) { + auto index = op->outputIndexes()->data()[0]; + res.emplace_back(tensorName->GetAsString(index)->str()); + } + } + return res; +} template inline T stringConvert(const char* number) { @@ -47,114 +99,91 @@ static void _zeroInputs(const Interpreter* net, const Session* session) { inputTensor->copyFromHostTensor(&tempTensor); } } -static void compareForwadType(Interpreter* net, MNNForwardType expectType, MNNForwardType compareType, float tolerance, +static void compareForwadType(OUTPUTCONFIG outputNames, Interpreter* net, MNNForwardType expectType, MNNForwardType compareType, float tolerance, const std::map>& inputs, const std::string& stopOp, BackendConfig::PrecisionMode precision, int modeNum) { - std::map>> correctResult; - int index; - MNN::ScheduleConfig expectConfig, compareConfig; - BackendConfig backendConfig; - backendConfig.precision = precision; - expectConfig.type = expectType; - compareConfig.type = compareType; - compareConfig.backendConfig = &backendConfig; - compareConfig.mode = modeNum; - auto expectSession = net->createSession(expectConfig); - auto compareSession = net->createSession(compareConfig); - _zeroInputs(net, expectSession); - _zeroInputs(net, compareSession); - bool allCorrect = true; + auto inputNames = _getAllInputs(MNN::GetNet(net->getModelBuffer().first)); + for (int v=0; v& t, const OperatorInfo* op) { - if (op->name() == stopOp) { - return false; - } - return true; - }; - MNN::TensorCallBackWithInfo saveExpect = [&](const std::vector& t, const OperatorInfo* op) { - if (op->name() == stopOp) { - return false; - } - if (op->name().empty()) { - return true; - } - if (op->type() == "Raster") { - return true; + compareConfig.type = compareType; + compareConfig.backendConfig = &backendConfig; + compareConfig.mode = modeNum; + compareConfig.path.inputs = inputNames; + compareConfig.path.outputs = outputName; + compareConfig.saveTensors = outputName; + compareConfig.path.mode = MNN::ScheduleConfig::Path::Tensor; + auto expectSession = net->createSession(expectConfig); + auto compareSession = net->createSession(compareConfig); + _zeroInputs(net, expectSession); + _zeroInputs(net, compareSession); + for (auto& iter : inputs) { + Tensor* expectInput = net->getSessionInput(expectSession, iter.first.empty() ? NULL : iter.first.c_str()); + expectInput->copyFromHostTensor(iter.second.get()); + Tensor* compareInput = net->getSessionInput(compareSession, iter.first.empty() ? 
NULL : iter.first.c_str()); + compareInput->copyFromHostTensor(iter.second.get()); } - std::vector> tensors(t.size()); - for (int i=0; ielementSize() <= 0) { - continue; - } - if (tensor->buffer().device == 0 && tensor->buffer().host == nullptr) { - continue; + net->runSession(expectSession); + net->runSession(compareSession); + bool allCorrect = true; + bool outputValid = false; + auto compare = [&]() { + for(auto name : outputName) { + auto expectTensor = net->getSessionOutput(expectSession, name.c_str()); + if (nullptr == expectTensor || expectTensor->host() == nullptr) { + MNN_ERROR("Can't compare tensor: %s\n", name.c_str()); + continue; + } + outputValid = true; + auto compareTensor = net->getSessionOutput(compareSession, name.c_str()); + if (nullptr == compareTensor) { + MNN_ERROR("%d [%s] Tensor %s invalid\n", v, opName.c_str(), name.c_str()); + allCorrect = false; + break; + } + auto correct = TensorUtils::compareTensors(compareTensor, expectTensor, tolerance, true); + if (!correct) { + MNN_PRINT("%d [%s] Op outputs %s is error\n", v, opName.c_str(), name.c_str()); + allCorrect = false; + break; + } } - - std::shared_ptr copyTensor(new MNN::Tensor(tensor, tensor->getDimensionType())); - tensor->copyToHostTensor(copyTensor.get()); - tensors[i] = copyTensor; + }; + compare(); + if (!outputValid) { + net->releaseSession(expectSession); + net->releaseSession(compareSession); + continue; } - correctResult.insert(std::make_pair(op->name(), tensors)); - return true; - }; - MNN::TensorCallBackWithInfo compareExpect = [&](const std::vector& t, const OperatorInfo* op) { - if (op->name() == stopOp) { - return false; + + if (allCorrect) { + MNN_PRINT("Correct ! Run second pass\n"); + } else { + return; } - if (op->type() == "Raster") { - return true; + for (auto& iter : inputs) { + Tensor* compareInput = net->getSessionInput(compareSession, iter.first.empty() ? NULL : iter.first.c_str()); + compareInput->copyFromHostTensor(iter.second.get()); } - if (correctResult.find(op->name()) == correctResult.end()) { - return true; + net->runSession(compareSession); + compare(); + if (allCorrect) { + MNN_PRINT("Correct for %d, name=%s\n", v, opName.c_str()); + } else { + return; } - auto correctTensors = correctResult[op->name()]; - for (int i=0; ielementSize() <= 0) { - continue; - } - if (tensor->buffer().device == 0 && tensor->buffer().host == nullptr) { - continue; - } - - tensor->wait(MNN::Tensor::MAP_TENSOR_READ, false); - std::shared_ptr copyTensor(new MNN::Tensor(tensor, tensor->getDimensionType())); - tensor->copyToHostTensor(copyTensor.get()); - auto expectTensor = correctTensors[i]; - auto correct = TensorUtils::compareTensors(copyTensor.get(), expectTensor.get(), tolerance, true); - if (!correct) { - MNN_PRINT("%s - %d is error\n", op->name().c_str(), i); - allCorrect = false; - } - } - return allCorrect; - }; - - for (auto& iter : inputs) { - Tensor* expectInput = net->getSessionInput(expectSession, iter.first.empty() ? NULL : iter.first.c_str()); - expectInput->copyFromHostTensor(iter.second.get()); - Tensor* compareInput = net->getSessionInput(compareSession, iter.first.empty() ? NULL : iter.first.c_str()); - compareInput->copyFromHostTensor(iter.second.get()); - } - correctResult.clear(); - net->runSessionWithCallBackInfo(expectSession, beginCallBack, saveExpect); - index = 0; - net->runSessionWithCallBackInfo(compareSession, beginCallBack, compareExpect); - if (allCorrect) { - MNN_PRINT("Correct ! 
Run second pass\n"); - } else { - return; - } - _zeroInputs(net, compareSession); - index = 0; - for (auto& iter : inputs) { - Tensor* compareInput = net->getSessionInput(compareSession, iter.first.empty() ? NULL : iter.first.c_str()); - compareInput->copyFromHostTensor(iter.second.get()); - } - net->runSessionWithCallBackInfo(compareSession, beginCallBack, compareExpect); - if (allCorrect) { - MNN_PRINT("Correct !\n"); + net->releaseSession(expectSession); + net->releaseSession(compareSession); } + MNN_PRINT("Correct !\n"); } int main(int argc, const char* argv[]) { @@ -288,8 +317,10 @@ int main(int argc, const char* argv[]) { stopOp = argv[6]; } FUNC_PRINT_ALL(stopOp.c_str(), s); + auto outputNames = _getAllOutputs(MNN::GetNet(net->getModelBuffer().first), session); + net->releaseSession(session); - compareForwadType(net.get(), MNN_FORWARD_CPU, type, tolerance, inputs, stopOp, precision, modeNum); + compareForwadType(outputNames, net.get(), MNN_FORWARD_CPU, type, tolerance, inputs, stopOp, precision, modeNum); return 0; } diff --git a/tools/cpp/testModel.cpp b/tools/cpp/testModel.cpp index 30d2d5c2c..1932f1abc 100644 --- a/tools/cpp/testModel.cpp +++ b/tools/cpp/testModel.cpp @@ -147,7 +147,8 @@ int main(int argc, const char* argv[]) { void* host = inputTensor->map(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType()); if(host != nullptr) { - ::memset(host, 0, inputTensor->size()); + // TODO: Find better way to memset zero + ::memset(host, 0, MNN::TensorUtils::getRawSize(inputTensor) * inputTensor->getType().bytes()); } inputTensor->unmap(MNN::Tensor::MAP_TENSOR_WRITE, inputTensor->getDimensionType(), host); } diff --git a/tools/quantization/calibration.cpp b/tools/quantization/calibration.cpp index 9551b7d6f..8814ac452 100644 --- a/tools/quantization/calibration.cpp +++ b/tools/quantization/calibration.cpp @@ -326,12 +326,14 @@ Calibration::Calibration(MNN::NetT* model, const uint8_t* modelBuffer, const int } DLOG(INFO) << "feature_clamp_value: " << _featureClampValue; DLOG(INFO) << "weight_clamp_value: " << _weightClampValue; - if (picObj.HasMember("winogradOpt") && picObj["winogradOpt"].GetBool() == true) { - if (_featureQuantizeMethod == "EMA") { - _winogradOpt = true; - } else { - DLOG(ERROR) << "winogradOpt only be available under EMA"; - } + if (_featureQuantizeMethod == "EMA") { + _winogradOpt = true; + } else { + DLOG(INFO) << "winogradOpt only be available under EMA"; + } + if (picObj.HasMember("winogradOpt") && picObj["winogradOpt"].GetBool() == false) { + DLOG(INFO) << "Close winogradOpt because set winogradOpt as false"; + _winogradOpt = false; } if (picObj.HasMember("skip_quant_op_names")) { auto skip_quant_op_names = picObj["skip_quant_op_names"].GetArray(); diff --git a/tools/script/make_test_for_mnn.py b/tools/script/make_test_for_mnn.py index 8e1b8cba0..8efd1b991 100644 --- a/tools/script/make_test_for_mnn.py +++ b/tools/script/make_test_for_mnn.py @@ -3,6 +3,17 @@ import MNN.numpy as np import sys import os +def makeDirForPath(filename): + if filename.find('/') < 0: + return + names = filename.split('/') + dirname = "" + for l in range(0, len(names)-1): + dirname = dirname + names[l] + '/' + print(dirname) + if os.path.exists(dirname): + return + os.makedirs(dirname) def run(): if len(sys.argv) < 3: @@ -30,7 +41,7 @@ def run(): dims = var.shape for j in range(0, len(dims)): if dims[j] == -1: - dims[j] = 20 + dims[j] = 1 input['shape'] = dims dformat = var.data_format var = np.random.random(dims) @@ -38,7 +49,9 @@ def run(): var = var * 10.0 var = 
var.astype(dtype) data = var.read().flatten() - with open(os.path.join(outputDir, input['name'] + '.txt'), 'w') as f: + fname = os.path.join(outputDir, input['name'] + '.txt') + makeDirForPath(fname) + with open(fname, 'w') as f: for floatValue in data: f.write('%f\n' %floatValue) var = F.convert(var, dformat) @@ -52,8 +65,13 @@ def run(): outputs = net.forward(inputs) for i in range(0, len(outputs)): - data = outputs[i].read().flatten() - with open(os.path.join(outputDir, info['outputNames'][i] + '.txt'), 'w') as f: + out = outputs[i] + if out.data_format == F.NC4HW4: + out = F.convert(out, F.NCHW) + data = out.read().flatten() + fname = os.path.join(outputDir, info['outputNames'][i] + '.txt') + makeDirForPath(fname) + with open(fname, 'w') as f: for floatValue in data: f.write('%f\n' %floatValue) diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp index 951a1578e..8c905f814 100644 --- a/transformers/llm/engine/include/llm/llm.hpp +++ b/transformers/llm/engine/include/llm/llm.hpp @@ -28,6 +28,12 @@ namespace Transformer { class Tokenizer; class Pipeline; class LlmConfig; +class DiskEmbedding; + +enum TuneType { + // op encoder number for commit + OP_ENCODER_NUMBER = 0, +}; class MNN_PUBLIC Llm { using PromptItem = std::pair; // @@ -38,6 +44,7 @@ class MNN_PUBLIC Llm { void chat(); void reset(); void trace(bool start); + void tuning(TuneType type, std::vector candidates); virtual void load(); MNN::Express::VARP forward(const std::vector& input_ids); int sample(MNN::Express::VARP logits, const std::vector& pre_ids); @@ -57,6 +64,10 @@ class MNN_PUBLIC Llm { Llm* create_lora(const std::string& lora_path); bool release_module(size_t index); bool select_module(size_t index); + // tokenier function + bool is_stop(int token_id); + std::string tokenizer_decode(int id); + virtual std::vector tokenizer_encode(const std::string& query, bool use_template = true); friend class Pipeline; public: // forward info @@ -72,6 +83,7 @@ class MNN_PUBLIC Llm { protected: std::shared_ptr config_; std::shared_ptr tokenizer_; + std::shared_ptr disk_embedding_; std::vector key_value_shape_ = {}; std::vector past_key_values_; MNN::Express::VARP inputs_embeds_, attention_mask_, position_ids_; @@ -80,9 +92,6 @@ class MNN_PUBLIC Llm { std::vector> prefill_modules_, decode_modules_, current_modules_; const MNN::Express::Module* base_module_ = nullptr; void init_runtime(); - std::string decode(int id); - bool is_stop(int token_id); - virtual std::vector tokenizer(const std::string& query); virtual MNN::Express::VARP embedding(const std::vector& input_ids); virtual MNN::Express::VARP gen_attention_mask(int seq_len); virtual MNN::Express::VARP gen_position_ids(int seq_len); @@ -100,7 +109,6 @@ class Embedding : public Llm { MNN::Express::VARP txt_embedding(const std::string& txt); int dim() const; private: - virtual std::vector tokenizer(const std::string& query) override; virtual MNN::Express::VARP gen_attention_mask(int seq_len) override; virtual MNN::Express::VARP gen_position_ids(int seq_len) override; }; diff --git a/transformers/llm/engine/llm_demo.cpp b/transformers/llm/engine/llm_demo.cpp index 1200957c0..593e622fa 100644 --- a/transformers/llm/engine/llm_demo.cpp +++ b/transformers/llm/engine/llm_demo.cpp @@ -13,6 +13,7 @@ #include #include using namespace MNN::Transformer; + static void trace_prepare(Llm* llm) { MNN_PRINT("Prepare for resize opt Begin\n"); llm->trace(true); @@ -22,6 +23,12 @@ static void trace_prepare(Llm* llm) { llm->trace(false); } +static void 
tuning_prepare(Llm* llm) { + MNN_PRINT("Prepare for tuning opt Begin\n"); + llm->tuning(OP_ENCODER_NUMBER, {1, 5, 10, 20, 30, 50, 100}); + MNN_PRINT("Prepare for tuning opt End\n"); +} + std::vector> parse_csv(const std::vector& lines) { std::vector> csv_data; std::string line; @@ -177,6 +184,10 @@ int main(int argc, const char* argv[]) { AUTOTIME; trace_prepare(llm.get()); } + if (true) { + AUTOTIME; + tuning_prepare(llm.get()); + } if (argc < 3) { llm->chat(); return 0; diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp index 8b836595f..87e061767 100644 --- a/transformers/llm/engine/src/llm.cpp +++ b/transformers/llm/engine/src/llm.cpp @@ -30,6 +30,99 @@ using namespace MNN::Express; namespace MNN { namespace Transformer { +typedef void (*DequantFunction)(const uint8_t*, float*, float, float, int); + +static void q41_dequant_ref(const uint8_t* src, float* dst, float scale, float zero, int size) { + for (int i = 0; i < size / 2; i++) { + int x = src[i]; + int x1 = x / 16 - 8; + int x2= x % 16 - 8; + float w1 = x1 * scale + zero; + float w2 = x2 * scale + zero; + dst[2 * i] = w1; + dst[2 * i + 1] = w2; + } +} + +static void q81_dequant_ref(const uint8_t* src, float* dst, float scale, float zero, int size) { + for (int i = 0; i < size; i++) { + dst[i] = src[i] * scale + zero; + } +} + +class DiskEmbedding { +public: + explicit DiskEmbedding(const std::shared_ptr& config); + ~DiskEmbedding() {} + void embedding(const std::vector& input_ids, float* ptr); +private: + void seek_read(uint8_t* dst, int size, int offset); + std::unique_ptr alpha_ = nullptr; + std::unique_ptr weight_ = nullptr; + std::unique_ptr fp_; + DequantFunction dequant_; + int hidden_size_, weight_token_size_; + int w_offset_, block_num_, quant_block_, quant_bit_; +}; + +void DiskEmbedding::seek_read(uint8_t* dst, int size, int offset) { + fseek(fp_.get(), offset, SEEK_SET); + size_t bytes_read = fread(dst, 1, size, fp_.get()); + (void)bytes_read; +} + +DiskEmbedding::DiskEmbedding(const std::shared_ptr& config) : fp_(nullptr, &fclose) { + auto tie_embeddings = config->tie_embeddings(); + hidden_size_ = config->hidden_size(); + if (tie_embeddings.size() == 5) { + w_offset_ = tie_embeddings[0]; + quant_bit_ = tie_embeddings[3]; + quant_block_ = tie_embeddings[4]; + block_num_ = hidden_size_ / quant_block_; + weight_token_size_ = hidden_size_ * quant_bit_ / 8; + fp_.reset(fopen(config->llm_weight().c_str(), "rb")); + // TODO: optimize dequant function + dequant_ = quant_bit_ == 8 ? 
q81_dequant_ref : q41_dequant_ref; + int a_offset = tie_embeddings[1]; + int alpha_size = tie_embeddings[2]; + alpha_.reset(new uint8_t[alpha_size]); + seek_read(alpha_.get(), alpha_size, a_offset); + } else { + weight_token_size_ = hidden_size_ * sizeof(int16_t); + fp_.reset(fopen(config->embedding_file().c_str(), "rb")); + } + weight_.reset(new uint8_t[weight_token_size_]); +} + +void DiskEmbedding::embedding(const std::vector& input_ids, float* dst) { + if (alpha_.get()) { + // quant + for (size_t i = 0; i < input_ids.size(); i++) { + int token = input_ids[i]; + seek_read(weight_.get(), weight_token_size_, w_offset_ + token * weight_token_size_); + auto dptr = dst + i * hidden_size_; + auto alpha_ptr = reinterpret_cast(alpha_.get()) + token * block_num_ * 2; + for (int n = 0; n < block_num_; n++) { + auto dst_ptr = dptr + n * quant_block_; + uint8_t* src_ptr = weight_.get() + n * (quant_block_ * quant_bit_ / 8); + float zero = (alpha_ptr + n * 2)[0]; + float scale = (alpha_ptr + n * 2)[1]; + dequant_(src_ptr, dst_ptr, scale, zero, quant_block_); + } + } + } else { + // bf16 + for (size_t i = 0; i < input_ids.size(); i++) { + seek_read(weight_.get(), weight_token_size_, input_ids[i] * weight_token_size_); + int16_t* dst_ptr = reinterpret_cast(dst + i * hidden_size_); + for (int j = 0; j < hidden_size_; j++) { + dst_ptr[j * 2] = 0; + dst_ptr[j * 2 + 1] = reinterpret_cast(weight_.get())[j]; + } + } + } +} + class Lvlm : public Llm { public: Lvlm(std::shared_ptr config) : Llm(config) { @@ -42,7 +135,8 @@ class Lvlm : public Llm { } ~Lvlm() { visual_module_.reset(); } virtual void load() override; - virtual std::vector tokenizer(const std::string& query) override; + + virtual std::vector tokenizer_encode(const std::string& query, bool use_template = true) override; virtual MNN::Express::VARP embedding(const std::vector& input_ids) override; private: int image_size_ = 448, vision_start_ = 151857, vision_end_ = 151858, image_pad_ = 151859; @@ -139,15 +233,12 @@ void Llm::load() { key_value_shape_ = config_->key_value_shape(); is_single_ = config_->is_single(); attention_fused_ = config_->attention_fused(); - { - std::ifstream embedding_bin(config_->embedding_file()); - embedding_bin.close(); - } MNN_PRINT("### is_single_ = %d\n", is_single_); // 1. load vocab MNN_PRINT("load tokenizer\n"); tokenizer_.reset(Tokenizer::createTokenizer(config_->tokenizer_file())); MNN_PRINT("load tokenizer Done\n"); + disk_embedding_.reset(new DiskEmbedding(config_)); // 3. 
load model Module::Config module_config; module_config.shapeMutable = true; @@ -254,6 +345,41 @@ void Llm::trace(bool start) { mTracing = start; } +void Llm::tuning(TuneType type, std::vector candidates) { + if(type != OP_ENCODER_NUMBER) { + MNN_ERROR("tuning type not supported\n"); + return; + } + if(config_->backend_type() != "metal") { + return; + } + + current_modules_ = decode_modules_; + int64_t min_time = INT64_MAX; + int prefer_candidate = 10; + for(auto& candidate : candidates) { + runtime_manager_->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, candidate); + + auto st = std::chrono::system_clock::now(); + auto logits = forward({0}); + if (nullptr == logits.get()) { + return; + } + if (logits->getInfo()->size == 0) { + return; + } + auto token = sample(logits, {}); + auto et = std::chrono::system_clock::now(); + int64_t time = std::chrono::duration_cast(et - st).count(); + if(time < min_time) { + prefer_candidate = candidate; + min_time = time; + //MNN_PRINT("op encode number:%d, decode time: %lld us\n", candidate, time); + } + } + runtime_manager_->setHint(MNN::Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT, prefer_candidate); +} + VARP Llm::forward(const std::vector& input_ids) { int seq_len = input_ids.size(); auto attention_mask = gen_attention_mask(seq_len); @@ -390,6 +516,7 @@ void Llm::generate_init() { all_seq_len_ = 0; history_ids_.clear(); } + current_modules_ = prefill_modules_; } std::vector Llm::generate(const std::vector& input_ids, int max_new_tokens) { @@ -443,7 +570,7 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c int token = sample(logits, history_ids_); auto et = std::chrono::system_clock::now(); current_modules_ = decode_modules_; - std::string output_str = decode(token); + std::string output_str = tokenizer_decode(token); prefill_us_ = std::chrono::duration_cast(et - st).count(); *os << output_str << std::flush; while (gen_seq_len_ < config_->max_new_tokens()) { @@ -464,7 +591,7 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c *os << end_with << std::flush; break; } - auto word = decode(token); + auto word = tokenizer_decode(token); *os << word << std::flush; output_str += word; } @@ -475,7 +602,10 @@ std::string Llm::generate(const std::vector& input_ids, std::ostream* os, c return output_str; } -std::vector Llm::tokenizer(const std::string& user_content) { +std::vector Llm::tokenizer_encode(const std::string& user_content, bool use_template) { + if (!use_template) { + return tokenizer_->encode(user_content); + } auto prompt = apply_prompt_template(user_content); auto input_ids = tokenizer_->encode(prompt); return input_ids; @@ -492,7 +622,7 @@ std::string Llm::response(const std::string& user_content, std::ostream* os, con } input_ids = tokenizer_->encode(prompt); } else { - input_ids = tokenizer(user_content); + input_ids = tokenizer_encode(user_content); } return generate(input_ids, os, end_with); } @@ -571,31 +701,17 @@ static inline bool needNewVar(VARP var, int axis, int seq_len) { VARP Llm::embedding(const std::vector& input_ids) { AUTOTIME; - // disk embedding to save memory int hidden_size = config_->hidden_size(); int seq_len = static_cast(input_ids.size()); if (needNewVar(inputs_embeds_, 0, seq_len)) { inputs_embeds_ = _Input({seq_len, 1, hidden_size}, NCHW); } - - size_t size = hidden_size * sizeof(int16_t); - FILE* file = fopen(config_->embedding_file().c_str(), "rb"); - std::unique_ptr buffer(new int16_t[hidden_size]); - for (size_t i = 0; i < seq_len; i++) { - fseek(file, 
input_ids[i] * size, SEEK_SET); - size_t bytes_read = fread(buffer.get(), 1, size, file); - (void)bytes_read; - auto ptr = inputs_embeds_->writeMap() + i * hidden_size * 2; - for (int j = 0; j < hidden_size; j++) { - ptr[j * 2] = 0; - ptr[j * 2 + 1] = buffer[j]; - } - } - fclose(file); + // disk embedding to save memory + disk_embedding_->embedding(input_ids, inputs_embeds_->writeMap()); return inputs_embeds_; } -std::string Llm::decode(int id) { +std::string Llm::tokenizer_decode(int id) { std::string word = tokenizer_->decode(id); // Fix utf-8 garbled characters if (word.length() == 6 && word[0] == '<' && word[word.length()-1] == '>' && word[1] == '0' && word[2] == 'x') { @@ -751,7 +867,7 @@ std::vector Lvlm::image_process(const std::string& image_info) { #endif } -std::vector Lvlm::tokenizer(const std::string& query) { +std::vector Lvlm::tokenizer_encode(const std::string& query, bool use_template) { auto prompt = apply_prompt_template(query); // split query std::regex img_regex("(.*?)"); @@ -859,13 +975,7 @@ VARP Embedding::ids_embedding(const std::vector& ids) { } VARP Embedding::txt_embedding(const std::string& txt) { - return ids_embedding(tokenizer(txt)); -} - -std::vector Embedding::tokenizer(const std::string& query) { - auto prompt = apply_prompt_template(query); - auto ids = tokenizer_->encode(prompt); - return ids; + return ids_embedding(tokenizer_encode(txt)); } VARP Embedding::gen_attention_mask(int seq_len) { diff --git a/transformers/llm/engine/src/llmconfig.hpp b/transformers/llm/engine/src/llmconfig.hpp index 78bd3bc61..6562b7692 100644 --- a/transformers/llm/engine/src/llmconfig.hpp +++ b/transformers/llm/engine/src/llmconfig.hpp @@ -318,6 +318,10 @@ class LlmConfig { std::string prompt_template() const { return llm_config_.value("prompt_template", ""); } + + std::vector tie_embeddings() const { + return llm_config_.value("tie_embeddings", std::vector{}); + } // llm model config end > }; } // Transformer diff --git a/transformers/llm/eval/evaluate_perplexity.py b/transformers/llm/eval/evaluate_perplexity.py new file mode 100644 index 000000000..7b467bb58 --- /dev/null +++ b/transformers/llm/eval/evaluate_perplexity.py @@ -0,0 +1,68 @@ +import os +import argparse +from tqdm import tqdm +import MNN.llm as mnnllm +from datasets import load_dataset +import torch +import copy + +def main(args): + # load model + model = mnnllm.create(args.mnn_path) + model.load() + + # load dataset + eval_dataset = args.eval_dataset + dataset_name = eval_dataset.split("/")[0] + dataset_dir = eval_dataset.split("/")[1] + + dataset = load_dataset(dataset_name, dataset_dir, split="test") + input_ids = model.tokenizer_encode("\n\n".join(dataset["text"])) + stride = 512 + context_length = stride + stride // 2 + seq_len = len(input_ids) + # seq_len = 10240 + + nlls = [] + prev_end_loc = 0 + criterion = torch.nn.CrossEntropyLoss() + for begin_loc in tqdm(range(0, seq_len, stride)): + end_loc = min(begin_loc + context_length, seq_len) + chunk_ids = input_ids[begin_loc:end_loc] + logits = model.forward(chunk_ids) + npy_logits = copy.deepcopy(logits.read()) + logits = torch.from_numpy(npy_logits).squeeze(0) + # logits = torch.from_numpy(logits.read()).squeeze(0) # crash when opencl + + target_ids = torch.tensor(chunk_ids) + trg_len = end_loc - prev_end_loc + target_ids[:-trg_len] = -100 + neg_log_likelihood = criterion(logits[:-1, :], target_ids[1:]) + nlls.append(neg_log_likelihood) + + prev_end_loc = end_loc + if end_loc == seq_len: + break + + perplexity = torch.exp(torch.stack(nlls).mean()) + 
print(f"Perplexity: {perplexity}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Evaluate mnn perplexity.") + parser.add_argument( + "-m", + "--mnn-path", + type=str, + required=True, + help="mnn model path", + ) + + # Provide extra arguments required for tasks + group = parser.add_argument_group(title="Evaluation options") + group.add_argument( + "-d", "--eval_dataset", type=str, default='wikitext/wikitext-2-raw-v1', help="Evaluation dataset, default is `wikitext/wikitext-2-raw-v1`." + ) + + args = parser.parse_args() + + main(args) diff --git a/transformers/llm/export/README.md b/transformers/llm/export/README.md index 233f738ad..136f1329f 100644 --- a/transformers/llm/export/README.md +++ b/transformers/llm/export/README.md @@ -77,7 +77,7 @@ options: --dst_path DST_PATH export onnx/mnn model to path, defaut is `./model`. --test TEST test model inference with query `TEST`. --export EXPORT export model to an onnx/mnn model. - --skip_slim Whether or not to skip onnx-slim. + --onnx_slim Whether or not to use onnx-slim. --quant_bit QUANT_BIT mnn quant bit, 4 or 8, default is 4. --quant_block QUANT_BLOCK diff --git a/transformers/llm/export/llmexport.py b/transformers/llm/export/llmexport.py index 904128bb5..efb58927b 100644 --- a/transformers/llm/export/llmexport.py +++ b/transformers/llm/export/llmexport.py @@ -1,4 +1,5 @@ import os +import gc import sys import math import copy @@ -6,17 +7,21 @@ import time import base64 import logging +import inspect import warnings import argparse import functools -from typing import Optional, Tuple +import traceback +from collections import defaultdict +from typing import Optional, Tuple, List, Union, Dict +from tqdm import tqdm from yaspin import yaspin import onnx import torch import numpy as np -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer RESET = "\033[0m" GREEN = "\033[32;1m" @@ -38,7 +43,7 @@ def wrapper(*args, **kwargs): result = func(*args, **kwargs) except Exception as e: spinner.fail("💥 Failed") - print(e) + traceback.print_exc() exit(1) end = time.time() during = f'[{end-start:05.2f} s]'.replace('[0', '[ ') @@ -76,17 +81,21 @@ def regist_models(self): self.defualt_map() # regist models self.regist_llama() + self.regist_mllama() self.regist_qwen() self.regist_glm() self.regist_glm2() self.regist_phi() self.regist_gemma2() + self.register_openelm() def regist_llama(self): llama_map = self.default_map self.regist('llama', llama_map) self.regist('qwen2', llama_map) self.regist('internlm', llama_map) + self.regist('mobilellm', llama_map) + # baichuan baichuan_map = copy.deepcopy(self.default_map) baichuan_map[self.attention_key] = { 'qkv_proj': 'W_pack', @@ -94,6 +103,42 @@ def regist_llama(self): } self.regist('baichuan', baichuan_map) + def regist_mllama(self): + mllama_map = { + 'config': { + 'hidden_size': 'text_config.hidden_size', + 'num_attention_heads': 'text_config.num_attention_heads', + 'num_hidden_layers': 'text_config.num_hidden_layers', + 'num_key_value_heads': 'text_config.num_key_value_heads', + 'rope_theta': 'text_config.rope_theta' + }, + 'model': { + 'lm_': 'language_model.lm_head', + 'embed_': 'language_model.model.embed_tokens', + 'blocks_': 'language_model.model.layers', + 'final_layernorm_': 'language_model.model.norm', + 'visual': 'vision_model' + }, + 'decoder': { + 'self_attn': 'self_attn', + 'cross_attn': 'cross_attn', + 'mlp': 'mlp', + 'input_layernorm': 'input_layernorm', + 
'post_attention_layernorm': 'post_attention_layernorm' + }, + 'attention': { + 'q_proj': 'q_proj', + 'k_proj': 'k_proj', + 'v_proj': 'v_proj', + 'o_proj': 'o_proj', + 'q_norm': 'q_norm', + 'k_norm': 'k_norm', + 'cross_attn_attn_gate': 'cross_attn_attn_gate', + 'cross_attn_mlp_gate': 'cross_attn_mlp_gate' + } + } + self.regist('mllama', mllama_map) + def regist_qwen(self): qwen_map = { 'config': { @@ -215,6 +260,41 @@ def regist_gemma2(self): } self.regist('gemma2', gemma2_map) + def register_openelm(self): + openelm_config = { + 'hidden_size': 'model_dim', + 'head_dim': 'head_dim', + 'num_attention_heads': 'num_query_heads', + 'num_hidden_layers': 'num_transformer_layers', + 'num_key_value_heads': 'num_kv_heads', + 'rope_theta': 'rope_freq_constant' + } + openelm_model = { + 'lm_': 'lm_head', + 'embed_': 'transformer.token_embeddings', + 'blocks_': 'transformer.layers', + 'final_layernorm_': 'transformer.norm' + } + openelm_decoder = { + 'self_attn': 'attn', + 'mlp': 'ffn', + 'input_layernorm': 'attn_norm', + 'post_attention_layernorm': 'ffn_norm' + } + openelm_attention = { + 'qkv_proj': 'qkv_proj', + 'o_proj': 'out_proj', + 'q_norm': 'q_norm', + 'k_norm': 'k_norm' + } + openelm_map = { + 'config': openelm_config, + 'model': openelm_model, + 'decoder': openelm_decoder, + 'attention': openelm_attention + } + self.regist('openelm', openelm_map) + def defualt_map(self): # default map is `LlamaForCausalLM` self.config_key = 'config' @@ -267,12 +347,821 @@ def do_map(dst, src, map): break setattr(dst, dst_attr, obj) +# Quant class + +# awq quantizer start +class AwqQuantizer: + def __init__( + self, + model, + modules_to_not_convert=None, + apply_clip=True, + n_parallel_calib_samples=None, + max_calib_samples=128, + max_calib_seq_len=512, + max_chunk_memory=1024 * 1024 * 1024, + ) -> None: + self.awq_model = model + self.model = model + self.tokenizer = model.tokenizer + self.w_bit = model.quant_bit + self.group_size = model.quant_block + self.zeropoint = not model.symmetric + self.calib_data = 'ag_news' + self.split = 'test' + self.duo_scaling = True + self.apply_clip = apply_clip + self.n_parallel_calib_samples = n_parallel_calib_samples + self.max_calib_samples = max_calib_samples + self.max_calib_seq_len = max_calib_seq_len + self.max_chunk_memory = max_chunk_memory + self.modules_to_not_convert = ( + modules_to_not_convert if modules_to_not_convert is not None else [] + ) + self.modules, self.module_kwargs, self.inps = self.init_quant( + n_samples=self.max_calib_samples, max_seq_len=self.max_calib_seq_len + ) + + def pseudo_quantize_tensor(self, w: torch.Tensor): + org_w_shape = w.shape + if self.group_size > 0: + assert org_w_shape[-1] % self.group_size == 0 + w = w.reshape(-1, self.group_size) + assert w.dim() == 2 + assert torch.isnan(w).sum() == 0 + # zero point quantization + if self.zeropoint: + max_val = w.amax(dim=1, keepdim=True) + min_val = w.amin(dim=1, keepdim=True) + offset = 1 << (self.w_bit - 1) + clip_max = offset - 1 + clip_min = -offset + scales = (max_val - min_val) / (clip_max - clip_min) + zeros = - torch.round(min_val / scales) + clip_min + qw = torch.round(w / scales) + zeros + qw = torch.clamp(qw, clip_min, clip_max) + w = (qw - zeros) * scales + zeros = min_val.view(org_w_shape[0], -1) + else: + abs_max = w.abs().amax(dim=1, keepdim=True) + offset = 1 << (self.w_bit - 1) + clip_max = offset - 1 + clip_min = -clip_max + scales = abs_max / clip_max + w = torch.clamp(torch.round(w / scales), clip_min, clip_max) * scales + zeros = None + + assert 
torch.isnan(scales).sum() == 0 + assert torch.isnan(w).sum() == 0 + + scales = scales.view(org_w_shape[0], -1) + w = w.reshape(org_w_shape) + + return w, scales, zeros + + def quantize(self): + for i in tqdm(range(len(self.modules)), desc="AWQ"): + # if i > 0: break + # Move module and inputs to correct device + common_device = next(self.modules[i].parameters()).device + if common_device is None or str(common_device) == "cpu": + best_device = AwqQuantizer.get_best_device() + + self.modules[i] = self.modules[i].to(best_device) + common_device = next(self.modules[i].parameters()).device + + if self.module_kwargs.get("position_ids") is not None: + self.module_kwargs["position_ids"] = self.module_kwargs[ + "position_ids" + ].to(common_device) + + if self.module_kwargs.get("attention_mask") is not None: + self.module_kwargs["attention_mask"] = self.module_kwargs[ + "attention_mask" + ].to(common_device) + + self.inps = self.inps.to(common_device) + # print(f'# {i} inps shape: {self.inps.shape}, inps.max: {self.inps.max()}') + + # [STEP 1]: Get layer, extract linear modules, extract input features + named_linears = AwqQuantizer.get_named_linears(self.modules[i]) + + # Filter out the linear layers we don't want to exclude + named_linears = AwqQuantizer.exclude_layers_to_not_quantize( + named_linears, self.modules_to_not_convert + ) + input_feat = self._get_input_feat(self.modules[i], named_linears) + AwqQuantizer.clear_memory() + + # [STEP 2]: Compute and apply scale list + module_config = [] + # q, k, v proj + module_config.append( + dict( + prev_op=self.modules[i].input_layernorm, + layers=[ + self.modules[i].self_attn.q_proj, + self.modules[i].self_attn.k_proj, + self.modules[i].self_attn.v_proj, + ], + inp=input_feat["self_attn.q_proj"], + module2inspect=self.modules[i].self_attn, + kwargs=self.module_kwargs, + ) + ) + # o_proj + if self.modules[i].self_attn.v_proj.weight.shape == self.modules[i].self_attn.o_proj.weight.shape: + module_config.append( + dict( + prev_op=self.modules[i].self_attn.v_proj, + layers=[self.modules[i].self_attn.o_proj], + inp=input_feat["self_attn.o_proj"], + ) + ) + # mlp gate + module_config.append( + dict( + prev_op=self.modules[i].post_attention_layernorm, + layers=[self.modules[i].mlp.gate_proj, self.modules[i].mlp.up_proj], + inp=input_feat["mlp.gate_proj"], + module2inspect=self.modules[i].mlp, + ) + ) + # mlp down + module_config.append( + dict( + prev_op=self.modules[i].mlp.up_proj, + layers=[self.modules[i].mlp.down_proj], + inp=input_feat["mlp.down_proj"], + ) + ) + scales_list = [ + self._search_best_scale(self.modules[i], **layer) + for layer in module_config + ] + # print(scales_list); exit(0) + AwqQuantizer.apply_scale(self.modules[i], scales_list, input_feat_dict=input_feat) + # [STEP 3]: Compute and apply clipping list + if self.apply_clip: + clip_list = self._search_best_clip( + self.modules[i], named_linears, input_feat + ) + AwqQuantizer.apply_clip(self.modules[i], clip_list) + + AwqQuantizer.clear_memory() + + @torch.no_grad() + def _module_forward( + self, x: torch.Tensor, module: torch.nn.Module, module_kwargs: Dict + ) -> torch.Tensor: + if self.n_parallel_calib_samples is None: + # runs through all samples at once + # print(module, x, module_kwargs); exit(0) + module_output = module(x, **module_kwargs) + if isinstance(module_output, tuple): + module_output = module_output[0] + else: + # memory efficiently runs through all calibration samples + # but only n_parallel_calib_samples at a time + module_output = [] + partitioned_inputs = 
torch.split(x, self.n_parallel_calib_samples) + for x_partial in partitioned_inputs: + partial_output = module(x_partial, **module_kwargs) + + if isinstance(partial_output, tuple): + partial_output = partial_output[0] + + module_output.append(partial_output.cpu()) + + module_output = torch.cat(module_output, dim=0) + + return module_output + + @torch.no_grad() + def _search_best_scale( + self, + module, + prev_op, + layers: List[torch.nn.Linear], + inp: torch.Tensor, + module2inspect=None, + kwargs={}, + ): + if module2inspect is None: + assert len(layers) == 1 + module2inspect = layers[0] + + if "use_cache" in kwargs: + kwargs.pop("use_cache") + + # Put x on the right device + inp = inp.to(next(module2inspect.parameters()).device) + + # [STEP 1]: Compute per-channel mean of normalised weights + # All layer weights are concatted together + weight = torch.cat([_m.weight for _m in layers], dim=0) + org_shape = weight.shape + # The weights are reshaped to be organised by quantization group + weight = weight.view(-1, self.group_size) + # Calculates the relative magnitude of the weights within each of the quantization groups, + # and rescales each group individually so that each group has weights on a 0-1 scale. + w_scale = weight.abs() / (weight.abs().amax(dim=1, keepdim=True) + 1e-6) + # Resizes the rescaled weight matrix back up to its original dimensions + w_scale = w_scale.view(org_shape) + # Gets the average rescaled magnitude for each output channel + w_mean = w_scale.mean(0) + AwqQuantizer.clear_memory(weight) + + # [STEP 2]: Compute per-channel mean of the input activation with chunking + # move inp to cpu to avoid memory leak + inp_flat = inp.cpu().abs().view(-1, inp.shape[-1]) + num_elements = inp_flat.size(0) + num_channels = inp_flat.size(1) + element_size_bytes = inp_flat.element_size() * 2 # multiplied by 2 for FP32 + + # Calculate chunk size dynamically based on max_chunk_memory + chunk_size = int(self.max_chunk_memory // (element_size_bytes * num_channels)) + chunk_size = min(chunk_size, num_elements) + + # Use float32 for sum calculation + x_sum = torch.zeros(num_channels, dtype=torch.float32, device=inp.device) + + for i in range(0, num_elements, chunk_size): + end = min(i + chunk_size, num_elements) + chunk_sum = inp_flat[i:end].to(torch.float32).sum(dim=0) + x_sum += chunk_sum.to(inp.device) + + x_mean = (x_sum / num_elements).to(inp.dtype) + AwqQuantizer.clear_memory(x_sum) + + # [STEP 3]: Compute output of module + with torch.no_grad(): + module_kwargs = self._sanitize_kwargs(kwargs, module2inspect) + fp16_output = self._module_forward(inp, module2inspect, module_kwargs) + + # [STEP 4]: Compute loss + best_scales = self._compute_best_scale( + inp, w_mean, x_mean, module2inspect, layers, fp16_output, module_kwargs + ) + + return ( + AwqQuantizer.get_op_name(module, prev_op), + tuple([AwqQuantizer.get_op_name(module, m) for m in layers]), + best_scales, + ) + + def _compute_best_scale( + self, + x: torch.Tensor, + w_mean: torch.Tensor, + x_mean: torch.Tensor, + module2inspect: torch.nn.Module, + linears2scale: List[torch.nn.Linear], + fp16_output: torch.Tensor, + kwargs: Dict={}, + ): + """ + Compute loss and select best scales + + L(s) = || Q(W * s) (s^-1 * X) - W * X || + Q: weight quantization function | pseudo_quantize_tensor(W * s) + X: inputs from calib dataset | X + W: original weights in FP16 | layer + s: per channel scaling factor | s^-1 * X + """ + n_grid = 20 + history = [] + best_ratio = -1 + best_scales = None + best_error = float("inf") + + device = x.device + 
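+        # AWQ grid search over the interpolation exponent: for each ratio r in
+        # {0, 1/n_grid, ..., (n_grid-1)/n_grid} the per-channel scale
+        # (duo_scaling is enabled here) is
+        #   s = x_mean**r / (w_mean**(1 - r) + 1e-4),
+        # normalized by sqrt(max(s) * min(s)). The weights are scaled by s,
+        # pseudo-quantized and run through the module; the ratio whose output is
+        # closest (chunked MSE) to the fp16 reference output is kept.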
x_mean = x_mean.view(-1).to(device) + w_mean = w_mean.view(-1).to(device) + + ord_weights = [] + for fc in linears2scale: + ord_weights.append(fc.weight.data.clone()) + + for ratio in range(n_grid): + # create new scales + ratio = ratio / n_grid + + # NOTE: s^-1 * x is fused here, according to paper + if self.duo_scaling: + scales = (x_mean.pow(ratio) / (w_mean.pow(1 - ratio) + 1e-4)).clamp(min=1e-4) + else: + scales = x_mean.pow(ratio).clamp(min=1e-4).view(-1) + scales = scales / (scales.max() * scales.min()).sqrt() + scales_view = scales.view(1, -1).to(device) + + # avoid scaling values that overflow + scales[torch.isinf(scales)] = 1 + scales[torch.isnan(scales)] = 1 + + # Q(W * s) + for fc in linears2scale: + fc.weight.mul_(scales_view) + fc.weight.data = ( + self.pseudo_quantize_tensor(fc.weight.data)[0] / scales_view + ) + + # W * X + int_w_output = self._module_forward(x, module2inspect, kwargs) + + # compute mean squared error (L2 norm) + loss = self._compute_loss(fp16_output, int_w_output, device) + + history.append(loss) + if loss < best_error: + best_error = loss + best_ratio = ratio + best_scales = scales.clone() + + for fc, ord_weight in zip(linears2scale, ord_weights): + fc.weight.data = ord_weight.clone() + + del ord_weights + + if best_ratio == -1: + logging.debug(history) + raise Exception + + assert torch.isnan(best_scales).sum() == 0, best_scales + + return best_scales.detach().cpu() + + @torch.no_grad() + def _compute_loss( + self, + fp16_output: torch.Tensor, + int_w_output: torch.Tensor, + device: torch.device, + ): + loss = 0.0 + fp16_output_flat = fp16_output.view(-1) + int_w_output_flat = int_w_output.view(-1) + num_elements = fp16_output_flat.size(0) + element_size_bytes = fp16_output.element_size() + + # Calculate chunk size dynamically based on max_chunk_memory + # Divide the max_chunk_memory by twice the element size + chunk_size = self.max_chunk_memory // (element_size_bytes * 2) + chunk_size = min(chunk_size, num_elements) + + # Split the computation into chunks + fp16_chunks = torch.split(fp16_output_flat, chunk_size) + int_w_chunks = torch.split(int_w_output_flat, chunk_size) + + # Compute the loss for each chunk + for fp16_chunk, int_w_chunk in zip(fp16_chunks, int_w_chunks): + chunk_loss = (fp16_chunk.to(device) - int_w_chunk.to(device)).float().pow(2).sum().item() + loss += chunk_loss + + # Normalize the loss by the total number of elements + loss /= num_elements + + return loss + + @torch.no_grad() + def _search_best_clip(self, layer, named_linears, input_feat): + clip_list = [] + avoid_clipping = ["q_", "k_", "query", "key", "Wqkv"] + + for name in named_linears: + # due to qk bmm, it is hard to clip precisely + if any([_ in name for _ in avoid_clipping]): + continue + + named_linears[name].to(AwqQuantizer.get_best_device()) + max_val = self._compute_best_clip( + named_linears[name].weight, input_feat[name] + ) + clip_list.append((name, max_val)) + named_linears[name].cpu() + + return clip_list + + @torch.no_grad() + def _compute_best_clip( + self, + w: torch.Tensor, + input_feat: torch.Tensor, + n_grid=20, + max_shrink=0.5, + n_sample_token=512, + ): + assert w.dim() == 2 + org_w_shape = w.shape + # w [co, ci] -> [co, 1, n_group, group size] + # input_feat [n_token, ci] -> [1, n_token, n_group, group size] + group_size = self.group_size if self.group_size > 0 else org_w_shape[1] + input_feat = input_feat.view(-1, input_feat.shape[-1]) + input_feat = input_feat.reshape(1, input_feat.shape[0], -1, group_size) + + # Compute input feature step size 
(minimum 1) + step_size = max(1, input_feat.shape[1] // n_sample_token) + input_feat = input_feat[:, ::step_size] + + w = w.reshape(org_w_shape[0], 1, -1, group_size) + + oc_batch_size = 256 if org_w_shape[0] % 256 == 0 else 64 # prevent OOM + assert org_w_shape[0] % oc_batch_size == 0 + w_all = w + best_max_val_all = [] + + for i_b in range(org_w_shape[0] // oc_batch_size): + w = w_all[i_b * oc_batch_size : (i_b + 1) * oc_batch_size] + + org_max_val = w.abs().amax(dim=-1, keepdim=True) # co, 1, n_group, 1 + + best_max_val = org_max_val.clone() + min_errs = torch.ones_like(org_max_val) * 1e9 + input_feat = input_feat.to(w.device) + org_out = (input_feat * w).sum(dim=-1) # co, n_token, n_group + + for i_s in range(int(max_shrink * n_grid)): + max_val = org_max_val * (1 - i_s / n_grid) + min_val = -max_val + cur_w = torch.clamp(w, min_val, max_val) + q_w = self.pseudo_quantize_tensor(cur_w)[0] + cur_out = (input_feat * q_w).sum(dim=-1) + + # co, 1, n_group, 1 + err = (cur_out - org_out).pow(2).mean(dim=1).view(min_errs.shape) + del cur_w + del cur_out + cur_best_idx = err < min_errs + min_errs[cur_best_idx] = err[cur_best_idx] + best_max_val[cur_best_idx] = max_val[cur_best_idx] + best_max_val_all.append(best_max_val) + + best_max_val = torch.cat(best_max_val_all, dim=0) + + AwqQuantizer.clear_memory(input_feat) + AwqQuantizer.clear_memory(org_out) + + return best_max_val.squeeze(1) + + @staticmethod + @torch.no_grad() + def apply_clip(module, clip_list: Tuple[str, torch.Tensor]): + for name, max_val in clip_list: + layer: torch.nn.Linear = AwqQuantizer.get_op_by_name(module, name) + layer.to(AwqQuantizer.get_best_device()) + max_val = max_val.to(layer.weight.device) + org_shape = layer.weight.shape + layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) + layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) + layer.weight.data = layer.weight.data.reshape(org_shape) + layer.cpu() + + @staticmethod + @torch.no_grad() + def scale_fc_fcs(fc1: torch.nn.Linear, fcs: List[torch.nn.Linear], scales: torch.Tensor): + if not isinstance(fcs, list): + fcs = [fcs] + + scales = scales.to(fc1.weight.device) + + fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1)) + if fc1.bias is not None: + fc1.bias.div_(scales.view(-1)) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + + for p in fc1.parameters(): + assert torch.isnan(p).sum() == 0 + for fc in fcs: + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + @staticmethod + def is_allowed_act_fns(op): + from transformers.activations import NewGELUActivation, PytorchGELUTanh, GELUActivation + allowed_act_fns = [ + torch.nn.GELU, + NewGELUActivation, + PytorchGELUTanh, + GELUActivation, + ] + return (op in allowed_act_fns) + + @staticmethod + def is_allowed_norms(op): + if isinstance(op, torch.nn.LayerNorm): + return True + if any(t in str(type(op)) for t in ['LlamaRMSNorm', 'GemmaRMSNorm', 'CohereLayerNorm']): + return True + return False + + @staticmethod + @torch.no_grad() + def scale_fc_fc(fc1: torch.nn.Linear, fc2: torch.nn.Linear, scales: torch.Tensor): + assert isinstance(fc1, torch.nn.Linear) + assert isinstance(fc2, torch.nn.Linear) + + scales = scales.to(fc1.weight.device) + fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1)) + if fc1.bias is not None: + fc1.bias.div_(scales.view(-1)) + + fc2.weight.mul_(scales.view(1, -1)) + + for p in fc1.parameters(): + assert torch.isnan(p).sum() == 0 + for p in fc2.parameters(): + assert torch.isnan(p).sum() == 0 + + @staticmethod + @torch.no_grad() + def 
scale_ln_fcs(ln: torch.nn.Linear, fcs: List[torch.nn.Linear], scales: torch.Tensor): + if not isinstance(fcs, list): + fcs = [fcs] + + scales = scales.to(ln.weight.device) + + # GemmaRMSNorm is different from Llama's in that it multiplies + # (1 + weight) to the output, instead of just weight. + if 'GemmaRMSNorm' in str(type(ln)): + ln.weight += 1 + ln.weight.div_(scales) + ln.weight -= 1 + else: + ln.weight.div_(scales) + + if hasattr(ln, "bias") and ln.bias is not None: + ln.bias.div_(scales) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + + for p in ln.parameters(): + assert torch.isnan(p).sum() == 0 + for fc in fcs: + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + @staticmethod + @torch.no_grad() + def scale_gelu_fc(gelu, fc: torch.nn.Linear, scales: torch.Tensor): + assert AwqQuantizer.is_allowed_act_fns(gelu) + assert isinstance(fc, torch.nn.Linear) + + fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) + + for p in fc.parameters(): + assert torch.isnan(p).sum() == 0 + + @staticmethod + def apply_scale(module, scales_list, input_feat_dict=None): + for prev_op_name, layer_names, scales in scales_list: + prev_op = AwqQuantizer.get_op_by_name(module, prev_op_name) + layers = [AwqQuantizer.get_op_by_name(module, name) for name in layer_names] + + best_device = AwqQuantizer.get_best_device() + prev_op.to(best_device) + for layer in layers: + layer.to(best_device) + scales.to(best_device) + if ( + isinstance(prev_op, torch.nn.Linear) + and type(layers) == list + and isinstance(layers[0], torch.nn.Linear) + ): + if len(layers) == 1: + AwqQuantizer.scale_fc_fc(prev_op, layers[0], scales) + else: + AwqQuantizer.scale_fc_fcs(prev_op, layers, scales) + elif ( + AwqQuantizer.is_allowed_norms(prev_op) + or "rmsnorm" in str(prev_op.__class__).lower() + ): + AwqQuantizer.scale_ln_fcs(prev_op, layers, scales) + + elif AwqQuantizer.is_allowed_act_fns(prev_op): + #new_module = ScaledActivation(prev_op, scales) + #set_op_by_name(module, prev_op_name, new_module) + AwqQuantizer.scale_gelu_fc(prev_op, layers[0], scales) + else: + raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!") + + # apply the scaling to input feat if given; prepare it for clipping + if input_feat_dict is not None: + for layer_name in layer_names: + # Skip the modules that are not quantized + if layer_name in input_feat_dict: + inp = input_feat_dict[layer_name] + inp.div_(scales.view(1, -1).to(inp.device)) + + prev_op.cpu() + for layer in layers: + layer.cpu() + scales.cpu() + + @staticmethod + def exclude_layers_to_not_quantize(linear_layers, modules_to_not_convert): + if modules_to_not_convert is None: + return linear_layers + + filtered_layers = {} + for name, linear_layer in linear_layers.items(): + if not any(key in name for key in modules_to_not_convert): + filtered_layers[name] = linear_layer + return filtered_layers + + @staticmethod + def get_named_linears(module): + return {name: m for name, m in module.named_modules() if isinstance(m, torch.nn.Linear)} + + @staticmethod + def get_op_by_name(module, op_name): + # get the op by its name relative to the module + for name, m in module.named_modules(): + if name == op_name: + return m + raise ValueError(f"Cannot find op {op_name} in module {module}") + + @staticmethod + def get_calib_dataset( + data: Union[str, List[str], List[List[int]]] = "pileval", + tokenizer=None, + n_samples=128, + max_seq_len=512, + split="train", + text_column="text", + ): + if isinstance(data, str): + from datasets import load_dataset + if data == 
"pileval": + dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation") + else: + dataset = load_dataset(data, split=split) + # dataset = dataset.shuffle(seed=42) + elif isinstance(data, list): + if isinstance(data[0], str): + dataset = [{text_column: text} for text in data] + elif isinstance(data[0][0], int): + dataset = data + else: + raise NotImplementedError( + "Either pass a string to a huggingface dataset or a list" + "that is preprocessed with one sample of text per element" + " or a list of list of int for tokenized words." + ) + else: + raise NotImplementedError( + "Either pass a string to a huggingface dataset or a list" + "that is preprocessed with one sample of text per element" + " or a list of list of int for tokenized words." + ) + + samples = [] + n_run = 0 + for data in dataset: + if isinstance(data, list): + line_encoded = data + else: + line = data[text_column] + line = line.strip() + line_encoded = tokenizer.encode(line) + if len(line_encoded) > max_seq_len: + continue + sample = torch.tensor([line_encoded]) + if sample.numel() == 0: + continue + samples.append(sample) + n_run += 1 + if n_run == n_samples: + break + # now concatenate all samples and split according to max sequence length + cat_samples = torch.cat(samples, dim=1) + n_split = cat_samples.shape[1] // max_seq_len + logging.debug(f" * Split into {n_split} blocks") + return [ + cat_samples[:, i * max_seq_len : (i + 1) * max_seq_len] for i in range(n_split) + ] + + @staticmethod + def get_best_device(): + if torch.backends.mps.is_available(): + return "mps" + elif torch.cuda.is_available(): + return "cuda:0" + else: + return "cpu" + + @staticmethod + def clear_memory(weight=None): + if weight is not None: + del weight + gc.collect() + torch.cuda.empty_cache() + + @staticmethod + def get_op_name(module, op): + # get the name of the op relative to the module + for name, m in module.named_modules(): + if m is op: + return name + raise ValueError(f"Cannot find op {op} in module {module}") + + @staticmethod + def append_str_prefix(x, prefix): + if isinstance(x, str): + return prefix + x + elif isinstance(x, tuple): + return tuple([AwqQuantizer.append_str_prefix(y, prefix) for y in x]) + elif isinstance(x, list): + return [AwqQuantizer.append_str_prefix(y, prefix) for y in x] + else: + return x + + def init_quant(self, n_samples=128, max_seq_len=512): + modules = self.awq_model.blocks + samples = AwqQuantizer.get_calib_dataset( + data=self.calib_data, + tokenizer=self.tokenizer, + n_samples=n_samples, + max_seq_len=max_seq_len, + split=self.split + ) + # samples = torch.cat(samples, dim=0) + samples = torch.cat(samples[:1], dim=0) # just using 1 batch + inps = [] + layer_kwargs = {} + # build inps + self.model.seq_len = samples.numel() + self.model.context_len = samples.numel() - 2 + self.model.token_len = 0 + best_device = AwqQuantizer.get_best_device() + inps = self.model.embedding(samples).to(best_device) + position_ids = self.model.get_position_ids() + rotary_pos_emb = self.model.rotary(position_ids) + attention_mask = self.model.get_attention_mask() + layer_kwargs["rotary_pos_emb"] = rotary_pos_emb.to(best_device) + layer_kwargs["attention_mask"] = attention_mask.to(best_device) + del samples + AwqQuantizer.clear_memory() + return modules, layer_kwargs, inps + + def _get_input_feat(self, layer, named_linears): + # firstly, get input features of all linear layers + def cache_input_hook(m, x, y, name, feat_dict): + x = x[0] + x = x.detach().cpu() + feat_dict[name].append(x) + input_feat = 
defaultdict(list) + handles = [] + for name in named_linears: + handles.append( + named_linears[name].register_forward_hook( + functools.partial(cache_input_hook, name=name, feat_dict=input_feat) + ) + ) + self.inps = self.inps.to(next(layer.parameters()).device) # in case multi-gpu + # get output as next layer's input + + # Sanitize the kwargs in case we use transformers version that contains + # kwargs that are not handled by the module. + # Useful for trust_remote_code models. + module_kwargs = self._sanitize_kwargs(self.module_kwargs, layer) + + self.inps = self._module_forward(self.inps, layer, module_kwargs) + for h in handles: + h.remove() + # now solve for scaling and clipping + input_feat = {k: torch.cat(v, dim=0) for k, v in input_feat.items()} + + return input_feat + + def _sanitize_kwargs(self, inputs_kwargs, module): + """ + Remove the arguments that are not supported in the module's + forward pass to avoid breaking behaviour between different versions + of transformers. + + Args: + inputs_kwargs (`dict`): + The input dictionary to pass to the model layer + module (`torch.nn.Module`): + Target module to quantize. + """ + module_signature = inspect.signature(module.forward).parameters + sanitized_kwargs = {} + for k, v in inputs_kwargs.items(): + if k in module_signature: + sanitized_kwargs[k] = v + return sanitized_kwargs +# awq quantizer end # Export class -class LlmExporterOp(torch.autograd.Function): + +# custom op start +class FakeLinearOp(torch.autograd.Function): @staticmethod def symbolic(g, input, in_features, out_features, has_bias, name): - args = [input] # These become the operator attributes. kwargs = { "in_features_i": in_features, @@ -299,7 +1188,36 @@ def __init__(self, in_features, out_features, has_bias, name): self.name = name def forward(self, x): - return LlmExporterOp.apply(x, self.in_features, self.out_features, self.has_bias, self.name) + return FakeLinearOp.apply(x, self.in_features, self.out_features, self.has_bias, self.name) + +class FusedAttentionOp(torch.autograd.Function): + @staticmethod + def symbolic(g, query, key, value, attention_mask, hidden_size, name): + # These become the operator attributes. 
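+        # torch.onnx reads the attribute type from the kwarg suffix: "hidden_size_i"
+        # becomes an int attribute and "name_s" a string attribute on the
+        # custom-domain node `LlmExporter::FusedAttention`. MNNConveter's
+        # rebuild_attnention later rewrites this node into MNN's native Attention
+        # op with an AttentionParam carrying kv_cache=True.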
+ kwargs = { + "hidden_size_i": hidden_size, + "name_s": name + } + from torch.onnx.symbolic_helper import _get_tensor_sizes + out_sizes = _get_tensor_sizes(query) + output_type = query.type().with_sizes(out_sizes) + return g.op("LlmExporter::FusedAttention", query, key, value, attention_mask, **kwargs).setType(output_type) + + @staticmethod + def forward(ctx, query, key, value, attention_mask, hidden_size, name): + out_shape = list(query.shape)[:2] + [hidden_size] + return query.new_zeros(out_shape) + +class FusedAttention(torch.nn.Module): + def __init__(self, hidden_size, name): + super(FusedAttention, self).__init__() + self.hidden_size = hidden_size + self.name = name + + def forward(self, query, key, value, attention_mask): + return FusedAttentionOp.apply(query, key, value, attention_mask, self.hidden_size, self.name) + +# custom op end class OnnxRebuilder: def __init__(self, onnx_path, weight_ops): @@ -357,7 +1275,7 @@ def rebuild(self): # fakelinear -> matmul + add middle_tensor = f'{name}_matmul' new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], [middle_tensor], name)) - new_nodes.append(helper.make_node('Add', [middle_tensor, bias], node.output, name)) + new_nodes.append(helper.make_node('Add', [middle_tensor, bias], node.output, f'{name}/Add')) else: # fakelinear -> matmul new_nodes.append(helper.make_node('MatMul', [node.input[0], weight], node.output, name)) @@ -372,9 +1290,11 @@ def rebuild(self): class MNNConveter: def __init__(self, onnx_path, weight_ops, config): self.weight_ops = weight_ops + self.config = config self.quant_block = config.quant_block self.quant_bit = config.quant_bit self.lm_quant_bit = config.lm_quant_bit + self.symmetric = config.symmetric self.mnn_weight_offset = 0 self.onnx_model_path = onnx_path self.mnn_name = os.path.basename(onnx_path).replace('.onnx', '.mnn') @@ -488,30 +1408,44 @@ def rebuild(self, json_path): json.dump(mnn_graph, file, ensure_ascii=False, indent=4) return self.mnn_weight_path - def quant(self, weight, quant_bit, quant_block): + def quant(self, weight, quant_bit, quant_block, symmetric): weight = weight.numpy() oc, ic = weight.shape if quant_block == 0: block_size = ic else: block_size = quant_block - if ic % block_size != 0: - block_size = ic - print('Skip block quant for ic=', ic, ', quant_block:', quant_block) block_num = ic // block_size weight = weight.reshape(oc, block_num, block_size) - max_val = np.max(weight, axis=-1, keepdims=True) - min_val = np.min(weight, axis=-1, keepdims=True) offset = 1 << (quant_bit - 1) clip_max = offset - 1 - clip_min = -offset - scale = (max_val - min_val) / (clip_max - clip_min) - q_weight = np.round((weight - min_val) / scale) + clip_min - q_weight = (np.clip(q_weight.flatten(), clip_min, clip_max) + offset).astype(np.uint8) + if symmetric: + clip_min = -clip_max + abs_max = np.max(np.abs(weight), axis=-1, keepdims=True) + scale = abs_max / clip_max + q_weight = np.round(weight / scale) + q_weight = (np.clip(q_weight.flatten(), clip_min, clip_max) + offset).astype(np.uint8) + alpha = scale.flatten() + else: + clip_min = -offset + max_val = np.max(weight, axis=-1, keepdims=True) + min_val = np.min(weight, axis=-1, keepdims=True) + scale = (max_val - min_val) / (clip_max - clip_min) + + if False: + q_weight = np.round((weight - min_val) / scale) + clip_min + zeros = min_val - scale * clip_min + else: + q_weight = np.round(weight / scale) - np.round(min_val / scale) + clip_min + zeros = (np.round(min_val / scale) - clip_min) * scale + q_weight = (np.clip(q_weight.flatten(), 
clip_min, clip_max) + offset).astype(np.uint8) + alpha = np.stack([zeros.flatten(), scale.flatten()], axis=-1).flatten() + q_weight = q_weight.reshape(-1, 2) if quant_bit == 4: q_weight = q_weight[:, 0] * 16 + q_weight[:, 1] - alpha = np.stack([min_val.flatten(), scale.flatten()], axis=-1).flatten() + + clip_min = 1 return q_weight, alpha, clip_min def write_npy(self, data): @@ -533,12 +1467,18 @@ def write_header(self, ic, oc, quant_bit): header_length = dim_num + dim_length + map_length return header_length, shape_dtype == np.int32 - def build_weight(self, linear, quant_bit, quant_block): + def build_weight(self, linear, quant_bit, quant_block, symmetric): ic, oc = linear.in_features, linear.out_features - q_weight, alpha, q_min = self.quant(linear.weight.data, quant_bit, quant_block) - header_len, shape_int32 = self.write_header(ic, oc, quant_bit) - weight_len = self.write_npy(q_weight) + header_len - alpha_len = self.write_npy(alpha) + if quant_bit == 16: + half_weight = linear.weight.data.half().flatten().numpy() + weight_len = self.write_npy(half_weight) + alpha_len, q_min, shape_int32 = 0, 0, False + else: + assert(quant_bit in (4, 8)) + q_weight, alpha, q_min = self.quant(linear.weight.data, quant_bit, quant_block, symmetric) + header_len, shape_int32 = self.write_header(ic, oc, quant_bit) + weight_len = self.write_npy(q_weight) + header_len + alpha_len = self.write_npy(alpha) if linear.bias is not None: bias = linear.bias.data.flatten().numpy() bias_length = self.write_npy(bias) @@ -548,7 +1488,7 @@ def build_weight(self, linear, quant_bit, quant_block): # bias_length = self.write_npy(bias) external = [self.mnn_weight_offset, weight_len, alpha_len, bias_length, 0] self.mnn_weight_offset += (weight_len + alpha_len + bias_length) - return external, q_min, shape_int32 + return external, q_min, shape_int32, header_len def build_tensor(self, graph, tensor_name): tensor_idx = [len(graph['tensorName'])] @@ -556,6 +1496,31 @@ def build_tensor(self, graph, tensor_name): return tensor_idx def rebuild_op(self, op, graph): + op_type = op['main']['type'] + if op_type == 'FakeLinear': + return self.rebuild_linear(op, graph) + if op_type == 'FusedAttention': + return self.rebuild_attnention(op, graph) + + def rebuild_attnention(self, op, graph): + attrs = op['main']['attr'] + for attr in attrs: + if attr['key'] == 'name': + name = attr['s'] + origin_input = op['inputIndexes'] + origin_output = op['outputIndexes'] + fused_attention = { + "inputIndexes": origin_input, + "main_type": "AttentionParam", + "main": { "kv_cache": True }, + "name": name, + "outputIndexes": origin_output, + "type": "Attention", + "defaultDimentionFormat": "NHWC" + } + return [fused_attention] + + def rebuild_linear(self, op, graph): attrs = op['main']['attr'] for attr in attrs: if attr['key'] == 'name': @@ -571,8 +1536,15 @@ def rebuild_op(self, op, graph): linear.out_features == oc and (linear.bias is not None) == has_bias) - quant_bit = self.lm_quant_bit if 'lm_head' in name else self.quant_bit - external, q_min, shape_int32 = self.build_weight(linear, quant_bit, self.quant_block) + is_lm = 'lm_head' in name + quant_bit = self.lm_quant_bit if is_lm else self.quant_bit + block_size = ic if self.quant_block == 0 else self.quant_block + external, q_min, shape_int32, header_len = self.build_weight(linear, quant_bit, self.quant_block, self.symmetric) + if is_lm and self.config.tie_word_embeddings: + weight_offset = external[0] + header_len + alpha_offset = external[0] + external[1] + alpha_size = external[2] + 
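+            # Record where the quantized lm_head weight and its scales sit in the
+            # .mnn weight file as [weight_offset, alpha_offset, alpha_size,
+            # quant_bit, quant_block]. With tied word embeddings this data is exposed
+            # to the runtime via tie_embeddings() in llmconfig.hpp, and no separate
+            # embedding file is written for the mnn path (see export() below).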
self.config.llm_config['tie_embeddings'] = [weight_offset, alpha_offset, alpha_size, quant_bit, self.quant_block] origin_input = op['inputIndexes'] origin_output = op['outputIndexes'] @@ -613,6 +1585,22 @@ def rebuild_op(self, op, graph): }, "defaultDimentionFormat": "NHWC" } + + if quant_bit == 16: + quanParameter = { "type": 3 } + else: + if self.symmetric: + aMin = 0 + readType = 0 + else: + aMin = q_min + readType = oc * (ic // block_size) + + quanParameter = { + "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0, + "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32, + "type": 1, "aMax": 0, "aMin": aMin, "readType": readType, "weightSize": 0 + } conv_op = { "name": conv_name, "inputIndexes": pre_convert_output, @@ -626,11 +1614,7 @@ def rebuild_op(self, op, graph): 'outputCount': oc, 'relu': False, 'padMode': 'CAFFE', 'relu6': False, 'inputCount': ic, 'hasOutputShape': False }, - "quanParameter": { - "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0, - "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32, - "type": 1, "aMax": 0, "aMin": q_min, "readType": -1, "weightSize": 0 - }, + "quanParameter": quanParameter, "external": external }, "defaultDimentionFormat": "NHWC" @@ -683,23 +1667,36 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) class Attention(torch.nn.Module): - def __init__(self, attn, config): + def __init__(self, attn, layer_id, config): super().__init__() + self.export_fused_attn = False + self.fused_attn = FusedAttention(config.hidden_size, f'/layers.{layer_id}/self_attn/FusedAttention') + self.layer_id = layer_id self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads self.head_dim = config.head_dim - self.num_key_value_heads = config.num_key_value_heads + if isinstance(config.num_attention_heads, list): + self.num_heads = config.num_attention_heads[layer_id] + self.num_key_value_heads = config.num_key_value_heads[layer_id] + else: + self.head_dim = config.head_dim + self.num_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.rotary = config.rotary + ModelMapper.do_map(self, attn, config.model_map['attention']) + if hasattr(self, 'qkv_proj') and self.qkv_proj is not None: # split qkv linear to q, k, v split_sizes = [self.hidden_size] * 3 if self.qkv_proj.weight.shape[0] != self.hidden_size * 3: # M/GQA - qkv_hidden_size = self.qkv_proj.weight.shape[0] - kv_hidden_size = (qkv_hidden_size - self.hidden_size) // 2 - split_sizes = [self.hidden_size, kv_hidden_size, kv_hidden_size] + split_sizes = [ + self.num_heads * self.head_dim, # q_size + self.num_key_value_heads * self.head_dim, # k_size + self.num_key_value_heads * self.head_dim # v_size + ] + self.q_proj = torch.nn.Linear(self.hidden_size, split_sizes[0]) self.k_proj = torch.nn.Linear(self.hidden_size, split_sizes[1]) self.v_proj = torch.nn.Linear(self.hidden_size, split_sizes[2]) @@ -724,6 +1721,10 @@ def __init__(self, attn, config): self.q_proj.bias.data = qb self.k_proj.bias.data = kb self.v_proj.bias.data = vb + else: + self.q_proj.bias.data = torch.zeros(split_sizes[0]) + self.k_proj.bias.data = torch.zeros(split_sizes[1]) + self.v_proj.bias.data = torch.zeros(split_sizes[2]) def forward( self, @@ -731,14 +1732,23 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, 
rotary_pos_emb: Optional[torch.Tensor] = None, + cross_attention_states: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) + if cross_attention_states is not None: + hidden_states = cross_attention_states key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + # openelm model has qk_norm + if hasattr(self, 'q_norm') and self.q_norm is not None and \ + hasattr(self, 'k_norm') and self.k_norm is not None : + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + kv_seq_len = key_states.shape[1] if past_key_value is not None: kv_seq_len += past_key_value[0].shape[1] @@ -747,6 +1757,12 @@ def forward( cos, sin = rotary_pos_emb[0], rotary_pos_emb[1] query_states = self.rotary.apply_rotary_pos(query_states, cos, sin) key_states = self.rotary.apply_rotary_pos(key_states, cos, sin) + + if self.export_fused_attn: + attn_output = self.fused_attn(query_states, key_states, value_states, attention_mask) + attn_output = self.o_proj(attn_output) + return attn_output, past_key_value + # kv cache if past_key_value is not None: past_key, past_value = past_key_value[0], past_key_value[1] @@ -846,11 +1862,17 @@ def chatglm_rotary_pos(self, x, cos, sin): return torch.cat((x1, x2), dim=-1) class Decoder(torch.nn.Module): - def __init__(self, decoder, config): + def __init__(self, decoder, layer_id, config): super().__init__() + self.cross_decoder = False ModelMapper.do_map(self, decoder, config.model_map['decoder']) + # mllama has cross_attn + if hasattr(self, 'cross_attn') and self.cross_attn is not None: + self.cross_decoder = True + self.self_attn = Attention(self.cross_attn, layer_id, config) + else: + self.self_attn = Attention(self.self_attn, layer_id, config) self.hidden_size = config.hidden_size - self.self_attn = Attention(self.self_attn, config) # chatglm self.alpha = (2 * config.num_hidden_layers) ** 0.5 if config.model_type == 'chatglm' else 1.0 @@ -860,6 +1882,8 @@ def forward( rotary_pos_emb: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, + cross_attention_states: Optional[torch.Tensor] = None, + cross_attention_mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: hidden_states = hidden_states.view(1, -1, self.hidden_size) residual = hidden_states @@ -871,6 +1895,7 @@ def forward( rotary_pos_emb=rotary_pos_emb, attention_mask=attention_mask, past_key_value=past_key_value, + cross_attention_states=cross_attention_states, ) # Fully Connected if not hasattr(self, 'post_attention_layernorm'): @@ -892,6 +1917,13 @@ def forward( hidden_states = self.mlp(hidden_states) hidden_states = self.post_feedforward_layernorm(hidden_states) hidden_states = residual + hidden_states + elif cross_attention_mask is not None: + hidden_states = residual + self.cross_attn_attn_gate.tanh() * hidden_states + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = cross_attention_mask * hidden_states + hidden_states = residual + self.cross_attn_mlp_gate.tanh() * hidden_states else: # 
general hidden_states = residual + hidden_states @@ -908,9 +1940,12 @@ def __init__(self, lm_, final_layernorm_, config): self.final_layernorm = final_layernorm_ self.lm = lm_ self.hidden_size = config.hidden_size + self.ppl = config.ppl def forward(self, hidden_states): - hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) + if not self.ppl: + # just need last logit for predict next token + hidden_states = hidden_states.view(-1, self.hidden_size)[-1].view(1, 1, self.hidden_size) hidden_states = self.final_layernorm(hidden_states) m_logits = self.lm(hidden_states) return m_logits @@ -924,6 +1959,9 @@ def __init__(self, visual, base): self.config = base.config self.hidden_size = base.hidden_size self.llm_config = base.llm_config + # mllama + self.cross_attention_states = None + self.cross_attention_mask = None self.init_config() self.load() @@ -931,7 +1969,8 @@ def __init__(self, visual, base): def get_visual(model_type): visual_models = { 'qwen': QwenVisual, - 'qwen2_vl': Qwen2Visual + 'qwen2_vl': Qwen2Visual, + 'mllama': MllamaVision } if model_type in visual_models: return visual_models[model_type] @@ -1097,6 +2136,83 @@ def embed(self, input_ids, images = None, videos = None): input_embeds[image_mask] = self.image_embeds return input_embeds +class MllamaVision(Visual): + def __init__(self, visual, base): + super().__init__(visual, base) + self.image_objs = [] + + def load(self): + self.llm_config['is_visual'] = True + self.llm_config['image_size'] = self.config.vision_config.image_size + self.image_size = self.config.vision_config.image_size + + def str_to_ids(self, prompt): + if '' in prompt and '' in prompt: + import re + import requests + from PIL import Image + pattern = r'(.*?)' + parts = re.split(pattern, prompt) + txt_prompt = '' + for part in parts: + if re.match(pattern, part): + img_content = re.search(r'(.*?)', part).group(1) + if img_content.startswith('http://') or img_content.startswith('https://'): + self.image_objs.append(Image.open(requests.get(img_content, stream=True).raw)) + txt_prompt += '<|image|>' + else: + txt_prompt += part + else: + txt_prompt = prompt + input_ids = self.tokenizer(txt_prompt, return_tensors="pt")['input_ids'] + # image process + for img in self.image_objs: + image_embeds = self.img_process(img) + print(image_embeds.shape) + pass + return input_ids + + def img_process(self, image): + resized_height = self.image_size + resized_width = self.image_size + from transformers.image_transforms import ( + convert_to_rgb, + resize, + rescale, + normalize + ) + from transformers.image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + PILImageResampling, + infer_channel_dimension_format, + to_numpy_array + ) + image = convert_to_rgb(image) + image = to_numpy_array(image) + format = infer_channel_dimension_format(image) + resample = PILImageResampling.BICUBIC + image = resize(image, size=(resized_height, resized_width), resample=resample, input_data_format=format) + image = rescale(image, scale=1 / 255.0, input_data_format=format) + image = normalize(image=image, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, input_data_format=format) + image = image.transpose(2, 0, 1) + image = np.expand_dims(image, [0, 1, 2]) + pad_val = np.zeros_like(image) + image = np.concatenate([image, pad_val, pad_val, pad_val], axis=2) + print(image.shape) + image = torch.from_numpy(image) + image_embeds = self.forward(image) + print(image_embeds.shape) + return image_embeds + + def forward(self, images): + aspect_ratio_ids = 
torch.tensor([[1]]) + aspect_ratio_mask = torch.tensor([[[1, 0, 0, 0]]]) + return self.visual(images, aspect_ratio_ids, aspect_ratio_mask) + + def embed(self, input_ids, images = None, videos = None): + return self.embed_(input_ids) + class LlmExporter(torch.nn.Module): ''' Base class for all llm model export. Inherits from [`torch.nn.Module`]. @@ -1108,7 +2224,7 @@ def __init__(self, args): self.load_model(args.path) def init_from_args(self, args): - self.max_length = 1024 + self.max_length = 128 self.stop_ids = [] self.visual = None self.dst_name = 'llm' @@ -1116,11 +2232,17 @@ def init_from_args(self, args): self.path = args.path self.dst_path = args.dst_path self.onnx_path = os.path.join(self.dst_path, 'onnx') + self.tokenizer_path = args.tokenizer_path self.lora_path = args.lora_path - self.skip_slim = args.skip_slim + self.onnx_slim = args.onnx_slim + self.ppl = args.ppl + self.awq = args.awq self.quant_bit = args.quant_bit self.quant_block = args.quant_block + self.symmetric = args.sym self.mnnconvert = args.mnnconvert + if self.tokenizer_path is None: + self.tokenizer_path = self.path if args.lm_quant_bit is not None: self.lm_quant_bit = args.lm_quant_bit else: @@ -1132,10 +2254,13 @@ def init_from_args(self, args): os.makedirs(self.onnx_path) def load_pretrained(self, model_path: str): - self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path, trust_remote_code=True, use_fast=False) if 'Qwen2-VL' in model_path: from transformers import Qwen2VLForConditionalGeneration self.model = Qwen2VLForConditionalGeneration.from_pretrained(model_path).float().eval() + elif 'Llama-3.2' in model_path and 'Vision' in model_path: + from transformers import MllamaForConditionalGeneration + self.model = MllamaForConditionalGeneration.from_pretrained(model_path).float().eval() else: try: self.model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float().eval() @@ -1174,6 +2299,7 @@ def load_model(self, model_path): self.stop_ids = list(set(self.stop_ids)) model_mapper = ModelMapper() + self.tie_word_embeddings = (hasattr(self.config, 'tie_word_embeddings') and self.config.tie_word_embeddings) self.model_type, self.model_map = model_mapper.get_map(self.config) # print(self.config, self.model_type, self.model_map, self.model) # load config info @@ -1183,9 +2309,15 @@ def load_model(self, model_path): if not hasattr(self, 'rope_theta') or self.rope_theta is None: self.rope_theta = 10000.0 if not hasattr(self, 'head_dim') or self.head_dim is None: - self.head_dim = self.hidden_size // self.num_attention_heads + if isinstance(self.num_attention_heads, list): + self.head_dim = [self.hidden_size // atten_head for atten_head in self.num_attention_heads] + else: + self.head_dim = self.hidden_size // self.num_attention_heads # some export info - self.past_kv_shape = [self.num_hidden_layers, 2, 1, 0, self.num_key_value_heads, self.head_dim] + if isinstance(self.num_attention_heads, list): + self.past_kv_shape = [self.num_hidden_layers, 2, 1, 0, self.num_key_value_heads[0], self.head_dim] + else: + self.past_kv_shape = [self.num_hidden_layers, 2, 1, 0, self.num_key_value_heads, self.head_dim] self.block_dynamic_axes = { "inputs_embeds" : { 0: "seq_len" }, "attention_mask" : { 2: "seq_len", 3: "seq_len" }, @@ -1195,8 +2327,8 @@ def load_model(self, model_path): self.model_dynamic_axes = { "input_ids" : { 0: "seq_len" }, "attention_mask" : { 2: "seq_len", 3: "seq_len" }, - "position_ids" : { 
0: "seq_len" }, - "past_key_values" : { 2: "history_len" } + "position_ids" : { 1: "seq_len" }, + "past_key_values" : { 3: "history_len" } } self.llm_config = { 'hidden_size' : self.hidden_size, @@ -1209,6 +2341,11 @@ def load_model(self, model_path): # load modules ModelMapper.do_map(self, self.model, self.model_map['model']) # rebuild modules + if self.lm_ is None: + out_features, in_features = self.embed_.weight.shape + self.lm_ = torch.nn.Linear(in_features, out_features) + self.lm_.weight = self.embed_.weight + if self.embed_.weight is self.lm_.weight: import copy embed_copy = copy.deepcopy(self.embed_) @@ -1219,7 +2356,8 @@ def load_model(self, model_path): self.rotary = Rotary(self) self.blocks = [] for block in self.blocks_.children(): - self.blocks.append(Decoder(block, self)) + layer_id = len(self.blocks) + self.blocks.append(Decoder(block, layer_id, self)) self.lm = Lm(self.lm_, self.final_layernorm_, self) # visual model if self.visual is not None: @@ -1237,8 +2375,8 @@ def get_position_ids(self) -> torch.Tensor: if self.model_type == 'chatglm': return self.chatglm_position_ids() if self.token_len: - return torch.tensor([[self.seq_len - 1]], dtype=torch.long) - return torch.arange(self.seq_len, dtype=torch.long).unsqueeze(0) + return torch.tensor([[self.seq_len - 1]], dtype=torch.int) + return torch.arange(self.seq_len, dtype=torch.int).unsqueeze(0) def chatglm_attention_mask(self): if self.token_len: @@ -1252,8 +2390,8 @@ def chatglm_attention_mask(self): def chatglm_position_ids(self): if self.token_len: return torch.tensor([self.context_len, self.token_len + 1]).reshape([1, 2, 1]) - position_ids_0 = torch.arange(self.seq_len, dtype=torch.long) - position_ids_1 = torch.zeros(self.seq_len, dtype=torch.long) + position_ids_0 = torch.arange(self.seq_len, dtype=torch.int) + position_ids_1 = torch.zeros(self.seq_len, dtype=torch.int) position_ids_0[-1] = position_ids_0[-2] position_ids_1[-1] = 1 position_ids = torch.stack([position_ids_0, position_ids_1]).view(1, 2, -1) @@ -1269,15 +2407,27 @@ def embedding(self, input_ids): input_embeds = self.embed(input_ids) return input_embeds - def forward(self, input_ids, attention_mask, position_ids, past_key_values): + def forward(self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + past_key_values: Optional[list[torch.Tensor]] = None, + cross_attention_states: Optional[torch.Tensor] = None, + cross_attention_mask: Optional[torch.Tensor] = None, + ): hidden_states = input_ids # llm forward without embedding - presents = [] + presents = [None for i in range(self.num_hidden_layers)] rotary_pos_emb = self.rotary(position_ids) for i in range(self.num_hidden_layers): + if self.blocks[i].cross_decoder and cross_attention_states is None: + continue hidden_states, kv = self.blocks[i](hidden_states, rotary_pos_emb, attention_mask, past_key_values[i]) - presents.append(kv) - logits = self.lm(hidden_states).reshape(-1) - presents = torch.stack(presents) + presents[i] = kv + logits = self.lm(hidden_states) + if not self.ppl: + logits = logits.reshape(-1) + if presents[0].shape == presents[-1].shape and None not in presents: + presents = torch.stack(presents) self.seq_len += 1 self.token_len += 1 return logits, presents @@ -1285,7 +2435,7 @@ def forward(self, input_ids, attention_mask, position_ids, past_key_values): # some test functions def build_prompt(self, query): # just for test - if 'Qwen2' in self.path: + if 'Qwen2' in self.path or 'reader' in self.path: return 
f'<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' if 'Qwen' in self.path: return f'\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' @@ -1315,6 +2465,10 @@ def build_prompt(self, query): return f'Instruct: {query}\nOutput:' if 'gemma-2' in self.path: return f'user\n{query}\nmodel\n' + if 'OpenELM' in self.path: + return f'{query}' + if 'SmolLM2' in self.path: + return f'<|im_start|>system\nYou are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>\n<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n' return query def str_to_ids(self, prompt): @@ -1324,14 +2478,36 @@ def str_to_ids(self, prompt): return input_ids def id_to_str(self, token_id): - word = self.tokenizer._convert_id_to_token(int(token_id)) - word = self.tokenizer.convert_tokens_to_string([word]) + def contains_replacement(text): return '\uFFFD' in text + def decode_id(token_id): + return self.tokenizer.convert_tokens_to_string( + self.tokenizer._convert_id_to_token(int(token_id))) + def decode_ids(token_ids): + return self.tokenizer.convert_tokens_to_string( + self.tokenizer.convert_ids_to_tokens(token_ids)) + word = decode_id(int(token_id)) + # Smollm tokenizer will produce half chinese character, using buffer to decode + if contains_replacement(word): + self.decode_buffer.append(token_id) + buffer_txt = decode_ids(self.decode_buffer) + if not contains_replacement(buffer_txt): + word = buffer_txt + self.decode_buffer.clear() + else: + word = '' return word def response(self, query): - self.imitate_quant() + # self.imitate_quant() + self.decode_buffer = [] prompt = self.build_prompt(query) input_ids = self.str_to_ids(prompt) + if self.visual is not None: + cross_attention_states = self.visual.cross_attention_states + cross_attention_mask = self.visual.cross_attention_mask + else: + cross_attention_states = None + cross_attention_mask = None self.seq_len = input_ids.numel() self.context_len = self.seq_len - 2 self.token_len = 0 @@ -1341,7 +2517,12 @@ def response(self, query): attention_mask = self.get_attention_mask() position_ids = self.get_position_ids() input_ids = self.embedding(token_id) - logits, past_key_values = self.forward(input_ids, attention_mask, position_ids, past_key_values) + logits, past_key_values = self.forward(input_ids, + attention_mask, + position_ids, + past_key_values, + cross_attention_states, + cross_attention_mask) token_id = torch.argmax(logits) if token_id in self.stop_ids: print("", end='\n') @@ -1402,29 +2583,6 @@ def export_config(self, mnn_config = False): json.dump(config, f, ensure_ascii=False, indent=4) return config_json - def quant(self, weight, quant_bit, quant_block): - weight = weight.numpy() - oc, ic = weight.shape - if quant_block == 0: - block_size = ic - else: - block_size = quant_block - block_num = ic // block_size - weight = weight.reshape(oc, block_num, block_size) - max_val = np.max(weight, axis=-1, keepdims=True) - min_val = np.min(weight, axis=-1, keepdims=True) - offset = 1 << (quant_bit - 1) - clip_max = offset - 1 - clip_min = -offset - scale = (max_val - min_val) / (clip_max - clip_min) - q_weight = np.round((weight - min_val) / scale) + clip_min - q_weight = (np.clip(q_weight.flatten(), clip_min, clip_max) + offset).astype(np.uint8) - q_weight = q_weight.reshape(-1, 2) - if quant_bit == 4: - q_weight = q_weight[:, 0] * 16 + q_weight[:, 1] - alpha = np.stack([min_val.flatten(), scale.flatten()], axis=-1).flatten() - return q_weight, alpha, clip_min - def imitate_quant(self): def quant_dequant(linear, quant_bit = 
self.quant_bit, quant_block = self.quant_block): weight = linear.weight.data @@ -1466,6 +2624,9 @@ def build_faker(real, name): # replace linear with fakelinear to save export memory and time with torch.no_grad(): for i in range(self.num_hidden_layers): + # different kv cache shape in different layers + if isinstance(self.num_attention_heads, list): + self.blocks[i].self_attn.export_fused_attn = True for name, child in self.blocks[i].self_attn.named_children(): if isinstance(child, torch.nn.Linear): setattr(self.blocks[i].self_attn, name, build_faker(child, f'/layers.{i}/self_attn/{name}/Linear')) @@ -1495,9 +2656,10 @@ def export_onnx(self): input_ids = torch.arange(3, dtype=torch.long) attention_mask = self.get_attention_mask() position_ids = self.get_position_ids() - past_key_values = torch.zeros(self.past_kv_shape) onnx_model = f'{self.onnx_path}/{self.dst_name}.onnx' input_ids = self.embedding(input_ids) + past_key_values = torch.zeros(self.past_kv_shape) + # export to onnx torch.onnx.export( model, (input_ids, attention_mask, position_ids, past_key_values), @@ -1508,37 +2670,57 @@ def export_onnx(self): output_names=['logits', 'presents'], dynamic_axes=self.model_dynamic_axes, do_constant_folding=True, + verbose=False, opset_version=15) return onnx_model + def awq_quant(self): + self.awq_quantizer = AwqQuantizer(self) + self.awq_quantizer.quantize() + self.is_awq_quantized = True + def export(self, export_type): + if self.awq: + self.awq_quant() export_mnn = export_type == 'mnn' # export tokenizer self.export_tokenizer() - self.export_config(export_mnn) - self.export_embed() + if export_mnn and self.tie_word_embeddings: + pass # mnn tie_word_embeddings need't export embedding + else: + self.export_embed() if self.visual: visual_onnx = self.export_visual() - #if not self.skip_slim: + #if self.onnx_slim: #visual_onnx = self.onnx_slim(visual_onnx) if export_mnn: MNNConveter(visual_onnx, None, self).export(quant_bit=self.visual.quant_bit) # export graph to llm.onnx onnx_model = self.export_onnx() - if not self.skip_slim: + if self.onnx_slim: self.onnx_slim(onnx_model) if export_mnn: # convert onnx to mnn and quant weight MNNConveter(onnx_model, self.unloaded_ops, self).export() + # delete onnx file + if os.path.exists(onnx_model): + try: + os.remove(onnx_model) + os.rmdir(self.onnx_path) + except Exception as e: + print(f"remove onnx error: {e}") else: # export weight to llm.onnx.data self.onnx_load_param(onnx_model) + # export llm_config.json and config.json + self.export_config(export_mnn) + @spinner_run(f'export tokenizer to ') def export_tokenizer(self): # load tokenizer file - tokenizer_model = os.path.join(self.path, 'tokenizer.model') - ice_text_model = os.path.join(self.path, 'ice_text.model') + tokenizer_model = os.path.join(self.tokenizer_path, 'tokenizer.model') + ice_text_model = os.path.join(self.tokenizer_path, 'ice_text.model') try: import sentencepiece as spm if os.path.exists(tokenizer_model): @@ -1579,6 +2761,13 @@ def write_header(fp, type, speicals, prefix = []): prefix_list = [] if hasattr(self.tokenizer, 'get_prefix_tokens'): prefix_list = self.tokenizer.get_prefix_tokens() + if len(prefix_list) == 0: + test_txt = 'A' + ids = self.tokenizer.encode(test_txt) + get_txt = self.tokenizer.decode(ids[-1]) + if len(ids) > 1 and get_txt == test_txt: + prefix_list += ids[:-1] + if self.sp_model is not None: # senetencepiece NORMAL = 1; UNKNOWN = 2; CONTROL = 3 @@ -1739,8 +2928,9 @@ def response(self, query): @spinner_run(f'load pretrained model ') def load_model(self, 
model_path):
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float().eval()
-        self.config = self.model.config
+        self.config = AutoConfig.from_pretrained(model_path)
+        self.config._attn_implementation = 'eager'
+        self.model = AutoModel.from_config(self.config)
         transformer = self.model.encoder
         self.model_type = self.config.model_type
         self.lm_ = self.model.pooler
@@ -1805,7 +2995,7 @@ def export(self, export_type):
         self.export_config(export_mnn)
         self.export_embed()
         onnx_model = self.export_onnx()
-        if not self.skip_slim:
+        if self.onnx_slim:
             self.onnx_slim(onnx_model)
         if export_mnn:
             MNNConveter(onnx_model, None, self).export()
@@ -1822,12 +3012,13 @@ def get_position_ids(self) -> torch.Tensor:
     def get_attention_mask(self) -> torch.Tensor:
         return torch.ones([1, 1, 1, self.seq_len], dtype=torch.long)

+
 def export(path,
            type = None,
            lora_path = None,
            dst_path = './model',
            export = 'onnx',
-           skip_slim = False,
+           onnx_slim = False,
            quant_bit = 4,
            quant_block = 128,
            lm_quant_bit = None):
@@ -1838,7 +3029,7 @@ def export(path,
         'lora_path': lora_path,
         'dst_path': dst_path,
         'export': export,
-        'skip_slim': skip_slim,
+        'onnx_slim': onnx_slim,
         'quant_bit': quant_bit,
         'quant_block': quant_block,
         'lm_quant_bit': lm_quant_bit
@@ -1861,15 +3052,20 @@ def main():
                         help='type(`str`, *optional*):'
                         '\n\tThe pretrain llm model type.'
                         )
+    parser.add_argument('--tokenizer_path', type=str, default=None, help='tokenizer path, default is `None`, meaning the `--path` value is used.')
     parser.add_argument('--lora_path', type=str, default=None, help='lora path, defaut is `None` mean not apply lora.')
     parser.add_argument('--dst_path', type=str, default='./model', help='export onnx/mnn model to path, defaut is `./model`.')
+    parser.add_argument('--verbose', action='store_true', help='Whether or not to print verbose logs.')
     parser.add_argument('--test', type=str, help='test model inference with query `TEST`.')
     parser.add_argument('--export', type=str, default=None, help='export model to an onnx/mnn model.')
-    parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.')
+    parser.add_argument('--onnx_slim', action='store_true', help='Whether or not to use onnx-slim.')
     parser.add_argument('--quant_bit', type=int, default=4, help='mnn quant bit, 4 or 8, default is 4.')
-    parser.add_argument('--quant_block', type=int, default=0, help='mnn quant block, default is 0 mean channle-wise.')
+    parser.add_argument('--quant_block', type=int, default=128, help='mnn quant block, default is 128, 0 means channel-wise.')
     parser.add_argument('--lm_quant_bit', type=int, default=None, help='mnn lm_head quant bit, 4 or 8, default is `quant_bit`.')
     parser.add_argument('--mnnconvert', type=str, default='../../../build/MNNConvert', help='local mnnconvert path, if invalid, using pymnn.')
+    parser.add_argument('--ppl', action='store_true', help='Whether or not to get all logits of input tokens.')
+    parser.add_argument('--awq', action='store_true', help='Whether or not to use awq quant.')
+    parser.add_argument('--sym', action='store_true', help='Whether or not to use symmetric quant (without zeropoint), default is False.')

     args = parser.parse_args()
@@ -1889,4 +3085,4 @@ def main():
     llm_exporter.export(args.export)

 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
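For readers skimming the new flags: the sketch below is an illustration of what `--quant_bit` and `--quant_block` control, in the spirit of the block-wise min/max scheme used by the `quant()` helper that this series removes. Weights are split into blocks of `quant_block` values per output channel (0 falls back to one block per channel, i.e. channel-wise), and each block gets its own scale and minimum. It is not the exporter's exact code; the real path also packs two 4-bit values into one byte.

    # Illustrative only: block-wise asymmetric weight quantization matching the
    # meaning of --quant_bit / --quant_block (quant_block == 0 -> channel-wise).
    import numpy as np

    def block_quant(weight: np.ndarray, quant_bit: int = 4, quant_block: int = 128):
        oc, ic = weight.shape
        block_size = ic if quant_block == 0 else quant_block
        blocks = weight.reshape(oc, ic // block_size, block_size)
        w_min = blocks.min(axis=-1, keepdims=True)
        w_max = blocks.max(axis=-1, keepdims=True)
        levels = (1 << quant_bit) - 1                 # 15 for 4-bit, 255 for 8-bit
        scale = (w_max - w_min) / levels              # one scale per block
        q = np.clip(np.round((blocks - w_min) / scale), 0, levels).astype(np.uint8)
        return q, scale, w_min

    if __name__ == '__main__':
        w = np.random.randn(8, 256).astype(np.float32)
        q, scale, w_min = block_quant(w, quant_bit=4, quant_block=128)
        # dequantize and check the reconstruction error stays within half a step
        deq = q * scale + w_min
        print(np.abs(deq - w.reshape(8, 2, 128)).max() <= scale.max() / 2 + 1e-6)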
--- /dev/null
+++ b/transformers/llm/export/requirements.txt
@@ -0,0 +1,14 @@
+datasets==2.18.0
+MNN==2.9.6
+onnx==1.16.2
+onnxslim==0.1.34
+onnxruntime==1.19.2
+peft==0.11.1
+Pillow==11.0.0
+Requests==2.32.3
+sentencepiece==0.1.99
+torch==2.3.1
+tqdm==4.65.0
+transformers==4.45.2
+yaspin==3.1.0
+numpy==1.25.2

From 47a17f5d6b82213ea7daa0879714fcf9bf544c8b Mon Sep 17 00:00:00 2001
From: "wangshuaikang.wsk"
Date: Tue, 19 Nov 2024 20:51:19 +0800
Subject: [PATCH 7/7] Bugfix: Vulkan windows compilation error.

---
 .../vulkan/image/execution/VulkanSoftmax.cpp  | 24 +++++++++----------
 .../vulkan/image/execution/VulkanSoftmax.hpp  |  4 ++--
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/source/backend/vulkan/image/execution/VulkanSoftmax.cpp b/source/backend/vulkan/image/execution/VulkanSoftmax.cpp
index 7c19595fd..d86750a66 100644
--- a/source/backend/vulkan/image/execution/VulkanSoftmax.cpp
+++ b/source/backend/vulkan/image/execution/VulkanSoftmax.cpp
@@ -13,14 +13,14 @@
 namespace MNN {

 struct SoftmaxConstBuffer {
-    uint N;
-    uint H;
-    uint W;
-    uint C4;
-    uint CLeft;
+    uint32_t N;
+    uint32_t H;
+    uint32_t W;
+    uint32_t C4;
+    uint32_t CLeft;
 };

-VulkanSoftmax::VulkanSoftmax(const Op* op, Backend* bn, const uint axisIndex) : VulkanBasicExecution(bn) {
+VulkanSoftmax::VulkanSoftmax(const Op* op, Backend* bn, const uint32_t axisIndex) : VulkanBasicExecution(bn) {
     mAxisIndex = axisIndex;
     auto vkBn = (VulkanBackend*)backend();
     std::string shaderName = "glsl_softmaxImage_";
@@ -55,7 +55,7 @@ ErrorCode VulkanSoftmax::onEncode(const std::vector& inputs, const std:
     auto input = inputs[0];
     auto output = outputs[0];
     auto inputShapeNHWC = VulkanTensor::tensorShapeFormat(input);
-    std::vector cpuSoftmaxConstBuffer = {(uint)inputShapeNHWC[0], (uint)inputShapeNHWC[1], (uint)inputShapeNHWC[2], (uint)UP_DIV(inputShapeNHWC[3], 4), (uint)ROUND_UP(inputShapeNHWC[3], 4) - inputShapeNHWC[3]};
+    std::vector cpuSoftmaxConstBuffer = {(uint32_t)inputShapeNHWC[0], (uint32_t)inputShapeNHWC[1], (uint32_t)inputShapeNHWC[2], (uint32_t)UP_DIV(inputShapeNHWC[3], 4), (uint32_t)ROUND_UP(inputShapeNHWC[3], 4) - inputShapeNHWC[3]};
     {
         auto softmaxConst = reinterpret_cast(mSoftmaxConstBuffer->map());
@@ -69,8 +69,8 @@ ErrorCode VulkanSoftmax::onEncode(const std::vector& inputs, const std:
     }

     // N * H * W * C4
-    uint numTotal = cpuSoftmaxConstBuffer[0] * cpuSoftmaxConstBuffer[1] * cpuSoftmaxConstBuffer[2] * cpuSoftmaxConstBuffer[3];
-    uint numY = numTotal / cpuSoftmaxConstBuffer[mAxisIndex];
+    uint32_t numTotal = cpuSoftmaxConstBuffer[0] * cpuSoftmaxConstBuffer[1] * cpuSoftmaxConstBuffer[2] * cpuSoftmaxConstBuffer[3];
+    uint32_t numY = numTotal / cpuSoftmaxConstBuffer[mAxisIndex];

     auto vkOutput = (VulkanTensor*)output->deviceId();
     auto vkInput = (VulkanTensor*)input->deviceId();
@@ -98,7 +98,7 @@ class VulkanSoftmaxCreator : public VulkanBackend::Creator {
                                 Backend* backend) const override {
         auto input = inputs[0];

-        uint dimension = input->dimensions();
+        uint32_t dimension = input->dimensions();
         if (dimension > 4) {
             return nullptr;
         }
@@ -109,7 +109,7 @@ class VulkanSoftmaxCreator : public VulkanBackend::Creator {
         if (axis < 0) {
             axis = input->dimensions() + axis;
         }
-        std::vector axisMap;
+        std::vector axisMap;

         if (dimension == 4) {
             if (format == MNN_DATA_FORMAT_NCHW) {
@@ -130,7 +130,7 @@ class VulkanSoftmaxCreator : public VulkanBackend::Creator {
         } else {
             return nullptr;
         }
-        uint axisIndex = axisMap[axis];
+        uint32_t axisIndex = axisMap[axis];

         return new VulkanSoftmax(op, backend, axisIndex);
     }
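Since the hunks above only swap host-side integer types, it may help to see the unchanged dispatch arithmetic in one place. The following Python model mirrors how SoftmaxConstBuffer is filled and how numTotal and numY are derived in VulkanSoftmax::onEncode, with UP_DIV and ROUND_UP written out to match the MNN macros; it is a paraphrase for illustration, not code from the repository.

    # Rough Python model of the constant-buffer fields and dispatch counts used
    # by VulkanSoftmax::onEncode; up_div/round_up mirror MNN's UP_DIV/ROUND_UP.
    def up_div(x, y): return (x + y - 1) // y
    def round_up(x, y): return up_div(x, y) * y

    def softmax_const_buffer(n, h, w, c):
        # Matches SoftmaxConstBuffer {N, H, W, C4, CLeft} with fixed-width fields.
        return [n, h, w, up_div(c, 4), round_up(c, 4) - c]

    def dispatch_counts(nhwc, axis_index):
        buf = softmax_const_buffer(*nhwc)
        num_total = buf[0] * buf[1] * buf[2] * buf[3]  # N * H * W * C4
        num_y = num_total // buf[axis_index]           # work outside the reduced axis
        return num_total, num_y

    # Example: softmax over the channel axis of a 1x14x14x130 NHWC tensor.
    print(dispatch_counts((1, 14, 14, 130), axis_index=3))  # -> (6468, 196)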
diff --git a/source/backend/vulkan/image/execution/VulkanSoftmax.hpp b/source/backend/vulkan/image/execution/VulkanSoftmax.hpp
index fafd1b34a..38dd3179b 100644
--- a/source/backend/vulkan/image/execution/VulkanSoftmax.hpp
+++ b/source/backend/vulkan/image/execution/VulkanSoftmax.hpp
@@ -16,7 +16,7 @@ namespace MNN {

 class VulkanSoftmax : public VulkanBasicExecution {
 public:
-    VulkanSoftmax(const Op* op, Backend* bn, const uint axisIndex);
+    VulkanSoftmax(const Op* op, Backend* bn, const uint32_t axisIndex);
     virtual ~VulkanSoftmax();
     ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs,
                        const VulkanCommandPool::Buffer* cmdBuffer) override;
@@ -25,7 +25,7 @@ class VulkanSoftmax : public VulkanBasicExecution {
     std::shared_ptr mSoftmaxConstBuffer;
     const VulkanPipeline* mSoftmaxPipeline;
     std::shared_ptr mDescriptorSet;
-    uint mAxisIndex;
+    uint32_t mAxisIndex;
 };

 } // namespace MNN
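Why the type change fixes the Windows build: uint is a POSIX-style typedef that Linux and macOS toolchains happen to expose but MSVC does not, and the struct is copied byte-for-byte into a Vulkan buffer, so a fixed 32-bit width is also the safer layout choice. The sketch below illustrates the layout the constant buffer is presumed to have on the shader side, assuming it is read as five 32-bit unsigned integers (20 bytes total).

    # Illustration (assumed layout): pack the five SoftmaxConstBuffer fields as
    # little-endian uint32 values, which is what fixed-width uint32_t guarantees
    # on every platform, independent of whether the compiler defines `uint`.
    import struct

    def pack_softmax_const(n, h, w, c4, c_left):
        data = struct.pack('<5I', n, h, w, c4, c_left)  # five 32-bit unsigned ints
        assert len(data) == 20
        return data

    print(pack_softmax_const(1, 14, 14, 33, 2).hex())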