diff --git a/docs/.buildinfo b/docs/.buildinfo index ad50ab2..a2d394b 100644 --- a/docs/.buildinfo +++ b/docs/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 2a6c4a48cd36af00ff44d2fe2c5e2d69 +config: d23dc79ff23e0b7c0eb8540e45ddc148 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/.doctrees/environment.pickle b/docs/.doctrees/environment.pickle index 82aef1b..8ad1632 100644 Binary files a/docs/.doctrees/environment.pickle and b/docs/.doctrees/environment.pickle differ diff --git a/docs/.doctrees/recipes/amx.doctree b/docs/.doctrees/recipes/amx.doctree index d9dd517..7b74b0d 100644 Binary files a/docs/.doctrees/recipes/amx.doctree and b/docs/.doctrees/recipes/amx.doctree differ diff --git a/docs/.doctrees/recipes/compiling_optimizer.doctree b/docs/.doctrees/recipes/compiling_optimizer.doctree index 2b3c130..cdf78a7 100644 Binary files a/docs/.doctrees/recipes/compiling_optimizer.doctree and b/docs/.doctrees/recipes/compiling_optimizer.doctree differ diff --git a/docs/.doctrees/recipes/inference_tuning_on_aws_graviton.doctree b/docs/.doctrees/recipes/inference_tuning_on_aws_graviton.doctree index 9786bde..f54056b 100644 Binary files a/docs/.doctrees/recipes/inference_tuning_on_aws_graviton.doctree and b/docs/.doctrees/recipes/inference_tuning_on_aws_graviton.doctree differ diff --git a/docs/.doctrees/recipes/torch_compile_user_defined_triton_kernel_tutorial.doctree b/docs/.doctrees/recipes/torch_compile_user_defined_triton_kernel_tutorial.doctree index 6a3188e..4746861 100644 Binary files a/docs/.doctrees/recipes/torch_compile_user_defined_triton_kernel_tutorial.doctree and b/docs/.doctrees/recipes/torch_compile_user_defined_triton_kernel_tutorial.doctree differ diff --git a/docs/_downloads/0ccffddcfee1f815c02241b985844376/torch_compile_user_defined_triton_kernel_tutorial.py b/docs/_downloads/0ccffddcfee1f815c02241b985844376/torch_compile_user_defined_triton_kernel_tutorial.py index 68187d6..b5c0faf 100644 --- a/docs/_downloads/0ccffddcfee1f815c02241b985844376/torch_compile_user_defined_triton_kernel_tutorial.py +++ b/docs/_downloads/0ccffddcfee1f815c02241b985844376/torch_compile_user_defined_triton_kernel_tutorial.py @@ -1,49 +1,46 @@ # -*- coding: utf-8 -*- """ -Using User-Defined Triton Kernels with ``torch.compile`` +使用 ``torch.compile`` 和用户自定义的 Triton 内核 ========================================================= -**Author:** `Oguz Ulgen `_ +**作者:** `Oguz Ulgen `_ """ ###################################################################### -# User-defined Triton kernels can be used to optimize specific parts of your -# model's computation. These kernels are written in Triton's language, which is designed -# to make it easier to achieve peak hardware performance. By using user-defined Triton -# kernels with ``torch.compile``, you can integrate these optimized computations into -# your PyTorch model, potentially achieving significant performance improvements. +# 用户自定义的 Triton 内核可用于优化模型计算的特定部分。这些内核是用 Triton 语言编写的, +# 旨在更容易实现硬件的峰值性能。通过在 ``torch.compile`` 中使用用户自定义的 Triton 内核, +# 您可以将这些优化过的计算集成到 PyTorch 模型中,从而可能获得显著的性能提升。 # -# This recipes demonstrates how you can use user-defined Triton kernels with ``torch.compile``. 
+# 本教程演示了如何在 ``torch.compile`` 中使用用户自定义的 Triton 内核。 # -# Prerequisites +# 先决条件 # ------------------- # -# Before starting this recipe, make sure that you have the following: +# 在开始本教程之前,请确保您具备以下条件: # -# * Basic understanding of ``torch.compile`` and Triton. See: +# * 对 ``torch.compile`` 和 Triton 有基本的了解。参见: # -# * `torch.compiler API documentation `__ -# * `Introduction to torch.compile `__ -# * `Triton language documentation `__ +# * `torch.compiler API 文档 `__ +# * `torch.compile 介绍 `__ +# * `Triton 语言文档 `__ # -# * PyTorch 2.3 or later -# * A GPU that supports Triton +# * PyTorch 2.3 或更高版本 +# * 支持 Triton 的 GPU # import torch from torch.utils._triton import has_triton ###################################################################### -# Basic Usage +# 基本用法 # -------------------- # -# In this example, we will use a simple vector addition kernel from the Triton documentation -# with ``torch.compile``. -# For reference, see `Triton documentation `__. +# 在此示例中,我们将使用来自 Triton 文档的一个简单向量加法内核与 ``torch.compile``。 +# 参考 `Triton 文档 `__。 # if not has_triton(): - print("Skipping because triton is not supported on this device.") + print("由于此设备不支持 triton,因此跳过。") else: import triton from triton import language as tl @@ -76,26 +73,24 @@ def add_fn(x, y): x = torch.randn(4, device="cuda") y = torch.randn(4, device="cuda") out = add_fn(x, y) - print(f"Vector addition of\nX:\t{x}\nY:\t{y}\nis equal to\n{out}") + print(f"向量加法\nX:\t{x}\nY:\t{y}\n结果为\n{out}") ###################################################################### -# Advanced Usage +# 高级用法 # ------------------------------------------------------------------- # -# Triton's autotune feature is a powerful tool that automatically optimizes the configuration -# parameters of your Triton kernels. It explores a range of possible configurations and -# selects the one that delivers the best performance for your specific use case. +# Triton 的自动调优功能是一个强大的工具,可自动优化 Triton 内核的配置参数。 +# 它探索一系列可能的配置,并选择为您的特定用例提供最佳性能的配置。 # -# When used with ``torch.compile``, ``triton.autotune`` can help ensure that your PyTorch -# model is running as efficiently as possible. Here is an example of using ``torch.compile`` -# and ``triton.autotune``. +# 与 ``torch.compile`` 一起使用时, ``triton.autotune`` 可以帮助确保您的 PyTorch 模型以最高效的方式运行。 +# 下面是使用 ``torch.compile`` 和 ``triton.autotune`` 的示例。 # # .. note:: # -# ``torch.compile`` only supports configs and key arguments to ``triton.autotune``. +# ``torch.compile`` 仅支持 ``triton.autotune`` 的配置和关键参数。 if not has_triton(): - print("Skipping because triton is not supported on this device.") + print("由于此设备不支持 triton,因此跳过。") else: import triton from triton import language as tl @@ -137,35 +132,31 @@ def add_fn(x, y): x = torch.randn(4, device="cuda") y = torch.randn(4, device="cuda") out = add_fn(x, y) - print(f"Vector addition of\nX:\t{x}\nY:\t{y}\nis equal to\n{out}") + print(f"向量加法\nX:\t{x}\nY:\t{y}\n结果为\n{out}") ###################################################################### -# Composibility and Limitations +# 可组合性和限制 # -------------------------------------------------------------------- # -# As of PyTorch 2.3, the support for user-defined Triton kernels in ``torch.compile`` -# includes dynamic shapes, ``torch.autograd.Function``, JIT inductor, and AOT inductor. -# You can use these features together to build complex, high-performance models. 
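######################################################################
# (编辑者补充的示意代码,并非原教程或本次翻译的内容:假设上文"基本用法"中
# 定义的 ``add_kernel`` 以及 ``has_triton``、``torch`` 均已在作用域内,
# 下面粗略演示如何把用户自定义 Triton 内核包装进 ``torch.autograd.Function``,
# 再与 ``torch.compile`` 组合使用;``_AddFn`` 等名称仅为示例。)

if has_triton():
    import triton

    class _AddFn(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x, y):
            out = torch.empty_like(x)
            n_elements = out.numel()
            grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
            # 前向调用上文定义的向量加法内核
            add_kernel[grid](x, y, out, n_elements, BLOCK_SIZE=4)
            return out

        @staticmethod
        def backward(ctx, grad_out):
            # 加法对两个输入的梯度都是恒等映射
            return grad_out, grad_out

    @torch.compile(fullgraph=True)
    def add_fn_with_autograd(x, y):
        return _AddFn.apply(x, y)

    x = torch.randn(4, device="cuda", requires_grad=True)
    y = torch.randn(4, device="cuda", requires_grad=True)
    add_fn_with_autograd(x, y).sum().backward()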
+# 从 PyTorch 2.3 开始, ``torch.compile`` 中对用户自定义 Triton 内核的支持包括动态形状、 +# ``torch.autograd.Function``、JIT inductor 和 AOT inductor。 +# 您可以将这些功能组合在一起构建复杂的高性能模型。 # -# However, there are certain limitations to be aware of: +# 但是,也需要注意一些限制: # -# * **Tensor Subclasses:** Currently, there is no support for -# tensor subclasses and other advanced features. -# * **Triton Features:** While ``triton.heuristics`` can be used either standalone or -# before ``triton.autotune``, it cannot be used after ```triton.autotune``. This -# implies that if ``triton.heuristics`` and ``triton.autotune`` are to be used -# together, ``triton.heuristics`` must be used first. +# * **Tensor 子类:** 目前不支持张量子类和其他高级功能。 +# * **Triton 功能:** 虽然 ``triton.heuristics`` 可以单独使用或在 ``triton.autotune`` 之前使用, +# 但不能在 ``triton.autotune`` 之后使用。这意味着如果要一起使用 ``triton.heuristics`` 和 ``triton.autotune``, +# 则必须先使用 ``triton.heuristics``。 # -# Conclusion +# 结论 # ----------- -# In this recipe, we explored how to utilize user-defined Triton kernels -# with ``torch.compile``. We delved into the basic usage of a simple -# vector addition kernel and advanced usage involving Triton's autotune -# feature. We also discussed the composability of user-defined Triton -# kernels with other PyTorch features and highlighted some current limitations. +# 在本教程中,我们探讨了如何在 ``torch.compile`` 中使用用户自定义的 Triton 内核。 +# 我们深入研究了使用简单向量加法内核的基本用法,以及涉及 Triton 自动调优功能的高级用法。 +# 我们还讨论了用户自定义 Triton 内核与其他 PyTorch 功能的可组合性,并强调了一些当前的限制。 # -# See Also +# 另请参阅 # --------- # -# * `Compiling the Optimizers `__ -# * `Implementing High-Performance Transformers with Scaled Dot Product Attention `__ +# * `编译优化器 `__ +# * `使用缩放点积注意力实现高性能 Transformer `__ diff --git a/docs/_downloads/f827f181506a79226f4ffbcf7c9a5a50/torch_compile_user_defined_triton_kernel_tutorial.ipynb b/docs/_downloads/f827f181506a79226f4ffbcf7c9a5a50/torch_compile_user_defined_triton_kernel_tutorial.ipynb index 7e7032b..abcbc36 100644 --- a/docs/_downloads/f827f181506a79226f4ffbcf7c9a5a50/torch_compile_user_defined_triton_kernel_tutorial.ipynb +++ b/docs/_downloads/f827f181506a79226f4ffbcf7c9a5a50/torch_compile_user_defined_triton_kernel_tutorial.ipynb @@ -15,14 +15,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n# Using User-Defined Triton Kernels with ``torch.compile``\n**Author:** [Oguz Ulgen](https://github.com/oulgen)\n" + "\n# \u4f7f\u7528 ``torch.compile`` \u548c\u7528\u6237\u81ea\u5b9a\u4e49\u7684 Triton \u5185\u6838\n**\u4f5c\u8005:** [Oguz Ulgen](https://github.com/oulgen)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "User-defined Triton kernels can be used to optimize specific parts of your\nmodel's computation. These kernels are written in Triton's language, which is designed\nto make it easier to achieve peak hardware performance. By using user-defined Triton\nkernels with ``torch.compile``, you can integrate these optimized computations into\nyour PyTorch model, potentially achieving significant performance improvements.\n\nThis recipes demonstrates how you can use user-defined Triton kernels with ``torch.compile``.\n\n## Prerequisites\n\nBefore starting this recipe, make sure that you have the following:\n\n* Basic understanding of ``torch.compile`` and Triton. 
See:\n\n * [torch.compiler API documentation](https://pytorch.org/docs/stable/torch.compiler.html#torch-compiler)_\n * [Introduction to torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html)_\n * [Triton language documentation](https://triton-lang.org/main/index.html)_\n\n* PyTorch 2.3 or later\n* A GPU that supports Triton\n\n\n" + "\u7528\u6237\u81ea\u5b9a\u4e49\u7684 Triton \u5185\u6838\u53ef\u7528\u4e8e\u4f18\u5316\u6a21\u578b\u8ba1\u7b97\u7684\u7279\u5b9a\u90e8\u5206\u3002\u8fd9\u4e9b\u5185\u6838\u662f\u7528 Triton \u8bed\u8a00\u7f16\u5199\u7684,\n\u65e8\u5728\u66f4\u5bb9\u6613\u5b9e\u73b0\u786c\u4ef6\u7684\u5cf0\u503c\u6027\u80fd\u3002\u901a\u8fc7\u5728 ``torch.compile`` \u4e2d\u4f7f\u7528\u7528\u6237\u81ea\u5b9a\u4e49\u7684 Triton \u5185\u6838,\n\u60a8\u53ef\u4ee5\u5c06\u8fd9\u4e9b\u4f18\u5316\u8fc7\u7684\u8ba1\u7b97\u96c6\u6210\u5230 PyTorch \u6a21\u578b\u4e2d,\u4ece\u800c\u53ef\u80fd\u83b7\u5f97\u663e\u8457\u7684\u6027\u80fd\u63d0\u5347\u3002\n\n\u672c\u6559\u7a0b\u6f14\u793a\u4e86\u5982\u4f55\u5728 ``torch.compile`` \u4e2d\u4f7f\u7528\u7528\u6237\u81ea\u5b9a\u4e49\u7684 Triton \u5185\u6838\u3002\n\n## \u5148\u51b3\u6761\u4ef6\n\n\u5728\u5f00\u59cb\u672c\u6559\u7a0b\u4e4b\u524d,\u8bf7\u786e\u4fdd\u60a8\u5177\u5907\u4ee5\u4e0b\u6761\u4ef6:\n\n* \u5bf9 ``torch.compile`` \u548c Triton \u6709\u57fa\u672c\u7684\u4e86\u89e3\u3002\u53c2\u89c1:\n\n * [torch.compiler API \u6587\u6863](https://pytorch.org/docs/stable/torch.compiler.html#torch-compiler)_\n * [torch.compile \u4ecb\u7ecd](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html)_\n * [Triton \u8bed\u8a00\u6587\u6863](https://triton-lang.org/main/index.html)_\n\n* PyTorch 2.3 \u6216\u66f4\u9ad8\u7248\u672c\n* \u652f\u6301 Triton \u7684 GPU\n\n\n" ] }, { @@ -40,7 +40,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Basic Usage\n\nIn this example, we will use a simple vector addition kernel from the Triton documentation\nwith ``torch.compile``.\nFor reference, see [Triton documentation](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html)_.\n\n\n" + "## \u57fa\u672c\u7528\u6cd5\n\n\u5728\u6b64\u793a\u4f8b\u4e2d,\u6211\u4eec\u5c06\u4f7f\u7528\u6765\u81ea Triton \u6587\u6863\u7684\u4e00\u4e2a\u7b80\u5355\u5411\u91cf\u52a0\u6cd5\u5185\u6838\u4e0e ``torch.compile``\u3002\n\u53c2\u8003 [Triton \u6587\u6863](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html)_\u3002\n\n\n" ] }, { @@ -51,14 +51,14 @@ }, "outputs": [], "source": [ - "if not has_triton():\n print(\"Skipping because triton is not supported on this device.\")\nelse:\n import triton\n from triton import language as tl\n\n @triton.jit\n def add_kernel(\n in_ptr0,\n in_ptr1,\n out_ptr,\n n_elements,\n BLOCK_SIZE: \"tl.constexpr\",\n ):\n pid = tl.program_id(axis=0)\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n x = tl.load(in_ptr0 + offsets, mask=mask)\n y = tl.load(in_ptr1 + offsets, mask=mask)\n output = x + y\n tl.store(out_ptr + offsets, output, mask=mask)\n\n @torch.compile(fullgraph=True)\n def add_fn(x, y):\n output = torch.zeros_like(x)\n n_elements = output.numel()\n grid = lambda meta: (triton.cdiv(n_elements, meta[\"BLOCK_SIZE\"]),)\n add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=4)\n return output\n\n x = torch.randn(4, device=\"cuda\")\n y = torch.randn(4, device=\"cuda\")\n out = add_fn(x, y)\n print(f\"Vector addition of\\nX:\\t{x}\\nY:\\t{y}\\nis equal to\\n{out}\")" + "if not 
has_triton():\n print(\"\u7531\u4e8e\u6b64\u8bbe\u5907\u4e0d\u652f\u6301 triton,\u56e0\u6b64\u8df3\u8fc7\u3002\")\nelse:\n import triton\n from triton import language as tl\n\n @triton.jit\n def add_kernel(\n in_ptr0,\n in_ptr1,\n out_ptr,\n n_elements,\n BLOCK_SIZE: \"tl.constexpr\",\n ):\n pid = tl.program_id(axis=0)\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n x = tl.load(in_ptr0 + offsets, mask=mask)\n y = tl.load(in_ptr1 + offsets, mask=mask)\n output = x + y\n tl.store(out_ptr + offsets, output, mask=mask)\n\n @torch.compile(fullgraph=True)\n def add_fn(x, y):\n output = torch.zeros_like(x)\n n_elements = output.numel()\n grid = lambda meta: (triton.cdiv(n_elements, meta[\"BLOCK_SIZE\"]),)\n add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=4)\n return output\n\n x = torch.randn(4, device=\"cuda\")\n y = torch.randn(4, device=\"cuda\")\n out = add_fn(x, y)\n print(f\"\u5411\u91cf\u52a0\u6cd5\\nX:\\t{x}\\nY:\\t{y}\\n\u7ed3\u679c\u4e3a\\n{out}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Advanced Usage\n\nTriton's autotune feature is a powerful tool that automatically optimizes the configuration\nparameters of your Triton kernels. It explores a range of possible configurations and\nselects the one that delivers the best performance for your specific use case.\n\nWhen used with ``torch.compile``, ``triton.autotune`` can help ensure that your PyTorch\nmodel is running as efficiently as possible. Here is an example of using ``torch.compile``\nand ``triton.autotune``.\n\n

Note

``torch.compile`` only supports configs and key arguments to ``triton.autotune``.

\n\n" + "## \u9ad8\u7ea7\u7528\u6cd5\n\nTriton \u7684\u81ea\u52a8\u8c03\u4f18\u529f\u80fd\u662f\u4e00\u4e2a\u5f3a\u5927\u7684\u5de5\u5177,\u53ef\u81ea\u52a8\u4f18\u5316 Triton \u5185\u6838\u7684\u914d\u7f6e\u53c2\u6570\u3002\n\u5b83\u63a2\u7d22\u4e00\u7cfb\u5217\u53ef\u80fd\u7684\u914d\u7f6e,\u5e76\u9009\u62e9\u4e3a\u60a8\u7684\u7279\u5b9a\u7528\u4f8b\u63d0\u4f9b\u6700\u4f73\u6027\u80fd\u7684\u914d\u7f6e\u3002\n\n\u4e0e ``torch.compile`` \u4e00\u8d77\u4f7f\u7528\u65f6, ``triton.autotune`` \u53ef\u4ee5\u5e2e\u52a9\u786e\u4fdd\u60a8\u7684 PyTorch \u6a21\u578b\u4ee5\u6700\u9ad8\u6548\u7684\u65b9\u5f0f\u8fd0\u884c\u3002\n\u4e0b\u9762\u662f\u4f7f\u7528 ``torch.compile`` \u548c ``triton.autotune`` \u7684\u793a\u4f8b\u3002\n\n

Note

``torch.compile`` \u4ec5\u652f\u6301 ``triton.autotune`` \u7684\u914d\u7f6e\u548c\u5173\u952e\u53c2\u6570\u3002

\n\n" ] }, { @@ -69,14 +69,14 @@ }, "outputs": [], "source": [ - "if not has_triton():\n print(\"Skipping because triton is not supported on this device.\")\nelse:\n import triton\n from triton import language as tl\n\n @triton.autotune(\n configs=[\n triton.Config({\"BLOCK_SIZE\": 4}, num_stages=3, num_warps=8),\n triton.Config({\"BLOCK_SIZE\": 4}, num_stages=4, num_warps=4),\n triton.Config({\"BLOCK_SIZE\": 2}, num_stages=3, num_warps=8),\n triton.Config({\"BLOCK_SIZE\": 2}, num_stages=4, num_warps=4),\n ],\n key=[],\n )\n @triton.jit\n def add_kernel_autotuned(\n in_ptr0,\n in_ptr1,\n out_ptr,\n n_elements,\n BLOCK_SIZE: \"tl.constexpr\",\n ):\n pid = tl.program_id(axis=0)\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n x = tl.load(in_ptr0 + offsets, mask=mask)\n y = tl.load(in_ptr1 + offsets, mask=mask)\n output = x + y\n tl.store(out_ptr + offsets, output, mask=mask)\n\n @torch.compile(fullgraph=True)\n def add_fn(x, y):\n output = torch.zeros_like(x)\n n_elements = output.numel()\n grid = lambda meta: (triton.cdiv(n_elements, meta[\"BLOCK_SIZE\"]),)\n add_kernel_autotuned[grid](x, y, output, n_elements)\n return output\n\n x = torch.randn(4, device=\"cuda\")\n y = torch.randn(4, device=\"cuda\")\n out = add_fn(x, y)\n print(f\"Vector addition of\\nX:\\t{x}\\nY:\\t{y}\\nis equal to\\n{out}\")" + "if not has_triton():\n print(\"\u7531\u4e8e\u6b64\u8bbe\u5907\u4e0d\u652f\u6301 triton,\u56e0\u6b64\u8df3\u8fc7\u3002\")\nelse:\n import triton\n from triton import language as tl\n\n @triton.autotune(\n configs=[\n triton.Config({\"BLOCK_SIZE\": 4}, num_stages=3, num_warps=8),\n triton.Config({\"BLOCK_SIZE\": 4}, num_stages=4, num_warps=4),\n triton.Config({\"BLOCK_SIZE\": 2}, num_stages=3, num_warps=8),\n triton.Config({\"BLOCK_SIZE\": 2}, num_stages=4, num_warps=4),\n ],\n key=[],\n )\n @triton.jit\n def add_kernel_autotuned(\n in_ptr0,\n in_ptr1,\n out_ptr,\n n_elements,\n BLOCK_SIZE: \"tl.constexpr\",\n ):\n pid = tl.program_id(axis=0)\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n x = tl.load(in_ptr0 + offsets, mask=mask)\n y = tl.load(in_ptr1 + offsets, mask=mask)\n output = x + y\n tl.store(out_ptr + offsets, output, mask=mask)\n\n @torch.compile(fullgraph=True)\n def add_fn(x, y):\n output = torch.zeros_like(x)\n n_elements = output.numel()\n grid = lambda meta: (triton.cdiv(n_elements, meta[\"BLOCK_SIZE\"]),)\n add_kernel_autotuned[grid](x, y, output, n_elements)\n return output\n\n x = torch.randn(4, device=\"cuda\")\n y = torch.randn(4, device=\"cuda\")\n out = add_fn(x, y)\n print(f\"\u5411\u91cf\u52a0\u6cd5\\nX:\\t{x}\\nY:\\t{y}\\n\u7ed3\u679c\u4e3a\\n{out}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Composibility and Limitations\n\nAs of PyTorch 2.3, the support for user-defined Triton kernels in ``torch.compile``\nincludes dynamic shapes, ``torch.autograd.Function``, JIT inductor, and AOT inductor.\nYou can use these features together to build complex, high-performance models.\n\nHowever, there are certain limitations to be aware of:\n\n* **Tensor Subclasses:** Currently, there is no support for\n tensor subclasses and other advanced features.\n* **Triton Features:** While ``triton.heuristics`` can be used either standalone or\n before ``triton.autotune``, it cannot be used after ```triton.autotune``. 
This\n implies that if ``triton.heuristics`` and ``triton.autotune`` are to be used\n together, ``triton.heuristics`` must be used first.\n\n## Conclusion\nIn this recipe, we explored how to utilize user-defined Triton kernels\nwith ``torch.compile``. We delved into the basic usage of a simple\nvector addition kernel and advanced usage involving Triton's autotune\nfeature. We also discussed the composability of user-defined Triton\nkernels with other PyTorch features and highlighted some current limitations.\n\n## See Also\n\n* [Compiling the Optimizers](https://pytorch.org/tutorials/recipes/compiling_optimizer.html)_\n* [Implementing High-Performance Transformers with Scaled Dot Product Attention](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html)_\n\n" + "## \u53ef\u7ec4\u5408\u6027\u548c\u9650\u5236\n\n\u4ece PyTorch 2.3 \u5f00\u59cb, ``torch.compile`` \u4e2d\u5bf9\u7528\u6237\u81ea\u5b9a\u4e49 Triton \u5185\u6838\u7684\u652f\u6301\u5305\u62ec\u52a8\u6001\u5f62\u72b6\u3001\n``torch.autograd.Function``\u3001JIT inductor \u548c AOT inductor\u3002\n\u60a8\u53ef\u4ee5\u5c06\u8fd9\u4e9b\u529f\u80fd\u7ec4\u5408\u5728\u4e00\u8d77\u6784\u5efa\u590d\u6742\u7684\u9ad8\u6027\u80fd\u6a21\u578b\u3002\n\n\u4f46\u662f,\u4e5f\u9700\u8981\u6ce8\u610f\u4e00\u4e9b\u9650\u5236:\n\n* **Tensor \u5b50\u7c7b:** \u76ee\u524d\u4e0d\u652f\u6301\u5f20\u91cf\u5b50\u7c7b\u548c\u5176\u4ed6\u9ad8\u7ea7\u529f\u80fd\u3002\n* **Triton \u529f\u80fd:** \u867d\u7136 ``triton.heuristics`` \u53ef\u4ee5\u5355\u72ec\u4f7f\u7528\u6216\u5728 ``triton.autotune`` \u4e4b\u524d\u4f7f\u7528,\n \u4f46\u4e0d\u80fd\u5728 ``triton.autotune`` \u4e4b\u540e\u4f7f\u7528\u3002\u8fd9\u610f\u5473\u7740\u5982\u679c\u8981\u4e00\u8d77\u4f7f\u7528 ``triton.heuristics`` \u548c ``triton.autotune``,\n \u5219\u5fc5\u987b\u5148\u4f7f\u7528 ``triton.heuristics``\u3002\n\n## \u7ed3\u8bba\n\u5728\u672c\u6559\u7a0b\u4e2d,\u6211\u4eec\u63a2\u8ba8\u4e86\u5982\u4f55\u5728 ``torch.compile`` \u4e2d\u4f7f\u7528\u7528\u6237\u81ea\u5b9a\u4e49\u7684 Triton \u5185\u6838\u3002\n\u6211\u4eec\u6df1\u5165\u7814\u7a76\u4e86\u4f7f\u7528\u7b80\u5355\u5411\u91cf\u52a0\u6cd5\u5185\u6838\u7684\u57fa\u672c\u7528\u6cd5,\u4ee5\u53ca\u6d89\u53ca Triton \u81ea\u52a8\u8c03\u4f18\u529f\u80fd\u7684\u9ad8\u7ea7\u7528\u6cd5\u3002\n\u6211\u4eec\u8fd8\u8ba8\u8bba\u4e86\u7528\u6237\u81ea\u5b9a\u4e49 Triton \u5185\u6838\u4e0e\u5176\u4ed6 PyTorch \u529f\u80fd\u7684\u53ef\u7ec4\u5408\u6027,\u5e76\u5f3a\u8c03\u4e86\u4e00\u4e9b\u5f53\u524d\u7684\u9650\u5236\u3002\n\n## \u53e6\u8bf7\u53c2\u9605\n\n* [\u7f16\u8bd1\u4f18\u5316\u5668](https://pytorch.org/tutorials/recipes/compiling_optimizer.html)_\n* [\u4f7f\u7528\u7f29\u653e\u70b9\u79ef\u6ce8\u610f\u529b\u5b9e\u73b0\u9ad8\u6027\u80fd Transformer](https://pytorch.org/tutorials/intermediate/scaled_dot_product_attention_tutorial.html)_\n\n" ] } ], diff --git a/docs/_sources/recipes/amx.rst.txt b/docs/_sources/recipes/amx.rst.txt index 459e7c5..e09b3e7 100644 --- a/docs/_sources/recipes/amx.rst.txt +++ b/docs/_sources/recipes/amx.rst.txt @@ -1,38 +1,31 @@ ============================================== -Leverage Intel® Advanced Matrix Extensions +利用英特尔®高级矩阵扩展(Intel® Advanced Matrix Extensions) ============================================== -Introduction -============ +简介 +==== -Advanced Matrix Extensions (AMX), also known as Intel® Advanced Matrix Extensions (Intel® AMX), is an x86 extension, -which introduce two new components: a 2-dimensional register file called 'tiles' and an accelerator of Tile Matrix 
Multiplication (TMUL) that is able to operate on those tiles. -AMX is designed to work on matrices to accelerate deep-learning training and inference on the CPU and is ideal for workloads like natural-language processing, recommendation systems and image recognition. +高级矩阵扩展(AMX),也称为英特尔®高级矩阵扩展(Intel® AMX),是一种x86扩展,引入了两个新组件:一个称为"tile"的二维寄存器文件和一个能够在这些tile上进行矩阵乘法(TMUL)的加速器。AMX旨在加速CPU上的深度学习训练和推理工作负载,非常适合自然语言处理、推荐系统和图像识别等工作负载。 -Intel advances AI capabilities with 4th Gen Intel® Xeon® Scalable processors and Intel® AMX, delivering 3x to 10x higher inference and training performance versus the previous generation, see `Accelerate AI Workloads with Intel® AMX`_. -Compared to 3rd Gen Intel Xeon Scalable processors running Intel® Advanced Vector Extensions 512 Neural Network Instructions (Intel® AVX-512 VNNI), -4th Gen Intel Xeon Scalable processors running Intel AMX can perform 2,048 INT8 operations per cycle, rather than 256 INT8 operations per cycle. They can also perform 1,024 BF16 operations per cycle, as compared to 64 FP32 operations per cycle, see page 4 of `Accelerate AI Workloads with Intel® AMX`_. -For more detailed information of AMX, see `Intel® AMX Overview`_. +英特尔通过第4代英特尔®至强®可扩展处理器和英特尔®AMX推进了AI能力,相比上一代产品,推理和训练性能提高了3倍至10倍,详见`使用Intel® AMX加速AI工作负载`_。与运行Intel®高级矢量扩展512神经网络指令(Intel® AVX-512 VNNI)的第3代英特尔至强可扩展处理器相比,运行Intel AMX的第4代英特尔至强可扩展处理器每周期可执行2,048个INT8操作,而不是256个INT8操作;它们还可以每周期执行1,024个BF16操作,而不是64个FP32操作,详见`使用Intel® AMX加速AI工作负载`_第4页。有关AMX的更多详细信息,请参阅`Intel® AMX概述`_。 - -AMX in PyTorch +PyTorch中的AMX ============== -PyTorch leverages AMX for computing intensive operators with BFloat16 and quantization with INT8 by its backend oneDNN -to get higher performance out-of-box on x86 CPUs with AMX support. -For more detailed information of oneDNN, see `oneDNN`_. +PyTorch通过其后端oneDNN利用AMX来计算BFloat16和INT8量化的计算密集型算子,从而在支持AMX的x86 CPU上获得更高的性能。 +有关oneDNN的更多详细信息,请参阅`oneDNN`_。 -The operation is fully handled by oneDNN according to the execution code path generated. For example, when a supported operation gets executed into oneDNN implementation on a hardware platform with AMX support, AMX instructions will be invoked automatically inside oneDNN. -Since oneDNN is the default acceleration library for PyTorch CPU, no manual operations are required to enable the AMX support. +操作完全由oneDNN根据生成的执行代码路径处理。例如,当支持的操作在支持AMX的硬件平台上执行到oneDNN实现时,AMX指令将在oneDNN内部自动调用。 +由于oneDNN是PyTorch CPU的默认加速库,因此无需手动操作即可启用AMX支持。 -Guidelines of leveraging AMX with workloads +利用AMX加速工作负载的指南 ------------------------------------------- -This section provides guidelines on how to leverage AMX with various workloads. +本节提供了如何利用AMX加速各种工作负载的指南。 -- BFloat16 data type: +- BFloat16数据类型: - - Using ``torch.cpu.amp`` or ``torch.autocast("cpu")`` would utilize AMX acceleration for supported operators. + - 使用``torch.cpu.amp``或``torch.autocast("cpu")``将利用AMX加速支持的算子。 :: @@ -40,23 +33,23 @@ This section provides guidelines on how to leverage AMX with various workloads. with torch.cpu.amp.autocast(): output = model(input) -.. note:: Use ``torch.channels_last`` memory format to get better performance. +.. note:: 使用``torch.channels_last``内存格式可获得更好的性能。 -- Quantization: +- 量化: - - Applying quantization would utilize AMX acceleration for supported operators. + - 应用量化将利用AMX加速支持的算子。 - torch.compile: - - When the generated graph model runs into oneDNN implementations with the supported operators, AMX accelerations will be activated. + - 当生成的图模型运行到oneDNN实现的支持算子时,AMX加速将被激活。 -.. 
note:: When using PyTorch on CPUs that support AMX, the framework will automatically enable AMX usage by default. This means that PyTorch will attempt to leverage the AMX feature whenever possible to speed up matrix multiplication operations. However, it's important to note that the decision to dispatch to the AMX kernel ultimately depends on the internal optimization strategy of the oneDNN library and the quantization backend, which PyTorch relies on for performance enhancements. The specific details of how AMX utilization is handled internally by PyTorch and the oneDNN library may be subject to change with updates and improvements to the framework. +.. note:: 在支持AMX的CPU上使用PyTorch时,框架将默认自动启用AMX使用。这意味着PyTorch将尽可能利用AMX功能来加速矩阵乘法操作。但是,重要的是要注意,是否调度到AMX内核最终取决于PyTorch所依赖的oneDNN库和量化后端的内部优化策略。PyTorch和oneDNN库内部如何处理AMX利用的具体细节可能会随着框架的更新和改进而发生变化。 -CPU operators that can leverage AMX: +可利用AMX的CPU算子: ------------------------------------ -BF16 CPU ops that can leverage AMX: +可利用AMX的BF16 CPU算子: - ``conv1d`` - ``conv2d`` @@ -72,7 +65,7 @@ BF16 CPU ops that can leverage AMX: - ``linear`` - ``matmul`` -Quantization CPU ops that can leverage AMX: +可利用AMX的量化CPU算子: - ``conv1d`` - ``conv2d`` @@ -84,10 +77,10 @@ Quantization CPU ops that can leverage AMX: -Confirm AMX is being utilized +确认AMX正在被利用 ------------------------------ -Set environment variable ``export ONEDNN_VERBOSE=1``, or use ``torch.backends.mkldnn.verbose`` to enable oneDNN to dump verbose messages. +设置环境变量``export ONEDNN_VERBOSE=1``或使用``torch.backends.mkldnn.verbose``以启用oneDNN转储详细消息。 :: @@ -95,7 +88,7 @@ Set environment variable ``export ONEDNN_VERBOSE=1``, or use ``torch.backends.mk with torch.cpu.amp.autocast(): model(input) -For example, get oneDNN verbose: +例如,获取oneDNN详细输出: :: @@ -111,24 +104,21 @@ For example, get oneDNN verbose: onednn_verbose,exec,cpu,matmul,brg:avx512_core_amx_int8,undef,src_s8::blocked:ab:f0 wei_s8:p:blocked:BA16a64b4a:f0 dst_s8::blocked:ab:f0,attr-scratchpad:user ,,1x30522:30522x768:1x768,7.66382 ... -If you get the verbose of ``avx512_core_amx_bf16`` for BFloat16 or ``avx512_core_amx_int8`` for quantization with INT8, it indicates that AMX is activated. +如果你获得了``avx512_core_amx_bf16``的详细输出(用于BFloat16)或``avx512_core_amx_int8``(用于INT8量化),则表示AMX已被激活。 -Conclusion +结论 ---------- +在本教程中,我们简要介绍了AMX、如何在PyTorch中利用AMX来加速工作负载,以及如何确认AMX正在被利用。 -In this tutorial, we briefly introduced AMX, how to utilize AMX in PyTorch to accelerate workloads, and how to confirm that AMX is being utilized. - -With the improvements and updates of PyTorch and oneDNN, the utilization of AMX may be subject to change accordingly. +随着PyTorch和oneDNN的改进和更新,AMX的利用情况可能会相应发生变化。 -As always, if you run into any problems or have any questions, you can use -`forum `_ or `GitHub issues -`_ to get in touch. +如果您遇到任何问题或有任何疑问,您可以使用`论坛 `_或`GitHub issues `_与我们联系。 -.. _Accelerate AI Workloads with Intel® AMX: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/ai-solution-brief.html +.. _使用Intel® AMX加速AI工作负载: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/ai-solution-brief.html -.. _Intel® AMX Overview: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/overview.html +.. _Intel® AMX概述: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/overview.html .. 
_oneDNN: https://oneapi-src.github.io/oneDNN/index.html diff --git a/docs/_sources/recipes/compiling_optimizer.rst.txt b/docs/_sources/recipes/compiling_optimizer.rst.txt index 951495c..702dace 100644 --- a/docs/_sources/recipes/compiling_optimizer.rst.txt +++ b/docs/_sources/recipes/compiling_optimizer.rst.txt @@ -1,25 +1,21 @@ -(beta) Compiling the optimizer with torch.compile +(beta) 使用 torch.compile 编译优化器 ========================================================================================== -**Author:** `Michael Lazos `_ +**作者:** `Michael Lazos `_ -The optimizer is a key algorithm for training any deep learning model. -Since it is responsible for updating every model parameter, it can often -become the bottleneck in training performance for large models. In this recipe, -we will apply ``torch.compile`` to the optimizer to observe the GPU performance -improvement. +优化器是训练任何深度学习模型的关键算法。由于它负责更新每个模型参数,因此对于大型模型,它往往会成为训练性能的瓶颈。 +在本教程中,我们将对优化器应用 ``torch.compile``,以观察其带来的 GPU 性能提升。 .. note:: - This tutorial requires PyTorch 2.2.0 or later. + 本教程需要 PyTorch 2.2.0 或更高版本。 -Model Setup +模型设置 ~~~~~~~~~~~~~~~~~~~~~ -For this example, we'll use a simple sequence of linear layers. -Since we are only benchmarking the optimizer, the choice of model doesn't matter -because optimizer performance is a function of the number of parameters. +对于本例,我们将使用一个简单的线性层序列。由于我们只是对优化器进行基准测试,所选择的模型并不重要, +因为优化器的性能取决于模型参数的数量。 -Depending on what machine you are using, your exact results may vary. +根据您使用的机器不同,结果可能会有所不同。 .. code-block:: python @@ -32,19 +28,17 @@ Depending on what machine you are using, your exact results may vary. output = model(input) output.sum().backward() -Setting up and running the optimizer benchmark +设置和运行优化器基准测试 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In this example, we'll use the Adam optimizer -and create a helper function to wrap the step() -in ``torch.compile()``. +在本例中,我们将使用 Adam 优化器,并创建一个辅助函数来将 step() 包装在 ``torch.compile()`` 中。 .. note:: - ``torch.compile`` is only supported on cuda devices with compute capability >= 7.0 + ``torch.compile`` 仅支持计算能力(compute capability)>= 7.0 的 CUDA 设备 .. code-block:: python - # exit cleanly if we are on a device that doesn't support torch.compile + # 如果我们在不支持 torch.compile 的设备上,则干净地退出 if torch.cuda.get_device_capability() < (7, 0): print("Exiting because torch.compile is not supported on this device.") import sys @@ -59,7 +53,7 @@ in ``torch.compile()``. opt.step() - # Let's define a helpful benchmarking function: + # 让我们定义一个有用的基准测试函数: import torch.utils.benchmark as benchmark @@ -70,7 +64,7 @@ in ``torch.compile()``. return t0.blocked_autorange().mean * 1e6 - # Warmup runs to compile the function + # 预热运行以编译函数 for _ in range(5): fn() @@ -82,13 +76,13 @@ 
print(f"eager runtime: {eager_runtime}us") print(f"compiled runtime: {compiled_runtime}us") -Sample Results: +示例结果: * Eager runtime: 747.2437149845064us * Compiled runtime: 392.07384741178us -See Also +另请参阅 ~~~~~~~~~ -* For an in-depth technical overview, see -`Compiling the optimizer with PT2 `__ +* 有关深入的技术概述,请参阅 +`使用 PT2 编译优化器 `__ diff --git a/docs/_sources/recipes/inference_tuning_on_aws_graviton.rst.txt b/docs/_sources/recipes/inference_tuning_on_aws_graviton.rst.txt index 08d3515..949fa30 100644 --- a/docs/_sources/recipes/inference_tuning_on_aws_graviton.rst.txt +++ b/docs/_sources/recipes/inference_tuning_on_aws_graviton.rst.txt @@ -1,38 +1,40 @@ -(Beta) PyTorch Inference Performance Tuning on AWS Graviton Processors + + +(Beta) PyTorch在AWS Graviton处理器上的推理性能优化 ====================================================================== -**Author**: `Sunita Nadampalli `_ +**作者**: `Sunita Nadampalli `_ -`AWS Graviton `_ is a series of ARM-based processors designed by AWS. AWS Graviton3 processors are optimized for Machine Learning (ML) workloads, including support for ``bfloat16``, Scalable Vector Extension (SVE) and twice the Single Instruction Multiple Data (SIMD) bandwidth compared to Graviton2. +`AWS Graviton `_ 是一系列由AWS设计的基于ARM的处理器。AWS Graviton3处理器针对机器学习(ML)工作负载进行了优化,包括支持 ``bfloat16``、可扩展向量扩展(SVE)以及比Graviton2高两倍的单指令多数据(SIMD)带宽。 -PyTorch provides native reference ATen kernels for the machine learning operators like convolutions, matmul, relu, etc. These operators can be accelerated with platform specific kernel implementations from Basic Linear Algebra (BLAS) libraries. On AWS Graviton CPUs, MKLDNN with Arm Compute Library (`ACL `_) and `OpenBLAS `_ libraries provide optimized implementations for a subset of the operators. Both these libraries are integrated into PyTorch with PyTorch 2.0 version. +PyTorch为机器学习算子(如卷积、矩阵乘法、relu等)提供了原生参考ATen内核。这些算子可以通过来自基本线性代数(BLAS)库的特定于平台的内核实现进行加速。在AWS Graviton CPU上,MKLDNN与Arm Compute Library (`ACL `_) 和 `OpenBLAS `_ 库为一部分算子提供了优化实现。从PyTorch 2.0版本开始,这两个库都集成到了PyTorch中。 -In this tutorial we will cover how to achieve the best inference performance for linear layer neural network on AWS Graviton3 CPUs (`AWS c7g instance `_) with ``bfloa16`` kernels and with the right backend selection. +在本教程中,我们将介绍如何通过 ``bfloat16`` 内核和正确的后端选择,在AWS Graviton3 CPU (`AWS c7g实例 `_) 上实现线性层神经网络的最佳推理性能。 -Contents +内容 -------- -1. Basic Usage -2. Speed up inference with Bfloat16 fast math kernels -3. Improve inference performance with OpenBLAS for smaller batch dimensions -4. Optimize memory allocation overhead with Linux Transparent huge pages -5. Conclusion +1. 基本用法 +2. 使用Bfloat16快速数学内核加速推理 +3. 对于较小的批次维度,使用OpenBLAS提高推理性能 +4. 使用Linux透明大页优化内存分配开销 +5. 总结 .. note:: - To successfully run this tutorial and reproduce the speedup numbers shown below, you need an instance from the Graviton3 family (``c7g/r7g/m7g``) of hardware. For this tutorial, we used the `c7g.xl (4vcpu) instance `_ . + 要成功运行本教程并重现下面显示的加速数字,您需要来自Graviton3系列(``c7g/r7g/m7g``)的硬件实例。对于本教程,我们使用了 `c7g.xl (4vcpu)实例 `_ 。 -Basic Usage +基本用法 --------------- -PyTorch natively supports AWS Graviton3 optimizations starting with PyTorch 2.0 version. -Please refer to this `blog `_ for more details on the optimizations. +从PyTorch 2.0版本开始,PyTorch原生支持AWS Graviton3优化。 +更多详细信息请参阅此 `博客 `_。 -1. Install PyTorch by running the following command: +1. 运行以下命令安装PyTorch: .. code-block:: python3 -m pip install torch -2. We will start by importing the required dependencies and defining the device will run on: +2. 我们将从导入所需的依赖项并定义将在其上运行的设备开始: .. 
code-block:: python @@ -45,7 +47,7 @@ Please refer to this `blog `_. Arm Compute Library (`ACL `_) provides optimized ``bfloat16`` General Matrix Multiplication (GEMM) kernels for AWS Graviton processors, and are integrated into PyTorch via MKLDNN backend starting with PyTorch 2.0. The inference performance can be optimized with the fast math GEMM kernels. The fast math mode is not enabled by default because these kernels perform GEMM in ``bfloat16`` precision instead of ``float``, and hence results in a slight drop in the model inference accuracy. However, the accuracy drop is within the ``cosine similarity`` threshold defined for ``bfloat16`` backend in ``torchbench`` test suite, and hence acceptable for majority of the applications. To enable the fast math GEMM kernels, set the following environment variable: +AWS Graviton3处理器支持 `bfloat16 MMLA指令 `_。Arm Compute Library (`ACL `_) 为AWS Graviton处理器提供了优化的 ``bfloat16`` 通用矩阵乘法(GEMM)内核,并从PyTorch 2.0版本开始通过MKLDNN后端集成到PyTorch中。可以使用快速数学GEMM内核优化推理性能。默认情况下不启用快速数学模式,因为这些内核以 ``bfloat16`` 精度而不是 ``float`` 执行GEMM,因此会导致模型推理精度略有下降。但是,精度下降在 ``torchbench`` 测试套件中为 ``bfloat16`` 后端定义的 ``余弦相似度`` 阈值范围内,因此对大多数应用程序来说是可以接受的。要启用快速数学GEMM内核,请设置以下环境变量: .. code-block:: bash $ export DNNL_DEFAULT_FPMATH_MODE=BF16 -When you run the above inference script, you should see the following profiler output with the MKLDNN fast math mode enabled: +当您运行上述推理脚本时,应该会看到启用MKLDNN快速数学模式后的分析器输出: .. table:: :widths: auto @@ -160,16 +162,16 @@ When you run the above inference script, you should see the following profiler o **Self CPU time total:** 7.262s -This is around ``2x (7.262s vs 16.201s)`` performance improvement with the ``bfloat16`` fastmath kernels. Next, let’s look at the smaller batch dimension scenario. +这比默认配置快约 ``2倍 (7.262s vs 16.201s)``。接下来,让我们看看较小批次维度的情况。 -**Scenario 2:** A smaller batch dimension, for example, 32: +**场景 2:** 较小的批量维度,例如 32: .. code-block:: python X = torch.rand(32, 64, 64, device=device) with torch.set_grad_enabled(False): for _ in range(50): - model(X) #Warmup + model(X) #预热 with profile(activities=[ProfilerActivity.CPU]) as prof: with record_function("mymodel_inference"): for _ in range(100): @@ -178,13 +180,13 @@ This is around ``2x (7.262s vs 16.201s)`` performance improvement with the ``bfl print(prof.key_averages().table(sort_by="self_cpu_time_total")) -You should see the following profiler output when the above script is run with the PyTorch default configuration: +使用 PyTorch 默认配置运行上述脚本时,您应该会看到以下 profiler 输出: .. table:: :widths: auto ====================== ============= ============ ============ ============ ============ ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + 名称 自身 CPU % 自身 CPU CPU 总计 % CPU 总计 CPU 平均时间 调用次数 ====================== ============= ============ ============ ============ ============ ============ aten::addmm 95.51% 5.821s 97.04% 5.914s 19.713ms 300 aten::clamp_min 2.33% 142.244ms 2.33% 142.244ms 711.220us 200 @@ -195,10 +197,10 @@ You should see the following profiler output when the above script is run with t aten::relu 0.03% 1.942ms 2.37% 144.186ms 720.930us 200 ====================== ============= ============ ============ ============ ============ ============ -**Self CPU time total:** 6.094s +**自身 CPU 总计:** 6.094s -The following output is the profiler output when run with the MKLDNN fast math mode enabled: +以下是启用 MKLDNN 快速数学模式时的 profiler 输出: .. 
code-block:: bash @@ -208,7 +210,7 @@ The following output is the profiler output when run with the MKLDNN fast math m :widths: auto ====================== ============ ============ ============ ============ ============ ============= - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + 名称 自身 CPU % 自身 CPU CPU 总计 % CPU 总计 CPU 平均时间 调用次数 ====================== ============ ============ ============ ============ ============ ============= aten::addmm 93.31% 3.848s 95.66% 3.944s 13.148ms 300 aten::clamp_min 3.43% 141.309ms 3.43% 141.309ms 706.545us 200 @@ -219,27 +221,29 @@ The following output is the profiler output when run with the MKLDNN fast math m aten::relu 0.05% 1.928ms 3.47% 143.237ms 716.185us 200 ====================== ============ ============ ============ ============ ============ ============= -**Self CPU time total:** 4.123s +**自身 CPU 总计:** 4.123s -The MKLDNN fast math mode yields approximately a **1.47x (4.123s vs 6.094s)** performance improvement for smaller batch dimensions. Although this improvement is noteworthy, the overall performance still leaves room for improvement. This is because of the runtime overhead (weights reorders and kernel launch time) from oneDNN and ACL backend outweighing the compute benefits from the ACL GEMM kernels for the smaller batch compute. +MKLDNN 快速数学模式为较小的批量维度提供了大约 **1.47x (4.123s vs 6.094s)** 的性能提升。 +尽管性能提升明显,但整体仍有提升空间。因为来自 oneDNN 和 ACL 后端的运行时开销(权重重排和内核启动时间) +超过了 ACL GEMM 内核对较小批量计算的计算优势。 -Improve Inference Performance with OpenBLAS for Smaller Batch Dimensions ------------------------------------------------------------------------- +使用 OpenBLAS 提高较小批量维度的推理性能 +---------------------------------------- -The inference performance for smaller batch dimensions can be improved by offloading the smaller shapes from MKLDNN to OpenBLAS backend. We are working on making the backend selection automatic, with robust heuristics, for the future releases. Till the heuristics are implemented, the smaller shapes can be offloaded to OpenBLAS by increasing the threshold for MKLDNN backend selection. In the following example, we use ``64`` as the threshold, so that input with ``batch dimension of 32`` is not dispatched to MKLDNN. Instead, it is dispatched to OpenBLAS. +可以通过将较小的形状从 MKLDNN 卸载到 OpenBLAS 后端来提高较小批量维度的推理性能。我们正在努力为未来版本实现自动化的后端选择,并具有健壮的启发式算法。在实现启发式算法之前,可以通过增加 MKLDNN 后端选择的阈值将较小的形状卸载到 OpenBLAS。在以下示例中,我们使用 ``64`` 作为阈值,因此批量维度为 ``32`` 的输入不会分派到 MKLDNN。相反,它会被分派到 OpenBLAS。 .. code-block:: bash $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64 -Here is the profiler output with OpenBLAS backend: +以下是使用 OpenBLAS 后端时的 profiler 输出: .. table:: :widths: auto ====================== ============ ============ ============ ============= ============ ============= - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + 名称 自身 CPU % 自身 CPU CPU 总计 % CPU 总计 CPU 平均时间 调用次数 ====================== ============ ============ ============ ============= ============ ============= aten::addmm 96.25% 1.958s 97.51% 1.984s 6.612ms 300 aten::clamp_min 1.28% 26.124ms 1.28% 26.124ms 130.620us 200 @@ -250,17 +254,18 @@ Here is the profiler output with OpenBLAS backend: aten::relu 0.06% 1.258ms 1.35% 27.382ms 136.910us 200 ====================== ============ ============ ============ ============= ============ ============= -**Self CPU time total:** 2.034s +**自身 CPU 总计:** 2.034s -As you can see above, switching to OpenBLAS doubled the performance **(2.034s vs 4.123s)** compared to the default MKLDNN backend configuration. 
This becomes significant for even smaller batch dimensions, for example, for a batch dimension of 10: +如您所见,切换到 OpenBLAS 将性能提高了一倍 **(2.034s vs 4.123s)** 与默认的 MKLDNN 后端配置相比。 +对于更小的批量维度,例如批量维度为 10,这一点更加显著: .. code-block:: python X = torch.rand(10, 64, 64, device=device) with torch.set_grad_enabled(False): for _ in range(50): - model(X) #Warmup + model(X) #预热 with profile(activities=[ProfilerActivity.CPU]) as prof: with record_function("mymodel_inference"): for _ in range(100): @@ -269,13 +274,13 @@ As you can see above, switching to OpenBLAS doubled the performance **(2.034s vs print(prof.key_averages().table(sort_by="self_cpu_time_total")) -The following is the profiler output with MKLDNN fast math mode: +以下是启用 MKLDNN 快速数学模式时的 profiler 输出: .. table:: :widths: auto ====================== ============ ============ ============ ============ ============= ============= - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + 名称 自身 CPU % 自身 CPU CPU 总计 % CPU 总计 CPU 平均时间 调用次数 ====================== ============ ============ ============ ============ ============= ============= aten::addmm 87.81% 3.613s 91.90% 3.781s 12.604ms 300 aten::clamp_min 7.18% 295.437ms 7.18% 295.437ms 1.477ms 200 @@ -286,10 +291,10 @@ The following is the profiler output with MKLDNN fast math mode: aten::relu 0.05% 1.932ms 7.23% 297.369ms 1.487ms 200 ====================== ============ ============ ============ ============ ============= ============= -**Self CPU time total:** 4.115s +**自身 CPU 总计:** 4.115s -and the following is the profiler output with the OpenBLAS backend: +以下是使用 OpenBLAS 后端时的 profiler 输出: .. code-block:: bash @@ -299,7 +304,7 @@ and the following is the profiler output with the OpenBLAS backend: :widths: auto ====================== ============= ============ ============ ============ ============= ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + 名称 自身 CPU % 自身 CPU CPU 总计 % CPU 总计 CPU 平均时间 调用次数 ====================== ============= ============ ============ ============ ============= ============ aten::addmm 92.66% 1.179s 95.23% 1.211s 4.038ms 300 aten::clamp_min 2.83% 36.060ms 2.83% 36.060ms 180.300us 200 @@ -310,29 +315,28 @@ and the following is the profiler output with the OpenBLAS backend: aten::relu 0.10% 1.285ms 2.94% 37.345ms 186.725us 200 ====================== ============= ============ ============ ============ ============= ============ -**Self CPU time total:** 1.272s - +**自身 CPU 总计:** 1.272s -Here we observed **3.2x (1.272s vs 4.115s)** performance improvement by tuning the backend thresholds appropriately. +这里我们观察到通过适当调整后端阈值,**性能提高了3.2倍(1.272s vs 4.115s)**。 +使用 Linux Transparent Huge Pages (THP) 优化内存分配开销 +------------------------------------------------------ -Optimize Memory Allocation Overhead with Linux Transparent Huge Pages (THP) ---------------------------------------------------------------------------- - -We also observed that for these larger networks, tensor memory allocations take significant portion of the inference latency. This can be optimized by enabling Linux transparent huge page allocations from PyTorch C10 memory allocator. Currently the feature is not enabled by default because it will increase the memory footprint marginally. Set the following environment variable to enable it: +我们还观察到,对于这些较大的网络,张量内存分配占推理延迟的很大一部分。这可以通过从PyTorch C10内存分配器 +启用 THP 来优化。目前,该功能默认未启用,因为它会略微增加内存占用。设置以下环境变量以启用它: .. 
code-block:: bash $ export THP_MEM_ALLOC_ENABLE=1 -For the batch dimension of 256 and with MKLDNN fast math mode: +对于批量维度为 256 且启用 MKLDNN Fast Math 模式: .. code-block:: python X = torch.rand(256, 64, 64, device=device) with torch.set_grad_enabled(False): for _ in range(50): - model(X) #Warmup + model(X) #预热 with profile(activities=[ProfilerActivity.CPU]) as prof: with record_function("mymodel_inference"): for _ in range(100): @@ -340,14 +344,13 @@ For the batch dimension of 256 and with MKLDNN fast math mode: print(prof.key_averages().table(sort_by="self_cpu_time_total")) - -The following is the profiler output with THP memory allocations enabled: +启用THP内存分配后,profiler的输出如下: .. table:: :widths: auto ====================== ============ ============ ============ ============ ============== ============ - Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls + 名称 自身CPU% 自身CPU CPU总% CPU总 CPU平均时间 调用次数 ====================== ============ ============ ============ ============ ============== ============ aten::addmm 91.31% 6.115s 94.39% 6.321s 21.069ms 300 aten::clamp_min 4.82% 322.568ms 4.82% 322.568ms 1.613ms 200 @@ -357,12 +360,14 @@ The following is the profiler output with THP memory allocations enabled: aten::relu 0.04% 2.547ms 4.85% 325.115ms 1.626ms 200 ====================== ============ ============ ============ ============ ============== ============ -**Self CPU time total:** 6.697s - -This is an additional **1.08x or 8% (6.697s vs 7.262s)** improvement on top of the already optimized MKLDNN fast math mode measured above. +**自身CPU总时间:** 6.697s +这比上面测量的已优化的 MKLDNN Fast Math 模式又提高了 **1.08倍或8%(6.697s vs 7.262s)**。 -Conclusion +结论 ------------ -In this tutorial, we covered PyTorch inference on AWS Graviton3 instances by covering the basic usage, demonstrating speedups with fast math kernels, comparing different backends for different batch dimensions, and how to optimize tensor memory allocation latencies with Linux transparent huge pages. The recommendation is to use MKLDNN backend with Bfloat16 fastmath mode and THP memory allocations for larger tensor shapes and to use OpenBLAS backend for smaller tensor shapes. We hope that you will give it a try! +在本教程中,我们介绍了在AWS Graviton3实例上的PyTorch推理,包括基本用法、使用快速数学内核的加速、 +比较不同批量维度下不同后端的性能,以及如何使用Linux透明大页面优化张量内存分配延迟。 +对于较大的张量形状,建议使用MKLDNN后端和Bfloat16快速数学模式以及THP内存分配;对于较小的张量形状, +建议使用OpenBLAS后端。希望您能尝试一下! diff --git a/docs/_sources/recipes/torch_compile_user_defined_triton_kernel_tutorial.rst.txt b/docs/_sources/recipes/torch_compile_user_defined_triton_kernel_tutorial.rst.txt index d7f9256..42b91d4 100644 --- a/docs/_sources/recipes/torch_compile_user_defined_triton_kernel_tutorial.rst.txt +++ b/docs/_sources/recipes/torch_compile_user_defined_triton_kernel_tutorial.rst.txt @@ -18,36 +18,34 @@ .. _sphx_glr_recipes_torch_compile_user_defined_triton_kernel_tutorial.py: -Using User-Defined Triton Kernels with ``torch.compile`` +使用 ``torch.compile`` 和用户自定义的 Triton 内核 ========================================================= -**Author:** `Oguz Ulgen `_ +**作者:** `Oguz Ulgen `_ -.. GENERATED FROM PYTHON SOURCE LINES 10-32 +.. GENERATED FROM PYTHON SOURCE LINES 10-30 -User-defined Triton kernels can be used to optimize specific parts of your -model's computation. These kernels are written in Triton's language, which is designed -to make it easier to achieve peak hardware performance. 
By using user-defined Triton -kernels with ``torch.compile``, you can integrate these optimized computations into -your PyTorch model, potentially achieving significant performance improvements. +用户自定义的 Triton 内核可用于优化模型计算的特定部分。这些内核是用 Triton 语言编写的, +旨在更容易实现硬件的峰值性能。通过在 ``torch.compile`` 中使用用户自定义的 Triton 内核, +您可以将这些优化过的计算集成到 PyTorch 模型中,从而可能获得显著的性能提升。 -This recipes demonstrates how you can use user-defined Triton kernels with ``torch.compile``. +本教程演示了如何在 ``torch.compile`` 中使用用户自定义的 Triton 内核。 -Prerequisites +先决条件 ------------------- -Before starting this recipe, make sure that you have the following: +在开始本教程之前,请确保您具备以下条件: -* Basic understanding of ``torch.compile`` and Triton. See: +* 对 ``torch.compile`` 和 Triton 有基本的了解。参见: - * `torch.compiler API documentation `__ - * `Introduction to torch.compile `__ - * `Triton language documentation `__ + * `torch.compiler API 文档 `__ + * `torch.compile 介绍 `__ + * `Triton 语言文档 `__ -* PyTorch 2.3 or later -* A GPU that supports Triton +* PyTorch 2.3 或更高版本 +* 支持 Triton 的 GPU -.. GENERATED FROM PYTHON SOURCE LINES 32-36 +.. GENERATED FROM PYTHON SOURCE LINES 30-34 .. code-block:: default @@ -56,23 +54,22 @@ Before starting this recipe, make sure that you have the following: from torch.utils._triton import has_triton -.. GENERATED FROM PYTHON SOURCE LINES 37-44 +.. GENERATED FROM PYTHON SOURCE LINES 35-41 -Basic Usage +基本用法 -------------------- -In this example, we will use a simple vector addition kernel from the Triton documentation -with ``torch.compile``. -For reference, see `Triton documentation `__. +在此示例中,我们将使用来自 Triton 文档的一个简单向量加法内核与 ``torch.compile``。 +参考 `Triton 文档 `__。 -.. GENERATED FROM PYTHON SOURCE LINES 44-81 +.. GENERATED FROM PYTHON SOURCE LINES 41-78 .. code-block:: default if not has_triton(): - print("Skipping because triton is not supported on this device.") + print("由于此设备不支持 triton,因此跳过。") else: import triton from triton import language as tl @@ -105,33 +102,31 @@ For reference, see `Triton documentation `__ -* `Implementing High-Performance Transformers with Scaled Dot Product Attention `__ +* `编译优化器 `__ +* `使用缩放点积注意力实现高性能 Transformer `__ .. rst-class:: sphx-glr-timing diff --git a/docs/objects.inv b/docs/objects.inv index 853f51e..99ddfe7 100644 Binary files a/docs/objects.inv and b/docs/objects.inv differ diff --git a/docs/recipes/amx.html b/docs/recipes/amx.html index faece65..97dafba 100644 --- a/docs/recipes/amx.html +++ b/docs/recipes/amx.html @@ -9,7 +9,7 @@ - Leverage Intel® Advanced Matrix Extensions — PyTorch Tutorials 2.3.0 documentation + 利用英特尔®高级矩阵扩展(Intel® Advanced Matrix Extensions) — PyTorch Tutorials 2.3.0 documentation @@ -531,7 +531,7 @@ -
  • Leverage Intel® Advanced Matrix Extensions
  • +
  • 利用英特尔®高级矩阵扩展(Intel® Advanced Matrix Extensions)
  • @@ -590,32 +590,26 @@
    -
    -

    Leverage Intel® Advanced Matrix Extensions

    -
    -

    Introduction

    -

    Advanced Matrix Extensions (AMX), also known as Intel® Advanced Matrix Extensions (Intel® AMX), is an x86 extension, -which introduce two new components: a 2-dimensional register file called ‘tiles’ and an accelerator of Tile Matrix Multiplication (TMUL) that is able to operate on those tiles. -AMX is designed to work on matrices to accelerate deep-learning training and inference on the CPU and is ideal for workloads like natural-language processing, recommendation systems and image recognition.

    -

    Intel advances AI capabilities with 4th Gen Intel® Xeon® Scalable processors and Intel® AMX, delivering 3x to 10x higher inference and training performance versus the previous generation, see Accelerate AI Workloads with Intel® AMX. -Compared to 3rd Gen Intel Xeon Scalable processors running Intel® Advanced Vector Extensions 512 Neural Network Instructions (Intel® AVX-512 VNNI), -4th Gen Intel Xeon Scalable processors running Intel AMX can perform 2,048 INT8 operations per cycle, rather than 256 INT8 operations per cycle. They can also perform 1,024 BF16 operations per cycle, as compared to 64 FP32 operations per cycle, see page 4 of Accelerate AI Workloads with Intel® AMX. -For more detailed information of AMX, see Intel® AMX Overview.

    +
    +

    利用英特尔®高级矩阵扩展(Intel® Advanced Matrix Extensions)

    +
    +

    简介

    +

    高级矩阵扩展(AMX),也称为英特尔®高级矩阵扩展(Intel® AMX),是一种x86扩展,引入了两个新组件:一个称为”tile”的二维寄存器文件和一个能够在这些tile上进行矩阵乘法(TMUL)的加速器。AMX旨在加速CPU上的深度学习训练和推理工作负载,非常适合自然语言处理、推荐系统和图像识别等工作负载。

    +

    英特尔通过第4代英特尔®至强®可扩展处理器和英特尔®AMX推进了AI能力,相比上一代产品,推理和训练性能提高了3倍至10倍,详见`使用Intel® AMX加速AI工作负载`_。与运行Intel®高级矢量扩展512神经网络指令(Intel® AVX-512 VNNI)的第3代英特尔至强可扩展处理器相比,运行Intel AMX的第4代英特尔至强可扩展处理器每周期可执行2,048个INT8操作,而不是256个INT8操作;它们还可以每周期执行1,024个BF16操作,而不是64个FP32操作,详见`使用Intel® AMX加速AI工作负载`_第4页。有关AMX的更多详细信息,请参阅`Intel® AMX概述`_。

    -
    -

    AMX in PyTorch

    -

    PyTorch leverages AMX for computing intensive operators with BFloat16 and quantization with INT8 by its backend oneDNN -to get higher performance out-of-box on x86 CPUs with AMX support. -For more detailed information of oneDNN, see oneDNN.

    -

    The operation is fully handled by oneDNN according to the execution code path generated. For example, when a supported operation gets executed into oneDNN implementation on a hardware platform with AMX support, AMX instructions will be invoked automatically inside oneDNN. -Since oneDNN is the default acceleration library for PyTorch CPU, no manual operations are required to enable the AMX support.

    -
    -

    Guidelines of leveraging AMX with workloads

    -

    This section provides guidelines on how to leverage AMX with various workloads.

    +
    +

    PyTorch中的AMX

    +

    PyTorch通过其后端oneDNN利用AMX来计算BFloat16和INT8量化的计算密集型算子,从而在支持AMX的x86 CPU上获得更高的性能。 +有关oneDNN的更多详细信息,请参阅`oneDNN`_。

    +

    操作完全由oneDNN根据生成的执行代码路径处理。例如,当支持的操作在支持AMX的硬件平台上执行到oneDNN实现时,AMX指令将在oneDNN内部自动调用。 +由于oneDNN是PyTorch CPU的默认加速库,因此无需手动操作即可启用AMX支持。

    +
    +

    利用AMX加速工作负载的指南

    +

    本节提供了如何利用AMX加速各种工作负载的指南。

      -
    • BFloat16 data type:

      +
    • BFloat16数据类型:

        -
      • Using torch.cpu.amp or torch.autocast("cpu") would utilize AMX acceleration for supported operators.

      • +
      • 使用``torch.cpu.amp``或``torch.autocast(“cpu”)``将利用AMX加速支持的算子。

      model = model.to(memory_format=torch.channels_last)
      @@ -628,28 +622,28 @@ 

      Guidelines of leveraging AMX with workloads

      Note

      -

      Use torch.channels_last memory format to get better performance.

      +

      使用``torch.channels_last``内存格式可获得更好的性能。

        -
      • Quantization:

        +
      • 量化:

          -
        • Applying quantization would utilize AMX acceleration for supported operators.

        • +
        • 应用量化将利用AMX加速支持的算子。

      • torch.compile:

          -
        • When the generated graph model runs into oneDNN implementations with the supported operators, AMX accelerations will be activated.

        • +
        • 当生成的图模型运行到oneDNN实现的支持算子时,AMX加速将被激活。

      Note

      When using PyTorch on CPUs that support AMX, the framework enables AMX automatically by default. This means that PyTorch will attempt to leverage AMX whenever possible to speed up matrix multiplication operations. However, the decision to dispatch to an AMX kernel ultimately depends on the internal optimization strategy of the oneDNN library and of the quantization backend, which PyTorch relies on for performance enhancements. The specific details of how AMX utilization is handled internally by PyTorch and the oneDNN library may change as the framework is updated and improved.

      CPU operators that can leverage AMX:

      BF16 CPU ops that can leverage AMX:

      • conv1d
      • conv2d
      • linear
      • matmul

      Quantization CPU ops that can leverage AMX:

      Confirm AMX is being utilized

      Set the environment variable export ONEDNN_VERBOSE=1, or use torch.backends.mkldnn.verbose, to enable oneDNN to dump verbose messages:

      with torch.backends.mkldnn.verbose(torch.backends.mkldnn.VERBOSE_ON):
           with torch.cpu.amp.autocast():
               model(input)
       
      For example, the oneDNN verbose output looks like the following:

      onednn_verbose,info,oneDNN v2.7.3 (commit 6dbeffbae1f23cbbeae17adb7b5b13f1f37c080e)
       onednn_verbose,info,cpu,runtime:OpenMP,nthr:128
       onednn_verbose,info,cpu,isa:Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and Intel AMX with bfloat16 and 8-bit integer support
      If you see avx512_core_amx_bf16 in the verbose output for BFloat16, or avx512_core_amx_int8 for quantization with INT8, it indicates that AMX is activated.

      Conclusion

      In this tutorial, we briefly introduced AMX, how to utilize AMX in PyTorch to accelerate workloads, and how to confirm that AMX is being utilized.

      As PyTorch and oneDNN are improved and updated, the way AMX is utilized may change accordingly.

      As always, if you run into any problems or have any questions, you can get in touch through the forum (https://discuss.pytorch.org/) or GitHub issues (https://github.com/pytorch/pytorch/issues).

diff --git a/docs/recipes/compiling_optimizer.html b/docs/recipes/compiling_optimizer.html
index 6561962..a8fd32d 100644
--- a/docs/recipes/compiling_optimizer.html
+++ b/docs/recipes/compiling_optimizer.html
            (beta) Compiling the optimizer with torch.compile

            Author: Michael Lazos

            The optimizer is a key algorithm for training any deep learning model. Since it is responsible for updating every model parameter, it can often become the bottleneck in training performance for large models. In this recipe, we will apply torch.compile to the optimizer to observe the GPU performance improvement.

            Note

            This tutorial requires PyTorch 2.2.0 or later.

            Model Setup

            For this example, we'll use a simple sequence of linear layers. Since we are only benchmarking the optimizer, the choice of model doesn't matter, because optimizer performance is a function of the number of parameters.

            Depending on what machine you are using, your exact results may vary.

            import torch
             
             model = torch.nn.Sequential(
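            The model definition is truncated in this diff hunk. A minimal sketch consistent with the description above (a stack of linear layers on the GPU; the exact sizes and depth are assumptions) could be:

            import torch

            # Ten bias-free linear layers, sized so the optimizer has plenty of parameters to update.
            model = torch.nn.Sequential(
                *[torch.nn.Linear(1024, 1024, bias=False, device="cuda") for _ in range(10)]
            )
            input = torch.rand(1024, device="cuda")

            # Run forward and backward once so every parameter has a gradient
            # before we benchmark optimizer.step().
            output = model(input)
            output.sum().backward()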

            Setting up and running the optimizer benchmark

            In this example, we'll use the Adam optimizer and create a helper function to wrap step() in torch.compile().

            Note

            torch.compile is only supported on CUDA devices with compute capability >= 7.0.
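            The benchmark code itself is elided from this diff. A sketch of the wrapping described above, reusing the model and gradients from the setup sketch (the helper name and warm-up count are assumptions), is:

            opt = torch.optim.Adam(model.parameters(), lr=0.01)

            @torch.compile(fullgraph=False)
            def fn():
                opt.step()

            # Warm up the compiled step before timing it: the first calls include compilation overhead.
            for _ in range(5):
                fn()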

            See Also

            • For an in-depth technical overview, see Compiling the optimizer with PT2.
diff --git a/docs/recipes/inference_tuning_on_aws_graviton.html b/docs/recipes/inference_tuning_on_aws_graviton.html
index 0e92a08..f073a9d 100644
--- a/docs/recipes/inference_tuning_on_aws_graviton.html
+++ b/docs/recipes/inference_tuning_on_aws_graviton.html
              (Beta) PyTorch Inference Performance Tuning on AWS Graviton Processors

              Author: Sunita Nadampalli

              AWS Graviton is a series of ARM-based processors designed by AWS. AWS Graviton3 processors are optimized for Machine Learning (ML) workloads, including support for bfloat16, the Scalable Vector Extension (SVE), and twice the Single Instruction Multiple Data (SIMD) bandwidth compared to Graviton2.

              PyTorch provides native reference ATen kernels for machine learning operators like convolutions, matmul, relu, etc. These operators can be accelerated with platform-specific kernel implementations from Basic Linear Algebra (BLAS) libraries. On AWS Graviton CPUs, MKLDNN with the Arm Compute Library (ACL) and the OpenBLAS library provide optimized implementations for a subset of the operators. Both libraries are integrated into PyTorch as of the PyTorch 2.0 release.

              In this tutorial we will cover how to achieve the best inference performance for a linear-layer neural network on AWS Graviton3 CPUs (an AWS c7g instance) with bfloat16 kernels and with the right backend selection.

              Contents

              1. Basic Usage
              2. Speed up inference with Bfloat16 fast math kernels
              3. Improve inference performance with OpenBLAS for smaller batch dimensions
              4. Optimize memory allocation overhead with Linux Transparent huge pages
              5. Conclusion
              Note

              To successfully run this tutorial and reproduce the speedup numbers shown below, you need an instance from the Graviton3 family (c7g/r7g/m7g) of hardware. For this tutorial, we used the c7g.xl (4vcpu) instance.

              Basic Usage

              PyTorch natively supports AWS Graviton3 optimizations starting with the PyTorch 2.0 release. Please refer to this blog for more details on the optimizations.

              1. Install PyTorch by running the following command:

                 python3 -m pip install torch

              2. We will start by importing the required dependencies and defining the device we will run on:

              import torch
               import torch.nn as nn
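              The rest of this code block is elided in the diff. Since the later snippets use profile, record_function, and ProfilerActivity and run on the Graviton CPU, the missing lines presumably look roughly like this (a sketch, not the recipe's exact code):

              from torch.profiler import profile, record_function, ProfilerActivity

              # The benchmarks in this recipe run on the Graviton CPU.
              device = torch.device("cpu")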
            • Given that linear layers are at the heart of several neural networks, including transformers, we take a linear layer for this demo. We define our neural network by subclassing nn.Module and initializing the layers in __init__. We construct the network with typical large-language-model parameters to match the real-world scenario:

              class MyNeuralNetwork(nn.Module):
                 def __init__(self):
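              The class body is truncated here. A sketch of a comparable definition, assuming 64x64 inputs, three linear layers, and two ReLUs (which matches the 300 aten::addmm and 200 aten::clamp_min calls in the profiler tables below), is:

              class MyNeuralNetwork(nn.Module):
                  def __init__(self):
                      super().__init__()
                      self.flatten = nn.Flatten()
                      self.linear_relu_stack = nn.Sequential(
                          nn.Linear(64 * 64, 4096),   # layer sizes are assumptions, not the recipe's exact values
                          nn.ReLU(),
                          nn.Linear(4096, 11008),
                          nn.ReLU(),
                          nn.Linear(11008, 10),
                      )

                  def forward(self, x):
                      x = self.flatten(x)
                      return self.linear_relu_stack(x)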
            • Let's create an instance of MyNeuralNetwork, and move it to the device:

              model = MyNeuralNetwork().to(device)
               print(model)
               
              Next, let's get the prediction probabilities by passing them through an instance of the nn.Softmax module:
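              The corresponding snippet is not included in this hunk; a sketch of the step described above (variable names are assumptions) is:

              X = torch.rand(1, 64, 64, device=device)
              logits = model(X)
              pred_probab = nn.Softmax(dim=1)(logits)
              y_pred = pred_probab.argmax(1)
              print(f"Predicted class: {y_pred}")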

              output:

              Predicted class: tensor([2])
               
              Our network functionality is verified. Next, we will profile the performance. Let's check two different scenarios: small and large batch dimensions.

              Scenario 1: A larger batch dimension, for example 256:

              # warm it up first and loop over multiple times to have enough execution time
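              Only the warm-up comment of this code block survives in the diff. Following the pattern of the batch-32 snippet later in this recipe, the loop presumably looks like this (a sketch):

              X = torch.rand(256, 64, 64, device=device)
              with torch.set_grad_enabled(False):
                  for _ in range(50):
                      model(X)  # warm-up
                  with profile(activities=[ProfilerActivity.CPU]) as prof:
                      with record_function("mymodel_inference"):
                          for _ in range(100):
                              model(X)

              print(prof.key_averages().table(sort_by="self_cpu_time_total"))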
              The following is the profiler output with the default PyTorch configuration:


              Speed up Inference with bfloat16 Fast Math Kernels

              AWS Graviton3 processors support bfloat16 MMLA instructions. The Arm Compute Library (ACL) provides optimized bfloat16 General Matrix Multiplication (GEMM) kernels for AWS Graviton processors, and they are integrated into PyTorch via the MKLDNN backend starting with PyTorch 2.0. Inference performance can be optimized with these fast math GEMM kernels. The fast math mode is not enabled by default because the kernels perform the GEMM in bfloat16 precision instead of float, which results in a slight drop in model inference accuracy. However, the accuracy drop is within the cosine similarity threshold defined for the bfloat16 backend in the torchbench test suite, and is therefore acceptable for the majority of applications. To enable the fast math GEMM kernels, set the following environment variable:

              $ export DNNL_DEFAULT_FPMATH_MODE=BF16
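              If you prefer to set this from inside the Python script rather than the shell, setting the variable via os.environ before the model runs for the first time should have the same effect, since oneDNN reads it when primitives are created; treat this as an assumption and confirm with ONEDNN_VERBOSE if in doubt:

              import os

              # Assumption: must be set before the first model run in this process.
              os.environ["DNNL_DEFAULT_FPMATH_MODE"] = "BF16"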
               
              When you run the above inference script, you should see the following profiler output with the MKLDNN fast math mode enabled:

              Self CPU time total: 7.262s

              This is around a 2x (7.262s vs 16.201s) performance improvement with the bfloat16 fast math kernels. Next, let's look at the smaller batch dimension scenario.

              Scenario 2: A smaller batch dimension, for example, 32:

              X = torch.rand(32, 64, 64, device=device)
               with torch.set_grad_enabled(False):
                   for _ in range(50):
                      model(X)  # warm-up
                   with profile(activities=[ProfilerActivity.CPU]) as prof:
                       with record_function("mymodel_inference"):
                           for _ in range(100):
                          model(X)

              print(prof.key_averages().table(sort_by="self_cpu_time_total"))


              You should see the following profiler output when the above script is run with the PyTorch default configuration:

              Name                 Self CPU %    Self CPU     CPU total %    CPU total    CPU time avg    # of Calls
              aten::addmm          95.51%        5.821s       97.04%         5.914s       19.713ms        300
              aten::clamp_min      2.33%         142.244ms    2.33%          142.244ms    711.220us       200
              aten::copy           1.51%         92.322ms     1.51%          92.322ms     307.740us       300
              mymodel_inference    0.45%         27.713ms     100.00%        6.094s       6.094s          1
              aten::linear         0.04%         2.495ms      97.16%         5.921s       19.736ms        300
              aten::t              0.03%         2.131ms      0.07%          4.441ms      14.803us        300
              aten::relu           0.03%         1.942ms      2.37%          144.186ms    720.930us       200

              Self CPU time total: 6.094s

              The following is the profiler output when the script is run with the MKLDNN fast math mode enabled:

              $ export DNNL_DEFAULT_FPMATH_MODE=BF16
               
              Name                 Self CPU %    Self CPU     CPU total %    CPU total    CPU time avg    # of Calls
              aten::addmm          93.31%        3.848s       95.66%         3.944s       13.148ms        300
              aten::clamp_min      3.43%         141.309ms    3.43%          141.309ms    706.545us       200
              aten::copy           2.33%         95.916ms     2.33%          95.916ms     319.720us       300
              mymodel_inference    0.67%         27.431ms     100.00%        4.123s       4.123s          1
              aten::linear         0.06%         2.471ms      95.83%         3.951s       13.170ms        300
              aten::t              0.05%         2.027ms      0.10%          4.243ms      14.143us        300
              aten::relu           0.05%         1.928ms      3.47%          143.237ms    716.185us       200

              Self CPU time total: 4.123s

              The MKLDNN fast math mode yields approximately a 1.47x (4.123s vs 6.094s) performance improvement for smaller batch dimensions. Although this improvement is noteworthy, the overall performance still leaves room for improvement, because the runtime overhead (weight reorders and kernel launch time) from the oneDNN and ACL backend outweighs the compute benefit of the ACL GEMM kernels for the smaller batch compute.

              Improve Inference Performance with OpenBLAS for Smaller Batch Dimensions

              The inference performance for smaller batch dimensions can be improved by offloading the smaller shapes from MKLDNN to the OpenBLAS backend. We are working on making the backend selection automatic, with robust heuristics, for future releases. Until the heuristics are implemented, the smaller shapes can be offloaded to OpenBLAS by increasing the threshold for MKLDNN backend selection. In the following example, we use 64 as the threshold, so that an input with a batch dimension of 32 is not dispatched to MKLDNN. Instead, it is dispatched to OpenBLAS.

              $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64
               
              Here is the profiler output with the OpenBLAS backend:

              Name                 Self CPU %    Self CPU     CPU total %    CPU total    CPU time avg    # of Calls
              aten::addmm          96.25%        1.958s       97.51%         1.984s       6.612ms         300
              aten::clamp_min      1.28%         26.124ms     1.28%          26.124ms     130.620us       200
              aten::copy           1.23%         24.951ms     1.23%          24.951ms     83.170us        300
              mymodel_inference    0.86%         17.423ms     100.00%        2.034s       2.034s          1
              aten::linear         0.08%         1.691ms      97.74%         1.988s       6.628ms         300
              aten::t              0.07%         1.520ms      0.14%          2.945ms      9.817us         300
              aten::relu           0.06%         1.258ms      1.35%          27.382ms     136.910us       200

              Self CPU time total: 2.034s

              As you can see above, switching to OpenBLAS doubled the performance (2.034s vs 4.123s) compared to the default MKLDNN backend configuration. This becomes even more significant for smaller batch dimensions, for example, for a batch dimension of 10:

              X = torch.rand(10, 64, 64, device=device)
               with torch.set_grad_enabled(False):
                   for _ in range(50):
                      model(X)  # warm-up
                   with profile(activities=[ProfilerActivity.CPU]) as prof:
                       with record_function("mymodel_inference"):
                           for _ in range(100):
                          model(X)

              print(prof.key_averages().table(sort_by="self_cpu_time_total"))

              The following is the profiler output with MKLDNN fast math mode:

              Name                 Self CPU %    Self CPU     CPU total %    CPU total    CPU time avg    # of Calls
              aten::addmm          87.81%        3.613s       91.90%         3.781s       12.604ms        300
              aten::clamp_min      7.18%         295.437ms    7.18%          295.437ms    1.477ms         200
              aten::copy           4.07%         167.516ms    4.07%          167.516ms    558.387us       300
              mymodel_inference    0.67%         27.708ms     100.00%        4.115s       4.115s          1
              aten::linear         0.06%         2.499ms      92.06%         3.788s       12.627ms        300
              aten::t              0.05%         1.982ms      0.11%          4.385ms      14.617us        300
              aten::relu           0.05%         1.932ms      7.23%          297.369ms    1.487ms         200

              Self CPU time total: 4.115s

              And the following is the profiler output with the OpenBLAS backend:

              $ export TORCH_MKLDNN_MATMUL_MIN_DIM=64
               
              Name                 Self CPU %    Self CPU     CPU total %    CPU total    CPU time avg    # of Calls
              aten::addmm          92.66%        1.179s       95.23%         1.211s       4.038ms         300
              aten::clamp_min      2.83%         36.060ms     2.83%          36.060ms     180.300us       200
              aten::copy           2.52%         32.013ms     2.52%          32.013ms     106.710us       300
              mymodel_inference    1.38%         17.521ms     100.00%        1.272s       1.272s          1
              aten::linear         0.14%         1.750ms      95.60%         1.216s       4.054ms         300
              aten::t              0.12%         1.475ms      0.24%          3.033ms      10.110us        300
              aten::relu           0.10%         1.285ms      2.94%          37.345ms     186.725us       200

              Self CPU time total: 1.272s

              Here we observed a 3.2x (1.272s vs 4.115s) performance improvement by tuning the backend thresholds appropriately.

              Optimize Memory Allocation Overhead with Linux Transparent Huge Pages (THP)

              We also observed that, for these larger networks, tensor memory allocations take a significant portion of the inference latency. This can be optimized by enabling Linux transparent huge page allocations from the PyTorch C10 memory allocator. The feature is currently not enabled by default because it marginally increases the memory footprint. Set the following environment variable to enable it:

              $ export THP_MEM_ALLOC_ENABLE=1
               
              For the batch dimension of 256 and with MKLDNN fast math mode:

              X = torch.rand(256, 64, 64, device=device)
               with torch.set_grad_enabled(False):
                   for _ in range(50):
                      model(X)  # warm-up
                   with profile(activities=[ProfilerActivity.CPU]) as prof:
                       with record_function("mymodel_inference"):
                           for _ in range(100):
                          model(X)

              print(prof.key_averages().table(sort_by="self_cpu_time_total"))

              The following is the profiler output with THP memory allocations enabled:

              Name                 Self CPU %    Self CPU     CPU total %    CPU total    CPU time avg    # of Calls

              Self CPU time total: 6.697s

              This is an additional 1.08x, or 8% (6.697s vs 7.262s), improvement on top of the already optimized MKLDNN fast math mode measured above.

              Conclusion

              In this tutorial, we covered PyTorch inference on AWS Graviton3 instances: the basic usage, the speedups from the bfloat16 fast math kernels, a comparison of different backends for different batch dimensions, and how to optimize tensor memory allocation latency with Linux transparent huge pages. The recommendation is to use the MKLDNN backend with bfloat16 fast math mode and THP memory allocations for larger tensor shapes, and to use the OpenBLAS backend for smaller tensor shapes. We hope that you will give it a try!

diff --git a/docs/recipes/recipes/tuning_guide.html b/docs/recipes/recipes/tuning_guide.html
index 67e8000..880f402 100644
--- a/docs/recipes/recipes/tuning_guide.html
+++ b/docs/recipes/recipes/tuning_guide.html