Merge pull request #12 from openai/main
pr
1proprogrammerchant authored Sep 20, 2023
2 parents fd22f45 + ed5a530 commit 879a916
Showing 177 changed files with 7,959 additions and 5,542 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/documentation.yml
@@ -24,6 +24,8 @@ jobs:
run: |
pip3 install tabulate
pip3 install cmake
pip3 install sphinx
pip3 install myst_parser
#- name: Fetch dependent branches
# run: |
@@ -33,7 +35,7 @@ jobs:
run: |
cd docs
export PATH=$(python3 -c "import cmake; print(cmake.CMAKE_BIN_DIR)"):$PATH
python3 -m sphinx_multiversion . _build/html/
python3 -m sphinx . _build/html/main
- name: Update docs
run: |
28 changes: 24 additions & 4 deletions .github/workflows/integration-tests.yml
@@ -27,12 +27,13 @@ jobs:
run: |
if [ x"${{ github.repository }}" == x"openai/triton" ]; then
echo '::set-output name=matrix-required::[["self-hosted", "A100"], ["self-hosted", "H100"]]'
echo '::set-output name=matrix-optional::[]'
echo '::set-output name=matrix-optional::[["self-hosted", "gfx908"], ["self-hosted", "arc770"]]'
else
echo '::set-output name=matrix-required::["ubuntu-latest"]'
echo '::set-output name=matrix-optional::["ubuntu-latest"]'
fi
Integration-Tests-Nvidia:
needs: Runner-Preparation

@@ -44,8 +45,9 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@v2

uses: actions/checkout@v3
with:
submodules: 'true'
- name: Set CUDA ENV
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
run: |
@@ -62,12 +64,18 @@ jobs:
run: |
echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
- name: Check pre-commit
run: |
python3 -m pip install --upgrade pre-commit
python3 -m pre_commit run --all-files --verbose
- name: Install Triton
if: ${{ env.BACKEND == 'CUDA'}}
run: |
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install ninja
python3 -m pip install --no-build-isolation -vvv '.[tests]'
python3 -m pip install pytest-xdist
@@ -97,6 +105,8 @@ jobs:
python3 -m pytest runtime/
# run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest language/test_line_info.py
# run hopper/test_flashattention.py separately to avoid running out of GPU memory
python3 -m pytest hopper/test_flashattention.py
- name: Run python tests on CUDA with ENABLE_TMA=0 and ENABLE_MMA_V3=0
if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0' && env.ENABLE_MMA_V3 == '0'}}
@@ -112,6 +122,14 @@ jobs:
run: |
rm -rf ~/.triton
- name: Run interpreter tests
env:
# TRITON_INTERPRET: "1"
CUDA_VISIBLE_DEVICES: ""
run: |
cd python/test/unit
python3 -m pytest -vs operators/test_flash_attention.py
- name: Run partial tests on CUDA with ENABLE_TMA=1 and ENABLE_MMA_V3=1
if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1' && env.ENABLE_MMA_V3 == '1'}}
run: |
@@ -204,10 +222,12 @@ jobs:
- name: Install Triton on ROCM
if: ${{ env.BACKEND == 'ROCM'}}
run: |
git submodule update --init --recursive
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install torch==1.13.1 --index-url https://download.pytorch.org/whl/rocm5.2
export TRITON_CODEGEN_AMD_HIP_BACKEND=1
python3 -m pip install --no-build-isolation -vvv '.[tests]'
- name: Install Triton on XPU
@@ -229,7 +249,7 @@ jobs:
if: ${{ env.BACKEND == 'ROCM'}}
run: |
cd python/test/unit/language
python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py::test_empty_kernel"
python3 -m pytest --capture=tee-sys -rfs --verbose "test_core.py"
- name: Run python tests on XPU
if: ${{ env.BACKEND == 'XPU'}}
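
A note on the `Runner-Preparation` step above: it writes its job outputs with the `::set-output` workflow command, which GitHub Actions has deprecated. A minimal sketch of the current equivalent, assuming the same output names (a hypothetical replacement, not part of this commit):

```
# Deprecated form, as used in the workflow above:
#   echo '::set-output name=matrix-required::["ubuntu-latest"]'
# Current form: append a name=value pair to the file that $GITHUB_OUTPUT points at.
echo 'matrix-required=["ubuntu-latest"]' >> "$GITHUB_OUTPUT"
```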
10 changes: 10 additions & 0 deletions .gitignore
@@ -26,4 +26,14 @@ venv.bak/
cmake-build-*

# Third-party binaries
cuobjdump
nvdisasm
ptxas

# Docs
docs/_build/
docs/python-api/generated/
docs/dialects/
docs/getting-started/tutorials
!python/tutorials/*.py
!python/tutorials/*.rst
4 changes: 4 additions & 0 deletions .gitmodules
@@ -1,3 +1,7 @@
[submodule "third_party/intel_xpu_backend"]
path = third_party/intel_xpu_backend
url = http://github.com/intel/intel-xpu-backend-for-triton
[submodule "third_party/amd_hip_backend"]
path = third_party/amd_hip_backend
url = https://github.com/ROCmSoftwarePlatform/triton
branch = third_party_backend_2
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -44,7 +44,7 @@ repos:
^docs/conf.py$
)
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v14.0.6
rev: v16.0.6
hooks:
- id: clang-format
stages: [commit, push, manual]
1 change: 0 additions & 1 deletion CMakeLists.txt
@@ -212,7 +212,6 @@ if(TRITON_BUILD_PYTHON_MODULE)
TritonNvidiaGPUTransforms
TritonLLVMIR
TritonPTX
TritonHSACO
${dialect_libs}
${conversion_libs}

87 changes: 80 additions & 7 deletions README.md
@@ -4,11 +4,36 @@

[![Wheels](https://github.com/openai/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/openai/triton/actions/workflows/wheels.yml)

We're hiring! If you are interested in working on Triton at OpenAI, we have roles open for [Compiler Engineers](https://openai.com/careers/software-engineer-triton-compiler) and [Kernel Engineers](https://openai.com/careers/kernel-engineer).

**`Documentation`** |
------------------- |
[![Documentation](https://github.com/openai/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/)

# Triton Developer Conference Registration Now Closed
The Triton Developer Conference will be held in hybrid mode at the Microsoft Silicon Valley Campus in Mountain View, California, on September 20th from 10am to 4pm, followed by a reception until 5:30pm.

Tentative Agenda for the conference (subject to change):

|Time |Title |Speaker|
|--------|-------|-------|
|10:00 AM|Welcome|Kevin Scott (Microsoft)|
|10:20 AM|The Triton Compiler: Past, Present and Future|Phil Tillet (OpenAI)|
|11:00 AM|**Break**||
|11:20 AM|Hopper support in Triton|Gustav Zhu (Nvidia)|
|11:40 AM|Bringing Triton to AMD GPUs|Jason Furmanek, Lixun Zhang (AMD)|
|12:00 PM|Intel XPU Backend for Triton|Eikan Wang (Intel)|
|12:20 PM|Vectorization of Triton Kernels for Qualcomm Hexagon Backend|Javed Absar (Qualcomm)|
|12:30 PM|**Lunch**||
|1:40 PM |Triton for MTIA|Roman Levenstein et al. (Meta)|
|2:00 PM |Using Triton IR for high-performance fusions in XLA|George Karpenkov (Google)|
|2:20 PM |Triton for All: Triton as a device-independent language|Ian Bearman (Microsoft)|
|2:40 PM|**Break**||
|3:00 PM|PyTorch 2.0 and TorchInductor|Jason Ansel, Horace He (Meta)|
|3:20 PM|Pallas: A JAX Kernel Language|Sharad Vikram (Google)|
|3:40 PM|Writing Grouped GEMMs in Triton|Vinod Grover (Nvidia)|
|4:00 PM|**Reception**||


# Triton

@@ -37,12 +62,64 @@ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/

```
git clone https://github.com/openai/triton.git;
cd triton/python;
pip install cmake; # build-time dependency
pip install -e .
cd triton;
pip install ninja cmake; # build-time dependencies
pip install -e python
```

Or with a virtualenv:

```
git clone https://github.com/openai/triton.git;
cd triton;
python -m venv .venv --prompt triton;
source .venv/bin/activate;
pip install ninja cmake; # build-time dependencies
pip install -e python
```

# Building with a custom LLVM

Triton uses LLVM to generate code for GPUs and CPUs. Normally, the Triton build
downloads a prebuilt LLVM, but you can also build LLVM from source and use that.

LLVM does not have a stable API, so the Triton build will not work at an
arbitrary LLVM version.

1. Find the version of LLVM that Triton builds against. Check `python/setup.py`
for a line like

version = "llvm-17.0.0-c5dede880d17"

This means that the version of Triton you have builds against
[LLVM](https://github.com/llvm/llvm-project) c5dede880d17.

2. `git checkout` LLVM at this revision. Optionally, make additional
modifications to LLVM.
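
   A minimal sketch of that checkout, assuming your clone's `origin` points at
   llvm/llvm-project and using the revision from step 1:

       $ cd $HOME/llvm-project   # your clone of LLVM
       $ git fetch origin
       $ git checkout c5dede880d17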

3. [Build LLVM](https://llvm.org/docs/CMake.html). For example, you might run

$ cd $HOME/llvm-project # your clone of LLVM.
$ mkdir build
$ cd build
$ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir"
$ ninja

4. Grab a snack; this will take a while.

5. Build Triton as above, but set the following environment variables.

# Modify as appropriate to point to your LLVM build.
$ export LLVM_BUILD_DIR=$HOME/llvm-project/build

$ cd <triton install>
$ LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include \
LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib \
LLVM_SYSPATH=$LLVM_BUILD_DIR \
pip install -e python

# Changelog

@@ -56,10 +133,6 @@ Version 2.0 is out! New features include:

Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/openai/triton/). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md).

If you’re interested in joining our team and working on Triton & GPU kernels, [we’re hiring](https://openai.com/jobs/#acceleration)!




# Compatibility

18 changes: 17 additions & 1 deletion bin/CMakeLists.txt
@@ -53,7 +53,6 @@ llvm_update_compile_flags(triton-translate)
TritonNvidiaGPUTransforms
TritonLLVMIR
TritonPTX
TritonHSACO
${dialect_libs}
${conversion_libs}
# tests
@@ -80,3 +79,20 @@ llvm_update_compile_flags(triton-translate)
MLIRROCDLToLLVMIRTranslation
)
mlir_check_all_link_libraries(triton-translate)

add_llvm_executable(triton-llvm-opt
triton-llvm-opt.cpp

DEPENDS
intrinsics_gen
SUPPORT_PLUGINS
)
target_link_libraries(triton-llvm-opt PRIVATE
TritonLLVMIR

LLVMCore
LLVMSupport
LLVMOption
LLVMCodeGen
)
export_executable_symbols_for_plugins(triton-llvm-opt)
4 changes: 3 additions & 1 deletion bin/RegisterTritonDialects.h
@@ -1,4 +1,5 @@
#pragma once
#include "triton/Dialect/NVGPU/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
@@ -40,5 +41,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
mlir::triton::gpu::TritonGPUDialect, mlir::math::MathDialect,
mlir::arith::ArithDialect, mlir::scf::SCFDialect,
mlir::gpu::GPUDialect>();
mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect,
mlir::triton::nvgpu::NVGPUDialect>();
}