Generate CUDA stubs dynamically (#2884)

This PR follows google/tsl@726288e and has the same motivation as that commit. Assembly-language stubs for `libcudart.so` are generated dynamically by [Implib.so](https://github.com/yugr/Implib.so) during CMake execution, instead of storing many versions of the stub files in the repository. Implib.so is vendored in the `source/3rdparty` directory (about 156 KB) and slightly modified so that it tolerates the case where no CUDA library is found. --------- Signed-off-by: Jinzhe Zeng <[email protected]>
deepmodeling · Oct 3, 2023 · f256dff · f256dff
1 parent f7b87c3
commit f256dff
Show file tree

Hide file tree

Showing 37 changed files with 1,757 additions and 17,543 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1,2 @@
+# do not show detailed diffs on GitHub
+source/3rdparty/* linguist-generated=true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,22 +27,26 @@ repos:
     hooks:
     - id: isort
       files: \.py$
+      exclude: ^source/3rdparty
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.0.291
     hooks:
     - id: ruff
       args: ["--fix"]
+      exclude: ^source/3rdparty
 -   repo: https://github.com/psf/black-pre-commit-mirror
     rev: 23.9.1
     hooks:
     -   id: black-jupyter
+        exclude: ^source/3rdparty
 # numpydoc
 -   repo: https://github.com/Carreau/velin
     rev: 0.0.12
     hooks:
     - id: velin
       args: ["--write"]
+      exclude: ^source/3rdparty
 # Python inside docs
 -   repo: https://github.com/asottile/blacken-docs
     rev: 1.16.0
@@ -102,6 +106,7 @@ repos:
         - --comment-style
         - "#"
         - --no-extra-eol
+        exclude: ^source/3rdparty
     # HTML
     -   id: insert-license
         files: \.(html|vue|xml)$

diff --git a/source/3rdparty/implib/arch/aarch64/config.ini b/source/3rdparty/implib/arch/aarch64/config.ini
@@ -0,0 +1,3 @@
+[Arch]
+PointerSize = 8
+SymbolReloc = R_AARCH64_ABS64
diff --git a/source/3rdparty/implib/arch/aarch64/table.S.tpl b/source/3rdparty/implib/arch/aarch64/table.S.tpl
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2018-2020 Yury Gribov
+ *
+ * The MIT License (MIT)
+ *
+ * Use of this source code is governed by MIT license that can be
+ * found in the LICENSE.txt file.
+ */
+
+#define lr x30
+#define ip0 x16
+
+  .data
+
+  .globl _${lib_suffix}_tramp_table
+  .hidden _${lib_suffix}_tramp_table
+  .align 8  // NOTE(review): GNU as reads this as a power-of-two exponent on AArch64 (256-byte alignment) - confirm that is intended
+_${lib_suffix}_tramp_table:
+  .zero $table_size  // zero-filled; a trampoline branches through its slot once the slot is non-zero
+
+  .text
+
+  .globl _${lib_suffix}_tramp_resolve  // called below; not defined in this template
+  .hidden _${lib_suffix}_tramp_resolve
+
+  .globl _${lib_suffix}_save_regs_and_resolve
+  .hidden _${lib_suffix}_save_regs_and_resolve
+  .type _${lib_suffix}_save_regs_and_resolve, %function
+_${lib_suffix}_save_regs_and_resolve:
+  .cfi_startproc
+
+  // Slow path which calls dlsym, taken only on first call.
+  // Registers are saved according to "Procedure Call Standard for the Arm® 64-bit Architecture".
+  // For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.
+
+  // Stack is aligned at 16 bytes; on entry [sp] holds the symbol number stored by the trampoline
+
+#define PUSH_PAIR(reg1, reg2) stp reg1, reg2, [sp, #-16]!; .cfi_adjust_cfa_offset 16; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 8
+#define POP_PAIR(reg1, reg2) ldp reg1, reg2, [sp], #16; .cfi_adjust_cfa_offset -16; .cfi_restore reg2; .cfi_restore reg1
+
+#define PUSH_WIDE_PAIR(reg1, reg2) stp reg1, reg2, [sp, #-32]!; .cfi_adjust_cfa_offset 32; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 16
+#define POP_WIDE_PAIR(reg1, reg2) ldp reg1, reg2, [sp], #32; .cfi_adjust_cfa_offset -32; .cfi_restore reg2; .cfi_restore reg1
+
+  // Save only the argument registers x0-x7, plus x8 and lr
+  PUSH_PAIR(x0, x1)
+  PUSH_PAIR(x2, x3)
+  PUSH_PAIR(x4, x5)
+  PUSH_PAIR(x6, x7)
+  PUSH_PAIR(x8, lr)
+
+  ldr x0, [sp, #80]  // 16*5: skip the five pairs pushed above; x0 = symbol number left on the stack by the trampoline
+
+  PUSH_WIDE_PAIR(q0, q1)  // q0-q7 are saved as well, 32 bytes per pair
+  PUSH_WIDE_PAIR(q2, q3)
+  PUSH_WIDE_PAIR(q4, q5)
+  PUSH_WIDE_PAIR(q6, q7)
+
+  // Stack is aligned at 16 bytes
+
+  bl _${lib_suffix}_tramp_resolve  // overwrites lr; the saved copy is reloaded by POP_PAIR(x8, lr) below
+
+  // TODO: pop pc?
+
+  POP_WIDE_PAIR(q6, q7)  // restore everything in the reverse order of the pushes
+  POP_WIDE_PAIR(q4, q5)
+  POP_WIDE_PAIR(q2, q3)
+  POP_WIDE_PAIR(q0, q1)
+
+  POP_PAIR(x8, lr)
+  POP_PAIR(x6, x7)
+  POP_PAIR(x4, x5)
+  POP_PAIR(x2, x3)
+  POP_PAIR(x0, x1)
+
+  br lr  // branch back through the restored lr; x0-x8 and q0-q7 hold their entry values again
+
+  .cfi_endproc
diff --git a/source/3rdparty/implib/arch/aarch64/trampoline.S.tpl b/source/3rdparty/implib/arch/aarch64/trampoline.S.tpl
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2018-2023 Yury Gribov
+ *
+ * The MIT License (MIT)
+ *
+ * Use of this source code is governed by MIT license that can be
+ * found in the LICENSE.txt file.
+ */
+
+  .globl $sym
+  .p2align 4
+  .type $sym, %function
+#ifndef IMPLIB_EXPORT_SHIMS
+  .hidden $sym  // hidden unless IMPLIB_EXPORT_SHIMS is defined
+#endif
+$sym:
+  .cfi_startproc
+
+1:  // re-entered from the end of the slow path
+  // Load address: ip0 = 64-bit value stored in the table slot of this symbol
+  // TODO: can we do this faster on newer ARMs?
+  adrp ip0, _${lib_suffix}_tramp_table+$offset
+  ldr ip0, [ip0, #:lo12:_${lib_suffix}_tramp_table+$offset]
+
+  cbz ip0, 2f  // slot still zero: take the slow path
+
+  // Fast path
+  br ip0  // plain branch, lr is not modified here
+
+2:
+  // Slow path
+  mov ip0, $number & 0xffff  // low 16 bits of the symbol number
+#if $number > 0xffff
+  movk ip0, $number >> 16, lsl #16  // bits 16 and up, only emitted when they are non-zero
+#endif
+  stp ip0, lr, [sp, #-16]!; .cfi_adjust_cfa_offset 16; .cfi_rel_offset ip0, 0; .cfi_rel_offset lr, 8;  // [sp] = symbol number, [sp, #8] = lr
+  bl _${lib_suffix}_save_regs_and_resolve
+  ldp ip0, lr, [sp], #16; .cfi_adjust_cfa_offset -16; .cfi_restore lr; .cfi_restore ip0  // drop the pushed pair, restoring lr
+  b 1b  // reload the slot; NOTE(review): presumably filled in by the resolver - confirm
+  .cfi_endproc
diff --git a/source/3rdparty/implib/arch/arm/config.ini b/source/3rdparty/implib/arch/arm/config.ini
@@ -0,0 +1,3 @@
+[Arch]
+PointerSize = 4
+SymbolReloc = R_ARM_ABS32
diff --git a/source/3rdparty/implib/arch/arm/table.S.tpl b/source/3rdparty/implib/arch/arm/table.S.tpl
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2018-2022 Yury Gribov
+ *
+ * The MIT License (MIT)
+ *
+ * Use of this source code is governed by MIT license that can be
+ * found in the LICENSE.txt file.
+ */
+
+  .data
+
+  .globl _${lib_suffix}_tramp_table
+  .hidden _${lib_suffix}_tramp_table
+  .align 4
+_${lib_suffix}_tramp_table:
+  .zero $table_size  // zero-filled; a trampoline branches through its slot once the slot is non-zero
+
+  .text
+
+  .globl _${lib_suffix}_tramp_resolve  // called below; not defined in this template
+  .hidden _${lib_suffix}_tramp_resolve
+
+  .globl _${lib_suffix}_save_regs_and_resolve
+  .hidden _${lib_suffix}_save_regs_and_resolve
+  .type _${lib_suffix}_save_regs_and_resolve, %function
+_${lib_suffix}_save_regs_and_resolve:
+  .cfi_startproc
+
+#define PUSH_REG(reg) push {reg}; .cfi_adjust_cfa_offset 4; .cfi_rel_offset reg, 0
+#define POP_REG(reg) pop {reg} ; .cfi_adjust_cfa_offset -4; .cfi_restore reg
+
+// Binutils 2.30 does not like q0 in .cfi_rel_offset
+#define PUSH_DREG_PAIR(reg1, reg2) vpush {reg1, reg2}; .cfi_adjust_cfa_offset 16; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 8
+#define POP_DREG_PAIR(reg1, reg2) vpop {reg1, reg2}; .cfi_adjust_cfa_offset -16; .cfi_restore reg1; .cfi_restore reg2
+
+  // Slow path which calls dlsym, taken only on first call.
+  // Registers are saved acc. to "Procedure Call Standard for the ARM Architecture".
+  // For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.
+
+  // Stack is aligned at 16 bytes at this point
+
+  // Save only arguments (and lr)
+  PUSH_REG(r0)
+  ldr r0, [sp, #8]  // skip the saved r0 and the lr pushed by the trampoline; r0 = symbol number the trampoline pushed first
+  PUSH_REG(r1)
+  PUSH_REG(r2)
+  PUSH_REG(r3)
+  PUSH_REG(lr)
+  PUSH_REG(lr)  // Align to 8 bytes
+
+  // Arguments can be passed in VFP registers only when hard-float ABI is used
+  // for arm-gnueabihf target (http://android-doc.github.io/ndk/guides/abis.html#v7a).
+  // Use compiler macro to detect this case.
+#ifdef __ARM_PCS_VFP
+  PUSH_DREG_PAIR(d0, d1)
+  PUSH_DREG_PAIR(d2, d3)
+  PUSH_DREG_PAIR(d4, d5)
+  PUSH_DREG_PAIR(d6, d7)
+  PUSH_DREG_PAIR(d8, d9)
+  PUSH_DREG_PAIR(d10, d11)
+  PUSH_DREG_PAIR(d12, d13)
+  PUSH_DREG_PAIR(d14, d15)
+  // FIXME: NEON actually supports 32 D-registers but it's unclear how to detect this
+#endif
+
+  bl _${lib_suffix}_tramp_resolve(PLT)  // r0 = symbol number loaded above; overwrites lr
+
+#ifdef __ARM_PCS_VFP
+  POP_DREG_PAIR(d14, d15)
+  POP_DREG_PAIR(d12, d13)
+  POP_DREG_PAIR(d10, d11)
+  POP_DREG_PAIR(d8, d9)
+  POP_DREG_PAIR(d6, d7)
+  POP_DREG_PAIR(d4, d5)
+  POP_DREG_PAIR(d2, d3)
+  POP_DREG_PAIR(d0, d1)
+#endif
+
+  POP_REG(lr)  // TODO: pop pc?
+  POP_REG(lr)  // second copy; both pushes above stored the same lr value
+  POP_REG(r3)
+  POP_REG(r2)
+  POP_REG(r1)
+  POP_REG(r0)
+
+  bx lr  // branch back through the restored lr
+
+  .cfi_endproc
diff --git a/source/3rdparty/implib/arch/arm/trampoline.S.tpl b/source/3rdparty/implib/arch/arm/trampoline.S.tpl
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2018-2023 Yury Gribov
+ *
+ * The MIT License (MIT)
+ *
+ * Use of this source code is governed by MIT license that can be
+ * found in the LICENSE.txt file.
+ */
+
+  .globl $sym
+  .p2align 4
+  .type $sym, %function
+#ifndef IMPLIB_EXPORT_SHIMS
+  .hidden $sym  // hidden unless IMPLIB_EXPORT_SHIMS is defined
+#endif
+$sym:
+  .cfi_startproc
+
+1:
+  // Load address
+  // TODO: can we do this faster on newer ARMs?
+  ldr ip, 3f  // ip = PC-relative offset of the table slot (literal at 3: below)
+2:
+  add ip, pc, ip  // assumes ARM state, where pc reads as 2b + 8 (the bias used in the literal at 3:)
+  ldr ip, [ip]  // ip = current contents of the slot
+
+  cmp ip, #0
+
+  // Fast path
+  bxne ip  // taken only when the slot is non-zero
+
+  // Slow path
+  ldr ip, =$number  // literal-pool load; the pool is emitted at .ltorg below
+  push {ip}  // symbol number, read back from the stack by _save_regs_and_resolve
+  .cfi_adjust_cfa_offset 4
+  PUSH_REG(lr)  // NOTE(review): PUSH_REG/POP_REG are not defined in this template - presumably provided by table.S.tpl in the same generated file; confirm
+  bl _${lib_suffix}_save_regs_and_resolve
+  POP_REG(lr)
+  add sp, #4  // discard the pushed symbol number
+  .cfi_adjust_cfa_offset -4
+  b 1b  // reload the slot; NOTE(review): presumably filled in by the resolver - confirm
+
+  // Force constant pool for ldr above
+  .ltorg
+
+  .cfi_endproc
+
+3:
+  .word _${lib_suffix}_tramp_table - (2b + 8) + $offset  // slot address relative to the pc value seen at 2:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# do not show detailed diffs on GitHub
		source/3rdparty/* linguist-generated=true