Skip to content

Commit

Permalink
Generate CUDA stubs dynamically (#2884)
Browse files Browse the repository at this point in the history
This PR follows
google/tsl@726288e
and has the same motivation as that commit. `libcudart.so` stubs using
assembly language are generated dynamically by
[implib.so](https://github.com/yugr/Implib.so) during CMake execution,
instead of storing many versions of stub files in the repository.

Implib.so is vendored in the `source/3rdparty` directory (about 156 KB) and
slightly modified (to tolerate the case where no CUDA library is found).

---------

Signed-off-by: Jinzhe Zeng <[email protected]>
  • Loading branch information
njzjz authored Oct 3, 2023
1 parent f7b87c3 commit f256dff
Show file tree
Hide file tree
Showing 37 changed files with 1,757 additions and 17,543 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# do not show detailed differences on GitHub
source/3rdparty/* linguist-generated=true
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,22 +27,26 @@ repos:
hooks:
- id: isort
files: \.py$
exclude: ^source/3rdparty
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.0.291
hooks:
- id: ruff
args: ["--fix"]
exclude: ^source/3rdparty
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.9.1
hooks:
- id: black-jupyter
exclude: ^source/3rdparty
# numpydoc
- repo: https://github.com/Carreau/velin
rev: 0.0.12
hooks:
- id: velin
args: ["--write"]
exclude: ^source/3rdparty
# Python inside docs
- repo: https://github.com/asottile/blacken-docs
rev: 1.16.0
Expand Down Expand Up @@ -102,6 +106,7 @@ repos:
- --comment-style
- "#"
- --no-extra-eol
exclude: ^source/3rdparty
# HTML
- id: insert-license
files: \.(html|vue|xml)$
Expand Down
3 changes: 3 additions & 0 deletions source/3rdparty/implib/arch/aarch64/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[Arch]
PointerSize = 8
SymbolReloc = R_AARCH64_ABS64
77 changes: 77 additions & 0 deletions source/3rdparty/implib/arch/aarch64/table.S.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright 2018-2020 Yury Gribov
*
* The MIT License (MIT)
*
* Use of this source code is governed by MIT license that can be
* found in the LICENSE.txt file.
*/

// Preprocessor aliases for the registers used by the code in this file:
// lr is x30 and ip0 is x16.
#define lr x30
#define ip0 x16

.data

// Trampoline table: a zero-filled, module-private (hidden) data area whose
// size is given by the table_size placeholder. The AArch64 trampolines load
// one entry from it and treat a zero entry as "take the slow path".
.globl _${lib_suffix}_tramp_table
.hidden _${lib_suffix}_tramp_table
// NOTE(review): GNU as on AArch64 appears to treat the .align operand as a
// power of two, which would make this a 256-byte alignment rather than
// 8 bytes - confirm this is intended.
.align 8
_${lib_suffix}_tramp_table:
.zero $table_size

.text

// Resolver routine; it is only declared here and is called below with the
// symbol number in x0.
.globl _${lib_suffix}_tramp_resolve
.hidden _${lib_suffix}_tramp_resolve

// Slow-path helper shared by the AArch64 trampolines.
//
// Entry: reached via bl from a trampoline that has just pushed the pair
//        (ip0 = symbol number, lr) with a 16-byte pre-decrement of sp.
// Does:  saves x0-x7, x8, lr and q0-q7, calls the resolver with the symbol
//        number as its first argument, then restores every saved register.
// Exit:  returns to the instruction after the bl in the trampoline with all
//        saved registers holding their entry values.
.globl _${lib_suffix}_save_regs_and_resolve
.hidden _${lib_suffix}_save_regs_and_resolve
.type _${lib_suffix}_save_regs_and_resolve, %function
_${lib_suffix}_save_regs_and_resolve:
  .cfi_startproc

  // Slow path which calls dlsym, taken only on first call.
  // Registers are saved according to "Procedure Call Standard for the Arm® 64-bit Architecture".
  // For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.

  // Stack is aligned at 16 bytes

// Push/pop two 64-bit registers (16 bytes) and keep the CFI in sync.
#define PUSH_PAIR(reg1, reg2) stp reg1, reg2, [sp, #-16]!; .cfi_adjust_cfa_offset 16; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 8
#define POP_PAIR(reg1, reg2) ldp reg1, reg2, [sp], #16; .cfi_adjust_cfa_offset -16; .cfi_restore reg2; .cfi_restore reg1

// Push/pop two 128-bit registers (32 bytes) and keep the CFI in sync.
#define PUSH_WIDE_PAIR(reg1, reg2) stp reg1, reg2, [sp, #-32]!; .cfi_adjust_cfa_offset 32; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 16
#define POP_WIDE_PAIR(reg1, reg2) ldp reg1, reg2, [sp], #32; .cfi_adjust_cfa_offset -32; .cfi_restore reg2; .cfi_restore reg1

  // Save only arguments (and lr)
  PUSH_PAIR(x0, x1)
  PUSH_PAIR(x2, x3)
  PUSH_PAIR(x4, x5)
  PUSH_PAIR(x6, x7)
  PUSH_PAIR(x8, lr)

  // Five 16-byte pairs were pushed above, so sp+80 is the slot in which the
  // trampoline stored ip0 (the symbol number). Pass it as the first argument.
  ldr x0, [sp, #80]  // 16*5

  PUSH_WIDE_PAIR(q0, q1)
  PUSH_WIDE_PAIR(q2, q3)
  PUSH_WIDE_PAIR(q4, q5)
  PUSH_WIDE_PAIR(q6, q7)

  // Stack is aligned at 16 bytes (four more 32-byte pushes)

  bl _${lib_suffix}_tramp_resolve

  // Restore in exact reverse order of the pushes above.
  POP_WIDE_PAIR(q6, q7)
  POP_WIDE_PAIR(q4, q5)
  POP_WIDE_PAIR(q2, q3)
  POP_WIDE_PAIR(q0, q1)

  POP_PAIR(x8, lr)
  POP_PAIR(x6, x7)
  POP_PAIR(x4, x5)
  POP_PAIR(x2, x3)
  POP_PAIR(x0, x1)

  // Return with ret instead of br lr. Both branch to the address in x30, but
  // ret carries the subroutine-return hint that matches the bl used to enter
  // this function, whereas br is hinted as "not a return".
  ret

  .cfi_endproc
40 changes: 40 additions & 0 deletions source/3rdparty/implib/arch/aarch64/trampoline.S.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright 2018-2023 Yury Gribov
*
* The MIT License (MIT)
*
* Use of this source code is governed by MIT license that can be
* found in the LICENSE.txt file.
*/

// Trampoline for one imported symbol (AArch64).
//
// Fast path: load this symbol's slot from the trampoline table and branch to
// the loaded address when it is non-zero.
// Slow path: push the symbol number together with lr, call the
// save-and-resolve helper, pop the pair again and retry the lookup.
// ip0 is the only scratch register used here.
// NOTE(review): ip0 and lr are not defined in this template; presumably it is
// emitted after table.S.tpl, which defines them as x16 and x30 - confirm.
.globl $sym
.p2align 4
.type $sym, %function
// The symbol is hidden unless IMPLIB_EXPORT_SHIMS is defined.
#ifndef IMPLIB_EXPORT_SHIMS
.hidden $sym
#endif
$sym:
.cfi_startproc

1:
// Load address
// TODO: can we do this faster on newer ARMs?
// adrp yields the 4 KiB page of the table slot; the ldr adds the low 12 bits
// of the slot address and loads the slot contents into ip0.
adrp ip0, _${lib_suffix}_tramp_table+$offset
ldr ip0, [ip0, #:lo12:_${lib_suffix}_tramp_table+$offset]

// A zero slot sends us to the slow path at label 2.
cbz ip0, 2f

// Fast path
br ip0

2:
// Slow path
// Build the symbol number in ip0: the low 16 bits with mov, and the next
// 16 bits with movk only when the number does not fit in 16 bits.
mov ip0, $number & 0xffff
#if $number > 0xffff
movk ip0, $number >> 16, lsl #16
#endif
// Push (symbol number, lr) as one 16-byte pair; the helper reads the number
// back from this stack slot.
stp ip0, lr, [sp, #-16]!; .cfi_adjust_cfa_offset 16; .cfi_rel_offset ip0, 0; .cfi_rel_offset lr, 8;
bl _${lib_suffix}_save_regs_and_resolve
// Drop the pair again, restoring the caller's lr.
ldp ip0, lr, [sp], #16; .cfi_adjust_cfa_offset -16; .cfi_restore lr; .cfi_restore ip0
// Retry the lookup; presumably the resolver has filled the table slot by now.
b 1b
.cfi_endproc
3 changes: 3 additions & 0 deletions source/3rdparty/implib/arch/arm/config.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[Arch]
PointerSize = 4
SymbolReloc = R_ARM_ABS32
88 changes: 88 additions & 0 deletions source/3rdparty/implib/arch/arm/table.S.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Copyright 2018-2022 Yury Gribov
*
* The MIT License (MIT)
*
* Use of this source code is governed by MIT license that can be
* found in the LICENSE.txt file.
*/

.data

// Trampoline table: a zero-filled, module-private (hidden) data area whose
// size is given by the table_size placeholder. The ARM trampolines load one
// word from it and treat a zero word as "take the slow path".
.globl _${lib_suffix}_tramp_table
.hidden _${lib_suffix}_tramp_table
.align 4
_${lib_suffix}_tramp_table:
.zero $table_size

.text

// Resolver routine; it is only declared here and is called below (through
// the PLT) with the symbol number in r0.
.globl _${lib_suffix}_tramp_resolve
.hidden _${lib_suffix}_tramp_resolve

// Slow-path helper shared by the ARM trampolines.
//
// Entry: reached via bl from a trampoline that has pushed the symbol number
//        (from ip) and then lr.
// Does:  saves r0-r3 and lr (plus d0-d15 under the hard-float ABI), calls
//        the resolver with the symbol number in r0, then restores them.
// Exit:  returns to the trampoline with bx lr.
.globl _${lib_suffix}_save_regs_and_resolve
.hidden _${lib_suffix}_save_regs_and_resolve
.type _${lib_suffix}_save_regs_and_resolve, %function
_${lib_suffix}_save_regs_and_resolve:
.cfi_startproc

// Push/pop one 32-bit core register (4 bytes) and keep the CFI in sync.
#define PUSH_REG(reg) push {reg}; .cfi_adjust_cfa_offset 4; .cfi_rel_offset reg, 0
#define POP_REG(reg) pop {reg} ; .cfi_adjust_cfa_offset -4; .cfi_restore reg

// Push/pop two 64-bit D registers (16 bytes) and keep the CFI in sync.
// Binutils 2.30 does not like q0 in .cfi_rel_offset
#define PUSH_DREG_PAIR(reg1, reg2) vpush {reg1, reg2}; .cfi_adjust_cfa_offset 16; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 8
#define POP_DREG_PAIR(reg1, reg2) vpop {reg1, reg2}; .cfi_adjust_cfa_offset -16; .cfi_restore reg1; .cfi_restore reg2

// Slow path which calls dlsym, taken only on first call.
// Registers are saved acc. to "Procedure Call Standard for the ARM Architecture".
// For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html.

// Stack is aligned at 16 bytes at this point
// NOTE(review): the duplicate lr push below is commented as aligning to
// 8 bytes, so "16 bytes" here may be a leftover from the AArch64 version -
// confirm.

// Save only arguments (and lr)
PUSH_REG(r0)
// After pushing r0 the stack holds r0 at sp, the trampoline's lr at sp+4 and
// the symbol number at sp+8; load the number as the first argument.
ldr r0, [sp, #8]
PUSH_REG(r1)
PUSH_REG(r2)
PUSH_REG(r3)
PUSH_REG(lr)
PUSH_REG(lr) // Align to 8 bytes

// Arguments can be passed in VFP registers only when hard-float ABI is used
// for arm-gnueabihf target (http://android-doc.github.io/ndk/guides/abis.html#v7a).
// Use compiler macro to detect this case.
// NOTE(review): d8-d15 are saved as well as d0-d7; if they are callee-saved
// under the procedure call standard this is redundant but harmless - confirm.
#ifdef __ARM_PCS_VFP
PUSH_DREG_PAIR(d0, d1)
PUSH_DREG_PAIR(d2, d3)
PUSH_DREG_PAIR(d4, d5)
PUSH_DREG_PAIR(d6, d7)
PUSH_DREG_PAIR(d8, d9)
PUSH_DREG_PAIR(d10, d11)
PUSH_DREG_PAIR(d12, d13)
PUSH_DREG_PAIR(d14, d15)
// FIXME: NEON actually supports 32 D-registers but it's unclear how to detect this
#endif

bl _${lib_suffix}_tramp_resolve(PLT)

// Restore in exact reverse order of the pushes above.
#ifdef __ARM_PCS_VFP
POP_DREG_PAIR(d14, d15)
POP_DREG_PAIR(d12, d13)
POP_DREG_PAIR(d10, d11)
POP_DREG_PAIR(d8, d9)
POP_DREG_PAIR(d6, d7)
POP_DREG_PAIR(d4, d5)
POP_DREG_PAIR(d2, d3)
POP_DREG_PAIR(d0, d1)
#endif

// lr was pushed twice (the second copy only for alignment), so it is popped
// twice; both pops load the same value.
POP_REG(lr) // TODO: pop pc?
POP_REG(lr)
POP_REG(r3)
POP_REG(r2)
POP_REG(r1)
POP_REG(r0)

bx lr

.cfi_endproc
49 changes: 49 additions & 0 deletions source/3rdparty/implib/arch/arm/trampoline.S.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright 2018-2023 Yury Gribov
*
* The MIT License (MIT)
*
* Use of this source code is governed by MIT license that can be
* found in the LICENSE.txt file.
*/

// Trampoline for one imported symbol (32-bit ARM).
//
// Fast path: load this symbol's word from the trampoline table and branch to
// the loaded address when it is non-zero.
// Slow path: push the symbol number and lr, call the save-and-resolve
// helper, unwind the two pushes and retry the lookup.
// ip is the only scratch register used here.
// NOTE(review): PUSH_REG and POP_REG are not defined in this template;
// presumably it is emitted after table.S.tpl, which defines them - confirm.
.globl $sym
.p2align 4
.type $sym, %function
// The symbol is hidden unless IMPLIB_EXPORT_SHIMS is defined.
#ifndef IMPLIB_EXPORT_SHIMS
.hidden $sym
#endif
$sym:
.cfi_startproc

1:
// Load address
// TODO: can we do this faster on newer ARMs?
// The literal at label 3 holds the distance from (label 2 + 8) to this
// symbol's table slot. Adding pc at label 2 turns it into the slot address.
// NOTE(review): the + 8 bias in the literal matches how pc reads in ARM
// state; confirm this code is never assembled as Thumb.
ldr ip, 3f
2:
add ip, pc, ip
ldr ip, [ip]

// A zero slot falls through to the slow path.
cmp ip, #0

// Fast path
bxne ip

// Slow path
// Load the symbol number from the literal pool emitted by .ltorg below and
// push it; the helper reads it back from the stack.
ldr ip, =$number
push {ip}
.cfi_adjust_cfa_offset 4
PUSH_REG(lr)
bl _${lib_suffix}_save_regs_and_resolve
POP_REG(lr)
// Discard the pushed symbol number.
add sp, #4
.cfi_adjust_cfa_offset -4
// Retry the lookup; presumably the resolver has filled the table slot by now.
b 1b

// Force constant pool for ldr above
.ltorg

.cfi_endproc

// PC-relative offset of this symbol's table slot, consumed by the ldr/add
// pair at the top of the trampoline.
3:
.word _${lib_suffix}_tramp_table - (2b + 8) + $offset
Loading

0 comments on commit f256dff

Please sign in to comment.