-
Notifications
You must be signed in to change notification settings - Fork 526
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Generate CUDA stubs dynamically (#2884)
This PR follows google/tsl@726288e and has the same motivation as that commit. Assembly-language stubs for `libcudart.so` are generated dynamically by [Implib.so](https://github.com/yugr/Implib.so) during CMake execution, instead of storing many versions of the stub files in the repository. Implib.so is vendored in the `source/3rdparty` directory (about 156 KB) with a slight modification (to tolerate the CUDA library not being found). --------- Signed-off-by: Jinzhe Zeng <[email protected]>
- Loading branch information
Showing
37 changed files
with
1,757 additions
and
17,543 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# do not show detailed diffs on GitHub | ||
source/3rdparty/* linguist-generated=true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[Arch] | ||
PointerSize = 8 | ||
SymbolReloc = R_AARCH64_ABS64 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/* | ||
* Copyright 2018-2020 Yury Gribov | ||
* | ||
* The MIT License (MIT) | ||
* | ||
* Use of this source code is governed by MIT license that can be | ||
* found in the LICENSE.txt file. | ||
*/ | ||
|
||
// Symbolic names for x30 (link register) and x16 (IP0 scratch register). | ||
#define lr x30 | ||
#define ip0 x16 | ||
|
||
.data | ||
|
||
// Zero-filled table of $table_size bytes placed in .data. | ||
// The per-symbol trampolines load their target address from a slot of this table | ||
// and take the slow path while the slot still reads as zero. | ||
// The symbol is global but hidden. | ||
// NOTE(review): GNU as on Arm targets reportedly treats the .align operand as a power | ||
// of two, which would make this 256-byte alignment for a table of 8-byte pointers -- | ||
// confirm this is intended. | ||
.globl _${lib_suffix}_tramp_table | ||
.hidden _${lib_suffix}_tramp_table | ||
.align 8 | ||
_${lib_suffix}_tramp_table: | ||
.zero $table_size | ||
|
||
.text | ||
|
||
// The resolver is only declared here (global, hidden) and is called with bl below. | ||
// Its definition is not part of this template. | ||
.globl _${lib_suffix}_tramp_resolve | ||
.hidden _${lib_suffix}_tramp_resolve | ||
|
||
// _${lib_suffix}_save_regs_and_resolve | ||
// Purpose: preserve x0-x8, lr and q0-q7 around a call to _${lib_suffix}_tramp_resolve. | ||
// In:      the caller must have pushed one 16-byte pair before the bl that enters here. | ||
//          The first slot of that pair is loaded into x0 and passed to the resolver. | ||
// Out:     every register saved here is restored, so x0 does not carry a result. | ||
// Stack:   5 * 16 + 4 * 32 = 208 bytes, fully released before returning. | ||
.globl _${lib_suffix}_save_regs_and_resolve | ||
.hidden _${lib_suffix}_save_regs_and_resolve | ||
.type _${lib_suffix}_save_regs_and_resolve, %function | ||
_${lib_suffix}_save_regs_and_resolve: | ||
.cfi_startproc | ||
|
||
// Slow path which calls dlsym, taken only on first call. | ||
// Registers are saved according to "Procedure Call Standard for the Arm® 64-bit Architecture". | ||
// For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html. | ||
|
||
// Stack is aligned at 16 bytes | ||
|
||
// PUSH_PAIR / POP_PAIR move two registers through one 16-byte stack slot with | ||
// pre-decrement / post-increment addressing and keep the CFA offset and the | ||
// per-register CFI rules in sync with each stack change. | ||
#define PUSH_PAIR(reg1, reg2) stp reg1, reg2, [sp, #-16]!; .cfi_adjust_cfa_offset 16; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 8 | ||
#define POP_PAIR(reg1, reg2) ldp reg1, reg2, [sp], #16; .cfi_adjust_cfa_offset -16; .cfi_restore reg2; .cfi_restore reg1 | ||
|
||
// Same as above for a pair that needs 32 bytes (16 bytes per register). | ||
// Used below for the q registers. | ||
#define PUSH_WIDE_PAIR(reg1, reg2) stp reg1, reg2, [sp, #-32]!; .cfi_adjust_cfa_offset 32; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 16 | ||
#define POP_WIDE_PAIR(reg1, reg2) ldp reg1, reg2, [sp], #32; .cfi_adjust_cfa_offset -32; .cfi_restore reg2; .cfi_restore reg1 | ||
|
||
// Save only arguments (and lr) | ||
PUSH_PAIR(x0, x1) | ||
PUSH_PAIR(x2, x3) | ||
PUSH_PAIR(x4, x5) | ||
PUSH_PAIR(x6, x7) | ||
PUSH_PAIR(x8, lr) | ||
|
||
// Five 16-byte pairs were pushed above, so [sp, #80] is the first slot of the pair | ||
// that the caller pushed before entering. It becomes the first resolver argument. | ||
// This load must stay between the integer pushes and the q-register pushes, | ||
// otherwise the hard-coded offset no longer matches. | ||
ldr x0, [sp, #80] // 16*5 | ||
|
||
PUSH_WIDE_PAIR(q0, q1) | ||
PUSH_WIDE_PAIR(q2, q3) | ||
PUSH_WIDE_PAIR(q4, q5) | ||
PUSH_WIDE_PAIR(q6, q7) | ||
|
||
// Stack is aligned at 16 bytes | ||
|
||
bl _${lib_suffix}_tramp_resolve | ||
|
||
// TODO: pop pc? | ||
|
||
// Restore in exact reverse order of the pushes. | ||
// Whatever the resolver left in x0 is overwritten by POP_PAIR(x0, x1). | ||
POP_WIDE_PAIR(q6, q7) | ||
POP_WIDE_PAIR(q4, q5) | ||
POP_WIDE_PAIR(q2, q3) | ||
POP_WIDE_PAIR(q0, q1) | ||
|
||
POP_PAIR(x8, lr) | ||
POP_PAIR(x6, x7) | ||
POP_PAIR(x4, x5) | ||
POP_PAIR(x2, x3) | ||
POP_PAIR(x0, x1) | ||
|
||
// lr was reloaded by POP_PAIR(x8, lr), so this branches back to the caller. | ||
br lr | ||
|
||
.cfi_endproc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* | ||
* Copyright 2018-2023 Yury Gribov | ||
* | ||
* The MIT License (MIT) | ||
* | ||
* Use of this source code is governed by MIT license that can be | ||
* found in the LICENSE.txt file. | ||
*/ | ||
|
||
// Per-symbol trampoline for $sym. | ||
// Fast path: load the address stored at _${lib_suffix}_tramp_table+$offset and, | ||
//            if it is non-zero, branch to it without touching any argument register. | ||
// Slow path: push $number together with lr, call _${lib_suffix}_save_regs_and_resolve, | ||
//            pop the pair again and retry from label 1. | ||
// Clobbers ip0 only. The symbol is hidden unless IMPLIB_EXPORT_SHIMS is defined. | ||
.globl $sym | ||
.p2align 4 | ||
.type $sym, %function | ||
#ifndef IMPLIB_EXPORT_SHIMS | ||
.hidden $sym | ||
#endif | ||
$sym: | ||
.cfi_startproc | ||
|
||
1: | ||
// Load address | ||
// TODO: can we do this faster on newer ARMs? | ||
// adrp forms the PC-relative base of the slot and :lo12: adds its low 12 bits. | ||
adrp ip0, _${lib_suffix}_tramp_table+$offset | ||
ldr ip0, [ip0, #:lo12:_${lib_suffix}_tramp_table+$offset] | ||
|
||
// A zero slot sends us to the slow path at label 2. | ||
cbz ip0, 2f | ||
|
||
// Fast path | ||
br ip0 | ||
|
||
2: | ||
// Slow path | ||
// Build $number in ip0: low 16 bits with mov, upper bits with movk only when | ||
// the value does not fit in 16 bits. | ||
mov ip0, $number & 0xffff | ||
#if $number > 0xffff | ||
movk ip0, $number >> 16, lsl #16 | ||
#endif | ||
// Push the pair (ip0, lr). The helper reads the first slot of this pair into x0 | ||
// before calling the resolver, and bl overwrites lr, hence both are saved. | ||
stp ip0, lr, [sp, #-16]!; .cfi_adjust_cfa_offset 16; .cfi_rel_offset ip0, 0; .cfi_rel_offset lr, 8; | ||
bl _${lib_suffix}_save_regs_and_resolve | ||
// Drop the pair, restoring lr, then reload the table slot. | ||
// NOTE(review): presumably the resolver has filled the slot by now. The resolver is | ||
// not visible here, and a slot that stays zero would loop forever -- confirm. | ||
ldp ip0, lr, [sp], #16; .cfi_adjust_cfa_offset -16; .cfi_restore lr; .cfi_restore ip0 | ||
b 1b | ||
.cfi_endproc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[Arch] | ||
PointerSize = 4 | ||
SymbolReloc = R_ARM_ABS32 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* | ||
* Copyright 2018-2022 Yury Gribov | ||
* | ||
* The MIT License (MIT) | ||
* | ||
* Use of this source code is governed by MIT license that can be | ||
* found in the LICENSE.txt file. | ||
*/ | ||
|
||
.data | ||
|
||
// Zero-filled table of $table_size bytes placed in .data. | ||
// The per-symbol trampolines load their target address from a slot of this table | ||
// and take the slow path while the slot still reads as zero. | ||
// The symbol is global but hidden. | ||
// NOTE(review): GNU as on Arm targets reportedly treats the .align operand as a power | ||
// of two, which would make this 16-byte alignment for a table of 4-byte pointers -- | ||
// confirm this is intended. | ||
.globl _${lib_suffix}_tramp_table | ||
.hidden _${lib_suffix}_tramp_table | ||
.align 4 | ||
_${lib_suffix}_tramp_table: | ||
.zero $table_size | ||
|
||
.text | ||
|
||
// The resolver is only declared here (global, hidden) and is called with bl below. | ||
// Its definition is not part of this template. | ||
.globl _${lib_suffix}_tramp_resolve | ||
.hidden _${lib_suffix}_tramp_resolve | ||
|
||
// _${lib_suffix}_save_regs_and_resolve | ||
// Purpose: preserve r0-r3 and lr (plus d0-d15 when __ARM_PCS_VFP is defined) around | ||
//          a call to _${lib_suffix}_tramp_resolve. | ||
// In:      the caller must have pushed two words before the bl that enters here. | ||
//          The second word (higher address) is loaded into r0 for the resolver. | ||
// Out:     every register saved here is restored, so r0 does not carry a result. | ||
.globl _${lib_suffix}_save_regs_and_resolve | ||
.hidden _${lib_suffix}_save_regs_and_resolve | ||
.type _${lib_suffix}_save_regs_and_resolve, %function | ||
_${lib_suffix}_save_regs_and_resolve: | ||
.cfi_startproc | ||
|
||
// PUSH_REG / POP_REG move one core register through a 4-byte stack slot and keep | ||
// the CFA offset and the CFI rule of that register in sync. | ||
#define PUSH_REG(reg) push {reg}; .cfi_adjust_cfa_offset 4; .cfi_rel_offset reg, 0 | ||
#define POP_REG(reg) pop {reg} ; .cfi_adjust_cfa_offset -4; .cfi_restore reg | ||
|
||
// Binutils 2.30 does not like q0 in .cfi_rel_offset | ||
// Hence the VFP state is saved as pairs of d registers, 16 bytes per pair. | ||
#define PUSH_DREG_PAIR(reg1, reg2) vpush {reg1, reg2}; .cfi_adjust_cfa_offset 16; .cfi_rel_offset reg1, 0; .cfi_rel_offset reg2, 8 | ||
#define POP_DREG_PAIR(reg1, reg2) vpop {reg1, reg2}; .cfi_adjust_cfa_offset -16; .cfi_restore reg1; .cfi_restore reg2 | ||
|
||
// Slow path which calls dlsym, taken only on first call. | ||
// Registers are saved acc. to "Procedure Call Standard for the ARM Architecture". | ||
// For DWARF directives, read https://www.imperialviolet.org/2017/01/18/cfi.html. | ||
|
||
// Stack is aligned at 16 bytes at this point | ||
|
||
// Save only arguments (and lr) | ||
PUSH_REG(r0) | ||
// With r0 pushed, [sp] is the saved r0, and [sp, #4] and [sp, #8] are the two words | ||
// the caller pushed. The word at #8 becomes the first resolver argument. | ||
// This load must stay right after the first push, otherwise the offset is wrong. | ||
ldr r0, [sp, #8] | ||
PUSH_REG(r1) | ||
PUSH_REG(r2) | ||
PUSH_REG(r3) | ||
PUSH_REG(lr) | ||
PUSH_REG(lr) // Align to 8 bytes | ||
|
||
// Arguments can be passed in VFP registers only when hard-float ABI is used | ||
// for arm-gnueabihf target (http://android-doc.github.io/ndk/guides/abis.html#v7a). | ||
// Use compiler macro to detect this case. | ||
#ifdef __ARM_PCS_VFP | ||
PUSH_DREG_PAIR(d0, d1) | ||
PUSH_DREG_PAIR(d2, d3) | ||
PUSH_DREG_PAIR(d4, d5) | ||
PUSH_DREG_PAIR(d6, d7) | ||
PUSH_DREG_PAIR(d8, d9) | ||
PUSH_DREG_PAIR(d10, d11) | ||
PUSH_DREG_PAIR(d12, d13) | ||
PUSH_DREG_PAIR(d14, d15) | ||
// FIXME: NEON actually supports 32 D-registers but it's unclear how to detect this | ||
#endif | ||
|
||
bl _${lib_suffix}_tramp_resolve(PLT) | ||
|
||
// Restore in exact reverse order of the pushes. | ||
// Whatever the resolver left in r0 is overwritten by POP_REG(r0). | ||
#ifdef __ARM_PCS_VFP | ||
POP_DREG_PAIR(d14, d15) | ||
POP_DREG_PAIR(d12, d13) | ||
POP_DREG_PAIR(d10, d11) | ||
POP_DREG_PAIR(d8, d9) | ||
POP_DREG_PAIR(d6, d7) | ||
POP_DREG_PAIR(d4, d5) | ||
POP_DREG_PAIR(d2, d3) | ||
POP_DREG_PAIR(d0, d1) | ||
#endif | ||
|
||
// lr was pushed twice (the second copy is alignment padding), so it is popped twice. | ||
POP_REG(lr) // TODO: pop pc? | ||
POP_REG(lr) | ||
POP_REG(r3) | ||
POP_REG(r2) | ||
POP_REG(r1) | ||
POP_REG(r0) | ||
|
||
// lr holds the reloaded return address, so this branches back to the caller. | ||
bx lr | ||
|
||
.cfi_endproc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
/* | ||
* Copyright 2018-2023 Yury Gribov | ||
* | ||
* The MIT License (MIT) | ||
* | ||
* Use of this source code is governed by MIT license that can be | ||
* found in the LICENSE.txt file. | ||
*/ | ||
|
||
// Per-symbol trampoline for $sym. | ||
// Fast path: load the address stored at _${lib_suffix}_tramp_table + $offset and, | ||
//            if it is non-zero, branch to it without touching any argument register. | ||
// Slow path: push $number and lr, call _${lib_suffix}_save_regs_and_resolve, | ||
//            unwind both words and retry from label 1. | ||
// Clobbers ip and the condition flags. | ||
// The symbol is hidden unless IMPLIB_EXPORT_SHIMS is defined. | ||
.globl $sym | ||
.p2align 4 | ||
.type $sym, %function | ||
#ifndef IMPLIB_EXPORT_SHIMS | ||
.hidden $sym | ||
#endif | ||
$sym: | ||
.cfi_startproc | ||
|
||
1: | ||
// Load address | ||
// TODO: can we do this faster on newer ARMs? | ||
// The word at label 3 holds the distance from (2b + 8) to the table slot. | ||
// Adding pc at label 2 turns it into the slot address, which is then dereferenced. | ||
// NOTE(review): the + 8 presumably matches the pc read-ahead in ARM state. | ||
// Thumb would differ -- confirm this template is only assembled as ARM. | ||
ldr ip, 3f | ||
2: | ||
add ip, pc, ip | ||
ldr ip, [ip] | ||
|
||
cmp ip, #0 | ||
|
||
// Fast path | ||
bxne ip | ||
|
||
// Slow path | ||
// $number is loaded through the literal pool emitted by .ltorg below, then pushed. | ||
// With lr pushed next, it sits one word above lr for the helper to read. | ||
ldr ip, =$number | ||
push {ip} | ||
.cfi_adjust_cfa_offset 4 | ||
// NOTE(review): PUSH_REG and POP_REG are not defined in this template. The only | ||
// visible definitions are in the ARM table template, so this presumably relies on | ||
// both being emitted into the same preprocessed file -- confirm. | ||
PUSH_REG(lr) | ||
bl _${lib_suffix}_save_regs_and_resolve | ||
POP_REG(lr) | ||
// Discard the pushed $number word, then reload the table slot. | ||
add sp, #4 | ||
.cfi_adjust_cfa_offset -4 | ||
b 1b | ||
|
||
// Force constant pool for ldr above | ||
.ltorg | ||
|
||
.cfi_endproc | ||
|
||
// PC-relative displacement consumed by the ldr / add pair at labels 1 and 2. | ||
3: | ||
.word _${lib_suffix}_tramp_table - (2b + 8) + $offset |
Oops, something went wrong.