diff --git a/tinygrad/runtime/ops_nv.py b/tinygrad/runtime/ops_nv.py index f5b7585589283..f2e16c3a011c3 100644 --- a/tinygrad/runtime/ops_nv.py +++ b/tinygrad/runtime/ops_nv.py @@ -230,12 +230,12 @@ def __init__(self, device:NVDevice, name:str, lib:bytes): # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults. self.lib_gpu = self.device.allocator.alloc(round_up(image.nbytes, 0x1000) + 0x1000, BufferOptions(cpu_access=True)) - self.program_addr, self.program_sz, self.registers_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0, 0 + self.prog_addr, self.prog_sz, self.regs_usage, self.shmem_usage, self.lcmem_usage = self.lib_gpu.va_addr, image.nbytes, 0, 0, 0 self.constbufs: Dict[int, Tuple[int, int]] = {0: (0, 0x160)} # Dict[constbuf index, Tuple[va_addr, size]] for sh in sections: if sh.name == f".nv.shared.{self.name}": self.shmem_usage = sh.header.sh_size if sh.name == f".text.{self.name}": - self.program_addr, self.program_sz, self.registers_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, sh.header.sh_info>>24 + self.prog_addr, self.prog_sz, self.regs_usage = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size, max(sh.header.sh_info>>24, 16) elif m:=re.match(r'\.nv\.constant(\d+)', sh.name): self.constbufs[int(m.group(1))] = (self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size) elif sh.name == ".nv.info": for off in range(0, sh.header.sh_size, 12): @@ -263,10 +263,10 @@ def __init__(self, device:NVDevice, name:str, lib:bytes): invalidate_texture_data_cache=1, invalidate_shader_data_cache=1, api_visible_call_limit=1, sampler_index=1, cwd_membar_type=nv_gpu.NVC6C0_QMDV03_00_CWD_MEMBAR_TYPE_L1_SYSMEMBAR, qmd_major_version=3, constant_buffer_invalidate_0=1, shared_memory_size=max(0x400, round_up(self.shmem_usage, 0x100)), min_sm_config_shared_mem_size=smem_config, - max_sm_config_shared_mem_size=0x1a, register_count_v=self.registers_usage, target_sm_config_shared_mem_size=smem_config, - barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.program_sz>>8, - program_address=self.program_addr, sass_version=0x89, - program_prefetch_addr_lower_shifted=self.program_addr>>8, program_prefetch_addr_upper_shifted=self.program_addr>>40) + max_sm_config_shared_mem_size=0x1a, register_count_v=self.regs_usage, target_sm_config_shared_mem_size=smem_config, + barrier_count=1, shader_local_memory_high_size=self.device.slm_per_thread, program_prefetch_size=self.prog_sz>>8, + program_address=self.prog_addr, sass_version=0x89, + program_prefetch_addr_lower_shifted=self.prog_addr>>8, program_prefetch_addr_upper_shifted=self.prog_addr>>40) for i,(addr,sz) in self.constbufs.items(): self.qmd.__setattr__(f'constant_buffer_addr_upper_{i}', (addr) >> 32) @@ -275,7 +275,7 @@ def __init__(self, device:NVDevice, name:str, lib:bytes): self.qmd.__setattr__(f'constant_buffer_valid_{i}', 1) # Registers allocation granularity per warp is 256, warp allocaiton granularity is 4. Register file size is 65536. - self.max_threads = ((65536 // round_up(max(1, self.registers_usage) * 32, 256)) // 4) * 4 * 32 + self.max_threads = ((65536 // round_up(max(1, self.regs_usage) * 32, 256)) // 4) * 4 * 32 # NV's kernargs is constbuffer (size 0x160), then arguments to the kernel follows. Kernargs also appends QMD at the end of the kernel. super().__init__(NVArgsState, self.device, self.name, kernargs_alloc_size=round_up(self.constbufs[0][1], 1 << 8) + (8 << 8))