Skip to content

Commit

Permalink
hcq no timeline signals in init (tinygrad#6944)
Browse files: browse the repository at this point in the history
  • Loading branch information
nimlgen authored Oct 7, 2024
1 parent 0ecc417 commit 4260930
Show file tree
Hide file tree
Showing 4 changed files with 10 additions and 10 deletions.
6 changes: 3 additions & 3 deletions tinygrad/runtime/ops_amd.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,10 +30,10 @@ def gfxreg(reg): return reg + 0x00001260 - amd_gpu.PACKET3_SET_SH_REG_START
def nbioreg(reg): return reg + 0x00000d20 # NBIO_BASE__INST0_SEG2

class AMDSignal(HCQSignal):
def __init__(self, value=0, alloc_event=False):
def __init__(self, value=0, is_timeline=False):
self._signal = AMDDevice.signals_pool.pop()
self._value_addr, self._timestamp_addr = mv_address(self._signal), mv_address(self._signal) + 8
if alloc_event:
if is_timeline:
sync_event = kfd.AMDKFD_IOC_CREATE_EVENT(AMDDevice.kfd, auto_reset=1)
self._event_mailbox_ptr = AMDDevice.event_page.va_addr + sync_event.event_slot_index*8
self._event_id = sync_event.event_id
Expand Down Expand Up @@ -418,7 +418,7 @@ def __init__(self, device:str=""):
self.sdma_queue = self._alloc_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x100000)

super().__init__(device, AMDAllocator(self), AMDRenderer(), AMDCompiler(self.arch), functools.partial(AMDProgram, self),
AMDSignal, AMDComputeQueue, AMDCopyQueue, (AMDSignal(alloc_event=True), AMDSignal(alloc_event=True)))
AMDSignal, AMDComputeQueue, AMDCopyQueue)

def _alloc_queue(self, queue_type, ring_size, ctx_save_restore_size=None, eop_buffer_size=None) -> AMDQueueDesc:
gart = self._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
Expand Down
4 changes: 2 additions & 2 deletions tinygrad/runtime/ops_nv.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -68,7 +68,7 @@ def make_qmd_struct_type():
def nvmethod(subc, mthd, size, typ=2): return (typ << 28) | (size << 16) | (subc << 13) | (mthd >> 2)

class NVSignal(HCQSignal):
def __init__(self, value=0):
def __init__(self, value=0, is_timeline=False):
self._signal = NVDevice.signals_pool.pop()
self.signal_addr = mv_address(self._signal)
super().__init__(value)
Expand Down Expand Up @@ -480,7 +480,7 @@ def __init__(self, device:str=""):

compiler_t = (PTXCompiler if PTX else CUDACompiler) if MOCKGPU else (NVPTXCompiler if PTX else NVCompiler)
super().__init__(device, NVAllocator(self), PTXRenderer(self.arch, device="NV") if PTX else NVRenderer(self.arch), compiler_t(self.arch),
functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue, timeline_signals=(NVSignal(), NVSignal()))
functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)

self._setup_gpfifos()

Expand Down
4 changes: 2 additions & 2 deletions tinygrad/runtime/ops_qcom.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,7 +32,7 @@ class QCOMCompiler(CLCompiler):
def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')

class QCOMSignal(HCQSignal):
def __init__(self, value=0, **kwargs):
def __init__(self, value=0, is_timeline=False):
self._signal = QCOMDevice.signals_pool.pop()
super().__init__(value)
def __del__(self): QCOMDevice.signals_pool.append(self._signal)
Expand Down Expand Up @@ -351,7 +351,7 @@ def __init__(self, device:str=""):
if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")

super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
QCOMSignal, QCOMComputeQueue, None, timeline_signals=(QCOMSignal(), QCOMSignal()))
QCOMSignal, QCOMComputeQueue, None)

def _ctx_create(self):
cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT |
Expand Down
6 changes: 3 additions & 3 deletions tinygrad/runtime/support/hcq.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -195,7 +195,7 @@ def update_copy(self, cmd_idx:int, dest:Optional[HCQBuffer]=None, src:Optional[H
def _update_copy(self, cmd_idx, dest, src): raise NotImplementedError("backend should overload this function")

class HCQSignal:
def __init__(self, value:int=0): self._set_value(value)
def __init__(self, value:int=0, is_timeline:bool=False): self._set_value(value)

@property
def value(self) -> int: return self._get_value()
Expand Down Expand Up @@ -346,10 +346,10 @@ class HCQCompiled(Compiled):
gpu2cpu_compute_time_diff: decimal.Decimal = decimal.Decimal('nan')

def __init__(self, device:str, allocator:Allocator, renderer:Renderer, compiler:Compiler, runtime, signal_t:Type[HCQSignal],
comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]], timeline_signals:Tuple[HCQSignal, HCQSignal]):
comp_queue_t:Type[HWComputeQueue], copy_queue_t:Optional[Type[HWCopyQueue]]):
self.signal_t, self.hw_compute_queue_t, self.hw_copy_queue_t = signal_t, comp_queue_t, copy_queue_t
self.timeline_value:int = 1
self.timeline_signal, self._shadow_timeline_signal = timeline_signals
self.timeline_signal, self._shadow_timeline_signal = self.signal_t(0, is_timeline=True), self.signal_t(0, is_timeline=True)
self.sig_prof_records:List[Tuple[HCQSignal, HCQSignal, str, bool]] = []
self.raw_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, str, bool, Optional[Dict]]] = []
self.dep_prof_records:List[Tuple[decimal.Decimal, decimal.Decimal, HCQCompiled, bool, decimal.Decimal, decimal.Decimal, HCQCompiled, bool]] = []
Expand Down

0 comments on commit 4260930

Please sign in to comment.