Skip to content

Commit

Permalink
qcom cleanup allocs (tinygrad#7200)
Browse files Browse the repository at this point in the history
* qcom cleanup allocs

* oops
  • Loading branch information
nimlgen authored Oct 21, 2024
1 parent f37e6b4 commit 21acfc3
Showing 1 changed file with 13 additions and 17 deletions.
30 changes: 13 additions & 17 deletions tinygrad/runtime/ops_qcom.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add

if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size)
else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE, map_to_cpu=True)
else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE)

texture.pitch, texture.real_stride = pitch, real_stride

Expand All @@ -312,7 +312,7 @@ def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:

return texture

return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.device._gpu_alloc(size, map_to_cpu=True)
return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.device._gpu_alloc(size)

def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0):
while src_off < src_size:
Expand All @@ -336,7 +336,6 @@ def _free(self, opaque, options:BufferOptions):
self.device.synchronize()
self.device._gpu_free(opaque)

MAP_FIXED = 0x10
class QCOMDevice(HCQCompiled):
signals_page: Any = None
signals_pool: List[Any] = []
Expand All @@ -345,10 +344,10 @@ class QCOMDevice(HCQCompiled):

def __init__(self, device:str=""):
self.fd = os.open('/dev/kgsl-3d0', os.O_RDWR)
QCOMDevice.dummy_addr = self._gpu_alloc(0x1000, map_to_cpu=False).va_addr
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, map_to_cpu=True, uncached=True)
QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr
QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
QCOMDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, self.signals_page.size, 16)]
info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20, map_to_cpu=True), 0,0
info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0
QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")

Expand All @@ -370,29 +369,26 @@ def _info(self):
kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
return info

def _gpu_alloc(self, size:int, flags:int=0, map_to_cpu=False, uncached=False, fill_zeroes=False):
def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)

alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=round_up(size, 1<<(alignment_hint:=12)), flags=flags | kgsl.KGSL_MEMALIGN(alignment_hint))
info = kgsl.IOCTL_KGSL_GPUOBJ_INFO(self.fd, id=alloc.id)
va_addr, va_len = info.gpuaddr, info.va_len
alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=(bosz:=round_up(size, 1<<alignment_hint)), flags=flags, mmapsize=bosz)
va_addr = libc.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.fd, alloc.id * 0x1000)

if map_to_cpu:
va_addr = libc.mmap(va_addr, va_len, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED|MAP_FIXED, self.fd, alloc.id * 0x1000)
if fill_zeroes: ctypes.memset(va_addr, 0, va_len)

return QCOMBuffer(va_addr=va_addr, size=size, mapped=map_to_cpu, info=alloc)
if fill_zeroes: ctypes.memset(va_addr, 0, size)
return QCOMBuffer(va_addr=va_addr, size=size, info=alloc)

def _gpu_free(self, mem):
kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
if mem.mapped: libc.munmap(mem.va_addr, mem.info.va_len)
libc.munmap(mem.va_addr, mem.info.mmapsize)

def _alloc_cmd_buf(self, sz: int):
self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz
return self.cmd_buf.va_addr + cur_ptr

def _border_color_base(self):
if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, map_to_cpu=True, fill_zeroes=True)
if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, fill_zeroes=True)
return self._border_color_gpu.va_addr

def _ensure_stack_size(self, sz):
Expand Down

0 comments on commit 21acfc3

Please sign in to comment.