Deployed 6e2e56c with MkDocs version: 1.6.1

chenyuxyz · Dec 17, 2024 · 0857250 · 0857250
commit 0857250
Show file tree

Hide file tree

Showing 85 changed files with 57,445 additions and 0 deletions.
diff --git a/.nojekyll b/.nojekyll
diff --git a/404.html b/404.html
diff --git a/CNAME b/CNAME
@@ -0,0 +1 @@
+docs.tinygrad.org
diff --git a/abstractions2.py b/abstractions2.py
@@ -0,0 +1,118 @@
+# tinygrad is a tensor library, and as a tensor library it has multiple parts
+# 1. a "runtime". this allows buffer management, compilation, and running programs
+# 2. a "Device" that uses the runtime but specifies compute in an abstract way for all devices
+# 3. a "LazyBuffer" that fuses the compute into kernels, using memory only when needed
+# 4. a "Tensor" that provides an easy-to-use frontend with autograd ".backward()"
+
+
+print("******** first, the runtime ***********")
+
+from tinygrad.runtime.ops_clang import ClangProgram, ClangCompiler, MallocAllocator
+
+# allocate some buffers
+out = MallocAllocator.alloc(4)
+a = MallocAllocator.alloc(4)
+b = MallocAllocator.alloc(4)
+
+# load in some values (little endian)
+MallocAllocator._copyin(a, memoryview(bytearray([2,0,0,0])))
+MallocAllocator._copyin(b, memoryview(bytearray([3,0,0,0])))
+
+# compile a program to a binary
+lib = ClangCompiler().compile("void add(int *out, int *a, int *b) { out[0] = a[0] + b[0]; }")
+
+# create a runtime for the program (ctypes.CDLL)
+fxn = ClangProgram("add", lib)
+
+# run the program
+fxn(out, a, b)
+
+# check the data out
+print(val := MallocAllocator._as_buffer(out).cast("I").tolist()[0])
+assert val == 5
+
+
+print("******** second, the Device ***********")
+
+DEVICE = "CLANG"   # NOTE: you can change this!
+
+import struct
+from tinygrad.dtype import dtypes
+from tinygrad.device import Buffer, Device
+from tinygrad.ops import UOp, Ops
+from tinygrad.shape.shapetracker import ShapeTracker
+
+# allocate some buffers + load in values
+out = Buffer(DEVICE, 1, dtypes.int32).allocate()
+a = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
+b = Buffer(DEVICE, 1, dtypes.int32).allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
+# NOTE: a._buf is the same as the return from MallocAllocator.alloc
+
+# describe the computation
+buf_1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 1)
+buf_2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 2)
+ld_1 = UOp(Ops.LOAD, dtypes.int32, (buf_1, ShapeTracker.from_shape((1,)).to_uop()))
+ld_2 = UOp(Ops.LOAD, dtypes.int32, (buf_2, ShapeTracker.from_shape((1,)).to_uop()))
+alu = ld_1 + ld_2
+output_buf = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
+st_0 = UOp(Ops.STORE, dtypes.void, (output_buf, ShapeTracker.from_shape((1,)).to_uop(), alu))
+s = UOp(Ops.SINK, dtypes.void, (st_0,))
+
+# convert the computation to a "linearized" format
+from tinygrad.engine.realize import get_kernel, CompiledRunner
+kernel = get_kernel(Device[DEVICE].renderer, s).linearize()
+
+# compile a program (and print the source)
+fxn = CompiledRunner(kernel.to_program())
+print(fxn.p.src)
+# NOTE: fxn.clprg is the ClangProgram
+
+# run the program
+fxn.exec([out, a, b])
+
+# check the data out
+assert out.as_buffer().cast('I')[0] == 5
+
+
+print("******** third, the LazyBuffer ***********")
+
+from tinygrad.engine.realize import run_schedule
+from tinygrad.engine.schedule import create_schedule
+
+# allocate some buffers + load in values
+a = UOp.metaop(Ops.EMPTY, (1,), dtypes.int32, DEVICE)
+b = UOp.metaop(Ops.EMPTY, (1,), dtypes.int32, DEVICE)
+a.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 2))))
+b.buffer.allocate().copyin(memoryview(bytearray(struct.pack("I", 3))))
+a = a.buf_uop_view()
+b = b.buf_uop_view()
+
+# describe the computation
+out = a.alu(Ops.ADD, b)
+
+# schedule the computation as a list of kernels
+sched = create_schedule([out])
+for si in sched: print(si.ast.op)  # NOTE: the first two convert it to CLANG
+
+# DEBUGGING: print the compute ast
+print(sched[-1].ast)
+# NOTE: sched[-1].ast is the same as st_0 above
+
+# run that schedule
+run_schedule(sched)
+
+# check the data out
+assert out.realized is not None and out.realized.as_buffer().cast('I')[0] == 5
+
+
+print("******** fourth, the Tensor ***********")
+
+from tinygrad import Tensor
+
+a = Tensor([2], dtype=dtypes.int32, device=DEVICE)
+b = Tensor([3], dtype=dtypes.int32, device=DEVICE)
+out = a + b
+
+# check the data out
+print(val:=out.item())
+assert val == 5
diff --git a/abstractions3.py b/abstractions3.py
@@ -0,0 +1,62 @@
+# abstractions2 goes from back to front; here we will go from front to back
+from typing import List
+from tinygrad.helpers import tqdm
+
+# *****
+# 0. Load MNIST onto the device
+
+from tinygrad.nn.datasets import mnist
+X_train, Y_train, _, _ = mnist()
+X_train = X_train.float()
+X_train -= X_train.mean()
+
+# *****
+# 1. Define an MNIST model.
+
+from tinygrad import Tensor
+
+l1 = Tensor.kaiming_uniform(128, 784)
+l2 = Tensor.kaiming_uniform(10, 128)
+def model(x): return x.flatten(1).dot(l1.T).relu().dot(l2.T)
+l1n, l2n = l1.numpy(), l2.numpy()
+
+# *****
+# 2. Choose a batch for training and do the backward pass.
+
+from tinygrad.nn.optim import SGD
+optim = SGD([l1, l2])
+
+Tensor.training = True
+X, Y = X_train[(samples:=Tensor.randint(128, high=X_train.shape[0]))], Y_train[samples]
+optim.zero_grad()
+model(X).sparse_categorical_crossentropy(Y).backward()
+optim.schedule_step()   # this will step the optimizer without running realize
+
+# *****
+# 3. Create a schedule.
+
+# The weight Tensors have been assigned to, but not yet realized. Everything is still lazy at this point
+# l1.lazydata and l2.lazydata define a computation graph
+
+from tinygrad.engine.schedule import ScheduleItem
+schedule: List[ScheduleItem] = Tensor.schedule(l1, l2)
+
+print(f"The schedule contains {len(schedule)} items.")
+for si in schedule: print(str(si)[:80])
+
+# *****
+# 4. Lower a schedule.
+
+from tinygrad.engine.realize import lower_schedule_item, ExecItem
+lowered: List[ExecItem] = [ExecItem(lower_schedule_item(si).prg, list(si.bufs)) for si in tqdm(schedule)]
+
+# *****
+# 5. Run the schedule
+
+for ei in tqdm(lowered): ei.run()
+
+# *****
+# 6. Print the weight change
+
+print("first weight change\n", l1.numpy()-l1n)
+print("second weight change\n", l2.numpy()-l2n)