# Lecture 008

## Transform Around Tensorized Block

Since GPU operation can perform a large region in parallel, we can define a block operation and this block may be replacable by GPU's primitive functions. The following code defines a block of 16x16 region. You can reorder things outside of the block you defined without interfere with the inner block implementation.

@tvm.script.ir_module
class MatmulBlockModule:
@T.prim_func
def main(
A: T.Buffer[(1024, 1024), "float32"],
B: T.Buffer[(1024, 1024), "float32"],
C: T.Buffer[(1024, 1024), "float32"],
) -> None:
T.func_attr({"global_symbol": "main", "tir.noalias": True})
for i0, j0, k0 in T.grid(64, 64, 64):
with T.block("tmm-16x16"):
vi0, vj0, vk0 = T.axis.remap("SSR", [i0, j0, k0])
with T.init():
for i1, j1 in T.grid(16, 16):
with T.block("tmm_init"):
vi1, vj1 = T.axis.remap("SS", [i1, j1])
C[vi0 * 16 + vi1, vj0 * 16 + vj1] = T.float32(0)

for i1, j1, k1 in T.grid(16, 16, 16):
with T.block("tmm"):
vi1, vj1, vk1 = T.axis.remap("SSR", [i1, j1, k1])
C[vi0 *16 + vi1, vj0 * 16 + vj1] += \
A[vi0 * 16 + vi1, vk0 * 16 + vk1] * B[vj0 * 16 + vj1, vk0 * 16 + vk1]


## Create Tensorized Block

This is our original code

@tvm.script.ir_module
class MatmulModule:
@T.prim_func
def main(
A: T.Buffer[(1024, 1024), "float32"],
B: T.Buffer[(1024, 1024), "float32"],
C: T.Buffer[(1024, 1024), "float32"],
) -> None:
T.func_attr({"global_symbol": "main", "tir.noalias": True})
for i, j, k in T.grid(1024, 1024, 1024):
with T.block("matmul"):
vi, vj, vk = T.axis.remap("SSR", [i, j, k])
with T.init():
C[vi, vj] = T.float32(0)
C[vi, vj] += A[vi, vk] * B[vj, vk]


we run some transformation code

sch = tvm.tir.Schedule(MatmulModule)
i, j, k = sch.get_loops("matmul")
i, ii = sch.split(i, factors=[None, 16])
j, ji = sch.split(j, factors=[None, 16])
k, ki = sch.split(k, factors=[None, 16])
sch.reorder(i, j, k, ii, ji, ki)
sch.mod.show()


and it out put this TensorIR module

@tvm.script.ir_module
class Module:
@T.prim_func
def main(A: T.Buffer[(1024, 1024), "float32"], B: T.Buffer[(1024, 1024), "float32"], C: T.Buffer[(1024, 1024), "float32"]) -> None:
# function attr dict
T.func_attr({"global_symbol": "main", "tir.noalias": True})
# body
# with T.block("root")
for i_0, j_0, k_0, i_1, j_1, k_1 in T.grid(64, 64, 64, 16, 16, 16):
with T.block("matmul"):
vi = T.axis.spatial(1024, i_0 * 16 + i_1)
vj = T.axis.spatial(1024, j_0 * 16 + j_1)
vk = T.axis.reduce(1024, k_0 * 16 + k_1)
T.writes(C[vi, vj])
with T.init():
C[vi, vj] = T.float32(0)
C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]


and we can not create blocks by running blockize along some dimension

block_mm = sch.blockize(ii)
sch.mod.show()


and here is the code we generated

@tvm.script.ir_module
class Module:
@T.prim_func
def main(A: T.Buffer[(1024, 1024), "float32"], B: T.Buffer[(1024, 1024), "float32"], C: T.Buffer[(1024, 1024), "float32"]) -> None:
# function attr dict
T.func_attr({"global_symbol": "main", "tir.noalias": True})
# body
# with T.block("root")
for i_0, j_0, k_0 in T.grid(64, 64, 64):
with T.block("matmul_o"):
vi_o, vj_o, vk_o = T.axis.remap("SSR", [i_0, j_0, k_0])
T.reads(A[vi_o * 16 : vi_o * 16 + 16, vk_o * 16 : vk_o * 16 + 16], B[vj_o * 16 : vj_o * 16 + 16, vk_o * 16 : vk_o * 16 + 16])
T.writes(C[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16])
with T.init():
for i_1, j_1 in T.grid(16, 16):
with T.block("matmul_init"):
vi_i_init, vj_i_init = T.axis.remap("SS", [i_1, j_1])
T.writes(C[vi_o * 16 + vi_i_init, vj_o * 16 + vj_i_init])
C[vi_o * 16 + vi_i_init, vj_o * 16 + vj_i_init] = T.float32(0)
for i_1, j_1, k_1 in T.grid(16, 16, 16):
with T.block("matmul"):
vi_i, vj_i, vk_i = T.axis.remap("SSR", [i_1, j_1, k_1])
T.reads(C[vi_o * 16 + vi_i, vj_o * 16 + vj_i], A[vi_o * 16 + vi_i, vk_o * 16 + vk_i], B[vj_o * 16 + vj_i, vk_o * 16 + vk_i])
T.writes(C[vi_o * 16 + vi_i, vj_o * 16 + vj_i])
C[vi_o * 16 + vi_i, vj_o * 16 + vj_i] = C[vi_o * 16 + vi_i, vj_o * 16 + vj_i] + A[vi_o * 16 + vi_i, vk_o * 16 + vk_i] * B[vj_o * 16 + vj_i, vk_o * 16 + vk_i]


## Specifying Control Flow

We can to explicitely specify how memory are allocated and used.

The following code will compute things on temporary memory, then accumulate to another temporary memory for the final answer. We want to specify this in transforms:

A_reg = sch.cache_read(block_mm, 0, storage_scope="global.A_reg")
sch.compute_at(A_reg, k)
sch.compute_at(B_reg, k)

write_back_block = sch.cache_write(block_mm, 0, storage_scope="global.accumulator")
sch.reverse_compute_at(write_back_block, j)
sch.mod.show()


And now, our code support explicits memory allocation

@tvm.script.ir_module
class Module:
@T.prim_func
def main(A: T.Buffer[(1024, 1024), "float32"], B: T.Buffer[(1024, 1024), "float32"], C: T.Buffer[(1024, 1024), "float32"]) -> None:
# function attr dict
T.func_attr({"global_symbol": "main", "tir.noalias": True})
# body
# with T.block("root")
A_global_A_reg = T.alloc_buffer([1024, 1024], dtype="float32", scope="global.A_reg")
B_global_B_reg = T.alloc_buffer([1024, 1024], dtype="float32", scope="global.B_reg")
C_global_accumulator = T.alloc_buffer([1024, 1024], dtype="float32", scope="global.accumulator")
for i_0, j_0 in T.grid(64, 64):
for k_0 in T.serial(64):
for ax0, ax1 in T.grid(16, 16):
with T.block("A_global.A_reg"):
v0 = T.axis.spatial(1024, i_0 * 16 + ax0)
v1 = T.axis.spatial(1024, k_0 * 16 + ax1)
T.writes(A_global_A_reg[v0, v1])
A_global_A_reg[v0, v1] = A[v0, v1]
for ax0, ax1 in T.grid(16, 16):
with T.block("B_global.B_reg"):
v0 = T.axis.spatial(1024, j_0 * 16 + ax0)
v1 = T.axis.spatial(1024, k_0 * 16 + ax1)
T.writes(B_global_B_reg[v0, v1])
B_global_B_reg[v0, v1] = B[v0, v1]
with T.block("matmul_o"):
vi_o, vj_o, vk_o = T.axis.remap("SSR", [i_0, j_0, k_0])
T.reads(A_global_A_reg[vi_o * 16 : vi_o * 16 + 16, vk_o * 16 : vk_o * 16 + 16], B_global_B_reg[vj_o * 16 : vj_o * 16 + 16, vk_o * 16 : vk_o * 16 + 16])
T.writes(C_global_accumulator[vi_o * 16 : vi_o * 16 + 16, vj_o * 16 : vj_o * 16 + 16])
with T.init():
for i_1, j_1 in T.grid(16, 16):
with T.block("matmul_init"):
vi_i_init, vj_i_init = T.axis.remap("SS", [i_1, j_1])
T.writes(C_global_accumulator[vi_o * 16 + vi_i_init, vj_o * 16 + vj_i_init])
C_global_accumulator[vi_o * 16 + vi_i_init, vj_o * 16 + vj_i_init] = T.float32(0)
for i_1, j_1, k_1 in T.grid(16, 16, 16):
with T.block("matmul"):
vi_i, vj_i, vk_i = T.axis.remap("SSR", [i_1, j_1, k_1])
T.reads(C_global_accumulator[vi_o * 16 + vi_i, vj_o * 16 + vj_i], A_global_A_reg[vi_o * 16 + vi_i, vk_o * 16 + vk_i], B_global_B_reg[vj_o * 16 + vj_i, vk_o * 16 + vk_i])
T.writes(C_global_accumulator[vi_o * 16 + vi_i, vj_o * 16 + vj_i])
C_global_accumulator[vi_o * 16 + vi_i, vj_o * 16 + vj_i] = C_global_accumulator[vi_o * 16 + vi_i, vj_o * 16 + vj_i] + A_global_A_reg[vi_o * 16 + vi_i, vk_o * 16 + vk_i] * B_global_B_reg[vj_o * 16 + vj_i, vk_o * 16 + vk_i]
for ax0, ax1 in T.grid(16, 16):
with T.block("C_global.accumulator"):
v0 = T.axis.spatial(1024, i_0 * 16 + ax0)
v1 = T.axis.spatial(1024, j_0 * 16 + ax1)
T.writes(C[v0, v1])
C[v0, v1] = C_global_accumulator[v0, v1]


## Tensorization

Tensorization: like vectorization, but to 3D or more

Description vs Impelmentation: we write a simple naive, unaccelerated, but correct version of a piece of primitive code as "description". And if there exist an implementation of the same code in hardware, we can replace our naive description with implementation.

Here are description and implementation.

@T.prim_func
def tmm16_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
A = T.match_buffer(a, (16, 16), "float32", offset_factor=16, scope="global.A_reg")
B = T.match_buffer(b, (16, 16), "float32", offset_factor=16, scope="global.B_reg")
C = T.match_buffer(c, (16, 16), "float32", offset_factor=16,  scope="global.accumulator")

with T.block("root"):
T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16])
T.writes(C[0:16, 0:16])
for i, j, k in T.grid(16, 16, 16):
with T.block(""):
vii, vjj, vkk = T.axis.remap("SSR", [i, j, k])
C[vii, vjj] = C[vii, vjj] + A[vii, vkk] * B[vjj, vkk]

@T.prim_func
def tmm16_impl(a: T.handle, b: T.handle, c: T.handle) -> None:
sa = T.var("int32")
sb = T.var("int32")
sc = T.var("int32")
A = T.match_buffer(a, (16, 16), "float32", offset_factor=16, strides=[sa, 1], scope="global.A_reg") # offset_factor: memory has continuous 16 byte chunk
B = T.match_buffer(b, (16, 16), "float32", offset_factor=16, strides=[sb, 1], scope="global.B_reg") # only read from B_register
C = T.match_buffer(c, (16, 16), "float32", offset_factor=16, strides=[sc, 1], scope="global.accumulator") # expect variable c of (16, 16) size, and store the result stride sc, only write to global accumulator memory region

with T.block("root"):
T.reads(C[0:16, 0:16], A[0:16, 0:16], B[0:16, 0:16])
T.writes(C[0:16, 0:16])
T.evaluate(
T.call_extern(
"tmm16",
C.access_ptr("w"),
A.access_ptr("r"),
B.access_ptr("r"),
sa,
sb,
sc,
dtype="int32",
)
)

tvm.tir.TensorIntrin.register("tmm16", tmm16_desc, tmm16_impl)


Table of Content