"""
def __init__(self, chip, gmem_align_w, gmem_align_h,
tile_align_w, tile_align_h,
- tile_max_w, tile_max_h, num_vsc_pipes):
+ tile_max_w, tile_max_h, num_vsc_pipes,
+ cs_shared_mem_size):
self.chip = chip.value
self.gmem_align_w = gmem_align_w
self.gmem_align_h = gmem_align_h
self.tile_max_w = tile_max_w
self.tile_max_h = tile_max_h
self.num_vsc_pipes = num_vsc_pipes
+ self.cs_shared_mem_size = cs_shared_mem_size
s.gpu_infos.append(self)
into distinct sub-generations. The template parameter avoids
duplication of parameters that are unique to the sub-generation.
"""
- def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, magic_regs):
+ def __init__(self, chip, template, num_ccu, tile_align_w, tile_align_h, num_vsc_pipes, cs_shared_mem_size, magic_regs):
super().__init__(chip, gmem_align_w = 16, gmem_align_h = 4,
tile_align_w = tile_align_w,
tile_align_h = tile_align_h,
tile_max_w = 1024, # max_bitfield_val(5, 0, 5)
tile_max_h = max_bitfield_val(14, 8, 4),
- num_vsc_pipes = num_vsc_pipes)
+ num_vsc_pipes = num_vsc_pipes,
+ cs_shared_mem_size = cs_shared_mem_size)
# The # of SP cores seems to always match # of CCU
self.num_sp_cores = num_ccu
self.num_ccu = num_ccu
tile_max_w = 512,
tile_max_h = ~0, # TODO
num_vsc_pipes = 8,
+ cs_shared_mem_size = 0,
))
add_gpus([
tile_max_w = 992, # max_bitfield_val(4, 0, 5)
tile_max_h = max_bitfield_val(9, 5, 5),
num_vsc_pipes = 8,
+ cs_shared_mem_size = 32 * 1024,
))
add_gpus([
tile_max_w = 1024, # max_bitfield_val(4, 0, 5)
tile_max_h = max_bitfield_val(9, 5, 5),
num_vsc_pipes = 8,
+ cs_shared_mem_size = 32 * 1024,
))
add_gpus([
tile_max_w = 1024, # max_bitfield_val(7, 0, 5)
tile_max_h = max_bitfield_val(16, 9, 5),
num_vsc_pipes = 16,
+ cs_shared_mem_size = 32 * 1024,
))
# a6xx can be divided into distinct sub-generations, where certain device-
tile_align_w = 32,
tile_align_h = 32,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0x00108000,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 0,
TPL1_DBG_ECO_CNTL = 0x01008000,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x00108000,
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x00008000,
tile_align_w = 64,
tile_align_h = 32,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 3,
TPL1_DBG_ECO_CNTL = 0x00108000,
tile_align_w = 96,
tile_align_h = 16,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 2,
# this seems to be a chicken bit that fixes cubic filtering:
tile_align_w = 32,
tile_align_h = 16,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 1,
TPL1_DBG_ECO_CNTL = 0x05008000,
tile_align_w = 96,
tile_align_h = 16,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 2,
TPL1_DBG_ECO_CNTL = 0x05008000,
tile_align_w = 64,
tile_align_h = 32,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict(
PC_POWER_CNTL = 7,
TPL1_DBG_ECO_CNTL = 0x01008000,
tile_align_w = 64,
tile_align_h = 32,
num_vsc_pipes = 32,
+ cs_shared_mem_size = 32 * 1024,
magic_regs = dict()
))