No change in behavior: the previous overalignment is preserved, and
ib_pad_dw_mask is now initialized sooner.
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25578>
return false;
}
+ unsigned max_ib_alignment = 0;
+
for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) {
struct drm_amdgpu_info_hw_ip ip_info = {0};
info->ip[AMD_IP_GFX].ver_minor = info->ip[AMD_IP_COMPUTE].ver_minor = 3;
}
info->ip[ip_type].num_queues = util_bitcount(ip_info.available_rings);
- info->ib_alignment = MAX3(info->ib_alignment, ip_info.ib_start_alignment,
- ip_info.ib_size_alignment);
+ info->ip[ip_type].ib_alignment = MAX2(ip_info.ib_start_alignment, ip_info.ib_size_alignment);
+ max_ib_alignment = MAX2(max_ib_alignment, info->ip[ip_type].ib_alignment);
+ }
+
+ /* TODO: Remove this. This hack mimics the previous behavior of global ib_alignment. */
+ for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++) {
+ info->ip[ip_type].ib_alignment = MAX2(max_ib_alignment, 1024);
}
+ /* This is "align_mask" copied from the kernel, maximums of all IP versions. */
+ info->ib_pad_dw_mask[AMD_IP_GFX] = 0xff;
+ info->ib_pad_dw_mask[AMD_IP_COMPUTE] = 0xff;
+ info->ib_pad_dw_mask[AMD_IP_SDMA] = 0xf;
+ info->ib_pad_dw_mask[AMD_IP_UVD] = 0xf;
+ info->ib_pad_dw_mask[AMD_IP_VCE] = 0x3f;
+ info->ib_pad_dw_mask[AMD_IP_UVD_ENC] = 0x3f;
+ info->ib_pad_dw_mask[AMD_IP_VCN_DEC] = 0xf;
+ info->ib_pad_dw_mask[AMD_IP_VCN_ENC] = 0x3f;
+ info->ib_pad_dw_mask[AMD_IP_VCN_JPEG] = 0xf;
+
/* Only require gfx or compute. */
if (!info->ip[AMD_IP_GFX].num_queues && !info->ip[AMD_IP_COMPUTE].num_queues) {
fprintf(stderr, "amdgpu: failed to find gfx or compute.\n");
assert(util_is_power_of_two_or_zero(info->ip[AMD_IP_COMPUTE].num_queues));
assert(util_is_power_of_two_or_zero(info->ip[AMD_IP_SDMA].num_queues));
- /* The kernel pads gfx and compute IBs to 256 dwords since:
- * 66f3b2d527154bd258a57c8815004b5964aa1cf5
- * Do the same.
- */
- info->ib_alignment = MAX2(info->ib_alignment, 1024);
-
r = amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_GFX_ME, 0, 0, &info->me_fw_version,
&info->me_fw_feature);
if (r) {
info->lds_encode_granularity = info->gfx_level >= GFX7 ? 128 * 4 : 64 * 4;
info->lds_alloc_granularity = info->gfx_level >= GFX10_3 ? 256 * 4 : info->lds_encode_granularity;
- /* This is "align_mask" copied from the kernel, maximums of all IP versions. */
- info->ib_pad_dw_mask[AMD_IP_GFX] = 0xff;
- info->ib_pad_dw_mask[AMD_IP_COMPUTE] = 0xff;
- info->ib_pad_dw_mask[AMD_IP_SDMA] = 0xf;
- info->ib_pad_dw_mask[AMD_IP_UVD] = 0xf;
- info->ib_pad_dw_mask[AMD_IP_VCE] = 0x3f;
- info->ib_pad_dw_mask[AMD_IP_UVD_ENC] = 0x3f;
- info->ib_pad_dw_mask[AMD_IP_VCN_DEC] = 0xf;
- info->ib_pad_dw_mask[AMD_IP_VCN_ENC] = 0x3f;
- info->ib_pad_dw_mask[AMD_IP_VCN_JPEG] = 0xf;
-
/* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
* on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
* SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel.
for (unsigned i = 0; i < AMD_NUM_IP_TYPES; i++) {
if (info->ip[i].num_queues) {
- fprintf(f, " IP %-7s %2u.%u \tqueues:%u\n", ip_string[i],
- info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues);
+ fprintf(f, " IP %-7s %2u.%u \tqueues:%u (align:%u, pad_dw:0x%x)\n", ip_string[i],
+ info->ip[i].ver_major, info->ip[i].ver_minor, info->ip[i].num_queues,
+ info->ip[i].ib_alignment, info->ib_pad_dw_mask[i]);
}
}
fprintf(f, "CP info:\n");
fprintf(f, " gfx_ib_pad_with_type2 = %i\n", info->gfx_ib_pad_with_type2);
- fprintf(f, " ib_alignment = %u\n", info->ib_alignment);
fprintf(f, " me_fw_version = %i\n", info->me_fw_version);
fprintf(f, " me_fw_feature = %i\n", info->me_fw_feature);
fprintf(f, " mec_fw_version = %i\n", info->mec_fw_version);
uint8_t ver_minor;
uint8_t ver_rev;
uint8_t num_queues;
+ uint32_t ib_alignment;
};
struct radeon_info {
/* CP info. */
bool gfx_ib_pad_with_type2;
- unsigned ib_alignment; /* both start and size alignment */
uint32_t me_fw_version;
uint32_t me_fw_feature;
uint32_t mec_fw_version;
}
static uint32_t
-radv_align_cmdbuf_size(const struct radv_device *device, uint32_t size)
+radv_align_cmdbuf_size(const struct radv_device *device, uint32_t size, enum amd_ip_type ip_type)
{
- const uint32_t ib_alignment = device->physical_device->rad_info.ib_alignment;
+ const uint32_t ib_alignment = device->physical_device->rad_info.ip[ip_type].ib_alignment;
return align(size, ib_alignment);
}
static unsigned
radv_dgc_preamble_cmdbuf_size(const struct radv_device *device)
{
- return radv_align_cmdbuf_size(device, 16);
+ return radv_align_cmdbuf_size(device, 16, AMD_IP_GFX);
}
static bool
uint32_t cmd_size, upload_size;
radv_get_sequence_size(layout, pipeline, &cmd_size, &upload_size);
- return radv_align_cmdbuf_size(device, cmd_size * cmd_info->sequencesCount);
+ return radv_align_cmdbuf_size(device, cmd_size * cmd_info->sequencesCount, AMD_IP_GFX);
}
struct radv_dgc_params {
nir_def *cmd_buf_size = load_param32(b, cmd_buf_size);
nir_def *cmd_buf_stride = load_param32(b, cmd_buf_stride);
nir_def *size = nir_imul(b, cmd_buf_stride, sequence_count);
- unsigned align_mask = radv_align_cmdbuf_size(device, 1) - 1;
+ unsigned align_mask = radv_align_cmdbuf_size(device, 1, AMD_IP_GFX) - 1;
size = nir_iand_imm(b, nir_iadd_imm(b, size, align_mask), ~align_mask);
uint32_t cmd_stride, upload_stride;
radv_get_sequence_size(layout, pipeline, &cmd_stride, &upload_stride);
- VkDeviceSize cmd_buf_size =
- radv_align_cmdbuf_size(device, cmd_stride * pInfo->maxSequencesCount) + radv_dgc_preamble_cmdbuf_size(device);
+ VkDeviceSize cmd_buf_size = radv_align_cmdbuf_size(device, cmd_stride * pInfo->maxSequencesCount, AMD_IP_GFX) +
+ radv_dgc_preamble_cmdbuf_size(device);
VkDeviceSize upload_buf_size = upload_stride * pInfo->maxSequencesCount;
pMemoryRequirements->memoryRequirements.memoryTypeBits = device->physical_device->memory_types_32bit;
- pMemoryRequirements->memoryRequirements.alignment = device->physical_device->rad_info.ib_alignment;
+ pMemoryRequirements->memoryRequirements.alignment =
+ MAX2(device->physical_device->rad_info.ip[AMD_IP_GFX].ib_alignment,
+ device->physical_device->rad_info.ip[AMD_IP_COMPUTE].ib_alignment);
pMemoryRequirements->memoryRequirements.size =
align(cmd_buf_size + upload_buf_size, pMemoryRequirements->memoryRequirements.alignment);
}
radv_get_sequence_size(layout, pipeline, &cmd_stride, &upload_stride);
unsigned cmd_buf_size =
- radv_align_cmdbuf_size(cmd_buffer->device, cmd_stride * pGeneratedCommandsInfo->sequencesCount);
+ radv_align_cmdbuf_size(cmd_buffer->device, cmd_stride * pGeneratedCommandsInfo->sequencesCount, AMD_IP_GFX);
uint64_t upload_addr =
radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
const enum radeon_bo_flag flags =
RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | gtt_wc_flag;
- return ws->buffer_create(ws, ib_size, cs->ws->info.ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
+ return ws->buffer_create(ws, ib_size, cs->ws->info.ip[cs->hw_ip].ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
&cs->ib_buffer);
}
static unsigned
radv_amdgpu_cs_get_initial_size(struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
{
- const uint32_t ib_alignment = ws->info.ib_alignment;
+ const uint32_t ib_alignment = ws->info.ip[ip_type].ib_alignment;
assert(util_is_power_of_two_nonzero(ib_alignment));
return align(20 * 1024 * 4, ib_alignment);
}
return;
}
- const uint32_t ib_alignment = cs->ws->info.ib_alignment;
+ const uint32_t ib_alignment = cs->ws->info.ip[cs->hw_ip].ib_alignment;
cs->ws->base.cs_finalize(_cs);
chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
ib = &request->ibs[i];
- assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ib_alignment == 0);
+ assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ip[ib->ip_type].ib_alignment == 0);
assert(ib->size);
chunk_data[i].ib_data._pad = 0;
}
static void amdgpu_ib_finalize(struct amdgpu_winsys *ws, struct radeon_cmdbuf *rcs,
- struct amdgpu_ib *ib)
+ struct amdgpu_ib *ib, enum amd_ip_type ip_type)
{
amdgpu_set_ib_size(rcs, ib);
ib->used_ib_space += rcs->current.cdw * 4;
- ib->used_ib_space = align(ib->used_ib_space, ws->info.ib_alignment);
+ ib->used_ib_space = align(ib->used_ib_space, ws->info.ip[ip_type].ib_alignment);
ib->max_ib_size = MAX2(ib->max_ib_size, rcs->prev_dw + rcs->current.cdw);
}
struct amdgpu_cs *cs = amdgpu_cs(rcs);
struct amdgpu_winsys *ws = cs->ws;
struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
- unsigned size = align(preamble_num_dw * 4, ws->info.ib_alignment);
+ unsigned size = align(preamble_num_dw * 4, ws->info.ip[AMD_IP_GFX].ib_alignment);
struct pb_buffer *preamble_bo;
uint32_t *map;
/* Create the preamble IB buffer. */
- preamble_bo = amdgpu_bo_create(ws, size, ws->info.ib_alignment,
+ preamble_bo = amdgpu_bo_create(ws, size, ws->info.ip[AMD_IP_GFX].ib_alignment,
RADEON_DOMAIN_VRAM,
RADEON_FLAG_NO_INTERPROCESS_SHARING |
RADEON_FLAG_GTT_WC |
if (noop && acs->ip_type == AMD_IP_GFX) {
/* Reduce the IB size and fill it with NOP to make it like an empty IB. */
- unsigned noop_size = MIN2(cs->ib[IB_MAIN].ib_bytes, ws->info.ib_alignment);
+ unsigned noop_size = MIN2(cs->ib[IB_MAIN].ib_bytes, ws->info.ip[AMD_IP_GFX].ib_alignment);
cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_size / 4 - 2, 0);
cs->ib[IB_MAIN].ib_bytes = noop_size;
struct amdgpu_cs_context *cur = cs->csc;
/* Set IB sizes. */
- amdgpu_ib_finalize(ws, rcs, &cs->main);
+ amdgpu_ib_finalize(ws, rcs, &cs->main, cs->ip_type);
/* Create a fence. */
amdgpu_fence_reference(&cur->fence, NULL);
}
}
+ for (unsigned ip_type = 0; ip_type < AMD_NUM_IP_TYPES; ip_type++)
+ ws->info.ip[ip_type].ib_alignment = 4096;
+
/* Hawaii with old firmware needs type2 nop packet.
* accel_working2 with value 3 indicates the new firmware.
*/
(ws->info.family == CHIP_HAWAII &&
ws->accel_working2 < 3);
ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
- ws->info.ib_alignment = 4096;
ws->info.has_bo_metadata = false;
ws->info.has_eqaa_surface_allocator = false;
ws->info.has_sparse_vm_mappings = false;