The current code was broken, so I decided to redesign it instead of fixing it in place.
This puts the sample positions for all sample counts into the queue
constant descriptor buffer, after all the spill/ring descriptors.
A single offset user sgpr then indicates how far into that table the
positions for the current num_samples start. This saves one user sgpr
and means we only generate the sample position data in the rare
case where we currently need it.
This doesn't fix the failing CTS tests on its own; the follow-up
fix is also required.
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Dave Airlie <airlied@redhat.com>
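
For reference, a minimal standalone sketch (not part of the patch) of the layout the new
offset indexes into: the queue's constant buffer stores the 1x, 2x, 4x, 8x and 16x sample
position tables back to back after the ring descriptors, so the value programmed into
AC_UD_PS_SAMPLE_POS_OFFSET is just the number of vec2 entries that precede the table for the
current sample count. The struct and helper names below are illustrative only, not code from
the patch.

#include <assert.h>
#include <stdio.h>

/* Illustrative only: one x/y sample position pair (two floats = 8 bytes). */
struct sample_pos {
	float x, y;
};

/* Tables are stored back to back in increasing sample-count order:
 * 1x, 2x, 4x, 8x, 16x -> 1 + 2 + 4 + 8 + 16 = 31 entries (248 bytes). */
static unsigned sample_pos_offset(unsigned num_samples)
{
	unsigned offset = 0;
	for (unsigned n = 1; n < num_samples; n <<= 1)
		offset += n;	/* entries occupied by the smaller tables */
	return offset;		/* 1->0, 2->1, 4->3, 8->7, 16->15 */
}

int main(void)
{
	/* Matches the switch in radv_emit_default_sample_locations below. */
	assert(sample_pos_offset(1) == 0);
	assert(sample_pos_offset(2) == 1);
	assert(sample_pos_offset(4) == 3);
	assert(sample_pos_offset(8) == 7);
	assert(sample_pos_offset(16) == 15);

	/* The fragment shader then reads entry (sample_id + offset) from the vec2 table. */
	printf("offset for 8x MSAA: %u\n", sample_pos_offset(8));
	return 0;
}
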
LLVMValueRef hs_ring_tess_factor;
LLVMValueRef prim_mask;
- LLVMValueRef sample_positions;
+ LLVMValueRef sample_pos_offset;
LLVMValueRef persp_sample, persp_center, persp_centroid;
LLVMValueRef linear_sample, linear_center, linear_centroid;
LLVMValueRef front_face;
ctx->stage == MESA_SHADER_VERTEX ||
ctx->stage == MESA_SHADER_TESS_CTRL ||
ctx->stage == MESA_SHADER_TESS_EVAL ||
+ ctx->stage == MESA_SHADER_FRAGMENT ||
ctx->is_gs_copy_shader)
need_ring_offsets = true;
need_push_constants = false;
if (need_ring_offsets && !ctx->options->supports_spill) {
- arg_types[arg_idx++] = const_array(ctx->v16i8, 8); /* address of rings */
+ arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* address of rings */
}
/* 1 for each descriptor set */
arg_types[arg_idx++] = ctx->i32; // GS instance id
break;
case MESA_SHADER_FRAGMENT:
- arg_types[arg_idx++] = const_array(ctx->f32, 32); /* sample positions */
+ arg_types[arg_idx++] = ctx->i32; /* sample position offset */
user_sgpr_count = arg_idx;
arg_types[arg_idx++] = ctx->i32; /* prim mask */
sgpr_count = arg_idx;
LLVMPointerType(ctx->i8, CONST_ADDR_SPACE),
NULL, 0, AC_FUNC_ATTR_READNONE);
ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
- const_array(ctx->v16i8, 8), "");
+ const_array(ctx->v16i8, 16), "");
} else
ctx->ring_offsets = LLVMGetParam(ctx->main_function, arg_idx++);
}
ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, arg_idx++);
break;
case MESA_SHADER_FRAGMENT:
- set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, user_sgpr_idx, 2);
- user_sgpr_idx += 2;
- ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++);
+ set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET, user_sgpr_idx, 1);
+ user_sgpr_idx += 1;
+ ctx->sample_pos_offset = LLVMGetParam(ctx->main_function, arg_idx++);
ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++);
ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++);
ctx->persp_center = LLVMGetParam(ctx->main_function, arg_idx++);
static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx,
LLVMValueRef sample_id)
{
- /* offset = sample_id * 8 (8 = 2 floats containing samplepos.xy) */
- LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, LLVMConstInt(ctx->i32, 8, false), "");
- LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, LLVMConstInt(ctx->i32, 4, false), "");
- LLVMValueRef result[2];
+ LLVMValueRef result;
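+ /* Load from the sample position table stored after the ring descriptors in the queue's constant buffer. */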
+ LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, RING_PS_SAMPLE_POSITIONS, false));
- result[0] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset0);
- result[1] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset1);
+ ptr = LLVMBuildBitCast(ctx->builder, ptr,
+ const_array(ctx->v2f32, 64), "");
- return ac_build_gather_values(&ctx->ac, result, 2);
+ sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
+ result = ac_build_indexed_load(&ctx->ac, ptr, sample_id, false);
+
+ ctx->shader_info->fs.uses_sample_positions = true;
+ return result;
}
static LLVMValueRef load_sample_pos(struct nir_to_llvm_context *ctx)
AC_UD_VS_BASE_VERTEX_START_INSTANCE,
AC_UD_VS_LS_TCS_IN_LAYOUT,
AC_UD_VS_MAX_UD,
- AC_UD_PS_SAMPLE_POS = AC_UD_SHADER_START,
+ AC_UD_PS_SAMPLE_POS_OFFSET = AC_UD_SHADER_START,
AC_UD_PS_MAX_UD,
AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
AC_UD_CS_MAX_UD,
#define RING_GSVS_GS 4
#define RING_HS_TESS_FACTOR 5
#define RING_HS_TESS_OFFCHIP 6
+#define RING_PS_SAMPLE_POSITIONS 7
// Match MAX_SETS from radv_descriptor_set.h
#define AC_UD_MAX_SETS MAX_SETS
bool force_persample;
bool prim_id_input;
bool layer_input;
+ bool uses_sample_positions;
} fs;
struct {
unsigned block_size[3];
cmd_buffer->esgs_ring_size_needed = 0;
cmd_buffer->gsvs_ring_size_needed = 0;
cmd_buffer->tess_rings_needed = false;
+ cmd_buffer->sample_positions_needed = false;
if (cmd_buffer->upload.upload_bo)
cmd_buffer->device->ws->cs_add_buffer(cmd_buffer->cs,
radv_cayman_emit_msaa_sample_locs(cmd_buffer->cs, num_samples);
- uint32_t samples_offset;
- void *samples_ptr;
- void *src;
- radv_cmd_buffer_upload_alloc(cmd_buffer, num_samples * 4 * 2, 256, &samples_offset,
- &samples_ptr);
- switch (num_samples) {
- case 1:
- src = cmd_buffer->device->sample_locations_1x;
- break;
- case 2:
- src = cmd_buffer->device->sample_locations_2x;
- break;
- case 4:
- src = cmd_buffer->device->sample_locations_4x;
- break;
- case 8:
- src = cmd_buffer->device->sample_locations_8x;
- break;
- case 16:
- src = cmd_buffer->device->sample_locations_16x;
- break;
- default:
- unreachable("unknown number of samples");
- }
- memcpy(samples_ptr, src, num_samples * 4 * 2);
-
- uint64_t va = cmd_buffer->device->ws->buffer_get_va(cmd_buffer->upload.upload_bo);
- va += samples_offset;
+ if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.fs.uses_sample_positions) {
+ uint32_t offset;
+ struct ac_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET);
+ uint32_t base_reg = shader_stage_to_user_data_0(MESA_SHADER_FRAGMENT, radv_pipeline_has_gs(pipeline), radv_pipeline_has_tess(pipeline));
+ if (loc->sgpr_idx == -1)
+ return;
+ assert(loc->num_sgprs == 1);
+ assert(!loc->indirect);
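+ /* offset = number of vec2 entries before the num_samples table (1x,2x,4x,8x,16x tables stored back to back) */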
+ switch (num_samples) {
+ default:
+ offset = 0;
+ break;
+ case 2:
+ offset = 1;
+ break;
+ case 4:
+ offset = 3;
+ break;
+ case 8:
+ offset = 7;
+ break;
+ case 16:
+ offset = 15;
+ break;
+ }
- radv_emit_userdata_address(cmd_buffer, pipeline, MESA_SHADER_FRAGMENT,
- AC_UD_PS_SAMPLE_POS, va);
+ radeon_set_sh_reg(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, offset);
+ cmd_buffer->sample_positions_needed = true;
+ }
}
static void
primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
if (secondary->tess_rings_needed)
primary->tess_rings_needed = true;
+ if (secondary->sample_positions_needed)
+ primary->sample_positions_needed = true;
if (secondary->ring_offsets_idx != -1) {
if (primary->ring_offsets_idx == -1)
static void
fill_geom_tess_rings(struct radv_queue *queue,
uint32_t *map,
+ bool add_sample_positions,
uint32_t esgs_ring_size,
struct radeon_winsys_bo *esgs_ring_bo,
uint32_t gsvs_ring_size,
S_008F0C_ELEMENT_SIZE(0) |
S_008F0C_INDEX_STRIDE(0) |
S_008F0C_ADD_TID_ENABLE(false);
+ desc += 4;
+
+ /* add sample positions after all rings */
+ memcpy(desc, queue->device->sample_locations_1x, 8);
+ desc += 2;
+ memcpy(desc, queue->device->sample_locations_2x, 16);
+ desc += 4;
+ memcpy(desc, queue->device->sample_locations_4x, 32);
+ desc += 8;
+ memcpy(desc, queue->device->sample_locations_8x, 64);
+ desc += 16;
+ memcpy(desc, queue->device->sample_locations_16x, 128);
}
static unsigned
uint32_t esgs_ring_size,
uint32_t gsvs_ring_size,
bool needs_tess_rings,
+ bool needs_sample_positions,
struct radeon_winsys_cs **initial_preamble_cs,
struct radeon_winsys_cs **continue_preamble_cs)
{
struct radeon_winsys_bo *tess_factor_ring_bo = NULL;
struct radeon_winsys_bo *tess_offchip_ring_bo = NULL;
struct radeon_winsys_cs *dest_cs[2] = {0};
- bool add_tess_rings = false;
+ bool add_tess_rings = false, add_sample_positions = false;
unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
unsigned max_offchip_buffers;
unsigned hs_offchip_param = 0;
if (needs_tess_rings)
add_tess_rings = true;
}
+ if (!queue->has_sample_positions) {
+ if (needs_sample_positions)
+ add_sample_positions = true;
+ }
tess_factor_ring_size = 32768 * queue->device->physical_device->rad_info.max_se;
hs_offchip_param = radv_get_hs_offchip_param(queue->device,
&max_offchip_buffers);
compute_scratch_size <= queue->compute_scratch_size &&
esgs_ring_size <= queue->esgs_ring_size &&
gsvs_ring_size <= queue->gsvs_ring_size &&
- !add_tess_rings &&
+ !add_tess_rings && !add_sample_positions &&
queue->initial_preamble_cs) {
*initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
esgs_ring_bo != queue->esgs_ring_bo ||
gsvs_ring_bo != queue->gsvs_ring_bo ||
tess_factor_ring_bo != queue->tess_factor_ring_bo ||
- tess_offchip_ring_bo != queue->tess_offchip_ring_bo) {
+ tess_offchip_ring_bo != queue->tess_offchip_ring_bo || add_sample_positions) {
uint32_t size = 0;
if (gsvs_ring_bo || esgs_ring_bo ||
- tess_factor_ring_bo || tess_offchip_ring_bo)
+ tess_factor_ring_bo || tess_offchip_ring_bo || add_sample_positions) {
size = 112; /* 2 dword + 2 padding + 4 dword * 6 */
+ if (add_sample_positions)
+ size += 256; /* (16+8+4+2+1) samples * 4 * 2 = 248 bytes, rounded up to 256. */
+ }
else if (scratch_bo)
size = 8; /* 2 dword */
map[1] = rsrc1;
}
- if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo)
- fill_geom_tess_rings(queue, map,
+ if (esgs_ring_bo || gsvs_ring_bo || tess_factor_ring_bo || tess_offchip_ring_bo ||
+ add_sample_positions)
+ fill_geom_tess_rings(queue, map, add_sample_positions,
esgs_ring_size, esgs_ring_bo,
gsvs_ring_size, gsvs_ring_bo,
tess_factor_ring_size, tess_factor_ring_bo,
queue->descriptor_bo = descriptor_bo;
}
+ if (add_sample_positions)
+ queue->has_sample_positions = true;
+
*initial_preamble_cs = queue->initial_preamble_cs;
*continue_preamble_cs = queue->continue_preamble_cs;
if (!scratch_size && !compute_scratch_size && !esgs_ring_size && !gsvs_ring_size)
VkResult result;
bool fence_emitted = false;
bool tess_rings_needed = false;
+ bool sample_positions_needed = false;
/* Do this first so failing to allocate scratch buffers can't result in
* partially executed submissions. */
esgs_ring_size = MAX2(esgs_ring_size, cmd_buffer->esgs_ring_size_needed);
gsvs_ring_size = MAX2(gsvs_ring_size, cmd_buffer->gsvs_ring_size_needed);
tess_rings_needed |= cmd_buffer->tess_rings_needed;
+ sample_positions_needed |= cmd_buffer->sample_positions_needed;
}
}
result = radv_get_preamble_cs(queue, scratch_size, compute_scratch_size,
esgs_ring_size, gsvs_ring_size, tess_rings_needed,
+ sample_positions_needed,
&initial_preamble_cs, &continue_preamble_cs);
if (result != VK_SUCCESS)
return result;
uint32_t esgs_ring_size;
uint32_t gsvs_ring_size;
bool has_tess_rings;
+ bool has_sample_positions;
struct radeon_winsys_bo *scratch_bo;
struct radeon_winsys_bo *descriptor_bo;
uint32_t esgs_ring_size_needed;
uint32_t gsvs_ring_size_needed;
bool tess_rings_needed;
+ bool sample_positions_needed;
int ring_offsets_idx; /* just used for verification */
};