dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec3_mediump,Fail
dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_highp,Fail
dEQP-GLES3.functional.shaders.derivate.fwidth.nicest.fbo_msaa4.vec4_mediump,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_fragment,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_fragment,Fail
dEQP-GLES3.functional.shaders.linkage.varying.rules.differing_interpolation_2,Fail
dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler2d_vertex,Fail
dEQP-GLES3.functional.shaders.texture_functions.texturegradoffset.isampler3d_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec3_const_write_dynamic_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_loop_read_vertex,Fail
-dEQP-GLES2.functional.shaders.indexing.tmp_array.vec4_const_write_dynamic_read_vertex,Fail
dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a1r5g5b5_unorm_pack16.a1r5g5b5_unorm_pack16.optimal_general_nearest,Fail
dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2b10g10r10_uint_pack32.a2b10g10r10_uint_pack32.general_optimal_nearest,Fail
dEQP-VK.api.copy_and_blit.core.blit_image.all_formats.color.3d.a2r10g10b10_unorm_pack32.a2r10g10b10_unorm_pack32.optimal_optimal_nearest,Fail
kernel->base.num_bufs = kernel->info.num_bufs;
memcpy(kernel->base.buf_sizes, kernel->info.buf_sizes, sizeof(kernel->base.buf_sizes));
- unsigned sz = v->info.sizedwords * 4;
+ unsigned sz = v->info.size;
v->bo = fd_bo_new(c->dev, sz,
DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
* doesn't try to decode the following data as instructions (such as the
* next stage's shader in turnip)
*/
- info->sizedwords = MAX2(v->instrlen * compiler->instr_align,
- instr_count + 4) * sizeof(instr_t) / 4;
+ info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) *
+ sizeof(instr_t);
+ info->sizedwords = info->size / 4;
+
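+ /* Note that only info->size grows for the constant data and padding
+ * below; sizedwords keeps counting just the instructions.
+ */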
+ if (v->constant_data_size) {
+ /* Make sure the location where we're about to place the constant_data
+ * is safe to indirectly upload from.
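+ * (const_upload_unit is in vec4s, so this aligns the offset to
+ * const_upload_unit * 16 bytes.)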
+ */
+ info->constant_data_offset = align(info->size, v->shader->compiler->const_upload_unit * 16);
+ info->size = info->constant_data_offset + v->constant_data_size;
+ }
/* Pad out the size so that when turnip uploads the shaders in
* sequence, the starting offset of the next one is properly aligned.
*/
- info->sizedwords = align(info->sizedwords, compiler->instr_align * sizeof(instr_t) / 4);
+ info->size = align(info->size, compiler->instr_align * sizeof(instr_t));
- ptr = dwords = rzalloc_size(v, 4 * info->sizedwords);
+ ptr = dwords = rzalloc_size(v, info->size);
foreach_block (block, &shader->block_list) {
unsigned sfu_delay = 0;
}
}
+ /* Append the NIR constant data after the end of the program. This lets
+ * us emit it as an indirect load, while avoiding creating another BO.
+ */
+ if (v->constant_data_size)
+ memcpy(&ptr[info->constant_data_offset / 4], v->constant_data, v->constant_data_size);
+ ralloc_free(v->constant_data);
+ v->constant_data = NULL;
+
return ptr;
fail:
struct ir3_info {
void *data; /* used internally in ir3 assembler */
+ /* Size in bytes of the shader binary, including NIR constants and
+ * padding
+ */
+ uint32_t size;
+ /* byte offset from start of the shader to the NIR constant data. */
+ uint32_t constant_data_offset;
+ /* Size in dwords of the instructions. */
uint16_t sizedwords;
uint16_t instrs_count; /* expanded to account for rpt's */
uint16_t nops_count; /* # of nop instructions, including nopN */
* pointers need special handling:
*/
- v->bin = rzalloc_size(v, 4 * v->info.sizedwords);
- blob_copy_bytes(blob, v->bin, 4 * v->info.sizedwords);
+ v->bin = rzalloc_size(v, v->info.size);
+ blob_copy_bytes(blob, v->bin, v->info.size);
if (!v->binning_pass) {
blob_copy_bytes(blob, v->const_state, sizeof(*v->const_state));
* pointers need special handling:
*/
- blob_write_bytes(blob, v->bin, 4 * v->info.sizedwords);
+ blob_write_bytes(blob, v->bin, v->info.size);
+
+ /* constant_data isn't serialized here; it's already baked into bin at this point. */
if (!v->binning_pass) {
blob_write_bytes(blob, v->const_state, sizeof(*v->const_state));
progress |= OPT(s, nir_lower_tex, &tex_options);
}
+ /* Move large constant variables to the constants attached to the NIR
+ * shader, which we will upload in the immediates range. This generates
+ * amuls, so we need to clean those up after.
+ *
+ * With a packed size_align, values could straddle vec4 boundaries, and
+ * loads of those through LDC would take extra reads to unpack. Align
+ * everything to vec4 to avoid that, though we could theoretically do
+ * better.
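+ *
+ * The 32-byte threshold below should mean only constant arrays larger
+ * than a couple of vec4s get moved out of the instruction stream.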
+ */
+ OPT_V(s, nir_opt_large_constants, glsl_get_vec4_size_align_bytes, 32 /* bytes */);
+ OPT_V(s, ir3_nir_lower_load_constant, so);
+
if (!so->binning_pass)
OPT_V(s, ir3_nir_analyze_ubo_ranges, so);
progress |= OPT(s, ir3_nir_lower_ubo_loads, so);
+ OPT_V(s, nir_lower_amul, ir3_glsl_type_size);
+
/* UBO offset lowering has to come after we've decided what will
* be left as load_ubo
*/
void ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
struct ir3_const_state *const_state);
+bool ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v);
void ir3_nir_analyze_ubo_ranges(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v);
bool ir3_nir_fixup_load_uniform(nir_shader *nir);
fixup_load_uniform_filter, fixup_load_uniform_instr,
NULL);
}
+static nir_ssa_def *
+ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
+{
+ struct ir3_const_state *const_state = data;
+ nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
+
+ /* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
+ * reserved for gallium's cb0.
+ */
+ if (const_state->constant_data_ubo == -1) {
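+ /* If no UBOs exist yet, burn index 0 so the constant data can never
+ * land there.
+ */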
+ if (b->shader->info.num_ubos == 0)
+ b->shader->info.num_ubos++;
+ const_state->constant_data_ubo = b->shader->info.num_ubos++;
+ }
+
+ unsigned num_components = instr->num_components;
+ if (nir_dest_bit_size(instr->dest) == 16) {
+ /* We can't do 16b loads -- not from LDC (32-bit only in all of our
+ * traces, and the disasm doesn't look like it really supports 16b), and
+ * not from the constant file (where CONSTANT_DEMOTION_ENABLE means we
+ * get automatic 32b-to-16b conversions when we ask for 16b from it).
+ * Instead, we'll load 32b from a UBO and unpack from there.
+ */
+ num_components = DIV_ROUND_UP(num_components, 2);
+ }
+ unsigned base = nir_intrinsic_base(instr);
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+ load->num_components = num_components;
+ nir_ssa_dest_init(&load->instr, &load->dest,
+ load->num_components, 32,
+ instr->dest.ssa.name);
+
+ load->src[0] = nir_src_for_ssa(nir_imm_int(b,
+ const_state->constant_data_ubo));
+ load->src[1] = nir_src_for_ssa(nir_iadd_imm(b,
+ nir_ssa_for_src(b, instr->src[0], 1), base));
+
+ nir_intrinsic_set_align(load,
+ nir_intrinsic_align_mul(instr),
+ nir_intrinsic_align_offset(instr));
+ nir_intrinsic_set_range_base(load, base);
+ nir_intrinsic_set_range(load, nir_intrinsic_range(instr));
+
+ nir_builder_instr_insert(b, &load->instr);
+
+ nir_ssa_def *result = &load->dest.ssa;
+ if (nir_dest_bit_size(instr->dest) == 16) {
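+ /* e.g. a 3-component 16b load was rounded up to two 32b components
+ * above; bitcast those to four 16b components and keep the first
+ * three.
+ */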
+ result = nir_bitcast_vector(b, result, 16);
+ result = nir_channels(b, result, BITSET_MASK(instr->num_components));
+ }
+
+ return result;
+}
+
+static bool
+ir3_lower_load_const_filter(const nir_instr *instr, const void *data)
+{
+ return (instr->type == nir_instr_type_intrinsic &&
+ nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_constant);
+}
+
+/* Lowers load_constant intrinsics to UBO accesses so we can run them through
+ * the general "upload to const file or leave as UBO access" code.
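+ *
+ * Roughly (a sketch -- NIR syntax approximate, not exact pass output):
+ *
+ *    vec4 32 ssa_2 = intrinsic load_constant (ssa_1) (base=16, range=64)
+ *
+ * becomes
+ *
+ *    vec1 32 ssa_2 = load_const (constant_data_ubo)
+ *    vec1 32 ssa_3 = iadd_imm ssa_1, 16
+ *    vec4 32 ssa_4 = intrinsic load_ubo (ssa_2, ssa_3) (...)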
+ */
+bool
+ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
+{
+ struct ir3_const_state *const_state = ir3_const_state(v);
+
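+ /* Start unassigned; the first lowered load_constant picks an index. */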
+ const_state->constant_data_ubo = -1;
+
+ bool progress = nir_shader_lower_instructions(nir,
+ ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
+ const_state);
+
+ if (progress) {
+ struct ir3_compiler *compiler = v->shader->compiler;
+
+ /* Save a copy of the NIR constant data to the variant for
+ * inclusion in the final assembly.
+ */
+ v->constant_data_size = align(nir->constant_data_size,
+ compiler->const_upload_unit * 4 * sizeof(uint32_t));
+ v->constant_data = rzalloc_size(v, v->constant_data_size);
+ memcpy(v->constant_data, nir->constant_data,
+ nir->constant_data_size);
+ }
+
+ return progress;
+}
unsigned num_ubos;
unsigned num_driver_params; /* scalar */
+ /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
+ int32_t constant_data_ubo;
+
struct {
/* user const start at zero */
unsigned ubo;
gl_shader_stage type;
struct ir3_shader *shader;
+ /* variant's copy of nir->constant_data (since we don't track the NIR in
+ * the variant, and shader->nir is before the opt pass). Moves to v->bin
+ * after assembly.
+ */
+ void *constant_data;
+
/*
* Below here is serialized when written to disk cache:
*/
struct ir3_info info;
+ uint32_t constant_data_size;
+
/* Levels of nesting of flow control:
*/
unsigned branchstack;
{
const struct tu_program_descriptor_linkage *link =
&pipeline->program.link[type];
- const struct ir3_ubo_analysis_state *state = &link->const_state.ubo_state;
+ const struct ir3_const_state *const_state = &link->const_state;
+ const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
if (link->push_consts.count > 0) {
unsigned num_units = link->push_consts.count;
debug_assert((offset % 16) == 0);
/* Dig out the descriptor from the descriptor state and read the VA from
- * it.
+ * it. All our UBOs are bindless with the exception of the NIR
+ * constant_data, which is uploaded once in the pipeline.
*/
- assert(state->range[i].ubo.bindless);
+ if (!state->range[i].ubo.bindless) {
+ assert(state->range[i].ubo.block == const_state->constant_data_ubo);
+ continue;
+ }
+
uint32_t *base = state->range[i].ubo.bindless_base == MAX_SETS ?
descriptors_state->dynamic_descriptors :
descriptors_state->sets[state->range[i].ubo.bindless_base]->mapped_ptr;
*/
size = MIN2(size + base, xs->constlen) - base;
- if (size <= 0)
- return;
+ if (size > 0) {
+ tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
+ tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+ CP_LOAD_STATE6_0_NUM_UNIT(size));
+ tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+ tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
- tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
- CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
- CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
- CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
- CP_LOAD_STATE6_0_NUM_UNIT(size));
- tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
- tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
- tu_cs_emit_array(cs, const_state->immediates, size * 4);
+ tu_cs_emit_array(cs, const_state->immediates, size * 4);
+ }
+
+ if (const_state->constant_data_ubo != -1) {
+ uint64_t iova = binary_iova + xs->info.constant_data_offset;
+
+ /* Upload UBO state for the constant data. */
+ tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+ tu_cs_emit(cs,
+ CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+ CP_LOAD_STATE6_0_NUM_UNIT(1));
+ tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+ tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
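+ /* A6XX_UBO_1_SIZE is in units of vec4s (16 bytes). */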
+ int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
+ tu_cs_emit_qw(cs,
+ iova |
+ (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
+
+ /* Upload the constant data to the const file if needed. */
+ const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
+
+ for (int i = 0; i < ubo_state->num_enabled; i++) {
+ if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
+ ubo_state->range[i].ubo.bindless) {
+ continue;
+ }
+
+ uint32_t start = ubo_state->range[i].start;
+ uint32_t end = ubo_state->range[i].end;
+ uint32_t size = MIN2(end - start,
+ (16 * xs->constlen) - ubo_state->range[i].offset);
+
+ tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+ tu_cs_emit(cs,
+ CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+ CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+ CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+ CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+ CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+ tu_cs_emit_qw(cs, iova + start);
+ }
+ }
}
static void
if (builder) {
for (uint32_t i = 0; i < MESA_SHADER_STAGES; i++) {
if (builder->variants[i])
- size += builder->variants[i]->info.sizedwords;
+ size += builder->variants[i]->info.size / 4;
}
- size += builder->binning_variant->info.sizedwords;
+ size += builder->binning_variant->info.size / 4;
} else {
- size += compute->info.sizedwords;
+ size += compute->info.size / 4;
}
tu_cs_init(&pipeline->cs, dev, TU_CS_MODE_SUB_STREAM, size);
return 0;
/* this expects to get enough alignment because shaders are allocated first
- * and sizedwords is always aligned correctly
+ * and total size is always aligned correctly
* note: an assert in tu6_emit_xs_config validates the alignment
*/
- tu_cs_alloc(&pipeline->cs, variant->info.sizedwords, 1, &memory);
+ tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
- memcpy(memory.map, variant->bin, sizeof(uint32_t) * variant->info.sizedwords);
+ memcpy(memory.map, variant->bin, variant->info.size);
return memory.iova;
}
OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
for (int i = 0; i < num_ubos; i++) {
+ /* NIR constant data is packed into the end of the shader. */
+ if (i == const_state->constant_data_ubo) {
+ int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
+ OUT_RELOC(ring, v->bo,
+ v->info.constant_data_offset,
+ (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32,
+ 0);
+ continue;
+ }
+
struct pipe_constant_buffer *cb = &constbuf->cb[i];
/* If we have user pointers (constbuf 0, aka GL uniforms), upload them
}
/**
+ * Uploads the referenced subranges of the NIR constant_data to the hardware's
+ * constant buffer.
+ */
+static inline void
+ir3_emit_constant_data(struct fd_screen *screen,
+ const struct ir3_shader_variant *v, struct fd_ringbuffer *ring)
+{
+ const struct ir3_const_state *const_state = ir3_const_state(v);
+ const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
+
+ for (unsigned i = 0; i < state->num_enabled; i++) {
+ unsigned ubo = state->range[i].ubo.block;
+ if (ubo != const_state->constant_data_ubo)
+ continue;
+
+ uint32_t size = state->range[i].end - state->range[i].start;
+
+ /* Pre-a6xx, we might have ranges enabled in the shader that aren't
+ * used in the binning variant.
+ */
+ if (16 * v->constlen <= state->range[i].offset)
+ continue;
+
+ /* And even if the start of the range is within constlen, the end may
+ * not be:
+ */
+ size = MIN2(size, (16 * v->constlen) - state->range[i].offset);
+
+ if (size == 0)
+ continue;
+
+ emit_const_bo(ring, v, state->range[i].offset / 4,
+ v->info.constant_data_offset + state->range[i].start,
+ size / 4, v->bo);
+ }
+}
+
+/**
* Uploads sub-ranges of UBOs to the hardware's constant buffer (UBO access
* outside of these ranges will be done using full UBO accesses in the
* shader).
for (unsigned i = 0; i < state->num_enabled; i++) {
assert(!state->range[i].ubo.bindless);
unsigned ubo = state->range[i].ubo.block;
- if (!(constbuf->enabled_mask & (1 << ubo)))
+ if (!(constbuf->enabled_mask & (1 << ubo)) ||
+ ubo == const_state->constant_data_ubo) {
continue;
+ }
struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
uint32_t size = state->range[i].end - state->range[i].start;
struct fd_bo *bos[params];
for (uint32_t i = 0; i < params; i++) {
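+ /* The constant data UBO points at the tail of the shader BO itself. */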
+ if (i == const_state->constant_data_ubo) {
+ bos[i] = v->bo;
+ offsets[i] = v->info.constant_data_offset;
+ continue;
+ }
+
struct pipe_constant_buffer *cb = &constbuf->cb[i];
/* If we have user pointers (constbuf 0, aka GL uniforms), upload
if (size > 0)
emit_const_user(ring, v, base, size, const_state->immediates);
+
+ /* NIR constant data has the same lifetime as immediates, so upload it
+ * now, too.
+ */
+ ir3_emit_constant_data(screen, v, ring);
}
static inline void
assert(!v->bo);
- unsigned sz = v->info.sizedwords * 4;
-
- v->bo = fd_bo_new(compiler->dev, sz,
+ v->bo = fd_bo_new(compiler->dev, v->info.size,
DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
DRM_FREEDRENO_GEM_TYPE_KMEM,
"%s:%s", ir3_shader_stage(v), info->name);
/* Always include shaders in kernel crash dumps. */
fd_bo_mark_for_dump(v->bo);
- memcpy(fd_bo_map(v->bo), v->bin, sz);
+ memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
}
struct ir3_shader_variant *