From: Rob Clark Date: Sun, 3 Sep 2023 16:18:42 +0000 (-0700) Subject: freedreno/a6xx: Re-write the function-of-doom X-Git-Tag: upstream/23.3.3~2171 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c2e8b529220fdc69ebeafbd84c3ddd8a0f848c6a;p=platform%2Fupstream%2Fmesa.git freedreno/a6xx: Re-write the function-of-doom Re-write the program state building to align more closely with turnip, for easier comparision. Signed-off-by: Rob Clark Part-of: --- diff --git a/src/freedreno/common/fd6_pack.h b/src/freedreno/common/fd6_pack.h index 630cc14..da24274 100644 --- a/src/freedreno/common/fd6_pack.h +++ b/src/freedreno/common/fd6_pack.h @@ -180,4 +180,12 @@ __reg_iova(const struct fd_reg_pair *reg) ring->cur = __p; \ } while (0) +#define OUT_BUF(ring, dwords, sizedwords) \ + do { \ + uint32_t *__p = ring->cur; \ + memcpy(__p, dwords, 4 * sizedwords); \ + __p += sizedwords; \ + ring->cur = __p; \ + } while (0) + #endif /* FD6_PACK_H */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc b/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc index 3ead08e..0b3ffd0 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.cc @@ -806,9 +806,11 @@ fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, screen->info->a6xx.magic.UCHE_UNKNOWN_0E12); WRITE(REG_A6XX_UCHE_CLIENT_PF, screen->info->a6xx.magic.UCHE_CLIENT_PF); WRITE(REG_A6XX_RB_UNKNOWN_8E01, screen->info->a6xx.magic.RB_UNKNOWN_8E01); + WRITE(REG_A6XX_SP_UNKNOWN_A9A8, 0); WRITE(REG_A6XX_SP_MODE_CONTROL, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX); + WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0); WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010); WRITE(REG_A6XX_PC_MODE_CNTL, screen->info->a6xx.magic.PC_MODE_CNTL); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc index e514ada..d02b350 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.cc @@ -42,6 +42,21 @@ #include "fd6_program.h" #include "fd6_texture.h" +/** + * Temporary program building state. + */ +struct program_builder { + struct fd6_program_state *state; + struct fd_context *ctx; + const struct ir3_cache_key *key; + const struct ir3_shader_variant *vs; + const struct ir3_shader_variant *hs; + const struct ir3_shader_variant *ds; + const struct ir3_shader_variant *gs; + const struct ir3_shader_variant *fs; + const struct ir3_shader_variant *last_shader; + bool binning_pass; +}; static const struct xs_config { uint16_t reg_sp_xs_instrlen; @@ -464,217 +479,165 @@ fd6_emit_tess_bos(struct fd_screen *screen, struct fd_ringbuffer *ring, OUT_RELOC(ring, screen->tess_bo, 0, 0, 0); } -static enum a6xx_tex_prefetch_cmd -tex_opc_to_prefetch_cmd(opc_t tex_opc) +static enum a6xx_tess_output +primitive_to_tess(enum mesa_prim primitive) { - switch (tex_opc) { - case OPC_SAM: - return TEX_PREFETCH_SAM; + switch (primitive) { + case MESA_PRIM_POINTS: + return TESS_POINTS; + case MESA_PRIM_LINE_STRIP: + return TESS_LINES; + case MESA_PRIM_TRIANGLE_STRIP: + return TESS_CW_TRIS; default: - unreachable("Unknown tex opc for prefeth cmd"); + unreachable(""); } } -template +#define MAX_VERTEX_ATTRIBS 32 + static void -setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, - struct fd_context *ctx, struct fd6_program_state *state, - const struct ir3_cache_key *cache_key, - bool binning_pass) assert_dt +emit_vfd_dest(struct fd_ringbuffer *ring, const struct ir3_shader_variant *vs) { - const struct ir3_shader_key *key = &cache_key->key; - uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; - uint32_t clip0_regid, clip1_regid; - uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; - uint32_t smask_in_regid, smask_regid; - uint32_t stencilref_regid; - uint32_t vertex_regid, instance_regid, layer_regid, view_regid, vs_primitive_regid; - uint32_t hs_invocation_regid; - uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_rel_patch_regid, - ds_rel_patch_regid, ds_primitive_regid; - uint32_t ij_regid[IJ_COUNT]; - uint32_t gs_header_regid; - enum a6xx_threadsize fssz; - uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0, view_loc =~0; - uint8_t clip0_loc, clip1_loc; - int i, j; - - static const struct ir3_shader_variant dummy_fs = { - .info = { - .max_reg = -1, - .max_half_reg = -1, - .max_const = -1, - }, - }; - const struct ir3_shader_variant *vs = binning_pass ? state->bs : state->vs; - const struct ir3_shader_variant *hs = state->hs; - const struct ir3_shader_variant *ds = state->ds; - const struct ir3_shader_variant *gs = state->gs; - const struct ir3_shader_variant *fs = binning_pass ? &dummy_fs : state->fs; - const struct ir3_shader_variant *last_shader = binning_pass ? state->bs : - fd6_last_shader(state); - - /* binning VS is wrong when GS is present, so use nonbinning VS - * TODO: compile both binning VS/GS variants correctly - */ - if (binning_pass && state->gs) - vs = state->vs; - - bool sample_shading = fs->per_samp | key->sample_shading; - - fssz = fs->info.double_threadsize ? THREAD128 : THREAD64; - - pos_regid = ir3_find_output_regid(last_shader, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1); - layer_regid = ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER); - view_regid = ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT); - vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); - instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); - if (hs) - vs_primitive_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); - else if (gs) - vs_primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); - else - vs_primitive_regid = regid(63, 0); + uint32_t attr_count = 0; - bool hs_reads_primid = false, ds_reads_primid = false; - if (hs) { - tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD); - tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); - hs_reads_primid = VALIDREG(ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID)); - ds_reads_primid = VALIDREG(ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID)); - hs_rel_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3); - ds_rel_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3); - ds_primitive_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID); - hs_invocation_regid = - ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3); - - pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1); - } else { - tess_coord_x_regid = regid(63, 0); - tess_coord_y_regid = regid(63, 0); - hs_rel_patch_regid = regid(63, 0); - ds_rel_patch_regid = regid(63, 0); - ds_primitive_regid = regid(63, 0); - hs_invocation_regid = regid(63, 0); - } + for (uint32_t i = 0; i < vs->inputs_count; i++) + if (!vs->inputs[i].sysval) + attr_count++; - bool gs_reads_primid = false; - if (gs) { - gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3); - gs_reads_primid = VALIDREG(ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID)); - pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1); - } else { - gs_header_regid = regid(63, 0); - } + OUT_REG(ring, A6XX_VFD_CONTROL_0( + .fetch_cnt = attr_count, /* decode_cnt for binning pass ? */ + .decode_cnt = attr_count)); - if (fs->color0_mrt) { - color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = - color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = - ir3_find_output_regid(fs, FRAG_RESULT_COLOR); - } else { - color_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0); - color_regid[1] = ir3_find_output_regid(fs, FRAG_RESULT_DATA1); - color_regid[2] = ir3_find_output_regid(fs, FRAG_RESULT_DATA2); - color_regid[3] = ir3_find_output_regid(fs, FRAG_RESULT_DATA3); - color_regid[4] = ir3_find_output_regid(fs, FRAG_RESULT_DATA4); - color_regid[5] = ir3_find_output_regid(fs, FRAG_RESULT_DATA5); - color_regid[6] = ir3_find_output_regid(fs, FRAG_RESULT_DATA6); - color_regid[7] = ir3_find_output_regid(fs, FRAG_RESULT_DATA7); - } + if (attr_count) + OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL_INSTR(0), attr_count); - samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); - smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); - face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); - coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); - zwcoord_regid = next_regid(coord_regid, 2); - posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); - smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); - stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); - for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) - ij_regid[i] = - ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - - if (fs->num_sampler_prefetch > 0) { - /* It seems like ij_pix is *required* to be r0.x */ - assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) || - ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); + for (uint32_t i = 0; i < attr_count; i++) { + assert(vs->inputs[i].compmask); + assert(!vs->inputs[i].sysval); + OUT_RING(ring, + A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | + A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); } +} - /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we - * end up masking the single sample!! - */ - if (!key->msaa) - smask_regid = regid(63, 0); - - if (!binning_pass) - state->num_viewports = VALIDREG(view_regid) ? PIPE_MAX_VIEWPORTS : 1; - - /* we could probably divide this up into things that need to be - * emitted if frag-prog is dirty vs if vert-prog is dirty.. +static void +emit_vs_system_values(struct fd_ringbuffer *ring, + const struct program_builder *b) +{ + const uint32_t vertexid_regid = + ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_VERTEX_ID); + const uint32_t instanceid_regid = + ir3_find_sysval_regid(b->vs, SYSTEM_VALUE_INSTANCE_ID); + const uint32_t tess_coord_x_regid = + ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_TESS_COORD); + const uint32_t tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); + const uint32_t hs_rel_patch_regid = + ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_REL_PATCH_ID_IR3); + const uint32_t ds_rel_patch_regid = + ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_REL_PATCH_ID_IR3); + const uint32_t hs_invocation_regid = + ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_TCS_HEADER_IR3); + const uint32_t gs_primitiveid_regid = + ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_PRIMITIVE_ID); + const uint32_t vs_primitiveid_regid = b->hs ? + ir3_find_sysval_regid(b->hs, SYSTEM_VALUE_PRIMITIVE_ID) : + gs_primitiveid_regid; + const uint32_t ds_primitiveid_regid = + ir3_find_sysval_regid(b->ds, SYSTEM_VALUE_PRIMITIVE_ID); + const uint32_t gsheader_regid = + ir3_find_sysval_regid(b->gs, SYSTEM_VALUE_GS_HEADER_IR3); + + /* Note: we currently don't support multiview. */ + const uint32_t viewid_regid = INVALID_REG; - OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); - OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | - COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]), - A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) | - COND(fs->prefetch_end_of_quad, - A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD)); - for (int i = 0; i < fs->num_sampler_prefetch; i++) { - const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - OUT_RING(ring, SP_FS_PREFETCH_CMD( - CHIP, i, - .src = prefetch->src, - .samp_id = prefetch->samp_id, - .tex_id = prefetch->tex_id, - .dst = prefetch->dst, - .wrmask = prefetch->wrmask, - .half = prefetch->half_precision, - .bindless = prefetch->bindless, - .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc), - ).value - ); - } - - OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1); - OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); - - bool fs_has_dual_src_color = - !binning_pass && fs->fs.color_is_dual_source; - - OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); - OUT_RING(ring, - A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | - A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | - A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | - COND(fs_has_dual_src_color, - A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); - - fd6_emit_shader(ctx, ring, vs); - if (hs) { - fd6_emit_tess_bos(ctx->screen, ring, hs); - fd6_emit_tess_bos(ctx->screen, ring, ds); - } - - struct ir3_shader_linkage l = {0}; + OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); + OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) | + A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) | + A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) | + A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid)); + OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | + A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); + OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | + A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | + A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | + A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid)); + OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ + OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) | + 0xfc00); /* VFD_CONTROL_5 */ + OUT_RING(ring, COND(b->fs->reads_primid, A6XX_VFD_CONTROL_6_PRIMID4PSEN)); /* VFD_CONTROL_6 */ +} - bool do_streamout = (last_shader->stream_output.num_outputs > 0); - uint8_t clip_mask = last_shader->clip_mask, - cull_mask = last_shader->cull_mask; - uint8_t clip_cull_mask = clip_mask | cull_mask; +static void +emit_vpc(struct fd_ringbuffer *ring, const struct program_builder *b) +{ + const struct ir3_shader_variant *last_shader = b->last_shader; + + /* note: doesn't compile as static because of the array regs.. */ + const struct reg_config { + uint16_t reg_sp_xs_out_reg; + uint16_t reg_sp_xs_vpc_dst_reg; + uint16_t reg_vpc_xs_pack; + uint16_t reg_vpc_xs_clip_cntl; + uint16_t reg_gras_xs_cl_cntl; + uint16_t reg_pc_xs_out_cntl; + uint16_t reg_sp_xs_primitive_cntl; + uint16_t reg_vpc_xs_layer_cntl; + uint16_t reg_gras_xs_layer_cntl; + } reg_config[] = { + [MESA_SHADER_VERTEX] = { + REG_A6XX_SP_VS_OUT_REG(0), + REG_A6XX_SP_VS_VPC_DST_REG(0), + REG_A6XX_VPC_VS_PACK, + REG_A6XX_VPC_VS_CLIP_CNTL, + REG_A6XX_GRAS_VS_CL_CNTL, + REG_A6XX_PC_VS_OUT_CNTL, + REG_A6XX_SP_VS_PRIMITIVE_CNTL, + REG_A6XX_VPC_VS_LAYER_CNTL, + REG_A6XX_GRAS_VS_LAYER_CNTL + }, + [MESA_SHADER_TESS_CTRL] = { + 0, + 0, + 0, + 0, + 0, + REG_A6XX_PC_HS_OUT_CNTL, + 0, + 0, + 0 + }, + [MESA_SHADER_TESS_EVAL] = { + REG_A6XX_SP_DS_OUT_REG(0), + REG_A6XX_SP_DS_VPC_DST_REG(0), + REG_A6XX_VPC_DS_PACK, + REG_A6XX_VPC_DS_CLIP_CNTL, + REG_A6XX_GRAS_DS_CL_CNTL, + REG_A6XX_PC_DS_OUT_CNTL, + REG_A6XX_SP_DS_PRIMITIVE_CNTL, + REG_A6XX_VPC_DS_LAYER_CNTL, + REG_A6XX_GRAS_DS_LAYER_CNTL + }, + [MESA_SHADER_GEOMETRY] = { + REG_A6XX_SP_GS_OUT_REG(0), + REG_A6XX_SP_GS_VPC_DST_REG(0), + REG_A6XX_VPC_GS_PACK, + REG_A6XX_VPC_GS_CLIP_CNTL, + REG_A6XX_GRAS_GS_CL_CNTL, + REG_A6XX_PC_GS_OUT_CNTL, + REG_A6XX_SP_GS_PRIMITIVE_CNTL, + REG_A6XX_VPC_GS_LAYER_CNTL, + REG_A6XX_GRAS_GS_LAYER_CNTL + }, + }; + const struct reg_config *cfg = ®_config[b->last_shader->type]; - clip_mask &= cache_key->clip_plane_enable; + struct ir3_shader_linkage linkage = { + .primid_loc = 0xff, + .clip0_loc = 0xff, + .clip1_loc = 0xff, + }; /* If we have streamout, link against the real FS, rather than the * dummy FS used for binning pass state, to ensure the OUTLOC's @@ -683,65 +646,79 @@ setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, * or draw pass program, but the same streamout stateobj is used * in either case: */ - ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true); - - clip0_loc = l.clip0_loc; - clip1_loc = l.clip1_loc; + bool do_streamout = (b->last_shader->stream_output.num_outputs > 0); + ir3_link_shaders(&linkage, b->last_shader, + do_streamout ? b->state->fs : b->fs, + true); - OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); - OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ - OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ - OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ - OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ + if (do_streamout) + ir3_link_stream_out(&linkage, b->last_shader); - /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ - ir3_link_stream_out(&l, last_shader); + emit_vs_system_values(ring, b); - if (VALIDREG(layer_regid)) { - layer_loc = l.max_loc; - ir3_link_add(&l, VARYING_SLOT_LAYER, layer_regid, 0x1, l.max_loc); + OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); + OUT_RING(ring, ~linkage.varmask[0]); + OUT_RING(ring, ~linkage.varmask[1]); + OUT_RING(ring, ~linkage.varmask[2]); + OUT_RING(ring, ~linkage.varmask[3]); + + /* a6xx finds position/pointsize at the end */ + const uint32_t position_regid = + ir3_find_output_regid(last_shader, VARYING_SLOT_POS); + const uint32_t pointsize_regid = + ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ); + const uint32_t layer_regid = + ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER); + const uint32_t view_regid = + ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT); + const uint32_t clip0_regid = + ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0); + const uint32_t clip1_regid = + ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1); + uint32_t flags_regid = b->gs ? + ir3_find_output_regid(b->gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0; + + uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff; + +// XXX replace regid(63,0) with INVALID_REG + if (layer_regid != regid(63, 0)) { + layer_loc = linkage.max_loc; + ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc); } - if (VALIDREG(view_regid)) { - view_loc = l.max_loc; - ir3_link_add(&l, VARYING_SLOT_VIEWPORT, view_regid, 0x1, l.max_loc); + if (view_regid != regid(63, 0)) { + view_loc = linkage.max_loc; + ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc); } - if (VALIDREG(pos_regid)) { - pos_loc = l.max_loc; - ir3_link_add(&l, VARYING_SLOT_POS, pos_regid, 0xf, l.max_loc); + if (position_regid != regid(63, 0)) { + position_loc = linkage.max_loc; + ir3_link_add(&linkage, VARYING_SLOT_POS, position_regid, 0xf, linkage.max_loc); } - if (VALIDREG(psize_regid)) { - psize_loc = l.max_loc; - ir3_link_add(&l, VARYING_SLOT_PSIZ, psize_regid, 0x1, l.max_loc); + if (pointsize_regid != regid(63, 0)) { + pointsize_loc = linkage.max_loc; + ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc); } - /* Handle the case where clip/cull distances aren't read by the FS. Make - * sure to avoid adding an output with an empty writemask if the user - * disables all the clip distances in the API so that the slot is unused. - */ - if (clip0_loc == 0xff && VALIDREG(clip0_regid) && - (clip_cull_mask & 0xf) != 0) { - clip0_loc = l.max_loc; - ir3_link_add(&l, VARYING_SLOT_CLIP_DIST0, clip0_regid, - clip_cull_mask & 0xf, l.max_loc); - } + uint8_t clip_mask = last_shader->clip_mask, + cull_mask = last_shader->cull_mask; + uint8_t clip_cull_mask = clip_mask | cull_mask; - if (clip1_loc == 0xff && VALIDREG(clip1_regid) && - (clip_cull_mask >> 4) != 0) { - clip1_loc = l.max_loc; - ir3_link_add(&l, VARYING_SLOT_CLIP_DIST1, clip1_regid, - clip_cull_mask >> 4, l.max_loc); - } + clip_mask &= b->key->clip_plane_enable; - /* The hw seems to be unhappy if the last geom shader has no outputs. - * Normally you'd at least see gl_Position writes, but if the last - * geom stage simply writes to an SSBO (for ex) and no other outputs, - * insert a dummy varying - */ - if (!l.cnt) - ir3_link_add(&l, 0, regid(0, 0), 0x1, l.max_loc); + /* Handle the case where clip/cull distances aren't read by the FS */ + uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc; + if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) { + clip0_loc = linkage.max_loc; + ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid, + clip_cull_mask & 0xf, linkage.max_loc); + } + if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) { + clip1_loc = linkage.max_loc; + ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid, + clip_cull_mask >> 4, linkage.max_loc); + } /* If we have stream-out, we use the full shader for binning * pass, rather than the optimized binning pass one, so that we @@ -749,182 +726,213 @@ setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, * state should always be derived from the non-binning pass * program: */ - if (do_streamout && !binning_pass) { - setup_stream_out(ctx, state, last_shader, &l); + if (do_streamout && !b->binning_pass) { + setup_stream_out(b->ctx, b->state, b->last_shader, &linkage); - if (!fd6_context(ctx)->streamout_disable_stateobj) - setup_stream_out_disable(ctx); + if (!fd6_context(b->ctx)->streamout_disable_stateobj) + setup_stream_out_disable(b->ctx); } - assert(l.cnt <= 32); - if (gs) - OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); - else if (ds) - OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); - else - OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); + /* The GPU hangs on some models when there are no outputs (xs_pack::CNT), + * at least when a DS is the last stage, so add a dummy output to keep it + * happy if there aren't any. We do this late in order to avoid emitting + * any unused code and make sure that optimizations don't remove it. + */ + if (linkage.cnt == 0) + ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc); + + /* map outputs of the last shader to VPC */ + assert(linkage.cnt <= 32); + const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2); + const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4); + uint16_t sp_out[32] = {0}; + uint8_t sp_vpc_dst[32] = {0}; + for (uint32_t i = 0; i < linkage.cnt; i++) { + sp_out[i] = + A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) | + A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask); + sp_vpc_dst[i] = + A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc); + } - for (j = 0; j < l.cnt;) { - uint32_t reg = 0; + OUT_PKT4(ring, cfg->reg_sp_xs_out_reg, sp_out_count); + OUT_BUF(ring, sp_out, sp_out_count); - reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); - reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); - j++; + OUT_PKT4(ring, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count); + OUT_BUF(ring, sp_vpc_dst, sp_vpc_dst_count); - reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); - reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); - j++; + OUT_PKT4(ring, cfg->reg_vpc_xs_pack, 1); + OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) | + A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) | + A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc)); - OUT_RING(ring, reg); - } + OUT_PKT4(ring, cfg->reg_vpc_xs_clip_cntl, 1); + OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | + A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); - if (gs) - OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); - else if (ds) - OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); - else - OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); + OUT_PKT4(ring, cfg->reg_gras_xs_cl_cntl, 1); + OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | + A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); + + const struct ir3_shader_variant *geom_stages[] = { b->vs, b->hs, b->ds, b->gs }; + + for (unsigned i = 0; i < ARRAY_SIZE(geom_stages); i++) { + const struct ir3_shader_variant *shader = geom_stages[i]; + if (!shader) + continue; + + bool primid = shader->type != MESA_SHADER_VERTEX && + VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID)); + + OUT_PKT4(ring, reg_config[shader->type].reg_pc_xs_out_cntl, 1); + if (shader == last_shader) { + OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) | + CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | + CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | + CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) | + COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) | + COND(primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) | + A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + } else { + OUT_RING(ring, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID)); + } + } - for (j = 0; j < l.cnt;) { - uint32_t reg = 0; + /* if vertex_flags somehow gets optimized out, your gonna have a bad time: */ + assert(flags_regid != INVALID_REG); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); + OUT_PKT4(ring, cfg->reg_sp_xs_primitive_cntl, 1); + OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) | + A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); - OUT_RING(ring, reg); - } + OUT_PKT4(ring, cfg->reg_vpc_xs_layer_cntl, 1); + OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | + A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc)); - if (hs) { - assert(vs->mergedregs == hs->mergedregs); + OUT_PKT4(ring, cfg->reg_gras_xs_layer_cntl, 1); + OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) | + CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW)); - fd6_emit_shader(ctx, ring, hs); - fd6_emit_link_map(vs, hs, ring); + OUT_REG(ring, A6XX_PC_PS_CNTL(b->fs->reads_primid)); - fd6_emit_shader(ctx, ring, ds); - fd6_emit_link_map(hs, ds, ring); + OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); + OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(b->fs->total_in) | + COND(b->fs->total_in, A6XX_VPC_CNTL_0_VARYING) | + A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) | + A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc)); + if (b->hs) { OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); - OUT_RING(ring, hs->tess.tcs_vertices_out); + OUT_RING(ring, b->hs->tess.tcs_vertices_out); - if (ctx->screen->info->a6xx.tess_use_shared) { - unsigned hs_input_size = 6 + (3 * (vs->output_size - 1)); - unsigned wave_input_size = - MIN2(64, DIV_ROUND_UP(hs_input_size * 4, - hs->tess.tcs_vertices_out)); + fd6_emit_link_map(b->vs, b->hs, ring); + fd6_emit_link_map(b->hs, b->ds, ring); + } - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_input_size); + if (b->gs) { + uint32_t vertices_out, invocations, vec4_size; + uint32_t prev_stage_output_size = + b->ds ? b->ds->output_size : b->vs->output_size; - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); + if (b->hs) { + fd6_emit_link_map(b->ds, b->gs, ring); } else { - uint32_t hs_input_size = - hs->tess.tcs_vertices_out * vs->output_size / 4; - - /* Total attribute slots in HS incoming patch. */ - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_input_size); - - const uint32_t wavesize = 64; - const uint32_t max_wave_input_size = 64; - const uint32_t patch_control_points = hs->tess.tcs_vertices_out; - - /* note: if HS is really just the VS extended, then this - * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) - * however that doesn't match the blob, and fails some dEQP tests. - */ - uint32_t prims_per_wave = wavesize / hs->tess.tcs_vertices_out; - uint32_t max_prims_per_wave = max_wave_input_size * wavesize / - (vs->output_size * patch_control_points); - prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); - - uint32_t total_size = - vs->output_size * patch_control_points * prims_per_wave; - uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); - - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); + fd6_emit_link_map(b->vs, b->gs, ring); } + vertices_out = b->gs->gs.vertices_out - 1; + enum a6xx_tess_output output = + primitive_to_tess((enum mesa_prim)b->gs->gs.output_primitive); + invocations = b->gs->gs.invocations - 1; + /* Size of per-primitive alloction in ldlw memory in vec4s. */ + vec4_size = b->gs->gs.vertices_in * + DIV_ROUND_UP(prev_stage_output_size, 4); - OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); - enum a6xx_tess_output output; - if (ds->tess.point_mode) - output = TESS_POINTS; - else if (ds->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) - output = TESS_LINES; - else if (ds->tess.ccw) - output = TESS_CCW_TRIS; - else - output = TESS_CW_TRIS; - - OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING( - fd6_gl2spacing(ds->tess.spacing)) | - A6XX_PC_TESS_CNTL_OUTPUT(output)); + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); + OUT_RING(ring, + A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations)); - OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1); - OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1); + OUT_RING(ring, 0xff); - OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1); - OUT_RING(ring, 0x0000ffff); + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); - OUT_PKT4(ring, REG_A6XX_GRAS_DS_LAYER_CNTL, 1); - OUT_RING(ring, 0x0); + uint32_t prim_size = prev_stage_output_size; + if (prim_size > 64) + prim_size = 64; + else if (prim_size == 64) + prim_size = 63; - OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) | - A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask)); + OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); + OUT_RING(ring, prim_size); + } +} - OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); - OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_VS_PACK_PSIZELOC(255) | - A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); +static enum a6xx_tex_prefetch_cmd +tex_opc_to_prefetch_cmd(opc_t tex_opc) +{ + switch (tex_opc) { + case OPC_SAM: + return TEX_PREFETCH_SAM; + default: + unreachable("Unknown tex opc for prefeth cmd"); + } +} - OUT_PKT4(ring, REG_A6XX_VPC_DS_PACK, 1); - OUT_RING(ring, A6XX_VPC_DS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_DS_PACK_PSIZELOC(psize_loc) | - A6XX_VPC_DS_PACK_STRIDE_IN_VPC(l.max_loc)); +template +static void +emit_fs_inputs(struct fd_ringbuffer *ring, const struct program_builder *b) +{ + const struct ir3_shader_variant *fs = b->fs; + uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; + uint32_t ij_regid[IJ_COUNT]; + uint32_t smask_in_regid; - OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_OUT(l.cnt)); + bool sample_shading = fs->per_samp | fs->key.sample_shading; + bool enable_varyings = fs->total_in > 0; - OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1); - OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) | - COND(ds_reads_primid, A6XX_PC_DS_OUT_CNTL_PRIMITIVE_ID) | - A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); + smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); + face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); + coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); + zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0); + for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) + ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - OUT_PKT4(ring, REG_A6XX_PC_HS_OUT_CNTL, 1); - OUT_RING(ring, COND(hs_reads_primid, A6XX_PC_HS_OUT_CNTL_PRIMITIVE_ID)); - } else { - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, 0); + if (fs->num_sampler_prefetch > 0) { + /* It seems like ij_pix is *required* to be r0.x */ + assert(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]) || + ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); } - OUT_PKT4(ring, REG_A6XX_SP_VS_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(l.cnt)); - - bool enable_varyings = fs->total_in > 0; - - OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); - OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | - COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) | - A6XX_VPC_CNTL_0_PRIMIDLOC(l.primid_loc) | - A6XX_VPC_CNTL_0_VIEWIDLOC(0xff)); - - OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1); - OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | - CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | - CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) | - A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); + OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | + COND(!VALIDREG(ij_regid[IJ_PERSP_PIXEL]), + A6XX_SP_FS_PREFETCH_CNTL_IJ_WRITE_DISABLE) | + COND(fs->prefetch_end_of_quad, + A6XX_SP_FS_PREFETCH_CNTL_ENDOFQUAD)); + for (int i = 0; i < fs->num_sampler_prefetch; i++) { + const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; + OUT_RING(ring, SP_FS_PREFETCH_CMD( + CHIP, i, + .src = prefetch->src, + .samp_id = prefetch->samp_id, + .tex_id = prefetch->tex_id, + .dst = prefetch->dst, + .wrmask = prefetch->wrmask, + .half = prefetch->half_precision, + .bindless = prefetch->bindless, + .cmd = tex_opc_to_prefetch_cmd(prefetch->tex_opc), + ).value + ); + } OUT_REG(ring, HLSQ_CONTROL_1_REG(CHIP, - screen->info->a6xx.prim_alloc_threshold), + b->ctx->screen->info->a6xx.prim_alloc_threshold), HLSQ_CONTROL_2_REG( CHIP, .faceregid = face_regid, @@ -953,18 +961,15 @@ setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, ), ); + enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64; OUT_REG(ring, HLSQ_FS_CNTL_0( CHIP, - .threadsize = fssz, + .threadsize = thrsz, .varyings = enable_varyings, ), ); - OUT_PKT4(ring, REG_A6XX_VPC_VS_LAYER_CNTL, 1); - OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | - A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc)); - bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; bool need_size_persamp = false; if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) { @@ -975,15 +980,12 @@ setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, } OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); - OUT_RING( - ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], - A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | + OUT_RING(ring, + CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | - CONDREG(ij_regid[IJ_LINEAR_CENTROID], - A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) | CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) | COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) | @@ -991,31 +993,26 @@ setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2); - OUT_RING( - ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], - A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], - A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], - A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], - A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | - CONDREG(ij_regid[IJ_LINEAR_CENTROID], - A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | - CONDREG(ij_regid[IJ_LINEAR_SAMPLE], - A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | + OUT_RING(ring, + CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | + CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | + CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) | COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) | COND(fs->fragcoord_compmask != 0, A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); - OUT_RING(ring, - CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | - CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | - CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) | - COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); + A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE( + sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) | + CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | + CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | + CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) | + COND(fs->post_depth_coverage, A6XX_RB_RENDER_CONTROL1_POSTDEPTHCOVERAGE) | + COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1); OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); @@ -1028,183 +1025,135 @@ setup_stateobj(struct fd_screen *screen, struct fd_ringbuffer *ring, OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1); OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); +} - OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); - for (i = 0; i < 8; i++) { - OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | - COND(color_regid[i] & HALF_REG_ID, - A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); - if (VALIDREG(color_regid[i])) { - state->mrt_components |= 0xf << (i * 4); - } - } - - /* dual source blending has an extra fs output in the 2nd slot */ - if (fs_has_dual_src_color) { - state->mrt_components |= 0xf << 4; - } - - OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); - OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_VS_PACK_PSIZELOC(psize_loc) | - A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); - - if (gs) { - assert(gs->mergedregs == (ds ? ds->mergedregs : vs->mergedregs)); - - fd6_emit_shader(ctx, ring, gs); - if (ds) - fd6_emit_link_map(ds, gs, ring); - else - fd6_emit_link_map(vs, gs, ring); - - OUT_PKT4(ring, REG_A6XX_VPC_GS_PACK, 1); - OUT_RING(ring, A6XX_VPC_GS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_GS_PACK_PSIZELOC(psize_loc) | - A6XX_VPC_GS_PACK_STRIDE_IN_VPC(l.max_loc)); - - OUT_PKT4(ring, REG_A6XX_VPC_GS_LAYER_CNTL, 1); - OUT_RING(ring, A6XX_VPC_GS_LAYER_CNTL_LAYERLOC(layer_loc) | - A6XX_VPC_GS_LAYER_CNTL_VIEWLOC(view_loc)); +static void +emit_fs_outputs(struct fd_ringbuffer *ring, const struct program_builder *b) +{ + const struct ir3_shader_variant *fs = b->fs; + uint32_t smask_regid, posz_regid, stencilref_regid; - OUT_PKT4(ring, REG_A6XX_GRAS_GS_LAYER_CNTL, 1); - OUT_RING(ring, - CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) | - CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW)); + posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); + smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); + stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); - uint32_t flags_regid = - ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3); + /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we + * end up masking the single sample!! + */ + if (!b->key->key.msaa) + smask_regid = regid(63, 0); - /* if vertex_flags somehow gets optimized out, your gonna have a bad time: */ - assert(flags_regid != INVALID_REG); + int output_reg_count = 0; + uint32_t fragdata_regid[8]; - OUT_PKT4(ring, REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_GS_PRIMITIVE_CNTL_OUT(l.cnt) | - A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); + for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) { + unsigned slot = fs->color0_mrt ? FRAG_RESULT_COLOR : FRAG_RESULT_DATA0 + i; + fragdata_regid[i] = ir3_find_output_regid(fs, slot); + if (VALIDREG(fragdata_regid[i])) + output_reg_count = i + 1; + } - OUT_PKT4(ring, REG_A6XX_PC_GS_OUT_CNTL, 1); - OUT_RING(ring, - A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) | - CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) | - CONDREG(view_regid, A6XX_PC_GS_OUT_CNTL_VIEW) | - COND(gs_reads_primid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) | - A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); + OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | + A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | + A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | + COND(fs->dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); + + OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), output_reg_count); + for (uint32_t i = 0; i < output_reg_count; i++) { + OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) | + COND(fragdata_regid[i] & HALF_REG_ID, + A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); - enum a6xx_tess_output output; - switch (gs->gs.output_primitive) { - case MESA_PRIM_POINTS: - output = TESS_POINTS; - break; - case MESA_PRIM_LINE_STRIP: - output = TESS_LINES; - break; - case MESA_PRIM_TRIANGLE_STRIP: - output = TESS_CW_TRIS; - break; - default: - unreachable(""); + if (VALIDREG(fragdata_regid[i])) { + b->state->mrt_components |= 0xf << (i * 4); } - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); - OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT( - gs->gs.vertices_out - 1) | - A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | - A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS( - gs->gs.invocations - 1)); - - OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) | - A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask)); + } +} - OUT_PKT4(ring, REG_A6XX_VPC_GS_PARAM, 1); - OUT_RING(ring, 0xff); +template +static void +setup_stateobj(struct fd_ringbuffer *ring, const struct program_builder *b) + assert_dt +{ + fd6_emit_shader(b->ctx, ring, b->vs); + fd6_emit_shader(b->ctx, ring, b->hs); + fd6_emit_shader(b->ctx, ring, b->ds); + fd6_emit_shader(b->ctx, ring, b->gs); + if (!b->binning_pass) + fd6_emit_shader(b->ctx, ring, b->fs); + + OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1); - OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + emit_vfd_dest(ring, b->vs); - const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs; + emit_vpc(ring, b); - /* Size of per-primitive alloction in ldlw memory in vec4s. */ - uint32_t vec4_size = gs->gs.vertices_in * - DIV_ROUND_UP(prev->output_size, 4); - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); + emit_fs_inputs(ring, b); + emit_fs_outputs(ring, b); - OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1); - OUT_RING(ring, 0); + if (b->hs) { + fd6_emit_tess_bos(b->ctx->screen, ring, b->hs); + fd6_emit_tess_bos(b->ctx->screen, ring, b->ds); + } - uint32_t prim_size = prev->output_size; - if (prim_size > 64) - prim_size = 64; - else if (prim_size == 64) - prim_size = 63; - OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); - OUT_RING(ring, prim_size); - } else { - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); - OUT_RING(ring, 0); + if (b->hs) { + if (b->ctx->screen->info->a6xx.tess_use_shared) { + unsigned hs_input_size = 6 + (3 * (b->vs->output_size - 1)); + unsigned wave_input_size = + MIN2(64, DIV_ROUND_UP(hs_input_size * 4, + b->hs->tess.tcs_vertices_out)); - OUT_PKT4(ring, REG_A6XX_GRAS_VS_LAYER_CNTL, 1); - OUT_RING(ring, - CONDREG(layer_regid, A6XX_GRAS_VS_LAYER_CNTL_WRITES_LAYER) | - CONDREG(view_regid, A6XX_GRAS_VS_LAYER_CNTL_WRITES_VIEW)); - } + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_input_size); - OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1); - OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + } else { + uint32_t hs_input_size = + b->hs->tess.tcs_vertices_out * b->vs->output_size / 4; - OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | - A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); + /* Total attribute slots in HS incoming patch. */ + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_input_size); - OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1); - OUT_RING(ring, 0); + const uint32_t wavesize = 64; + const uint32_t max_wave_input_size = 64; + const uint32_t patch_control_points = b->hs->tess.tcs_vertices_out; - if (fs->instrlen) - fd6_emit_shader(ctx, ring, fs); + /* note: if HS is really just the VS extended, then this + * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) + * however that doesn't match the blob, and fails some dEQP tests. + */ + uint32_t prims_per_wave = wavesize / b->hs->tess.tcs_vertices_out; + uint32_t max_prims_per_wave = max_wave_input_size * wavesize / + (b->vs->output_size * patch_control_points); + prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); - OUT_REG(ring, A6XX_PC_PS_CNTL(.primitiveiden = fs->reads_primid)); + uint32_t total_size = + b->vs->output_size * patch_control_points * prims_per_wave; + uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); - uint32_t non_sysval_input_count = 0; - for (uint32_t i = 0; i < vs->inputs_count; i++) - if (!vs->inputs[i].sysval) - non_sysval_input_count++; + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + } - OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1); - OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(non_sysval_input_count) | - A6XX_VFD_CONTROL_0_DECODE_CNT(non_sysval_input_count)); + enum a6xx_tess_output output; + if (b->ds->tess.point_mode) + output = TESS_POINTS; + else if (b->ds->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES) + output = TESS_LINES; + else if (b->ds->tess.ccw) + output = TESS_CCW_TRIS; + else + output = TESS_CW_TRIS; - OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), non_sysval_input_count); - for (uint32_t i = 0; i < non_sysval_input_count; i++) { - assert(vs->inputs[i].compmask); - OUT_RING(ring, - A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | - A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); + OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); + OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING( + fd6_gl2spacing(b->ds->tess.spacing)) | + A6XX_PC_TESS_CNTL_OUTPUT(output)); } - - OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); - OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | - A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | - A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitive_regid) | - 0xfc000000); - OUT_RING(ring, - A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) | - A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); - OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) | - A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | - A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | - A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitive_regid)); - OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ - OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) | - 0xfc00); /* VFD_CONTROL_5 */ - OUT_RING(ring, COND(fs->reads_primid, - A6XX_VFD_CONTROL_6_PRIMID4PSEN)); /* VFD_CONTROL_6 */ } static void emit_interp_state(struct fd_ringbuffer *ring, @@ -1388,16 +1337,70 @@ fd6_program_create(void *data, const struct ir3_shader_variant *bs, fd_screen_unlock(screen); } + /* Dummy frag shader used for binning pass: */ + static const struct ir3_shader_variant dummy_fs = { + .info = { + .max_reg = -1, + .max_half_reg = -1, + .max_const = -1, + }, + }; + /* The last geometry stage in use: */ + const struct ir3_shader_variant *last_shader = fd6_last_shader(state); + setup_config_stateobj(ctx, state); - setup_stateobj(screen, state->binning_stateobj, ctx, state, key, true); - setup_stateobj(screen, state->stateobj, ctx, state, key, false); + + struct program_builder b = { + .state = state, + .ctx = ctx, + .key = key, + .hs = state->hs, + .ds = state->ds, + .gs = state->gs, + }; + + /* + * Setup binning pass program state: + */ + + /* binning VS is wrong when GS is present, so use nonbinning VS + * TODO: compile both binning VS/GS variants correctly + * + * If we have stream-out, we use the full shader for binning + * pass, rather than the optimized binning pass one, so that we + * have all the varying outputs available for xfb. So streamout + * state should always be derived from the non-binning pass + * program. + */ + b.vs = state->gs || last_shader->stream_output.num_outputs ? + state->vs : state->bs; + b.fs = &dummy_fs; + b.last_shader = last_shader->type != MESA_SHADER_VERTEX ? + last_shader : state->bs; + b.binning_pass = true; + + setup_stateobj(state->binning_stateobj, &b); + + /* + * Setup draw pass program state: + */ + b.vs = state->vs; + b.fs = state->fs; + b.last_shader = last_shader; + b.binning_pass = false; + + setup_stateobj(state->stateobj, &b); + state->interp_stateobj = create_interp_stateobj(ctx, state); - const struct ir3_stream_output_info *stream_output = - &fd6_last_shader(state)->stream_output; + const struct ir3_stream_output_info *stream_output = &last_shader->stream_output; if (stream_output->num_outputs > 0) state->stream_output = stream_output; + bool has_viewport = + VALIDREG(ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT)); + state->num_viewports = has_viewport ? PIPE_MAX_VIEWPORTS : 1; + /* Note that binning pass uses same const state as draw pass: */ state->user_consts_cmdstream_size = fd6_user_consts_cmdstream_size(state->vs) + @@ -1418,6 +1421,11 @@ fd6_program_create(void *data, const struct ir3_shader_variant *bs, state->num_driver_params = num_dp; + /* dual source blending has an extra fs output in the 2nd slot */ + if (fs->fs.color_is_dual_source) { + state->mrt_components |= 0xf << 4; + } + state->lrz_mask.val = ~0; if (fs->has_kill) {