From: Rob Clark
Date: Wed, 14 Apr 2021 15:04:06 +0000 (-0700)
Subject: freedreno: Re-indent
X-Git-Tag: upstream/21.2.3~4829
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2d439343ea1aee146d4ce32800992cd389bd505d;p=platform%2Fupstream%2Fmesa.git

freedreno: Re-indent

clang-format -fallback-style=none --style=file -i src/gallium/drivers/freedreno/*.[ch] src/gallium/drivers/freedreno/*/*.[ch]

Signed-off-by: Rob Clark
Part-of:
---
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_blend.c b/src/gallium/drivers/freedreno/a2xx/fd2_blend.c
index 7593eaa..4d9210d 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_blend.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_blend.c
@@ -26,87 +26,90 @@
 #include "pipe/p_state.h"
 #include "util/u_blend.h"
-#include "util/u_string.h"
 #include "util/u_memory.h"
+#include "util/u_string.h"
 
 #include "fd2_blend.h"
 #include "fd2_context.h"
 #include "fd2_util.h"
 
-
 static enum a2xx_rb_blend_opcode
 blend_func(unsigned func)
 {
-	switch (func) {
-	case PIPE_BLEND_ADD:
-		return BLEND2_DST_PLUS_SRC;
-	case PIPE_BLEND_MIN:
-		return BLEND2_MIN_DST_SRC;
-	case PIPE_BLEND_MAX:
-		return BLEND2_MAX_DST_SRC;
-	case PIPE_BLEND_SUBTRACT:
-		return BLEND2_SRC_MINUS_DST;
-	case PIPE_BLEND_REVERSE_SUBTRACT:
-		return BLEND2_DST_MINUS_SRC;
-	default:
-		DBG("invalid blend func: %x", func);
-		return 0;
-	}
+   switch (func) {
+   case PIPE_BLEND_ADD:
+      return BLEND2_DST_PLUS_SRC;
+   case PIPE_BLEND_MIN:
+      return BLEND2_MIN_DST_SRC;
+   case PIPE_BLEND_MAX:
+      return BLEND2_MAX_DST_SRC;
+   case PIPE_BLEND_SUBTRACT:
+      return BLEND2_SRC_MINUS_DST;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return BLEND2_DST_MINUS_SRC;
+   default:
+      DBG("invalid blend func: %x", func);
+      return 0;
+   }
 }
 
 void *
 fd2_blend_state_create(struct pipe_context *pctx,
-		const struct pipe_blend_state *cso)
+                       const struct pipe_blend_state *cso)
 {
-	const struct pipe_rt_blend_state *rt = &cso->rt[0];
-	struct fd2_blend_stateobj *so;
-	unsigned rop = PIPE_LOGICOP_COPY;
-
-	if (cso->logicop_enable)
-		rop = cso->logicop_func; /* 1:1 mapping with hw */
-
-	if (cso->independent_blend_enable) {
-		DBG("Unsupported! 
independent blend state"); - return NULL; - } - - so = CALLOC_STRUCT(fd2_blend_stateobj); - if (!so) - return NULL; - - so->base = *cso; - - so->rb_colorcontrol = A2XX_RB_COLORCONTROL_ROP_CODE(rop); - - so->rb_blendcontrol = - A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND(fd_blend_factor(rt->rgb_src_factor)) | - A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN(blend_func(rt->rgb_func)) | - A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND(fd_blend_factor(rt->rgb_dst_factor)); - - /* hardware doesn't support SRC_ALPHA_SATURATE for alpha, but it is equivalent to ONE */ - unsigned alpha_src_factor = rt->alpha_src_factor; - if (alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) - alpha_src_factor = PIPE_BLENDFACTOR_ONE; - - so->rb_blendcontrol |= - A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND(fd_blend_factor(alpha_src_factor)) | - A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN(blend_func(rt->alpha_func)) | - A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND(fd_blend_factor(rt->alpha_dst_factor)); - - if (rt->colormask & PIPE_MASK_R) - so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_RED; - if (rt->colormask & PIPE_MASK_G) - so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_GREEN; - if (rt->colormask & PIPE_MASK_B) - so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_BLUE; - if (rt->colormask & PIPE_MASK_A) - so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_ALPHA; - - if (!rt->blend_enable) - so->rb_colorcontrol |= A2XX_RB_COLORCONTROL_BLEND_DISABLE; - - if (cso->dither) - so->rb_colorcontrol |= A2XX_RB_COLORCONTROL_DITHER_MODE(DITHER_ALWAYS); - - return so; + const struct pipe_rt_blend_state *rt = &cso->rt[0]; + struct fd2_blend_stateobj *so; + unsigned rop = PIPE_LOGICOP_COPY; + + if (cso->logicop_enable) + rop = cso->logicop_func; /* 1:1 mapping with hw */ + + if (cso->independent_blend_enable) { + DBG("Unsupported! independent blend state"); + return NULL; + } + + so = CALLOC_STRUCT(fd2_blend_stateobj); + if (!so) + return NULL; + + so->base = *cso; + + so->rb_colorcontrol = A2XX_RB_COLORCONTROL_ROP_CODE(rop); + + so->rb_blendcontrol = + A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND( + fd_blend_factor(rt->rgb_src_factor)) | + A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN(blend_func(rt->rgb_func)) | + A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND( + fd_blend_factor(rt->rgb_dst_factor)); + + /* hardware doesn't support SRC_ALPHA_SATURATE for alpha, but it is + * equivalent to ONE */ + unsigned alpha_src_factor = rt->alpha_src_factor; + if (alpha_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) + alpha_src_factor = PIPE_BLENDFACTOR_ONE; + + so->rb_blendcontrol |= + A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND(fd_blend_factor(alpha_src_factor)) | + A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN(blend_func(rt->alpha_func)) | + A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND( + fd_blend_factor(rt->alpha_dst_factor)); + + if (rt->colormask & PIPE_MASK_R) + so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_RED; + if (rt->colormask & PIPE_MASK_G) + so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_GREEN; + if (rt->colormask & PIPE_MASK_B) + so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_BLUE; + if (rt->colormask & PIPE_MASK_A) + so->rb_colormask |= A2XX_RB_COLOR_MASK_WRITE_ALPHA; + + if (!rt->blend_enable) + so->rb_colorcontrol |= A2XX_RB_COLORCONTROL_BLEND_DISABLE; + + if (cso->dither) + so->rb_colorcontrol |= A2XX_RB_COLORCONTROL_DITHER_MODE(DITHER_ALWAYS); + + return so; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h index 9fe6e6e..813712d 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_blend.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_blend.h @@ -27,23 +27,23 @@ 
#ifndef FD2_BLEND_H_ #define FD2_BLEND_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" struct fd2_blend_stateobj { - struct pipe_blend_state base; - uint32_t rb_blendcontrol; - uint32_t rb_colorcontrol; /* must be OR'd w/ zsa->rb_colorcontrol */ - uint32_t rb_colormask; + struct pipe_blend_state base; + uint32_t rb_blendcontrol; + uint32_t rb_colorcontrol; /* must be OR'd w/ zsa->rb_colorcontrol */ + uint32_t rb_colormask; }; static inline struct fd2_blend_stateobj * fd2_blend_stateobj(struct pipe_blend_state *blend) { - return (struct fd2_blend_stateobj *)blend; + return (struct fd2_blend_stateobj *)blend; } -void * fd2_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso); +void *fd2_blend_state_create(struct pipe_context *pctx, + const struct pipe_blend_state *cso); #endif /* FD2_BLEND_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.c b/src/gallium/drivers/freedreno/a2xx/fd2_context.c index 2d0dfca..2c9118f 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_context.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.c @@ -24,7 +24,6 @@ * Rob Clark */ - #include "fd2_context.h" #include "fd2_blend.h" #include "fd2_draw.h" @@ -37,11 +36,10 @@ #include "fd2_zsa.h" static void -fd2_context_destroy(struct pipe_context *pctx) - in_dt +fd2_context_destroy(struct pipe_context *pctx) in_dt { - fd_context_destroy(pctx); - free(pctx); + fd_context_destroy(pctx); + free(pctx); } static struct pipe_resource * @@ -64,11 +62,12 @@ create_solid_vertexbuf(struct pipe_context *pctx) }; /* clang-format on */ - struct pipe_resource *prsc = pipe_buffer_create(pctx->screen, - PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, sizeof(init_shader_const)); - pipe_buffer_write(pctx, prsc, 0, - sizeof(init_shader_const), init_shader_const); - return prsc; + struct pipe_resource *prsc = + pipe_buffer_create(pctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, + sizeof(init_shader_const)); + pipe_buffer_write(pctx, prsc, 0, sizeof(init_shader_const), + init_shader_const); + return prsc; } /* clang-format off */ @@ -95,40 +94,40 @@ static const uint8_t a20x_primtypes[PIPE_PRIM_MAX] = { struct pipe_context * fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) { - struct fd_screen *screen = fd_screen(pscreen); - struct fd2_context *fd2_ctx = CALLOC_STRUCT(fd2_context); - struct pipe_context *pctx; + struct fd_screen *screen = fd_screen(pscreen); + struct fd2_context *fd2_ctx = CALLOC_STRUCT(fd2_context); + struct pipe_context *pctx; - if (!fd2_ctx) - return NULL; + if (!fd2_ctx) + return NULL; - pctx = &fd2_ctx->base.base; - pctx->screen = pscreen; + pctx = &fd2_ctx->base.base; + pctx->screen = pscreen; - fd2_ctx->base.dev = fd_device_ref(screen->dev); - fd2_ctx->base.screen = fd_screen(pscreen); + fd2_ctx->base.dev = fd_device_ref(screen->dev); + fd2_ctx->base.screen = fd_screen(pscreen); - pctx->destroy = fd2_context_destroy; - pctx->create_blend_state = fd2_blend_state_create; - pctx->create_rasterizer_state = fd2_rasterizer_state_create; - pctx->create_depth_stencil_alpha_state = fd2_zsa_state_create; + pctx->destroy = fd2_context_destroy; + pctx->create_blend_state = fd2_blend_state_create; + pctx->create_rasterizer_state = fd2_rasterizer_state_create; + pctx->create_depth_stencil_alpha_state = fd2_zsa_state_create; - fd2_draw_init(pctx); - fd2_gmem_init(pctx); - fd2_texture_init(pctx); - fd2_prog_init(pctx); - fd2_emit_init(pctx); + fd2_draw_init(pctx); + fd2_gmem_init(pctx); + fd2_texture_init(pctx); + 
fd2_prog_init(pctx); + fd2_emit_init(pctx); - pctx = fd_context_init(&fd2_ctx->base, pscreen, - (screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes, - priv, flags); - if (!pctx) - return NULL; + pctx = fd_context_init( + &fd2_ctx->base, pscreen, + (screen->gpu_id >= 220) ? a22x_primtypes : a20x_primtypes, priv, flags); + if (!pctx) + return NULL; - /* construct vertex state used for solid ops (clear, and gmem<->mem) */ - fd2_ctx->solid_vertexbuf = create_solid_vertexbuf(pctx); + /* construct vertex state used for solid ops (clear, and gmem<->mem) */ + fd2_ctx->solid_vertexbuf = create_solid_vertexbuf(pctx); - fd2_query_context_init(pctx); + fd2_query_context_init(pctx); - return pctx; + return pctx; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_context.h b/src/gallium/drivers/freedreno/a2xx/fd2_context.h index e35f791..5f399ff 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_context.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_context.h @@ -30,21 +30,21 @@ #include "freedreno_context.h" struct fd2_context { - struct fd_context base; + struct fd_context base; - /* vertex buf used for clear/gmem->mem vertices, and mem->gmem - * vertices and tex coords: - */ - struct pipe_resource *solid_vertexbuf; + /* vertex buf used for clear/gmem->mem vertices, and mem->gmem + * vertices and tex coords: + */ + struct pipe_resource *solid_vertexbuf; }; static inline struct fd2_context * fd2_context(struct fd_context *ctx) { - return (struct fd2_context *)ctx; + return (struct fd2_context *)ctx; } -struct pipe_context * -fd2_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); +struct pipe_context *fd2_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags); #endif /* FD2_CONTEXT_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c index 0b6e0f7..ff3c22c 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c @@ -25,164 +25,159 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" #include "util/u_prim.h" +#include "util/u_string.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd2_draw.h" #include "fd2_context.h" +#include "fd2_draw.h" #include "fd2_emit.h" #include "fd2_program.h" #include "fd2_util.h" #include "fd2_zsa.h" - static void emit_cacheflush(struct fd_ringbuffer *ring) { - unsigned i; + unsigned i; - for (i = 0; i < 12; i++) { - OUT_PKT3(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, CACHE_FLUSH); - } + for (i = 0; i < 12; i++) { + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CACHE_FLUSH); + } } static void -emit_vertexbufs(struct fd_context *ctx) - assert_dt +emit_vertexbufs(struct fd_context *ctx) assert_dt { - struct fd_vertex_stateobj *vtx = ctx->vtx.vtx; - struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vtx.vertexbuf; - struct fd2_vertex_buf bufs[PIPE_MAX_ATTRIBS]; - unsigned i; - - if (!vtx->num_elements) - return; - - for (i = 0; i < vtx->num_elements; i++) { - struct pipe_vertex_element *elem = &vtx->pipe[i]; - struct pipe_vertex_buffer *vb = - &vertexbuf->vb[elem->vertex_buffer_index]; - bufs[i].offset = vb->buffer_offset; - bufs[i].size = fd_bo_size(fd_resource(vb->buffer.resource)->bo); - bufs[i].prsc = vb->buffer.resource; - } - - // NOTE I believe the 0x78 (or 0x9c in solid_vp) relates to the - // CONST(20,0) (or CONST(26,0) in soliv_vp) - - fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements); - 
fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements); + struct fd_vertex_stateobj *vtx = ctx->vtx.vtx; + struct fd_vertexbuf_stateobj *vertexbuf = &ctx->vtx.vertexbuf; + struct fd2_vertex_buf bufs[PIPE_MAX_ATTRIBS]; + unsigned i; + + if (!vtx->num_elements) + return; + + for (i = 0; i < vtx->num_elements; i++) { + struct pipe_vertex_element *elem = &vtx->pipe[i]; + struct pipe_vertex_buffer *vb = &vertexbuf->vb[elem->vertex_buffer_index]; + bufs[i].offset = vb->buffer_offset; + bufs[i].size = fd_bo_size(fd_resource(vb->buffer.resource)->bo); + bufs[i].prsc = vb->buffer.resource; + } + + // NOTE I believe the 0x78 (or 0x9c in solid_vp) relates to the + // CONST(20,0) (or CONST(26,0) in soliv_vp) + + fd2_emit_vertex_bufs(ctx->batch->draw, 0x78, bufs, vtx->num_elements); + fd2_emit_vertex_bufs(ctx->batch->binning, 0x78, bufs, vtx->num_elements); } static void draw_impl(struct fd_context *ctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draw, - struct fd_ringbuffer *ring, unsigned index_offset, bool binning) - assert_dt + const struct pipe_draw_start_count *draw, struct fd_ringbuffer *ring, + unsigned index_offset, bool binning) assert_dt { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, info->index_size ? 0 : draw->start); - - OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); - OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); - - if (is_a20x(ctx->screen)) { - /* wait for DMA to finish and - * dummy draw one triangle with indexes 0,0,0. - * with PRE_FETCH_CULL_ENABLE | GRP_CULL_ENABLE. - * - * this workaround is for a HW bug related to DMA alignment: - * it is necessary for indexed draws and possibly also - * draws that read binning data - */ - OUT_PKT3(ring, CP_WAIT_REG_EQ, 4); - OUT_RING(ring, 0x000005d0); /* RBBM_STATUS */ - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00001000); /* bit: 12: VGT_BUSY_NO_DMA */ - OUT_RING(ring, 0x00000001); - - OUT_PKT3(ring, CP_DRAW_INDX_BIN, 6); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x0003c004); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000003); - OUT_RELOC(ring, fd_resource(fd2_context(ctx)->solid_vertexbuf)->bo, 64, 0, 0); - OUT_RING(ring, 0x00000006); - } else { - OUT_WFI (ring); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); - OUT_RING(ring, info->index_bounds_valid ? info->max_index : ~0); /* VGT_MAX_VTX_INDX */ - OUT_RING(ring, info->index_bounds_valid ? info->min_index : 0); /* VGT_MIN_VTX_INDX */ - } - - /* binning shader will take offset from C64 */ - if (binning && is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, 0x00000180); - OUT_RING(ring, fui(ctx->batch->num_vertices)); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(0.0f)); - } - - enum pc_di_vis_cull_mode vismode = USE_VISIBILITY; - if (binning || info->mode == PIPE_PRIM_POINTS) - vismode = IGNORE_VISIBILITY; - - fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], - vismode, info, draw, index_offset); - - if (is_a20x(ctx->screen)) { - /* not sure why this is required, but it fixes some hangs */ - OUT_WFI(ring); - } else { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); - OUT_RING(ring, 0x00000000); - } - - emit_cacheflush(ring); + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); + OUT_RING(ring, info->index_size ? 
0 : draw->start); + + OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); + OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); + + if (is_a20x(ctx->screen)) { + /* wait for DMA to finish and + * dummy draw one triangle with indexes 0,0,0. + * with PRE_FETCH_CULL_ENABLE | GRP_CULL_ENABLE. + * + * this workaround is for a HW bug related to DMA alignment: + * it is necessary for indexed draws and possibly also + * draws that read binning data + */ + OUT_PKT3(ring, CP_WAIT_REG_EQ, 4); + OUT_RING(ring, 0x000005d0); /* RBBM_STATUS */ + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00001000); /* bit: 12: VGT_BUSY_NO_DMA */ + OUT_RING(ring, 0x00000001); + + OUT_PKT3(ring, CP_DRAW_INDX_BIN, 6); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x0003c004); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000003); + OUT_RELOC(ring, fd_resource(fd2_context(ctx)->solid_vertexbuf)->bo, 64, 0, + 0); + OUT_RING(ring, 0x00000006); + } else { + OUT_WFI(ring); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); + OUT_RING(ring, info->index_bounds_valid ? info->max_index + : ~0); /* VGT_MAX_VTX_INDX */ + OUT_RING(ring, info->index_bounds_valid ? info->min_index + : 0); /* VGT_MIN_VTX_INDX */ + } + + /* binning shader will take offset from C64 */ + if (binning && is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000180); + OUT_RING(ring, fui(ctx->batch->num_vertices)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + enum pc_di_vis_cull_mode vismode = USE_VISIBILITY; + if (binning || info->mode == PIPE_PRIM_POINTS) + vismode = IGNORE_VISIBILITY; + + fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode], vismode, info, + draw, index_offset); + + if (is_a20x(ctx->screen)) { + /* not sure why this is required, but it fixes some hangs */ + OUT_WFI(ring); + } else { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_UNKNOWN_2010)); + OUT_RING(ring, 0x00000000); + } + + emit_cacheflush(ring); } - static bool fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *pdraw, - unsigned index_offset) - assert_dt + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count *pdraw, + unsigned index_offset) assert_dt { - if (!ctx->prog.fs || !ctx->prog.vs) - return false; + if (!ctx->prog.fs || !ctx->prog.vs) + return false; - if (pinfo->mode != PIPE_PRIM_MAX && - !indirect && - !pinfo->primitive_restart && - !u_trim_pipe_prim(pinfo->mode, (unsigned*)&pdraw->count)) - return false; + if (pinfo->mode != PIPE_PRIM_MAX && !indirect && !pinfo->primitive_restart && + !u_trim_pipe_prim(pinfo->mode, (unsigned *)&pdraw->count)) + return false; - if (ctx->dirty & FD_DIRTY_VTXBUF) - emit_vertexbufs(ctx); + if (ctx->dirty & FD_DIRTY_VTXBUF) + emit_vertexbufs(ctx); - if (fd_binning_enabled) - fd2_emit_state_binning(ctx, ctx->dirty); + if (fd_binning_enabled) + fd2_emit_state_binning(ctx, ctx->dirty); - fd2_emit_state(ctx, ctx->dirty); + fd2_emit_state(ctx, ctx->dirty); - /* a2xx can draw only 65535 vertices at once - * on a22x the field in the draw command is 32bits but seems limited too - * using a limit of 32k because it fixes an unexplained hang - * 32766 works for all primitives (multiple of 2 and 3) - */ - if (pdraw->count > 32766) { + /* a2xx can draw only 65535 vertices at once + * on a22x the field in the draw command is 32bits but seems limited too + * using a 
limit of 32k because it fixes an unexplained hang + * 32766 works for all primitives (multiple of 2 and 3) + */ + if (pdraw->count > 32766) { /* clang-format off */ static const uint16_t step_tbl[PIPE_PRIM_MAX] = { [0 ... PIPE_PRIM_MAX - 1] = 32766, @@ -195,454 +190,456 @@ fd2_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *pinfo, }; /* clang-format on */ - struct pipe_draw_start_count draw = *pdraw; - unsigned count = draw.count; - unsigned step = step_tbl[pinfo->mode]; - unsigned num_vertices = ctx->batch->num_vertices; - - if (!step) - return false; - - for (; count + step > 32766; count -= step) { - draw.count = MIN2(count, 32766); - draw_impl(ctx, pinfo, &draw, ctx->batch->draw, index_offset, false); - draw_impl(ctx, pinfo, &draw, ctx->batch->binning, index_offset, true); - draw.start += step; - ctx->batch->num_vertices += step; - } - /* changing this value is a hack, restore it */ - ctx->batch->num_vertices = num_vertices; - } else { - draw_impl(ctx, pinfo, pdraw, ctx->batch->draw, index_offset, false); - draw_impl(ctx, pinfo, pdraw, ctx->batch->binning, index_offset, true); - } - - fd_context_all_clean(ctx); - - return true; + struct pipe_draw_start_count draw = *pdraw; + unsigned count = draw.count; + unsigned step = step_tbl[pinfo->mode]; + unsigned num_vertices = ctx->batch->num_vertices; + + if (!step) + return false; + + for (; count + step > 32766; count -= step) { + draw.count = MIN2(count, 32766); + draw_impl(ctx, pinfo, &draw, ctx->batch->draw, index_offset, false); + draw_impl(ctx, pinfo, &draw, ctx->batch->binning, index_offset, true); + draw.start += step; + ctx->batch->num_vertices += step; + } + /* changing this value is a hack, restore it */ + ctx->batch->num_vertices = num_vertices; + } else { + draw_impl(ctx, pinfo, pdraw, ctx->batch->draw, index_offset, false); + draw_impl(ctx, pinfo, pdraw, ctx->batch->binning, index_offset, true); + } + + fd_context_all_clean(ctx); + + return true; } static void clear_state(struct fd_batch *batch, struct fd_ringbuffer *ring, - unsigned buffers, bool fast_clear) - assert_dt + unsigned buffers, bool fast_clear) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd2_context *fd2_ctx = fd2_context(ctx); - uint32_t reg; - - fd2_emit_vertex_bufs(ring, 0x9c, (struct fd2_vertex_buf[]) { - { .prsc = fd2_ctx->solid_vertexbuf, .size = 36 }, - }, 1); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, 0); - - fd2_program_emit(ctx, ring, &ctx->solid_prog); - - OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); - OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); - - if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); - reg = 0; - if (buffers & PIPE_CLEAR_DEPTH) { - reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) | - A2XX_RB_DEPTHCONTROL_Z_ENABLE | - A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE | - A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE; - } - if (buffers & PIPE_CLEAR_STENCIL) { - reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) | - A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE | - A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE); - } - OUT_RING(ring, reg); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); - OUT_RING(ring, A2XX_RB_COLORCONTROL_ALPHA_FUNC(FUNC_ALWAYS) | - A2XX_RB_COLORCONTROL_BLEND_DISABLE | - A2XX_RB_COLORCONTROL_ROP_CODE(12) | - A2XX_RB_COLORCONTROL_DITHER_MODE(DITHER_DISABLE) | - A2XX_RB_COLORCONTROL_DITHER_TYPE(DITHER_PIXEL)); - - 
OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); - OUT_RING(ring, 0x00000000); /* PA_CL_CLIP_CNTL */ - OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | /* PA_SU_SC_MODE_CNTL */ - A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES) | - (fast_clear ? A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE : 0)); - - if (fast_clear) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); - OUT_RING(ring, A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES(3)); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); - OUT_RING(ring, 0x0000ffff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); - if (buffers & PIPE_CLEAR_COLOR) { - OUT_RING(ring, A2XX_RB_COLOR_MASK_WRITE_RED | - A2XX_RB_COLOR_MASK_WRITE_GREEN | - A2XX_RB_COLOR_MASK_WRITE_BLUE | - A2XX_RB_COLOR_MASK_WRITE_ALPHA); - } else { - OUT_RING(ring, 0x0); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, 0); - - if (is_a20x(batch->ctx->screen)) - return; - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); - OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ - OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); - OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); - OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); - OUT_RING(ring, 0x00000084); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000028f); + struct fd_context *ctx = batch->ctx; + struct fd2_context *fd2_ctx = fd2_context(ctx); + uint32_t reg; + + fd2_emit_vertex_bufs(ring, 0x9c, + (struct fd2_vertex_buf[]){ + {.prsc = fd2_ctx->solid_vertexbuf, .size = 36}, + }, + 1); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); + OUT_RING(ring, 0); + + fd2_program_emit(ctx, ring, &ctx->solid_prog); + + OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); + OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); + + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); + reg = 0; + if (buffers & PIPE_CLEAR_DEPTH) { + reg |= A2XX_RB_DEPTHCONTROL_ZFUNC(FUNC_ALWAYS) | + A2XX_RB_DEPTHCONTROL_Z_ENABLE | + A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE | + A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE; + } + if (buffers & PIPE_CLEAR_STENCIL) { + reg |= A2XX_RB_DEPTHCONTROL_STENCILFUNC(FUNC_ALWAYS) | + A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE | + A2XX_RB_DEPTHCONTROL_STENCILZPASS(STENCIL_REPLACE); + } + OUT_RING(ring, reg); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); + OUT_RING(ring, A2XX_RB_COLORCONTROL_ALPHA_FUNC(FUNC_ALWAYS) | + A2XX_RB_COLORCONTROL_BLEND_DISABLE | + A2XX_RB_COLORCONTROL_ROP_CODE(12) | + A2XX_RB_COLORCONTROL_DITHER_MODE(DITHER_DISABLE) | + A2XX_RB_COLORCONTROL_DITHER_TYPE(DITHER_PIXEL)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); + OUT_RING(ring, 0x00000000); /* PA_CL_CLIP_CNTL */ + OUT_RING( + ring, + A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | /* PA_SU_SC_MODE_CNTL */ + 
A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES) | + (fast_clear ? A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE : 0)); + + if (fast_clear) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); + OUT_RING(ring, A2XX_PA_SC_AA_CONFIG_MSAA_NUM_SAMPLES(3)); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); + OUT_RING(ring, 0x0000ffff); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); + if (buffers & PIPE_CLEAR_COLOR) { + OUT_RING(ring, A2XX_RB_COLOR_MASK_WRITE_RED | + A2XX_RB_COLOR_MASK_WRITE_GREEN | + A2XX_RB_COLOR_MASK_WRITE_BLUE | + A2XX_RB_COLOR_MASK_WRITE_ALPHA); + } else { + OUT_RING(ring, 0x0); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); + OUT_RING(ring, 0); + + if (is_a20x(batch->ctx->screen)) + return; + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); + OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ + OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, + 0xff000000 | A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); + OUT_RING(ring, 0x00000084); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000028f); } static void clear_state_restore(struct fd_context *ctx, struct fd_ringbuffer *ring) { - if (is_a20x(ctx->screen)) - return; + if (is_a20x(ctx->screen)) + return; - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); - OUT_RING(ring, 0x00000000); + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); + OUT_RING(ring, 0x00000000); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); - OUT_RING(ring, 0x00000000); + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL)); + OUT_RING(ring, 0x00000000); - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000003b); } static void clear_fast(struct fd_batch *batch, struct fd_ringbuffer *ring, - uint32_t color_clear, uint32_t depth_clear, unsigned patch_type) + uint32_t color_clear, uint32_t depth_clear, unsigned patch_type) { - BEGIN_RING(ring, 8); /* preallocate next 2 packets (for patching) */ - - /* zero values are patched in */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); - OUT_RINGP(ring, patch_type, &batch->gmem_patches); - - OUT_PKT3(ring, CP_SET_CONSTANT, 4); - OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); - OUT_RING(ring, 0x8000 | 32); - OUT_RING(ring, 0); - OUT_RING(ring, 0); - - /* set fill values */ - if (!is_a20x(batch->ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); - OUT_RING(ring, color_clear); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); - OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE | - 
A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); - OUT_RING(ring, depth_clear); - } else { - const float sc = 1.0f / 255.0f; - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, 0x00000480); - OUT_RING(ring, fui((float) (color_clear >> 0 & 0xff) * sc)); - OUT_RING(ring, fui((float) (color_clear >> 8 & 0xff) * sc)); - OUT_RING(ring, fui((float) (color_clear >> 16 & 0xff) * sc)); - OUT_RING(ring, fui((float) (color_clear >> 24 & 0xff) * sc)); - - // XXX if using float the rounding error breaks it.. - float depth = ((double) (depth_clear >> 8)) * (1.0/(double) 0xffffff); - assert((unsigned) (((double) depth * (double) 0xffffff)) == - (depth_clear >> 8)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE)); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(depth)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); - OUT_RING(ring, 0xff000000 | - A2XX_RB_STENCILREFMASK_BF_STENCILREF(depth_clear & 0xff) | - A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); - OUT_RING(ring, 0xff000000 | - A2XX_RB_STENCILREFMASK_STENCILREF(depth_clear & 0xff) | - A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); - } - - fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); + BEGIN_RING(ring, 8); /* preallocate next 2 packets (for patching) */ + + /* zero values are patched in */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); + OUT_RINGP(ring, patch_type, &batch->gmem_patches); + + OUT_PKT3(ring, CP_SET_CONSTANT, 4); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); + OUT_RING(ring, 0x8000 | 32); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + + /* set fill values */ + if (!is_a20x(batch->ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); + OUT_RING(ring, color_clear); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); + OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE | + A2XX_RB_COPY_CONTROL_CLEAR_MASK(0xf)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); + OUT_RING(ring, depth_clear); + } else { + const float sc = 1.0f / 255.0f; + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000480); + OUT_RING(ring, fui((float)(color_clear >> 0 & 0xff) * sc)); + OUT_RING(ring, fui((float)(color_clear >> 8 & 0xff) * sc)); + OUT_RING(ring, fui((float)(color_clear >> 16 & 0xff) * sc)); + OUT_RING(ring, fui((float)(color_clear >> 24 & 0xff) * sc)); + + // XXX if using float the rounding error breaks it.. 
+ float depth = ((double)(depth_clear >> 8)) * (1.0 / (double)0xffffff); + assert((unsigned)(((double)depth * (double)0xffffff)) == + (depth_clear >> 8)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(depth)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, + 0xff000000 | + A2XX_RB_STENCILREFMASK_BF_STENCILREF(depth_clear & 0xff) | + A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_STENCILREF(depth_clear & 0xff) | + A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + } + + fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); } static bool fd2_clear_fast(struct fd_context *ctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) - assert_dt + const union pipe_color_union *color, double depth, + unsigned stencil) assert_dt { - /* using 4x MSAA allows clearing ~2x faster - * then we can use higher bpp clearing to clear lower bpp - * 1 "pixel" can clear 64 bits (rgba8+depth24+stencil8) - * note: its possible to clear with 32_32_32_32 format but its not faster - * note: fast clear doesn't work with sysmem rendering - * (sysmem rendering is disabled when clear is used) - * - * we only have 16-bit / 32-bit color formats - * and 16-bit / 32-bit depth formats - * so there are only a few possible combinations - * - * if the bpp of the color/depth doesn't match - * we clear with depth/color individually - */ - struct fd2_context *fd2_ctx = fd2_context(ctx); - struct fd_batch *batch = ctx->batch; - struct fd_ringbuffer *ring = batch->draw; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - uint32_t color_clear = 0, depth_clear = 0; - enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); - int depth_size = -1; /* -1: no clear, 0: clear 16-bit, 1: clear 32-bit */ - int color_size = -1; - - /* TODO: need to test performance on a22x */ - if (!is_a20x(ctx->screen)) - return false; - - if (buffers & PIPE_CLEAR_COLOR) - color_size = util_format_get_blocksizebits(format) == 32; - - if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - /* no fast clear when clearing only one component of depth+stencil buffer */ - if (!(buffers & PIPE_CLEAR_DEPTH)) - return false; - - if ((pfb->zsbuf->format == PIPE_FORMAT_Z24_UNORM_S8_UINT || - pfb->zsbuf->format == PIPE_FORMAT_S8_UINT_Z24_UNORM) && - !(buffers & PIPE_CLEAR_STENCIL)) - return false; - - depth_size = fd_pipe2depth(pfb->zsbuf->format) == DEPTHX_24_8; - } - - assert(color_size >= 0 || depth_size >= 0); - - if (color_size == 0) { - color_clear = pack_rgba(format, color->f); - color_clear = (color_clear << 16) | (color_clear & 0xffff); - } else if (color_size == 1) { - color_clear = pack_rgba(format, color->f); - } - - if (depth_size == 0) { - depth_clear = (uint32_t)(0xffff * depth); - depth_clear |= depth_clear << 16; - } else if (depth_size == 1) { - depth_clear = (((uint32_t)(0xffffff * depth)) << 8); - depth_clear |= (stencil & 0xff); - } - - /* disable "window" scissor.. 
*/ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, xy2d(0, 0)); - OUT_RING(ring, xy2d(0x7fff, 0x7fff)); - - /* make sure we fill all "pixels" (in SCREEN_SCISSOR) */ - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); - OUT_RING(ring, fui(4096.0)); - OUT_RING(ring, fui(4096.0)); - OUT_RING(ring, fui(4096.0)); - OUT_RING(ring, fui(4096.0)); - - clear_state(batch, ring, ~0u, true); - - if (color_size >= 0 && depth_size != color_size) - clear_fast(batch, ring, color_clear, color_clear, GMEM_PATCH_FASTCLEAR_COLOR); - - if (depth_size >= 0 && depth_size != color_size) - clear_fast(batch, ring, depth_clear, depth_clear, GMEM_PATCH_FASTCLEAR_DEPTH); - - if (depth_size == color_size) - clear_fast(batch, ring, color_clear, depth_clear, GMEM_PATCH_FASTCLEAR_COLOR_DEPTH); - - clear_state_restore(ctx, ring); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); - OUT_RING(ring, 0); - - /* can't patch in SCREEN_SCISSOR_BR as it can be different for each tile. - * MEM_WRITE the value in tile_renderprep, and use CP_LOAD_CONSTANT_CONTEXT - * the value is read from byte offset 60 in the given bo - */ - OUT_PKT3(ring, CP_LOAD_CONSTANT_CONTEXT, 3); - OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 0, 0, 0); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); - OUT_RING(ring, 1); - - OUT_PKT3(ring, CP_SET_CONSTANT, 4); - OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); - OUT_RINGP(ring, GMEM_PATCH_RESTORE_INFO, &batch->gmem_patches); - OUT_RING(ring, 0); - OUT_RING(ring, 0); - return true; + /* using 4x MSAA allows clearing ~2x faster + * then we can use higher bpp clearing to clear lower bpp + * 1 "pixel" can clear 64 bits (rgba8+depth24+stencil8) + * note: its possible to clear with 32_32_32_32 format but its not faster + * note: fast clear doesn't work with sysmem rendering + * (sysmem rendering is disabled when clear is used) + * + * we only have 16-bit / 32-bit color formats + * and 16-bit / 32-bit depth formats + * so there are only a few possible combinations + * + * if the bpp of the color/depth doesn't match + * we clear with depth/color individually + */ + struct fd2_context *fd2_ctx = fd2_context(ctx); + struct fd_batch *batch = ctx->batch; + struct fd_ringbuffer *ring = batch->draw; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + uint32_t color_clear = 0, depth_clear = 0; + enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); + int depth_size = -1; /* -1: no clear, 0: clear 16-bit, 1: clear 32-bit */ + int color_size = -1; + + /* TODO: need to test performance on a22x */ + if (!is_a20x(ctx->screen)) + return false; + + if (buffers & PIPE_CLEAR_COLOR) + color_size = util_format_get_blocksizebits(format) == 32; + + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + /* no fast clear when clearing only one component of depth+stencil buffer */ + if (!(buffers & PIPE_CLEAR_DEPTH)) + return false; + + if ((pfb->zsbuf->format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + pfb->zsbuf->format == PIPE_FORMAT_S8_UINT_Z24_UNORM) && + !(buffers & PIPE_CLEAR_STENCIL)) + return false; + + depth_size = fd_pipe2depth(pfb->zsbuf->format) == DEPTHX_24_8; + } + + assert(color_size >= 0 || depth_size >= 0); + + if (color_size == 0) { + color_clear = pack_rgba(format, color->f); + color_clear = (color_clear << 16) | (color_clear & 0xffff); + } else if (color_size == 1) { + color_clear = pack_rgba(format, color->f); + } + + if 
(depth_size == 0) { + depth_clear = (uint32_t)(0xffff * depth); + depth_clear |= depth_clear << 16; + } else if (depth_size == 1) { + depth_clear = (((uint32_t)(0xffffff * depth)) << 8); + depth_clear |= (stencil & 0xff); + } + + /* disable "window" scissor.. */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, xy2d(0, 0)); + OUT_RING(ring, xy2d(0x7fff, 0x7fff)); + + /* make sure we fill all "pixels" (in SCREEN_SCISSOR) */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui(4096.0)); + OUT_RING(ring, fui(4096.0)); + OUT_RING(ring, fui(4096.0)); + OUT_RING(ring, fui(4096.0)); + + clear_state(batch, ring, ~0u, true); + + if (color_size >= 0 && depth_size != color_size) + clear_fast(batch, ring, color_clear, color_clear, + GMEM_PATCH_FASTCLEAR_COLOR); + + if (depth_size >= 0 && depth_size != color_size) + clear_fast(batch, ring, depth_clear, depth_clear, + GMEM_PATCH_FASTCLEAR_DEPTH); + + if (depth_size == color_size) + clear_fast(batch, ring, color_clear, depth_clear, + GMEM_PATCH_FASTCLEAR_COLOR_DEPTH); + + clear_state_restore(ctx, ring); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); + OUT_RING(ring, 0); + + /* can't patch in SCREEN_SCISSOR_BR as it can be different for each tile. + * MEM_WRITE the value in tile_renderprep, and use CP_LOAD_CONSTANT_CONTEXT + * the value is read from byte offset 60 in the given bo + */ + OUT_PKT3(ring, CP_LOAD_CONSTANT_CONTEXT, 3); + OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 0, 0, 0); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_BR)); + OUT_RING(ring, 1); + + OUT_PKT3(ring, CP_SET_CONSTANT, 4); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); + OUT_RINGP(ring, GMEM_PATCH_RESTORE_INFO, &batch->gmem_patches); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + return true; } static bool fd2_clear(struct fd_context *ctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) - assert_dt + const union pipe_color_union *color, double depth, + unsigned stencil) assert_dt { - struct fd_ringbuffer *ring = ctx->batch->draw; - struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer; - - if (fd2_clear_fast(ctx, buffers, color, depth, stencil)) - goto dirty; - - /* set clear value */ - if (is_a20x(ctx->screen)) { - if (buffers & PIPE_CLEAR_COLOR) { - /* C0 used by fragment shader */ - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, 0x00000480); - OUT_RING(ring, color->ui[0]); - OUT_RING(ring, color->ui[1]); - OUT_RING(ring, color->ui[2]); - OUT_RING(ring, color->ui[3]); - } - - if (buffers & PIPE_CLEAR_DEPTH) { - /* use viewport to set depth value */ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE)); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(depth)); - } - - if (buffers & PIPE_CLEAR_STENCIL) { - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); - OUT_RING(ring, 0xff000000 | - A2XX_RB_STENCILREFMASK_BF_STENCILREF(stencil) | - A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); - OUT_RING(ring, 0xff000000 | - A2XX_RB_STENCILREFMASK_STENCILREF(stencil) | - A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); - } - } else { - if (buffers & PIPE_CLEAR_COLOR) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); - OUT_RING(ring, pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f)); - } - - if (buffers & (PIPE_CLEAR_DEPTH | 
PIPE_CLEAR_STENCIL)) { - uint32_t clear_mask, depth_clear; - switch (fd_pipe2depth(fb->zsbuf->format)) { - case DEPTHX_24_8: - clear_mask = ((buffers & PIPE_CLEAR_DEPTH) ? 0xe : 0) | - ((buffers & PIPE_CLEAR_STENCIL) ? 0x1 : 0); - depth_clear = (((uint32_t)(0xffffff * depth)) << 8) | - (stencil & 0xff); - break; - case DEPTHX_16: - clear_mask = 0xf; - depth_clear = (uint32_t)(0xffffffff * depth); - break; - default: - unreachable("invalid depth"); - break; - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); - OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE | - A2XX_RB_COPY_CONTROL_CLEAR_MASK(clear_mask)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); - OUT_RING(ring, depth_clear); - } - } - - /* scissor state */ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, xy2d(0, 0)); - OUT_RING(ring, xy2d(fb->width, fb->height)); - - /* viewport state */ - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); - OUT_RING(ring, fui((float) fb->width / 2.0)); - OUT_RING(ring, fui((float) fb->width / 2.0)); - OUT_RING(ring, fui((float) fb->height / 2.0)); - OUT_RING(ring, fui((float) fb->height / 2.0)); - - /* common state */ - clear_state(ctx->batch, ring, buffers, false); - - fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); - - clear_state_restore(ctx, ring); + struct fd_ringbuffer *ring = ctx->batch->draw; + struct pipe_framebuffer_state *fb = &ctx->batch->framebuffer; + + if (fd2_clear_fast(ctx, buffers, color, depth, stencil)) + goto dirty; + + /* set clear value */ + if (is_a20x(ctx->screen)) { + if (buffers & PIPE_CLEAR_COLOR) { + /* C0 used by fragment shader */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000480); + OUT_RING(ring, color->ui[0]); + OUT_RING(ring, color->ui[1]); + OUT_RING(ring, color->ui[2]); + OUT_RING(ring, color->ui[3]); + } + + if (buffers & PIPE_CLEAR_DEPTH) { + /* use viewport to set depth value */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_ZSCALE)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(depth)); + } + + if (buffers & PIPE_CLEAR_STENCIL) { + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_BF_STENCILREF(stencil) | + A2XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | + A2XX_RB_STENCILREFMASK_STENCILREF(stencil) | + A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + } + } else { + if (buffers & PIPE_CLEAR_COLOR) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR)); + OUT_RING(ring, pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f)); + } + + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + uint32_t clear_mask, depth_clear; + switch (fd_pipe2depth(fb->zsbuf->format)) { + case DEPTHX_24_8: + clear_mask = ((buffers & PIPE_CLEAR_DEPTH) ? 0xe : 0) | + ((buffers & PIPE_CLEAR_STENCIL) ? 
0x1 : 0); + depth_clear = + (((uint32_t)(0xffffff * depth)) << 8) | (stencil & 0xff); + break; + case DEPTHX_16: + clear_mask = 0xf; + depth_clear = (uint32_t)(0xffffffff * depth); + break; + default: + unreachable("invalid depth"); + break; + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); + OUT_RING(ring, A2XX_RB_COPY_CONTROL_DEPTH_CLEAR_ENABLE | + A2XX_RB_COPY_CONTROL_CLEAR_MASK(clear_mask)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTH_CLEAR)); + OUT_RING(ring, depth_clear); + } + } + + /* scissor state */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, xy2d(0, 0)); + OUT_RING(ring, xy2d(fb->width, fb->height)); + + /* viewport state */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui((float)fb->width / 2.0)); + OUT_RING(ring, fui((float)fb->width / 2.0)); + OUT_RING(ring, fui((float)fb->height / 2.0)); + OUT_RING(ring, fui((float)fb->height / 2.0)); + + /* common state */ + clear_state(ctx->batch, ring, buffers, false); + + fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); + + clear_state_restore(ctx, ring); dirty: - ctx->dirty |= FD_DIRTY_ZSA | - FD_DIRTY_VIEWPORT | - FD_DIRTY_RASTERIZER | - FD_DIRTY_SAMPLE_MASK | - FD_DIRTY_PROG | - FD_DIRTY_CONST | - FD_DIRTY_BLEND | - FD_DIRTY_FRAMEBUFFER | - FD_DIRTY_SCISSOR; - - ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG; - ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST; - - return true; + ctx->dirty |= FD_DIRTY_ZSA | FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | + FD_DIRTY_SAMPLE_MASK | FD_DIRTY_PROG | FD_DIRTY_CONST | + FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR; + + ctx->dirty_shader[PIPE_SHADER_VERTEX] |= FD_DIRTY_SHADER_PROG; + ctx->dirty_shader[PIPE_SHADER_FRAGMENT] |= + FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST; + + return true; } void -fd2_draw_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd2_draw_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->draw_vbo = fd2_draw_vbo; - ctx->clear = fd2_clear; + struct fd_context *ctx = fd_context(pctx); + ctx->draw_vbo = fd2_draw_vbo; + ctx->clear = fd2_clear; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.h b/src/gallium/drivers/freedreno/a2xx/fd2_draw.h index c796475..fa5322e 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.h @@ -34,10 +34,10 @@ void fd2_draw_init(struct pipe_context *pctx); enum { - GMEM_PATCH_FASTCLEAR_COLOR, - GMEM_PATCH_FASTCLEAR_DEPTH, - GMEM_PATCH_FASTCLEAR_COLOR_DEPTH, - GMEM_PATCH_RESTORE_INFO, + GMEM_PATCH_FASTCLEAR_COLOR, + GMEM_PATCH_FASTCLEAR_DEPTH, + GMEM_PATCH_FASTCLEAR_COLOR_DEPTH, + GMEM_PATCH_RESTORE_INFO, }; #endif /* FD2_DRAW_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c index 848b1ec..ecd9333 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c @@ -25,15 +25,15 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" #include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_resource.h" -#include "fd2_emit.h" #include "fd2_blend.h" #include "fd2_context.h" 
+#include "fd2_emit.h" #include "fd2_program.h" #include "fd2_rasterizer.h" #include "fd2_texture.h" @@ -49,347 +49,353 @@ static void emit_constants(struct fd_ringbuffer *ring, uint32_t base, - struct fd_constbuf_stateobj *constbuf, - struct fd2_shader_stateobj *shader) + struct fd_constbuf_stateobj *constbuf, + struct fd2_shader_stateobj *shader) { - uint32_t enabled_mask = constbuf->enabled_mask; - uint32_t start_base = base; - unsigned i; - - /* emit user constants: */ - while (enabled_mask) { - unsigned index = ffs(enabled_mask) - 1; - struct pipe_constant_buffer *cb = &constbuf->cb[index]; - unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */ - - // I expect that size should be a multiple of vec4's: - assert(size == align(size, 4)); - - /* hmm, sometimes we still seem to end up with consts bound, - * even if shader isn't using them, which ends up overwriting - * const reg's used for immediates.. this is a hack to work - * around that: - */ - if (shader && ((base - start_base) >= (shader->first_immediate * 4))) - break; - - const uint32_t *dwords; - - if (cb->user_buffer) { - dwords = cb->user_buffer; - } else { - struct fd_resource *rsc = fd_resource(cb->buffer); - dwords = fd_bo_map(rsc->bo); - } - - dwords = (uint32_t *)(((uint8_t *)dwords) + cb->buffer_offset); - - OUT_PKT3(ring, CP_SET_CONSTANT, size + 1); - OUT_RING(ring, base); - for (i = 0; i < size; i++) - OUT_RING(ring, *(dwords++)); - - base += size; - enabled_mask &= ~(1 << index); - } - - /* emit shader immediates: */ - if (shader) { - for (i = 0; i < shader->num_immediates; i++) { - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, start_base + (4 * (shader->first_immediate + i))); - OUT_RING(ring, shader->immediates[i].val[0]); - OUT_RING(ring, shader->immediates[i].val[1]); - OUT_RING(ring, shader->immediates[i].val[2]); - OUT_RING(ring, shader->immediates[i].val[3]); - base += 4; - } - } + uint32_t enabled_mask = constbuf->enabled_mask; + uint32_t start_base = base; + unsigned i; + + /* emit user constants: */ + while (enabled_mask) { + unsigned index = ffs(enabled_mask) - 1; + struct pipe_constant_buffer *cb = &constbuf->cb[index]; + unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */ + + // I expect that size should be a multiple of vec4's: + assert(size == align(size, 4)); + + /* hmm, sometimes we still seem to end up with consts bound, + * even if shader isn't using them, which ends up overwriting + * const reg's used for immediates.. 
this is a hack to work + * around that: + */ + if (shader && ((base - start_base) >= (shader->first_immediate * 4))) + break; + + const uint32_t *dwords; + + if (cb->user_buffer) { + dwords = cb->user_buffer; + } else { + struct fd_resource *rsc = fd_resource(cb->buffer); + dwords = fd_bo_map(rsc->bo); + } + + dwords = (uint32_t *)(((uint8_t *)dwords) + cb->buffer_offset); + + OUT_PKT3(ring, CP_SET_CONSTANT, size + 1); + OUT_RING(ring, base); + for (i = 0; i < size; i++) + OUT_RING(ring, *(dwords++)); + + base += size; + enabled_mask &= ~(1 << index); + } + + /* emit shader immediates: */ + if (shader) { + for (i = 0; i < shader->num_immediates; i++) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, start_base + (4 * (shader->first_immediate + i))); + OUT_RING(ring, shader->immediates[i].val[0]); + OUT_RING(ring, shader->immediates[i].val[1]); + OUT_RING(ring, shader->immediates[i].val[2]); + OUT_RING(ring, shader->immediates[i].val[3]); + base += 4; + } + } } typedef uint32_t texmask; static texmask emit_texture(struct fd_ringbuffer *ring, struct fd_context *ctx, - struct fd_texture_stateobj *tex, unsigned samp_id, texmask emitted) + struct fd_texture_stateobj *tex, unsigned samp_id, texmask emitted) { - unsigned const_idx = fd2_get_const_idx(ctx, tex, samp_id); - static const struct fd2_sampler_stateobj dummy_sampler = {}; - static const struct fd2_pipe_sampler_view dummy_view = {}; - const struct fd2_sampler_stateobj *sampler; - const struct fd2_pipe_sampler_view *view; - struct fd_resource *rsc; - - if (emitted & (1 << const_idx)) - return 0; - - sampler = tex->samplers[samp_id] ? - fd2_sampler_stateobj(tex->samplers[samp_id]) : - &dummy_sampler; - view = tex->textures[samp_id] ? - fd2_pipe_sampler_view(tex->textures[samp_id]) : - &dummy_view; - - rsc = view->base.texture ? fd_resource(view->base.texture) : NULL; - - OUT_PKT3(ring, CP_SET_CONSTANT, 7); - OUT_RING(ring, 0x00010000 + (0x6 * const_idx)); - - OUT_RING(ring, sampler->tex0 | view->tex0); - if (rsc) - OUT_RELOC(ring, rsc->bo, fd_resource_offset(rsc, 0, 0), view->tex1, 0); - else - OUT_RING(ring, 0); - - OUT_RING(ring, view->tex2); - OUT_RING(ring, sampler->tex3 | view->tex3); - OUT_RING(ring, sampler->tex4 | view->tex4); - - if (rsc && rsc->b.b.last_level) - OUT_RELOC(ring, rsc->bo, fd_resource_offset(rsc, 1, 0), view->tex5, 0); - else - OUT_RING(ring, view->tex5); - - return (1 << const_idx); + unsigned const_idx = fd2_get_const_idx(ctx, tex, samp_id); + static const struct fd2_sampler_stateobj dummy_sampler = {}; + static const struct fd2_pipe_sampler_view dummy_view = {}; + const struct fd2_sampler_stateobj *sampler; + const struct fd2_pipe_sampler_view *view; + struct fd_resource *rsc; + + if (emitted & (1 << const_idx)) + return 0; + + sampler = tex->samplers[samp_id] + ? fd2_sampler_stateobj(tex->samplers[samp_id]) + : &dummy_sampler; + view = tex->textures[samp_id] ? fd2_pipe_sampler_view(tex->textures[samp_id]) + : &dummy_view; + + rsc = view->base.texture ? 
fd_resource(view->base.texture) : NULL; + + OUT_PKT3(ring, CP_SET_CONSTANT, 7); + OUT_RING(ring, 0x00010000 + (0x6 * const_idx)); + + OUT_RING(ring, sampler->tex0 | view->tex0); + if (rsc) + OUT_RELOC(ring, rsc->bo, fd_resource_offset(rsc, 0, 0), view->tex1, 0); + else + OUT_RING(ring, 0); + + OUT_RING(ring, view->tex2); + OUT_RING(ring, sampler->tex3 | view->tex3); + OUT_RING(ring, sampler->tex4 | view->tex4); + + if (rsc && rsc->b.b.last_level) + OUT_RELOC(ring, rsc->bo, fd_resource_offset(rsc, 1, 0), view->tex5, 0); + else + OUT_RING(ring, view->tex5); + + return (1 << const_idx); } static void emit_textures(struct fd_ringbuffer *ring, struct fd_context *ctx) { - struct fd_texture_stateobj *fragtex = &ctx->tex[PIPE_SHADER_FRAGMENT]; - struct fd_texture_stateobj *verttex = &ctx->tex[PIPE_SHADER_VERTEX]; - texmask emitted = 0; - unsigned i; - - for (i = 0; i < verttex->num_samplers; i++) - if (verttex->samplers[i]) - emitted |= emit_texture(ring, ctx, verttex, i, emitted); - - for (i = 0; i < fragtex->num_samplers; i++) - if (fragtex->samplers[i]) - emitted |= emit_texture(ring, ctx, fragtex, i, emitted); + struct fd_texture_stateobj *fragtex = &ctx->tex[PIPE_SHADER_FRAGMENT]; + struct fd_texture_stateobj *verttex = &ctx->tex[PIPE_SHADER_VERTEX]; + texmask emitted = 0; + unsigned i; + + for (i = 0; i < verttex->num_samplers; i++) + if (verttex->samplers[i]) + emitted |= emit_texture(ring, ctx, verttex, i, emitted); + + for (i = 0; i < fragtex->num_samplers; i++) + if (fragtex->samplers[i]) + emitted |= emit_texture(ring, ctx, fragtex, i, emitted); } void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, - struct fd2_vertex_buf *vbufs, uint32_t n) + struct fd2_vertex_buf *vbufs, uint32_t n) { - unsigned i; - - OUT_PKT3(ring, CP_SET_CONSTANT, 1 + (2 * n)); - OUT_RING(ring, (0x1 << 16) | (val & 0xffff)); - for (i = 0; i < n; i++) { - struct fd_resource *rsc = fd_resource(vbufs[i].prsc); - OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 3, 0); - OUT_RING (ring, vbufs[i].size); - } + unsigned i; + + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + (2 * n)); + OUT_RING(ring, (0x1 << 16) | (val & 0xffff)); + for (i = 0; i < n; i++) { + struct fd_resource *rsc = fd_resource(vbufs[i].prsc); + OUT_RELOC(ring, rsc->bo, vbufs[i].offset, 3, 0); + OUT_RING(ring, vbufs[i].size); + } } void -fd2_emit_state_binning(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) +fd2_emit_state_binning(struct fd_context *ctx, + const enum fd_dirty_3d_state dirty) { - struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); - struct fd_ringbuffer *ring = ctx->batch->binning; - - /* subset of fd2_emit_state needed for hw binning on a20x */ - - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE)) - fd2_program_emit(ctx, ring, &ctx->prog); - - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { - emit_constants(ring, VS_CONST_BASE * 4, - &ctx->constbuf[PIPE_SHADER_VERTEX], - (dirty & FD_DIRTY_PROG) ? 
ctx->prog.vs : NULL); - } - - if (dirty & FD_DIRTY_VIEWPORT) { - OUT_PKT3(ring, CP_SET_CONSTANT, 9); - OUT_RING(ring, 0x00000184); - OUT_RING(ring, fui(ctx->viewport.translate[0])); - OUT_RING(ring, fui(ctx->viewport.translate[1])); - OUT_RING(ring, fui(ctx->viewport.translate[2])); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(ctx->viewport.scale[0])); - OUT_RING(ring, fui(ctx->viewport.scale[1])); - OUT_RING(ring, fui(ctx->viewport.scale[2])); - OUT_RING(ring, fui(0.0f)); - } - - /* not sure why this is needed */ - if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, blend->rb_blendcontrol); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); - OUT_RING(ring, blend->rb_colormask); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); - OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE); + struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); + struct fd_ringbuffer *ring = ctx->batch->binning; + + /* subset of fd2_emit_state needed for hw binning on a20x */ + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE)) + fd2_program_emit(ctx, ring, &ctx->prog); + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { + emit_constants(ring, VS_CONST_BASE * 4, + &ctx->constbuf[PIPE_SHADER_VERTEX], + (dirty & FD_DIRTY_PROG) ? ctx->prog.vs : NULL); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + OUT_PKT3(ring, CP_SET_CONSTANT, 9); + OUT_RING(ring, 0x00000184); + OUT_RING(ring, fui(ctx->viewport.translate[0])); + OUT_RING(ring, fui(ctx->viewport.translate[1])); + OUT_RING(ring, fui(ctx->viewport.translate[2])); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(ctx->viewport.scale[0])); + OUT_RING(ring, fui(ctx->viewport.scale[1])); + OUT_RING(ring, fui(ctx->viewport.scale[2])); + OUT_RING(ring, fui(0.0f)); + } + + /* not sure why this is needed */ + if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); + OUT_RING(ring, blend->rb_blendcontrol); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); + OUT_RING(ring, blend->rb_colormask); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); + OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_FACE_KILL_ENABLE); } void fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) { - struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); - struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa); - struct fd2_shader_stateobj *fs = ctx->prog.fs; - struct fd_ringbuffer *ring = ctx->batch->draw; - - /* NOTE: we probably want to eventually refactor this so each state - * object handles emitting it's own state.. although the mapping of - * state to registers is not always orthogonal, sometimes a single - * register contains bitfields coming from multiple state objects, - * so not sure the best way to deal with that yet. 
- */ - - if (dirty & FD_DIRTY_SAMPLE_MASK) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); - OUT_RING(ring, ctx->sample_mask); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF | FD_DIRTY_PROG)) { - struct pipe_stencil_ref *sr = &ctx->stencil_ref; - uint32_t val = zsa->rb_depthcontrol; - - if (fs->has_kill) - val &= ~A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE; - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); - OUT_RING(ring, val); - - OUT_PKT3(ring, CP_SET_CONSTANT, 4); - OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); - OUT_RING(ring, zsa->rb_stencilrefmask_bf | - A2XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[1])); - OUT_RING(ring, zsa->rb_stencilrefmask | - A2XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); - OUT_RING(ring, zsa->rb_alpha_ref); - } - - if (ctx->rasterizer && dirty & FD_DIRTY_RASTERIZER) { - struct fd2_rasterizer_stateobj *rasterizer = - fd2_rasterizer_stateobj(ctx->rasterizer); - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); - OUT_RING(ring, rasterizer->pa_cl_clip_cntl); - OUT_RING(ring, rasterizer->pa_su_sc_mode_cntl | - A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POINT_SIZE)); - OUT_RING(ring, rasterizer->pa_su_point_size); - OUT_RING(ring, rasterizer->pa_su_point_minmax); - OUT_RING(ring, rasterizer->pa_su_line_cntl); - OUT_RING(ring, rasterizer->pa_sc_line_stipple); - - OUT_PKT3(ring, CP_SET_CONSTANT, 6); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_VTX_CNTL)); - OUT_RING(ring, rasterizer->pa_su_vtx_cntl); - OUT_RING(ring, fui(1.0)); /* PA_CL_GB_VERT_CLIP_ADJ */ - OUT_RING(ring, fui(1.0)); /* PA_CL_GB_VERT_DISC_ADJ */ - OUT_RING(ring, fui(1.0)); /* PA_CL_GB_HORZ_CLIP_ADJ */ - OUT_RING(ring, fui(1.0)); /* PA_CL_GB_HORZ_DISC_ADJ */ - - if (rasterizer->base.offset_tri) { - /* TODO: why multiply scale by 2 ? 
without it deqp test fails - * deqp/piglit tests aren't very precise - */ - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POLY_OFFSET_FRONT_SCALE)); - OUT_RING(ring, fui(rasterizer->base.offset_scale * 2.0f)); /* FRONT_SCALE */ - OUT_RING(ring, fui(rasterizer->base.offset_units)); /* FRONT_OFFSET */ - OUT_RING(ring, fui(rasterizer->base.offset_scale * 2.0f)); /* BACK_SCALE */ - OUT_RING(ring, fui(rasterizer->base.offset_units)); /* BACK_OFFSET */ - } - } - - /* NOTE: scissor enabled bit is part of rasterizer state: */ - if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) { - struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, xy2d(scissor->minx, /* PA_SC_WINDOW_SCISSOR_TL */ - scissor->miny)); - OUT_RING(ring, xy2d(scissor->maxx, /* PA_SC_WINDOW_SCISSOR_BR */ - scissor->maxy)); - - ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, scissor->minx); - ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, scissor->miny); - ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); - ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); - } - - if (dirty & FD_DIRTY_VIEWPORT) { - OUT_PKT3(ring, CP_SET_CONSTANT, 7); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); - OUT_RING(ring, fui(ctx->viewport.scale[0])); /* PA_CL_VPORT_XSCALE */ - OUT_RING(ring, fui(ctx->viewport.translate[0])); /* PA_CL_VPORT_XOFFSET */ - OUT_RING(ring, fui(ctx->viewport.scale[1])); /* PA_CL_VPORT_YSCALE */ - OUT_RING(ring, fui(ctx->viewport.translate[1])); /* PA_CL_VPORT_YOFFSET */ - OUT_RING(ring, fui(ctx->viewport.scale[2])); /* PA_CL_VPORT_ZSCALE */ - OUT_RING(ring, fui(ctx->viewport.translate[2])); /* PA_CL_VPORT_ZOFFSET */ - - /* set viewport in C65/C66, for a20x hw binning and fragcoord.z */ - OUT_PKT3(ring, CP_SET_CONSTANT, 9); - OUT_RING(ring, 0x00000184); - - OUT_RING(ring, fui(ctx->viewport.translate[0])); - OUT_RING(ring, fui(ctx->viewport.translate[1])); - OUT_RING(ring, fui(ctx->viewport.translate[2])); - OUT_RING(ring, fui(0.0f)); - - OUT_RING(ring, fui(ctx->viewport.scale[0])); - OUT_RING(ring, fui(ctx->viewport.scale[1])); - OUT_RING(ring, fui(ctx->viewport.scale[2])); - OUT_RING(ring, fui(0.0f)); - } - - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) - fd2_program_emit(ctx, ring, &ctx->prog); - - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { - emit_constants(ring, VS_CONST_BASE * 4, - &ctx->constbuf[PIPE_SHADER_VERTEX], - (dirty & FD_DIRTY_PROG) ? ctx->prog.vs : NULL); - emit_constants(ring, PS_CONST_BASE * 4, - &ctx->constbuf[PIPE_SHADER_FRAGMENT], - (dirty & FD_DIRTY_PROG) ? 
ctx->prog.fs : NULL); - } - - if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_ZSA)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); - OUT_RING(ring, zsa->rb_colorcontrol | blend->rb_colorcontrol); - } - - if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, blend->rb_blendcontrol); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); - OUT_RING(ring, blend->rb_colormask); - } - - if (dirty & FD_DIRTY_BLEND_COLOR) { - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_RED)); - OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[0])); - OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[1])); - OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[2])); - OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[3])); - } - - if (dirty & (FD_DIRTY_TEX | FD_DIRTY_PROG)) - emit_textures(ring, ctx); + struct fd2_blend_stateobj *blend = fd2_blend_stateobj(ctx->blend); + struct fd2_zsa_stateobj *zsa = fd2_zsa_stateobj(ctx->zsa); + struct fd2_shader_stateobj *fs = ctx->prog.fs; + struct fd_ringbuffer *ring = ctx->batch->draw; + + /* NOTE: we probably want to eventually refactor this so each state + * object handles emitting it's own state.. although the mapping of + * state to registers is not always orthogonal, sometimes a single + * register contains bitfields coming from multiple state objects, + * so not sure the best way to deal with that yet. + */ + + if (dirty & FD_DIRTY_SAMPLE_MASK) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); + OUT_RING(ring, ctx->sample_mask); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF | FD_DIRTY_PROG)) { + struct pipe_stencil_ref *sr = &ctx->stencil_ref; + uint32_t val = zsa->rb_depthcontrol; + + if (fs->has_kill) + val &= ~A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE; + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); + OUT_RING(ring, val); + + OUT_PKT3(ring, CP_SET_CONSTANT, 4); + OUT_RING(ring, CP_REG(REG_A2XX_RB_STENCILREFMASK_BF)); + OUT_RING(ring, zsa->rb_stencilrefmask_bf | + A2XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[1])); + OUT_RING(ring, zsa->rb_stencilrefmask | + A2XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); + OUT_RING(ring, zsa->rb_alpha_ref); + } + + if (ctx->rasterizer && dirty & FD_DIRTY_RASTERIZER) { + struct fd2_rasterizer_stateobj *rasterizer = + fd2_rasterizer_stateobj(ctx->rasterizer); + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); + OUT_RING(ring, rasterizer->pa_cl_clip_cntl); + OUT_RING(ring, rasterizer->pa_su_sc_mode_cntl | + A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE); + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POINT_SIZE)); + OUT_RING(ring, rasterizer->pa_su_point_size); + OUT_RING(ring, rasterizer->pa_su_point_minmax); + OUT_RING(ring, rasterizer->pa_su_line_cntl); + OUT_RING(ring, rasterizer->pa_sc_line_stipple); + + OUT_PKT3(ring, CP_SET_CONSTANT, 6); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_VTX_CNTL)); + OUT_RING(ring, rasterizer->pa_su_vtx_cntl); + OUT_RING(ring, fui(1.0)); /* PA_CL_GB_VERT_CLIP_ADJ */ + OUT_RING(ring, fui(1.0)); /* PA_CL_GB_VERT_DISC_ADJ */ + OUT_RING(ring, fui(1.0)); /* PA_CL_GB_HORZ_CLIP_ADJ */ + OUT_RING(ring, fui(1.0)); /* PA_CL_GB_HORZ_DISC_ADJ */ + + if (rasterizer->base.offset_tri) { + /* TODO: why multiply scale by 2 ? 
without it deqp test fails + * deqp/piglit tests aren't very precise + */ + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_POLY_OFFSET_FRONT_SCALE)); + OUT_RING(ring, + fui(rasterizer->base.offset_scale * 2.0f)); /* FRONT_SCALE */ + OUT_RING(ring, fui(rasterizer->base.offset_units)); /* FRONT_OFFSET */ + OUT_RING(ring, + fui(rasterizer->base.offset_scale * 2.0f)); /* BACK_SCALE */ + OUT_RING(ring, fui(rasterizer->base.offset_units)); /* BACK_OFFSET */ + } + } + + /* NOTE: scissor enabled bit is part of rasterizer state: */ + if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) { + struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, xy2d(scissor->minx, /* PA_SC_WINDOW_SCISSOR_TL */ + scissor->miny)); + OUT_RING(ring, xy2d(scissor->maxx, /* PA_SC_WINDOW_SCISSOR_BR */ + scissor->maxy)); + + ctx->batch->max_scissor.minx = + MIN2(ctx->batch->max_scissor.minx, scissor->minx); + ctx->batch->max_scissor.miny = + MIN2(ctx->batch->max_scissor.miny, scissor->miny); + ctx->batch->max_scissor.maxx = + MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); + ctx->batch->max_scissor.maxy = + MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + OUT_PKT3(ring, CP_SET_CONSTANT, 7); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui(ctx->viewport.scale[0])); /* PA_CL_VPORT_XSCALE */ + OUT_RING(ring, fui(ctx->viewport.translate[0])); /* PA_CL_VPORT_XOFFSET */ + OUT_RING(ring, fui(ctx->viewport.scale[1])); /* PA_CL_VPORT_YSCALE */ + OUT_RING(ring, fui(ctx->viewport.translate[1])); /* PA_CL_VPORT_YOFFSET */ + OUT_RING(ring, fui(ctx->viewport.scale[2])); /* PA_CL_VPORT_ZSCALE */ + OUT_RING(ring, fui(ctx->viewport.translate[2])); /* PA_CL_VPORT_ZOFFSET */ + + /* set viewport in C65/C66, for a20x hw binning and fragcoord.z */ + OUT_PKT3(ring, CP_SET_CONSTANT, 9); + OUT_RING(ring, 0x00000184); + + OUT_RING(ring, fui(ctx->viewport.translate[0])); + OUT_RING(ring, fui(ctx->viewport.translate[1])); + OUT_RING(ring, fui(ctx->viewport.translate[2])); + OUT_RING(ring, fui(0.0f)); + + OUT_RING(ring, fui(ctx->viewport.scale[0])); + OUT_RING(ring, fui(ctx->viewport.scale[1])); + OUT_RING(ring, fui(ctx->viewport.scale[2])); + OUT_RING(ring, fui(0.0f)); + } + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) + fd2_program_emit(ctx, ring, &ctx->prog); + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) { + emit_constants(ring, VS_CONST_BASE * 4, + &ctx->constbuf[PIPE_SHADER_VERTEX], + (dirty & FD_DIRTY_PROG) ? ctx->prog.vs : NULL); + emit_constants(ring, PS_CONST_BASE * 4, + &ctx->constbuf[PIPE_SHADER_FRAGMENT], + (dirty & FD_DIRTY_PROG) ? 
ctx->prog.fs : NULL); + } + + if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_ZSA)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); + OUT_RING(ring, zsa->rb_colorcontrol | blend->rb_colorcontrol); + } + + if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); + OUT_RING(ring, blend->rb_blendcontrol); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); + OUT_RING(ring, blend->rb_colormask); + } + + if (dirty & FD_DIRTY_BLEND_COLOR) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_RED)); + OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[0])); + OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[1])); + OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[2])); + OUT_RING(ring, float_to_ubyte(ctx->blend_color.color[3])); + } + + if (dirty & (FD_DIRTY_TEX | FD_DIRTY_PROG)) + emit_textures(ring, ctx); } /* emit per-context initialization: @@ -397,177 +403,175 @@ fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring) { - if (is_a20x(ctx->screen)) { - OUT_PKT0(ring, REG_A2XX_RB_BC_CONTROL, 1); - OUT_RING(ring, - A2XX_RB_BC_CONTROL_ACCUM_TIMEOUT_SELECT(3) | - A2XX_RB_BC_CONTROL_DISABLE_LZ_NULL_ZCMD_DROP | - A2XX_RB_BC_CONTROL_ENABLE_CRC_UPDATE | - A2XX_RB_BC_CONTROL_ACCUM_DATA_FIFO_LIMIT(8) | - A2XX_RB_BC_CONTROL_MEM_EXPORT_TIMEOUT_SELECT(3)); - - /* not sure why this is required */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_VIZ_QUERY)); - OUT_RING(ring, A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID(16)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x00000002); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_OUT_DEALLOC_CNTL)); - OUT_RING(ring, 0x00000002); - } else { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); - } - - /* enable perfcntrs */ - OUT_PKT0(ring, REG_A2XX_CP_PERFMON_CNTL, 1); - OUT_RING(ring, COND(FD_DBG(PERFC), 1)); - - /* note: perfcntrs don't work without the PM_OVERRIDE bit */ - OUT_PKT0(ring, REG_A2XX_RBBM_PM_OVERRIDE1, 2); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0x00000fff); - - OUT_PKT0(ring, REG_A2XX_TP0_CHICKEN, 1); - OUT_RING(ring, 0x00000002); - - OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); - OUT_RING(ring, 0x00007fff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_VS_CONST)); - OUT_RING(ring, A2XX_SQ_VS_CONST_BASE(VS_CONST_BASE) | - A2XX_SQ_VS_CONST_SIZE(0x100)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_PS_CONST)); - OUT_RING(ring, A2XX_SQ_PS_CONST_BASE(PS_CONST_BASE) | - A2XX_SQ_PS_CONST_SIZE(0xe0)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); - OUT_RING(ring, 0xffffffff); /* VGT_MAX_VTX_INDX */ - OUT_RING(ring, 0x00000000); /* VGT_MIN_VTX_INDX */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, 0x00000000); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC)); - OUT_RING(ring, A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_INTERPOLATOR_CNTL)); - OUT_RING(ring, 0xffffffff); - - 
OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); - OUT_RING(ring, 0x00000000); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_LINE_CNTL)); - OUT_RING(ring, 0x00000000); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); - OUT_RING(ring, 0x00000000); - - // XXX we change this dynamically for draw/clear.. vs gmem<->mem.. - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_MODECONTROL)); - OUT_RING(ring, A2XX_RB_MODECONTROL_EDRAM_MODE(COLOR_DEPTH)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_SAMPLE_POS)); - OUT_RING(ring, 0x88888888); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_DEST_MASK)); - OUT_RING(ring, 0xffffffff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_DEST_INFO)); - OUT_RING(ring, A2XX_RB_COPY_DEST_INFO_FORMAT(COLORX_4_4_4_4) | - A2XX_RB_COPY_DEST_INFO_WRITE_RED | - A2XX_RB_COPY_DEST_INFO_WRITE_GREEN | - A2XX_RB_COPY_DEST_INFO_WRITE_BLUE | - A2XX_RB_COPY_DEST_INFO_WRITE_ALPHA); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_WRAPPING_0)); - OUT_RING(ring, 0x00000000); /* SQ_WRAPPING_0 */ - OUT_RING(ring, 0x00000000); /* SQ_WRAPPING_1 */ - - OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT3(ring, CP_WAIT_REG_EQ, 4); - OUT_RING(ring, 0x000005d0); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x5f601000); - OUT_RING(ring, 0x00000001); - - OUT_PKT0(ring, REG_A2XX_SQ_INST_STORE_MANAGMENT, 1); - OUT_RING(ring, 0x00000180); - - OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); - OUT_RING(ring, 0x00000300); - - OUT_PKT3(ring, CP_SET_SHADER_BASES, 1); - OUT_RING(ring, 0x80000180); - - /* not sure what this form of CP_SET_CONSTANT is.. 
*/ - OUT_PKT3(ring, CP_SET_CONSTANT, 13); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x469c4000); - OUT_RING(ring, 0x3f800000); - OUT_RING(ring, 0x3f000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x40000000); - OUT_RING(ring, 0x3f400000); - OUT_RING(ring, 0x3ec00000); - OUT_RING(ring, 0x3e800000); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); - OUT_RING(ring, A2XX_RB_COLOR_MASK_WRITE_RED | - A2XX_RB_COLOR_MASK_WRITE_GREEN | - A2XX_RB_COLOR_MASK_WRITE_BLUE | - A2XX_RB_COLOR_MASK_WRITE_ALPHA); - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_RED)); - OUT_RING(ring, 0x00000000); /* RB_BLEND_RED */ - OUT_RING(ring, 0x00000000); /* RB_BLEND_GREEN */ - OUT_RING(ring, 0x00000000); /* RB_BLEND_BLUE */ - OUT_RING(ring, 0x000000ff); /* RB_BLEND_ALPHA */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VTE_CNTL)); - OUT_RING(ring, A2XX_PA_CL_VTE_CNTL_VTX_W0_FMT | - A2XX_PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Z_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Z_OFFSET_ENA); + if (is_a20x(ctx->screen)) { + OUT_PKT0(ring, REG_A2XX_RB_BC_CONTROL, 1); + OUT_RING(ring, A2XX_RB_BC_CONTROL_ACCUM_TIMEOUT_SELECT(3) | + A2XX_RB_BC_CONTROL_DISABLE_LZ_NULL_ZCMD_DROP | + A2XX_RB_BC_CONTROL_ENABLE_CRC_UPDATE | + A2XX_RB_BC_CONTROL_ACCUM_DATA_FIFO_LIMIT(8) | + A2XX_RB_BC_CONTROL_MEM_EXPORT_TIMEOUT_SELECT(3)); + + /* not sure why this is required */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_VIZ_QUERY)); + OUT_RING(ring, A2XX_PA_SC_VIZ_QUERY_VIZ_QUERY_ID(16)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x00000002); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_OUT_DEALLOC_CNTL)); + OUT_RING(ring, 0x00000002); + } else { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000003b); + } + + /* enable perfcntrs */ + OUT_PKT0(ring, REG_A2XX_CP_PERFMON_CNTL, 1); + OUT_RING(ring, COND(FD_DBG(PERFC), 1)); + + /* note: perfcntrs don't work without the PM_OVERRIDE bit */ + OUT_PKT0(ring, REG_A2XX_RBBM_PM_OVERRIDE1, 2); + OUT_RING(ring, 0xffffffff); + OUT_RING(ring, 0x00000fff); + + OUT_PKT0(ring, REG_A2XX_TP0_CHICKEN, 1); + OUT_RING(ring, 0x00000002); + + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00007fff); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_VS_CONST)); + OUT_RING(ring, A2XX_SQ_VS_CONST_BASE(VS_CONST_BASE) | + A2XX_SQ_VS_CONST_SIZE(0x100)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_PS_CONST)); + OUT_RING(ring, + A2XX_SQ_PS_CONST_BASE(PS_CONST_BASE) | A2XX_SQ_PS_CONST_SIZE(0xe0)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); + OUT_RING(ring, 0xffffffff); /* VGT_MAX_VTX_INDX */ + OUT_RING(ring, 0x00000000); /* VGT_MIN_VTX_INDX */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); + OUT_RING(ring, 0x00000000); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC)); + OUT_RING(ring, 
A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_INTERPOLATOR_CNTL)); + OUT_RING(ring, 0xffffffff); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_CONFIG)); + OUT_RING(ring, 0x00000000); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_LINE_CNTL)); + OUT_RING(ring, 0x00000000); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); + OUT_RING(ring, 0x00000000); + + // XXX we change this dynamically for draw/clear.. vs gmem<->mem.. + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_MODECONTROL)); + OUT_RING(ring, A2XX_RB_MODECONTROL_EDRAM_MODE(COLOR_DEPTH)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SAMPLE_POS)); + OUT_RING(ring, 0x88888888); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_DEST_MASK)); + OUT_RING(ring, 0xffffffff); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_DEST_INFO)); + OUT_RING(ring, A2XX_RB_COPY_DEST_INFO_FORMAT(COLORX_4_4_4_4) | + A2XX_RB_COPY_DEST_INFO_WRITE_RED | + A2XX_RB_COPY_DEST_INFO_WRITE_GREEN | + A2XX_RB_COPY_DEST_INFO_WRITE_BLUE | + A2XX_RB_COPY_DEST_INFO_WRITE_ALPHA); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_WRAPPING_0)); + OUT_RING(ring, 0x00000000); /* SQ_WRAPPING_0 */ + OUT_RING(ring, 0x00000000); /* SQ_WRAPPING_1 */ + + OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT3(ring, CP_WAIT_REG_EQ, 4); + OUT_RING(ring, 0x000005d0); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x5f601000); + OUT_RING(ring, 0x00000001); + + OUT_PKT0(ring, REG_A2XX_SQ_INST_STORE_MANAGMENT, 1); + OUT_RING(ring, 0x00000180); + + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00000300); + + OUT_PKT3(ring, CP_SET_SHADER_BASES, 1); + OUT_RING(ring, 0x80000180); + + /* not sure what this form of CP_SET_CONSTANT is.. 
*/ + OUT_PKT3(ring, CP_SET_CONSTANT, 13); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x469c4000); + OUT_RING(ring, 0x3f800000); + OUT_RING(ring, 0x3f000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x40000000); + OUT_RING(ring, 0x3f400000); + OUT_RING(ring, 0x3ec00000); + OUT_RING(ring, 0x3e800000); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK)); + OUT_RING(ring, + A2XX_RB_COLOR_MASK_WRITE_RED | A2XX_RB_COLOR_MASK_WRITE_GREEN | + A2XX_RB_COLOR_MASK_WRITE_BLUE | A2XX_RB_COLOR_MASK_WRITE_ALPHA); + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_RED)); + OUT_RING(ring, 0x00000000); /* RB_BLEND_RED */ + OUT_RING(ring, 0x00000000); /* RB_BLEND_GREEN */ + OUT_RING(ring, 0x00000000); /* RB_BLEND_BLUE */ + OUT_RING(ring, 0x000000ff); /* RB_BLEND_ALPHA */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VTE_CNTL)); + OUT_RING(ring, A2XX_PA_CL_VTE_CNTL_VTX_W0_FMT | + A2XX_PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Z_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Z_OFFSET_ENA); } void fd2_emit_init_screen(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->emit_ib = fd2_emit_ib; + struct fd_screen *screen = fd_screen(pscreen); + screen->emit_ib = fd2_emit_ib; } void diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h index 2d15cb9..c818947 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.h @@ -34,14 +34,16 @@ struct fd_ringbuffer; struct fd2_vertex_buf { - unsigned offset, size; - struct pipe_resource *prsc; + unsigned offset, size; + struct pipe_resource *prsc; }; void fd2_emit_vertex_bufs(struct fd_ringbuffer *ring, uint32_t val, - struct fd2_vertex_buf *vbufs, uint32_t n); -void fd2_emit_state_binning(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) assert_dt; -void fd2_emit_state(struct fd_context *ctx, const enum fd_dirty_3d_state dirty) assert_dt; + struct fd2_vertex_buf *vbufs, uint32_t n); +void fd2_emit_state_binning(struct fd_context *ctx, + const enum fd_dirty_3d_state dirty) assert_dt; +void fd2_emit_state(struct fd_context *ctx, + const enum fd_dirty_3d_state dirty) assert_dt; void fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring); void fd2_emit_init_screen(struct pipe_screen *pscreen); @@ -50,7 +52,7 @@ void fd2_emit_init(struct pipe_context *pctx); static inline void fd2_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) { - __OUT_IB(ring, false, target); + __OUT_IB(ring, false, target); } #endif /* FD2_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c index 4588808..c427bd1 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c @@ -25,732 +25,742 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" #include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_draw.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd2_gmem.h" +#include "ir2/instr-a2xx.h" #include 
"fd2_context.h" +#include "fd2_draw.h" #include "fd2_emit.h" +#include "fd2_gmem.h" #include "fd2_program.h" #include "fd2_util.h" #include "fd2_zsa.h" -#include "fd2_draw.h" -#include "ir2/instr-a2xx.h" -static uint32_t fmt2swap(enum pipe_format format) +static uint32_t +fmt2swap(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_B5G5R5X1_UNORM: - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B4G4R4X4_UNORM: - case PIPE_FORMAT_B2G3R3_UNORM: - return 1; - default: - return 0; - } + switch (format) { + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_B8G8R8X8_UNORM: + case PIPE_FORMAT_B5G6R5_UNORM: + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_B5G5R5X1_UNORM: + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + case PIPE_FORMAT_B2G3R3_UNORM: + return 1; + default: + return 0; + } } static bool use_hw_binning(struct fd_batch *batch) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - - /* we hardcoded a limit of 8 "pipes", we can increase this limit - * at the cost of a slightly larger command stream - * however very few cases will need more than 8 - * gmem->num_vsc_pipes == 0 means empty batch (TODO: does it still happen?) - */ - if (gmem->num_vsc_pipes > 8 || !gmem->num_vsc_pipes) - return false; - - /* only a20x hw binning is implement - * a22x is more like a3xx, but perhaps the a20x works? (TODO) - */ - if (!is_a20x(batch->ctx->screen)) - return false; - - return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + + /* we hardcoded a limit of 8 "pipes", we can increase this limit + * at the cost of a slightly larger command stream + * however very few cases will need more than 8 + * gmem->num_vsc_pipes == 0 means empty batch (TODO: does it still happen?) + */ + if (gmem->num_vsc_pipes > 8 || !gmem->num_vsc_pipes) + return false; + + /* only a20x hw binning is implement + * a22x is more like a3xx, but perhaps the a20x works? (TODO) + */ + if (!is_a20x(batch->ctx->screen)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); } /* transfer from gmem to system memory (ie. 
normal RAM) */ static void emit_gmem2mem_surf(struct fd_batch *batch, uint32_t base, - struct pipe_surface *psurf) + struct pipe_surface *psurf) { - struct fd_ringbuffer *ring = batch->tile_fini; - struct fd_resource *rsc = fd_resource(psurf->texture); - uint32_t offset = - fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); - enum pipe_format format = fd_gmem_restore_format(psurf->format); - uint32_t pitch = fdl2_pitch_pixels(&rsc->layout, psurf->u.tex.level); - - assert((pitch & 31) == 0); - assert((offset & 0xfff) == 0); - - if (!rsc->valid) - return; - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RING(ring, A2XX_RB_COLOR_INFO_BASE(base) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); - OUT_RING(ring, 0x00000000); /* RB_COPY_CONTROL */ - OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, pitch >> 5); /* RB_COPY_DEST_PITCH */ - OUT_RING(ring, /* RB_COPY_DEST_INFO */ - A2XX_RB_COPY_DEST_INFO_FORMAT(fd2_pipe2color(format)) | - COND(!rsc->layout.tile_mode, A2XX_RB_COPY_DEST_INFO_LINEAR) | - A2XX_RB_COPY_DEST_INFO_WRITE_RED | - A2XX_RB_COPY_DEST_INFO_WRITE_GREEN | - A2XX_RB_COPY_DEST_INFO_WRITE_BLUE | - A2XX_RB_COPY_DEST_INFO_WRITE_ALPHA); - - if (!is_a20x(batch->ctx->screen)) { - OUT_WFI (ring); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); - OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ - OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - } - - fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); + struct fd_ringbuffer *ring = batch->tile_fini; + struct fd_resource *rsc = fd_resource(psurf->texture); + uint32_t offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + enum pipe_format format = fd_gmem_restore_format(psurf->format); + uint32_t pitch = fdl2_pitch_pixels(&rsc->layout, psurf->u.tex.level); + + assert((pitch & 31) == 0); + assert((offset & 0xfff) == 0); + + if (!rsc->valid) + return; + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); + OUT_RING(ring, A2XX_RB_COLOR_INFO_BASE(base) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_CONTROL)); + OUT_RING(ring, 0x00000000); /* RB_COPY_CONTROL */ + OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ + OUT_RING(ring, pitch >> 5); /* RB_COPY_DEST_PITCH */ + OUT_RING(ring, /* RB_COPY_DEST_INFO */ + A2XX_RB_COPY_DEST_INFO_FORMAT(fd2_pipe2color(format)) | + COND(!rsc->layout.tile_mode, A2XX_RB_COPY_DEST_INFO_LINEAR) | + A2XX_RB_COPY_DEST_INFO_WRITE_RED | + A2XX_RB_COPY_DEST_INFO_WRITE_GREEN | + A2XX_RB_COPY_DEST_INFO_WRITE_BLUE | + A2XX_RB_COPY_DEST_INFO_WRITE_ALPHA); + + if (!is_a20x(batch->ctx->screen)) { + OUT_WFI(ring); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); + OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ + OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ + } + + fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); } static void -prepare_tile_fini_ib(struct fd_batch *batch) - assert_dt +prepare_tile_fini_ib(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd2_context *fd2_ctx = fd2_context(ctx); - const struct fd_gmem_stateobj *gmem = 
batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring; - - batch->tile_fini = fd_submit_new_ringbuffer(batch->submit, 0x1000, - FD_RINGBUFFER_STREAMING); - ring = batch->tile_fini; - - fd2_emit_vertex_bufs(ring, 0x9c, (struct fd2_vertex_buf[]) { - { .prsc = fd2_ctx->solid_vertexbuf, .size = 36 }, - }, 1); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); - OUT_RING(ring, 0x00000000); /* PA_SC_WINDOW_OFFSET */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, 0); - - if (!is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000028f); - } - - fd2_program_emit(ctx, ring, &ctx->solid_prog); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); - OUT_RING(ring, 0x0000ffff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); - OUT_RING(ring, A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); - OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | /* PA_SU_SC_MODE_CNTL */ - A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, xy2d(0, 0)); /* PA_SC_WINDOW_SCISSOR_TL */ - OUT_RING(ring, xy2d(pfb->width, pfb->height)); /* PA_SC_WINDOW_SCISSOR_BR */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); - OUT_RING(ring, 0x00000000); - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); - OUT_RING(ring, fui((float) gmem->bin_w / 2.0)); /* XSCALE */ - OUT_RING(ring, fui((float) gmem->bin_w / 2.0)); /* XOFFSET */ - OUT_RING(ring, fui((float) gmem->bin_h / 2.0)); /* YSCALE */ - OUT_RING(ring, fui((float) gmem->bin_h / 2.0)); /* YOFFSET */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_MODECONTROL)); - OUT_RING(ring, A2XX_RB_MODECONTROL_EDRAM_MODE(EDRAM_COPY)); - - if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) - emit_gmem2mem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf); - - if (batch->resolve & FD_BUFFER_COLOR) - emit_gmem2mem_surf(batch, gmem->cbuf_base[0], pfb->cbufs[0]); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_MODECONTROL)); - OUT_RING(ring, A2XX_RB_MODECONTROL_EDRAM_MODE(COLOR_DEPTH)); - - if (!is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x0000003b); - } + struct fd_context *ctx = batch->ctx; + struct fd2_context *fd2_ctx = fd2_context(ctx); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring; + + batch->tile_fini = + fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); + ring = batch->tile_fini; + + fd2_emit_vertex_bufs(ring, 0x9c, + (struct fd2_vertex_buf[]){ + {.prsc = fd2_ctx->solid_vertexbuf, .size = 36}, + }, + 1); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); + OUT_RING(ring, 0x00000000); /* PA_SC_WINDOW_OFFSET */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, 
CP_REG(REG_A2XX_VGT_INDX_OFFSET)); + OUT_RING(ring, 0); + + if (!is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000028f); + } + + fd2_program_emit(ctx, ring, &ctx->solid_prog); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); + OUT_RING(ring, 0x0000ffff); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); + OUT_RING(ring, A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); + OUT_RING( + ring, + A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | /* PA_SU_SC_MODE_CNTL */ + A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, xy2d(0, 0)); /* PA_SC_WINDOW_SCISSOR_TL */ + OUT_RING(ring, xy2d(pfb->width, pfb->height)); /* PA_SC_WINDOW_SCISSOR_BR */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); + OUT_RING(ring, 0x00000000); + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui((float)gmem->bin_w / 2.0)); /* XSCALE */ + OUT_RING(ring, fui((float)gmem->bin_w / 2.0)); /* XOFFSET */ + OUT_RING(ring, fui((float)gmem->bin_h / 2.0)); /* YSCALE */ + OUT_RING(ring, fui((float)gmem->bin_h / 2.0)); /* YOFFSET */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_MODECONTROL)); + OUT_RING(ring, A2XX_RB_MODECONTROL_EDRAM_MODE(EDRAM_COPY)); + + if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) + emit_gmem2mem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf); + + if (batch->resolve & FD_BUFFER_COLOR) + emit_gmem2mem_surf(batch, gmem->cbuf_base[0], pfb->cbufs[0]); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_MODECONTROL)); + OUT_RING(ring, A2XX_RB_MODECONTROL_EDRAM_MODE(COLOR_DEPTH)); + + if (!is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x0000003b); + } } static void fd2_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { - fd2_emit_ib(batch->gmem, batch->tile_fini); + fd2_emit_ib(batch->gmem, batch->tile_fini); } /* transfer from system memory to gmem */ static void emit_mem2gmem_surf(struct fd_batch *batch, uint32_t base, - struct pipe_surface *psurf) + struct pipe_surface *psurf) { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_resource *rsc = fd_resource(psurf->texture); - uint32_t offset = - fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); - enum pipe_format format = fd_gmem_restore_format(psurf->format); - - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RING(ring, A2XX_RB_COLOR_INFO_BASE(base) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); - - /* emit fb as a texture: */ - OUT_PKT3(ring, CP_SET_CONSTANT, 7); - OUT_RING(ring, 0x00010000); - OUT_RING(ring, A2XX_SQ_TEX_0_CLAMP_X(SQ_TEX_WRAP) | - A2XX_SQ_TEX_0_CLAMP_Y(SQ_TEX_WRAP) | - A2XX_SQ_TEX_0_CLAMP_Z(SQ_TEX_WRAP) | - A2XX_SQ_TEX_0_PITCH(fdl2_pitch_pixels(&rsc->layout, psurf->u.tex.level))); - OUT_RELOC(ring, rsc->bo, offset, - A2XX_SQ_TEX_1_FORMAT(fd2_pipe2surface(format).format) | - A2XX_SQ_TEX_1_CLAMP_POLICY(SQ_TEX_CLAMP_POLICY_OGL), 0); - OUT_RING(ring, 
A2XX_SQ_TEX_2_WIDTH(psurf->width - 1) | - A2XX_SQ_TEX_2_HEIGHT(psurf->height - 1)); - OUT_RING(ring, A2XX_SQ_TEX_3_MIP_FILTER(SQ_TEX_FILTER_BASEMAP) | - A2XX_SQ_TEX_3_SWIZ_X(0) | - A2XX_SQ_TEX_3_SWIZ_Y(1) | - A2XX_SQ_TEX_3_SWIZ_Z(2) | - A2XX_SQ_TEX_3_SWIZ_W(3) | - A2XX_SQ_TEX_3_XY_MAG_FILTER(SQ_TEX_FILTER_POINT) | - A2XX_SQ_TEX_3_XY_MIN_FILTER(SQ_TEX_FILTER_POINT)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, A2XX_SQ_TEX_5_DIMENSION(SQ_TEX_DIMENSION_2D)); - - if (!is_a20x(batch->ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); - OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ - OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ - } - - fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); + struct fd_ringbuffer *ring = batch->gmem; + struct fd_resource *rsc = fd_resource(psurf->texture); + uint32_t offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + enum pipe_format format = fd_gmem_restore_format(psurf->format); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); + OUT_RING(ring, A2XX_RB_COLOR_INFO_BASE(base) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); + + /* emit fb as a texture: */ + OUT_PKT3(ring, CP_SET_CONSTANT, 7); + OUT_RING(ring, 0x00010000); + OUT_RING(ring, A2XX_SQ_TEX_0_CLAMP_X(SQ_TEX_WRAP) | + A2XX_SQ_TEX_0_CLAMP_Y(SQ_TEX_WRAP) | + A2XX_SQ_TEX_0_CLAMP_Z(SQ_TEX_WRAP) | + A2XX_SQ_TEX_0_PITCH( + fdl2_pitch_pixels(&rsc->layout, psurf->u.tex.level))); + OUT_RELOC(ring, rsc->bo, offset, + A2XX_SQ_TEX_1_FORMAT(fd2_pipe2surface(format).format) | + A2XX_SQ_TEX_1_CLAMP_POLICY(SQ_TEX_CLAMP_POLICY_OGL), + 0); + OUT_RING(ring, A2XX_SQ_TEX_2_WIDTH(psurf->width - 1) | + A2XX_SQ_TEX_2_HEIGHT(psurf->height - 1)); + OUT_RING(ring, A2XX_SQ_TEX_3_MIP_FILTER(SQ_TEX_FILTER_BASEMAP) | + A2XX_SQ_TEX_3_SWIZ_X(0) | A2XX_SQ_TEX_3_SWIZ_Y(1) | + A2XX_SQ_TEX_3_SWIZ_Z(2) | A2XX_SQ_TEX_3_SWIZ_W(3) | + A2XX_SQ_TEX_3_XY_MAG_FILTER(SQ_TEX_FILTER_POINT) | + A2XX_SQ_TEX_3_XY_MIN_FILTER(SQ_TEX_FILTER_POINT)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, A2XX_SQ_TEX_5_DIMENSION(SQ_TEX_DIMENSION_2D)); + + if (!is_a20x(batch->ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX)); + OUT_RING(ring, 3); /* VGT_MAX_VTX_INDX */ + OUT_RING(ring, 0); /* VGT_MIN_VTX_INDX */ + } + + fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL); } static void -fd2_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd2_emit_tile_mem2gmem(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd2_context *fd2_ctx = fd2_context(ctx); - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - unsigned bin_w = tile->bin_w; - unsigned bin_h = tile->bin_h; - float x0, y0, x1, y1; - - fd2_emit_vertex_bufs(ring, 0x9c, (struct fd2_vertex_buf[]) { - { .prsc = fd2_ctx->solid_vertexbuf, .size = 36 }, - { .prsc = fd2_ctx->solid_vertexbuf, .size = 24, .offset = 36 }, - }, 2); - - /* write texture coordinates to vertexbuf: */ - x0 = ((float)tile->xoff) / ((float)pfb->width); - x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); - y0 = ((float)tile->yoff) / ((float)pfb->height); - y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); - OUT_PKT3(ring, 
CP_MEM_WRITE, 7); - OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 36, 0, 0); - OUT_RING(ring, fui(x0)); - OUT_RING(ring, fui(y0)); - OUT_RING(ring, fui(x1)); - OUT_RING(ring, fui(y0)); - OUT_RING(ring, fui(x0)); - OUT_RING(ring, fui(y1)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); - OUT_RING(ring, 0); - - fd2_program_emit(ctx, ring, &ctx->blit_prog[0]); - - OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); - OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); - OUT_RING(ring, A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); - OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | - A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); - OUT_RING(ring, 0x0000ffff); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); - OUT_RING(ring, A2XX_RB_COLORCONTROL_ALPHA_FUNC(FUNC_ALWAYS) | - A2XX_RB_COLORCONTROL_BLEND_DISABLE | - A2XX_RB_COLORCONTROL_ROP_CODE(12) | - A2XX_RB_COLORCONTROL_DITHER_MODE(DITHER_DISABLE) | - A2XX_RB_COLORCONTROL_DITHER_TYPE(DITHER_PIXEL)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); - OUT_RING(ring, A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND(FACTOR_ONE) | - A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN(BLEND2_DST_PLUS_SRC) | - A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND(FACTOR_ZERO) | - A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND(FACTOR_ONE) | - A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN(BLEND2_DST_PLUS_SRC) | - A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND(FACTOR_ZERO)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); - OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_DISABLE | - xy2d(0,0)); /* PA_SC_WINDOW_SCISSOR_TL */ - OUT_RING(ring, xy2d(bin_w, bin_h)); /* PA_SC_WINDOW_SCISSOR_BR */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); - OUT_RING(ring, fui((float)bin_w/2.0)); /* PA_CL_VPORT_XSCALE */ - OUT_RING(ring, fui((float)bin_w/2.0)); /* PA_CL_VPORT_XOFFSET */ - OUT_RING(ring, fui(-(float)bin_h/2.0)); /* PA_CL_VPORT_YSCALE */ - OUT_RING(ring, fui((float)bin_h/2.0)); /* PA_CL_VPORT_YOFFSET */ - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VTE_CNTL)); - OUT_RING(ring, A2XX_PA_CL_VTE_CNTL_VTX_XY_FMT | - A2XX_PA_CL_VTE_CNTL_VTX_Z_FMT | // XXX check this??? 
- A2XX_PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); - OUT_RING(ring, 0x00000000); - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) - emit_mem2gmem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf); - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) - emit_mem2gmem_surf(batch, gmem->cbuf_base[0], pfb->cbufs[0]); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VTE_CNTL)); - OUT_RING(ring, A2XX_PA_CL_VTE_CNTL_VTX_W0_FMT | - A2XX_PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Z_SCALE_ENA | - A2XX_PA_CL_VTE_CNTL_VPORT_Z_OFFSET_ENA); - - /* TODO blob driver seems to toss in a CACHE_FLUSH after each DRAW_INDX.. */ + struct fd_context *ctx = batch->ctx; + struct fd2_context *fd2_ctx = fd2_context(ctx); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + unsigned bin_w = tile->bin_w; + unsigned bin_h = tile->bin_h; + float x0, y0, x1, y1; + + fd2_emit_vertex_bufs( + ring, 0x9c, + (struct fd2_vertex_buf[]){ + {.prsc = fd2_ctx->solid_vertexbuf, .size = 36}, + {.prsc = fd2_ctx->solid_vertexbuf, .size = 24, .offset = 36}, + }, + 2); + + /* write texture coordinates to vertexbuf: */ + x0 = ((float)tile->xoff) / ((float)pfb->width); + x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); + y0 = ((float)tile->yoff) / ((float)pfb->height); + y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); + OUT_PKT3(ring, CP_MEM_WRITE, 7); + OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 36, 0, 0); + OUT_RING(ring, fui(x0)); + OUT_RING(ring, fui(y0)); + OUT_RING(ring, fui(x1)); + OUT_RING(ring, fui(y0)); + OUT_RING(ring, fui(x0)); + OUT_RING(ring, fui(y1)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_INDX_OFFSET)); + OUT_RING(ring, 0); + + fd2_program_emit(ctx, ring, &ctx->blit_prog[0]); + + OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1); + OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_DEPTHCONTROL)); + OUT_RING(ring, A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SU_SC_MODE_CNTL)); + OUT_RING(ring, A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST | + A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(PC_DRAW_TRIANGLES)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK)); + OUT_RING(ring, 0x0000ffff); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL)); + OUT_RING(ring, A2XX_RB_COLORCONTROL_ALPHA_FUNC(FUNC_ALWAYS) | + A2XX_RB_COLORCONTROL_BLEND_DISABLE | + A2XX_RB_COLORCONTROL_ROP_CODE(12) | + A2XX_RB_COLORCONTROL_DITHER_MODE(DITHER_DISABLE) | + A2XX_RB_COLORCONTROL_DITHER_TYPE(DITHER_PIXEL)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL)); + OUT_RING(ring, A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND(FACTOR_ONE) | + A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN(BLEND2_DST_PLUS_SRC) | + A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND(FACTOR_ZERO) | + 
A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND(FACTOR_ONE) | + A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN(BLEND2_DST_PLUS_SRC) | + A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND(FACTOR_ZERO)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_SCISSOR_TL)); + OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_DISABLE | + xy2d(0, 0)); /* PA_SC_WINDOW_SCISSOR_TL */ + OUT_RING(ring, xy2d(bin_w, bin_h)); /* PA_SC_WINDOW_SCISSOR_BR */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VPORT_XSCALE)); + OUT_RING(ring, fui((float)bin_w / 2.0)); /* PA_CL_VPORT_XSCALE */ + OUT_RING(ring, fui((float)bin_w / 2.0)); /* PA_CL_VPORT_XOFFSET */ + OUT_RING(ring, fui(-(float)bin_h / 2.0)); /* PA_CL_VPORT_YSCALE */ + OUT_RING(ring, fui((float)bin_h / 2.0)); /* PA_CL_VPORT_YOFFSET */ + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VTE_CNTL)); + OUT_RING(ring, A2XX_PA_CL_VTE_CNTL_VTX_XY_FMT | + A2XX_PA_CL_VTE_CNTL_VTX_Z_FMT | // XXX check this??? + A2XX_PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_CLIP_CNTL)); + OUT_RING(ring, 0x00000000); + + if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) + emit_mem2gmem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf); + + if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) + emit_mem2gmem_surf(batch, gmem->cbuf_base[0], pfb->cbufs[0]); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_CL_VTE_CNTL)); + OUT_RING(ring, A2XX_PA_CL_VTE_CNTL_VTX_W0_FMT | + A2XX_PA_CL_VTE_CNTL_VPORT_X_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_X_OFFSET_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Y_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Z_SCALE_ENA | + A2XX_PA_CL_VTE_CNTL_VPORT_Z_OFFSET_ENA); + + /* TODO blob driver seems to toss in a CACHE_FLUSH after each DRAW_INDX.. 
*/ } static void patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode) { - unsigned i; - - if (!is_a20x(batch->ctx->screen)) { - /* identical to a3xx */ - for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); - *patch->cs = patch->val | DRAW(0, 0, 0, vismode, 0); - } - util_dynarray_clear(&batch->draw_patches); - return; - } - - if (vismode == USE_VISIBILITY) - return; - - for (i = 0; i < batch->draw_patches.size / sizeof(uint32_t*); i++) { - uint32_t *ptr = *util_dynarray_element(&batch->draw_patches, uint32_t*, i); - unsigned cnt = ptr[0] >> 16 & 0xfff; /* 5 with idx buffer, 3 without */ - - /* convert CP_DRAW_INDX_BIN to a CP_DRAW_INDX - * replace first two DWORDS with NOP and move the rest down - * (we don't want to have to move the idx buffer reloc) - */ - ptr[0] = CP_TYPE3_PKT | (CP_NOP << 8); - ptr[1] = 0x00000000; - - ptr[4] = ptr[2] & ~(1 << 14 | 1 << 15); /* remove cull_enable bits */ - ptr[2] = CP_TYPE3_PKT | ((cnt-2) << 16) | (CP_DRAW_INDX << 8); - ptr[3] = 0x00000000; - } + unsigned i; + + if (!is_a20x(batch->ctx->screen)) { + /* identical to a3xx */ + for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val | DRAW(0, 0, 0, vismode, 0); + } + util_dynarray_clear(&batch->draw_patches); + return; + } + + if (vismode == USE_VISIBILITY) + return; + + for (i = 0; i < batch->draw_patches.size / sizeof(uint32_t *); i++) { + uint32_t *ptr = + *util_dynarray_element(&batch->draw_patches, uint32_t *, i); + unsigned cnt = ptr[0] >> 16 & 0xfff; /* 5 with idx buffer, 3 without */ + + /* convert CP_DRAW_INDX_BIN to a CP_DRAW_INDX + * replace first two DWORDS with NOP and move the rest down + * (we don't want to have to move the idx buffer reloc) + */ + ptr[0] = CP_TYPE3_PKT | (CP_NOP << 8); + ptr[1] = 0x00000000; + + ptr[4] = ptr[2] & ~(1 << 14 | 1 << 15); /* remove cull_enable bits */ + ptr[2] = CP_TYPE3_PKT | ((cnt - 2) << 16) | (CP_DRAW_INDX << 8); + ptr[3] = 0x00000000; + } } static void fd2_emit_sysmem_prep(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct pipe_surface *psurf = pfb->cbufs[0]; - - if (!psurf) - return; - - struct fd_resource *rsc = fd_resource(psurf->texture); - uint32_t offset = - fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); - uint32_t pitch = fdl2_pitch_pixels(&rsc->layout, psurf->u.tex.level); - - assert((pitch & 31) == 0); - assert((offset & 0xfff) == 0); - - fd2_emit_restore(ctx, ring); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); - OUT_RING(ring, A2XX_RB_SURFACE_INFO_SURFACE_PITCH(pitch)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RELOC(ring, rsc->bo, offset, - COND(!rsc->layout.tile_mode, A2XX_RB_COLOR_INFO_LINEAR) | - A2XX_RB_COLOR_INFO_SWAP(fmt2swap(psurf->format)) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(psurf->format)), 0); - - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_TL)); - OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_TL_WINDOW_OFFSET_DISABLE); - OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(pfb->width) | - A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(pfb->height)); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); - 
OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(0) | - A2XX_PA_SC_WINDOW_OFFSET_Y(0)); - - patch_draws(batch, IGNORE_VISIBILITY); - util_dynarray_clear(&batch->draw_patches); - util_dynarray_clear(&batch->shader_patches); + struct fd_context *ctx = batch->ctx; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct pipe_surface *psurf = pfb->cbufs[0]; + + if (!psurf) + return; + + struct fd_resource *rsc = fd_resource(psurf->texture); + uint32_t offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + uint32_t pitch = fdl2_pitch_pixels(&rsc->layout, psurf->u.tex.level); + + assert((pitch & 31) == 0); + assert((offset & 0xfff) == 0); + + fd2_emit_restore(ctx, ring); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); + OUT_RING(ring, A2XX_RB_SURFACE_INFO_SURFACE_PITCH(pitch)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); + OUT_RELOC(ring, rsc->bo, offset, + COND(!rsc->layout.tile_mode, A2XX_RB_COLOR_INFO_LINEAR) | + A2XX_RB_COLOR_INFO_SWAP(fmt2swap(psurf->format)) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(psurf->format)), + 0); + + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_TL)); + OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_TL_WINDOW_OFFSET_DISABLE); + OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(pfb->width) | + A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(pfb->height)); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); + OUT_RING(ring, + A2XX_PA_SC_WINDOW_OFFSET_X(0) | A2XX_PA_SC_WINDOW_OFFSET_Y(0)); + + patch_draws(batch, IGNORE_VISIBILITY); + util_dynarray_clear(&batch->draw_patches); + util_dynarray_clear(&batch->shader_patches); } /* before first tile */ static void -fd2_emit_tile_init(struct fd_batch *batch) - assert_dt +fd2_emit_tile_init(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); - uint32_t reg; - - fd2_emit_restore(ctx, ring); - - prepare_tile_fini_ib(batch); - - OUT_PKT3(ring, CP_SET_CONSTANT, 4); - OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); - OUT_RING(ring, gmem->bin_w); /* RB_SURFACE_INFO */ - OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); - reg = A2XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); - if (pfb->zsbuf) - reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); - OUT_RING(ring, reg); /* RB_DEPTH_INFO */ - - /* fast clear patches */ - int depth_size = -1; - int color_size = -1; - - if (pfb->cbufs[0]) - color_size = util_format_get_blocksizebits(format) == 32 ? 4 : 2; - - if (pfb->zsbuf) - depth_size = fd_pipe2depth(pfb->zsbuf->format) == 1 ? 
4 : 2; - - for (int i = 0; i < fd_patch_num_elements(&batch->gmem_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->gmem_patches, i); - uint32_t color_base = 0, depth_base = gmem->zsbuf_base[0]; - uint32_t size, lines; - - /* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */ - switch (patch->val) { - case GMEM_PATCH_FASTCLEAR_COLOR: - size = align(gmem->bin_w * gmem->bin_h * color_size, 0x8000); - lines = size / 1024; - depth_base = size / 2; - break; - case GMEM_PATCH_FASTCLEAR_DEPTH: - size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x8000); - lines = size / 1024; - color_base = depth_base; - depth_base = depth_base + size / 2; - break; - case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH: - lines = align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x8000) / 1024; - break; - case GMEM_PATCH_RESTORE_INFO: - patch->cs[0] = gmem->bin_w; - patch->cs[1] = A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format)); - patch->cs[2] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); - if (pfb->zsbuf) - patch->cs[2] |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); - continue; - default: - continue; - } - - patch->cs[0] = A2XX_PA_SC_SCREEN_SCISSOR_BR_X(32) | - A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(lines); - patch->cs[4] = A2XX_RB_COLOR_INFO_BASE(color_base) | - A2XX_RB_COLOR_INFO_FORMAT(COLORX_8_8_8_8); - patch->cs[5] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base) | - A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(1); - } - util_dynarray_clear(&batch->gmem_patches); - - /* set to zero, for some reason hardware doesn't like certain values */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); - OUT_RING(ring, 0); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); - OUT_RING(ring, 0); - - if (use_hw_binning(batch)) { - /* patch out unneeded memory exports by changing EXEC CF to EXEC_END - * - * in the shader compiler, we guarantee that the shader ends with - * a specific pattern of ALLOC/EXEC CF pairs for the hw binning exports - * - * the since patches point only to dwords and CFs are 1.5 dwords - * the patch is aligned and might point to a ALLOC CF - */ - for (int i = 0; i < batch->shader_patches.size / sizeof(void*); i++) { - instr_cf_t *cf = - *util_dynarray_element(&batch->shader_patches, instr_cf_t*, i); - if (cf->opc == ALLOC) - cf++; - assert(cf->opc == EXEC); - assert(cf[ctx->screen->info.num_vsc_pipes*2-2].opc == EXEC_END); - cf[2*(gmem->num_vsc_pipes-1)].opc = EXEC_END; - } - - patch_draws(batch, USE_VISIBILITY); - - /* initialize shader constants for the binning memexport */ - OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 4); - OUT_RING(ring, 0x0000000C); - - for (int i = 0; i < gmem->num_vsc_pipes; i++) { - /* allocate in 64k increments to avoid reallocs */ - uint32_t bo_size = align(batch->num_vertices, 0x10000); - if (!ctx->vsc_pipe_bo[i] || fd_bo_size(ctx->vsc_pipe_bo[i]) < bo_size) { - if (ctx->vsc_pipe_bo[i]) - fd_bo_del(ctx->vsc_pipe_bo[i]); - ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, bo_size, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); - assert(ctx->vsc_pipe_bo[i]); - } - - /* memory export address (export32): - * .x: (base_address >> 2) | 0x40000000 (?) - * .y: index (float) - set by shader - * .z: 0x4B00D000 (?) - * .w: 0x4B000000 (?) | max_index (?) 
- */ - OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0x40000000, -2); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x4B00D000); - OUT_RING(ring, 0x4B000000 | bo_size); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 8); - OUT_RING(ring, 0x0000018C); - - for (int i = 0; i < gmem->num_vsc_pipes; i++) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - float off_x, off_y, mul_x, mul_y; - - /* const to tranform from [-1,1] to bin coordinates for this pipe - * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, etc - * 8 possible values on x/y axis, - * to clip at binning stage: only use center 6x6 - * TODO: set the z parameters too so that hw binning - * can clip primitives in Z too - */ - - mul_x = 1.0f / (float) (gmem->bin_w * 8); - mul_y = 1.0f / (float) (gmem->bin_h * 8); - off_x = -pipe->x * (1.0/8.0f) + 0.125f - mul_x * gmem->minx; - off_y = -pipe->y * (1.0/8.0f) + 0.125f - mul_y * gmem->miny; - - OUT_RING(ring, fui(off_x * (256.0f/255.0f))); - OUT_RING(ring, fui(off_y * (256.0f/255.0f))); - OUT_RING(ring, 0x3f000000); - OUT_RING(ring, fui(0.0f)); - - OUT_RING(ring, fui(mul_x * (256.0f/255.0f))); - OUT_RING(ring, fui(mul_y * (256.0f/255.0f))); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(0.0f)); - } - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0); - - fd2_emit_ib(ring, batch->binning); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); - OUT_RING(ring, 0x00000002); - } else { - patch_draws(batch, IGNORE_VISIBILITY); - } - - util_dynarray_clear(&batch->draw_patches); - util_dynarray_clear(&batch->shader_patches); + struct fd_context *ctx = batch->ctx; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); + uint32_t reg; + + fd2_emit_restore(ctx, ring); + + prepare_tile_fini_ib(batch); + + OUT_PKT3(ring, CP_SET_CONSTANT, 4); + OUT_RING(ring, CP_REG(REG_A2XX_RB_SURFACE_INFO)); + OUT_RING(ring, gmem->bin_w); /* RB_SURFACE_INFO */ + OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); + reg = A2XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); + if (pfb->zsbuf) + reg |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); + OUT_RING(ring, reg); /* RB_DEPTH_INFO */ + + /* fast clear patches */ + int depth_size = -1; + int color_size = -1; + + if (pfb->cbufs[0]) + color_size = util_format_get_blocksizebits(format) == 32 ? 4 : 2; + + if (pfb->zsbuf) + depth_size = fd_pipe2depth(pfb->zsbuf->format) == 1 ? 
4 : 2; + + for (int i = 0; i < fd_patch_num_elements(&batch->gmem_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->gmem_patches, i); + uint32_t color_base = 0, depth_base = gmem->zsbuf_base[0]; + uint32_t size, lines; + + /* note: 1 "line" is 512 bytes in both color/depth areas (1K total) */ + switch (patch->val) { + case GMEM_PATCH_FASTCLEAR_COLOR: + size = align(gmem->bin_w * gmem->bin_h * color_size, 0x8000); + lines = size / 1024; + depth_base = size / 2; + break; + case GMEM_PATCH_FASTCLEAR_DEPTH: + size = align(gmem->bin_w * gmem->bin_h * depth_size, 0x8000); + lines = size / 1024; + color_base = depth_base; + depth_base = depth_base + size / 2; + break; + case GMEM_PATCH_FASTCLEAR_COLOR_DEPTH: + lines = + align(gmem->bin_w * gmem->bin_h * color_size * 2, 0x8000) / 1024; + break; + case GMEM_PATCH_RESTORE_INFO: + patch->cs[0] = gmem->bin_w; + patch->cs[1] = A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format)); + patch->cs[2] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); + if (pfb->zsbuf) + patch->cs[2] |= A2XX_RB_DEPTH_INFO_DEPTH_FORMAT( + fd_pipe2depth(pfb->zsbuf->format)); + continue; + default: + continue; + } + + patch->cs[0] = A2XX_PA_SC_SCREEN_SCISSOR_BR_X(32) | + A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(lines); + patch->cs[4] = A2XX_RB_COLOR_INFO_BASE(color_base) | + A2XX_RB_COLOR_INFO_FORMAT(COLORX_8_8_8_8); + patch->cs[5] = A2XX_RB_DEPTH_INFO_DEPTH_BASE(depth_base) | + A2XX_RB_DEPTH_INFO_DEPTH_FORMAT(1); + } + util_dynarray_clear(&batch->gmem_patches); + + /* set to zero, for some reason hardware doesn't like certain values */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, 0); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, 0); + + if (use_hw_binning(batch)) { + /* patch out unneeded memory exports by changing EXEC CF to EXEC_END + * + * in the shader compiler, we guarantee that the shader ends with + * a specific pattern of ALLOC/EXEC CF pairs for the hw binning exports + * + * the since patches point only to dwords and CFs are 1.5 dwords + * the patch is aligned and might point to a ALLOC CF + */ + for (int i = 0; i < batch->shader_patches.size / sizeof(void *); i++) { + instr_cf_t *cf = + *util_dynarray_element(&batch->shader_patches, instr_cf_t *, i); + if (cf->opc == ALLOC) + cf++; + assert(cf->opc == EXEC); + assert(cf[ctx->screen->info.num_vsc_pipes * 2 - 2].opc == EXEC_END); + cf[2 * (gmem->num_vsc_pipes - 1)].opc = EXEC_END; + } + + patch_draws(batch, USE_VISIBILITY); + + /* initialize shader constants for the binning memexport */ + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 4); + OUT_RING(ring, 0x0000000C); + + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + /* allocate in 64k increments to avoid reallocs */ + uint32_t bo_size = align(batch->num_vertices, 0x10000); + if (!ctx->vsc_pipe_bo[i] || + fd_bo_size(ctx->vsc_pipe_bo[i]) < bo_size) { + if (ctx->vsc_pipe_bo[i]) + fd_bo_del(ctx->vsc_pipe_bo[i]); + ctx->vsc_pipe_bo[i] = + fd_bo_new(ctx->dev, bo_size, DRM_FREEDRENO_GEM_TYPE_KMEM, + "vsc_pipe[%u]", i); + assert(ctx->vsc_pipe_bo[i]); + } + + /* memory export address (export32): + * .x: (base_address >> 2) | 0x40000000 (?) + * .y: index (float) - set by shader + * .z: 0x4B00D000 (?) + * .w: 0x4B000000 (?) | max_index (?) 
+ */ + OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0x40000000, -2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x4B00D000); + OUT_RING(ring, 0x4B000000 | bo_size); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 1 + gmem->num_vsc_pipes * 8); + OUT_RING(ring, 0x0000018C); + + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + float off_x, off_y, mul_x, mul_y; + + /* const to tranform from [-1,1] to bin coordinates for this pipe + * for x/y, [0,256/2040] = 0, [256/2040,512/2040] = 1, etc + * 8 possible values on x/y axis, + * to clip at binning stage: only use center 6x6 + * TODO: set the z parameters too so that hw binning + * can clip primitives in Z too + */ + + mul_x = 1.0f / (float)(gmem->bin_w * 8); + mul_y = 1.0f / (float)(gmem->bin_h * 8); + off_x = -pipe->x * (1.0 / 8.0f) + 0.125f - mul_x * gmem->minx; + off_y = -pipe->y * (1.0 / 8.0f) + 0.125f - mul_y * gmem->miny; + + OUT_RING(ring, fui(off_x * (256.0f / 255.0f))); + OUT_RING(ring, fui(off_y * (256.0f / 255.0f))); + OUT_RING(ring, 0x3f000000); + OUT_RING(ring, fui(0.0f)); + + OUT_RING(ring, fui(mul_x * (256.0f / 255.0f))); + OUT_RING(ring, fui(mul_y * (256.0f / 255.0f))); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0); + + fd2_emit_ib(ring, batch->binning); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL)); + OUT_RING(ring, 0x00000002); + } else { + patch_draws(batch, IGNORE_VISIBILITY); + } + + util_dynarray_clear(&batch->draw_patches); + util_dynarray_clear(&batch->shader_patches); } /* before mem2gmem */ static void fd2_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(1) | /* RB_COLOR_INFO */ - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); - - /* setup screen scissor for current tile (same for mem2gmem): */ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_TL)); - OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_TL_X(0) | - A2XX_PA_SC_SCREEN_SCISSOR_TL_Y(0)); - OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(tile->bin_w) | - A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(tile->bin_h)); + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); + OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(1) | /* RB_COLOR_INFO */ + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); + + /* setup screen scissor for current tile (same for mem2gmem): */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_SCREEN_SCISSOR_TL)); + OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_TL_X(0) | + A2XX_PA_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(tile->bin_w) | + A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(tile->bin_h)); } /* before IB to rendering cmds: */ static void -fd2_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd2_emit_tile_renderprep(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct 
fd_context *ctx = batch->ctx; - struct fd2_context *fd2_ctx = fd2_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); - OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | - A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); - - /* setup window scissor and offset for current tile (different - * from mem2gmem): - */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); - OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) | - A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff)); - - /* write SCISSOR_BR to memory so fast clear path can restore from it */ - OUT_PKT3(ring, CP_MEM_WRITE, 2); - OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 60, 0, 0); - OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(tile->bin_w) | - A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(tile->bin_h)); - - /* set the copy offset for gmem2mem */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_DEST_OFFSET)); - OUT_RING(ring, A2XX_RB_COPY_DEST_OFFSET_X(tile->xoff) | - A2XX_RB_COPY_DEST_OFFSET_Y(tile->yoff)); - - /* tile offset for gl_FragCoord on a20x (C64 in fragment shader) */ - if (is_a20x(ctx->screen)) { - OUT_PKT3(ring, CP_SET_CONSTANT, 5); - OUT_RING(ring, 0x00000580); - OUT_RING(ring, fui(tile->xoff)); - OUT_RING(ring, fui(tile->yoff)); - OUT_RING(ring, fui(0.0f)); - OUT_RING(ring, fui(0.0f)); - } - - if (use_hw_binning(batch)) { - struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); - OUT_RING(ring, tile->n); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); - OUT_RING(ring, tile->n); - - /* TODO only emit this when tile->p changes */ - OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); - OUT_RELOC(ring, pipe_bo, 0, 0, 0); - } + struct fd_context *ctx = batch->ctx; + struct fd2_context *fd2_ctx = fd2_context(ctx); + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + enum pipe_format format = pipe_surface_format(pfb->cbufs[0]); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_INFO)); + OUT_RING(ring, A2XX_RB_COLOR_INFO_SWAP(fmt2swap(format)) | + A2XX_RB_COLOR_INFO_FORMAT(fd2_pipe2color(format))); + + /* setup window scissor and offset for current tile (different + * from mem2gmem): + */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_WINDOW_OFFSET)); + OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) | + A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff)); + + /* write SCISSOR_BR to memory so fast clear path can restore from it */ + OUT_PKT3(ring, CP_MEM_WRITE, 2); + OUT_RELOC(ring, fd_resource(fd2_ctx->solid_vertexbuf)->bo, 60, 0, 0); + OUT_RING(ring, A2XX_PA_SC_SCREEN_SCISSOR_BR_X(tile->bin_w) | + A2XX_PA_SC_SCREEN_SCISSOR_BR_Y(tile->bin_h)); + + /* set the copy offset for gmem2mem */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_RB_COPY_DEST_OFFSET)); + OUT_RING(ring, A2XX_RB_COPY_DEST_OFFSET_X(tile->xoff) | + A2XX_RB_COPY_DEST_OFFSET_Y(tile->yoff)); + + /* tile offset for gl_FragCoord on a20x (C64 in fragment shader) */ + if (is_a20x(ctx->screen)) { + OUT_PKT3(ring, CP_SET_CONSTANT, 5); + OUT_RING(ring, 0x00000580); + OUT_RING(ring, fui(tile->xoff)); + 
OUT_RING(ring, fui(tile->yoff)); + OUT_RING(ring, fui(0.0f)); + OUT_RING(ring, fui(0.0f)); + } + + if (use_hw_binning(batch)) { + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MIN)); + OUT_RING(ring, tile->n); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_VGT_CURRENT_BIN_ID_MAX)); + OUT_RING(ring, tile->n); + + /* TODO only emit this when tile->p changes */ + OUT_PKT3(ring, CP_SET_DRAW_INIT_FLAGS, 1); + OUT_RELOC(ring, pipe_bo, 0, 0, 0); + } } void -fd2_gmem_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd2_gmem_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - - ctx->emit_sysmem_prep = fd2_emit_sysmem_prep; - ctx->emit_tile_init = fd2_emit_tile_init; - ctx->emit_tile_prep = fd2_emit_tile_prep; - ctx->emit_tile_mem2gmem = fd2_emit_tile_mem2gmem; - ctx->emit_tile_renderprep = fd2_emit_tile_renderprep; - ctx->emit_tile_gmem2mem = fd2_emit_tile_gmem2mem; + struct fd_context *ctx = fd_context(pctx); + + ctx->emit_sysmem_prep = fd2_emit_sysmem_prep; + ctx->emit_tile_init = fd2_emit_tile_init; + ctx->emit_tile_prep = fd2_emit_tile_prep; + ctx->emit_tile_mem2gmem = fd2_emit_tile_mem2gmem; + ctx->emit_tile_renderprep = fd2_emit_tile_renderprep; + ctx->emit_tile_gmem2mem = fd2_emit_tile_gmem2mem; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c index c56ff89..46f0124 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c @@ -25,319 +25,321 @@ * Jonathan Marek */ +#include "nir/tgsi_to_nir.h" #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/format/u_format.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" -#include "nir/tgsi_to_nir.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_program.h" -#include "ir2.h" +#include "ir2/instr-a2xx.h" #include "fd2_program.h" #include "fd2_texture.h" #include "fd2_util.h" -#include "ir2/instr-a2xx.h" +#include "ir2.h" static struct fd2_shader_stateobj * create_shader(struct pipe_context *pctx, gl_shader_stage type) { - struct fd2_shader_stateobj *so = CALLOC_STRUCT(fd2_shader_stateobj); - if (!so) - return NULL; - so->type = type; - so->is_a20x = is_a20x(fd_context(pctx)->screen); - return so; + struct fd2_shader_stateobj *so = CALLOC_STRUCT(fd2_shader_stateobj); + if (!so) + return NULL; + so->type = type; + so->is_a20x = is_a20x(fd_context(pctx)->screen); + return so; } static void delete_shader(struct fd2_shader_stateobj *so) { - if (!so) - return; - ralloc_free(so->nir); - for (int i = 0; i < ARRAY_SIZE(so->variant); i++) - free(so->variant[i].info.dwords); - free(so); + if (!so) + return; + ralloc_free(so->nir); + for (int i = 0; i < ARRAY_SIZE(so->variant); i++) + free(so->variant[i].info.dwords); + free(so); } static void emit(struct fd_ringbuffer *ring, gl_shader_stage type, - struct ir2_shader_info *info, struct util_dynarray *patches) + struct ir2_shader_info *info, struct util_dynarray *patches) { - unsigned i; + unsigned i; - assert(info->sizedwords); + assert(info->sizedwords); - OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords); - OUT_RING(ring, type == MESA_SHADER_FRAGMENT); - OUT_RING(ring, info->sizedwords); + OUT_PKT3(ring, 
CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords); + OUT_RING(ring, type == MESA_SHADER_FRAGMENT); + OUT_RING(ring, info->sizedwords); - if (patches) - util_dynarray_append(patches, uint32_t*, &ring->cur[info->mem_export_ptr]); + if (patches) + util_dynarray_append(patches, uint32_t *, + &ring->cur[info->mem_export_ptr]); - for (i = 0; i < info->sizedwords; i++) - OUT_RING(ring, info->dwords[i]); + for (i = 0; i < info->sizedwords; i++) + OUT_RING(ring, info->dwords[i]); } static int ir2_glsl_type_size(const struct glsl_type *type, bool bindless) { - return glsl_count_attribute_slots(type, false); + return glsl_count_attribute_slots(type, false); } static void * fd2_fp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) + const struct pipe_shader_state *cso) { - struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_FRAGMENT); - if (!so) - return NULL; + struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_FRAGMENT); + if (!so) + return NULL; - so->nir = (cso->type == PIPE_SHADER_IR_NIR) ? cso->ir.nir : - tgsi_to_nir(cso->tokens, pctx->screen, false); + so->nir = (cso->type == PIPE_SHADER_IR_NIR) + ? cso->ir.nir + : tgsi_to_nir(cso->tokens, pctx->screen, false); - NIR_PASS_V(so->nir, nir_lower_io, - nir_var_shader_in | nir_var_shader_out, - ir2_glsl_type_size, (nir_lower_io_options)0); + NIR_PASS_V(so->nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + ir2_glsl_type_size, (nir_lower_io_options)0); - if (ir2_optimize_nir(so->nir, true)) - goto fail; + if (ir2_optimize_nir(so->nir, true)) + goto fail; - so->first_immediate = so->nir->num_uniforms; + so->first_immediate = so->nir->num_uniforms; - ir2_compile(so, 0, NULL); + ir2_compile(so, 0, NULL); - ralloc_free(so->nir); - so->nir = NULL; - return so; + ralloc_free(so->nir); + so->nir = NULL; + return so; fail: - delete_shader(so); - return NULL; + delete_shader(so); + return NULL; } static void fd2_fp_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd2_shader_stateobj *so = hwcso; - delete_shader(so); + struct fd2_shader_stateobj *so = hwcso; + delete_shader(so); } static void * fd2_vp_state_create(struct pipe_context *pctx, - const struct pipe_shader_state *cso) + const struct pipe_shader_state *cso) { - struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_VERTEX); - if (!so) - return NULL; + struct fd2_shader_stateobj *so = create_shader(pctx, MESA_SHADER_VERTEX); + if (!so) + return NULL; - so->nir = (cso->type == PIPE_SHADER_IR_NIR) ? cso->ir.nir : - tgsi_to_nir(cso->tokens, pctx->screen, false); + so->nir = (cso->type == PIPE_SHADER_IR_NIR) + ? 
cso->ir.nir + : tgsi_to_nir(cso->tokens, pctx->screen, false); - NIR_PASS_V(so->nir, nir_lower_io, - nir_var_shader_in | nir_var_shader_out, - ir2_glsl_type_size, (nir_lower_io_options)0); + NIR_PASS_V(so->nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + ir2_glsl_type_size, (nir_lower_io_options)0); - if (ir2_optimize_nir(so->nir, true)) - goto fail; + if (ir2_optimize_nir(so->nir, true)) + goto fail; - so->first_immediate = so->nir->num_uniforms; + so->first_immediate = so->nir->num_uniforms; - /* compile binning variant now */ - ir2_compile(so, 0, NULL); + /* compile binning variant now */ + ir2_compile(so, 0, NULL); - return so; + return so; fail: - delete_shader(so); - return NULL; + delete_shader(so); + return NULL; } static void fd2_vp_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd2_shader_stateobj *so = hwcso; - delete_shader(so); + struct fd2_shader_stateobj *so = hwcso; + delete_shader(so); } static void patch_vtx_fetch(struct fd_context *ctx, struct pipe_vertex_element *elem, - instr_fetch_vtx_t *instr, uint16_t dst_swiz) - assert_dt + instr_fetch_vtx_t *instr, uint16_t dst_swiz) assert_dt { - struct surface_format fmt = fd2_pipe2surface(elem->src_format); - - instr->dst_swiz = fd2_vtx_swiz(elem->src_format, dst_swiz); - instr->format_comp_all = fmt.sign == SQ_TEX_SIGN_SIGNED; - instr->num_format_all = fmt.num_format; - instr->format = fmt.format; - instr->exp_adjust_all = fmt.exp_adjust; - instr->stride = ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index].stride; - instr->offset = elem->src_offset; + struct surface_format fmt = fd2_pipe2surface(elem->src_format); + + instr->dst_swiz = fd2_vtx_swiz(elem->src_format, dst_swiz); + instr->format_comp_all = fmt.sign == SQ_TEX_SIGN_SIGNED; + instr->num_format_all = fmt.num_format; + instr->format = fmt.format; + instr->exp_adjust_all = fmt.exp_adjust; + instr->stride = ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index].stride; + instr->offset = elem->src_offset; } static void patch_fetches(struct fd_context *ctx, struct ir2_shader_info *info, - struct fd_vertex_stateobj *vtx, struct fd_texture_stateobj *tex) - assert_dt + struct fd_vertex_stateobj *vtx, + struct fd_texture_stateobj *tex) assert_dt { - for (int i = 0; i < info->num_fetch_instrs; i++) { - struct ir2_fetch_info *fi = &info->fetch_info[i]; - - instr_fetch_t *instr = (instr_fetch_t*) &info->dwords[fi->offset]; - if (instr->opc == VTX_FETCH) { - unsigned idx = (instr->vtx.const_index - 20) * 3 + - instr->vtx.const_index_sel; - patch_vtx_fetch(ctx, &vtx->pipe[idx], &instr->vtx, fi->vtx.dst_swiz); - continue; - } - - assert(instr->opc == TEX_FETCH); - instr->tex.const_idx = fd2_get_const_idx(ctx, tex, fi->tex.samp_id); - instr->tex.src_swiz = fi->tex.src_swiz; - } + for (int i = 0; i < info->num_fetch_instrs; i++) { + struct ir2_fetch_info *fi = &info->fetch_info[i]; + + instr_fetch_t *instr = (instr_fetch_t *)&info->dwords[fi->offset]; + if (instr->opc == VTX_FETCH) { + unsigned idx = + (instr->vtx.const_index - 20) * 3 + instr->vtx.const_index_sel; + patch_vtx_fetch(ctx, &vtx->pipe[idx], &instr->vtx, fi->vtx.dst_swiz); + continue; + } + + assert(instr->opc == TEX_FETCH); + instr->tex.const_idx = fd2_get_const_idx(ctx, tex, fi->tex.samp_id); + instr->tex.src_swiz = fi->tex.src_swiz; + } } void fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog) + struct fd_program_stateobj *prog) { - struct fd2_shader_stateobj *fp = NULL, *vp; - struct ir2_shader_info *fpi, *vpi; - struct ir2_frag_linkage *f; - 
uint8_t vs_gprs, fs_gprs = 0, vs_export = 0; - enum a2xx_sq_ps_vtx_mode mode = POSITION_1_VECTOR; - bool binning = (ctx->batch && ring == ctx->batch->binning); - unsigned variant = 0; - - vp = prog->vs; - - /* find variant matching the linked fragment shader */ - if (!binning) { - fp = prog->fs; - for (variant = 1; variant < ARRAY_SIZE(vp->variant); variant++) { - /* if checked all variants, compile a new variant */ - if (!vp->variant[variant].info.sizedwords) { - ir2_compile(vp, variant, fp); - break; - } - - /* check if fragment shader linkage matches */ - if (!memcmp(&vp->variant[variant].f, &fp->variant[0].f, - sizeof(struct ir2_frag_linkage))) - break; - } - assert(variant < ARRAY_SIZE(vp->variant)); - } - - vpi = &vp->variant[variant].info; - fpi = &fp->variant[0].info; - f = &fp->variant[0].f; - - /* clear/gmem2mem/mem2gmem need to be changed to remove this condition */ - if (prog != &ctx->solid_prog && prog != &ctx->blit_prog[0]) { - patch_fetches(ctx, vpi, ctx->vtx.vtx, &ctx->tex[PIPE_SHADER_VERTEX]); - if (fp) - patch_fetches(ctx, fpi, NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]); - } - - emit(ring, MESA_SHADER_VERTEX, vpi, - binning ? &ctx->batch->shader_patches : NULL); - - if (fp) { - emit(ring, MESA_SHADER_FRAGMENT, fpi, NULL); - fs_gprs = (fpi->max_reg < 0) ? 0x80 : fpi->max_reg; - vs_export = MAX2(1, f->inputs_count) - 1; - } - - vs_gprs = (vpi->max_reg < 0) ? 0x80 : vpi->max_reg; - - if (vp->writes_psize && !binning) - mode = POSITION_2_VECTORS_SPRITE; - - /* set register to use for param (fragcoord/pointcoord/frontfacing) */ - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC)); - OUT_RING(ring, A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY) | - COND(fp, A2XX_SQ_CONTEXT_MISC_PARAM_GEN_POS(f->inputs_count)) | - /* we need SCREEN_XY for both fragcoord and frontfacing */ - A2XX_SQ_CONTEXT_MISC_SC_OUTPUT_SCREEN_XY); - - OUT_PKT3(ring, CP_SET_CONSTANT, 2); - OUT_RING(ring, CP_REG(REG_A2XX_SQ_PROGRAM_CNTL)); - OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(2) | - A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_MODE(mode) | - A2XX_SQ_PROGRAM_CNTL_VS_RESOURCE | - A2XX_SQ_PROGRAM_CNTL_PS_RESOURCE | - A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_COUNT(vs_export) | - A2XX_SQ_PROGRAM_CNTL_PS_REGS(fs_gprs) | - A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs) | - COND(fp && fp->need_param, A2XX_SQ_PROGRAM_CNTL_PARAM_GEN) | - COND(!fp, A2XX_SQ_PROGRAM_CNTL_GEN_INDEX_VTX)); + struct fd2_shader_stateobj *fp = NULL, *vp; + struct ir2_shader_info *fpi, *vpi; + struct ir2_frag_linkage *f; + uint8_t vs_gprs, fs_gprs = 0, vs_export = 0; + enum a2xx_sq_ps_vtx_mode mode = POSITION_1_VECTOR; + bool binning = (ctx->batch && ring == ctx->batch->binning); + unsigned variant = 0; + + vp = prog->vs; + + /* find variant matching the linked fragment shader */ + if (!binning) { + fp = prog->fs; + for (variant = 1; variant < ARRAY_SIZE(vp->variant); variant++) { + /* if checked all variants, compile a new variant */ + if (!vp->variant[variant].info.sizedwords) { + ir2_compile(vp, variant, fp); + break; + } + + /* check if fragment shader linkage matches */ + if (!memcmp(&vp->variant[variant].f, &fp->variant[0].f, + sizeof(struct ir2_frag_linkage))) + break; + } + assert(variant < ARRAY_SIZE(vp->variant)); + } + + vpi = &vp->variant[variant].info; + fpi = &fp->variant[0].info; + f = &fp->variant[0].f; + + /* clear/gmem2mem/mem2gmem need to be changed to remove this condition */ + if (prog != &ctx->solid_prog && prog != &ctx->blit_prog[0]) { + patch_fetches(ctx, vpi, ctx->vtx.vtx, &ctx->tex[PIPE_SHADER_VERTEX]); 
+ if (fp) + patch_fetches(ctx, fpi, NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]); + } + + emit(ring, MESA_SHADER_VERTEX, vpi, + binning ? &ctx->batch->shader_patches : NULL); + + if (fp) { + emit(ring, MESA_SHADER_FRAGMENT, fpi, NULL); + fs_gprs = (fpi->max_reg < 0) ? 0x80 : fpi->max_reg; + vs_export = MAX2(1, f->inputs_count) - 1; + } + + vs_gprs = (vpi->max_reg < 0) ? 0x80 : vpi->max_reg; + + if (vp->writes_psize && !binning) + mode = POSITION_2_VECTORS_SPRITE; + + /* set register to use for param (fragcoord/pointcoord/frontfacing) */ + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC)); + OUT_RING(ring, + A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY) | + COND(fp, A2XX_SQ_CONTEXT_MISC_PARAM_GEN_POS(f->inputs_count)) | + /* we need SCREEN_XY for both fragcoord and frontfacing */ + A2XX_SQ_CONTEXT_MISC_SC_OUTPUT_SCREEN_XY); + + OUT_PKT3(ring, CP_SET_CONSTANT, 2); + OUT_RING(ring, CP_REG(REG_A2XX_SQ_PROGRAM_CNTL)); + OUT_RING(ring, + A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(2) | + A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_MODE(mode) | + A2XX_SQ_PROGRAM_CNTL_VS_RESOURCE | + A2XX_SQ_PROGRAM_CNTL_PS_RESOURCE | + A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_COUNT(vs_export) | + A2XX_SQ_PROGRAM_CNTL_PS_REGS(fs_gprs) | + A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs) | + COND(fp && fp->need_param, A2XX_SQ_PROGRAM_CNTL_PARAM_GEN) | + COND(!fp, A2XX_SQ_PROGRAM_CNTL_GEN_INDEX_VTX)); } void fd2_prog_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); - struct fd_program_stateobj *prog; - struct fd2_shader_stateobj *so; - struct ir2_shader_info *info; - instr_fetch_vtx_t *instr; + struct fd_context *ctx = fd_context(pctx); + struct fd_program_stateobj *prog; + struct fd2_shader_stateobj *so; + struct ir2_shader_info *info; + instr_fetch_vtx_t *instr; - pctx->create_fs_state = fd2_fp_state_create; - pctx->delete_fs_state = fd2_fp_state_delete; + pctx->create_fs_state = fd2_fp_state_create; + pctx->delete_fs_state = fd2_fp_state_delete; - pctx->create_vs_state = fd2_vp_state_create; - pctx->delete_vs_state = fd2_vp_state_delete; + pctx->create_vs_state = fd2_vp_state_create; + pctx->delete_vs_state = fd2_vp_state_delete; - fd_prog_init(pctx); + fd_prog_init(pctx); - /* XXX maybe its possible to reuse patch_vtx_fetch somehow? */ + /* XXX maybe its possible to reuse patch_vtx_fetch somehow? 
*/ - prog = &ctx->solid_prog; - so = prog->vs; - ir2_compile(prog->vs, 1, prog->fs); + prog = &ctx->solid_prog; + so = prog->vs; + ir2_compile(prog->vs, 1, prog->fs); #define IR2_FETCH_SWIZ_XY01 0xb08 #define IR2_FETCH_SWIZ_XYZ1 0xa88 - info = &so->variant[1].info; - - instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[0].offset]; - instr->const_index = 26; - instr->const_index_sel = 0; - instr->format = FMT_32_32_32_FLOAT; - instr->format_comp_all = false; - instr->stride = 12; - instr->num_format_all = true; - instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; - - prog = &ctx->blit_prog[0]; - so = prog->vs; - ir2_compile(prog->vs, 1, prog->fs); - - info = &so->variant[1].info; - - instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[0].offset]; - instr->const_index = 26; - instr->const_index_sel = 1; - instr->format = FMT_32_32_FLOAT; - instr->format_comp_all = false; - instr->stride = 8; - instr->num_format_all = false; - instr->dst_swiz = IR2_FETCH_SWIZ_XY01; - - instr = (instr_fetch_vtx_t*) &info->dwords[info->fetch_info[1].offset]; - instr->const_index = 26; - instr->const_index_sel = 0; - instr->format = FMT_32_32_32_FLOAT; - instr->format_comp_all = false; - instr->stride = 12; - instr->num_format_all = false; - instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; + info = &so->variant[1].info; + + instr = (instr_fetch_vtx_t *)&info->dwords[info->fetch_info[0].offset]; + instr->const_index = 26; + instr->const_index_sel = 0; + instr->format = FMT_32_32_32_FLOAT; + instr->format_comp_all = false; + instr->stride = 12; + instr->num_format_all = true; + instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; + + prog = &ctx->blit_prog[0]; + so = prog->vs; + ir2_compile(prog->vs, 1, prog->fs); + + info = &so->variant[1].info; + + instr = (instr_fetch_vtx_t *)&info->dwords[info->fetch_info[0].offset]; + instr->const_index = 26; + instr->const_index_sel = 1; + instr->format = FMT_32_32_FLOAT; + instr->format_comp_all = false; + instr->stride = 8; + instr->num_format_all = false; + instr->dst_swiz = IR2_FETCH_SWIZ_XY01; + + instr = (instr_fetch_vtx_t *)&info->dwords[info->fetch_info[1].offset]; + instr->const_index = 26; + instr->const_index_sel = 0; + instr->format = FMT_32_32_32_FLOAT; + instr->format_comp_all = false; + instr->stride = 12; + instr->num_format_all = false; + instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.h b/src/gallium/drivers/freedreno/a2xx/fd2_program.h index fdca2a2..0dbe342 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.h @@ -31,39 +31,39 @@ #include "freedreno_context.h" -#include "ir2.h" #include "disasm.h" +#include "ir2.h" struct fd2_shader_stateobj { - nir_shader *nir; - gl_shader_stage type; - bool is_a20x; + nir_shader *nir; + gl_shader_stage type; + bool is_a20x; - /* note: using same set of immediates for all variants - * it doesn't matter, other than the slightly larger command stream - */ - unsigned first_immediate; /* const reg # of first immediate */ - unsigned num_immediates; - struct { - uint32_t val[4]; - unsigned ncomp; - } immediates[64]; + /* note: using same set of immediates for all variants + * it doesn't matter, other than the slightly larger command stream + */ + unsigned first_immediate; /* const reg # of first immediate */ + unsigned num_immediates; + struct { + uint32_t val[4]; + unsigned ncomp; + } immediates[64]; - bool writes_psize; - bool need_param; - bool has_kill; + bool writes_psize; + bool need_param; + bool has_kill; - /* note: - * 
fragment shader only has one variant - * first vertex shader variant is always binning shader - * we should use a dynamic array but in normal case there is - * only 2 variants (and 3 sometimes with GALLIUM_HUD) - */ - struct ir2_shader_variant variant[8]; + /* note: + * fragment shader only has one variant + * first vertex shader variant is always binning shader + * we should use a dynamic array but in normal case there is + * only 2 variants (and 3 sometimes with GALLIUM_HUD) + */ + struct ir2_shader_variant variant[8]; }; void fd2_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd_program_stateobj *prog) assert_dt; + struct fd_program_stateobj *prog) assert_dt; void fd2_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_query.c b/src/gallium/drivers/freedreno/a2xx/fd2_query.c index 41b125d..0f4052a 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_query.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_query.c @@ -37,20 +37,19 @@ #include "fd2_query.h" struct PACKED fd2_query_sample { - uint32_t start; - uint32_t stop; + uint32_t start; + uint32_t stop; }; /* offset of a single field of an array of fd2_query_sample: */ -#define query_sample_idx(aq, idx, field) \ - fd_resource((aq)->prsc)->bo, \ - (idx * sizeof(struct fd2_query_sample)) + \ - offsetof(struct fd2_query_sample, field), \ - 0, 0 +#define query_sample_idx(aq, idx, field) \ + fd_resource((aq)->prsc)->bo, \ + (idx * sizeof(struct fd2_query_sample)) + \ + offsetof(struct fd2_query_sample, field), \ + 0, 0 /* offset of a single field of fd2_query_sample: */ -#define query_sample(aq, field) \ - query_sample_idx(aq, 0, field) +#define query_sample(aq, field) query_sample_idx(aq, 0, field) /* * Performance Counter (batch) queries: @@ -62,186 +61,183 @@ struct PACKED fd2_query_sample { */ struct fd_batch_query_entry { - uint8_t gid; /* group-id */ - uint8_t cid; /* countable-id within the group */ + uint8_t gid; /* group-id */ + uint8_t cid; /* countable-id within the group */ }; struct fd_batch_query_data { - struct fd_screen *screen; - unsigned num_query_entries; - struct fd_batch_query_entry query_entries[]; + struct fd_screen *screen; + unsigned num_query_entries; + struct fd_batch_query_entry query_entries[]; }; static void -perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_batch_query_data *data = aq->query_data; - struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; + struct fd_batch_query_data *data = aq->query_data; + struct fd_screen *screen = data->screen; + struct fd_ringbuffer *ring = batch->draw; - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_wfi(batch, ring); + fd_wfi(batch, ring); - /* configure performance counters for the requested queries: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; + /* configure performance counters for the requested queries: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct 
fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; - debug_assert(counter_idx < g->num_counters); + debug_assert(counter_idx < g->num_counters); - OUT_PKT0(ring, g->counters[counter_idx].select_reg, 1); - OUT_RING(ring, g->countables[entry->cid].selector); - } + OUT_PKT0(ring, g->counters[counter_idx].select_reg, 1); + OUT_RING(ring, g->countables[entry->cid].selector); + } - memset(counters_per_group, 0, sizeof(counters_per_group)); + memset(counters_per_group, 0, sizeof(counters_per_group)); - /* and snapshot the start values */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + /* and snapshot the start values */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; + const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - OUT_PKT3(ring, CP_REG_TO_MEM, 2); - OUT_RING(ring, counter->counter_reg_lo | CP_REG_TO_MEM_0_ACCUMULATE); - OUT_RELOC(ring, query_sample_idx(aq, i, start)); - } + OUT_PKT3(ring, CP_REG_TO_MEM, 2); + OUT_RING(ring, counter->counter_reg_lo | CP_REG_TO_MEM_0_ACCUMULATE); + OUT_RELOC(ring, query_sample_idx(aq, i, start)); + } } static void -perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_batch_query_data *data = aq->query_data; - struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; + struct fd_batch_query_data *data = aq->query_data; + struct fd_screen *screen = data->screen; + struct fd_ringbuffer *ring = batch->draw; - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_wfi(batch, ring); + fd_wfi(batch, ring); - /* TODO do we need to bother to turn anything off? */ + /* TODO do we need to bother to turn anything off? 
*/ - /* snapshot the end values: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + /* snapshot the end values: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; + const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - OUT_PKT3(ring, CP_REG_TO_MEM, 2); - OUT_RING(ring, counter->counter_reg_lo | CP_REG_TO_MEM_0_ACCUMULATE); - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); - } + OUT_PKT3(ring, CP_REG_TO_MEM, 2); + OUT_RING(ring, counter->counter_reg_lo | CP_REG_TO_MEM_0_ACCUMULATE); + OUT_RELOC(ring, query_sample_idx(aq, i, stop)); + } } static void perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd_batch_query_data *data = aq->query_data; - struct fd2_query_sample *sp = buf; + struct fd_batch_query_data *data = aq->query_data; + struct fd2_query_sample *sp = buf; - for (unsigned i = 0; i < data->num_query_entries; i++) - result->batch[i].u64 = sp[i].stop - sp[i].start; + for (unsigned i = 0; i < data->num_query_entries; i++) + result->batch[i].u64 = sp[i].stop - sp[i].start; } static const struct fd_acc_sample_provider perfcntr = { - .query_type = FD_QUERY_FIRST_PERFCNTR, - .always = true, - .resume = perfcntr_resume, - .pause = perfcntr_pause, - .result = perfcntr_accumulate_result, + .query_type = FD_QUERY_FIRST_PERFCNTR, + .always = true, + .resume = perfcntr_resume, + .pause = perfcntr_pause, + .result = perfcntr_accumulate_result, }; static struct pipe_query * -fd2_create_batch_query(struct pipe_context *pctx, - unsigned num_queries, unsigned *query_types) +fd2_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) { - struct fd_context *ctx = fd_context(pctx); - struct fd_screen *screen = ctx->screen; - struct fd_query *q; - struct fd_acc_query *aq; - struct fd_batch_query_data *data; - - data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data, - num_queries * sizeof(data->query_entries[0])); - - data->screen = screen; - data->num_query_entries = num_queries; - - /* validate the requested query_types and ensure we don't try - * to request more query_types of a given group than we have - * counters: - */ - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - - for (unsigned i = 0; i < num_queries; i++) { - unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; - - /* verify valid query_type, ie. is it actually a perfcntr? */ - if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) || - (idx >= screen->num_perfcntr_queries)) { - mesa_loge("invalid batch query query_type: %u", query_types[i]); - goto error; - } - - struct fd_batch_query_entry *entry = &data->query_entries[i]; - struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx]; - - entry->gid = pq->group_id; - - /* the perfcntr_queries[] table flattens all the countables - * for each group in series, ie: - * - * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ... 
- * - * So to find the countable index just step back through the - * table to find the first entry with the same group-id. - */ - while (pq > screen->perfcntr_queries) { - pq--; - if (pq->group_id == entry->gid) - entry->cid++; - } - - if (counters_per_group[entry->gid] >= - screen->perfcntr_groups[entry->gid].num_counters) { - mesa_loge("too many counters for group %u", entry->gid); - goto error; - } - - counters_per_group[entry->gid]++; - } - - q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); - aq = fd_acc_query(q); - - /* sample buffer size is based on # of queries: */ - aq->size = num_queries * sizeof(struct fd2_query_sample); - aq->query_data = data; - - return (struct pipe_query *)q; + struct fd_context *ctx = fd_context(pctx); + struct fd_screen *screen = ctx->screen; + struct fd_query *q; + struct fd_acc_query *aq; + struct fd_batch_query_data *data; + + data = CALLOC_VARIANT_LENGTH_STRUCT( + fd_batch_query_data, num_queries * sizeof(data->query_entries[0])); + + data->screen = screen; + data->num_query_entries = num_queries; + + /* validate the requested query_types and ensure we don't try + * to request more query_types of a given group than we have + * counters: + */ + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); + + for (unsigned i = 0; i < num_queries; i++) { + unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; + + /* verify valid query_type, ie. is it actually a perfcntr? */ + if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) || + (idx >= screen->num_perfcntr_queries)) { + mesa_loge("invalid batch query query_type: %u", query_types[i]); + goto error; + } + + struct fd_batch_query_entry *entry = &data->query_entries[i]; + struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx]; + + entry->gid = pq->group_id; + + /* the perfcntr_queries[] table flattens all the countables + * for each group in series, ie: + * + * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ... + * + * So to find the countable index just step back through the + * table to find the first entry with the same group-id. 
+ */ + while (pq > screen->perfcntr_queries) { + pq--; + if (pq->group_id == entry->gid) + entry->cid++; + } + + if (counters_per_group[entry->gid] >= + screen->perfcntr_groups[entry->gid].num_counters) { + mesa_loge("too many counters for group %u", entry->gid); + goto error; + } + + counters_per_group[entry->gid]++; + } + + q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); + aq = fd_acc_query(q); + + /* sample buffer size is based on # of queries: */ + aq->size = num_queries * sizeof(struct fd2_query_sample); + aq->query_data = data; + + return (struct pipe_query *)q; error: - free(data); - return NULL; + free(data); + return NULL; } void -fd2_query_context_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd2_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->create_query = fd_acc_create_query; - ctx->query_update_batch = fd_acc_query_update_batch; + ctx->create_query = fd_acc_create_query; + ctx->query_update_batch = fd_acc_query_update_batch; - pctx->create_batch_query = fd2_create_batch_query; + pctx->create_batch_query = fd2_create_batch_query; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.c b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.c index a81f63b..5cd06b2 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.c @@ -24,88 +24,86 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd2_rasterizer.h" #include "fd2_context.h" +#include "fd2_rasterizer.h" #include "fd2_util.h" - void * fd2_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) + const struct pipe_rasterizer_state *cso) { - struct fd2_rasterizer_stateobj *so; - float psize_min, psize_max; - - so = CALLOC_STRUCT(fd2_rasterizer_stateobj); - if (!so) - return NULL; - - if (cso->point_size_per_vertex) { - psize_min = util_get_min_point_size(cso); - psize_max = 8192.0 - 0.0625; - } else { - /* Force the point size to be as if the vertex output was disabled. */ - psize_min = cso->point_size; - psize_max = cso->point_size; - } - - so->base = *cso; - - so->pa_sc_line_stipple = cso->line_stipple_enable ? - A2XX_PA_SC_LINE_STIPPLE_LINE_PATTERN(cso->line_stipple_pattern) | - A2XX_PA_SC_LINE_STIPPLE_REPEAT_COUNT(cso->line_stipple_factor) : 0; - - so->pa_cl_clip_cntl = 0; // TODO - - so->pa_su_vtx_cntl = - A2XX_PA_SU_VTX_CNTL_PIX_CENTER(cso->half_pixel_center ? 
PIXCENTER_OGL : PIXCENTER_D3D) | - A2XX_PA_SU_VTX_CNTL_QUANT_MODE(ONE_SIXTEENTH); - - so->pa_su_point_size = - A2XX_PA_SU_POINT_SIZE_HEIGHT(cso->point_size/2) | - A2XX_PA_SU_POINT_SIZE_WIDTH(cso->point_size/2); - - so->pa_su_point_minmax = - A2XX_PA_SU_POINT_MINMAX_MIN(psize_min/2) | - A2XX_PA_SU_POINT_MINMAX_MAX(psize_max/2); - - so->pa_su_line_cntl = - A2XX_PA_SU_LINE_CNTL_WIDTH(cso->line_width/2); - - so->pa_su_sc_mode_cntl = - A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE | - A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) | - A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); - - if (cso->cull_face & PIPE_FACE_FRONT) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_CULL_FRONT; - if (cso->cull_face & PIPE_FACE_BACK) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_CULL_BACK; - if (!cso->flatshade_first) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST; - if (!cso->front_ccw) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_FACE; - if (cso->line_stipple_enable) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_LINE_STIPPLE_ENABLE; - if (cso->multisample) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE; - - if (cso->fill_front != PIPE_POLYGON_MODE_FILL || - cso->fill_back != PIPE_POLYGON_MODE_FILL) - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_POLYMODE(POLY_DUALMODE); - else - so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_POLYMODE(POLY_DISABLED); - - if (cso->offset_tri) - so->pa_su_sc_mode_cntl |= - A2XX_PA_SU_SC_MODE_CNTL_POLY_OFFSET_FRONT_ENABLE | - A2XX_PA_SU_SC_MODE_CNTL_POLY_OFFSET_BACK_ENABLE | - A2XX_PA_SU_SC_MODE_CNTL_POLY_OFFSET_PARA_ENABLE; - - return so; + struct fd2_rasterizer_stateobj *so; + float psize_min, psize_max; + + so = CALLOC_STRUCT(fd2_rasterizer_stateobj); + if (!so) + return NULL; + + if (cso->point_size_per_vertex) { + psize_min = util_get_min_point_size(cso); + psize_max = 8192.0 - 0.0625; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = cso->point_size; + psize_max = cso->point_size; + } + + so->base = *cso; + + so->pa_sc_line_stipple = + cso->line_stipple_enable + ? A2XX_PA_SC_LINE_STIPPLE_LINE_PATTERN(cso->line_stipple_pattern) | + A2XX_PA_SC_LINE_STIPPLE_REPEAT_COUNT(cso->line_stipple_factor) + : 0; + + so->pa_cl_clip_cntl = 0; // TODO + + so->pa_su_vtx_cntl = + A2XX_PA_SU_VTX_CNTL_PIX_CENTER(cso->half_pixel_center ? 
PIXCENTER_OGL + : PIXCENTER_D3D) | + A2XX_PA_SU_VTX_CNTL_QUANT_MODE(ONE_SIXTEENTH); + + so->pa_su_point_size = A2XX_PA_SU_POINT_SIZE_HEIGHT(cso->point_size / 2) | + A2XX_PA_SU_POINT_SIZE_WIDTH(cso->point_size / 2); + + so->pa_su_point_minmax = A2XX_PA_SU_POINT_MINMAX_MIN(psize_min / 2) | + A2XX_PA_SU_POINT_MINMAX_MAX(psize_max / 2); + + so->pa_su_line_cntl = A2XX_PA_SU_LINE_CNTL_WIDTH(cso->line_width / 2); + + so->pa_su_sc_mode_cntl = + A2XX_PA_SU_SC_MODE_CNTL_VTX_WINDOW_OFFSET_ENABLE | + A2XX_PA_SU_SC_MODE_CNTL_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) | + A2XX_PA_SU_SC_MODE_CNTL_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); + + if (cso->cull_face & PIPE_FACE_FRONT) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_CULL_FRONT; + if (cso->cull_face & PIPE_FACE_BACK) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_CULL_BACK; + if (!cso->flatshade_first) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_PROVOKING_VTX_LAST; + if (!cso->front_ccw) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_FACE; + if (cso->line_stipple_enable) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_LINE_STIPPLE_ENABLE; + if (cso->multisample) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_MSAA_ENABLE; + + if (cso->fill_front != PIPE_POLYGON_MODE_FILL || + cso->fill_back != PIPE_POLYGON_MODE_FILL) + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_POLYMODE(POLY_DUALMODE); + else + so->pa_su_sc_mode_cntl |= A2XX_PA_SU_SC_MODE_CNTL_POLYMODE(POLY_DISABLED); + + if (cso->offset_tri) + so->pa_su_sc_mode_cntl |= + A2XX_PA_SU_SC_MODE_CNTL_POLY_OFFSET_FRONT_ENABLE | + A2XX_PA_SU_SC_MODE_CNTL_POLY_OFFSET_BACK_ENABLE | + A2XX_PA_SU_SC_MODE_CNTL_POLY_OFFSET_PARA_ENABLE; + + return so; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h index 21dc289..86d9b8f 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_rasterizer.h @@ -27,27 +27,27 @@ #ifndef FD2_RASTERIZER_H_ #define FD2_RASTERIZER_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" struct fd2_rasterizer_stateobj { - struct pipe_rasterizer_state base; - uint32_t pa_sc_line_stipple; - uint32_t pa_cl_clip_cntl; - uint32_t pa_su_vtx_cntl; - uint32_t pa_su_point_size; - uint32_t pa_su_point_minmax; - uint32_t pa_su_line_cntl; - uint32_t pa_su_sc_mode_cntl; + struct pipe_rasterizer_state base; + uint32_t pa_sc_line_stipple; + uint32_t pa_cl_clip_cntl; + uint32_t pa_su_vtx_cntl; + uint32_t pa_su_point_size; + uint32_t pa_su_point_minmax; + uint32_t pa_su_line_cntl; + uint32_t pa_su_sc_mode_cntl; }; static inline struct fd2_rasterizer_stateobj * fd2_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { - return (struct fd2_rasterizer_stateobj *)rast; + return (struct fd2_rasterizer_stateobj *)rast; } -void * fd2_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso); +void *fd2_rasterizer_state_create(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso); #endif /* FD2_RASTERIZER_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c index 9738228..6b559f1 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_resource.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_resource.c @@ -29,40 +29,40 @@ uint32_t fd2_setup_slices(struct fd_resource *rsc) { - struct pipe_resource *prsc = &rsc->b.b; - enum pipe_format format = prsc->format; - uint32_t height0 = 
util_format_get_nblocksy(format, prsc->height0); - uint32_t level, size = 0; + struct pipe_resource *prsc = &rsc->b.b; + enum pipe_format format = prsc->format; + uint32_t height0 = util_format_get_nblocksy(format, prsc->height0); + uint32_t level, size = 0; - /* 32 pixel alignment */ - fdl_set_pitchalign(&rsc->layout, fdl_cpp_shift(&rsc->layout) + 5); + /* 32 pixel alignment */ + fdl_set_pitchalign(&rsc->layout, fdl_cpp_shift(&rsc->layout) + 5); - for (level = 0; level <= prsc->last_level; level++) { - struct fdl_slice *slice = fd_resource_slice(rsc, level); - uint32_t pitch = fdl2_pitch(&rsc->layout, level); - uint32_t nblocksy = align(u_minify(height0, level), 32); + for (level = 0; level <= prsc->last_level; level++) { + struct fdl_slice *slice = fd_resource_slice(rsc, level); + uint32_t pitch = fdl2_pitch(&rsc->layout, level); + uint32_t nblocksy = align(u_minify(height0, level), 32); - /* mipmaps have power of two sizes in memory */ - if (level) - nblocksy = util_next_power_of_two(nblocksy); + /* mipmaps have power of two sizes in memory */ + if (level) + nblocksy = util_next_power_of_two(nblocksy); - slice->offset = size; - slice->size0 = align(pitch * nblocksy, 4096); + slice->offset = size; + slice->size0 = align(pitch * nblocksy, 4096); - size += slice->size0 * u_minify(prsc->depth0, level) * prsc->array_size; - } + size += slice->size0 * u_minify(prsc->depth0, level) * prsc->array_size; + } - return size; + return size; } unsigned fd2_tile_mode(const struct pipe_resource *tmpl) { - /* disable tiling for cube maps, freedreno uses a 2D array for the staging texture, - * (a2xx supports 2D arrays but it is not implemented) - */ - if (tmpl->target == PIPE_TEXTURE_CUBE) - return 0; - /* we can enable tiling for any resource we can render to */ - return (tmpl->bind & PIPE_BIND_RENDER_TARGET) ? 1 : 0; + /* disable tiling for cube maps, freedreno uses a 2D array for the staging + * texture, (a2xx supports 2D arrays but it is not implemented) + */ + if (tmpl->target == PIPE_TEXTURE_CUBE) + return 0; + /* we can enable tiling for any resource we can render to */ + return (tmpl->bind & PIPE_BIND_RENDER_TARGET) ? 
1 : 0; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_screen.c b/src/gallium/drivers/freedreno/a2xx/fd2_screen.c index 182319a..1ed14da 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_screen.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_screen.c @@ -27,90 +27,84 @@ #include "pipe/p_screen.h" #include "util/format/u_format.h" -#include "fd2_screen.h" #include "fd2_context.h" #include "fd2_emit.h" -#include "fd2_util.h" #include "fd2_resource.h" +#include "fd2_screen.h" +#include "fd2_util.h" static bool fd2_screen_is_format_supported(struct pipe_screen *pscreen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - unsigned retval = 0; - - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - (sample_count > 1)) { /* TODO add MSAA */ - DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", - util_format_name(format), target, sample_count, usage); - return false; - } - - if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) - return false; - - if ((usage & PIPE_BIND_RENDER_TARGET) && - fd2_pipe2color(format) != (enum a2xx_colorformatx)~0) { - retval |= PIPE_BIND_RENDER_TARGET; - } - - if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) && - !util_format_is_srgb(format) && - !util_format_is_pure_integer(format) && - fd2_pipe2surface(format).format != FMT_INVALID) { - retval |= usage & PIPE_BIND_VERTEX_BUFFER; - /* the only npot blocksize supported texture format is R32G32B32_FLOAT */ - if (util_is_power_of_two_or_zero(util_format_get_blocksize(format)) || - format == PIPE_FORMAT_R32G32B32_FLOAT) - retval |= usage & PIPE_BIND_SAMPLER_VIEW; - } - - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED)) && - (fd2_pipe2color(format) != (enum a2xx_colorformatx)~0)) { - retval |= usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED); - } - - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (fd_pipe2depth(format) != (enum adreno_rb_depth_format)~0)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } - - if ((usage & PIPE_BIND_INDEX_BUFFER) && - (fd_pipe2index(format) != (enum pc_di_index_size)~0)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } - - if (retval != usage) { - DBG("not supported: format=%s, target=%d, sample_count=%d, " - "usage=%x, retval=%x", util_format_name(format), - target, sample_count, usage, retval); - } - - return retval == usage; + unsigned retval = 0; + + if ((target >= PIPE_MAX_TEXTURE_TYPES) || + (sample_count > 1)) { /* TODO add MSAA */ + DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", + util_format_name(format), target, sample_count, usage); + return false; + } + + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; + + if ((usage & PIPE_BIND_RENDER_TARGET) && + fd2_pipe2color(format) != (enum a2xx_colorformatx) ~0) { + retval |= PIPE_BIND_RENDER_TARGET; + } + + if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) && + !util_format_is_srgb(format) && !util_format_is_pure_integer(format) && + fd2_pipe2surface(format).format != FMT_INVALID) { + retval |= usage & PIPE_BIND_VERTEX_BUFFER; + /* the only npot blocksize supported texture format is R32G32B32_FLOAT */ + if (util_is_power_of_two_or_zero(util_format_get_blocksize(format)) || + format == PIPE_FORMAT_R32G32B32_FLOAT) 
+ retval |= usage & PIPE_BIND_SAMPLER_VIEW; + } + + if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) && + (fd2_pipe2color(format) != (enum a2xx_colorformatx) ~0)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED); + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && + (fd_pipe2depth(format) != (enum adreno_rb_depth_format) ~0)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if ((usage & PIPE_BIND_INDEX_BUFFER) && + (fd_pipe2index(format) != (enum pc_di_index_size) ~0)) { + retval |= PIPE_BIND_INDEX_BUFFER; + } + + if (retval != usage) { + DBG("not supported: format=%s, target=%d, sample_count=%d, " + "usage=%x, retval=%x", + util_format_name(format), target, sample_count, usage, retval); + } + + return retval == usage; } void fd2_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = 1; - pscreen->context_create = fd2_context_create; - pscreen->is_format_supported = fd2_screen_is_format_supported; + screen->max_rts = 1; + pscreen->context_create = fd2_context_create; + pscreen->is_format_supported = fd2_screen_is_format_supported; - screen->setup_slices = fd2_setup_slices; - if (FD_DBG(TTILE)) - screen->tile_mode = fd2_tile_mode; + screen->setup_slices = fd2_setup_slices; + if (FD_DBG(TTILE)) + screen->tile_mode = fd2_tile_mode; - fd2_emit_init_screen(pscreen); + fd2_emit_init_screen(pscreen); } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.c b/src/gallium/drivers/freedreno/a2xx/fd2_texture.c index 35bcf9b..3048011 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.c @@ -25,9 +25,9 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" #include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "fd2_texture.h" #include "fd2_util.h" @@ -35,200 +35,190 @@ static enum sq_tex_clamp tex_clamp(unsigned wrap) { - switch (wrap) { - case PIPE_TEX_WRAP_REPEAT: - return SQ_TEX_WRAP; - case PIPE_TEX_WRAP_CLAMP: - return SQ_TEX_CLAMP_HALF_BORDER; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return SQ_TEX_CLAMP_LAST_TEXEL; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - return SQ_TEX_CLAMP_BORDER; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return SQ_TEX_MIRROR; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - return SQ_TEX_MIRROR_ONCE_HALF_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - return SQ_TEX_MIRROR_ONCE_LAST_TEXEL; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - return SQ_TEX_MIRROR_ONCE_BORDER; - default: - DBG("invalid wrap: %u", wrap); - return 0; - } + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return SQ_TEX_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return SQ_TEX_CLAMP_HALF_BORDER; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return SQ_TEX_CLAMP_LAST_TEXEL; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return SQ_TEX_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return SQ_TEX_MIRROR; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return SQ_TEX_MIRROR_ONCE_HALF_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return SQ_TEX_MIRROR_ONCE_LAST_TEXEL; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return SQ_TEX_MIRROR_ONCE_BORDER; + default: + DBG("invalid wrap: %u", wrap); + return 0; + } } static enum sq_tex_filter tex_filter(unsigned filter) { - switch (filter) { - case PIPE_TEX_FILTER_NEAREST: - return SQ_TEX_FILTER_POINT; - case PIPE_TEX_FILTER_LINEAR: - return 
SQ_TEX_FILTER_BILINEAR; - default: - DBG("invalid filter: %u", filter); - return 0; - } + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: + return SQ_TEX_FILTER_POINT; + case PIPE_TEX_FILTER_LINEAR: + return SQ_TEX_FILTER_BILINEAR; + default: + DBG("invalid filter: %u", filter); + return 0; + } } static enum sq_tex_filter mip_filter(unsigned filter) { - switch (filter) { - case PIPE_TEX_MIPFILTER_NONE: - return SQ_TEX_FILTER_BASEMAP; - case PIPE_TEX_MIPFILTER_NEAREST: - return SQ_TEX_FILTER_POINT; - case PIPE_TEX_MIPFILTER_LINEAR: - return SQ_TEX_FILTER_BILINEAR; - default: - DBG("invalid filter: %u", filter); - return 0; - } + switch (filter) { + case PIPE_TEX_MIPFILTER_NONE: + return SQ_TEX_FILTER_BASEMAP; + case PIPE_TEX_MIPFILTER_NEAREST: + return SQ_TEX_FILTER_POINT; + case PIPE_TEX_MIPFILTER_LINEAR: + return SQ_TEX_FILTER_BILINEAR; + default: + DBG("invalid filter: %u", filter); + return 0; + } } static void * fd2_sampler_state_create(struct pipe_context *pctx, - const struct pipe_sampler_state *cso) + const struct pipe_sampler_state *cso) { - struct fd2_sampler_stateobj *so = CALLOC_STRUCT(fd2_sampler_stateobj); + struct fd2_sampler_stateobj *so = CALLOC_STRUCT(fd2_sampler_stateobj); - if (!so) - return NULL; + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - /* TODO - * cso->max_anisotropy - * cso->normalized_coords (dealt with by shader for rect textures?) - */ + /* TODO + * cso->max_anisotropy + * cso->normalized_coords (dealt with by shader for rect textures?) + */ - /* SQ_TEX0_PITCH() must be OR'd in later when we know the bound texture: */ - so->tex0 = - A2XX_SQ_TEX_0_CLAMP_X(tex_clamp(cso->wrap_s)) | - A2XX_SQ_TEX_0_CLAMP_Y(tex_clamp(cso->wrap_t)) | - A2XX_SQ_TEX_0_CLAMP_Z(tex_clamp(cso->wrap_r)); + /* SQ_TEX0_PITCH() must be OR'd in later when we know the bound texture: */ + so->tex0 = A2XX_SQ_TEX_0_CLAMP_X(tex_clamp(cso->wrap_s)) | + A2XX_SQ_TEX_0_CLAMP_Y(tex_clamp(cso->wrap_t)) | + A2XX_SQ_TEX_0_CLAMP_Z(tex_clamp(cso->wrap_r)); - so->tex3 = - A2XX_SQ_TEX_3_XY_MAG_FILTER(tex_filter(cso->mag_img_filter)) | - A2XX_SQ_TEX_3_XY_MIN_FILTER(tex_filter(cso->min_img_filter)) | - A2XX_SQ_TEX_3_MIP_FILTER(mip_filter(cso->min_mip_filter)); + so->tex3 = A2XX_SQ_TEX_3_XY_MAG_FILTER(tex_filter(cso->mag_img_filter)) | + A2XX_SQ_TEX_3_XY_MIN_FILTER(tex_filter(cso->min_img_filter)) | + A2XX_SQ_TEX_3_MIP_FILTER(mip_filter(cso->min_mip_filter)); - so->tex4 = 0; - if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) - so->tex4 = A2XX_SQ_TEX_4_LOD_BIAS(cso->lod_bias); + so->tex4 = 0; + if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) + so->tex4 = A2XX_SQ_TEX_4_LOD_BIAS(cso->lod_bias); - return so; + return so; } static void -fd2_sampler_states_bind(struct pipe_context *pctx, - enum pipe_shader_type shader, unsigned start, - unsigned nr, void **hwcso) - in_dt +fd2_sampler_states_bind(struct pipe_context *pctx, enum pipe_shader_type shader, + unsigned start, unsigned nr, void **hwcso) in_dt { - if (!hwcso) - nr = 0; + if (!hwcso) + nr = 0; - if (shader == PIPE_SHADER_FRAGMENT) { - struct fd_context *ctx = fd_context(pctx); + if (shader == PIPE_SHADER_FRAGMENT) { + struct fd_context *ctx = fd_context(pctx); - /* on a2xx, since there is a flat address space for textures/samplers, - * a change in # of fragment textures/samplers will trigger patching and - * re-emitting the vertex shader: - */ - if (nr != ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers) - ctx->dirty |= FD_DIRTY_TEXSTATE; - } + /* on a2xx, since there is a flat address space for textures/samplers, + * a change in # of 
fragment textures/samplers will trigger patching and + * re-emitting the vertex shader: + */ + if (nr != ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers) + ctx->dirty |= FD_DIRTY_TEXSTATE; + } - fd_sampler_states_bind(pctx, shader, start, nr, hwcso); + fd_sampler_states_bind(pctx, shader, start, nr, hwcso); } static enum sq_tex_dimension tex_dimension(unsigned target) { - switch (target) { - default: - assert(0); - case PIPE_TEXTURE_1D: - assert(0); /* TODO */ - return SQ_TEX_DIMENSION_1D; - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D: - return SQ_TEX_DIMENSION_2D; - case PIPE_TEXTURE_3D: - assert(0); /* TODO */ - return SQ_TEX_DIMENSION_3D; - case PIPE_TEXTURE_CUBE: - return SQ_TEX_DIMENSION_CUBE; - } + switch (target) { + default: + assert(0); + case PIPE_TEXTURE_1D: + assert(0); /* TODO */ + return SQ_TEX_DIMENSION_1D; + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D: + return SQ_TEX_DIMENSION_2D; + case PIPE_TEXTURE_3D: + assert(0); /* TODO */ + return SQ_TEX_DIMENSION_3D; + case PIPE_TEXTURE_CUBE: + return SQ_TEX_DIMENSION_CUBE; + } } static struct pipe_sampler_view * fd2_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, - const struct pipe_sampler_view *cso) + const struct pipe_sampler_view *cso) { - struct fd2_pipe_sampler_view *so = CALLOC_STRUCT(fd2_pipe_sampler_view); - struct fd_resource *rsc = fd_resource(prsc); - struct surface_format fmt = fd2_pipe2surface(cso->format); - - if (!so) - return NULL; - - so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; - so->base.reference.count = 1; - so->base.context = pctx; - - so->tex0 = - A2XX_SQ_TEX_0_SIGN_X(fmt.sign) | - A2XX_SQ_TEX_0_SIGN_Y(fmt.sign) | - A2XX_SQ_TEX_0_SIGN_Z(fmt.sign) | - A2XX_SQ_TEX_0_SIGN_W(fmt.sign) | - A2XX_SQ_TEX_0_PITCH(fdl2_pitch_pixels(&rsc->layout, 0) * - util_format_get_blockwidth(prsc->format)) | - COND(rsc->layout.tile_mode, A2XX_SQ_TEX_0_TILED); - so->tex1 = - A2XX_SQ_TEX_1_FORMAT(fmt.format) | - A2XX_SQ_TEX_1_CLAMP_POLICY(SQ_TEX_CLAMP_POLICY_OGL); - so->tex2 = - A2XX_SQ_TEX_2_HEIGHT(prsc->height0 - 1) | - A2XX_SQ_TEX_2_WIDTH(prsc->width0 - 1); - so->tex3 = - A2XX_SQ_TEX_3_NUM_FORMAT(fmt.num_format) | - fd2_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, - cso->swizzle_b, cso->swizzle_a) | - A2XX_SQ_TEX_3_EXP_ADJUST(fmt.exp_adjust); - - so->tex4 = - A2XX_SQ_TEX_4_MIP_MIN_LEVEL(fd_sampler_first_level(cso)) | - A2XX_SQ_TEX_4_MIP_MAX_LEVEL(fd_sampler_last_level(cso)); - - so->tex5 = A2XX_SQ_TEX_5_DIMENSION(tex_dimension(prsc->target)); - - return &so->base; + struct fd2_pipe_sampler_view *so = CALLOC_STRUCT(fd2_pipe_sampler_view); + struct fd_resource *rsc = fd_resource(prsc); + struct surface_format fmt = fd2_pipe2surface(cso->format); + + if (!so) + return NULL; + + so->base = *cso; + pipe_reference(NULL, &prsc->reference); + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->tex0 = A2XX_SQ_TEX_0_SIGN_X(fmt.sign) | A2XX_SQ_TEX_0_SIGN_Y(fmt.sign) | + A2XX_SQ_TEX_0_SIGN_Z(fmt.sign) | A2XX_SQ_TEX_0_SIGN_W(fmt.sign) | + A2XX_SQ_TEX_0_PITCH(fdl2_pitch_pixels(&rsc->layout, 0) * + util_format_get_blockwidth(prsc->format)) | + COND(rsc->layout.tile_mode, A2XX_SQ_TEX_0_TILED); + so->tex1 = A2XX_SQ_TEX_1_FORMAT(fmt.format) | + A2XX_SQ_TEX_1_CLAMP_POLICY(SQ_TEX_CLAMP_POLICY_OGL); + so->tex2 = A2XX_SQ_TEX_2_HEIGHT(prsc->height0 - 1) | + A2XX_SQ_TEX_2_WIDTH(prsc->width0 - 1); + so->tex3 = A2XX_SQ_TEX_3_NUM_FORMAT(fmt.num_format) | + fd2_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, + cso->swizzle_b, 
cso->swizzle_a) | + A2XX_SQ_TEX_3_EXP_ADJUST(fmt.exp_adjust); + + so->tex4 = A2XX_SQ_TEX_4_MIP_MIN_LEVEL(fd_sampler_first_level(cso)) | + A2XX_SQ_TEX_4_MIP_MAX_LEVEL(fd_sampler_last_level(cso)); + + so->tex5 = A2XX_SQ_TEX_5_DIMENSION(tex_dimension(prsc->target)); + + return &so->base; } static void fd2_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, - unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) - in_dt + unsigned start, unsigned nr, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) in_dt { - if (shader == PIPE_SHADER_FRAGMENT) { - struct fd_context *ctx = fd_context(pctx); - - /* on a2xx, since there is a flat address space for textures/samplers, - * a change in # of fragment textures/samplers will trigger patching and - * re-emitting the vertex shader: - */ - if (nr != ctx->tex[PIPE_SHADER_FRAGMENT].num_textures) - ctx->dirty |= FD_DIRTY_TEXSTATE; - } - - fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, views); + if (shader == PIPE_SHADER_FRAGMENT) { + struct fd_context *ctx = fd_context(pctx); + + /* on a2xx, since there is a flat address space for textures/samplers, + * a change in # of fragment textures/samplers will trigger patching and + * re-emitting the vertex shader: + */ + if (nr != ctx->tex[PIPE_SHADER_FRAGMENT].num_textures) + ctx->dirty |= FD_DIRTY_TEXSTATE; + } + + fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, + views); } /* map gallium sampler-id to hw const-idx.. adreno uses a flat address @@ -244,19 +234,18 @@ fd2_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, */ unsigned fd2_get_const_idx(struct fd_context *ctx, struct fd_texture_stateobj *tex, - unsigned samp_id) - assert_dt + unsigned samp_id) assert_dt { - if (tex == &ctx->tex[PIPE_SHADER_FRAGMENT]) - return samp_id; - return samp_id + ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers; + if (tex == &ctx->tex[PIPE_SHADER_FRAGMENT]) + return samp_id; + return samp_id + ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers; } void fd2_texture_init(struct pipe_context *pctx) { - pctx->create_sampler_state = fd2_sampler_state_create; - pctx->bind_sampler_states = fd2_sampler_states_bind; - pctx->create_sampler_view = fd2_sampler_view_create; - pctx->set_sampler_views = fd2_set_sampler_views; + pctx->create_sampler_state = fd2_sampler_state_create; + pctx->bind_sampler_states = fd2_sampler_states_bind; + pctx->create_sampler_view = fd2_sampler_view_create; + pctx->set_sampler_views = fd2_set_sampler_views; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h index b7ade85..b26dc64 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_texture.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_texture.h @@ -29,36 +29,36 @@ #include "pipe/p_context.h" -#include "freedreno_texture.h" #include "freedreno_resource.h" +#include "freedreno_texture.h" #include "fd2_context.h" #include "fd2_util.h" struct fd2_sampler_stateobj { - struct pipe_sampler_state base; - uint32_t tex0, tex3, tex4; + struct pipe_sampler_state base; + uint32_t tex0, tex3, tex4; }; static inline struct fd2_sampler_stateobj * fd2_sampler_stateobj(struct pipe_sampler_state *samp) { - return (struct fd2_sampler_stateobj *)samp; + return (struct fd2_sampler_stateobj *)samp; } struct fd2_pipe_sampler_view { - struct pipe_sampler_view base; - uint32_t tex0, tex1, tex2, tex3, tex4, tex5; + struct pipe_sampler_view base; + uint32_t tex0, tex1, 
tex2, tex3, tex4, tex5; }; static inline struct fd2_pipe_sampler_view * fd2_pipe_sampler_view(struct pipe_sampler_view *pview) { - return (struct fd2_pipe_sampler_view *)pview; + return (struct fd2_pipe_sampler_view *)pview; } unsigned fd2_get_const_idx(struct fd_context *ctx, - struct fd_texture_stateobj *tex, unsigned samp_id); + struct fd_texture_stateobj *tex, unsigned samp_id); void fd2_texture_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_util.c b/src/gallium/drivers/freedreno/a2xx/fd2_util.c index f424304..0b98f60 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_util.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_util.c @@ -32,53 +32,54 @@ static enum a2xx_sq_surfaceformat pipe2surface(enum pipe_format format, struct surface_format *fmt) { - const struct util_format_description *desc = util_format_description(format); - - if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { - switch (format) { - /* Compressed textures. */ - case PIPE_FORMAT_ETC1_RGB8: - return FMT_ETC1_RGB; - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - return FMT_DXT1; - case PIPE_FORMAT_DXT3_RGBA: - return FMT_DXT2_3; - case PIPE_FORMAT_DXT5_RGBA: - return FMT_DXT4_5; - case PIPE_FORMAT_ATC_RGB: - return FMT_ATI_TC_555_565_RGB; - case PIPE_FORMAT_ATC_RGBA_EXPLICIT: - return FMT_ATI_TC_555_565_RGBA; - case PIPE_FORMAT_ATC_RGBA_INTERPOLATED: - return FMT_ATI_TC_555_565_RGBA_INTERP; - /* YUV buffers. */ - case PIPE_FORMAT_UYVY: - return FMT_Y1_Cr_Y0_Cb; - case PIPE_FORMAT_YUYV: - return FMT_Cr_Y1_Cb_Y0; - default: - return ~0; - } - } - - uint32_t channel_size = 0; - for (unsigned i = 0; i < 4; i++) - channel_size |= desc->channel[i].size << i*8; - - unsigned i = util_format_get_first_non_void_channel(format); - if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED || - desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED) - fmt->sign = SQ_TEX_SIGN_SIGNED; - if (!desc->channel[i].normalized) - fmt->num_format = SQ_TEX_NUM_FORMAT_INT; - if (desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED) - fmt->exp_adjust = -16; - - /* Note: the 3 channel 24bpp/48bpp/96bpp formats are only for vertex fetch - * we can use the 4 channel format and ignore the 4th component just isn't used - * XXX: is it possible for the extra loaded component to cause a MMU fault? - */ + const struct util_format_description *desc = util_format_description(format); + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { + switch (format) { + /* Compressed textures. */ + case PIPE_FORMAT_ETC1_RGB8: + return FMT_ETC1_RGB; + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + return FMT_DXT1; + case PIPE_FORMAT_DXT3_RGBA: + return FMT_DXT2_3; + case PIPE_FORMAT_DXT5_RGBA: + return FMT_DXT4_5; + case PIPE_FORMAT_ATC_RGB: + return FMT_ATI_TC_555_565_RGB; + case PIPE_FORMAT_ATC_RGBA_EXPLICIT: + return FMT_ATI_TC_555_565_RGBA; + case PIPE_FORMAT_ATC_RGBA_INTERPOLATED: + return FMT_ATI_TC_555_565_RGBA_INTERP; + /* YUV buffers. 
*/ + case PIPE_FORMAT_UYVY: + return FMT_Y1_Cr_Y0_Cb; + case PIPE_FORMAT_YUYV: + return FMT_Cr_Y1_Cb_Y0; + default: + return ~0; + } + } + + uint32_t channel_size = 0; + for (unsigned i = 0; i < 4; i++) + channel_size |= desc->channel[i].size << i * 8; + + unsigned i = util_format_get_first_non_void_channel(format); + if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED || + desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED) + fmt->sign = SQ_TEX_SIGN_SIGNED; + if (!desc->channel[i].normalized) + fmt->num_format = SQ_TEX_NUM_FORMAT_INT; + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FIXED) + fmt->exp_adjust = -16; + + /* Note: the 3 channel 24bpp/48bpp/96bpp formats are only for vertex fetch + * we can use the 4 channel format and ignore the 4th component just isn't + * used + * XXX: is it possible for the extra loaded component to cause a MMU fault? + */ #define CASE(r, g, b, a) case (r | g << 8 | b << 16 | a << 24) @@ -119,116 +120,125 @@ pipe2surface(enum pipe_format format, struct surface_format *fmt) /* clang-format on */ #undef CASE - return ~0; + return ~0; } struct surface_format fd2_pipe2surface(enum pipe_format format) { - struct surface_format fmt = { - .sign = SQ_TEX_SIGN_UNSIGNED, - .num_format = SQ_TEX_NUM_FORMAT_FRAC, - .exp_adjust = 0, - }; - fmt.format = pipe2surface(format, &fmt); - return fmt; + struct surface_format fmt = { + .sign = SQ_TEX_SIGN_UNSIGNED, + .num_format = SQ_TEX_NUM_FORMAT_FRAC, + .exp_adjust = 0, + }; + fmt.format = pipe2surface(format, &fmt); + return fmt; } enum a2xx_colorformatx fd2_pipe2color(enum pipe_format format) { - switch (format) { - /* 8-bit buffers. */ - case PIPE_FORMAT_R8_UNORM: - return COLORX_8; - case PIPE_FORMAT_B2G3R3_UNORM: - return COLORX_2_3_3; /* note: untested */ - - /* 16-bit buffers. */ - case PIPE_FORMAT_B5G6R5_UNORM: - return COLORX_5_6_5; - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_B5G5R5X1_UNORM: - return COLORX_1_5_5_5; - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B4G4R4X4_UNORM: - return COLORX_4_4_4_4; - case PIPE_FORMAT_R8G8_UNORM: - return COLORX_8_8; - - /* 32-bit buffers. */ - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R8G8B8X8_UNORM: - return COLORX_8_8_8_8; - /* Note: snorm untested */ - case PIPE_FORMAT_R8G8B8A8_SNORM: - case PIPE_FORMAT_R8G8B8X8_SNORM: - return COLORX_S8_8_8_8; - - /* float buffers */ - case PIPE_FORMAT_R16_FLOAT: - return COLORX_16_FLOAT; - case PIPE_FORMAT_R16G16_FLOAT: - return COLORX_16_16_FLOAT; - case PIPE_FORMAT_R16G16B16A16_FLOAT: - return COLORX_16_16_16_16_FLOAT; - case PIPE_FORMAT_R32_FLOAT: - return COLORX_32_FLOAT; - case PIPE_FORMAT_R32G32_FLOAT: - return COLORX_32_32_FLOAT; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - return COLORX_32_32_32_32_FLOAT; - - default: - return ~0; - } + switch (format) { + /* 8-bit buffers. */ + case PIPE_FORMAT_R8_UNORM: + return COLORX_8; + case PIPE_FORMAT_B2G3R3_UNORM: + return COLORX_2_3_3; /* note: untested */ + + /* 16-bit buffers. */ + case PIPE_FORMAT_B5G6R5_UNORM: + return COLORX_5_6_5; + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_B5G5R5X1_UNORM: + return COLORX_1_5_5_5; + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + return COLORX_4_4_4_4; + case PIPE_FORMAT_R8G8_UNORM: + return COLORX_8_8; + + /* 32-bit buffers. 
*/ + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_B8G8R8X8_UNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R8G8B8X8_UNORM: + return COLORX_8_8_8_8; + /* Note: snorm untested */ + case PIPE_FORMAT_R8G8B8A8_SNORM: + case PIPE_FORMAT_R8G8B8X8_SNORM: + return COLORX_S8_8_8_8; + + /* float buffers */ + case PIPE_FORMAT_R16_FLOAT: + return COLORX_16_FLOAT; + case PIPE_FORMAT_R16G16_FLOAT: + return COLORX_16_16_FLOAT; + case PIPE_FORMAT_R16G16B16A16_FLOAT: + return COLORX_16_16_16_16_FLOAT; + case PIPE_FORMAT_R32_FLOAT: + return COLORX_32_FLOAT; + case PIPE_FORMAT_R32G32_FLOAT: + return COLORX_32_32_FLOAT; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return COLORX_32_32_32_32_FLOAT; + + default: + return ~0; + } } static inline enum sq_tex_swiz tex_swiz(unsigned swiz) { - switch (swiz) { - default: - case PIPE_SWIZZLE_X: return SQ_TEX_X; - case PIPE_SWIZZLE_Y: return SQ_TEX_Y; - case PIPE_SWIZZLE_Z: return SQ_TEX_Z; - case PIPE_SWIZZLE_W: return SQ_TEX_W; - case PIPE_SWIZZLE_0: return SQ_TEX_ZERO; - case PIPE_SWIZZLE_1: return SQ_TEX_ONE; - } + switch (swiz) { + default: + case PIPE_SWIZZLE_X: + return SQ_TEX_X; + case PIPE_SWIZZLE_Y: + return SQ_TEX_Y; + case PIPE_SWIZZLE_Z: + return SQ_TEX_Z; + case PIPE_SWIZZLE_W: + return SQ_TEX_W; + case PIPE_SWIZZLE_0: + return SQ_TEX_ZERO; + case PIPE_SWIZZLE_1: + return SQ_TEX_ONE; + } } uint32_t fd2_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a) + unsigned swizzle_b, unsigned swizzle_a) { - const struct util_format_description *desc = - util_format_description(format); - unsigned char swiz[4] = { - swizzle_r, swizzle_g, swizzle_b, swizzle_a, - }, rswiz[4]; - - util_format_compose_swizzles(desc->swizzle, swiz, rswiz); - - return A2XX_SQ_TEX_3_SWIZ_X(tex_swiz(rswiz[0])) | - A2XX_SQ_TEX_3_SWIZ_Y(tex_swiz(rswiz[1])) | - A2XX_SQ_TEX_3_SWIZ_Z(tex_swiz(rswiz[2])) | - A2XX_SQ_TEX_3_SWIZ_W(tex_swiz(rswiz[3])); + const struct util_format_description *desc = util_format_description(format); + unsigned char swiz[4] = + { + swizzle_r, + swizzle_g, + swizzle_b, + swizzle_a, + }, + rswiz[4]; + + util_format_compose_swizzles(desc->swizzle, swiz, rswiz); + + return A2XX_SQ_TEX_3_SWIZ_X(tex_swiz(rswiz[0])) | + A2XX_SQ_TEX_3_SWIZ_Y(tex_swiz(rswiz[1])) | + A2XX_SQ_TEX_3_SWIZ_Z(tex_swiz(rswiz[2])) | + A2XX_SQ_TEX_3_SWIZ_W(tex_swiz(rswiz[3])); } uint32_t fd2_vtx_swiz(enum pipe_format format, unsigned swizzle) { - const struct util_format_description *desc = - util_format_description(format); - unsigned char swiz[4], rswiz[4]; + const struct util_format_description *desc = util_format_description(format); + unsigned char swiz[4], rswiz[4]; - for (unsigned i = 0; i < 4; i++) - swiz[i] = (swizzle >> i * 3) & 7; + for (unsigned i = 0; i < 4; i++) + swiz[i] = (swizzle >> i * 3) & 7; - util_format_compose_swizzles(desc->swizzle, swiz, rswiz); + util_format_compose_swizzles(desc->swizzle, swiz, rswiz); - return rswiz[0] | rswiz[1] << 3 | rswiz[2] << 6 | rswiz[3] << 9; + return rswiz[0] | rswiz[1] << 3 | rswiz[2] << 6 | rswiz[3] << 9; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_util.h b/src/gallium/drivers/freedreno/a2xx/fd2_util.h index b87ef41..e48e54a 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_util.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_util.h @@ -33,23 +33,25 @@ struct surface_format { /* If enum is a signed type, 0x7f is out of range. Cast it to avoid warnings. 
*/ -#define FMT_INVALID ((enum a2xx_sq_surfaceformat) 0x7f) - enum a2xx_sq_surfaceformat format : 7; - enum sq_tex_sign sign : 2; - enum sq_tex_num_format num_format : 1; - int exp_adjust : 6; +#define FMT_INVALID ((enum a2xx_sq_surfaceformat)0x7f) + enum a2xx_sq_surfaceformat format : 7; + enum sq_tex_sign sign : 2; + enum sq_tex_num_format num_format : 1; + int exp_adjust : 6; }; struct surface_format fd2_pipe2surface(enum pipe_format format); enum a2xx_colorformatx fd2_pipe2color(enum pipe_format format); uint32_t fd2_tex_swiz(enum pipe_format format, unsigned swizzle_r, - unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); + unsigned swizzle_g, unsigned swizzle_b, + unsigned swizzle_a); uint32_t fd2_vtx_swiz(enum pipe_format format, unsigned swizzle); /* convert x,y to dword */ -static inline uint32_t xy2d(uint16_t x, uint16_t y) +static inline uint32_t +xy2d(uint16_t x, uint16_t y) { - return ((y & 0x3fff) << 16) | (x & 0x3fff); + return ((y & 0x3fff) << 16) | (x & 0x3fff); } #endif /* FD2_UTIL_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.c b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.c index fdd7a2a..6d4872b 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.c +++ b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.c @@ -24,72 +24,71 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd2_zsa.h" #include "fd2_context.h" #include "fd2_util.h" +#include "fd2_zsa.h" void * fd2_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso) + const struct pipe_depth_stencil_alpha_state *cso) { - struct fd2_zsa_stateobj *so; + struct fd2_zsa_stateobj *so; - so = CALLOC_STRUCT(fd2_zsa_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd2_zsa_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - so->rb_depthcontrol |= - A2XX_RB_DEPTHCONTROL_ZFUNC(cso->depth_func); /* maps 1:1 */ + so->rb_depthcontrol |= + A2XX_RB_DEPTHCONTROL_ZFUNC(cso->depth_func); /* maps 1:1 */ - if (cso->depth_enabled) - so->rb_depthcontrol |= A2XX_RB_DEPTHCONTROL_Z_ENABLE | - COND(!cso->alpha_enabled, A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE); - if (cso->depth_writemask) - so->rb_depthcontrol |= A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE; + if (cso->depth_enabled) + so->rb_depthcontrol |= + A2XX_RB_DEPTHCONTROL_Z_ENABLE | + COND(!cso->alpha_enabled, A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE); + if (cso->depth_writemask) + so->rb_depthcontrol |= A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE; - if (cso->stencil[0].enabled) { - const struct pipe_stencil_state *s = &cso->stencil[0]; + if (cso->stencil[0].enabled) { + const struct pipe_stencil_state *s = &cso->stencil[0]; - so->rb_depthcontrol |= - A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE | - A2XX_RB_DEPTHCONTROL_STENCILFUNC(s->func) | /* maps 1:1 */ - A2XX_RB_DEPTHCONTROL_STENCILFAIL(fd_stencil_op(s->fail_op)) | - A2XX_RB_DEPTHCONTROL_STENCILZPASS(fd_stencil_op(s->zpass_op)) | - A2XX_RB_DEPTHCONTROL_STENCILZFAIL(fd_stencil_op(s->zfail_op)); - so->rb_stencilrefmask |= - 0xff000000 | /* ??? 
*/ - A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | - A2XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); + so->rb_depthcontrol |= + A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE | + A2XX_RB_DEPTHCONTROL_STENCILFUNC(s->func) | /* maps 1:1 */ + A2XX_RB_DEPTHCONTROL_STENCILFAIL(fd_stencil_op(s->fail_op)) | + A2XX_RB_DEPTHCONTROL_STENCILZPASS(fd_stencil_op(s->zpass_op)) | + A2XX_RB_DEPTHCONTROL_STENCILZFAIL(fd_stencil_op(s->zfail_op)); + so->rb_stencilrefmask |= + 0xff000000 | /* ??? */ + A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | + A2XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); - if (cso->stencil[1].enabled) { - const struct pipe_stencil_state *bs = &cso->stencil[1]; + if (cso->stencil[1].enabled) { + const struct pipe_stencil_state *bs = &cso->stencil[1]; - so->rb_depthcontrol |= - A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE | - A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF(bs->func) | /* maps 1:1 */ - A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF(fd_stencil_op(bs->fail_op)) | - A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF(fd_stencil_op(bs->zpass_op)) | - A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF(fd_stencil_op(bs->zfail_op)); - so->rb_stencilrefmask_bf |= - 0xff000000 | /* ??? */ - A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(bs->writemask) | - A2XX_RB_STENCILREFMASK_STENCILMASK(bs->valuemask); - } - } + so->rb_depthcontrol |= + A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE | + A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF(bs->func) | /* maps 1:1 */ + A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF(fd_stencil_op(bs->fail_op)) | + A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF(fd_stencil_op(bs->zpass_op)) | + A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF(fd_stencil_op(bs->zfail_op)); + so->rb_stencilrefmask_bf |= + 0xff000000 | /* ??? */ + A2XX_RB_STENCILREFMASK_STENCILWRITEMASK(bs->writemask) | + A2XX_RB_STENCILREFMASK_STENCILMASK(bs->valuemask); + } + } - if (cso->alpha_enabled) { - so->rb_colorcontrol = - A2XX_RB_COLORCONTROL_ALPHA_FUNC(cso->alpha_func) | - A2XX_RB_COLORCONTROL_ALPHA_TEST_ENABLE; - so->rb_alpha_ref = fui(cso->alpha_ref_value); - } + if (cso->alpha_enabled) { + so->rb_colorcontrol = A2XX_RB_COLORCONTROL_ALPHA_FUNC(cso->alpha_func) | + A2XX_RB_COLORCONTROL_ALPHA_TEST_ENABLE; + so->rb_alpha_ref = fui(cso->alpha_ref_value); + } - return so; + return so; } diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h index 48a4240..547fc55 100644 --- a/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h +++ b/src/gallium/drivers/freedreno/a2xx/fd2_zsa.h @@ -27,28 +27,27 @@ #ifndef FD2_ZSA_H_ #define FD2_ZSA_H_ - -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd2_zsa_stateobj { - struct pipe_depth_stencil_alpha_state base; - uint32_t rb_depthcontrol; - uint32_t rb_colorcontrol; /* must be OR'd w/ blend->rb_colorcontrol */ - uint32_t rb_alpha_ref; - uint32_t rb_stencilrefmask; - uint32_t rb_stencilrefmask_bf; + struct pipe_depth_stencil_alpha_state base; + uint32_t rb_depthcontrol; + uint32_t rb_colorcontrol; /* must be OR'd w/ blend->rb_colorcontrol */ + uint32_t rb_alpha_ref; + uint32_t rb_stencilrefmask; + uint32_t rb_stencilrefmask_bf; }; static inline struct fd2_zsa_stateobj * fd2_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { - return (struct fd2_zsa_stateobj *)zsa; + return (struct fd2_zsa_stateobj *)zsa; } -void * fd2_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso); +void *fd2_zsa_state_create(struct pipe_context *pctx, + const struct pipe_depth_stencil_alpha_state *cso); #endif /* 
FD2_ZSA_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/ir2.c b/src/gallium/drivers/freedreno/a2xx/ir2.c index 2d54248..d810e8f 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2.c @@ -26,120 +26,124 @@ #include "ir2_private.h" -static bool scalar_possible(struct ir2_instr *instr) +static bool +scalar_possible(struct ir2_instr *instr) { - if (instr->alu.scalar_opc == SCALAR_NONE) - return false; + if (instr->alu.scalar_opc == SCALAR_NONE) + return false; - return src_ncomp(instr) == 1; + return src_ncomp(instr) == 1; } -static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b) +static bool +is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b) { - if (!a) - return true; + if (!a) + return true; - /* dont use same instruction twice */ - if (a == b) - return false; + /* dont use same instruction twice */ + if (a == b) + return false; - /* PRED_SET must be alone */ - if (b->alu.scalar_opc >= PRED_SETEs && - b->alu.scalar_opc <= PRED_SET_RESTOREs) - return false; + /* PRED_SET must be alone */ + if (b->alu.scalar_opc >= PRED_SETEs && + b->alu.scalar_opc <= PRED_SET_RESTOREs) + return false; - /* must write to same export (issues otherwise?) */ - return a->alu.export == b->alu.export; + /* must write to same export (issues otherwise?) */ + return a->alu.export == b->alu.export; } /* priority of vector instruction for scheduling (lower=higher prio) */ -static unsigned alu_vector_prio(struct ir2_instr *instr) +static unsigned +alu_vector_prio(struct ir2_instr *instr) { - if (instr->alu.vector_opc == VECTOR_NONE) - return ~0u; + if (instr->alu.vector_opc == VECTOR_NONE) + return ~0u; - if (is_export(instr)) - return 4; + if (is_export(instr)) + return 4; - /* TODO check src type and ncomps */ - if (instr->src_count == 3) - return 0; + /* TODO check src type and ncomps */ + if (instr->src_count == 3) + return 0; - if (!scalar_possible(instr)) - return 1; + if (!scalar_possible(instr)) + return 1; - return instr->src_count == 2 ? 2 : 3; + return instr->src_count == 2 ? 2 : 3; } /* priority of scalar instruction for scheduling (lower=higher prio) */ -static unsigned alu_scalar_prio(struct ir2_instr *instr) +static unsigned +alu_scalar_prio(struct ir2_instr *instr) { - if (!scalar_possible(instr)) - return ~0u; + if (!scalar_possible(instr)) + return ~0u; - /* this case is dealt with later */ - if (instr->src_count > 1) - return ~0u; + /* this case is dealt with later */ + if (instr->src_count > 1) + return ~0u; - if (is_export(instr)) - return 4; + if (is_export(instr)) + return 4; - /* PRED to end of block */ - if (instr->alu.scalar_opc >= PRED_SETEs && - instr->alu.scalar_opc <= PRED_SET_RESTOREs) - return 5; + /* PRED to end of block */ + if (instr->alu.scalar_opc >= PRED_SETEs && + instr->alu.scalar_opc <= PRED_SET_RESTOREs) + return 5; - /* scalar only have highest priority */ - return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3; + /* scalar only have highest priority */ + return instr->alu.vector_opc == VECTOR_NONE ? 
0 : 3; } /* this is a bit messy: * we want to find a slot where we can insert a scalar MOV with * a vector instruction that was already scheduled */ -static struct ir2_sched_instr* +static struct ir2_sched_instr * insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx, - struct ir2_src src1, unsigned *comp) + struct ir2_src src1, unsigned *comp) { - struct ir2_sched_instr *sched = NULL, *s; - unsigned i, mask = 0xf; + struct ir2_sched_instr *sched = NULL, *s; + unsigned i, mask = 0xf; - /* go first earliest point where the mov can be inserted */ - for (i = ctx->instr_sched_count-1; i > 0; i--) { - s = &ctx->instr_sched[i - 1]; + /* go first earliest point where the mov can be inserted */ + for (i = ctx->instr_sched_count - 1; i > 0; i--) { + s = &ctx->instr_sched[i - 1]; - if (s->instr && s->instr->block_idx != block_idx) - break; - if (s->instr_s && s->instr_s->block_idx != block_idx) - break; + if (s->instr && s->instr->block_idx != block_idx) + break; + if (s->instr_s && s->instr_s->block_idx != block_idx) + break; - if (src1.type == IR2_SRC_SSA) { - if ((s->instr && s->instr->idx == src1.num) || - (s->instr_s && s->instr_s->idx == src1.num)) - break; - } + if (src1.type == IR2_SRC_SSA) { + if ((s->instr && s->instr->idx == src1.num) || + (s->instr_s && s->instr_s->idx == src1.num)) + break; + } - unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf); - if ((mask & mr) == 0) - break; + unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf); + if ((mask & mr) == 0) + break; - mask &= mr; - if (s->instr_s || s->instr->src_count == 3) - continue; + mask &= mr; + if (s->instr_s || s->instr->src_count == 3) + continue; - if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0) - continue; + if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0) + continue; - sched = s; - } - *comp = ffs(mask) - 1; + sched = s; + } + *comp = ffs(mask) - 1; - if (sched) { - for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++) - s->reg_state[reg_idx/8] |= 1 << (*comp+reg_idx%8*4); - } + if (sched) { + for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++) + s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4); + } - return sched; + return sched; } /* case1: @@ -152,313 +156,326 @@ insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx, static bool scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order) { - struct ir2_src src0 = instr->src[ order]; - struct ir2_src src1 = instr->src[!order]; - struct ir2_sched_instr *sched; - struct ir2_instr *ins; - struct ir2_reg *reg; - unsigned idx, comp; - - switch (src0.type) { - case IR2_SRC_CONST: - case IR2_SRC_INPUT: - return false; - default: - break; - } - - /* TODO, insert needs logic for this */ - if (src1.type == IR2_SRC_REG) - return false; - - /* we could do something if they match src1.. 
*/ - if (src0.negate || src0.abs) - return false; - - reg = get_reg_src(ctx, &src0); - - /* result not used more since we will overwrite */ - for (int i = 0; i < 4; i++) - if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i)) - return false; - - /* find a place to insert the mov */ - sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp); - if (!sched) - return false; - - ins = &ctx->instr[idx = ctx->instr_count++]; - ins->idx = idx; - ins->type = IR2_ALU; - ins->src[0] = src1; - ins->src_count = 1; - ins->is_ssa = true; - ins->ssa.idx = reg->idx; - ins->ssa.ncomp = 1; - ins->ssa.comp[0].c = comp; - ins->alu.scalar_opc = MAXs; - ins->alu.export = -1; - ins->alu.write_mask = 1; - ins->pred = instr->pred; - ins->block_idx = instr->block_idx; - - instr->src[0] = src0; - instr->alu.src1_swizzle = comp; - - sched->instr_s = ins; - return true; + struct ir2_src src0 = instr->src[order]; + struct ir2_src src1 = instr->src[!order]; + struct ir2_sched_instr *sched; + struct ir2_instr *ins; + struct ir2_reg *reg; + unsigned idx, comp; + + switch (src0.type) { + case IR2_SRC_CONST: + case IR2_SRC_INPUT: + return false; + default: + break; + } + + /* TODO, insert needs logic for this */ + if (src1.type == IR2_SRC_REG) + return false; + + /* we could do something if they match src1.. */ + if (src0.negate || src0.abs) + return false; + + reg = get_reg_src(ctx, &src0); + + /* result not used more since we will overwrite */ + for (int i = 0; i < 4; i++) + if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i)) + return false; + + /* find a place to insert the mov */ + sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp); + if (!sched) + return false; + + ins = &ctx->instr[idx = ctx->instr_count++]; + ins->idx = idx; + ins->type = IR2_ALU; + ins->src[0] = src1; + ins->src_count = 1; + ins->is_ssa = true; + ins->ssa.idx = reg->idx; + ins->ssa.ncomp = 1; + ins->ssa.comp[0].c = comp; + ins->alu.scalar_opc = MAXs; + ins->alu.export = -1; + ins->alu.write_mask = 1; + ins->pred = instr->pred; + ins->block_idx = instr->block_idx; + + instr->src[0] = src0; + instr->alu.src1_swizzle = comp; + + sched->instr_s = ins; + return true; } /* fill sched with next fetch or (vector and/or scalar) alu instruction */ -static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched) +static int +sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched) { - struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL; - unsigned avail_count = 0; - - instr_alloc_type_t export = ~0u; - int block_idx = -1; - - /* XXX merge this loop with the other one somehow? 
*/ - ir2_foreach_instr(instr, ctx) { - if (!instr->need_emit) - continue; - if (is_export(instr)) - export = MIN2(export, export_buf(instr->alu.export)); - } - - ir2_foreach_instr(instr, ctx) { - if (!instr->need_emit) - continue; - - /* dont mix exports */ - if (is_export(instr) && export_buf(instr->alu.export) != export) - continue; - - if (block_idx < 0) - block_idx = instr->block_idx; - else if (block_idx != instr->block_idx || /* must be same block */ - instr->type == IR2_CF || /* CF/MEM must be alone */ - (is_export(instr) && export == SQ_MEMORY)) - break; - /* it works because IR2_CF is always at end of block - * and somewhat same idea with MEM exports, which might not be alone - * but will end up in-order at least - */ - - /* check if dependencies are satisfied */ - bool is_ok = true; - ir2_foreach_src(src, instr) { - if (src->type == IR2_SRC_REG) { - /* need to check if all previous instructions in the block - * which write the reg have been emitted - * slow.. - * XXX: check components instead of whole register - */ - struct ir2_reg *reg = get_reg_src(ctx, src); - ir2_foreach_instr(p, ctx) { - if (!p->is_ssa && p->reg == reg && p->idx < instr->idx) - is_ok &= !p->need_emit; - } - } else if (src->type == IR2_SRC_SSA) { - /* in this case its easy, just check need_emit */ - is_ok &= !ctx->instr[src->num].need_emit; - } - } - /* don't reorder non-ssa write before read */ - if (!instr->is_ssa) { - ir2_foreach_instr(p, ctx) { - if (!p->need_emit || p->idx >= instr->idx) - continue; - - ir2_foreach_src(src, p) { - if (get_reg_src(ctx, src) == instr->reg) - is_ok = false; - } - } - } - /* don't reorder across predicates */ - if (avail_count && instr->pred != avail[0]->pred) - is_ok = false; - - if (!is_ok) - continue; - - avail[avail_count++] = instr; - } - - if (!avail_count) { - assert(block_idx == -1); - return -1; - } - - /* priority to FETCH instructions */ - ir2_foreach_avail(instr) { - if (instr->type == IR2_ALU) - continue; - - ra_src_free(ctx, instr); - ra_reg(ctx, get_reg(instr), -1, false, 0); - - instr->need_emit = false; - sched->instr = instr; - sched->instr_s = NULL; - return block_idx; - } - - /* TODO precompute priorities */ - - unsigned prio_v = ~0u, prio_s = ~0u, prio; - ir2_foreach_avail(instr) { - prio = alu_vector_prio(instr); - if (prio < prio_v) { - instr_v = instr; - prio_v = prio; - } - } - - /* TODO can still insert scalar if src_count=3, if smart about it */ - if (!instr_v || instr_v->src_count < 3) { - ir2_foreach_avail(instr) { - bool compat = is_alu_compatible(instr_v, instr); - - prio = alu_scalar_prio(instr); - if (prio >= prio_v && !compat) - continue; - - if (prio < prio_s) { - instr_s = instr; - prio_s = prio; - if (!compat) - instr_v = NULL; - } - } - } - - assert(instr_v || instr_s); - - /* now, we try more complex insertion of vector instruction as scalar - * TODO: if we are smart we can still insert if instr_v->src_count==3 - */ - if (!instr_s && instr_v->src_count < 3) { - ir2_foreach_avail(instr) { - if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr)) - continue; - - /* at this point, src_count should always be 2 */ - assert(instr->src_count == 2); - - if (scalarize_case1(ctx, instr, 0)) { - instr_s = instr; - break; - } - if (scalarize_case1(ctx, instr, 1)) { - instr_s = instr; - break; - } - } - } - - /* free src registers */ - if (instr_v) { - instr_v->need_emit = false; - ra_src_free(ctx, instr_v); - } - - if (instr_s) { - instr_s->need_emit = false; - ra_src_free(ctx, instr_s); - } - - /* allocate dst registers */ - if 
(instr_v) - ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask); - - if (instr_s) - ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask); - - sched->instr = instr_v; - sched->instr_s = instr_s; - return block_idx; + struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL; + unsigned avail_count = 0; + + instr_alloc_type_t export = ~0u; + int block_idx = -1; + + /* XXX merge this loop with the other one somehow? */ + ir2_foreach_instr(instr, ctx) + { + if (!instr->need_emit) + continue; + if (is_export(instr)) + export = MIN2(export, export_buf(instr->alu.export)); + } + + ir2_foreach_instr(instr, ctx) + { + if (!instr->need_emit) + continue; + + /* dont mix exports */ + if (is_export(instr) && export_buf(instr->alu.export) != export) + continue; + + if (block_idx < 0) + block_idx = instr->block_idx; + else if (block_idx != instr->block_idx || /* must be same block */ + instr->type == IR2_CF || /* CF/MEM must be alone */ + (is_export(instr) && export == SQ_MEMORY)) + break; + /* it works because IR2_CF is always at end of block + * and somewhat same idea with MEM exports, which might not be alone + * but will end up in-order at least + */ + + /* check if dependencies are satisfied */ + bool is_ok = true; + ir2_foreach_src(src, instr) + { + if (src->type == IR2_SRC_REG) { + /* need to check if all previous instructions in the block + * which write the reg have been emitted + * slow.. + * XXX: check components instead of whole register + */ + struct ir2_reg *reg = get_reg_src(ctx, src); + ir2_foreach_instr(p, ctx) + { + if (!p->is_ssa && p->reg == reg && p->idx < instr->idx) + is_ok &= !p->need_emit; + } + } else if (src->type == IR2_SRC_SSA) { + /* in this case its easy, just check need_emit */ + is_ok &= !ctx->instr[src->num].need_emit; + } + } + /* don't reorder non-ssa write before read */ + if (!instr->is_ssa) { + ir2_foreach_instr(p, ctx) + { + if (!p->need_emit || p->idx >= instr->idx) + continue; + + ir2_foreach_src(src, p) + { + if (get_reg_src(ctx, src) == instr->reg) + is_ok = false; + } + } + } + /* don't reorder across predicates */ + if (avail_count && instr->pred != avail[0]->pred) + is_ok = false; + + if (!is_ok) + continue; + + avail[avail_count++] = instr; + } + + if (!avail_count) { + assert(block_idx == -1); + return -1; + } + + /* priority to FETCH instructions */ + ir2_foreach_avail(instr) + { + if (instr->type == IR2_ALU) + continue; + + ra_src_free(ctx, instr); + ra_reg(ctx, get_reg(instr), -1, false, 0); + + instr->need_emit = false; + sched->instr = instr; + sched->instr_s = NULL; + return block_idx; + } + + /* TODO precompute priorities */ + + unsigned prio_v = ~0u, prio_s = ~0u, prio; + ir2_foreach_avail(instr) + { + prio = alu_vector_prio(instr); + if (prio < prio_v) { + instr_v = instr; + prio_v = prio; + } + } + + /* TODO can still insert scalar if src_count=3, if smart about it */ + if (!instr_v || instr_v->src_count < 3) { + ir2_foreach_avail(instr) + { + bool compat = is_alu_compatible(instr_v, instr); + + prio = alu_scalar_prio(instr); + if (prio >= prio_v && !compat) + continue; + + if (prio < prio_s) { + instr_s = instr; + prio_s = prio; + if (!compat) + instr_v = NULL; + } + } + } + + assert(instr_v || instr_s); + + /* now, we try more complex insertion of vector instruction as scalar + * TODO: if we are smart we can still insert if instr_v->src_count==3 + */ + if (!instr_s && instr_v->src_count < 3) { + ir2_foreach_avail(instr) + { + if (!is_alu_compatible(instr_v, instr) || 
!scalar_possible(instr)) + continue; + + /* at this point, src_count should always be 2 */ + assert(instr->src_count == 2); + + if (scalarize_case1(ctx, instr, 0)) { + instr_s = instr; + break; + } + if (scalarize_case1(ctx, instr, 1)) { + instr_s = instr; + break; + } + } + } + + /* free src registers */ + if (instr_v) { + instr_v->need_emit = false; + ra_src_free(ctx, instr_v); + } + + if (instr_s) { + instr_s->need_emit = false; + ra_src_free(ctx, instr_s); + } + + /* allocate dst registers */ + if (instr_v) + ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), + instr_v->alu.write_mask); + + if (instr_s) + ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), + instr_s->alu.write_mask); + + sched->instr = instr_v; + sched->instr_s = instr_s; + return block_idx; } /* scheduling: determine order of instructions */ -static void schedule_instrs(struct ir2_context *ctx) +static void +schedule_instrs(struct ir2_context *ctx) { - struct ir2_sched_instr *sched; - int block_idx; - - /* allocate input registers */ - for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++) - if (ctx->input[idx].initialized) - ra_reg(ctx, &ctx->input[idx], idx, false, 0); - - for (;;) { - sched = &ctx->instr_sched[ctx->instr_sched_count++]; - block_idx = sched_next(ctx, sched); - if (block_idx < 0) - break; - memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state)); - - /* catch texture fetch after scheduling and insert the - * SET_TEX_LOD right before it if necessary - * TODO clean this up - */ - struct ir2_instr *instr = sched->instr, *tex_lod; - if (instr && instr->type == IR2_FETCH && - instr->fetch.opc == TEX_FETCH && instr->src_count == 2) { - /* generate the SET_LOD instruction */ - tex_lod = &ctx->instr[ctx->instr_count++]; - tex_lod->type = IR2_FETCH; - tex_lod->block_idx = instr->block_idx; - tex_lod->pred = instr->pred; - tex_lod->fetch.opc = TEX_SET_TEX_LOD; - tex_lod->src[0] = instr->src[1]; - tex_lod->src_count = 1; - - sched[1] = sched[0]; - sched->instr = tex_lod; - ctx->instr_sched_count++; - } - - bool free_block = true; - ir2_foreach_instr(instr, ctx) - free_block &= instr->block_idx != block_idx; - if (free_block) - ra_block_free(ctx, block_idx); - }; - ctx->instr_sched_count--; + struct ir2_sched_instr *sched; + int block_idx; + + /* allocate input registers */ + for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++) + if (ctx->input[idx].initialized) + ra_reg(ctx, &ctx->input[idx], idx, false, 0); + + for (;;) { + sched = &ctx->instr_sched[ctx->instr_sched_count++]; + block_idx = sched_next(ctx, sched); + if (block_idx < 0) + break; + memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state)); + + /* catch texture fetch after scheduling and insert the + * SET_TEX_LOD right before it if necessary + * TODO clean this up + */ + struct ir2_instr *instr = sched->instr, *tex_lod; + if (instr && instr->type == IR2_FETCH && instr->fetch.opc == TEX_FETCH && + instr->src_count == 2) { + /* generate the SET_LOD instruction */ + tex_lod = &ctx->instr[ctx->instr_count++]; + tex_lod->type = IR2_FETCH; + tex_lod->block_idx = instr->block_idx; + tex_lod->pred = instr->pred; + tex_lod->fetch.opc = TEX_SET_TEX_LOD; + tex_lod->src[0] = instr->src[1]; + tex_lod->src_count = 1; + + sched[1] = sched[0]; + sched->instr = tex_lod; + ctx->instr_sched_count++; + } + + bool free_block = true; + ir2_foreach_instr(instr, ctx) free_block &= instr->block_idx != block_idx; + if (free_block) + ra_block_free(ctx, block_idx); + }; + ctx->instr_sched_count--; } void ir2_compile(struct fd2_shader_stateobj 
*so, unsigned variant, - struct fd2_shader_stateobj *fp) + struct fd2_shader_stateobj *fp) { - struct ir2_context ctx = { }; - bool binning = !fp && so->type == MESA_SHADER_VERTEX; + struct ir2_context ctx = {}; + bool binning = !fp && so->type == MESA_SHADER_VERTEX; - if (fp) - so->variant[variant].f = fp->variant[0].f; + if (fp) + so->variant[variant].f = fp->variant[0].f; - ctx.so = so; - ctx.info = &so->variant[variant].info; - ctx.f = &so->variant[variant].f; - ctx.info->max_reg = -1; + ctx.so = so; + ctx.info = &so->variant[variant].info; + ctx.f = &so->variant[variant].f; + ctx.info->max_reg = -1; - /* convert nir to internal representation */ - ir2_nir_compile(&ctx, binning); + /* convert nir to internal representation */ + ir2_nir_compile(&ctx, binning); - /* copy propagate srcs */ - cp_src(&ctx); + /* copy propagate srcs */ + cp_src(&ctx); - /* get ref_counts and kill non-needed instructions */ - ra_count_refs(&ctx); + /* get ref_counts and kill non-needed instructions */ + ra_count_refs(&ctx); - /* remove movs used to write outputs */ - cp_export(&ctx); + /* remove movs used to write outputs */ + cp_export(&ctx); - /* instruction order.. and vector->scalar conversions */ - schedule_instrs(&ctx); + /* instruction order.. and vector->scalar conversions */ + schedule_instrs(&ctx); - /* finally, assemble to bitcode */ - assemble(&ctx, binning); + /* finally, assemble to bitcode */ + assemble(&ctx, binning); } diff --git a/src/gallium/drivers/freedreno/a2xx/ir2.h b/src/gallium/drivers/freedreno/a2xx/ir2.h index 706dc94..11bf56e 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2.h +++ b/src/gallium/drivers/freedreno/a2xx/ir2.h @@ -31,66 +31,66 @@ #include "pipe/p_context.h" struct ir2_fetch_info { - /* dword offset of the fetch instruction */ - uint16_t offset; - union { - /* swizzle to merge with tgsi swizzle */ - struct { - uint16_t dst_swiz; - } vtx; - /* sampler id to patch const_idx */ - struct { - uint16_t samp_id; - uint8_t src_swiz; - } tex; - }; + /* dword offset of the fetch instruction */ + uint16_t offset; + union { + /* swizzle to merge with tgsi swizzle */ + struct { + uint16_t dst_swiz; + } vtx; + /* sampler id to patch const_idx */ + struct { + uint16_t samp_id; + uint8_t src_swiz; + } tex; + }; }; struct ir2_shader_info { - /* compiler shader */ - uint32_t *dwords; + /* compiler shader */ + uint32_t *dwords; - /* size of the compiled shader in dwords */ - uint16_t sizedwords; + /* size of the compiled shader in dwords */ + uint16_t sizedwords; - /* highest GPR # used by shader */ - int8_t max_reg; + /* highest GPR # used by shader */ + int8_t max_reg; - /* offset in dwords of first MEMORY export CF (for a20x hw binning) */ - int16_t mem_export_ptr; + /* offset in dwords of first MEMORY export CF (for a20x hw binning) */ + int16_t mem_export_ptr; - /* fetch instruction info for patching */ - uint16_t num_fetch_instrs; - struct ir2_fetch_info fetch_info[64]; + /* fetch instruction info for patching */ + uint16_t num_fetch_instrs; + struct ir2_fetch_info fetch_info[64]; }; struct ir2_frag_linkage { - unsigned inputs_count; - struct { - uint8_t slot; - uint8_t ncomp; - } inputs[16]; - - /* driver_location of fragcoord.zw, -1 if not used */ - int fragcoord; + unsigned inputs_count; + struct { + uint8_t slot; + uint8_t ncomp; + } inputs[16]; + + /* driver_location of fragcoord.zw, -1 if not used */ + int fragcoord; }; struct ir2_shader_variant { - struct ir2_shader_info info; - struct ir2_frag_linkage f; + struct ir2_shader_info info; + struct ir2_frag_linkage f; }; struct 
fd2_shader_stateobj; struct tgsi_token; void ir2_compile(struct fd2_shader_stateobj *so, unsigned variant, - struct fd2_shader_stateobj *fp); + struct fd2_shader_stateobj *fp); struct nir_shader *ir2_tgsi_to_nir(const struct tgsi_token *tokens, - struct pipe_screen *screen); + struct pipe_screen *screen); const nir_shader_compiler_options *ir2_get_compiler_options(void); int ir2_optimize_nir(nir_shader *s, bool lower); -#endif /* IR2_H_ */ +#endif /* IR2_H_ */ diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c b/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c index 6d2297c..2f96af5 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c @@ -29,22 +29,22 @@ static unsigned src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) { - struct ir2_reg_component *comps; - unsigned swiz = 0; - - switch (src->type) { - case IR2_SRC_SSA: - case IR2_SRC_REG: - break; - default: - return src->swizzle; - } - /* we need to take into account where the components were allocated */ - comps = get_reg_src(ctx, src)->comp; - for (int i = 0; i < ncomp; i++) { - swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i); - } - return swiz; + struct ir2_reg_component *comps; + unsigned swiz = 0; + + switch (src->type) { + case IR2_SRC_SSA: + case IR2_SRC_REG: + break; + default: + return src->swizzle; + } + /* we need to take into account where the components were allocated */ + comps = get_reg_src(ctx, src)->comp; + for (int i = 0; i < ncomp; i++) { + swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i); + } + return swiz; } /* alu instr need to take into how the output components are allocated */ @@ -54,46 +54,47 @@ src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) static unsigned alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg) { - /* hardware seems to take from W, but swizzle everywhere just in case */ - return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX); + /* hardware seems to take from W, but swizzle everywhere just in case */ + return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX); } static unsigned -alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src) +alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, + struct ir2_src *src) { - struct ir2_reg_component *comp = get_reg(instr)->comp; - unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr)); - unsigned swiz = 0; - - /* non per component special cases */ - switch (instr->alu.vector_opc) { - case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv: - return alu_swizzle_scalar(ctx, src); - case DOT2ADDv: - case DOT3v: - case DOT4v: - case CUBEv: - return swiz0; - default: - break; - } - - for (int i = 0, j = 0; i < dst_ncomp(instr); j++) { - if (instr->alu.write_mask & 1 << j) { - if (comp[j].c != 7) - swiz |= swiz_set(i, comp[j].c); - i++; - } - } - return swiz_merge(swiz0, swiz); + struct ir2_reg_component *comp = get_reg(instr)->comp; + unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr)); + unsigned swiz = 0; + + /* non per component special cases */ + switch (instr->alu.vector_opc) { + case PRED_SETE_PUSHv ... 
PRED_SETGTE_PUSHv: + return alu_swizzle_scalar(ctx, src); + case DOT2ADDv: + case DOT3v: + case DOT4v: + case CUBEv: + return swiz0; + default: + break; + } + + for (int i = 0, j = 0; i < dst_ncomp(instr); j++) { + if (instr->alu.write_mask & 1 << j) { + if (comp[j].c != 7) + swiz |= swiz_set(i, comp[j].c); + i++; + } + } + return swiz_merge(swiz0, swiz); } static unsigned alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1) { - /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */ - unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0); - return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY); + /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */ + unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0); + return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY); } /* write_mask needs to be transformed by allocation information */ @@ -101,15 +102,15 @@ alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1) static unsigned alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr) { - struct ir2_reg_component *comp = get_reg(instr)->comp; - unsigned write_mask = 0; + struct ir2_reg_component *comp = get_reg(instr)->comp; + unsigned write_mask = 0; - for (int i = 0; i < 4; i++) { - if (instr->alu.write_mask & 1 << i) - write_mask |= 1 << comp[i].c; - } + for (int i = 0; i < 4; i++) { + if (instr->alu.write_mask & 1 << i) + write_mask |= 1 << comp[i].c; + } - return write_mask; + return write_mask; } /* fetch instructions can swizzle dest, but src swizzle needs conversion */ @@ -117,432 +118,436 @@ alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr) static unsigned fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp) { - unsigned alu_swiz = src_swizzle(ctx, src, ncomp); - unsigned swiz = 0; - for (int i = 0; i < ncomp; i++) - swiz |= swiz_get(alu_swiz, i) << i * 2; - return swiz; + unsigned alu_swiz = src_swizzle(ctx, src, ncomp); + unsigned swiz = 0; + for (int i = 0; i < ncomp; i++) + swiz |= swiz_get(alu_swiz, i) << i * 2; + return swiz; } static unsigned fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr) { - struct ir2_reg_component *comp = get_reg(instr)->comp; - unsigned dst_swiz = 0xfff; - for (int i = 0; i < dst_ncomp(instr); i++) { - dst_swiz &= ~(7 << comp[i].c * 3); - dst_swiz |= i << comp[i].c * 3; - } - return dst_swiz; + struct ir2_reg_component *comp = get_reg(instr)->comp; + unsigned dst_swiz = 0xfff; + for (int i = 0; i < dst_ncomp(instr); i++) { + dst_swiz &= ~(7 << comp[i].c * 3); + dst_swiz |= i << comp[i].c * 3; + } + return dst_swiz; } /* register / export # for instr */ static unsigned dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr) { - if (is_export(instr)) - return instr->alu.export; + if (is_export(instr)) + return instr->alu.export; - return get_reg(instr)->idx; + return get_reg(instr)->idx; } /* register # for src */ -static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src) +static unsigned +src_to_reg(struct ir2_context *ctx, struct ir2_src *src) { - return get_reg_src(ctx, src)->idx; + return get_reg_src(ctx, src)->idx; } -static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src) +static unsigned +src_reg_byte(struct ir2_context *ctx, struct ir2_src *src) { - if (src->type == IR2_SRC_CONST) { - assert(!src->abs); /* no abs bit for const */ - return src->num; - } - return src_to_reg(ctx, src) | (src->abs ? 
0x80 : 0); + if (src->type == IR2_SRC_CONST) { + assert(!src->abs); /* no abs bit for const */ + return src->num; + } + return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0); } /* produce the 12 byte binary instruction for a given sched_instr */ static void -fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, - instr_t *bc, bool * is_fetch) +fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched, instr_t *bc, + bool *is_fetch) { - struct ir2_instr *instr = sched->instr, *instr_s, *instr_v; - - *bc = (instr_t) {}; - - if (instr && instr->type == IR2_FETCH) { - *is_fetch = true; - - bc->fetch.opc = instr->fetch.opc; - bc->fetch.pred_select = !!instr->pred; - bc->fetch.pred_condition = instr->pred & 1; - - struct ir2_src *src = instr->src; - - if (instr->fetch.opc == VTX_FETCH) { - instr_fetch_vtx_t *vtx = &bc->fetch.vtx; - - assert(instr->fetch.vtx.const_idx <= 0x1f); - assert(instr->fetch.vtx.const_idx_sel <= 0x3); - - vtx->src_reg = src_to_reg(ctx, src); - vtx->src_swiz = fetch_swizzle(ctx, src, 1); - vtx->dst_reg = dst_to_reg(ctx, instr); - vtx->dst_swiz = fetch_dst_swiz(ctx, instr); - - vtx->must_be_one = 1; - vtx->const_index = instr->fetch.vtx.const_idx; - vtx->const_index_sel = instr->fetch.vtx.const_idx_sel; - - /* other fields will be patched */ - - /* XXX seems like every FETCH but the first has - * this bit set: - */ - vtx->reserved3 = instr->idx ? 0x1 : 0x0; - vtx->reserved0 = instr->idx ? 0x2 : 0x3; - } else if (instr->fetch.opc == TEX_FETCH) { - instr_fetch_tex_t *tex = &bc->fetch.tex; - - tex->src_reg = src_to_reg(ctx, src); - tex->src_swiz = fetch_swizzle(ctx, src, 3); - tex->dst_reg = dst_to_reg(ctx, instr); - tex->dst_swiz = fetch_dst_swiz(ctx, instr); - /* tex->const_idx = patch_fetches */ - tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; - tex->min_filter = TEX_FILTER_USE_FETCH_CONST; - tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; - tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; - tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; - tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; - tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; - tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT; - tex->use_reg_lod = instr->src_count == 2; - tex->sample_location = SAMPLE_CENTER; - tex->tx_coord_denorm = instr->fetch.tex.is_rect; - } else if (instr->fetch.opc == TEX_SET_TEX_LOD) { - instr_fetch_tex_t *tex = &bc->fetch.tex; - - tex->src_reg = src_to_reg(ctx, src); - tex->src_swiz = fetch_swizzle(ctx, src, 1); - tex->dst_reg = 0; - tex->dst_swiz = 0xfff; - - tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; - tex->min_filter = TEX_FILTER_USE_FETCH_CONST; - tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; - tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; - tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; - tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; - tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; - tex->use_comp_lod = 1; - tex->use_reg_lod = 0; - tex->sample_location = SAMPLE_CENTER; - } else { - assert(0); - } - return; - } - - instr_v = sched->instr; - instr_s = sched->instr_s; - - if (instr_v) { - struct ir2_src src1, src2, *src3; - - src1 = instr_v->src[0]; - src2 = instr_v->src[instr_v->src_count > 1]; - src3 = instr_v->src_count == 3 ? 
&instr_v->src[2] : NULL; - - bc->alu.vector_opc = instr_v->alu.vector_opc; - bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v); - bc->alu.vector_dest = dst_to_reg(ctx, instr_v); - bc->alu.vector_clamp = instr_v->alu.saturate; - bc->alu.export_data = instr_v->alu.export >= 0; - - /* single operand SETEv, use 0.0f as src2 */ - if (instr_v->src_count == 1 && - (bc->alu.vector_opc == SETEv || - bc->alu.vector_opc == SETNEv || - bc->alu.vector_opc == SETGTv || - bc->alu.vector_opc == SETGTEv)) - src2 = ir2_zero(ctx); - - /* export32 instr for a20x hw binning has this bit set.. - * it seems to do more than change the base address of constants - * XXX this is a hack - */ - bc->alu.relative_addr = - (bc->alu.export_data && bc->alu.vector_dest == 32); - - bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1); - bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1); - bc->alu.src1_reg_negate = src1.negate; - bc->alu.src1_sel = src1.type != IR2_SRC_CONST; - - bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2); - bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2); - bc->alu.src2_reg_negate = src2.negate; - bc->alu.src2_sel = src2.type != IR2_SRC_CONST; - - if (src3) { - bc->alu.src3_reg_byte = src_reg_byte(ctx, src3); - bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3); - bc->alu.src3_reg_negate = src3->negate; - bc->alu.src3_sel = src3->type != IR2_SRC_CONST; - } - - bc->alu.pred_select = instr_v->pred; - } - - if (instr_s) { - struct ir2_src *src = instr_s->src; - - bc->alu.scalar_opc = instr_s->alu.scalar_opc; - bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s); - bc->alu.scalar_dest = dst_to_reg(ctx, instr_s); - bc->alu.scalar_clamp = instr_s->alu.saturate; - bc->alu.export_data = instr_s->alu.export >= 0; - - if (instr_s->src_count == 1) { - bc->alu.src3_reg_byte = src_reg_byte(ctx, src); - bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src); - bc->alu.src3_reg_negate = src->negate; - bc->alu.src3_sel = src->type != IR2_SRC_CONST; - } else { - assert(instr_s->src_count == 2); - - bc->alu.src3_reg_byte = src_reg_byte(ctx, src); - bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle); - bc->alu.src3_reg_negate = src->negate; - bc->alu.src3_sel = src->type != IR2_SRC_CONST;; - } - - if (instr_v) - assert(instr_s->pred == instr_v->pred); - bc->alu.pred_select = instr_s->pred; - } - - *is_fetch = false; - return; + struct ir2_instr *instr = sched->instr, *instr_s, *instr_v; + + *bc = (instr_t){}; + + if (instr && instr->type == IR2_FETCH) { + *is_fetch = true; + + bc->fetch.opc = instr->fetch.opc; + bc->fetch.pred_select = !!instr->pred; + bc->fetch.pred_condition = instr->pred & 1; + + struct ir2_src *src = instr->src; + + if (instr->fetch.opc == VTX_FETCH) { + instr_fetch_vtx_t *vtx = &bc->fetch.vtx; + + assert(instr->fetch.vtx.const_idx <= 0x1f); + assert(instr->fetch.vtx.const_idx_sel <= 0x3); + + vtx->src_reg = src_to_reg(ctx, src); + vtx->src_swiz = fetch_swizzle(ctx, src, 1); + vtx->dst_reg = dst_to_reg(ctx, instr); + vtx->dst_swiz = fetch_dst_swiz(ctx, instr); + + vtx->must_be_one = 1; + vtx->const_index = instr->fetch.vtx.const_idx; + vtx->const_index_sel = instr->fetch.vtx.const_idx_sel; + + /* other fields will be patched */ + + /* XXX seems like every FETCH but the first has + * this bit set: + */ + vtx->reserved3 = instr->idx ? 0x1 : 0x0; + vtx->reserved0 = instr->idx ? 
0x2 : 0x3; + } else if (instr->fetch.opc == TEX_FETCH) { + instr_fetch_tex_t *tex = &bc->fetch.tex; + + tex->src_reg = src_to_reg(ctx, src); + tex->src_swiz = fetch_swizzle(ctx, src, 3); + tex->dst_reg = dst_to_reg(ctx, instr); + tex->dst_swiz = fetch_dst_swiz(ctx, instr); + /* tex->const_idx = patch_fetches */ + tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; + tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; + tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; + tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT; + tex->use_reg_lod = instr->src_count == 2; + tex->sample_location = SAMPLE_CENTER; + tex->tx_coord_denorm = instr->fetch.tex.is_rect; + } else if (instr->fetch.opc == TEX_SET_TEX_LOD) { + instr_fetch_tex_t *tex = &bc->fetch.tex; + + tex->src_reg = src_to_reg(ctx, src); + tex->src_swiz = fetch_swizzle(ctx, src, 1); + tex->dst_reg = 0; + tex->dst_swiz = 0xfff; + + tex->mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->mip_filter = TEX_FILTER_USE_FETCH_CONST; + tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST; + tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST; + tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST; + tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST; + tex->use_comp_lod = 1; + tex->use_reg_lod = 0; + tex->sample_location = SAMPLE_CENTER; + } else { + assert(0); + } + return; + } + + instr_v = sched->instr; + instr_s = sched->instr_s; + + if (instr_v) { + struct ir2_src src1, src2, *src3; + + src1 = instr_v->src[0]; + src2 = instr_v->src[instr_v->src_count > 1]; + src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL; + + bc->alu.vector_opc = instr_v->alu.vector_opc; + bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v); + bc->alu.vector_dest = dst_to_reg(ctx, instr_v); + bc->alu.vector_clamp = instr_v->alu.saturate; + bc->alu.export_data = instr_v->alu.export >= 0; + + /* single operand SETEv, use 0.0f as src2 */ + if (instr_v->src_count == 1 && + (bc->alu.vector_opc == SETEv || bc->alu.vector_opc == SETNEv || + bc->alu.vector_opc == SETGTv || bc->alu.vector_opc == SETGTEv)) + src2 = ir2_zero(ctx); + + /* export32 instr for a20x hw binning has this bit set.. 
+ * it seems to do more than change the base address of constants + * XXX this is a hack + */ + bc->alu.relative_addr = + (bc->alu.export_data && bc->alu.vector_dest == 32); + + bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1); + bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1); + bc->alu.src1_reg_negate = src1.negate; + bc->alu.src1_sel = src1.type != IR2_SRC_CONST; + + bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2); + bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2); + bc->alu.src2_reg_negate = src2.negate; + bc->alu.src2_sel = src2.type != IR2_SRC_CONST; + + if (src3) { + bc->alu.src3_reg_byte = src_reg_byte(ctx, src3); + bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3); + bc->alu.src3_reg_negate = src3->negate; + bc->alu.src3_sel = src3->type != IR2_SRC_CONST; + } + + bc->alu.pred_select = instr_v->pred; + } + + if (instr_s) { + struct ir2_src *src = instr_s->src; + + bc->alu.scalar_opc = instr_s->alu.scalar_opc; + bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s); + bc->alu.scalar_dest = dst_to_reg(ctx, instr_s); + bc->alu.scalar_clamp = instr_s->alu.saturate; + bc->alu.export_data = instr_s->alu.export >= 0; + + if (instr_s->src_count == 1) { + bc->alu.src3_reg_byte = src_reg_byte(ctx, src); + bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src); + bc->alu.src3_reg_negate = src->negate; + bc->alu.src3_sel = src->type != IR2_SRC_CONST; + } else { + assert(instr_s->src_count == 2); + + bc->alu.src3_reg_byte = src_reg_byte(ctx, src); + bc->alu.src3_swiz = + alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle); + bc->alu.src3_reg_negate = src->negate; + bc->alu.src3_sel = src->type != IR2_SRC_CONST; + ; + } + + if (instr_v) + assert(instr_s->pred == instr_v->pred); + bc->alu.pred_select = instr_s->pred; + } + + *is_fetch = false; + return; } static unsigned -write_cfs(struct ir2_context *ctx, instr_cf_t * cfs, unsigned cf_idx, - instr_cf_alloc_t *alloc, instr_cf_exec_t *exec) +write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx, + instr_cf_alloc_t *alloc, instr_cf_exec_t *exec) { - assert(exec->count); + assert(exec->count); - if (alloc) - cfs[cf_idx++].alloc = *alloc; + if (alloc) + cfs[cf_idx++].alloc = *alloc; - /* for memory alloc offset for patching */ - if (alloc && alloc->buffer_select == SQ_MEMORY && - ctx->info->mem_export_ptr == -1) - ctx->info->mem_export_ptr = cf_idx / 2 * 3; + /* for memory alloc offset for patching */ + if (alloc && alloc->buffer_select == SQ_MEMORY && + ctx->info->mem_export_ptr == -1) + ctx->info->mem_export_ptr = cf_idx / 2 * 3; - cfs[cf_idx++].exec = *exec; - exec->address += exec->count; - exec->serialize = 0; - exec->count = 0; + cfs[cf_idx++].exec = *exec; + exec->address += exec->count; + exec->serialize = 0; + exec->count = 0; - return cf_idx; + return cf_idx; } /* assemble the final shader */ -void assemble(struct ir2_context *ctx, bool binning) +void +assemble(struct ir2_context *ctx, bool binning) { - /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384) - * address is 9 bits so could it be 512 ? 
- */ - instr_cf_t cfs[384]; - instr_t bytecode[384], bc; - unsigned block_addr[128]; - unsigned num_cf = 0; - - /* CF instr state */ - instr_cf_exec_t exec = {.opc = EXEC}; - instr_cf_alloc_t alloc = {.opc = ALLOC}; - - int sync_id, sync_id_prev = -1; - bool is_fetch = false; - bool need_sync = true; - bool need_alloc = false; - unsigned block_idx = 0; - - ctx->info->mem_export_ptr = -1; - ctx->info->num_fetch_instrs = 0; - - /* vertex shader always needs to allocate at least one parameter - * if it will never happen, - */ - if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) { - alloc.buffer_select = SQ_PARAMETER_PIXEL; - cfs[num_cf++].alloc = alloc; - } - - block_addr[0] = 0; - - for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) { - struct ir2_instr *instr = ctx->instr_sched[j].instr; - - /* catch IR2_CF since it isn't a regular instruction */ - if (instr && instr->type == IR2_CF) { - assert(!need_alloc); /* XXX */ - - /* flush any exec cf before inserting jmp */ - if (exec.count) - num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec); - - cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) { - .opc = COND_JMP, - .address = instr->cf.block_idx, /* will be fixed later */ - .force_call = !instr->pred, - .predicated_jmp = 1, - .direction = instr->cf.block_idx > instr->block_idx, - .condition = instr->pred & 1, - }; - continue; - } - - /* fill the 3 dwords for the instruction */ - fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch); - - /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */ - sync_id = 0; - if (is_fetch) - sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2; - - need_sync = sync_id != sync_id_prev; - sync_id_prev = sync_id; - - unsigned block; - { - - if (ctx->instr_sched[j].instr) - block = ctx->instr_sched[j].instr->block_idx; - else - block = ctx->instr_sched[j].instr_s->block_idx; - - assert(block_idx <= block); - } - - /* info for patching */ - if (is_fetch) { - struct ir2_fetch_info *info = - &ctx->info->fetch_info[ctx->info->num_fetch_instrs++]; - info->offset = i * 3; /* add cf offset later */ - - if (bc.fetch.opc == VTX_FETCH) { - info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz; - } else if (bc.fetch.opc == TEX_FETCH) { - info->tex.samp_id = instr->fetch.tex.samp_id; - info->tex.src_swiz = bc.fetch.tex.src_swiz; - } else { - ctx->info->num_fetch_instrs--; - } - } - - /* exec cf after 6 instr or when switching between fetch / alu */ - if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) { - num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); - need_alloc = false; - } - - /* update block_addrs for jmp patching */ - while (block_idx < block) - block_addr[++block_idx] = num_cf; - - /* export - fill alloc cf */ - if (!is_fetch && bc.alu.export_data) { - /* get the export buffer from either vector/scalar dest */ - instr_alloc_type_t buffer = - export_buf(bc.alu.vector_dest); - if (bc.alu.scalar_write_mask) { - if (bc.alu.vector_write_mask) - assert(buffer == export_buf(bc.alu.scalar_dest)); - buffer = export_buf(bc.alu.scalar_dest); - } - - /* flush previous alloc if the buffer changes */ - bool need_new_alloc = buffer != alloc.buffer_select; - - /* memory export always in 32/33 pair, new alloc on 32 */ - if (bc.alu.vector_dest == 32) - need_new_alloc = true; - - if (need_new_alloc && exec.count) { - num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? 
&alloc : NULL, &exec); - need_alloc = false; - } - - need_alloc |= need_new_alloc; - - alloc.size = 0; - alloc.buffer_select = buffer; - - if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX) - alloc.size = ctx->f->inputs_count - 1; - - if (buffer == SQ_POSITION) - alloc.size = ctx->so->writes_psize; - } - - if (is_fetch) - exec.serialize |= 0x1 << exec.count * 2; - if (need_sync) - exec.serialize |= 0x2 << exec.count * 2; - - need_sync = false; - exec.count += 1; - bytecode[i++] = bc; - } - - /* final exec cf */ - exec.opc = EXEC_END; - num_cf = - write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); - - /* insert nop to get an even # of CFs */ - if (num_cf % 2) - cfs[num_cf++] = (instr_cf_t) { - .opc = NOP}; - - /* patch cf addrs */ - for (int idx = 0; idx < num_cf; idx++) { - switch (cfs[idx].opc) { - case NOP: - case ALLOC: - break; - case EXEC: - case EXEC_END: - cfs[idx].exec.address += num_cf / 2; - break; - case COND_JMP: - cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address]; - break; - default: - assert(0); - } - } - - /* concatenate cfs and alu/fetch */ - uint32_t cfdwords = num_cf / 2 * 3; - uint32_t alufetchdwords = exec.address * 3; - uint32_t sizedwords = cfdwords + alufetchdwords; - uint32_t *dwords = malloc(sizedwords * 4); - assert(dwords); - memcpy(dwords, cfs, cfdwords * 4); - memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4); - - /* finalize ir2_shader_info */ - ctx->info->dwords = dwords; - ctx->info->sizedwords = sizedwords; - for (int i = 0; i < ctx->info->num_fetch_instrs; i++) - ctx->info->fetch_info[i].offset += cfdwords; - - if (FD_DBG(DISASM)) { - DBG("disassemble: type=%d", ctx->so->type); - disasm_a2xx(dwords, sizedwords, 0, ctx->so->type); - } + /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384) + * address is 9 bits so could it be 512 ? + */ + instr_cf_t cfs[384]; + instr_t bytecode[384], bc; + unsigned block_addr[128]; + unsigned num_cf = 0; + + /* CF instr state */ + instr_cf_exec_t exec = {.opc = EXEC}; + instr_cf_alloc_t alloc = {.opc = ALLOC}; + + int sync_id, sync_id_prev = -1; + bool is_fetch = false; + bool need_sync = true; + bool need_alloc = false; + unsigned block_idx = 0; + + ctx->info->mem_export_ptr = -1; + ctx->info->num_fetch_instrs = 0; + + /* vertex shader always needs to allocate at least one parameter + * if it will never happen, + */ + if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) { + alloc.buffer_select = SQ_PARAMETER_PIXEL; + cfs[num_cf++].alloc = alloc; + } + + block_addr[0] = 0; + + for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) { + struct ir2_instr *instr = ctx->instr_sched[j].instr; + + /* catch IR2_CF since it isn't a regular instruction */ + if (instr && instr->type == IR2_CF) { + assert(!need_alloc); /* XXX */ + + /* flush any exec cf before inserting jmp */ + if (exec.count) + num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec); + + cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t){ + .opc = COND_JMP, + .address = instr->cf.block_idx, /* will be fixed later */ + .force_call = !instr->pred, + .predicated_jmp = 1, + .direction = instr->cf.block_idx > instr->block_idx, + .condition = instr->pred & 1, + }; + continue; + } + + /* fill the 3 dwords for the instruction */ + fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch); + + /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */ + sync_id = 0; + if (is_fetch) + sync_id = bc.fetch.opc == VTX_FETCH ? 
1 : 2; + + need_sync = sync_id != sync_id_prev; + sync_id_prev = sync_id; + + unsigned block; + { + + if (ctx->instr_sched[j].instr) + block = ctx->instr_sched[j].instr->block_idx; + else + block = ctx->instr_sched[j].instr_s->block_idx; + + assert(block_idx <= block); + } + + /* info for patching */ + if (is_fetch) { + struct ir2_fetch_info *info = + &ctx->info->fetch_info[ctx->info->num_fetch_instrs++]; + info->offset = i * 3; /* add cf offset later */ + + if (bc.fetch.opc == VTX_FETCH) { + info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz; + } else if (bc.fetch.opc == TEX_FETCH) { + info->tex.samp_id = instr->fetch.tex.samp_id; + info->tex.src_swiz = bc.fetch.tex.src_swiz; + } else { + ctx->info->num_fetch_instrs--; + } + } + + /* exec cf after 6 instr or when switching between fetch / alu */ + if (exec.count == 6 || + (exec.count && (need_sync || block != block_idx))) { + num_cf = + write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); + need_alloc = false; + } + + /* update block_addrs for jmp patching */ + while (block_idx < block) + block_addr[++block_idx] = num_cf; + + /* export - fill alloc cf */ + if (!is_fetch && bc.alu.export_data) { + /* get the export buffer from either vector/scalar dest */ + instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest); + if (bc.alu.scalar_write_mask) { + if (bc.alu.vector_write_mask) + assert(buffer == export_buf(bc.alu.scalar_dest)); + buffer = export_buf(bc.alu.scalar_dest); + } + + /* flush previous alloc if the buffer changes */ + bool need_new_alloc = buffer != alloc.buffer_select; + + /* memory export always in 32/33 pair, new alloc on 32 */ + if (bc.alu.vector_dest == 32) + need_new_alloc = true; + + if (need_new_alloc && exec.count) { + num_cf = + write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec); + need_alloc = false; + } + + need_alloc |= need_new_alloc; + + alloc.size = 0; + alloc.buffer_select = buffer; + + if (buffer == SQ_PARAMETER_PIXEL && + ctx->so->type == MESA_SHADER_VERTEX) + alloc.size = ctx->f->inputs_count - 1; + + if (buffer == SQ_POSITION) + alloc.size = ctx->so->writes_psize; + } + + if (is_fetch) + exec.serialize |= 0x1 << exec.count * 2; + if (need_sync) + exec.serialize |= 0x2 << exec.count * 2; + + need_sync = false; + exec.count += 1; + bytecode[i++] = bc; + } + + /* final exec cf */ + exec.opc = EXEC_END; + num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? 
&alloc : NULL, &exec); + + /* insert nop to get an even # of CFs */ + if (num_cf % 2) + cfs[num_cf++] = (instr_cf_t){.opc = NOP}; + + /* patch cf addrs */ + for (int idx = 0; idx < num_cf; idx++) { + switch (cfs[idx].opc) { + case NOP: + case ALLOC: + break; + case EXEC: + case EXEC_END: + cfs[idx].exec.address += num_cf / 2; + break; + case COND_JMP: + cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address]; + break; + default: + assert(0); + } + } + + /* concatenate cfs and alu/fetch */ + uint32_t cfdwords = num_cf / 2 * 3; + uint32_t alufetchdwords = exec.address * 3; + uint32_t sizedwords = cfdwords + alufetchdwords; + uint32_t *dwords = malloc(sizedwords * 4); + assert(dwords); + memcpy(dwords, cfs, cfdwords * 4); + memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4); + + /* finalize ir2_shader_info */ + ctx->info->dwords = dwords; + ctx->info->sizedwords = sizedwords; + for (int i = 0; i < ctx->info->num_fetch_instrs; i++) + ctx->info->fetch_info[i].offset += cfdwords; + + if (FD_DBG(DISASM)) { + DBG("disassemble: type=%d", ctx->so->type); + disasm_a2xx(dwords, sizedwords, 0, ctx->so->type); + } } diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_cp.c b/src/gallium/drivers/freedreno/a2xx/ir2_cp.c index 2b165b3..5057c2a 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_cp.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_cp.c @@ -26,20 +26,22 @@ #include "ir2_private.h" -static bool is_mov(struct ir2_instr *instr) +static bool +is_mov(struct ir2_instr *instr) { - return instr->type == IR2_ALU && instr->alu.vector_opc == MAXv && - instr->src_count == 1; + return instr->type == IR2_ALU && instr->alu.vector_opc == MAXv && + instr->src_count == 1; } -static void src_combine(struct ir2_src *src, struct ir2_src b) +static void +src_combine(struct ir2_src *src, struct ir2_src b) { - src->num = b.num; - src->type = b.type; - src->swizzle = swiz_merge(b.swizzle, src->swizzle); - if (!src->abs) /* if we have abs we don't care about previous negate */ - src->negate ^= b.negate; - src->abs |= b.abs; + src->num = b.num; + src->type = b.type; + src->swizzle = swiz_merge(b.swizzle, src->swizzle); + if (!src->abs) /* if we have abs we don't care about previous negate */ + src->negate ^= b.negate; + src->abs |= b.abs; } /* cp_src: replace src regs when they refer to a mov instruction @@ -49,37 +51,40 @@ static void src_combine(struct ir2_src *src, struct ir2_src b) * becomes: * ALU: MULADDv R7 = C7, R10, R0.xxxx */ -void cp_src(struct ir2_context *ctx) +void +cp_src(struct ir2_context *ctx) { - struct ir2_instr *p; - - ir2_foreach_instr(instr, ctx) { - ir2_foreach_src(src, instr) { - /* loop to replace recursively */ - do { - if (src->type != IR2_SRC_SSA) - break; - - p = &ctx->instr[src->num]; - /* don't work across blocks to avoid possible issues */ - if (p->block_idx != instr->block_idx) - break; - - if (!is_mov(p)) - break; - - if (p->alu.saturate) - break; - - /* cant apply abs to const src, const src only for alu */ - if (p->src[0].type == IR2_SRC_CONST && - (src->abs || instr->type != IR2_ALU)) - break; - - src_combine(src, p->src[0]); - } while (1); - } - } + struct ir2_instr *p; + + ir2_foreach_instr(instr, ctx) + { + ir2_foreach_src(src, instr) + { + /* loop to replace recursively */ + do { + if (src->type != IR2_SRC_SSA) + break; + + p = &ctx->instr[src->num]; + /* don't work across blocks to avoid possible issues */ + if (p->block_idx != instr->block_idx) + break; + + if (!is_mov(p)) + break; + + if (p->alu.saturate) + break; + + /* cant apply abs to const src, const src 
only for alu */ + if (p->src[0].type == IR2_SRC_CONST && + (src->abs || instr->type != IR2_ALU)) + break; + + src_combine(src, p->src[0]); + } while (1); + } + } } /* cp_export: replace mov to export when possible @@ -94,136 +99,138 @@ void cp_src(struct ir2_context *ctx) * ALU: MAXv export0.xyz_ = R0.xxx?, C8.xxx? * */ -void cp_export(struct ir2_context *ctx) +void +cp_export(struct ir2_context *ctx) { - struct ir2_instr *c[4], *ins[4]; - struct ir2_src *src; - struct ir2_reg *reg; - unsigned ncomp; - - ir2_foreach_instr(instr, ctx) { - if (!is_export(instr)) /* TODO */ - continue; - - if (!is_mov(instr)) - continue; - - src = &instr->src[0]; - - if (src->negate || src->abs) /* TODO handle these cases */ - continue; - - if (src->type == IR2_SRC_INPUT || src->type == IR2_SRC_CONST) - continue; - - reg = get_reg_src(ctx, src); - ncomp = dst_ncomp(instr); - - unsigned reswiz[4] = {}; - unsigned num_instr = 0; - - /* fill array c with pointers to instrs that write each component */ - if (src->type == IR2_SRC_SSA) { - struct ir2_instr *instr = &ctx->instr[src->num]; - - if (instr->type != IR2_ALU) - continue; - - for (int i = 0; i < ncomp; i++) - c[i] = instr; - - ins[num_instr++] = instr; - reswiz[0] = src->swizzle; - } else { - bool ok = true; - unsigned write_mask = 0; - - ir2_foreach_instr(instr, ctx) { - if (instr->is_ssa || instr->reg != reg) - continue; - - /* set by non-ALU */ - if (instr->type != IR2_ALU) { - ok = false; - break; - } - - /* component written more than once */ - if (write_mask & instr->alu.write_mask) { - ok = false; - break; - } - - write_mask |= instr->alu.write_mask; - - /* src pointers for components */ - for (int i = 0, j = 0; i < 4; i++) { - unsigned k = swiz_get(src->swizzle, i); - if (instr->alu.write_mask & 1 << k) { - c[i] = instr; - - /* reswiz = compressed src->swizzle */ - unsigned x = 0; - for (int i = 0; i < k; i++) - x += !!(instr->alu.write_mask & 1 << i); - - assert(src->swizzle || x == j); - reswiz[num_instr] |= swiz_set(x, j++); - } - } - ins[num_instr++] = instr; - } - if (!ok) - continue; - } - - bool redirect = true; - - /* must all be in same block */ - for (int i = 0; i < ncomp; i++) - redirect &= (c[i]->block_idx == instr->block_idx); - - /* no other instr using the value */ - ir2_foreach_instr(p, ctx) { - if (p == instr) - continue; - ir2_foreach_src(src, p) - redirect &= reg != get_reg_src(ctx, src); - } - - if (!redirect) - continue; - - /* redirect the instructions writing to the register */ - for (int i = 0; i < num_instr; i++) { - struct ir2_instr *p = ins[i]; - - p->alu.export = instr->alu.export; - p->alu.write_mask = 0; - p->is_ssa = true; - p->ssa.ncomp = 0; - memset(p->ssa.comp, 0, sizeof(p->ssa.comp)); - p->alu.saturate |= instr->alu.saturate; - - switch (p->alu.vector_opc) { - case PRED_SETE_PUSHv ... 
PRED_SETGTE_PUSHv: - case DOT2ADDv: - case DOT3v: - case DOT4v: - case CUBEv: - continue; - default: - break; - } - ir2_foreach_src(s, p) - swiz_merge_p(&s->swizzle, reswiz[i]); - } - - for (int i = 0; i < ncomp; i++) { - c[i]->alu.write_mask |= (1 << i); - c[i]->ssa.ncomp++; - } - instr->type = IR2_NONE; - instr->need_emit = false; - } + struct ir2_instr *c[4], *ins[4]; + struct ir2_src *src; + struct ir2_reg *reg; + unsigned ncomp; + + ir2_foreach_instr(instr, ctx) + { + if (!is_export(instr)) /* TODO */ + continue; + + if (!is_mov(instr)) + continue; + + src = &instr->src[0]; + + if (src->negate || src->abs) /* TODO handle these cases */ + continue; + + if (src->type == IR2_SRC_INPUT || src->type == IR2_SRC_CONST) + continue; + + reg = get_reg_src(ctx, src); + ncomp = dst_ncomp(instr); + + unsigned reswiz[4] = {}; + unsigned num_instr = 0; + + /* fill array c with pointers to instrs that write each component */ + if (src->type == IR2_SRC_SSA) { + struct ir2_instr *instr = &ctx->instr[src->num]; + + if (instr->type != IR2_ALU) + continue; + + for (int i = 0; i < ncomp; i++) + c[i] = instr; + + ins[num_instr++] = instr; + reswiz[0] = src->swizzle; + } else { + bool ok = true; + unsigned write_mask = 0; + + ir2_foreach_instr(instr, ctx) + { + if (instr->is_ssa || instr->reg != reg) + continue; + + /* set by non-ALU */ + if (instr->type != IR2_ALU) { + ok = false; + break; + } + + /* component written more than once */ + if (write_mask & instr->alu.write_mask) { + ok = false; + break; + } + + write_mask |= instr->alu.write_mask; + + /* src pointers for components */ + for (int i = 0, j = 0; i < 4; i++) { + unsigned k = swiz_get(src->swizzle, i); + if (instr->alu.write_mask & 1 << k) { + c[i] = instr; + + /* reswiz = compressed src->swizzle */ + unsigned x = 0; + for (int i = 0; i < k; i++) + x += !!(instr->alu.write_mask & 1 << i); + + assert(src->swizzle || x == j); + reswiz[num_instr] |= swiz_set(x, j++); + } + } + ins[num_instr++] = instr; + } + if (!ok) + continue; + } + + bool redirect = true; + + /* must all be in same block */ + for (int i = 0; i < ncomp; i++) + redirect &= (c[i]->block_idx == instr->block_idx); + + /* no other instr using the value */ + ir2_foreach_instr(p, ctx) + { + if (p == instr) + continue; + ir2_foreach_src(src, p) redirect &= reg != get_reg_src(ctx, src); + } + + if (!redirect) + continue; + + /* redirect the instructions writing to the register */ + for (int i = 0; i < num_instr; i++) { + struct ir2_instr *p = ins[i]; + + p->alu.export = instr->alu.export; + p->alu.write_mask = 0; + p->is_ssa = true; + p->ssa.ncomp = 0; + memset(p->ssa.comp, 0, sizeof(p->ssa.comp)); + p->alu.saturate |= instr->alu.saturate; + + switch (p->alu.vector_opc) { + case PRED_SETE_PUSHv ... 
PRED_SETGTE_PUSHv: + case DOT2ADDv: + case DOT3v: + case DOT4v: + case CUBEv: + continue; + default: + break; + } + ir2_foreach_src(s, p) swiz_merge_p(&s->swizzle, reswiz[i]); + } + + for (int i = 0; i < ncomp; i++) { + c[i]->alu.write_mask |= (1 << i); + c[i]->ssa.ncomp++; + } + instr->type = IR2_NONE; + instr->need_emit = false; + } } diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c index 68f9de5..b303df5 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c @@ -26,963 +26,967 @@ #include "ir2_private.h" -#include "freedreno_util.h" #include "fd2_program.h" +#include "freedreno_util.h" static const nir_shader_compiler_options options = { - .lower_fpow = true, - .lower_flrp32 = true, - .lower_fmod = true, - .lower_fdiv = true, - .lower_fceil = true, - .fuse_ffma16 = true, - .fuse_ffma32 = true, - .fuse_ffma64 = true, - /* .fdot_replicates = true, it is replicated, but it makes things worse */ - .lower_all_io_to_temps = true, - .vertex_id_zero_based = true, /* its not implemented anyway */ - .lower_bitops = true, - .lower_rotate = true, - .lower_vector_cmp = true, - .lower_fdph = true, - .has_fsub = true, - .has_isub = true, + .lower_fpow = true, + .lower_flrp32 = true, + .lower_fmod = true, + .lower_fdiv = true, + .lower_fceil = true, + .fuse_ffma16 = true, + .fuse_ffma32 = true, + .fuse_ffma64 = true, + /* .fdot_replicates = true, it is replicated, but it makes things worse */ + .lower_all_io_to_temps = true, + .vertex_id_zero_based = true, /* its not implemented anyway */ + .lower_bitops = true, + .lower_rotate = true, + .lower_vector_cmp = true, + .lower_fdph = true, + .has_fsub = true, + .has_isub = true, }; const nir_shader_compiler_options * ir2_get_compiler_options(void) { - return &options; + return &options; } -#define OPT(nir, pass, ...) ({ \ - bool this_progress = false; \ - NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ - this_progress; \ -}) +#define OPT(nir, pass, ...) \ + ({ \ + bool this_progress = false; \ + NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ + this_progress; \ + }) #define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) static void ir2_optimize_loop(nir_shader *s) { - bool progress; - do { - progress = false; - - OPT_V(s, nir_lower_vars_to_ssa); - progress |= OPT(s, nir_opt_copy_prop_vars); - progress |= OPT(s, nir_copy_prop); - progress |= OPT(s, nir_opt_dce); - progress |= OPT(s, nir_opt_cse); - /* progress |= OPT(s, nir_opt_gcm, true); */ - progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); - progress |= OPT(s, nir_opt_intrinsics); - progress |= OPT(s, nir_opt_algebraic); - progress |= OPT(s, nir_opt_constant_folding); - progress |= OPT(s, nir_opt_dead_cf); - if (OPT(s, nir_opt_trivial_continues)) { - progress |= true; - /* If nir_opt_trivial_continues makes progress, then we need to clean - * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll - * to make progress. 
- */ - OPT(s, nir_copy_prop); - OPT(s, nir_opt_dce); - } - progress |= OPT(s, nir_opt_loop_unroll, nir_var_all); - progress |= OPT(s, nir_opt_if, false); - progress |= OPT(s, nir_opt_remove_phis); - progress |= OPT(s, nir_opt_undef); - - } - while (progress); + bool progress; + do { + progress = false; + + OPT_V(s, nir_lower_vars_to_ssa); + progress |= OPT(s, nir_opt_copy_prop_vars); + progress |= OPT(s, nir_copy_prop); + progress |= OPT(s, nir_opt_dce); + progress |= OPT(s, nir_opt_cse); + /* progress |= OPT(s, nir_opt_gcm, true); */ + progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); + progress |= OPT(s, nir_opt_intrinsics); + progress |= OPT(s, nir_opt_algebraic); + progress |= OPT(s, nir_opt_constant_folding); + progress |= OPT(s, nir_opt_dead_cf); + if (OPT(s, nir_opt_trivial_continues)) { + progress |= true; + /* If nir_opt_trivial_continues makes progress, then we need to clean + * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll + * to make progress. + */ + OPT(s, nir_copy_prop); + OPT(s, nir_opt_dce); + } + progress |= OPT(s, nir_opt_loop_unroll, nir_var_all); + progress |= OPT(s, nir_opt_if, false); + progress |= OPT(s, nir_opt_remove_phis); + progress |= OPT(s, nir_opt_undef); + + } while (progress); } /* trig workarounds is the same as ir3.. but we don't want to include ir3 */ -bool ir3_nir_apply_trig_workarounds(nir_shader * shader); +bool ir3_nir_apply_trig_workarounds(nir_shader *shader); int ir2_optimize_nir(nir_shader *s, bool lower) { - struct nir_lower_tex_options tex_options = { - .lower_txp = ~0u, - .lower_rect = 0, - }; - - if (FD_DBG(DISASM)) { - debug_printf("----------------------\n"); - nir_print_shader(s, stdout); - debug_printf("----------------------\n"); - } - - OPT_V(s, nir_lower_regs_to_ssa); - OPT_V(s, nir_lower_vars_to_ssa); - OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out, UINT32_MAX); - - if (lower) { - OPT_V(s, ir3_nir_apply_trig_workarounds); - OPT_V(s, nir_lower_tex, &tex_options); - } - - ir2_optimize_loop(s); - - OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); - OPT_V(s, nir_opt_sink, nir_move_const_undef); - - /* TODO we dont want to get shaders writing to depth for depth textures */ - if (s->info.stage == MESA_SHADER_FRAGMENT) { - nir_foreach_shader_out_variable(var, s) { - if (var->data.location == FRAG_RESULT_DEPTH) - return -1; - } - } - - return 0; + struct nir_lower_tex_options tex_options = { + .lower_txp = ~0u, + .lower_rect = 0, + }; + + if (FD_DBG(DISASM)) { + debug_printf("----------------------\n"); + nir_print_shader(s, stdout); + debug_printf("----------------------\n"); + } + + OPT_V(s, nir_lower_regs_to_ssa); + OPT_V(s, nir_lower_vars_to_ssa); + OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out, + UINT32_MAX); + + if (lower) { + OPT_V(s, ir3_nir_apply_trig_workarounds); + OPT_V(s, nir_lower_tex, &tex_options); + } + + ir2_optimize_loop(s); + + OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); + OPT_V(s, nir_opt_sink, nir_move_const_undef); + + /* TODO we dont want to get shaders writing to depth for depth textures */ + if (s->info.stage == MESA_SHADER_FRAGMENT) { + nir_foreach_shader_out_variable(var, s) + { + if (var->data.location == FRAG_RESULT_DEPTH) + return -1; + } + } + + return 0; } static struct ir2_src load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp) { - struct fd2_shader_stateobj *so = ctx->so; - unsigned imm_ncomp, swiz, idx, i, j; - uint32_t *value = (uint32_t*) value_f; - - /* try 
to merge with existing immediate (TODO: try with neg) */ - for (idx = 0; idx < so->num_immediates; idx++) { - swiz = 0; - imm_ncomp = so->immediates[idx].ncomp; - for (i = 0; i < ncomp; i++) { - for (j = 0; j < imm_ncomp; j++) { - if (value[i] == so->immediates[idx].val[j]) - break; - } - if (j == imm_ncomp) { - if (j == 4) - break; - so->immediates[idx].val[imm_ncomp++] = value[i]; - } - swiz |= swiz_set(j, i); - } - /* matched all components */ - if (i == ncomp) - break; - } - - /* need to allocate new immediate */ - if (idx == so->num_immediates) { - swiz = 0; - imm_ncomp = 0; - for (i = 0; i < ncomp; i++) { - for (j = 0; j < imm_ncomp; j++) { - if (value[i] == ctx->so->immediates[idx].val[j]) - break; - } - if (j == imm_ncomp) { - so->immediates[idx].val[imm_ncomp++] = value[i]; - } - swiz |= swiz_set(j, i); - } - so->num_immediates++; - } - so->immediates[idx].ncomp = imm_ncomp; - - if (ncomp == 1) - swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX); - - return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST); + struct fd2_shader_stateobj *so = ctx->so; + unsigned imm_ncomp, swiz, idx, i, j; + uint32_t *value = (uint32_t *)value_f; + + /* try to merge with existing immediate (TODO: try with neg) */ + for (idx = 0; idx < so->num_immediates; idx++) { + swiz = 0; + imm_ncomp = so->immediates[idx].ncomp; + for (i = 0; i < ncomp; i++) { + for (j = 0; j < imm_ncomp; j++) { + if (value[i] == so->immediates[idx].val[j]) + break; + } + if (j == imm_ncomp) { + if (j == 4) + break; + so->immediates[idx].val[imm_ncomp++] = value[i]; + } + swiz |= swiz_set(j, i); + } + /* matched all components */ + if (i == ncomp) + break; + } + + /* need to allocate new immediate */ + if (idx == so->num_immediates) { + swiz = 0; + imm_ncomp = 0; + for (i = 0; i < ncomp; i++) { + for (j = 0; j < imm_ncomp; j++) { + if (value[i] == ctx->so->immediates[idx].val[j]) + break; + } + if (j == imm_ncomp) { + so->immediates[idx].val[imm_ncomp++] = value[i]; + } + swiz |= swiz_set(j, i); + } + so->num_immediates++; + } + so->immediates[idx].ncomp = imm_ncomp; + + if (ncomp == 1) + swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX); + + return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST); } struct ir2_src ir2_zero(struct ir2_context *ctx) { - return load_const(ctx, (float[]) {0.0f}, 1); + return load_const(ctx, (float[]){0.0f}, 1); } static void update_range(struct ir2_context *ctx, struct ir2_reg *reg) { - if (!reg->initialized) { - reg->initialized = true; - reg->loop_depth = ctx->loop_depth; - } - - if (ctx->loop_depth > reg->loop_depth) { - reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1]; - } else { - reg->loop_depth = ctx->loop_depth; - reg->block_idx_free = -1; - } - - /* for regs we want to free at the end of the loop in any case - * XXX dont do this for ssa - */ - if (reg->loop_depth) - reg->block_idx_free = ctx->loop_last_block[reg->loop_depth]; + if (!reg->initialized) { + reg->initialized = true; + reg->loop_depth = ctx->loop_depth; + } + + if (ctx->loop_depth > reg->loop_depth) { + reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1]; + } else { + reg->loop_depth = ctx->loop_depth; + reg->block_idx_free = -1; + } + + /* for regs we want to free at the end of the loop in any case + * XXX dont do this for ssa + */ + if (reg->loop_depth) + reg->block_idx_free = ctx->loop_last_block[reg->loop_depth]; } static struct ir2_src make_src(struct ir2_context *ctx, nir_src src) { - struct ir2_src res = {}; - struct ir2_reg *reg; - - nir_const_value *const_value = nir_src_as_const_value(src); - - 
if (const_value) { - assert(src.is_ssa); - float c[src.ssa->num_components]; - nir_const_value_to_array(c, const_value, src.ssa->num_components, f32); - return load_const(ctx, c, src.ssa->num_components); - } - - if (!src.is_ssa) { - res.num = src.reg.reg->index; - res.type = IR2_SRC_REG; - reg = &ctx->reg[res.num]; - } else { - assert(ctx->ssa_map[src.ssa->index] >= 0); - res.num = ctx->ssa_map[src.ssa->index]; - res.type = IR2_SRC_SSA; - reg = &ctx->instr[res.num].ssa; - } - - update_range(ctx, reg); - return res; + struct ir2_src res = {}; + struct ir2_reg *reg; + + nir_const_value *const_value = nir_src_as_const_value(src); + + if (const_value) { + assert(src.is_ssa); + float c[src.ssa->num_components]; + nir_const_value_to_array(c, const_value, src.ssa->num_components, f32); + return load_const(ctx, c, src.ssa->num_components); + } + + if (!src.is_ssa) { + res.num = src.reg.reg->index; + res.type = IR2_SRC_REG; + reg = &ctx->reg[res.num]; + } else { + assert(ctx->ssa_map[src.ssa->index] >= 0); + res.num = ctx->ssa_map[src.ssa->index]; + res.type = IR2_SRC_SSA; + reg = &ctx->instr[res.num].ssa; + } + + update_range(ctx, reg); + return res; } static void -set_index(struct ir2_context *ctx, nir_dest * dst, - struct ir2_instr *instr) +set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr) { - struct ir2_reg *reg = &instr->ssa; - - if (dst->is_ssa) { - ctx->ssa_map[dst->ssa.index] = instr->idx; - } else { - assert(instr->is_ssa); - reg = &ctx->reg[dst->reg.reg->index]; - - instr->is_ssa = false; - instr->reg = reg; - } - update_range(ctx, reg); + struct ir2_reg *reg = &instr->ssa; + + if (dst->is_ssa) { + ctx->ssa_map[dst->ssa.index] = instr->idx; + } else { + assert(instr->is_ssa); + reg = &ctx->reg[dst->reg.reg->index]; + + instr->is_ssa = false; + instr->reg = reg; + } + update_range(ctx, reg); } static struct ir2_instr * ir2_instr_create(struct ir2_context *ctx, int type) { - struct ir2_instr *instr; - - instr = &ctx->instr[ctx->instr_count++]; - instr->idx = ctx->instr_count - 1; - instr->type = type; - instr->block_idx = ctx->block_idx; - instr->pred = ctx->pred; - instr->is_ssa = true; - return instr; + struct ir2_instr *instr; + + instr = &ctx->instr[ctx->instr_count++]; + instr->idx = ctx->instr_count - 1; + instr->type = type; + instr->block_idx = ctx->block_idx; + instr->pred = ctx->pred; + instr->is_ssa = true; + return instr; } static struct ir2_instr * instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp) { - /* emit_alu will fixup instrs that don't map directly */ - static const struct ir2_opc { - int8_t scalar, vector; - } nir_ir2_opc[nir_num_opcodes+1] = { - [0 ... 
nir_num_opcodes - 1] = {-1, -1}, - - [nir_op_mov] = {MAXs, MAXv}, - [nir_op_fneg] = {MAXs, MAXv}, - [nir_op_fabs] = {MAXs, MAXv}, - [nir_op_fsat] = {MAXs, MAXv}, - [nir_op_fsign] = {-1, CNDGTEv}, - [nir_op_fadd] = {ADDs, ADDv}, - [nir_op_fsub] = {ADDs, ADDv}, - [nir_op_fmul] = {MULs, MULv}, - [nir_op_ffma] = {-1, MULADDv}, - [nir_op_fmax] = {MAXs, MAXv}, - [nir_op_fmin] = {MINs, MINv}, - [nir_op_ffloor] = {FLOORs, FLOORv}, - [nir_op_ffract] = {FRACs, FRACv}, - [nir_op_ftrunc] = {TRUNCs, TRUNCv}, - [nir_op_fdot2] = {-1, DOT2ADDv}, - [nir_op_fdot3] = {-1, DOT3v}, - [nir_op_fdot4] = {-1, DOT4v}, - [nir_op_sge] = {-1, SETGTEv}, - [nir_op_slt] = {-1, SETGTv}, - [nir_op_sne] = {-1, SETNEv}, - [nir_op_seq] = {-1, SETEv}, - [nir_op_fcsel] = {-1, CNDEv}, - [nir_op_frsq] = {RECIPSQ_IEEE, -1}, - [nir_op_frcp] = {RECIP_IEEE, -1}, - [nir_op_flog2] = {LOG_IEEE, -1}, - [nir_op_fexp2] = {EXP_IEEE, -1}, - [nir_op_fsqrt] = {SQRT_IEEE, -1}, - [nir_op_fcos] = {COS, -1}, - [nir_op_fsin] = {SIN, -1}, - /* no fsat, fneg, fabs since source mods deal with those */ - - /* so we can use this function with non-nir op */ + /* emit_alu will fixup instrs that don't map directly */ + static const struct ir2_opc { + int8_t scalar, vector; + } nir_ir2_opc[nir_num_opcodes + 1] = { + [0 ... nir_num_opcodes - 1] = {-1, -1}, + + [nir_op_mov] = {MAXs, MAXv}, + [nir_op_fneg] = {MAXs, MAXv}, + [nir_op_fabs] = {MAXs, MAXv}, + [nir_op_fsat] = {MAXs, MAXv}, + [nir_op_fsign] = {-1, CNDGTEv}, + [nir_op_fadd] = {ADDs, ADDv}, + [nir_op_fsub] = {ADDs, ADDv}, + [nir_op_fmul] = {MULs, MULv}, + [nir_op_ffma] = {-1, MULADDv}, + [nir_op_fmax] = {MAXs, MAXv}, + [nir_op_fmin] = {MINs, MINv}, + [nir_op_ffloor] = {FLOORs, FLOORv}, + [nir_op_ffract] = {FRACs, FRACv}, + [nir_op_ftrunc] = {TRUNCs, TRUNCv}, + [nir_op_fdot2] = {-1, DOT2ADDv}, + [nir_op_fdot3] = {-1, DOT3v}, + [nir_op_fdot4] = {-1, DOT4v}, + [nir_op_sge] = {-1, SETGTEv}, + [nir_op_slt] = {-1, SETGTv}, + [nir_op_sne] = {-1, SETNEv}, + [nir_op_seq] = {-1, SETEv}, + [nir_op_fcsel] = {-1, CNDEv}, + [nir_op_frsq] = {RECIPSQ_IEEE, -1}, + [nir_op_frcp] = {RECIP_IEEE, -1}, + [nir_op_flog2] = {LOG_IEEE, -1}, + [nir_op_fexp2] = {EXP_IEEE, -1}, + [nir_op_fsqrt] = {SQRT_IEEE, -1}, + [nir_op_fcos] = {COS, -1}, + [nir_op_fsin] = {SIN, -1}, + /* no fsat, fneg, fabs since source mods deal with those */ + + /* so we can use this function with non-nir op */ #define ir2_op_cube nir_num_opcodes - [ir2_op_cube] = {-1, CUBEv}, - }; - - struct ir2_opc op = nir_ir2_opc[opcode]; - assert(op.vector >= 0 || op.scalar >= 0); - - struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU); - instr->alu.vector_opc = op.vector; - instr->alu.scalar_opc = op.scalar; - instr->alu.export = -1; - instr->alu.write_mask = (1 << ncomp) - 1; - instr->src_count = opcode == ir2_op_cube ? 2 : - nir_op_infos[opcode].num_inputs; - instr->ssa.ncomp = ncomp; - return instr; + [ir2_op_cube] = {-1, CUBEv}, + }; + + struct ir2_opc op = nir_ir2_opc[opcode]; + assert(op.vector >= 0 || op.scalar >= 0); + + struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU); + instr->alu.vector_opc = op.vector; + instr->alu.scalar_opc = op.scalar; + instr->alu.export = -1; + instr->alu.write_mask = (1 << ncomp) - 1; + instr->src_count = + opcode == ir2_op_cube ? 
2 : nir_op_infos[opcode].num_inputs; + instr->ssa.ncomp = ncomp; + return instr; } static struct ir2_instr * -instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, - uint8_t write_mask, struct ir2_instr *share_reg) +instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask, + struct ir2_instr *share_reg) { - struct ir2_instr *instr; - struct ir2_reg *reg; + struct ir2_instr *instr; + struct ir2_reg *reg; - reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++]; - reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1); + reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++]; + reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1); - instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask)); - instr->alu.write_mask = write_mask; - instr->reg = reg; - instr->is_ssa = false; - return instr; + instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask)); + instr->alu.write_mask = write_mask; + instr->reg = reg; + instr->is_ssa = false; + return instr; } - static struct ir2_instr * instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst) { - struct ir2_instr *instr; - instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst)); - set_index(ctx, dst, instr); - return instr; + struct ir2_instr *instr; + instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst)); + set_index(ctx, dst, instr); + return instr; } static struct ir2_instr * ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst, - instr_fetch_opc_t opc) + instr_fetch_opc_t opc) { - struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH); - instr->fetch.opc = opc; - instr->src_count = 1; - instr->ssa.ncomp = nir_dest_num_components(*dst); - set_index(ctx, dst, instr); - return instr; + struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH); + instr->fetch.opc = opc; + instr->src_count = 1; + instr->ssa.ncomp = nir_dest_num_components(*dst); + set_index(ctx, dst, instr); + return instr; } static struct ir2_src make_src_noconst(struct ir2_context *ctx, nir_src src) { - struct ir2_instr *instr; + struct ir2_instr *instr; - if (nir_src_as_const_value(src)) { - assert(src.is_ssa); - instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components); - instr->src[0] = make_src(ctx, src); - return ir2_src(instr->idx, 0, IR2_SRC_SSA); - } + if (nir_src_as_const_value(src)) { + assert(src.is_ssa); + instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components); + instr->src[0] = make_src(ctx, src); + return ir2_src(instr->idx, 0, IR2_SRC_SSA); + } - return make_src(ctx, src); + return make_src(ctx, src); } static void -emit_alu(struct ir2_context *ctx, nir_alu_instr * alu) +emit_alu(struct ir2_context *ctx, nir_alu_instr *alu) { - const nir_op_info *info = &nir_op_infos[alu->op]; - nir_dest *dst = &alu->dest.dest; - struct ir2_instr *instr; - struct ir2_src tmp; - unsigned ncomp; - - /* get the number of dst components */ - if (dst->is_ssa) { - ncomp = dst->ssa.num_components; - } else { - ncomp = 0; - for (int i = 0; i < 4; i++) - ncomp += !!(alu->dest.write_mask & 1 << i); - } - - instr = instr_create_alu(ctx, alu->op, ncomp); - set_index(ctx, dst, instr); - instr->alu.saturate = alu->dest.saturate; - instr->alu.write_mask = alu->dest.write_mask; - - for (int i = 0; i < info->num_inputs; i++) { - nir_alu_src *src = &alu->src[i]; - - /* compress swizzle with writemask when applicable */ - unsigned swiz = 0, j = 0; - for (int i = 0; i < 4; i++) { - if (!(alu->dest.write_mask & 1 << i) && !info->output_size) - continue; - 
swiz |= swiz_set(src->swizzle[i], j++); - } - - instr->src[i] = make_src(ctx, src->src); - instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz); - instr->src[i].negate = src->negate; - instr->src[i].abs = src->abs; - } - - /* workarounds for NIR ops that don't map directly to a2xx ops */ - switch (alu->op) { - case nir_op_fneg: - instr->src[0].negate = 1; - break; - case nir_op_fabs: - instr->src[0].abs = 1; - break; - case nir_op_fsat: - instr->alu.saturate = 1; - break; - case nir_op_slt: - tmp = instr->src[0]; - instr->src[0] = instr->src[1]; - instr->src[1] = tmp; - break; - case nir_op_fcsel: - tmp = instr->src[1]; - instr->src[1] = instr->src[2]; - instr->src[2] = tmp; - break; - case nir_op_fsub: - instr->src[1].negate = !instr->src[1].negate; - break; - case nir_op_fdot2: - instr->src_count = 3; - instr->src[2] = ir2_zero(ctx); - break; - case nir_op_fsign: { - /* we need an extra instruction to deal with the zero case */ - struct ir2_instr *tmp; - - /* tmp = x == 0 ? 0 : 1 */ - tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp); - tmp->src[0] = instr->src[0]; - tmp->src[1] = ir2_zero(ctx); - tmp->src[2] = load_const(ctx, (float[]) {1.0f}, 1); - - /* result = x >= 0 ? tmp : -tmp */ - instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); - instr->src[2] = instr->src[1]; - instr->src[2].negate = true; - instr->src_count = 3; - } break; - default: - break; - } + const nir_op_info *info = &nir_op_infos[alu->op]; + nir_dest *dst = &alu->dest.dest; + struct ir2_instr *instr; + struct ir2_src tmp; + unsigned ncomp; + + /* get the number of dst components */ + if (dst->is_ssa) { + ncomp = dst->ssa.num_components; + } else { + ncomp = 0; + for (int i = 0; i < 4; i++) + ncomp += !!(alu->dest.write_mask & 1 << i); + } + + instr = instr_create_alu(ctx, alu->op, ncomp); + set_index(ctx, dst, instr); + instr->alu.saturate = alu->dest.saturate; + instr->alu.write_mask = alu->dest.write_mask; + + for (int i = 0; i < info->num_inputs; i++) { + nir_alu_src *src = &alu->src[i]; + + /* compress swizzle with writemask when applicable */ + unsigned swiz = 0, j = 0; + for (int i = 0; i < 4; i++) { + if (!(alu->dest.write_mask & 1 << i) && !info->output_size) + continue; + swiz |= swiz_set(src->swizzle[i], j++); + } + + instr->src[i] = make_src(ctx, src->src); + instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz); + instr->src[i].negate = src->negate; + instr->src[i].abs = src->abs; + } + + /* workarounds for NIR ops that don't map directly to a2xx ops */ + switch (alu->op) { + case nir_op_fneg: + instr->src[0].negate = 1; + break; + case nir_op_fabs: + instr->src[0].abs = 1; + break; + case nir_op_fsat: + instr->alu.saturate = 1; + break; + case nir_op_slt: + tmp = instr->src[0]; + instr->src[0] = instr->src[1]; + instr->src[1] = tmp; + break; + case nir_op_fcsel: + tmp = instr->src[1]; + instr->src[1] = instr->src[2]; + instr->src[2] = tmp; + break; + case nir_op_fsub: + instr->src[1].negate = !instr->src[1].negate; + break; + case nir_op_fdot2: + instr->src_count = 3; + instr->src[2] = ir2_zero(ctx); + break; + case nir_op_fsign: { + /* we need an extra instruction to deal with the zero case */ + struct ir2_instr *tmp; + + /* tmp = x == 0 ? 0 : 1 */ + tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp); + tmp->src[0] = instr->src[0]; + tmp->src[1] = ir2_zero(ctx); + tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1); + + /* result = x >= 0 ? 
tmp : -tmp */ + instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); + instr->src[2] = instr->src[1]; + instr->src[2].negate = true; + instr->src_count = 3; + } break; + default: + break; + } } static void load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx) { - struct ir2_instr *instr; - int slot = -1; - - if (ctx->so->type == MESA_SHADER_VERTEX) { - instr = ir2_instr_create_fetch(ctx, dst, 0); - instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT); - instr->fetch.vtx.const_idx = 20 + (idx / 3); - instr->fetch.vtx.const_idx_sel = idx % 3; - return; - } - - /* get slot from idx */ - nir_foreach_shader_in_variable(var, ctx->nir) { - if (var->data.driver_location == idx) { - slot = var->data.location; - break; - } - } - assert(slot >= 0); - - switch (slot) { - case VARYING_SLOT_POS: - /* need to extract xy with abs and add tile offset on a20x - * zw from fragcoord input (w inverted in fragment shader) - * TODO: only components that are required by fragment shader - */ - instr = instr_create_alu_reg(ctx, - ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL); - instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); - instr->src[0].abs = true; - /* on a20x, C64 contains the tile offset */ - instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST); - - instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr); - instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT); - - instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr); - instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT); - - unsigned reg_idx = instr->reg - ctx->reg; /* XXX */ - instr = instr_create_alu_dest(ctx, nir_op_mov, dst); - instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); - break; - default: - instr = instr_create_alu_dest(ctx, nir_op_mov, dst); - instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT); - break; - } + struct ir2_instr *instr; + int slot = -1; + + if (ctx->so->type == MESA_SHADER_VERTEX) { + instr = ir2_instr_create_fetch(ctx, dst, 0); + instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT); + instr->fetch.vtx.const_idx = 20 + (idx / 3); + instr->fetch.vtx.const_idx_sel = idx % 3; + return; + } + + /* get slot from idx */ + nir_foreach_shader_in_variable (var, ctx->nir) { + if (var->data.driver_location == idx) { + slot = var->data.location; + break; + } + } + assert(slot >= 0); + + switch (slot) { + case VARYING_SLOT_POS: + /* need to extract xy with abs and add tile offset on a20x + * zw from fragcoord input (w inverted in fragment shader) + * TODO: only components that are required by fragment shader + */ + instr = instr_create_alu_reg( + ctx, ctx->so->is_a20x ? 
nir_op_fadd : nir_op_mov, 3, NULL); + instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); + instr->src[0].abs = true; + /* on a20x, C64 contains the tile offset */ + instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST); + + instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr); + instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT); + + instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr); + instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT); + + unsigned reg_idx = instr->reg - ctx->reg; /* XXX */ + instr = instr_create_alu_dest(ctx, nir_op_mov, dst); + instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); + break; + default: + instr = instr_create_alu_dest(ctx, nir_op_mov, dst); + instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT); + break; + } } static unsigned output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr) { - int slot = -1; - unsigned idx = nir_intrinsic_base(intr); - nir_foreach_shader_out_variable(var, ctx->nir) { - if (var->data.driver_location == idx) { - slot = var->data.location; - break; - } - } - assert(slot != -1); - return slot; + int slot = -1; + unsigned idx = nir_intrinsic_base(intr); + nir_foreach_shader_out_variable(var, ctx->nir) + { + if (var->data.driver_location == idx) { + slot = var->data.location; + break; + } + } + assert(slot != -1); + return slot; } static void -store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp) +store_output(struct ir2_context *ctx, nir_src src, unsigned slot, + unsigned ncomp) { - struct ir2_instr *instr; - unsigned idx = 0; - - if (ctx->so->type == MESA_SHADER_VERTEX) { - switch (slot) { - case VARYING_SLOT_POS: - ctx->position = make_src(ctx, src); - idx = 62; - break; - case VARYING_SLOT_PSIZ: - ctx->so->writes_psize = true; - idx = 63; - break; - default: - /* find matching slot from fragment shader input */ - for (idx = 0; idx < ctx->f->inputs_count; idx++) - if (ctx->f->inputs[idx].slot == slot) - break; - if (idx == ctx->f->inputs_count) - return; - } - } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) { - /* only color output is implemented */ - return; - } - - instr = instr_create_alu(ctx, nir_op_mov, ncomp); - instr->src[0] = make_src(ctx, src); - instr->alu.export = idx; + struct ir2_instr *instr; + unsigned idx = 0; + + if (ctx->so->type == MESA_SHADER_VERTEX) { + switch (slot) { + case VARYING_SLOT_POS: + ctx->position = make_src(ctx, src); + idx = 62; + break; + case VARYING_SLOT_PSIZ: + ctx->so->writes_psize = true; + idx = 63; + break; + default: + /* find matching slot from fragment shader input */ + for (idx = 0; idx < ctx->f->inputs_count; idx++) + if (ctx->f->inputs[idx].slot == slot) + break; + if (idx == ctx->f->inputs_count) + return; + } + } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) { + /* only color output is implemented */ + return; + } + + instr = instr_create_alu(ctx, nir_op_mov, ncomp); + instr->src[0] = make_src(ctx, src); + instr->alu.export = idx; } static void emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr) { - struct ir2_instr *instr; - ASSERTED nir_const_value *const_offset; - unsigned idx; - - switch (intr->intrinsic) { - case nir_intrinsic_load_input: - load_input(ctx, &intr->dest, nir_intrinsic_base(intr)); - break; - case nir_intrinsic_store_output: - store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components); - break; - case nir_intrinsic_load_uniform: - const_offset = nir_src_as_const_value(intr->src[0]); - assert(const_offset); /* TODO can be 
false in ES2? */ - idx = nir_intrinsic_base(intr); - idx += (uint32_t)const_offset[0].f32; - instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); - instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST); - break; - case nir_intrinsic_discard: - case nir_intrinsic_discard_if: - instr = ir2_instr_create(ctx, IR2_ALU); - instr->alu.vector_opc = VECTOR_NONE; - if (intr->intrinsic == nir_intrinsic_discard_if) { - instr->alu.scalar_opc = KILLNEs; - instr->src[0] = make_src(ctx, intr->src[0]); - } else { - instr->alu.scalar_opc = KILLEs; - instr->src[0] = ir2_zero(ctx); - } - instr->alu.export = -1; - instr->src_count = 1; - ctx->so->has_kill = true; - break; - case nir_intrinsic_load_front_face: - /* gl_FrontFacing is in the sign of param.x - * rcp required because otherwise we can't differentiate -0.0 and +0.0 - */ - ctx->so->need_param = true; - - struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1); - tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); - - instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest); - instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); - instr->src[1] = ir2_zero(ctx); - break; - case nir_intrinsic_load_point_coord: - /* param.zw (note: abs might be needed like fragcoord in param.xy?) */ - ctx->so->need_param = true; - - instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); - instr->src[0] = ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT); - break; - default: - compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic); - break; - } + struct ir2_instr *instr; + ASSERTED nir_const_value *const_offset; + unsigned idx; + + switch (intr->intrinsic) { + case nir_intrinsic_load_input: + load_input(ctx, &intr->dest, nir_intrinsic_base(intr)); + break; + case nir_intrinsic_store_output: + store_output(ctx, intr->src[0], output_slot(ctx, intr), + intr->num_components); + break; + case nir_intrinsic_load_uniform: + const_offset = nir_src_as_const_value(intr->src[0]); + assert(const_offset); /* TODO can be false in ES2? */ + idx = nir_intrinsic_base(intr); + idx += (uint32_t)const_offset[0].f32; + instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); + instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST); + break; + case nir_intrinsic_discard: + case nir_intrinsic_discard_if: + instr = ir2_instr_create(ctx, IR2_ALU); + instr->alu.vector_opc = VECTOR_NONE; + if (intr->intrinsic == nir_intrinsic_discard_if) { + instr->alu.scalar_opc = KILLNEs; + instr->src[0] = make_src(ctx, intr->src[0]); + } else { + instr->alu.scalar_opc = KILLEs; + instr->src[0] = ir2_zero(ctx); + } + instr->alu.export = -1; + instr->src_count = 1; + ctx->so->has_kill = true; + break; + case nir_intrinsic_load_front_face: + /* gl_FrontFacing is in the sign of param.x + * rcp required because otherwise we can't differentiate -0.0 and +0.0 + */ + ctx->so->need_param = true; + + struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1); + tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); + + instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest); + instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); + instr->src[1] = ir2_zero(ctx); + break; + case nir_intrinsic_load_point_coord: + /* param.zw (note: abs might be needed like fragcoord in param.xy?) 
*/ + ctx->so->need_param = true; + + instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); + instr->src[0] = + ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT); + break; + default: + compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic); + break; + } } static void -emit_tex(struct ir2_context *ctx, nir_tex_instr * tex) +emit_tex(struct ir2_context *ctx, nir_tex_instr *tex) { - bool is_rect = false, is_cube = false; - struct ir2_instr *instr; - nir_src *coord, *lod_bias; - - coord = lod_bias = NULL; - - for (unsigned i = 0; i < tex->num_srcs; i++) { - switch (tex->src[i].src_type) { - case nir_tex_src_coord: - coord = &tex->src[i].src; - break; - case nir_tex_src_bias: - case nir_tex_src_lod: - assert(!lod_bias); - lod_bias = &tex->src[i].src; - break; - default: - compile_error(ctx, "Unhandled NIR tex src type: %d\n", - tex->src[i].src_type); - return; - } - } - - switch (tex->op) { - case nir_texop_tex: - case nir_texop_txb: - case nir_texop_txl: - break; - default: - compile_error(ctx, "unimplemented texop %d\n", tex->op); - return; - } - - switch (tex->sampler_dim) { - case GLSL_SAMPLER_DIM_2D: - break; - case GLSL_SAMPLER_DIM_RECT: - is_rect = true; - break; - case GLSL_SAMPLER_DIM_CUBE: - is_cube = true; - break; - default: - compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim); - return; - } - - struct ir2_src src_coord = make_src_noconst(ctx, *coord); - - /* for cube maps - * tmp = cube(coord) - * tmp.xy = tmp.xy / |tmp.z| + 1.5 - * coord = tmp.xyw - */ - if (is_cube) { - struct ir2_instr *rcp, *coord_xy; - unsigned reg_idx; - - instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL); - instr->src[0] = src_coord; - instr->src[0].swizzle = IR2_SWIZZLE_ZZXY; - instr->src[1] = src_coord; - instr->src[1].swizzle = IR2_SWIZZLE_YXZZ; - - reg_idx = instr->reg - ctx->reg; /* hacky */ - - rcp = instr_create_alu(ctx, nir_op_frcp, 1); - rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG); - rcp->src[0].abs = true; - - coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr); - coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); - coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); - coord_xy->src[2] = load_const(ctx, (float[]) {1.5f}, 1); - - src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG); - /* TODO: lod/bias transformed by src_coord.z ? */ - } - - instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH); - instr->src[0] = src_coord; - instr->src[0].swizzle = is_cube ? 
IR2_SWIZZLE_YXW : 0; - instr->fetch.tex.is_cube = is_cube; - instr->fetch.tex.is_rect = is_rect; - instr->fetch.tex.samp_id = tex->sampler_index; - - /* for lod/bias, we insert an extra src for the backend to deal with */ - if (lod_bias) { - instr->src[1] = make_src_noconst(ctx, *lod_bias); - /* backend will use 2-3 components so apply swizzle */ - swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX); - instr->src_count = 2; - } + bool is_rect = false, is_cube = false; + struct ir2_instr *instr; + nir_src *coord, *lod_bias; + + coord = lod_bias = NULL; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_coord: + coord = &tex->src[i].src; + break; + case nir_tex_src_bias: + case nir_tex_src_lod: + assert(!lod_bias); + lod_bias = &tex->src[i].src; + break; + default: + compile_error(ctx, "Unhandled NIR tex src type: %d\n", + tex->src[i].src_type); + return; + } + } + + switch (tex->op) { + case nir_texop_tex: + case nir_texop_txb: + case nir_texop_txl: + break; + default: + compile_error(ctx, "unimplemented texop %d\n", tex->op); + return; + } + + switch (tex->sampler_dim) { + case GLSL_SAMPLER_DIM_2D: + break; + case GLSL_SAMPLER_DIM_RECT: + is_rect = true; + break; + case GLSL_SAMPLER_DIM_CUBE: + is_cube = true; + break; + default: + compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim); + return; + } + + struct ir2_src src_coord = make_src_noconst(ctx, *coord); + + /* for cube maps + * tmp = cube(coord) + * tmp.xy = tmp.xy / |tmp.z| + 1.5 + * coord = tmp.xyw + */ + if (is_cube) { + struct ir2_instr *rcp, *coord_xy; + unsigned reg_idx; + + instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL); + instr->src[0] = src_coord; + instr->src[0].swizzle = IR2_SWIZZLE_ZZXY; + instr->src[1] = src_coord; + instr->src[1].swizzle = IR2_SWIZZLE_YXZZ; + + reg_idx = instr->reg - ctx->reg; /* hacky */ + + rcp = instr_create_alu(ctx, nir_op_frcp, 1); + rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG); + rcp->src[0].abs = true; + + coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr); + coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); + coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); + coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1); + + src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG); + /* TODO: lod/bias transformed by src_coord.z ? */ + } + + instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH); + instr->src[0] = src_coord; + instr->src[0].swizzle = is_cube ? 
IR2_SWIZZLE_YXW : 0; + instr->fetch.tex.is_cube = is_cube; + instr->fetch.tex.is_rect = is_rect; + instr->fetch.tex.samp_id = tex->sampler_index; + + /* for lod/bias, we insert an extra src for the backend to deal with */ + if (lod_bias) { + instr->src[1] = make_src_noconst(ctx, *lod_bias); + /* backend will use 2-3 components so apply swizzle */ + swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX); + instr->src_count = 2; + } } static void -setup_input(struct ir2_context *ctx, nir_variable * in) +setup_input(struct ir2_context *ctx, nir_variable *in) { - struct fd2_shader_stateobj *so = ctx->so; - ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1); - unsigned n = in->data.driver_location; - unsigned slot = in->data.location; + struct fd2_shader_stateobj *so = ctx->so; + ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1); + unsigned n = in->data.driver_location; + unsigned slot = in->data.location; - assert(array_len == 1); + assert(array_len == 1); - /* handle later */ - if (ctx->so->type == MESA_SHADER_VERTEX) - return; + /* handle later */ + if (ctx->so->type == MESA_SHADER_VERTEX) + return; - if (ctx->so->type != MESA_SHADER_FRAGMENT) - compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); + if (ctx->so->type != MESA_SHADER_FRAGMENT) + compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); - n = ctx->f->inputs_count++; + n = ctx->f->inputs_count++; - /* half of fragcoord from param reg, half from a varying */ - if (slot == VARYING_SLOT_POS) { - ctx->f->fragcoord = n; - so->need_param = true; - } + /* half of fragcoord from param reg, half from a varying */ + if (slot == VARYING_SLOT_POS) { + ctx->f->fragcoord = n; + so->need_param = true; + } - ctx->f->inputs[n].slot = slot; - ctx->f->inputs[n].ncomp = glsl_get_components(in->type); + ctx->f->inputs[n].slot = slot; + ctx->f->inputs[n].ncomp = glsl_get_components(in->type); - /* in->data.interpolation? - * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD - */ + /* in->data.interpolation? 
+ * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD + */ } static void -emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef) +emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef) { - /* TODO we don't want to emit anything for undefs */ + /* TODO we don't want to emit anything for undefs */ - struct ir2_instr *instr; + struct ir2_instr *instr; - instr = instr_create_alu_dest(ctx, nir_op_mov, - &(nir_dest) {.ssa = undef->def,.is_ssa = true}); - instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST); + instr = instr_create_alu_dest( + ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true}); + instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST); } static void -emit_instr(struct ir2_context *ctx, nir_instr * instr) +emit_instr(struct ir2_context *ctx, nir_instr *instr) { - switch (instr->type) { - case nir_instr_type_alu: - emit_alu(ctx, nir_instr_as_alu(instr)); - break; - case nir_instr_type_deref: - /* ignored, handled as part of the intrinsic they are src to */ - break; - case nir_instr_type_intrinsic: - emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); - break; - case nir_instr_type_load_const: - /* dealt with when using nir_src */ - break; - case nir_instr_type_tex: - emit_tex(ctx, nir_instr_as_tex(instr)); - break; - case nir_instr_type_jump: - ctx->block_has_jump[ctx->block_idx] = true; - break; - case nir_instr_type_ssa_undef: - emit_undef(ctx, nir_instr_as_ssa_undef(instr)); - break; - default: - break; - } + switch (instr->type) { + case nir_instr_type_alu: + emit_alu(ctx, nir_instr_as_alu(instr)); + break; + case nir_instr_type_deref: + /* ignored, handled as part of the intrinsic they are src to */ + break; + case nir_instr_type_intrinsic: + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); + break; + case nir_instr_type_load_const: + /* dealt with when using nir_src */ + break; + case nir_instr_type_tex: + emit_tex(ctx, nir_instr_as_tex(instr)); + break; + case nir_instr_type_jump: + ctx->block_has_jump[ctx->block_idx] = true; + break; + case nir_instr_type_ssa_undef: + emit_undef(ctx, nir_instr_as_ssa_undef(instr)); + break; + default: + break; + } } /* fragcoord.zw and a20x hw binning outputs */ static void extra_position_exports(struct ir2_context *ctx, bool binning) { - struct ir2_instr *instr, *rcp, *sc, *wincoord, *off; - - if (ctx->f->fragcoord < 0 && !binning) - return; - - instr = instr_create_alu(ctx, nir_op_fmax, 1); - instr->src[0] = ctx->position; - instr->src[0].swizzle = IR2_SWIZZLE_W; - instr->src[1] = ir2_zero(ctx); - - rcp = instr_create_alu(ctx, nir_op_frcp, 1); - rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA); - - sc = instr_create_alu(ctx, nir_op_fmul, 4); - sc->src[0] = ctx->position; - sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); - - wincoord = instr_create_alu(ctx, nir_op_ffma, 4); - wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST); - wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA); - wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST); - - /* fragcoord z/w */ - if (ctx->f->fragcoord >= 0 && !binning) { - instr = instr_create_alu(ctx, nir_op_mov, 1); - instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA); - instr->alu.export = ctx->f->fragcoord; - - instr = instr_create_alu(ctx, nir_op_mov, 1); - instr->src[0] = ctx->position; - instr->src[0].swizzle = IR2_SWIZZLE_W; - instr->alu.export = ctx->f->fragcoord; - instr->alu.write_mask = 2; - } - - if (!binning) - return; - - off = instr_create_alu(ctx, nir_op_fadd, 1); - off->src[0] = ir2_src(64, 0, IR2_SRC_CONST); - off->src[1] = 
ir2_src(2, 0, IR2_SRC_INPUT); - - /* 8 max set in freedreno_screen.. unneeded instrs patched out */ - for (int i = 0; i < 8; i++) { - instr = instr_create_alu(ctx, nir_op_ffma, 4); - instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST); - instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); - instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST); - instr->alu.export = 32; - - instr = instr_create_alu(ctx, nir_op_ffma, 4); - instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST); - instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA); - instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST); - instr->alu.export = 33; - } + struct ir2_instr *instr, *rcp, *sc, *wincoord, *off; + + if (ctx->f->fragcoord < 0 && !binning) + return; + + instr = instr_create_alu(ctx, nir_op_fmax, 1); + instr->src[0] = ctx->position; + instr->src[0].swizzle = IR2_SWIZZLE_W; + instr->src[1] = ir2_zero(ctx); + + rcp = instr_create_alu(ctx, nir_op_frcp, 1); + rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA); + + sc = instr_create_alu(ctx, nir_op_fmul, 4); + sc->src[0] = ctx->position; + sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); + + wincoord = instr_create_alu(ctx, nir_op_ffma, 4); + wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST); + wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA); + wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST); + + /* fragcoord z/w */ + if (ctx->f->fragcoord >= 0 && !binning) { + instr = instr_create_alu(ctx, nir_op_mov, 1); + instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA); + instr->alu.export = ctx->f->fragcoord; + + instr = instr_create_alu(ctx, nir_op_mov, 1); + instr->src[0] = ctx->position; + instr->src[0].swizzle = IR2_SWIZZLE_W; + instr->alu.export = ctx->f->fragcoord; + instr->alu.write_mask = 2; + } + + if (!binning) + return; + + off = instr_create_alu(ctx, nir_op_fadd, 1); + off->src[0] = ir2_src(64, 0, IR2_SRC_CONST); + off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT); + + /* 8 max set in freedreno_screen.. 
unneeded instrs patched out */ + for (int i = 0; i < 8; i++) { + instr = instr_create_alu(ctx, nir_op_ffma, 4); + instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST); + instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); + instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST); + instr->alu.export = 32; + + instr = instr_create_alu(ctx, nir_op_ffma, 4); + instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST); + instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA); + instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST); + instr->alu.export = 33; + } } static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list); static bool -emit_block(struct ir2_context *ctx, nir_block * block) +emit_block(struct ir2_context *ctx, nir_block *block) { - struct ir2_instr *instr; - nir_block *succs = block->successors[0]; + struct ir2_instr *instr; + nir_block *succs = block->successors[0]; - ctx->block_idx = block->index; + ctx->block_idx = block->index; - nir_foreach_instr(instr, block) - emit_instr(ctx, instr); + nir_foreach_instr (instr, block) + emit_instr(ctx, instr); - if (!succs || !succs->index) - return false; + if (!succs || !succs->index) + return false; - /* we want to be smart and always jump and have the backend cleanup - * but we are not, so there are two cases where jump is needed: - * loops (succs index lower) - * jumps (jump instruction seen in block) - */ - if (succs->index > block->index && !ctx->block_has_jump[block->index]) - return false; + /* we want to be smart and always jump and have the backend cleanup + * but we are not, so there are two cases where jump is needed: + * loops (succs index lower) + * jumps (jump instruction seen in block) + */ + if (succs->index > block->index && !ctx->block_has_jump[block->index]) + return false; - assert(block->successors[1] == NULL); + assert(block->successors[1] == NULL); - instr = ir2_instr_create(ctx, IR2_CF); - instr->cf.block_idx = succs->index; - /* XXX can't jump to a block with different predicate */ - return true; + instr = ir2_instr_create(ctx, IR2_CF); + instr->cf.block_idx = succs->index; + /* XXX can't jump to a block with different predicate */ + return true; } static void -emit_if(struct ir2_context *ctx, nir_if * nif) +emit_if(struct ir2_context *ctx, nir_if *nif) { - unsigned pred = ctx->pred, pred_idx = ctx->pred_idx; - struct ir2_instr *instr; - - /* XXX: blob seems to always use same register for condition */ - - instr = ir2_instr_create(ctx, IR2_ALU); - instr->src[0] = make_src(ctx, nif->condition); - instr->src_count = 1; - instr->ssa.ncomp = 1; - instr->alu.vector_opc = VECTOR_NONE; - instr->alu.scalar_opc = SCALAR_NONE; - instr->alu.export = -1; - instr->alu.write_mask = 1; - instr->pred = 0; - - /* if nested, use PRED_SETNE_PUSHv */ - if (pred) { - instr->alu.vector_opc = PRED_SETNE_PUSHv; - instr->src[1] = instr->src[0]; - instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA); - instr->src[0].swizzle = IR2_SWIZZLE_XXXX; - instr->src[1].swizzle = IR2_SWIZZLE_XXXX; - instr->src_count = 2; - } else { - instr->alu.scalar_opc = PRED_SETNEs; - } - - ctx->pred_idx = instr->idx; - ctx->pred = 3; - - emit_cf_list(ctx, &nif->then_list); - - /* TODO: if these is no else branch we don't need this - * and if the else branch is simple, can just flip ctx->pred instead - */ - instr = ir2_instr_create(ctx, IR2_ALU); - instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); - instr->src_count = 1; - instr->ssa.ncomp = 1; - instr->alu.vector_opc = VECTOR_NONE; - instr->alu.scalar_opc = PRED_SET_INVs; - 
instr->alu.export = -1; - instr->alu.write_mask = 1; - instr->pred = 0; - ctx->pred_idx = instr->idx; - - emit_cf_list(ctx, &nif->else_list); - - /* restore predicate for nested predicates */ - if (pred) { - instr = ir2_instr_create(ctx, IR2_ALU); - instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); - instr->src_count = 1; - instr->ssa.ncomp = 1; - instr->alu.vector_opc = VECTOR_NONE; - instr->alu.scalar_opc = PRED_SET_POPs; - instr->alu.export = -1; - instr->alu.write_mask = 1; - instr->pred = 0; - ctx->pred_idx = instr->idx; - } - - /* restore ctx->pred */ - ctx->pred = pred; + unsigned pred = ctx->pred, pred_idx = ctx->pred_idx; + struct ir2_instr *instr; + + /* XXX: blob seems to always use same register for condition */ + + instr = ir2_instr_create(ctx, IR2_ALU); + instr->src[0] = make_src(ctx, nif->condition); + instr->src_count = 1; + instr->ssa.ncomp = 1; + instr->alu.vector_opc = VECTOR_NONE; + instr->alu.scalar_opc = SCALAR_NONE; + instr->alu.export = -1; + instr->alu.write_mask = 1; + instr->pred = 0; + + /* if nested, use PRED_SETNE_PUSHv */ + if (pred) { + instr->alu.vector_opc = PRED_SETNE_PUSHv; + instr->src[1] = instr->src[0]; + instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA); + instr->src[0].swizzle = IR2_SWIZZLE_XXXX; + instr->src[1].swizzle = IR2_SWIZZLE_XXXX; + instr->src_count = 2; + } else { + instr->alu.scalar_opc = PRED_SETNEs; + } + + ctx->pred_idx = instr->idx; + ctx->pred = 3; + + emit_cf_list(ctx, &nif->then_list); + + /* TODO: if these is no else branch we don't need this + * and if the else branch is simple, can just flip ctx->pred instead + */ + instr = ir2_instr_create(ctx, IR2_ALU); + instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); + instr->src_count = 1; + instr->ssa.ncomp = 1; + instr->alu.vector_opc = VECTOR_NONE; + instr->alu.scalar_opc = PRED_SET_INVs; + instr->alu.export = -1; + instr->alu.write_mask = 1; + instr->pred = 0; + ctx->pred_idx = instr->idx; + + emit_cf_list(ctx, &nif->else_list); + + /* restore predicate for nested predicates */ + if (pred) { + instr = ir2_instr_create(ctx, IR2_ALU); + instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); + instr->src_count = 1; + instr->ssa.ncomp = 1; + instr->alu.vector_opc = VECTOR_NONE; + instr->alu.scalar_opc = PRED_SET_POPs; + instr->alu.export = -1; + instr->alu.write_mask = 1; + instr->pred = 0; + ctx->pred_idx = instr->idx; + } + + /* restore ctx->pred */ + ctx->pred = pred; } /* get the highest block idx in the loop, so we know when @@ -991,185 +995,187 @@ emit_if(struct ir2_context *ctx, nir_if * nif) static unsigned loop_last_block(struct exec_list *list) { - nir_cf_node *node = - exec_node_data(nir_cf_node, exec_list_get_tail(list), node); - switch (node->type) { - case nir_cf_node_block: - return nir_cf_node_as_block(node)->index; - case nir_cf_node_if: - assert(0); /* XXX could this ever happen? */ - return 0; - case nir_cf_node_loop: - return loop_last_block(&nir_cf_node_as_loop(node)->body); - default: - compile_error(ctx, "Not supported\n"); - return 0; - } + nir_cf_node *node = + exec_node_data(nir_cf_node, exec_list_get_tail(list), node); + switch (node->type) { + case nir_cf_node_block: + return nir_cf_node_as_block(node)->index; + case nir_cf_node_if: + assert(0); /* XXX could this ever happen? 
*/ + return 0; + case nir_cf_node_loop: + return loop_last_block(&nir_cf_node_as_loop(node)->body); + default: + compile_error(ctx, "Not supported\n"); + return 0; + } } static void emit_loop(struct ir2_context *ctx, nir_loop *nloop) { - ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body); - emit_cf_list(ctx, &nloop->body); - ctx->loop_depth--; + ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body); + emit_cf_list(ctx, &nloop->body); + ctx->loop_depth--; } static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list) { - bool ret = false; - foreach_list_typed(nir_cf_node, node, node, list) { - ret = false; - switch (node->type) { - case nir_cf_node_block: - ret = emit_block(ctx, nir_cf_node_as_block(node)); - break; - case nir_cf_node_if: - emit_if(ctx, nir_cf_node_as_if(node)); - break; - case nir_cf_node_loop: - emit_loop(ctx, nir_cf_node_as_loop(node)); - break; - case nir_cf_node_function: - compile_error(ctx, "Not supported\n"); - break; - } - } - return ret; + bool ret = false; + foreach_list_typed (nir_cf_node, node, node, list) { + ret = false; + switch (node->type) { + case nir_cf_node_block: + ret = emit_block(ctx, nir_cf_node_as_block(node)); + break; + case nir_cf_node_if: + emit_if(ctx, nir_cf_node_as_if(node)); + break; + case nir_cf_node_loop: + emit_loop(ctx, nir_cf_node_as_loop(node)); + break; + case nir_cf_node_function: + compile_error(ctx, "Not supported\n"); + break; + } + } + return ret; } -static void cleanup_binning(struct ir2_context *ctx) +static void +cleanup_binning(struct ir2_context *ctx) { - assert(ctx->so->type == MESA_SHADER_VERTEX); + assert(ctx->so->type == MESA_SHADER_VERTEX); - /* kill non-position outputs for binning variant */ - nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; + /* kill non-position outputs for binning variant */ + nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) { + nir_foreach_instr_safe (instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_store_output) - continue; + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); + if (intr->intrinsic != nir_intrinsic_store_output) + continue; - if (output_slot(ctx, intr) != VARYING_SLOT_POS) - nir_instr_remove(instr); - } - } + if (output_slot(ctx, intr) != VARYING_SLOT_POS) + nir_instr_remove(instr); + } + } - ir2_optimize_nir(ctx->nir, false); + ir2_optimize_nir(ctx->nir, false); } static bool ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data) { - if (instr->type != nir_instr_type_alu) - return false; - - nir_alu_instr *alu = nir_instr_as_alu(instr); - switch (alu->op) { - case nir_op_frsq: - case nir_op_frcp: - case nir_op_flog2: - case nir_op_fexp2: - case nir_op_fsqrt: - case nir_op_fcos: - case nir_op_fsin: - return true; - default: - break; - } - - return false; + if (instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *alu = nir_instr_as_alu(instr); + switch (alu->op) { + case nir_op_frsq: + case nir_op_frcp: + case nir_op_flog2: + case nir_op_fexp2: + case nir_op_fsqrt: + case nir_op_fcos: + case nir_op_fsin: + return true; + default: + break; + } + + return false; } void ir2_nir_compile(struct ir2_context *ctx, bool binning) { - struct fd2_shader_stateobj *so = ctx->so; + struct fd2_shader_stateobj *so = ctx->so; - memset(ctx->ssa_map, 0xff, 
sizeof(ctx->ssa_map)); + memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map)); - ctx->nir = nir_shader_clone(NULL, so->nir); + ctx->nir = nir_shader_clone(NULL, so->nir); - if (binning) - cleanup_binning(ctx); + if (binning) + cleanup_binning(ctx); - OPT_V(ctx->nir, nir_copy_prop); - OPT_V(ctx->nir, nir_opt_dce); - OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons); + OPT_V(ctx->nir, nir_copy_prop); + OPT_V(ctx->nir, nir_opt_dce); + OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons); - OPT_V(ctx->nir, nir_lower_int_to_float); - OPT_V(ctx->nir, nir_lower_bool_to_float); - while(OPT(ctx->nir, nir_opt_algebraic)); - OPT_V(ctx->nir, nir_opt_algebraic_late); - OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods); + OPT_V(ctx->nir, nir_lower_int_to_float); + OPT_V(ctx->nir, nir_lower_bool_to_float); + while (OPT(ctx->nir, nir_opt_algebraic)) + ; + OPT_V(ctx->nir, nir_opt_algebraic_late); + OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods); - OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL); + OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL); - OPT_V(ctx->nir, nir_lower_locals_to_regs); + OPT_V(ctx->nir, nir_lower_locals_to_regs); - OPT_V(ctx->nir, nir_convert_from_ssa, true); + OPT_V(ctx->nir, nir_convert_from_ssa, true); - OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest); - OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL); + OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest); + OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL); - OPT_V(ctx->nir, nir_opt_dce); + OPT_V(ctx->nir, nir_opt_dce); - nir_sweep(ctx->nir); + nir_sweep(ctx->nir); - if (FD_DBG(DISASM)) { - debug_printf("----------------------\n"); - nir_print_shader(ctx->nir, stdout); - debug_printf("----------------------\n"); - } + if (FD_DBG(DISASM)) { + debug_printf("----------------------\n"); + nir_print_shader(ctx->nir, stdout); + debug_printf("----------------------\n"); + } - /* fd2_shader_stateobj init */ - if (so->type == MESA_SHADER_FRAGMENT) { - ctx->f->fragcoord = -1; - ctx->f->inputs_count = 0; - memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs)); - } + /* fd2_shader_stateobj init */ + if (so->type == MESA_SHADER_FRAGMENT) { + ctx->f->fragcoord = -1; + ctx->f->inputs_count = 0; + memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs)); + } - /* Setup inputs: */ - nir_foreach_shader_in_variable(in, ctx->nir) - setup_input(ctx, in); + /* Setup inputs: */ + nir_foreach_shader_in_variable (in, ctx->nir) + setup_input(ctx, in); - if (so->type == MESA_SHADER_FRAGMENT) { - unsigned idx; - for (idx = 0; idx < ctx->f->inputs_count; idx++) { - ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp; - update_range(ctx, &ctx->input[idx]); - } - /* assume we have param input and kill it later if not */ - ctx->input[idx].ncomp = 4; - update_range(ctx, &ctx->input[idx]); - } else { - ctx->input[0].ncomp = 1; - ctx->input[2].ncomp = 1; - update_range(ctx, &ctx->input[0]); - update_range(ctx, &ctx->input[2]); - } + if (so->type == MESA_SHADER_FRAGMENT) { + unsigned idx; + for (idx = 0; idx < ctx->f->inputs_count; idx++) { + ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp; + update_range(ctx, &ctx->input[idx]); + } + /* assume we have param input and kill it later if not */ + ctx->input[idx].ncomp = 4; + update_range(ctx, &ctx->input[idx]); + } else { + ctx->input[0].ncomp = 1; + ctx->input[2].ncomp = 1; + update_range(ctx, &ctx->input[0]); + update_range(ctx, &ctx->input[2]); + } - /* And emit the body: */ - nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir); 
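
A note on the pass loop above: OPT() and OPT_V() are not defined in this diff; they are the usual freedreno-style wrappers around NIR_PASS()/NIR_PASS_V(), which is why "while (OPT(ctx->nir, nir_opt_algebraic));" reruns the algebraic optimizer until it stops reporting progress. A sketch of what such wrappers commonly look like (assumed from the common pattern, not taken from this commit):

#include "nir.h"

/* run a pass and report whether it made progress */
#define OPT(nir, pass, ...)                                                    \
   ({                                                                          \
      bool _progress = false;                                                  \
      NIR_PASS(_progress, nir, pass, ##__VA_ARGS__);                           \
      _progress;                                                               \
   })

/* run a pass unconditionally, ignoring progress */
#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)

/* usage, mirroring ir2_nir_compile(): iterate algebraic opts to a fixed point */
static void
run_algebraic(nir_shader *s)
{
   while (OPT(s, nir_opt_algebraic))
      ;
}
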
+ /* And emit the body: */ + nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir); - nir_foreach_register(reg, &fxn->registers) { - ctx->reg[reg->index].ncomp = reg->num_components; - ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1); - } + nir_foreach_register (reg, &fxn->registers) { + ctx->reg[reg->index].ncomp = reg->num_components; + ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1); + } - nir_metadata_require(fxn, nir_metadata_block_index); - emit_cf_list(ctx, &fxn->body); - /* TODO emit_block(ctx, fxn->end_block); */ + nir_metadata_require(fxn, nir_metadata_block_index); + emit_cf_list(ctx, &fxn->body); + /* TODO emit_block(ctx, fxn->end_block); */ - if (so->type == MESA_SHADER_VERTEX) - extra_position_exports(ctx, binning); + if (so->type == MESA_SHADER_VERTEX) + extra_position_exports(ctx, binning); - ralloc_free(ctx->nir); + ralloc_free(ctx->nir); - /* kill unused param input */ - if (so->type == MESA_SHADER_FRAGMENT && !so->need_param) - ctx->input[ctx->f->inputs_count].initialized = false; + /* kill unused param input */ + if (so->type == MESA_SHADER_FRAGMENT && !so->need_param) + ctx->input[ctx->f->inputs_count].initialized = false; } diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_private.h b/src/gallium/drivers/freedreno/a2xx/ir2_private.h index 3b856bf..606a732 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_private.h +++ b/src/gallium/drivers/freedreno/a2xx/ir2_private.h @@ -24,175 +24,175 @@ * Jonathan Marek */ -#include +#include #include #include +#include #include -#include -#include "ir2.h" -#include "fd2_program.h" #include "ir2/instr-a2xx.h" +#include "fd2_program.h" +#include "ir2.h" enum ir2_src_type { - IR2_SRC_SSA, - IR2_SRC_REG, - IR2_SRC_INPUT, - IR2_SRC_CONST, + IR2_SRC_SSA, + IR2_SRC_REG, + IR2_SRC_INPUT, + IR2_SRC_CONST, }; struct ir2_src { - /* num can mean different things - * ssa: index of instruction - * reg: index in ctx->reg array - * input: index in ctx->input array - * const: constant index (C0, C1, etc) - */ - uint16_t num; - uint8_t swizzle; - enum ir2_src_type type : 2; - uint8_t abs : 1; - uint8_t negate : 1; - uint8_t : 4; + /* num can mean different things + * ssa: index of instruction + * reg: index in ctx->reg array + * input: index in ctx->input array + * const: constant index (C0, C1, etc) + */ + uint16_t num; + uint8_t swizzle; + enum ir2_src_type type : 2; + uint8_t abs : 1; + uint8_t negate : 1; + uint8_t : 4; }; struct ir2_reg_component { - uint8_t c : 3; /* assigned x/y/z/w (7=dont write, for fetch instr) */ - bool alloc : 1; /* is it currently allocated */ - uint8_t ref_count; /* for ra */ + uint8_t c : 3; /* assigned x/y/z/w (7=dont write, for fetch instr) */ + bool alloc : 1; /* is it currently allocated */ + uint8_t ref_count; /* for ra */ }; struct ir2_reg { - uint8_t idx; /* assigned hardware register */ - uint8_t ncomp; - - uint8_t loop_depth; - bool initialized; - /* block_idx to free on (-1 = free on ref_count==0) */ - int block_idx_free; - struct ir2_reg_component comp[4]; + uint8_t idx; /* assigned hardware register */ + uint8_t ncomp; + + uint8_t loop_depth; + bool initialized; + /* block_idx to free on (-1 = free on ref_count==0) */ + int block_idx_free; + struct ir2_reg_component comp[4]; }; struct ir2_instr { - unsigned idx; - - unsigned block_idx; - - enum { - IR2_NONE, - IR2_FETCH, - IR2_ALU, - IR2_CF, - } type : 2; - - /* instruction needs to be emitted (for scheduling) */ - bool need_emit : 1; - - /* predicate value - (usually) same for entire block */ - uint8_t pred : 2; - - /* src */ - 
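
The src/dst fields listed here make ir2_instr a small tagged union on the destination side: an SSA-valued instruction embeds its own ir2_reg, while a non-SSA instruction points into ctx->reg, and get_reg() further down selects between the two. An illustrative helper built on that pattern (hypothetical name, assumes ir2_private.h):

#include "ir2_private.h"

/* hypothetical convenience: hardware register index of an instruction's dst */
static inline unsigned
dst_hw_reg_idx(struct ir2_instr *instr)
{
   /* the same selection get_reg() performs: embedded SSA reg vs. shared reg */
   struct ir2_reg *reg = instr->is_ssa ? &instr->ssa : instr->reg;
   return reg->idx;
}
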
uint8_t src_count; - struct ir2_src src[4]; - - /* dst */ - bool is_ssa; - union { - struct ir2_reg ssa; - struct ir2_reg *reg; - }; - - /* type-specific */ - union { - struct { - instr_fetch_opc_t opc : 5; - union { - struct { - uint8_t const_idx; - uint8_t const_idx_sel; - } vtx; - struct { - bool is_cube : 1; - bool is_rect : 1; - uint8_t samp_id; - } tex; - }; - } fetch; - struct { - /* store possible opcs, then we can choose vector/scalar instr */ - instr_scalar_opc_t scalar_opc : 6; - instr_vector_opc_t vector_opc : 5; - /* same as nir */ - uint8_t write_mask : 4; - bool saturate : 1; - - /* export idx (-1 no export) */ - int8_t export; - - /* for scalarized 2 src instruction */ - uint8_t src1_swizzle; - } alu; - struct { - /* jmp dst block_idx */ - uint8_t block_idx; - } cf; - }; + unsigned idx; + + unsigned block_idx; + + enum { + IR2_NONE, + IR2_FETCH, + IR2_ALU, + IR2_CF, + } type : 2; + + /* instruction needs to be emitted (for scheduling) */ + bool need_emit : 1; + + /* predicate value - (usually) same for entire block */ + uint8_t pred : 2; + + /* src */ + uint8_t src_count; + struct ir2_src src[4]; + + /* dst */ + bool is_ssa; + union { + struct ir2_reg ssa; + struct ir2_reg *reg; + }; + + /* type-specific */ + union { + struct { + instr_fetch_opc_t opc : 5; + union { + struct { + uint8_t const_idx; + uint8_t const_idx_sel; + } vtx; + struct { + bool is_cube : 1; + bool is_rect : 1; + uint8_t samp_id; + } tex; + }; + } fetch; + struct { + /* store possible opcs, then we can choose vector/scalar instr */ + instr_scalar_opc_t scalar_opc : 6; + instr_vector_opc_t vector_opc : 5; + /* same as nir */ + uint8_t write_mask : 4; + bool saturate : 1; + + /* export idx (-1 no export) */ + int8_t export; + + /* for scalarized 2 src instruction */ + uint8_t src1_swizzle; + } alu; + struct { + /* jmp dst block_idx */ + uint8_t block_idx; + } cf; + }; }; struct ir2_sched_instr { - uint32_t reg_state[8]; - struct ir2_instr *instr, *instr_s; + uint32_t reg_state[8]; + struct ir2_instr *instr, *instr_s; }; struct ir2_context { - struct fd2_shader_stateobj *so; + struct fd2_shader_stateobj *so; - unsigned block_idx, pred_idx; - uint8_t pred; - bool block_has_jump[64]; + unsigned block_idx, pred_idx; + uint8_t pred; + bool block_has_jump[64]; - unsigned loop_last_block[64]; - unsigned loop_depth; + unsigned loop_last_block[64]; + unsigned loop_depth; - nir_shader *nir; + nir_shader *nir; - /* ssa index of position output */ - struct ir2_src position; + /* ssa index of position output */ + struct ir2_src position; - /* to translate SSA ids to instruction ids */ - int16_t ssa_map[1024]; + /* to translate SSA ids to instruction ids */ + int16_t ssa_map[1024]; - struct ir2_shader_info *info; - struct ir2_frag_linkage *f; + struct ir2_shader_info *info; + struct ir2_frag_linkage *f; - int prev_export; + int prev_export; - /* RA state */ - struct ir2_reg* live_regs[64]; - uint32_t reg_state[256/32]; /* 64*4 bits */ + /* RA state */ + struct ir2_reg *live_regs[64]; + uint32_t reg_state[256 / 32]; /* 64*4 bits */ - /* inputs */ - struct ir2_reg input[16 + 1]; /* 16 + param */ + /* inputs */ + struct ir2_reg input[16 + 1]; /* 16 + param */ - /* non-ssa regs */ - struct ir2_reg reg[64]; - unsigned reg_count; + /* non-ssa regs */ + struct ir2_reg reg[64]; + unsigned reg_count; - struct ir2_instr instr[0x300]; - unsigned instr_count; + struct ir2_instr instr[0x300]; + unsigned instr_count; - struct ir2_sched_instr instr_sched[0x180]; - unsigned instr_sched_count; + struct ir2_sched_instr 
instr_sched[0x180]; + unsigned instr_sched_count; }; void assemble(struct ir2_context *ctx, bool binning); void ir2_nir_compile(struct ir2_context *ctx, bool binning); -bool ir2_nir_lower_scalar(nir_shader * shader); +bool ir2_nir_lower_scalar(nir_shader *shader); void ra_count_refs(struct ir2_context *ctx); void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx, - bool export, uint8_t export_writemask); + bool export, uint8_t export_writemask); void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr); void ra_block_free(struct ir2_context *ctx, unsigned block); @@ -201,196 +201,212 @@ void cp_export(struct ir2_context *ctx); /* utils */ enum { - IR2_SWIZZLE_Y = 1 << 0, - IR2_SWIZZLE_Z = 2 << 0, - IR2_SWIZZLE_W = 3 << 0, - - IR2_SWIZZLE_ZW = 2 << 0 | 2 << 2, - - IR2_SWIZZLE_YXW = 1 << 0 | 3 << 2 | 1 << 4, - - IR2_SWIZZLE_XXXX = 0 << 0 | 3 << 2 | 2 << 4 | 1 << 6, - IR2_SWIZZLE_YYYY = 1 << 0 | 0 << 2 | 3 << 4 | 2 << 6, - IR2_SWIZZLE_ZZZZ = 2 << 0 | 1 << 2 | 0 << 4 | 3 << 6, - IR2_SWIZZLE_WWWW = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6, - IR2_SWIZZLE_WYWW = 3 << 0 | 0 << 2 | 1 << 4 | 0 << 6, - IR2_SWIZZLE_XYXY = 0 << 0 | 0 << 2 | 2 << 4 | 2 << 6, - IR2_SWIZZLE_ZZXY = 2 << 0 | 1 << 2 | 2 << 4 | 2 << 6, - IR2_SWIZZLE_YXZZ = 1 << 0 | 3 << 2 | 0 << 4 | 3 << 6, + IR2_SWIZZLE_Y = 1 << 0, + IR2_SWIZZLE_Z = 2 << 0, + IR2_SWIZZLE_W = 3 << 0, + + IR2_SWIZZLE_ZW = 2 << 0 | 2 << 2, + + IR2_SWIZZLE_YXW = 1 << 0 | 3 << 2 | 1 << 4, + + IR2_SWIZZLE_XXXX = 0 << 0 | 3 << 2 | 2 << 4 | 1 << 6, + IR2_SWIZZLE_YYYY = 1 << 0 | 0 << 2 | 3 << 4 | 2 << 6, + IR2_SWIZZLE_ZZZZ = 2 << 0 | 1 << 2 | 0 << 4 | 3 << 6, + IR2_SWIZZLE_WWWW = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6, + IR2_SWIZZLE_WYWW = 3 << 0 | 0 << 2 | 1 << 4 | 0 << 6, + IR2_SWIZZLE_XYXY = 0 << 0 | 0 << 2 | 2 << 4 | 2 << 6, + IR2_SWIZZLE_ZZXY = 2 << 0 | 1 << 2 | 2 << 4 | 2 << 6, + IR2_SWIZZLE_YXZZ = 1 << 0 | 3 << 2 | 0 << 4 | 3 << 6, }; -#define compile_error(ctx, args...) ({ \ - printf(args); \ - assert(0); \ -}) +#define compile_error(ctx, args...) \ + ({ \ + printf(args); \ + assert(0); \ + }) static inline struct ir2_src ir2_src(uint16_t num, uint8_t swizzle, enum ir2_src_type type) { - return (struct ir2_src) { - .num = num, - .swizzle = swizzle, - .type = type - }; + return (struct ir2_src){.num = num, .swizzle = swizzle, .type = type}; } /* ir2_assemble uses it .. */ struct ir2_src ir2_zero(struct ir2_context *ctx); -#define ir2_foreach_instr(it, ctx) \ - for (struct ir2_instr *it = (ctx)->instr; ({ \ - while (it != &(ctx)->instr[(ctx)->instr_count] && it->type == IR2_NONE) it++; \ - it != &(ctx)->instr[(ctx)->instr_count]; }); it++) - -#define ir2_foreach_live_reg(it, ctx) \ - for (struct ir2_reg **__ptr = (ctx)->live_regs, *it; ({ \ - while (__ptr != &(ctx)->live_regs[64] && *__ptr == NULL) __ptr++; \ - __ptr != &(ctx)->live_regs[64] ? 
(it=*__ptr) : NULL; }); it++) - -#define ir2_foreach_avail(it) \ - for (struct ir2_instr **__instrp = avail, *it; \ - it = *__instrp, __instrp != &avail[avail_count]; __instrp++) - -#define ir2_foreach_src(it, instr) \ - for (struct ir2_src *it = instr->src; \ - it != &instr->src[instr->src_count]; it++) +#define ir2_foreach_instr(it, ctx) \ + for (struct ir2_instr *it = (ctx)->instr; ({ \ + while (it != &(ctx)->instr[(ctx)->instr_count] && \ + it->type == IR2_NONE) \ + it++; \ + it != &(ctx)->instr[(ctx)->instr_count]; \ + }); \ + it++) + +#define ir2_foreach_live_reg(it, ctx) \ + for (struct ir2_reg **__ptr = (ctx)->live_regs, *it; ({ \ + while (__ptr != &(ctx)->live_regs[64] && *__ptr == NULL) \ + __ptr++; \ + __ptr != &(ctx)->live_regs[64] ? (it = *__ptr) : NULL; \ + }); \ + it++) + +#define ir2_foreach_avail(it) \ + for (struct ir2_instr **__instrp = avail, *it; \ + it = *__instrp, __instrp != &avail[avail_count]; __instrp++) + +#define ir2_foreach_src(it, instr) \ + for (struct ir2_src *it = instr->src; it != &instr->src[instr->src_count]; \ + it++) /* mask for register allocation * 64 registers with 4 components each = 256 bits */ /* typedef struct { - uint64_t data[4]; + uint64_t data[4]; } regmask_t; */ -static inline bool mask_isset(uint32_t * mask, unsigned num) +static inline bool +mask_isset(uint32_t *mask, unsigned num) { - return ! !(mask[num / 32] & 1 << num % 32); + return !!(mask[num / 32] & 1 << num % 32); } -static inline void mask_set(uint32_t * mask, unsigned num) +static inline void +mask_set(uint32_t *mask, unsigned num) { - mask[num / 32] |= 1 << num % 32; + mask[num / 32] |= 1 << num % 32; } -static inline void mask_unset(uint32_t * mask, unsigned num) +static inline void +mask_unset(uint32_t *mask, unsigned num) { - mask[num / 32] &= ~(1 << num % 32); + mask[num / 32] &= ~(1 << num % 32); } -static inline unsigned mask_reg(uint32_t * mask, unsigned num) +static inline unsigned +mask_reg(uint32_t *mask, unsigned num) { - return mask[num / 8] >> num % 8 * 4 & 0xf; + return mask[num / 8] >> num % 8 * 4 & 0xf; } -static inline bool is_export(struct ir2_instr *instr) +static inline bool +is_export(struct ir2_instr *instr) { - return instr->type == IR2_ALU && instr->alu.export >= 0; + return instr->type == IR2_ALU && instr->alu.export >= 0; } -static inline instr_alloc_type_t export_buf(unsigned num) +static inline instr_alloc_type_t +export_buf(unsigned num) { - return num < 32 ? SQ_PARAMETER_PIXEL : - num >= 62 ? SQ_POSITION : SQ_MEMORY; + return num < 32 ? SQ_PARAMETER_PIXEL : num >= 62 ? 
SQ_POSITION : SQ_MEMORY; } /* component c for channel i */ -static inline unsigned swiz_set(unsigned c, unsigned i) +static inline unsigned +swiz_set(unsigned c, unsigned i) { - return ((c - i) & 3) << i * 2; + return ((c - i) & 3) << i * 2; } /* get swizzle in channel i */ -static inline unsigned swiz_get(unsigned swiz, unsigned i) +static inline unsigned +swiz_get(unsigned swiz, unsigned i) { - return ((swiz >> i * 2) + i) & 3; + return ((swiz >> i * 2) + i) & 3; } -static inline unsigned swiz_merge(unsigned swiz0, unsigned swiz1) +static inline unsigned +swiz_merge(unsigned swiz0, unsigned swiz1) { - unsigned swiz = 0; - for (int i = 0; i < 4; i++) - swiz |= swiz_set(swiz_get(swiz0, swiz_get(swiz1, i)), i); - return swiz; + unsigned swiz = 0; + for (int i = 0; i < 4; i++) + swiz |= swiz_set(swiz_get(swiz0, swiz_get(swiz1, i)), i); + return swiz; } -static inline void swiz_merge_p(uint8_t *swiz0, unsigned swiz1) +static inline void +swiz_merge_p(uint8_t *swiz0, unsigned swiz1) { - unsigned swiz = 0; - for (int i = 0; i < 4; i++) - swiz |= swiz_set(swiz_get(*swiz0, swiz_get(swiz1, i)), i); - *swiz0 = swiz; + unsigned swiz = 0; + for (int i = 0; i < 4; i++) + swiz |= swiz_set(swiz_get(*swiz0, swiz_get(swiz1, i)), i); + *swiz0 = swiz; } -static inline struct ir2_reg * get_reg(struct ir2_instr *instr) +static inline struct ir2_reg * +get_reg(struct ir2_instr *instr) { - return instr->is_ssa ? &instr->ssa : instr->reg; + return instr->is_ssa ? &instr->ssa : instr->reg; } static inline struct ir2_reg * get_reg_src(struct ir2_context *ctx, struct ir2_src *src) { - switch (src->type) { - case IR2_SRC_INPUT: - return &ctx->input[src->num]; - case IR2_SRC_SSA: - return &ctx->instr[src->num].ssa; - case IR2_SRC_REG: - return &ctx->reg[src->num]; - default: - return NULL; - } + switch (src->type) { + case IR2_SRC_INPUT: + return &ctx->input[src->num]; + case IR2_SRC_SSA: + return &ctx->instr[src->num].ssa; + case IR2_SRC_REG: + return &ctx->reg[src->num]; + default: + return NULL; + } } /* gets a ncomp value for the dst */ -static inline unsigned dst_ncomp(struct ir2_instr *instr) +static inline unsigned +dst_ncomp(struct ir2_instr *instr) { - if (instr->is_ssa) - return instr->ssa.ncomp; + if (instr->is_ssa) + return instr->ssa.ncomp; - if (instr->type == IR2_FETCH) - return instr->reg->ncomp; + if (instr->type == IR2_FETCH) + return instr->reg->ncomp; - assert(instr->type == IR2_ALU); + assert(instr->type == IR2_ALU); - unsigned ncomp = 0; - for (int i = 0; i < instr->reg->ncomp; i++) - ncomp += !!(instr->alu.write_mask & 1 << i); - return ncomp; + unsigned ncomp = 0; + for (int i = 0; i < instr->reg->ncomp; i++) + ncomp += !!(instr->alu.write_mask & 1 << i); + return ncomp; } /* gets a ncomp value for the src registers */ -static inline unsigned src_ncomp(struct ir2_instr *instr) +static inline unsigned +src_ncomp(struct ir2_instr *instr) { - if (instr->type == IR2_FETCH) { - switch (instr->fetch.opc) { - case VTX_FETCH: - return 1; - case TEX_FETCH: - return instr->fetch.tex.is_cube ? 3 : 2; - case TEX_SET_TEX_LOD: - return 1; - default: - assert(0); - } - } - - switch (instr->alu.scalar_opc) { - case PRED_SETEs ... 
KILLONEs: - return 1; - default: - break; - } - - switch (instr->alu.vector_opc) { - case DOT2ADDv: - return 2; - case DOT3v: - return 3; - case DOT4v: - case CUBEv: - case PRED_SETE_PUSHv: - return 4; - default: - return dst_ncomp(instr); - } + if (instr->type == IR2_FETCH) { + switch (instr->fetch.opc) { + case VTX_FETCH: + return 1; + case TEX_FETCH: + return instr->fetch.tex.is_cube ? 3 : 2; + case TEX_SET_TEX_LOD: + return 1; + default: + assert(0); + } + } + + switch (instr->alu.scalar_opc) { + case PRED_SETEs ... KILLONEs: + return 1; + default: + break; + } + + switch (instr->alu.vector_opc) { + case DOT2ADDv: + return 2; + case DOT3v: + return 3; + case DOT4v: + case CUBEv: + case PRED_SETE_PUSHv: + return 4; + default: + return dst_ncomp(instr); + } } diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_ra.c b/src/gallium/drivers/freedreno/a2xx/ir2_ra.c index 066d9e3..3be4d64 100644 --- a/src/gallium/drivers/freedreno/a2xx/ir2_ra.c +++ b/src/gallium/drivers/freedreno/a2xx/ir2_ra.c @@ -27,201 +27,217 @@ #include "ir2_private.h" /* if an instruction has side effects, we should never kill it */ -static bool has_side_effects(struct ir2_instr *instr) +static bool +has_side_effects(struct ir2_instr *instr) { - if (instr->type == IR2_CF) - return true; - else if (instr->type == IR2_FETCH) - return false; - - switch (instr->alu.scalar_opc) { - case PRED_SETEs ... KILLONEs: - return true; - default: - break; - } - - switch (instr->alu.vector_opc) { - case PRED_SETE_PUSHv ... KILLNEv: - return true; - default: - break; - } - - return instr->alu.export >= 0; + if (instr->type == IR2_CF) + return true; + else if (instr->type == IR2_FETCH) + return false; + + switch (instr->alu.scalar_opc) { + case PRED_SETEs ... KILLONEs: + return true; + default: + break; + } + + switch (instr->alu.vector_opc) { + case PRED_SETE_PUSHv ... KILLNEv: + return true; + default: + break; + } + + return instr->alu.export >= 0; } /* mark an instruction as required, and all its sources recursively */ -static void set_need_emit(struct ir2_context *ctx, struct ir2_instr *instr) +static void +set_need_emit(struct ir2_context *ctx, struct ir2_instr *instr) { - struct ir2_reg *reg; - - /* don't repeat work already done */ - if (instr->need_emit) - return; - - instr->need_emit = true; - - ir2_foreach_src(src, instr) { - switch (src->type) { - case IR2_SRC_SSA: - set_need_emit(ctx, &ctx->instr[src->num]); - break; - case IR2_SRC_REG: - /* slow .. */ - reg = get_reg_src(ctx, src); - ir2_foreach_instr(instr, ctx) { - if (!instr->is_ssa && instr->reg == reg) - set_need_emit(ctx, instr); - } - break; - default: - break; - } - } + struct ir2_reg *reg; + + /* don't repeat work already done */ + if (instr->need_emit) + return; + + instr->need_emit = true; + + ir2_foreach_src(src, instr) + { + switch (src->type) { + case IR2_SRC_SSA: + set_need_emit(ctx, &ctx->instr[src->num]); + break; + case IR2_SRC_REG: + /* slow .. 
*/ + reg = get_reg_src(ctx, src); + ir2_foreach_instr(instr, ctx) + { + if (!instr->is_ssa && instr->reg == reg) + set_need_emit(ctx, instr); + } + break; + default: + break; + } + } } /* get current bit mask of allocated components for a register */ -static unsigned reg_mask(struct ir2_context *ctx, unsigned idx) +static unsigned +reg_mask(struct ir2_context *ctx, unsigned idx) { - return ctx->reg_state[idx/8] >> idx%8*4 & 0xf; + return ctx->reg_state[idx / 8] >> idx % 8 * 4 & 0xf; } -static void reg_setmask(struct ir2_context *ctx, unsigned idx, unsigned c) +static void +reg_setmask(struct ir2_context *ctx, unsigned idx, unsigned c) { - idx = idx * 4 + c; - ctx->reg_state[idx/32] |= 1 << idx%32; + idx = idx * 4 + c; + ctx->reg_state[idx / 32] |= 1 << idx % 32; } -static void reg_freemask(struct ir2_context *ctx, unsigned idx, unsigned c) +static void +reg_freemask(struct ir2_context *ctx, unsigned idx, unsigned c) { - idx = idx * 4 + c; - ctx->reg_state[idx/32] &= ~(1 << idx%32); + idx = idx * 4 + c; + ctx->reg_state[idx / 32] &= ~(1 << idx % 32); } -void ra_count_refs(struct ir2_context *ctx) +void +ra_count_refs(struct ir2_context *ctx) { - struct ir2_reg *reg; - - /* mark instructions as needed - * need to do this because "substitutions" pass makes many movs not needed - */ - ir2_foreach_instr(instr, ctx) { - if (has_side_effects(instr)) - set_need_emit(ctx, instr); - } - - /* compute ref_counts */ - ir2_foreach_instr(instr, ctx) { - /* kill non-needed so they can be skipped */ - if (!instr->need_emit) { - instr->type = IR2_NONE; - continue; - } - - ir2_foreach_src(src, instr) { - if (src->type == IR2_SRC_CONST) - continue; - - reg = get_reg_src(ctx, src); - for (int i = 0; i < src_ncomp(instr); i++) - reg->comp[swiz_get(src->swizzle, i)].ref_count++; - } - } + struct ir2_reg *reg; + + /* mark instructions as needed + * need to do this because "substitutions" pass makes many movs not needed + */ + ir2_foreach_instr(instr, ctx) + { + if (has_side_effects(instr)) + set_need_emit(ctx, instr); + } + + /* compute ref_counts */ + ir2_foreach_instr(instr, ctx) + { + /* kill non-needed so they can be skipped */ + if (!instr->need_emit) { + instr->type = IR2_NONE; + continue; + } + + ir2_foreach_src(src, instr) + { + if (src->type == IR2_SRC_CONST) + continue; + + reg = get_reg_src(ctx, src); + for (int i = 0; i < src_ncomp(instr); i++) + reg->comp[swiz_get(src->swizzle, i)].ref_count++; + } + } } -void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx, - bool export, uint8_t export_writemask) +void +ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx, bool export, + uint8_t export_writemask) { - /* for export, don't allocate anything but set component layout */ - if (export) { - for (int i = 0; i < 4; i++) - reg->comp[i].c = i; - return; - } - - unsigned idx = force_idx; - - /* TODO: allocate into the same register if theres room - * note: the blob doesn't do it, so verify that it is indeed better - * also, doing it would conflict with scalar mov insertion - */ - - /* check if already allocated */ - for (int i = 0; i < reg->ncomp; i++) { - if (reg->comp[i].alloc) - return; - } - - if (force_idx < 0) { - for (idx = 0; idx < 64; idx++) { - if (reg_mask(ctx, idx) == 0) - break; - } - } - assert(idx != 64); /* TODO ran out of register space.. 
*/ - - /* update max_reg value */ - ctx->info->max_reg = MAX2(ctx->info->max_reg, (int) idx); - - unsigned mask = reg_mask(ctx, idx); - - for (int i = 0; i < reg->ncomp; i++) { - /* don't allocate never used values */ - if (reg->comp[i].ref_count == 0) { - reg->comp[i].c = 7; - continue; - } - - /* TODO */ - unsigned c = 1 ? i : (ffs(~mask) - 1); - mask |= 1 << c; - reg->comp[i].c = c; - reg_setmask(ctx, idx, c); - reg->comp[i].alloc = true; - } - - reg->idx = idx; - ctx->live_regs[reg->idx] = reg; + /* for export, don't allocate anything but set component layout */ + if (export) { + for (int i = 0; i < 4; i++) + reg->comp[i].c = i; + return; + } + + unsigned idx = force_idx; + + /* TODO: allocate into the same register if theres room + * note: the blob doesn't do it, so verify that it is indeed better + * also, doing it would conflict with scalar mov insertion + */ + + /* check if already allocated */ + for (int i = 0; i < reg->ncomp; i++) { + if (reg->comp[i].alloc) + return; + } + + if (force_idx < 0) { + for (idx = 0; idx < 64; idx++) { + if (reg_mask(ctx, idx) == 0) + break; + } + } + assert(idx != 64); /* TODO ran out of register space.. */ + + /* update max_reg value */ + ctx->info->max_reg = MAX2(ctx->info->max_reg, (int)idx); + + unsigned mask = reg_mask(ctx, idx); + + for (int i = 0; i < reg->ncomp; i++) { + /* don't allocate never used values */ + if (reg->comp[i].ref_count == 0) { + reg->comp[i].c = 7; + continue; + } + + /* TODO */ + unsigned c = 1 ? i : (ffs(~mask) - 1); + mask |= 1 << c; + reg->comp[i].c = c; + reg_setmask(ctx, idx, c); + reg->comp[i].alloc = true; + } + + reg->idx = idx; + ctx->live_regs[reg->idx] = reg; } /* reduce srcs ref_count and free if needed */ -void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr) +void +ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr) { - struct ir2_reg *reg; - struct ir2_reg_component *comp; - - ir2_foreach_src(src, instr) { - if (src->type == IR2_SRC_CONST) - continue; - - reg = get_reg_src(ctx, src); - /* XXX use before write case */ - - for (int i = 0; i < src_ncomp(instr); i++) { - comp = ®->comp[swiz_get(src->swizzle, i)]; - if (!--comp->ref_count && reg->block_idx_free < 0) { - reg_freemask(ctx, reg->idx, comp->c); - comp->alloc = false; - } - } - } + struct ir2_reg *reg; + struct ir2_reg_component *comp; + + ir2_foreach_src(src, instr) + { + if (src->type == IR2_SRC_CONST) + continue; + + reg = get_reg_src(ctx, src); + /* XXX use before write case */ + + for (int i = 0; i < src_ncomp(instr); i++) { + comp = ®->comp[swiz_get(src->swizzle, i)]; + if (!--comp->ref_count && reg->block_idx_free < 0) { + reg_freemask(ctx, reg->idx, comp->c); + comp->alloc = false; + } + } + } } /* free any regs left for a block */ -void ra_block_free(struct ir2_context *ctx, unsigned block) +void +ra_block_free(struct ir2_context *ctx, unsigned block) { - ir2_foreach_live_reg(reg, ctx) { - if (reg->block_idx_free != block) - continue; - - for (int i = 0; i < reg->ncomp; i++) { - if (!reg->comp[i].alloc) /* XXX should never be true? */ - continue; - - reg_freemask(ctx, reg->idx, reg->comp[i].c); - reg->comp[i].alloc = false; - } - ctx->live_regs[reg->idx] = NULL; - } + ir2_foreach_live_reg(reg, ctx) + { + if (reg->block_idx_free != block) + continue; + + for (int i = 0; i < reg->ncomp; i++) { + if (!reg->comp[i].alloc) /* XXX should never be true? 
*/ + continue; + + reg_freemask(ctx, reg->idx, reg->comp[i].c); + reg->comp[i].alloc = false; + } + ctx->live_regs[reg->idx] = NULL; + } } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.c b/src/gallium/drivers/freedreno/a3xx/fd3_blend.c index 5773ed2..cc92487 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.c @@ -27,88 +27,92 @@ #include "pipe/p_state.h" #include "util/u_blend.h" #include "util/u_dual_blend.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" #include "fd3_blend.h" #include "fd3_context.h" #include "fd3_format.h" - static enum a3xx_rb_blend_opcode blend_func(unsigned func) { - switch (func) { - case PIPE_BLEND_ADD: - return BLEND_DST_PLUS_SRC; - case PIPE_BLEND_MIN: - return BLEND_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return BLEND_MAX_DST_SRC; - case PIPE_BLEND_SUBTRACT: - return BLEND_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return BLEND_DST_MINUS_SRC; - default: - DBG("invalid blend func: %x", func); - return 0; - } + switch (func) { + case PIPE_BLEND_ADD: + return BLEND_DST_PLUS_SRC; + case PIPE_BLEND_MIN: + return BLEND_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return BLEND_MAX_DST_SRC; + case PIPE_BLEND_SUBTRACT: + return BLEND_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return BLEND_DST_MINUS_SRC; + default: + DBG("invalid blend func: %x", func); + return 0; + } } void * fd3_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso) + const struct pipe_blend_state *cso) { - struct fd3_blend_stateobj *so; - enum a3xx_rop_code rop = ROP_COPY; - bool reads_dest = false; - int i; - - if (cso->logicop_enable) { - rop = cso->logicop_func; /* maps 1:1 */ - reads_dest = util_logicop_reads_dest(cso->logicop_func); - } - - so = CALLOC_STRUCT(fd3_blend_stateobj); - if (!so) - return NULL; - - so->base = *cso; - - for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { - const struct pipe_rt_blend_state *rt; - if (cso->independent_blend_enable) - rt = &cso->rt[i]; - else - rt = &cso->rt[0]; - - so->rb_mrt[i].blend_control = - A3XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) | - A3XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | - A3XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | - A3XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - - so->rb_mrt[i].control = - A3XX_RB_MRT_CONTROL_ROP_CODE(rop) | - A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); - - if (rt->blend_enable) - so->rb_mrt[i].control |= - A3XX_RB_MRT_CONTROL_READ_DEST_ENABLE | - A3XX_RB_MRT_CONTROL_BLEND | - A3XX_RB_MRT_CONTROL_BLEND2; - - if (reads_dest) - so->rb_mrt[i].control |= A3XX_RB_MRT_CONTROL_READ_DEST_ENABLE; - - if (cso->dither) - so->rb_mrt[i].control |= A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_ALWAYS); - } - - if (cso->rt[0].blend_enable && util_blend_state_is_dual(cso, 0)) - so->rb_render_control = A3XX_RB_RENDER_CONTROL_DUAL_COLOR_IN_ENABLE; - - return so; + struct fd3_blend_stateobj *so; + enum a3xx_rop_code rop = ROP_COPY; + bool reads_dest = false; + int i; + + if (cso->logicop_enable) { + rop = cso->logicop_func; /* maps 1:1 */ + reads_dest = util_logicop_reads_dest(cso->logicop_func); + } + + so = CALLOC_STRUCT(fd3_blend_stateobj); + if (!so) + return NULL; + + so->base = *cso; + + for (i 
= 0; i < ARRAY_SIZE(so->rb_mrt); i++) { + const struct pipe_rt_blend_state *rt; + if (cso->independent_blend_enable) + rt = &cso->rt[i]; + else + rt = &cso->rt[0]; + + so->rb_mrt[i].blend_control = + A3XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR( + fd_blend_factor(rt->rgb_src_factor)) | + A3XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | + A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR( + fd_blend_factor(rt->rgb_dst_factor)) | + A3XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR( + fd_blend_factor(rt->alpha_src_factor)) | + A3XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE( + blend_func(rt->alpha_func)) | + A3XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR( + fd_blend_factor(rt->alpha_dst_factor)); + + so->rb_mrt[i].control = + A3XX_RB_MRT_CONTROL_ROP_CODE(rop) | + A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); + + if (rt->blend_enable) + so->rb_mrt[i].control |= A3XX_RB_MRT_CONTROL_READ_DEST_ENABLE | + A3XX_RB_MRT_CONTROL_BLEND | + A3XX_RB_MRT_CONTROL_BLEND2; + + if (reads_dest) + so->rb_mrt[i].control |= A3XX_RB_MRT_CONTROL_READ_DEST_ENABLE; + + if (cso->dither) + so->rb_mrt[i].control |= + A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_ALWAYS); + } + + if (cso->rt[0].blend_enable && util_blend_state_is_dual(cso, 0)) + so->rb_render_control = A3XX_RB_RENDER_CONTROL_DUAL_COLOR_IN_ENABLE; + + return so; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h index 61fbeda..7ab0731 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_blend.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_blend.h @@ -27,27 +27,27 @@ #ifndef FD3_BLEND_H_ #define FD3_BLEND_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd3_blend_stateobj { - struct pipe_blend_state base; - uint32_t rb_render_control; - struct { - uint32_t blend_control; - uint32_t control; - } rb_mrt[A3XX_MAX_RENDER_TARGETS]; + struct pipe_blend_state base; + uint32_t rb_render_control; + struct { + uint32_t blend_control; + uint32_t control; + } rb_mrt[A3XX_MAX_RENDER_TARGETS]; }; static inline struct fd3_blend_stateobj * fd3_blend_stateobj(struct pipe_blend_state *blend) { - return (struct fd3_blend_stateobj *)blend; + return (struct fd3_blend_stateobj *)blend; } -void * fd3_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso); +void *fd3_blend_state_create(struct pipe_context *pctx, + const struct pipe_blend_state *cso); #endif /* FD3_BLEND_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.c b/src/gallium/drivers/freedreno/a3xx/fd3_context.c index bd4f40b..43aa86e 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_context.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.c @@ -26,8 +26,8 @@ #include "freedreno_query_hw.h" -#include "fd3_context.h" #include "fd3_blend.h" +#include "fd3_context.h" #include "fd3_draw.h" #include "fd3_emit.h" #include "fd3_gmem.h" @@ -38,25 +38,24 @@ #include "fd3_zsa.h" static void -fd3_context_destroy(struct pipe_context *pctx) - in_dt +fd3_context_destroy(struct pipe_context *pctx) in_dt { - struct fd3_context *fd3_ctx = fd3_context(fd_context(pctx)); + struct fd3_context *fd3_ctx = fd3_context(fd_context(pctx)); - u_upload_destroy(fd3_ctx->border_color_uploader); - pipe_resource_reference(&fd3_ctx->border_color_buf, NULL); + u_upload_destroy(fd3_ctx->border_color_uploader); + pipe_resource_reference(&fd3_ctx->border_color_buf, NULL); - fd_context_destroy(pctx); + fd_context_destroy(pctx); - fd_bo_del(fd3_ctx->vs_pvt_mem); - 
fd_bo_del(fd3_ctx->fs_pvt_mem); - fd_bo_del(fd3_ctx->vsc_size_mem); + fd_bo_del(fd3_ctx->vs_pvt_mem); + fd_bo_del(fd3_ctx->fs_pvt_mem); + fd_bo_del(fd3_ctx->vsc_size_mem); - fd_context_cleanup_common_vbos(&fd3_ctx->base); + fd_context_cleanup_common_vbos(&fd3_ctx->base); - fd_hw_query_fini(pctx); + fd_hw_query_fini(pctx); - free(fd3_ctx); + free(fd3_ctx); } /* clang-format off */ @@ -73,55 +72,55 @@ static const uint8_t primtypes[] = { /* clang-format on */ struct pipe_context * -fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) - in_dt +fd3_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags) in_dt { - struct fd_screen *screen = fd_screen(pscreen); - struct fd3_context *fd3_ctx = CALLOC_STRUCT(fd3_context); - struct pipe_context *pctx; + struct fd_screen *screen = fd_screen(pscreen); + struct fd3_context *fd3_ctx = CALLOC_STRUCT(fd3_context); + struct pipe_context *pctx; - if (!fd3_ctx) - return NULL; + if (!fd3_ctx) + return NULL; - pctx = &fd3_ctx->base.base; - pctx->screen = pscreen; + pctx = &fd3_ctx->base.base; + pctx->screen = pscreen; - fd3_ctx->base.dev = fd_device_ref(screen->dev); - fd3_ctx->base.screen = fd_screen(pscreen); - fd3_ctx->base.last.key = &fd3_ctx->last_key; + fd3_ctx->base.dev = fd_device_ref(screen->dev); + fd3_ctx->base.screen = fd_screen(pscreen); + fd3_ctx->base.last.key = &fd3_ctx->last_key; - pctx->destroy = fd3_context_destroy; - pctx->create_blend_state = fd3_blend_state_create; - pctx->create_rasterizer_state = fd3_rasterizer_state_create; - pctx->create_depth_stencil_alpha_state = fd3_zsa_state_create; + pctx->destroy = fd3_context_destroy; + pctx->create_blend_state = fd3_blend_state_create; + pctx->create_rasterizer_state = fd3_rasterizer_state_create; + pctx->create_depth_stencil_alpha_state = fd3_zsa_state_create; - fd3_draw_init(pctx); - fd3_gmem_init(pctx); - fd3_texture_init(pctx); - fd3_prog_init(pctx); - fd3_emit_init(pctx); + fd3_draw_init(pctx); + fd3_gmem_init(pctx); + fd3_texture_init(pctx); + fd3_prog_init(pctx); + fd3_emit_init(pctx); - pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv, flags); - if (!pctx) - return NULL; + pctx = fd_context_init(&fd3_ctx->base, pscreen, primtypes, priv, flags); + if (!pctx) + return NULL; - fd_hw_query_init(pctx); + fd_hw_query_init(pctx); - fd3_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vs_pvt"); + fd3_ctx->vs_pvt_mem = + fd_bo_new(screen->dev, 0x2000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vs_pvt"); - fd3_ctx->fs_pvt_mem = fd_bo_new(screen->dev, 0x2000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "fs_pvt"); + fd3_ctx->fs_pvt_mem = + fd_bo_new(screen->dev, 0x2000, DRM_FREEDRENO_GEM_TYPE_KMEM, "fs_pvt"); - fd3_ctx->vsc_size_mem = fd_bo_new(screen->dev, 0x1000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_size"); + fd3_ctx->vsc_size_mem = + fd_bo_new(screen->dev, 0x1000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_size"); - fd_context_setup_common_vbos(&fd3_ctx->base); + fd_context_setup_common_vbos(&fd3_ctx->base); - fd3_query_context_init(pctx); + fd3_query_context_init(pctx); - fd3_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0, - PIPE_USAGE_STREAM, 0); + fd3_ctx->border_color_uploader = + u_upload_create(pctx, 4096, 0, PIPE_USAGE_STREAM, 0); - return pctx; + return pctx; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h index c5c4f4e..7a222df 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h @@ -33,31 
+33,30 @@ #include "ir3/ir3_shader.h" - struct fd3_context { - struct fd_context base; + struct fd_context base; - struct fd_bo *vs_pvt_mem, *fs_pvt_mem; + struct fd_bo *vs_pvt_mem, *fs_pvt_mem; - /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We - * could combine it with another allocation. - */ - struct fd_bo *vsc_size_mem; + /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We + * could combine it with another allocation. + */ + struct fd_bo *vsc_size_mem; - struct u_upload_mgr *border_color_uploader; - struct pipe_resource *border_color_buf; + struct u_upload_mgr *border_color_uploader; + struct pipe_resource *border_color_buf; - /* storage for ctx->last.key: */ - struct ir3_shader_key last_key; + /* storage for ctx->last.key: */ + struct ir3_shader_key last_key; }; static inline struct fd3_context * fd3_context(struct fd_context *ctx) { - return (struct fd3_context *)ctx; + return (struct fd3_context *)ctx; } -struct pipe_context * -fd3_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); +struct pipe_context *fd3_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags); #endif /* FD3_CONTEXT_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index e8298cf..d71ffb2 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -25,142 +25,146 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" +#include "util/format/u_format.h" #include "util/u_memory.h" #include "util/u_prim.h" -#include "util/format/u_format.h" +#include "util/u_string.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd3_draw.h" #include "fd3_context.h" +#include "fd3_draw.h" #include "fd3_emit.h" -#include "fd3_program.h" #include "fd3_format.h" +#include "fd3_program.h" #include "fd3_zsa.h" static inline uint32_t add_sat(uint32_t a, int32_t b) { - int64_t ret = (uint64_t)a + (int64_t)b; - if (ret > ~0U) - return ~0U; - if (ret < 0) - return 0; - return (uint32_t)ret; + int64_t ret = (uint64_t)a + (int64_t)b; + if (ret > ~0U) + return ~0U; + if (ret < 0) + return 0; + return (uint32_t)ret; } static void draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd3_emit *emit, unsigned index_offset) - assert_dt + struct fd3_emit *emit, unsigned index_offset) assert_dt { - const struct pipe_draw_info *info = emit->info; - enum pc_di_primtype primtype = ctx->primtypes[info->mode]; - - fd3_emit_state(ctx, ring, emit); - - if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) - fd3_emit_vertex_bufs(ring, emit); - - OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); - OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ - - OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); - OUT_RING(ring, info->index_bounds_valid ? add_sat(info->min_index, info->index_size ? info->index_bias : 0) : 0); /* VFD_INDEX_MIN */ - OUT_RING(ring, info->index_bounds_valid ? add_sat(info->max_index, info->index_size ? info->index_bias : 0) : ~0); /* VFD_INDEX_MAX */ - OUT_RING(ring, info->start_instance); /* VFD_INSTANCEID_OFFSET */ - OUT_RING(ring, info->index_size ? info->index_bias : emit->draw->start); /* VFD_INDEX_OFFSET */ - - OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, info->primitive_restart ? 
/* PC_RESTART_INDEX */ - info->restart_index : 0xffffffff); - - /* points + psize -> spritelist: */ - if (ctx->rasterizer->point_size_per_vertex && - fd3_emit_get_vp(emit)->writes_psize && - (info->mode == PIPE_PRIM_POINTS)) - primtype = DI_PT_POINTLIST_PSIZE; - - fd_draw_emit(ctx->batch, ring, primtype, - emit->binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY, - info, emit->draw, index_offset); + const struct pipe_draw_info *info = emit->info; + enum pc_di_primtype primtype = ctx->primtypes[info->mode]; + + fd3_emit_state(ctx, ring, emit); + + if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) + fd3_emit_vertex_bufs(ring, emit); + + OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); + OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, info->index_bounds_valid + ? add_sat(info->min_index, + info->index_size ? info->index_bias : 0) + : 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, info->index_bounds_valid + ? add_sat(info->max_index, + info->index_size ? info->index_bias : 0) + : ~0); /* VFD_INDEX_MAX */ + OUT_RING(ring, info->start_instance); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, info->index_size ? info->index_bias + : emit->draw->start); /* VFD_INDEX_OFFSET */ + + OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ + info->restart_index + : 0xffffffff); + + /* points + psize -> spritelist: */ + if (ctx->rasterizer->point_size_per_vertex && + fd3_emit_get_vp(emit)->writes_psize && (info->mode == PIPE_PRIM_POINTS)) + primtype = DI_PT_POINTLIST_PSIZE; + + fd_draw_emit(ctx->batch, ring, primtype, + emit->binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY, info, + emit->draw, index_offset); } static bool fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count *draw, - unsigned index_offset) - in_dt + unsigned index_offset) in_dt { - struct fd3_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->vtx, - .info = info, - .indirect = indirect, - .draw = draw, - .key = { - .vs = ctx->prog.vs, - .fs = ctx->prog.fs, - }, - .rasterflat = ctx->rasterizer->flatshade, - .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, - .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, - }; - - if (info->mode != PIPE_PRIM_MAX && - !indirect && - !info->primitive_restart && - !u_trim_pipe_prim(info->mode, (unsigned*)&draw->count)) - return false; - - if (fd3_needs_manual_clipping(ir3_get_shader(ctx->prog.vs), ctx->rasterizer)) - emit.key.key.ucp_enables = ctx->rasterizer->clip_plane_enable; - - ir3_fixup_shader_state(&ctx->base, &emit.key.key); - - unsigned dirty = ctx->dirty; - - emit.prog = fd3_program_state(ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); - - /* bail if compile failed: */ - if (!emit.prog) - return false; - - const struct ir3_shader_variant *vp = fd3_emit_get_vp(&emit); - const struct ir3_shader_variant *fp = fd3_emit_get_fp(&emit); - - ir3_update_max_tf_vtx(ctx, vp); - - /* do regular pass first: */ - - if (unlikely(ctx->stats_users > 0)) { - ctx->stats.vs_regs += ir3_shader_halfregs(vp); - ctx->stats.fs_regs += ir3_shader_halfregs(fp); - } - - emit.binning_pass = false; - emit.dirty = dirty; - draw_impl(ctx, ctx->batch->draw, &emit, index_offset); - - /* and now binning pass: */ - emit.binning_pass = true; - emit.dirty = dirty & ~(FD_DIRTY_BLEND); - emit.vs = NULL; /* we changed key so need to refetch vs */ - emit.fs = NULL; - 
draw_impl(ctx, ctx->batch->binning, &emit, index_offset); - - fd_context_all_clean(ctx); - - return true; + struct fd3_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->vtx, + .info = info, + .indirect = indirect, + .draw = draw, + .key = + { + .vs = ctx->prog.vs, + .fs = ctx->prog.fs, + }, + .rasterflat = ctx->rasterizer->flatshade, + .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, + .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, + }; + + if (info->mode != PIPE_PRIM_MAX && !indirect && !info->primitive_restart && + !u_trim_pipe_prim(info->mode, (unsigned *)&draw->count)) + return false; + + if (fd3_needs_manual_clipping(ir3_get_shader(ctx->prog.vs), ctx->rasterizer)) + emit.key.key.ucp_enables = ctx->rasterizer->clip_plane_enable; + + ir3_fixup_shader_state(&ctx->base, &emit.key.key); + + unsigned dirty = ctx->dirty; + + emit.prog = fd3_program_state( + ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); + + /* bail if compile failed: */ + if (!emit.prog) + return false; + + const struct ir3_shader_variant *vp = fd3_emit_get_vp(&emit); + const struct ir3_shader_variant *fp = fd3_emit_get_fp(&emit); + + ir3_update_max_tf_vtx(ctx, vp); + + /* do regular pass first: */ + + if (unlikely(ctx->stats_users > 0)) { + ctx->stats.vs_regs += ir3_shader_halfregs(vp); + ctx->stats.fs_regs += ir3_shader_halfregs(fp); + } + + emit.binning_pass = false; + emit.dirty = dirty; + draw_impl(ctx, ctx->batch->draw, &emit, index_offset); + + /* and now binning pass: */ + emit.binning_pass = true; + emit.dirty = dirty & ~(FD_DIRTY_BLEND); + emit.vs = NULL; /* we changed key so need to refetch vs */ + emit.fs = NULL; + draw_impl(ctx, ctx->batch->binning, &emit, index_offset); + + fd_context_all_clean(ctx); + + return true; } void -fd3_draw_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd3_draw_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->draw_vbo = fd3_draw_vbo; + struct fd_context *ctx = fd_context(pctx); + ctx->draw_vbo = fd3_draw_vbo; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index ba9878c..884deb4 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -25,31 +25,31 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_helpers.h" #include "util/format/u_format.h" +#include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "util/u_viewport.h" -#include "freedreno_resource.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" -#include "fd3_emit.h" #include "fd3_blend.h" #include "fd3_context.h" +#include "fd3_emit.h" +#include "fd3_format.h" #include "fd3_program.h" #include "fd3_rasterizer.h" #include "fd3_texture.h" -#include "fd3_format.h" #include "fd3_zsa.h" #define emit_const_user fd3_emit_const_user -#define emit_const_bo fd3_emit_const_bo +#define emit_const_bo fd3_emit_const_bo #include "ir3_const.h" static const enum adreno_state_block sb[] = { - [MESA_SHADER_VERTEX] = SB_VERT_SHADER, - [MESA_SHADER_FRAGMENT] = SB_FRAG_SHADER, + [MESA_SHADER_VERTEX] = SB_VERT_SHADER, + [MESA_SHADER_FRAGMENT] = SB_FRAG_SHADER, }; /* regid: base const register @@ -58,213 +58,212 @@ static const enum adreno_state_block sb[] = { */ static void fd3_emit_const_user(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, - uint32_t regid, uint32_t sizedwords, const 
uint32_t *dwords) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t sizedwords, const uint32_t *dwords) { - emit_const_asserts(ring, v, regid, sizedwords); - - OUT_PKT3(ring, CP_LOAD_STATE, 2 + sizedwords); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb[v->type]) | - CP_LOAD_STATE_0_NUM_UNIT(sizedwords/2)); - OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); - for (int i = 0; i < sizedwords; i++) - OUT_RING(ring, dwords[i]); + emit_const_asserts(ring, v, regid, sizedwords); + + OUT_PKT3(ring, CP_LOAD_STATE, 2 + sizedwords); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid / 2) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[v->type]) | + CP_LOAD_STATE_0_NUM_UNIT(sizedwords / 2)); + OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); + for (int i = 0; i < sizedwords; i++) + OUT_RING(ring, dwords[i]); } static void -fd3_emit_const_bo(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, - uint32_t regid, uint32_t offset, uint32_t sizedwords, - struct fd_bo *bo) +fd3_emit_const_bo(struct fd_ringbuffer *ring, + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t offset, uint32_t sizedwords, struct fd_bo *bo) { - uint32_t dst_off = regid / 2; - /* The blob driver aligns all const uploads dst_off to 64. We've been - * successfully aligning to 8 vec4s as const_upload_unit so far with no - * ill effects. - */ - assert(dst_off % 16 == 0); - uint32_t num_unit = sizedwords / 2; - assert(num_unit % 2 == 0); - - emit_const_asserts(ring, v, regid, sizedwords); - - OUT_PKT3(ring, CP_LOAD_STATE, 2); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(dst_off) | - CP_LOAD_STATE_0_STATE_SRC(SS_INDIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb[v->type]) | - CP_LOAD_STATE_0_NUM_UNIT(num_unit)); - OUT_RELOC(ring, bo, offset, - CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS), 0); + uint32_t dst_off = regid / 2; + /* The blob driver aligns all const uploads dst_off to 64. We've been + * successfully aligning to 8 vec4s as const_upload_unit so far with no + * ill effects. 
+ */ + assert(dst_off % 16 == 0); + uint32_t num_unit = sizedwords / 2; + assert(num_unit % 2 == 0); + + emit_const_asserts(ring, v, regid, sizedwords); + + OUT_PKT3(ring, CP_LOAD_STATE, 2); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(dst_off) | + CP_LOAD_STATE_0_STATE_SRC(SS_INDIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[v->type]) | + CP_LOAD_STATE_0_NUM_UNIT(num_unit)); + OUT_RELOC(ring, bo, offset, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS), 0); } static void fd3_emit_const_ptrs(struct fd_ringbuffer *ring, gl_shader_stage type, - uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets) + uint32_t regid, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - uint32_t anum = align(num, 4); - uint32_t i; - - debug_assert((regid % 4) == 0); - - OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid/2) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | - CP_LOAD_STATE_0_NUM_UNIT(anum/2)); - OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); - - for (i = 0; i < num; i++) { - if (bos[i]) { - OUT_RELOC(ring, bos[i], offsets[i], 0, 0); - } else { - OUT_RING(ring, 0xbad00000 | (i << 16)); - } - } - - for (; i < anum; i++) - OUT_RING(ring, 0xffffffff); + uint32_t anum = align(num, 4); + uint32_t i; + + debug_assert((regid % 4) == 0); + + OUT_PKT3(ring, CP_LOAD_STATE, 2 + anum); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(regid / 2) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb[type]) | + CP_LOAD_STATE_0_NUM_UNIT(anum / 2)); + OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS)); + + for (i = 0; i < num; i++) { + if (bos[i]) { + OUT_RELOC(ring, bos[i], offsets[i], 0, 0); + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); + } + } + + for (; i < anum; i++) + OUT_RING(ring, 0xffffffff); } static bool is_stateobj(struct fd_ringbuffer *ring) { - return false; + return false; } static void -emit_const_ptrs(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t dst_offset, - uint32_t num, struct fd_bo **bos, uint32_t *offsets) +emit_const_ptrs(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, + uint32_t dst_offset, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - /* TODO inline this */ - assert(dst_offset + num <= v->constlen * 4); - fd3_emit_const_ptrs(ring, v->type, dst_offset, num, bos, offsets); + /* TODO inline this */ + assert(dst_offset + num <= v->constlen * 4); + fd3_emit_const_ptrs(ring, v->type, dst_offset, num, bos, offsets); } -#define VERT_TEX_OFF 0 -#define FRAG_TEX_OFF 16 -#define BASETABLE_SZ A3XX_MAX_MIP_LEVELS +#define VERT_TEX_OFF 0 +#define FRAG_TEX_OFF 16 +#define BASETABLE_SZ A3XX_MAX_MIP_LEVELS static void emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum adreno_state_block sb, struct fd_texture_stateobj *tex) + enum adreno_state_block sb, struct fd_texture_stateobj *tex) { - static const unsigned tex_off[] = { - [SB_VERT_TEX] = VERT_TEX_OFF, - [SB_FRAG_TEX] = FRAG_TEX_OFF, - }; - static const enum adreno_state_block mipaddr[] = { - [SB_VERT_TEX] = SB_VERT_MIPADDR, - [SB_FRAG_TEX] = SB_FRAG_MIPADDR, - }; - static const uint32_t bcolor_reg[] = { - [SB_VERT_TEX] = REG_A3XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, - [SB_FRAG_TEX] = REG_A3XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, - }; - struct fd3_context *fd3_ctx = fd3_context(ctx); - bool needs_border = false; - unsigned i, j; - - if (tex->num_samplers > 0) { - /* output sampler state: */ - 
OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * tex->num_samplers)); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(tex_off[sb]) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | - CP_LOAD_STATE_0_NUM_UNIT(tex->num_samplers)); - OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) | - CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - for (i = 0; i < tex->num_samplers; i++) { - static const struct fd3_sampler_stateobj dummy_sampler = {}; - const struct fd3_sampler_stateobj *sampler = tex->samplers[i] ? - fd3_sampler_stateobj(tex->samplers[i]) : - &dummy_sampler; - - OUT_RING(ring, sampler->texsamp0); - OUT_RING(ring, sampler->texsamp1); - - needs_border |= sampler->needs_border; - } - } - - if (tex->num_textures > 0) { - /* emit texture state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 2 + (4 * tex->num_textures)); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(tex_off[sb]) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | - CP_LOAD_STATE_0_NUM_UNIT(tex->num_textures)); - OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | - CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - for (i = 0; i < tex->num_textures; i++) { - static const struct fd3_pipe_sampler_view dummy_view = {}; - const struct fd3_pipe_sampler_view *view = tex->textures[i] ? - fd3_pipe_sampler_view(tex->textures[i]) : - &dummy_view; - OUT_RING(ring, view->texconst0); - OUT_RING(ring, view->texconst1); - OUT_RING(ring, view->texconst2 | - A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); - OUT_RING(ring, view->texconst3); - } - - /* emit mipaddrs: */ - OUT_PKT3(ring, CP_LOAD_STATE, 2 + (BASETABLE_SZ * tex->num_textures)); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(BASETABLE_SZ * tex_off[sb]) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(mipaddr[sb]) | - CP_LOAD_STATE_0_NUM_UNIT(BASETABLE_SZ * tex->num_textures)); - OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | - CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - for (i = 0; i < tex->num_textures; i++) { - static const struct fd3_pipe_sampler_view dummy_view = { - .base.target = PIPE_TEXTURE_1D, /* anything !PIPE_BUFFER */ - .base.u.tex.first_level = 1, - }; - const struct fd3_pipe_sampler_view *view = tex->textures[i] ? 
- fd3_pipe_sampler_view(tex->textures[i]) : - &dummy_view; - struct fd_resource *rsc = fd_resource(view->base.texture); - if (rsc && rsc->b.b.target == PIPE_BUFFER) { - OUT_RELOC(ring, rsc->bo, view->base.u.buf.offset, 0, 0); - j = 1; - } else { - unsigned start = fd_sampler_first_level(&view->base); - unsigned end = fd_sampler_last_level(&view->base); - - for (j = 0; j < (end - start + 1); j++) { - struct fdl_slice *slice = fd_resource_slice(rsc, j + start); - OUT_RELOC(ring, rsc->bo, slice->offset, 0, 0); - } - } - - /* pad the remaining entries w/ null: */ - for (; j < BASETABLE_SZ; j++) { - OUT_RING(ring, 0x00000000); - } - } - } - - if (needs_border) { - unsigned off; - void *ptr; - - u_upload_alloc(fd3_ctx->border_color_uploader, - 0, BORDER_COLOR_UPLOAD_SIZE, - BORDER_COLOR_UPLOAD_SIZE, &off, - &fd3_ctx->border_color_buf, - &ptr); - - fd_setup_border_colors(tex, ptr, tex_off[sb]); - - OUT_PKT0(ring, bcolor_reg[sb], 1); - OUT_RELOC(ring, fd_resource(fd3_ctx->border_color_buf)->bo, off, 0, 0); - - u_upload_unmap(fd3_ctx->border_color_uploader); - } + static const unsigned tex_off[] = { + [SB_VERT_TEX] = VERT_TEX_OFF, + [SB_FRAG_TEX] = FRAG_TEX_OFF, + }; + static const enum adreno_state_block mipaddr[] = { + [SB_VERT_TEX] = SB_VERT_MIPADDR, + [SB_FRAG_TEX] = SB_FRAG_MIPADDR, + }; + static const uint32_t bcolor_reg[] = { + [SB_VERT_TEX] = REG_A3XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, + [SB_FRAG_TEX] = REG_A3XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, + }; + struct fd3_context *fd3_ctx = fd3_context(ctx); + bool needs_border = false; + unsigned i, j; + + if (tex->num_samplers > 0) { + /* output sampler state: */ + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (2 * tex->num_samplers)); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(tex_off[sb]) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb) | + CP_LOAD_STATE_0_NUM_UNIT(tex->num_samplers)); + OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) | + CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); + for (i = 0; i < tex->num_samplers; i++) { + static const struct fd3_sampler_stateobj dummy_sampler = {}; + const struct fd3_sampler_stateobj *sampler = + tex->samplers[i] ? fd3_sampler_stateobj(tex->samplers[i]) + : &dummy_sampler; + + OUT_RING(ring, sampler->texsamp0); + OUT_RING(ring, sampler->texsamp1); + + needs_border |= sampler->needs_border; + } + } + + if (tex->num_textures > 0) { + /* emit texture state: */ + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (4 * tex->num_textures)); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(tex_off[sb]) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(sb) | + CP_LOAD_STATE_0_NUM_UNIT(tex->num_textures)); + OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | + CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); + for (i = 0; i < tex->num_textures; i++) { + static const struct fd3_pipe_sampler_view dummy_view = {}; + const struct fd3_pipe_sampler_view *view = + tex->textures[i] ? 
fd3_pipe_sampler_view(tex->textures[i]) + : &dummy_view; + OUT_RING(ring, view->texconst0); + OUT_RING(ring, view->texconst1); + OUT_RING(ring, + view->texconst2 | A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); + OUT_RING(ring, view->texconst3); + } + + /* emit mipaddrs: */ + OUT_PKT3(ring, CP_LOAD_STATE, 2 + (BASETABLE_SZ * tex->num_textures)); + OUT_RING(ring, + CP_LOAD_STATE_0_DST_OFF(BASETABLE_SZ * tex_off[sb]) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(mipaddr[sb]) | + CP_LOAD_STATE_0_NUM_UNIT(BASETABLE_SZ * tex->num_textures)); + OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | + CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); + for (i = 0; i < tex->num_textures; i++) { + static const struct fd3_pipe_sampler_view dummy_view = { + .base.target = PIPE_TEXTURE_1D, /* anything !PIPE_BUFFER */ + .base.u.tex.first_level = 1, + }; + const struct fd3_pipe_sampler_view *view = + tex->textures[i] ? fd3_pipe_sampler_view(tex->textures[i]) + : &dummy_view; + struct fd_resource *rsc = fd_resource(view->base.texture); + if (rsc && rsc->b.b.target == PIPE_BUFFER) { + OUT_RELOC(ring, rsc->bo, view->base.u.buf.offset, 0, 0); + j = 1; + } else { + unsigned start = fd_sampler_first_level(&view->base); + unsigned end = fd_sampler_last_level(&view->base); + + for (j = 0; j < (end - start + 1); j++) { + struct fdl_slice *slice = fd_resource_slice(rsc, j + start); + OUT_RELOC(ring, rsc->bo, slice->offset, 0, 0); + } + } + + /* pad the remaining entries w/ null: */ + for (; j < BASETABLE_SZ; j++) { + OUT_RING(ring, 0x00000000); + } + } + } + + if (needs_border) { + unsigned off; + void *ptr; + + u_upload_alloc(fd3_ctx->border_color_uploader, 0, + BORDER_COLOR_UPLOAD_SIZE, BORDER_COLOR_UPLOAD_SIZE, &off, + &fd3_ctx->border_color_buf, &ptr); + + fd_setup_border_colors(tex, ptr, tex_off[sb]); + + OUT_PKT0(ring, bcolor_reg[sb], 1); + OUT_RELOC(ring, fd_resource(fd3_ctx->border_color_buf)->bo, off, 0, 0); + + u_upload_unmap(fd3_ctx->border_color_uploader); + } } /* emit texture state for mem->gmem restore operation.. 
eventually it would @@ -276,551 +275,565 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, */ void fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, - struct pipe_surface **psurf, - int bufs) + struct pipe_surface **psurf, int bufs) { - int i, j; - - /* output sampler state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 2 + 2 * bufs); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(FRAG_TEX_OFF) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | - CP_LOAD_STATE_0_NUM_UNIT(bufs)); - OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) | - CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - for (i = 0; i < bufs; i++) { - OUT_RING(ring, A3XX_TEX_SAMP_0_XY_MAG(A3XX_TEX_NEAREST) | - A3XX_TEX_SAMP_0_XY_MIN(A3XX_TEX_NEAREST) | - A3XX_TEX_SAMP_0_WRAP_S(A3XX_TEX_CLAMP_TO_EDGE) | - A3XX_TEX_SAMP_0_WRAP_T(A3XX_TEX_CLAMP_TO_EDGE) | - A3XX_TEX_SAMP_0_WRAP_R(A3XX_TEX_REPEAT)); - OUT_RING(ring, 0x00000000); - } - - /* emit texture state: */ - OUT_PKT3(ring, CP_LOAD_STATE, 2 + 4 * bufs); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(FRAG_TEX_OFF) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | - CP_LOAD_STATE_0_NUM_UNIT(bufs)); - OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | - CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - for (i = 0; i < bufs; i++) { - if (!psurf[i]) { - OUT_RING(ring, A3XX_TEX_CONST_0_TYPE(A3XX_TEX_2D) | - A3XX_TEX_CONST_0_SWIZ_X(A3XX_TEX_ONE) | - A3XX_TEX_CONST_0_SWIZ_Y(A3XX_TEX_ONE) | - A3XX_TEX_CONST_0_SWIZ_Z(A3XX_TEX_ONE) | - A3XX_TEX_CONST_0_SWIZ_W(A3XX_TEX_ONE)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); - OUT_RING(ring, 0x00000000); - continue; - } - - struct fd_resource *rsc = fd_resource(psurf[i]->texture); - enum pipe_format format = fd_gmem_restore_format(psurf[i]->format); - /* The restore blit_zs shader expects stencil in sampler 0, and depth - * in sampler 1 - */ - if (rsc->stencil && i == 0) { - rsc = rsc->stencil; - format = fd_gmem_restore_format(rsc->b.b.format); - } - - /* note: PIPE_BUFFER disallowed for surfaces */ - unsigned lvl = psurf[i]->u.tex.level; - - debug_assert(psurf[i]->u.tex.first_layer == psurf[i]->u.tex.last_layer); - - OUT_RING(ring, A3XX_TEX_CONST_0_TILE_MODE(rsc->layout.tile_mode) | - A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(format)) | - A3XX_TEX_CONST_0_TYPE(A3XX_TEX_2D) | - fd3_tex_swiz(format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); - OUT_RING(ring, A3XX_TEX_CONST_1_WIDTH(psurf[i]->width) | - A3XX_TEX_CONST_1_HEIGHT(psurf[i]->height)); - OUT_RING(ring, A3XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)) | - A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); - OUT_RING(ring, 0x00000000); - } - - /* emit mipaddrs: */ - OUT_PKT3(ring, CP_LOAD_STATE, 2 + BASETABLE_SZ * bufs); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(BASETABLE_SZ * FRAG_TEX_OFF) | - CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | - CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_MIPADDR) | - CP_LOAD_STATE_0_NUM_UNIT(BASETABLE_SZ * bufs)); - OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | - CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); - for (i = 0; i < bufs; i++) { - if (psurf[i]) { - struct fd_resource *rsc = fd_resource(psurf[i]->texture); - /* Matches above logic for blit_zs shader */ - if (rsc->stencil && i == 0) - rsc = rsc->stencil; - unsigned lvl = psurf[i]->u.tex.level; - uint32_t offset = fd_resource_offset(rsc, lvl, psurf[i]->u.tex.first_layer); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); - } else { - OUT_RING(ring, 0x00000000); - } - - /* pad the remaining entries w/ null: */ - 
for (j = 1; j < BASETABLE_SZ; j++) { - OUT_RING(ring, 0x00000000); - } - } + int i, j; + + /* output sampler state: */ + OUT_PKT3(ring, CP_LOAD_STATE, 2 + 2 * bufs); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(FRAG_TEX_OFF) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | + CP_LOAD_STATE_0_NUM_UNIT(bufs)); + OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER) | + CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); + for (i = 0; i < bufs; i++) { + OUT_RING(ring, A3XX_TEX_SAMP_0_XY_MAG(A3XX_TEX_NEAREST) | + A3XX_TEX_SAMP_0_XY_MIN(A3XX_TEX_NEAREST) | + A3XX_TEX_SAMP_0_WRAP_S(A3XX_TEX_CLAMP_TO_EDGE) | + A3XX_TEX_SAMP_0_WRAP_T(A3XX_TEX_CLAMP_TO_EDGE) | + A3XX_TEX_SAMP_0_WRAP_R(A3XX_TEX_REPEAT)); + OUT_RING(ring, 0x00000000); + } + + /* emit texture state: */ + OUT_PKT3(ring, CP_LOAD_STATE, 2 + 4 * bufs); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(FRAG_TEX_OFF) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_TEX) | + CP_LOAD_STATE_0_NUM_UNIT(bufs)); + OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | + CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); + for (i = 0; i < bufs; i++) { + if (!psurf[i]) { + OUT_RING(ring, A3XX_TEX_CONST_0_TYPE(A3XX_TEX_2D) | + A3XX_TEX_CONST_0_SWIZ_X(A3XX_TEX_ONE) | + A3XX_TEX_CONST_0_SWIZ_Y(A3XX_TEX_ONE) | + A3XX_TEX_CONST_0_SWIZ_Z(A3XX_TEX_ONE) | + A3XX_TEX_CONST_0_SWIZ_W(A3XX_TEX_ONE)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); + OUT_RING(ring, 0x00000000); + continue; + } + + struct fd_resource *rsc = fd_resource(psurf[i]->texture); + enum pipe_format format = fd_gmem_restore_format(psurf[i]->format); + /* The restore blit_zs shader expects stencil in sampler 0, and depth + * in sampler 1 + */ + if (rsc->stencil && i == 0) { + rsc = rsc->stencil; + format = fd_gmem_restore_format(rsc->b.b.format); + } + + /* note: PIPE_BUFFER disallowed for surfaces */ + unsigned lvl = psurf[i]->u.tex.level; + + debug_assert(psurf[i]->u.tex.first_layer == psurf[i]->u.tex.last_layer); + + OUT_RING(ring, A3XX_TEX_CONST_0_TILE_MODE(rsc->layout.tile_mode) | + A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(format)) | + A3XX_TEX_CONST_0_TYPE(A3XX_TEX_2D) | + fd3_tex_swiz(format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); + OUT_RING(ring, A3XX_TEX_CONST_1_WIDTH(psurf[i]->width) | + A3XX_TEX_CONST_1_HEIGHT(psurf[i]->height)); + OUT_RING(ring, A3XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)) | + A3XX_TEX_CONST_2_INDX(BASETABLE_SZ * i)); + OUT_RING(ring, 0x00000000); + } + + /* emit mipaddrs: */ + OUT_PKT3(ring, CP_LOAD_STATE, 2 + BASETABLE_SZ * bufs); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(BASETABLE_SZ * FRAG_TEX_OFF) | + CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) | + CP_LOAD_STATE_0_STATE_BLOCK(SB_FRAG_MIPADDR) | + CP_LOAD_STATE_0_NUM_UNIT(BASETABLE_SZ * bufs)); + OUT_RING(ring, CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS) | + CP_LOAD_STATE_1_EXT_SRC_ADDR(0)); + for (i = 0; i < bufs; i++) { + if (psurf[i]) { + struct fd_resource *rsc = fd_resource(psurf[i]->texture); + /* Matches above logic for blit_zs shader */ + if (rsc->stencil && i == 0) + rsc = rsc->stencil; + unsigned lvl = psurf[i]->u.tex.level; + uint32_t offset = + fd_resource_offset(rsc, lvl, psurf[i]->u.tex.first_layer); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); + } else { + OUT_RING(ring, 0x00000000); + } + + /* pad the remaining entries w/ null: */ + for (j = 1; j < BASETABLE_SZ; j++) { + OUT_RING(ring, 0x00000000); + } + } } void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit) { - int32_t i, j, last = -1; 
- uint32_t total_in = 0; - const struct fd_vertex_state *vtx = emit->vtx; - const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); - unsigned vertex_regid = regid(63, 0); - unsigned instance_regid = regid(63, 0); - unsigned vtxcnt_regid = regid(63, 0); - - /* Note that sysvals come *after* normal inputs: */ - for (i = 0; i < vp->inputs_count; i++) { - if (!vp->inputs[i].compmask) - continue; - if (vp->inputs[i].sysval) { - switch(vp->inputs[i].slot) { - case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: - vertex_regid = vp->inputs[i].regid; - break; - case SYSTEM_VALUE_INSTANCE_ID: - instance_regid = vp->inputs[i].regid; - break; - case SYSTEM_VALUE_VERTEX_CNT: - vtxcnt_regid = vp->inputs[i].regid; - break; - default: - unreachable("invalid system value"); - break; - } - } else if (i < vtx->vtx->num_elements) { - last = i; - } - } - - for (i = 0, j = 0; i <= last; i++) { - assert(!vp->inputs[i].sysval); - if (vp->inputs[i].compmask) { - struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; - const struct pipe_vertex_buffer *vb = - &vtx->vertexbuf.vb[elem->vertex_buffer_index]; - struct fd_resource *rsc = fd_resource(vb->buffer.resource); - enum pipe_format pfmt = elem->src_format; - enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt); - bool switchnext = (i != last) || - (vertex_regid != regid(63, 0)) || - (instance_regid != regid(63, 0)) || - (vtxcnt_regid != regid(63, 0)); - bool isint = util_format_is_pure_integer(pfmt); - uint32_t off = vb->buffer_offset + elem->src_offset; - uint32_t fs = util_format_get_blocksize(pfmt); + int32_t i, j, last = -1; + uint32_t total_in = 0; + const struct fd_vertex_state *vtx = emit->vtx; + const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); + unsigned vertex_regid = regid(63, 0); + unsigned instance_regid = regid(63, 0); + unsigned vtxcnt_regid = regid(63, 0); + + /* Note that sysvals come *after* normal inputs: */ + for (i = 0; i < vp->inputs_count; i++) { + if (!vp->inputs[i].compmask) + continue; + if (vp->inputs[i].sysval) { + switch (vp->inputs[i].slot) { + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: + vertex_regid = vp->inputs[i].regid; + break; + case SYSTEM_VALUE_INSTANCE_ID: + instance_regid = vp->inputs[i].regid; + break; + case SYSTEM_VALUE_VERTEX_CNT: + vtxcnt_regid = vp->inputs[i].regid; + break; + default: + unreachable("invalid system value"); + break; + } + } else if (i < vtx->vtx->num_elements) { + last = i; + } + } + + for (i = 0, j = 0; i <= last; i++) { + assert(!vp->inputs[i].sysval); + if (vp->inputs[i].compmask) { + struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; + const struct pipe_vertex_buffer *vb = + &vtx->vertexbuf.vb[elem->vertex_buffer_index]; + struct fd_resource *rsc = fd_resource(vb->buffer.resource); + enum pipe_format pfmt = elem->src_format; + enum a3xx_vtx_fmt fmt = fd3_pipe2vtx(pfmt); + bool switchnext = (i != last) || (vertex_regid != regid(63, 0)) || + (instance_regid != regid(63, 0)) || + (vtxcnt_regid != regid(63, 0)); + bool isint = util_format_is_pure_integer(pfmt); + uint32_t off = vb->buffer_offset + elem->src_offset; + uint32_t fs = util_format_get_blocksize(pfmt); #ifdef DEBUG - /* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 - * should mesa/st be protecting us from this? - */ - if (off > fd_bo_size(rsc->bo)) - continue; + /* see + * dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 + * should mesa/st be protecting us from this? 
+ */ + if (off > fd_bo_size(rsc->bo)) + continue; #endif - debug_assert(fmt != VFMT_NONE); - - OUT_PKT0(ring, REG_A3XX_VFD_FETCH(j), 2); - OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) | - A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) | - COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | - A3XX_VFD_FETCH_INSTR_0_INDEXCODE(j) | - COND(elem->instance_divisor, A3XX_VFD_FETCH_INSTR_0_INSTANCED) | - A3XX_VFD_FETCH_INSTR_0_STEPRATE(MAX2(1, elem->instance_divisor))); - OUT_RELOC(ring, rsc->bo, off, 0, 0); - - OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(j), 1); - OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL | - A3XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) | - A3XX_VFD_DECODE_INSTR_FORMAT(fmt) | - A3XX_VFD_DECODE_INSTR_SWAP(fd3_pipe2swap(pfmt)) | - A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) | - A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) | - A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | - COND(isint, A3XX_VFD_DECODE_INSTR_INT) | - COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); - - total_in += util_bitcount(vp->inputs[i].compmask); - j++; - } - } - - /* hw doesn't like to be configured for zero vbo's, it seems: */ - if (last < 0) { - /* just recycle the shader bo, we just need to point to *something* - * valid: - */ - struct fd_bo *dummy_vbo = vp->bo; - bool switchnext = (vertex_regid != regid(63, 0)) || - (instance_regid != regid(63, 0)) || - (vtxcnt_regid != regid(63, 0)); - - OUT_PKT0(ring, REG_A3XX_VFD_FETCH(0), 2); - OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(0) | - A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(0) | - COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | - A3XX_VFD_FETCH_INSTR_0_INDEXCODE(0) | - A3XX_VFD_FETCH_INSTR_0_STEPRATE(1)); - OUT_RELOC(ring, dummy_vbo, 0, 0, 0); - - OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(0), 1); - OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL | - A3XX_VFD_DECODE_INSTR_WRITEMASK(0x1) | - A3XX_VFD_DECODE_INSTR_FORMAT(VFMT_8_UNORM) | - A3XX_VFD_DECODE_INSTR_SWAP(XYZW) | - A3XX_VFD_DECODE_INSTR_REGID(regid(0,0)) | - A3XX_VFD_DECODE_INSTR_SHIFTCNT(1) | - A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | - COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); - - total_in = 1; - j = 1; - } - - OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); - OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) | - A3XX_VFD_CONTROL_0_PACKETSIZE(2) | - A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) | - A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j)); - OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX - A3XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | - A3XX_VFD_CONTROL_1_REGID4INST(instance_regid)); - - OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); - OUT_RING(ring, A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | - A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(vtxcnt_regid)); + debug_assert(fmt != VFMT_NONE); + + OUT_PKT0(ring, REG_A3XX_VFD_FETCH(j), 2); + OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) | + A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) | + COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | + A3XX_VFD_FETCH_INSTR_0_INDEXCODE(j) | + COND(elem->instance_divisor, + A3XX_VFD_FETCH_INSTR_0_INSTANCED) | + A3XX_VFD_FETCH_INSTR_0_STEPRATE( + MAX2(1, elem->instance_divisor))); + OUT_RELOC(ring, rsc->bo, off, 0, 0); + + OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(j), 1); + OUT_RING(ring, + A3XX_VFD_DECODE_INSTR_CONSTFILL | + A3XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) | + A3XX_VFD_DECODE_INSTR_FORMAT(fmt) | + A3XX_VFD_DECODE_INSTR_SWAP(fd3_pipe2swap(pfmt)) | + A3XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) | + A3XX_VFD_DECODE_INSTR_SHIFTCNT(fs) | + 
A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | + COND(isint, A3XX_VFD_DECODE_INSTR_INT) | + COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); + + total_in += util_bitcount(vp->inputs[i].compmask); + j++; + } + } + + /* hw doesn't like to be configured for zero vbo's, it seems: */ + if (last < 0) { + /* just recycle the shader bo, we just need to point to *something* + * valid: + */ + struct fd_bo *dummy_vbo = vp->bo; + bool switchnext = (vertex_regid != regid(63, 0)) || + (instance_regid != regid(63, 0)) || + (vtxcnt_regid != regid(63, 0)); + + OUT_PKT0(ring, REG_A3XX_VFD_FETCH(0), 2); + OUT_RING(ring, A3XX_VFD_FETCH_INSTR_0_FETCHSIZE(0) | + A3XX_VFD_FETCH_INSTR_0_BUFSTRIDE(0) | + COND(switchnext, A3XX_VFD_FETCH_INSTR_0_SWITCHNEXT) | + A3XX_VFD_FETCH_INSTR_0_INDEXCODE(0) | + A3XX_VFD_FETCH_INSTR_0_STEPRATE(1)); + OUT_RELOC(ring, dummy_vbo, 0, 0, 0); + + OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(0), 1); + OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL | + A3XX_VFD_DECODE_INSTR_WRITEMASK(0x1) | + A3XX_VFD_DECODE_INSTR_FORMAT(VFMT_8_UNORM) | + A3XX_VFD_DECODE_INSTR_SWAP(XYZW) | + A3XX_VFD_DECODE_INSTR_REGID(regid(0, 0)) | + A3XX_VFD_DECODE_INSTR_SHIFTCNT(1) | + A3XX_VFD_DECODE_INSTR_LASTCOMPVALID | + COND(switchnext, A3XX_VFD_DECODE_INSTR_SWITCHNEXT)); + + total_in = 1; + j = 1; + } + + OUT_PKT0(ring, REG_A3XX_VFD_CONTROL_0, 2); + OUT_RING(ring, A3XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) | + A3XX_VFD_CONTROL_0_PACKETSIZE(2) | + A3XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) | + A3XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j)); + OUT_RING(ring, A3XX_VFD_CONTROL_1_MAXSTORAGE(1) | // XXX + A3XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | + A3XX_VFD_CONTROL_1_REGID4INST(instance_regid)); + + OUT_PKT0(ring, REG_A3XX_VFD_VS_THREADING_THRESHOLD, 1); + OUT_RING(ring, + A3XX_VFD_VS_THREADING_THRESHOLD_REGID_THRESHOLD(15) | + A3XX_VFD_VS_THREADING_THRESHOLD_REGID_VTXCNT(vtxcnt_regid)); } void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd3_emit *emit) + struct fd3_emit *emit) { - const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); - const struct ir3_shader_variant *fp = fd3_emit_get_fp(emit); - const enum fd_dirty_3d_state dirty = emit->dirty; - - emit_marker(ring, 5); - - if (dirty & FD_DIRTY_SAMPLE_MASK) { - OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | - A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | - A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(ctx->sample_mask)); - } - - if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG | FD_DIRTY_BLEND_DUAL)) && - !emit->binning_pass) { - uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_render_control | - fd3_blend_stateobj(ctx->blend)->rb_render_control; - - val |= COND(fp->frag_face, A3XX_RB_RENDER_CONTROL_FACENESS); - val |= COND(fp->fragcoord_compmask != 0, - A3XX_RB_RENDER_CONTROL_COORD_MASK(fp->fragcoord_compmask)); - val |= COND(ctx->rasterizer->rasterizer_discard, - A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); - - /* I suppose if we needed to (which I don't *think* we need - * to), we could emit this for binning pass too. But we - * would need to keep a different patch-list for binning - * vs render pass. 
- */ - - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); - OUT_RINGP(ring, val, &ctx->batch->rbrc_patches); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { - struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa); - struct pipe_stencil_ref *sr = &ctx->stencil_ref; - - OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1); - OUT_RING(ring, zsa->rb_alpha_ref); - - OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); - OUT_RING(ring, zsa->rb_stencil_control); - - OUT_PKT0(ring, REG_A3XX_RB_STENCILREFMASK, 2); - OUT_RING(ring, zsa->rb_stencilrefmask | - A3XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); - OUT_RING(ring, zsa->rb_stencilrefmask_bf | - A3XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1])); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { - uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_depth_control; - if (fp->writes_pos) { - val |= A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z; - val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; - } - if (fp->no_earlyz || fp->has_kill) { - val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; - } - if (!ctx->rasterizer->depth_clip_near) { - val |= A3XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE; - } - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, val); - } - - if (dirty & FD_DIRTY_RASTERIZER) { - struct fd3_rasterizer_stateobj *rasterizer = - fd3_rasterizer_stateobj(ctx->rasterizer); - - OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); - OUT_RING(ring, rasterizer->gras_su_mode_control); - - OUT_PKT0(ring, REG_A3XX_GRAS_SU_POINT_MINMAX, 2); - OUT_RING(ring, rasterizer->gras_su_point_minmax); - OUT_RING(ring, rasterizer->gras_su_point_size); - - OUT_PKT0(ring, REG_A3XX_GRAS_SU_POLY_OFFSET_SCALE, 2); - OUT_RING(ring, rasterizer->gras_su_poly_offset_scale); - OUT_RING(ring, rasterizer->gras_su_poly_offset_offset); - } - - if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { - uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer) - ->gras_cl_clip_cntl; - uint8_t planes = ctx->rasterizer->clip_plane_enable; - val |= CONDREG(ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL), - A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER); - val |= CONDREG(ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL), - A3XX_GRAS_CL_CLIP_CNTL_IJ_NON_PERSP_CENTER); - val |= CONDREG(ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID), - A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTROID); - val |= CONDREG(ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID), - A3XX_GRAS_CL_CLIP_CNTL_IJ_NON_PERSP_CENTROID); - /* docs say enable at least one of IJ_PERSP_CENTER/CENTROID when fragcoord is used */ - val |= CONDREG(ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRAG_COORD), - A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER); - val |= COND(fp->writes_pos, A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE); - val |= COND(fp->fragcoord_compmask != 0, A3XX_GRAS_CL_CLIP_CNTL_ZCOORD | - A3XX_GRAS_CL_CLIP_CNTL_WCOORD); - if (!emit->key.key.ucp_enables) - val |= A3XX_GRAS_CL_CLIP_CNTL_NUM_USER_CLIP_PLANES( - MIN2(util_bitcount(planes), 6)); - OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, val); - } - - if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG | FD_DIRTY_UCP)) { - uint32_t planes = ctx->rasterizer->clip_plane_enable; - int count = 0; - - if (emit->key.key.ucp_enables) - planes = 0; - - while (planes && count < 6) { - int i = ffs(planes) - 1; - - planes &= ~(1U << i); - fd_wfi(ctx->batch, ring); - OUT_PKT0(ring, REG_A3XX_GRAS_CL_USER_PLANE(count++), 4); - OUT_RING(ring, fui(ctx->ucp.ucp[i][0])); - OUT_RING(ring, fui(ctx->ucp.ucp[i][1])); - 
OUT_RING(ring, fui(ctx->ucp.ucp[i][2])); - OUT_RING(ring, fui(ctx->ucp.ucp[i][3])); - } - } - - /* NOTE: since primitive_restart is not actually part of any - * state object, we need to make sure that we always emit - * PRIM_VTX_CNTL.. either that or be more clever and detect - * when it changes. - */ - if (emit->info) { - const struct pipe_draw_info *info = emit->info; - uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer) - ->pc_prim_vtx_cntl; - - if (!emit->binning_pass) { - uint32_t stride_in_vpc = align(fp->total_in, 4) / 4; - if (stride_in_vpc > 0) - stride_in_vpc = MAX2(stride_in_vpc, 2); - val |= A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(stride_in_vpc); - } - - if (info->index_size && info->primitive_restart) { - val |= A3XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART; - } - - val |= COND(vp->writes_psize, A3XX_PC_PRIM_VTX_CNTL_PSIZE); - - OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, val); - } - - if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER | FD_DIRTY_VIEWPORT)) { - struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - int minx = scissor->minx; - int miny = scissor->miny; - int maxx = scissor->maxx; - int maxy = scissor->maxy; - - /* Unfortunately there is no separate depth clip disable, only an all - * or nothing deal. So when we disable clipping, we must handle the - * viewport clip via scissors. - */ - if (!ctx->rasterizer->depth_clip_near) { - struct pipe_viewport_state *vp = &ctx->viewport; - minx = MAX2(minx, (int)floorf(vp->translate[0] - fabsf(vp->scale[0]))); - miny = MAX2(miny, (int)floorf(vp->translate[1] - fabsf(vp->scale[1]))); - maxx = MIN2(maxx, (int)ceilf(vp->translate[0] + fabsf(vp->scale[0]))); - maxy = MIN2(maxy, (int)ceilf(vp->translate[1] + fabsf(vp->scale[1]))); - } - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(minx) | - A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(miny)); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(maxx - 1) | - A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(maxy - 1)); - - ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, minx); - ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, miny); - ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, maxx); - ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, maxy); - } - - if (dirty & FD_DIRTY_VIEWPORT) { - fd_wfi(ctx->batch, ring); - OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(ctx->viewport.translate[0] - 0.5)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(ctx->viewport.scale[0])); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(ctx->viewport.translate[1] - 0.5)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(ctx->viewport.scale[1])); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(ctx->viewport.translate[2])); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(ctx->viewport.scale[2])); - } - - if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_FRAMEBUFFER)) { - float zmin, zmax; - int depth = 24; - if (ctx->batch->framebuffer.zsbuf) { - depth = util_format_get_component_bits( - pipe_surface_format(ctx->batch->framebuffer.zsbuf), - UTIL_FORMAT_COLORSPACE_ZS, 0); - } - util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz, - &zmin, &zmax); - - OUT_PKT0(ring, REG_A3XX_RB_Z_CLAMP_MIN, 2); - if (depth == 32) { - OUT_RING(ring, (uint32_t)(zmin * 0xffffffff)); - OUT_RING(ring, (uint32_t)(zmax * 0xffffffff)); - } else if (depth == 16) { - OUT_RING(ring, (uint32_t)(zmin * 0xffff)); - OUT_RING(ring, (uint32_t)(zmax * 
0xffff)); - } else { - OUT_RING(ring, (uint32_t)(zmin * 0xffffff)); - OUT_RING(ring, (uint32_t)(zmax * 0xffffff)); - } - } - - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_BLEND_DUAL)) { - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - int nr_cbufs = pfb->nr_cbufs; - if (fd3_blend_stateobj(ctx->blend)->rb_render_control & - A3XX_RB_RENDER_CONTROL_DUAL_COLOR_IN_ENABLE) - nr_cbufs++; - fd3_program_emit(ring, emit, nr_cbufs, pfb->cbufs); - } - - /* TODO we should not need this or fd_wfi() before emit_constants(): - */ - OUT_PKT3(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, HLSQ_FLUSH); - - if (!emit->skip_consts) { - ir3_emit_vs_consts(vp, ring, ctx, emit->info, emit->indirect, emit->draw); - if (!emit->binning_pass) - ir3_emit_fs_consts(fp, ring, ctx); - } - - if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { - struct fd3_blend_stateobj *blend = fd3_blend_stateobj(ctx->blend); - uint32_t i; - - for (i = 0; i < ARRAY_SIZE(blend->rb_mrt); i++) { - enum pipe_format format = - pipe_surface_format(ctx->batch->framebuffer.cbufs[i]); - const struct util_format_description *desc = - util_format_description(format); - bool is_float = util_format_is_float(format); - bool is_int = util_format_is_pure_integer(format); - bool has_alpha = util_format_has_alpha(format); - uint32_t control = blend->rb_mrt[i].control; - - if (is_int) { - control &= (A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK | - A3XX_RB_MRT_CONTROL_DITHER_MODE__MASK); - control |= A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); - } - - if (format == PIPE_FORMAT_NONE) - control &= ~A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; - - if (!has_alpha) { - control &= ~A3XX_RB_MRT_CONTROL_BLEND2; - } - - if (format && util_format_get_component_bits( - format, UTIL_FORMAT_COLORSPACE_RGB, 0) < 8) { - const struct pipe_rt_blend_state *rt; - if (ctx->blend->independent_blend_enable) - rt = &ctx->blend->rt[i]; - else - rt = &ctx->blend->rt[0]; - - if (!util_format_colormask_full(desc, rt->colormask)) - control |= A3XX_RB_MRT_CONTROL_READ_DEST_ENABLE; - } - - OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, control); - - OUT_PKT0(ring, REG_A3XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend->rb_mrt[i].blend_control | - COND(!is_float, A3XX_RB_MRT_BLEND_CONTROL_CLAMP_ENABLE)); - } - } - - if (dirty & FD_DIRTY_BLEND_COLOR) { - struct pipe_blend_color *bcolor = &ctx->blend_color; - OUT_PKT0(ring, REG_A3XX_RB_BLEND_RED, 4); - OUT_RING(ring, A3XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) | - A3XX_RB_BLEND_RED_FLOAT(bcolor->color[0])); - OUT_RING(ring, A3XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) | - A3XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1])); - OUT_RING(ring, A3XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) | - A3XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2])); - OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) | - A3XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3])); - } - - if (dirty & FD_DIRTY_TEX) - fd_wfi(ctx->batch, ring); - - if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) - emit_textures(ctx, ring, SB_VERT_TEX, &ctx->tex[PIPE_SHADER_VERTEX]); - - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) - emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT]); + const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd3_emit_get_fp(emit); + const enum fd_dirty_3d_state dirty = emit->dirty; + + emit_marker(ring, 5); + + if (dirty & FD_DIRTY_SAMPLE_MASK) { + OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); + 
OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | + A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | + A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(ctx->sample_mask)); + } + + if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG | + FD_DIRTY_BLEND_DUAL)) && + !emit->binning_pass) { + uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_render_control | + fd3_blend_stateobj(ctx->blend)->rb_render_control; + + val |= COND(fp->frag_face, A3XX_RB_RENDER_CONTROL_FACENESS); + val |= COND(fp->fragcoord_compmask != 0, + A3XX_RB_RENDER_CONTROL_COORD_MASK(fp->fragcoord_compmask)); + val |= COND(ctx->rasterizer->rasterizer_discard, + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + + /* I suppose if we needed to (which I don't *think* we need + * to), we could emit this for binning pass too. But we + * would need to keep a different patch-list for binning + * vs render pass. + */ + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RINGP(ring, val, &ctx->batch->rbrc_patches); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { + struct fd3_zsa_stateobj *zsa = fd3_zsa_stateobj(ctx->zsa); + struct pipe_stencil_ref *sr = &ctx->stencil_ref; + + OUT_PKT0(ring, REG_A3XX_RB_ALPHA_REF, 1); + OUT_RING(ring, zsa->rb_alpha_ref); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, zsa->rb_stencil_control); + + OUT_PKT0(ring, REG_A3XX_RB_STENCILREFMASK, 2); + OUT_RING(ring, zsa->rb_stencilrefmask | + A3XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); + OUT_RING(ring, zsa->rb_stencilrefmask_bf | + A3XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1])); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { + uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_depth_control; + if (fp->writes_pos) { + val |= A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z; + val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; + } + if (fp->no_earlyz || fp->has_kill) { + val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; + } + if (!ctx->rasterizer->depth_clip_near) { + val |= A3XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE; + } + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, val); + } + + if (dirty & FD_DIRTY_RASTERIZER) { + struct fd3_rasterizer_stateobj *rasterizer = + fd3_rasterizer_stateobj(ctx->rasterizer); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, rasterizer->gras_su_mode_control); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_POINT_MINMAX, 2); + OUT_RING(ring, rasterizer->gras_su_point_minmax); + OUT_RING(ring, rasterizer->gras_su_point_size); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_POLY_OFFSET_SCALE, 2); + OUT_RING(ring, rasterizer->gras_su_poly_offset_scale); + OUT_RING(ring, rasterizer->gras_su_poly_offset_offset); + } + + if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { + uint32_t val = + fd3_rasterizer_stateobj(ctx->rasterizer)->gras_cl_clip_cntl; + uint8_t planes = ctx->rasterizer->clip_plane_enable; + val |= CONDREG( + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL), + A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER); + val |= CONDREG( + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL), + A3XX_GRAS_CL_CLIP_CNTL_IJ_NON_PERSP_CENTER); + val |= CONDREG( + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID), + A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTROID); + val |= CONDREG( + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID), + A3XX_GRAS_CL_CLIP_CNTL_IJ_NON_PERSP_CENTROID); + /* docs say enable at least one of IJ_PERSP_CENTER/CENTROID when fragcoord + * is used */ + val |= CONDREG(ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRAG_COORD), + 
A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER); + val |= COND(fp->writes_pos, A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE); + val |= + COND(fp->fragcoord_compmask != 0, + A3XX_GRAS_CL_CLIP_CNTL_ZCOORD | A3XX_GRAS_CL_CLIP_CNTL_WCOORD); + if (!emit->key.key.ucp_enables) + val |= A3XX_GRAS_CL_CLIP_CNTL_NUM_USER_CLIP_PLANES( + MIN2(util_bitcount(planes), 6)); + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, val); + } + + if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG | FD_DIRTY_UCP)) { + uint32_t planes = ctx->rasterizer->clip_plane_enable; + int count = 0; + + if (emit->key.key.ucp_enables) + planes = 0; + + while (planes && count < 6) { + int i = ffs(planes) - 1; + + planes &= ~(1U << i); + fd_wfi(ctx->batch, ring); + OUT_PKT0(ring, REG_A3XX_GRAS_CL_USER_PLANE(count++), 4); + OUT_RING(ring, fui(ctx->ucp.ucp[i][0])); + OUT_RING(ring, fui(ctx->ucp.ucp[i][1])); + OUT_RING(ring, fui(ctx->ucp.ucp[i][2])); + OUT_RING(ring, fui(ctx->ucp.ucp[i][3])); + } + } + + /* NOTE: since primitive_restart is not actually part of any + * state object, we need to make sure that we always emit + * PRIM_VTX_CNTL.. either that or be more clever and detect + * when it changes. + */ + if (emit->info) { + const struct pipe_draw_info *info = emit->info; + uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer)->pc_prim_vtx_cntl; + + if (!emit->binning_pass) { + uint32_t stride_in_vpc = align(fp->total_in, 4) / 4; + if (stride_in_vpc > 0) + stride_in_vpc = MAX2(stride_in_vpc, 2); + val |= A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(stride_in_vpc); + } + + if (info->index_size && info->primitive_restart) { + val |= A3XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART; + } + + val |= COND(vp->writes_psize, A3XX_PC_PRIM_VTX_CNTL_PSIZE); + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, val); + } + + if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER | FD_DIRTY_VIEWPORT)) { + struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); + int minx = scissor->minx; + int miny = scissor->miny; + int maxx = scissor->maxx; + int maxy = scissor->maxy; + + /* Unfortunately there is no separate depth clip disable, only an all + * or nothing deal. So when we disable clipping, we must handle the + * viewport clip via scissors. 
+ */ + if (!ctx->rasterizer->depth_clip_near) { + struct pipe_viewport_state *vp = &ctx->viewport; + minx = MAX2(minx, (int)floorf(vp->translate[0] - fabsf(vp->scale[0]))); + miny = MAX2(miny, (int)floorf(vp->translate[1] - fabsf(vp->scale[1]))); + maxx = MIN2(maxx, (int)ceilf(vp->translate[0] + fabsf(vp->scale[0]))); + maxy = MIN2(maxy, (int)ceilf(vp->translate[1] + fabsf(vp->scale[1]))); + } + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(minx) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(miny)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(maxx - 1) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(maxy - 1)); + + ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, minx); + ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, miny); + ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, maxx); + ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, maxy); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + fd_wfi(ctx->batch, ring); + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, + A3XX_GRAS_CL_VPORT_XOFFSET(ctx->viewport.translate[0] - 0.5)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(ctx->viewport.scale[0])); + OUT_RING(ring, + A3XX_GRAS_CL_VPORT_YOFFSET(ctx->viewport.translate[1] - 0.5)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(ctx->viewport.scale[1])); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(ctx->viewport.translate[2])); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(ctx->viewport.scale[2])); + } + + if (dirty & + (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_FRAMEBUFFER)) { + float zmin, zmax; + int depth = 24; + if (ctx->batch->framebuffer.zsbuf) { + depth = util_format_get_component_bits( + pipe_surface_format(ctx->batch->framebuffer.zsbuf), + UTIL_FORMAT_COLORSPACE_ZS, 0); + } + util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz, + &zmin, &zmax); + + OUT_PKT0(ring, REG_A3XX_RB_Z_CLAMP_MIN, 2); + if (depth == 32) { + OUT_RING(ring, (uint32_t)(zmin * 0xffffffff)); + OUT_RING(ring, (uint32_t)(zmax * 0xffffffff)); + } else if (depth == 16) { + OUT_RING(ring, (uint32_t)(zmin * 0xffff)); + OUT_RING(ring, (uint32_t)(zmax * 0xffff)); + } else { + OUT_RING(ring, (uint32_t)(zmin * 0xffffff)); + OUT_RING(ring, (uint32_t)(zmax * 0xffffff)); + } + } + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_BLEND_DUAL)) { + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + int nr_cbufs = pfb->nr_cbufs; + if (fd3_blend_stateobj(ctx->blend)->rb_render_control & + A3XX_RB_RENDER_CONTROL_DUAL_COLOR_IN_ENABLE) + nr_cbufs++; + fd3_program_emit(ring, emit, nr_cbufs, pfb->cbufs); + } + + /* TODO we should not need this or fd_wfi() before emit_constants(): + */ + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, HLSQ_FLUSH); + + if (!emit->skip_consts) { + ir3_emit_vs_consts(vp, ring, ctx, emit->info, emit->indirect, emit->draw); + if (!emit->binning_pass) + ir3_emit_fs_consts(fp, ring, ctx); + } + + if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) { + struct fd3_blend_stateobj *blend = fd3_blend_stateobj(ctx->blend); + uint32_t i; + + for (i = 0; i < ARRAY_SIZE(blend->rb_mrt); i++) { + enum pipe_format format = + pipe_surface_format(ctx->batch->framebuffer.cbufs[i]); + const struct util_format_description *desc = + util_format_description(format); + bool is_float = util_format_is_float(format); + bool is_int = util_format_is_pure_integer(format); + bool has_alpha = util_format_has_alpha(format); + uint32_t control = 
blend->rb_mrt[i].control; + + if (is_int) { + control &= (A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK | + A3XX_RB_MRT_CONTROL_DITHER_MODE__MASK); + control |= A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); + } + + if (format == PIPE_FORMAT_NONE) + control &= ~A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; + + if (!has_alpha) { + control &= ~A3XX_RB_MRT_CONTROL_BLEND2; + } + + if (format && util_format_get_component_bits( + format, UTIL_FORMAT_COLORSPACE_RGB, 0) < 8) { + const struct pipe_rt_blend_state *rt; + if (ctx->blend->independent_blend_enable) + rt = &ctx->blend->rt[i]; + else + rt = &ctx->blend->rt[0]; + + if (!util_format_colormask_full(desc, rt->colormask)) + control |= A3XX_RB_MRT_CONTROL_READ_DEST_ENABLE; + } + + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, control); + + OUT_PKT0(ring, REG_A3XX_RB_MRT_BLEND_CONTROL(i), 1); + OUT_RING(ring, + blend->rb_mrt[i].blend_control | + COND(!is_float, A3XX_RB_MRT_BLEND_CONTROL_CLAMP_ENABLE)); + } + } + + if (dirty & FD_DIRTY_BLEND_COLOR) { + struct pipe_blend_color *bcolor = &ctx->blend_color; + OUT_PKT0(ring, REG_A3XX_RB_BLEND_RED, 4); + OUT_RING(ring, A3XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) | + A3XX_RB_BLEND_RED_FLOAT(bcolor->color[0])); + OUT_RING(ring, A3XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) | + A3XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1])); + OUT_RING(ring, A3XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) | + A3XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2])); + OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) | + A3XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3])); + } + + if (dirty & FD_DIRTY_TEX) + fd_wfi(ctx->batch, ring); + + if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) + emit_textures(ctx, ring, SB_VERT_TEX, &ctx->tex[PIPE_SHADER_VERTEX]); + + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) + emit_textures(ctx, ring, SB_FRAG_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT]); } /* emit setup at begin of new cmdstream buffer (don't rely on previous @@ -829,150 +842,148 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, void fd3_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_context *ctx = batch->ctx; - struct fd3_context *fd3_ctx = fd3_context(ctx); - int i; - - if (ctx->screen->gpu_id == 320) { - OUT_PKT3(ring, CP_REG_RMW, 3); - OUT_RING(ring, REG_A3XX_RBBM_CLOCK_CTL); - OUT_RING(ring, 0xfffcffff); - OUT_RING(ring, 0x00000000); - } - - fd_wfi(batch, ring); - OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); - OUT_RING(ring, 0x00007fff); - - OUT_PKT0(ring, REG_A3XX_SP_VS_PVT_MEM_PARAM_REG, 3); - OUT_RING(ring, 0x08000001); /* SP_VS_PVT_MEM_CTRL_REG */ - OUT_RELOC(ring, fd3_ctx->vs_pvt_mem, 0,0,0); /* SP_VS_PVT_MEM_ADDR_REG */ - OUT_RING(ring, 0x00000000); /* SP_VS_PVT_MEM_SIZE_REG */ - - OUT_PKT0(ring, REG_A3XX_SP_FS_PVT_MEM_PARAM_REG, 3); - OUT_RING(ring, 0x08000001); /* SP_FS_PVT_MEM_CTRL_REG */ - OUT_RELOC(ring, fd3_ctx->fs_pvt_mem, 0,0,0); /* SP_FS_PVT_MEM_ADDR_REG */ - OUT_RING(ring, 0x00000000); /* SP_FS_PVT_MEM_SIZE_REG */ - - OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); - OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 2); - OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | - A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | - 
A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); - OUT_RING(ring, 0x00000000); /* RB_ALPHA_REF */ - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); - OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | - A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); - - OUT_PKT0(ring, REG_A3XX_GRAS_TSE_DEBUG_ECO, 1); - OUT_RING(ring, 0x00000001); /* GRAS_TSE_DEBUG_ECO */ - - OUT_PKT0(ring, REG_A3XX_TPL1_TP_VS_TEX_OFFSET, 1); - OUT_RING(ring, A3XX_TPL1_TP_VS_TEX_OFFSET_SAMPLEROFFSET(VERT_TEX_OFF) | - A3XX_TPL1_TP_VS_TEX_OFFSET_MEMOBJOFFSET(VERT_TEX_OFF) | - A3XX_TPL1_TP_VS_TEX_OFFSET_BASETABLEPTR(BASETABLE_SZ * VERT_TEX_OFF)); - - OUT_PKT0(ring, REG_A3XX_TPL1_TP_FS_TEX_OFFSET, 1); - OUT_RING(ring, A3XX_TPL1_TP_FS_TEX_OFFSET_SAMPLEROFFSET(FRAG_TEX_OFF) | - A3XX_TPL1_TP_FS_TEX_OFFSET_MEMOBJOFFSET(FRAG_TEX_OFF) | - A3XX_TPL1_TP_FS_TEX_OFFSET_BASETABLEPTR(BASETABLE_SZ * FRAG_TEX_OFF)); - - OUT_PKT0(ring, REG_A3XX_VPC_VARY_CYLWRAP_ENABLE_0, 2); - OUT_RING(ring, 0x00000000); /* VPC_VARY_CYLWRAP_ENABLE_0 */ - OUT_RING(ring, 0x00000000); /* VPC_VARY_CYLWRAP_ENABLE_1 */ - - OUT_PKT0(ring, REG_A3XX_UNKNOWN_0E43, 1); - OUT_RING(ring, 0x00000001); /* UNKNOWN_0E43 */ - - OUT_PKT0(ring, REG_A3XX_UNKNOWN_0F03, 1); - OUT_RING(ring, 0x00000001); /* UNKNOWN_0F03 */ - - OUT_PKT0(ring, REG_A3XX_UNKNOWN_0EE0, 1); - OUT_RING(ring, 0x00000003); /* UNKNOWN_0EE0 */ - - OUT_PKT0(ring, REG_A3XX_UNKNOWN_0C3D, 1); - OUT_RING(ring, 0x00000001); /* UNKNOWN_0C3D */ - - OUT_PKT0(ring, REG_A3XX_HLSQ_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); /* HLSQ_PERFCOUNTER0_SELECT */ - - OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_VSPRESV_RANGE_REG, 2); - OUT_RING(ring, A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY(0) | - A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(0)); - OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | - A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); - - fd3_emit_cache_flush(batch, ring); - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ - - OUT_PKT0(ring, REG_A3XX_GRAS_SU_POINT_MINMAX, 2); - OUT_RING(ring, 0xffc00010); /* GRAS_SU_POINT_MINMAX */ - OUT_RING(ring, 0x00000008); /* GRAS_SU_POINT_SIZE */ - - OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, 0xffffffff); /* PC_RESTART_INDEX */ - - OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | - A3XX_RB_WINDOW_OFFSET_Y(0)); - - OUT_PKT0(ring, REG_A3XX_RB_BLEND_RED, 4); - OUT_RING(ring, A3XX_RB_BLEND_RED_UINT(0) | - A3XX_RB_BLEND_RED_FLOAT(0.0)); - OUT_RING(ring, A3XX_RB_BLEND_GREEN_UINT(0) | - A3XX_RB_BLEND_GREEN_FLOAT(0.0)); - OUT_RING(ring, A3XX_RB_BLEND_BLUE_UINT(0) | - A3XX_RB_BLEND_BLUE_FLOAT(0.0)); - OUT_RING(ring, A3XX_RB_BLEND_ALPHA_UINT(0xff) | - A3XX_RB_BLEND_ALPHA_FLOAT(1.0)); - - for (i = 0; i < 6; i++) { - OUT_PKT0(ring, REG_A3XX_GRAS_CL_USER_PLANE(i), 4); - OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].X */ - OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].Y */ - OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].Z */ - OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].W */ - } - - OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); - OUT_RING(ring, 0x00000000); - - fd_event_write(batch, ring, CACHE_FLUSH); - - if (is_a3xx_p0(ctx->screen)) { - OUT_PKT3(ring, CP_DRAW_INDX, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, IGNORE_VISIBILITY, 0)); - OUT_RING(ring, 0); /* NumIndices */ - } - - OUT_PKT3(ring, CP_NOP, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 
0x00000000); - OUT_RING(ring, 0x00000000); - - fd_wfi(batch, ring); - - fd_hw_query_enable(batch, ring); + struct fd_context *ctx = batch->ctx; + struct fd3_context *fd3_ctx = fd3_context(ctx); + int i; + + if (ctx->screen->gpu_id == 320) { + OUT_PKT3(ring, CP_REG_RMW, 3); + OUT_RING(ring, REG_A3XX_RBBM_CLOCK_CTL); + OUT_RING(ring, 0xfffcffff); + OUT_RING(ring, 0x00000000); + } + + fd_wfi(batch, ring); + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00007fff); + + OUT_PKT0(ring, REG_A3XX_SP_VS_PVT_MEM_PARAM_REG, 3); + OUT_RING(ring, 0x08000001); /* SP_VS_PVT_MEM_CTRL_REG */ + OUT_RELOC(ring, fd3_ctx->vs_pvt_mem, 0, 0, 0); /* SP_VS_PVT_MEM_ADDR_REG */ + OUT_RING(ring, 0x00000000); /* SP_VS_PVT_MEM_SIZE_REG */ + + OUT_PKT0(ring, REG_A3XX_SP_FS_PVT_MEM_PARAM_REG, 3); + OUT_RING(ring, 0x08000001); /* SP_FS_PVT_MEM_CTRL_REG */ + OUT_RELOC(ring, fd3_ctx->fs_pvt_mem, 0, 0, 0); /* SP_FS_PVT_MEM_ADDR_REG */ + OUT_RING(ring, 0x00000000); /* SP_FS_PVT_MEM_SIZE_REG */ + + OUT_PKT0(ring, REG_A3XX_PC_VERTEX_REUSE_BLOCK_CNTL, 1); + OUT_RING(ring, 0x0000000b); /* PC_VERTEX_REUSE_BLOCK_CNTL */ + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 2); + OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | + A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | + A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); + OUT_RING(ring, 0x00000000); /* RB_ALPHA_REF */ + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); + OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | + A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_TSE_DEBUG_ECO, 1); + OUT_RING(ring, 0x00000001); /* GRAS_TSE_DEBUG_ECO */ + + OUT_PKT0(ring, REG_A3XX_TPL1_TP_VS_TEX_OFFSET, 1); + OUT_RING(ring, A3XX_TPL1_TP_VS_TEX_OFFSET_SAMPLEROFFSET(VERT_TEX_OFF) | + A3XX_TPL1_TP_VS_TEX_OFFSET_MEMOBJOFFSET(VERT_TEX_OFF) | + A3XX_TPL1_TP_VS_TEX_OFFSET_BASETABLEPTR(BASETABLE_SZ * + VERT_TEX_OFF)); + + OUT_PKT0(ring, REG_A3XX_TPL1_TP_FS_TEX_OFFSET, 1); + OUT_RING(ring, A3XX_TPL1_TP_FS_TEX_OFFSET_SAMPLEROFFSET(FRAG_TEX_OFF) | + A3XX_TPL1_TP_FS_TEX_OFFSET_MEMOBJOFFSET(FRAG_TEX_OFF) | + A3XX_TPL1_TP_FS_TEX_OFFSET_BASETABLEPTR(BASETABLE_SZ * + FRAG_TEX_OFF)); + + OUT_PKT0(ring, REG_A3XX_VPC_VARY_CYLWRAP_ENABLE_0, 2); + OUT_RING(ring, 0x00000000); /* VPC_VARY_CYLWRAP_ENABLE_0 */ + OUT_RING(ring, 0x00000000); /* VPC_VARY_CYLWRAP_ENABLE_1 */ + + OUT_PKT0(ring, REG_A3XX_UNKNOWN_0E43, 1); + OUT_RING(ring, 0x00000001); /* UNKNOWN_0E43 */ + + OUT_PKT0(ring, REG_A3XX_UNKNOWN_0F03, 1); + OUT_RING(ring, 0x00000001); /* UNKNOWN_0F03 */ + + OUT_PKT0(ring, REG_A3XX_UNKNOWN_0EE0, 1); + OUT_RING(ring, 0x00000003); /* UNKNOWN_0EE0 */ + + OUT_PKT0(ring, REG_A3XX_UNKNOWN_0C3D, 1); + OUT_RING(ring, 0x00000001); /* UNKNOWN_0C3D */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); /* HLSQ_PERFCOUNTER0_SELECT */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_VSPRESV_RANGE_REG, 2); + OUT_RING(ring, A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY(0) | + A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(0)); + OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0) | + A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0)); + + fd3_emit_cache_flush(batch, ring); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_POINT_MINMAX, 2); + OUT_RING(ring, 0xffc00010); /* GRAS_SU_POINT_MINMAX */ 
+ OUT_RING(ring, 0x00000008); /* GRAS_SU_POINT_SIZE */ + + OUT_PKT0(ring, REG_A3XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, 0xffffffff); /* PC_RESTART_INDEX */ + + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | A3XX_RB_WINDOW_OFFSET_Y(0)); + + OUT_PKT0(ring, REG_A3XX_RB_BLEND_RED, 4); + OUT_RING(ring, A3XX_RB_BLEND_RED_UINT(0) | A3XX_RB_BLEND_RED_FLOAT(0.0)); + OUT_RING(ring, A3XX_RB_BLEND_GREEN_UINT(0) | A3XX_RB_BLEND_GREEN_FLOAT(0.0)); + OUT_RING(ring, A3XX_RB_BLEND_BLUE_UINT(0) | A3XX_RB_BLEND_BLUE_FLOAT(0.0)); + OUT_RING(ring, + A3XX_RB_BLEND_ALPHA_UINT(0xff) | A3XX_RB_BLEND_ALPHA_FLOAT(1.0)); + + for (i = 0; i < 6; i++) { + OUT_PKT0(ring, REG_A3XX_GRAS_CL_USER_PLANE(i), 4); + OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].X */ + OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].Y */ + OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].Z */ + OUT_RING(ring, 0x00000000); /* GRAS_CL_USER_PLANE[i].W */ + } + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + fd_event_write(batch, ring, CACHE_FLUSH); + + if (is_a3xx_p0(ctx->screen)) { + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, INDEX_SIZE_IGN, + IGNORE_VISIBILITY, 0)); + OUT_RING(ring, 0); /* NumIndices */ + } + + OUT_PKT3(ring, CP_NOP, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + fd_wfi(batch, ring); + + fd_hw_query_enable(batch, ring); } void fd3_emit_init_screen(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->emit_ib = fd3_emit_ib; + struct fd_screen *screen = fd_screen(pscreen); + screen->emit_ib = fd3_emit_ib; } void diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index 9672ebf..44621dc 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -29,69 +29,71 @@ #include "pipe/p_context.h" -#include "freedreno_batch.h" -#include "freedreno_context.h" #include "fd3_format.h" #include "fd3_program.h" +#include "freedreno_batch.h" +#include "freedreno_context.h" #include "ir3_cache.h" #include "ir3_gallium.h" struct fd_ringbuffer; void fd3_emit_gmem_restore_tex(struct fd_ringbuffer *ring, - struct pipe_surface **psurf, int bufs); + struct pipe_surface **psurf, int bufs); /* grouped together emit-state for prog/vertex/state emit: */ struct fd3_emit { - struct pipe_debug_callback *debug; - const struct fd_vertex_state *vtx; - const struct fd3_program_state *prog; - const struct pipe_draw_info *info; - const struct pipe_draw_indirect_info *indirect; - const struct pipe_draw_start_count *draw; - bool binning_pass; - struct ir3_cache_key key; - enum fd_dirty_3d_state dirty; - - uint32_t sprite_coord_enable; - bool sprite_coord_mode; - bool rasterflat; - bool skip_consts; - - /* cached to avoid repeated lookups of same variants: */ - const struct ir3_shader_variant *vs, *fs; + struct pipe_debug_callback *debug; + const struct fd_vertex_state *vtx; + const struct fd3_program_state *prog; + const struct pipe_draw_info *info; + const struct pipe_draw_indirect_info *indirect; + const struct pipe_draw_start_count *draw; + bool binning_pass; + struct ir3_cache_key key; + enum fd_dirty_3d_state dirty; + + uint32_t sprite_coord_enable; + bool sprite_coord_mode; + bool rasterflat; + bool skip_consts; + + /* cached to avoid repeated lookups of same variants: */ + 
const struct ir3_shader_variant *vs, *fs; }; static inline const struct ir3_shader_variant * fd3_emit_get_vp(struct fd3_emit *emit) { - if (!emit->vs) { - emit->vs = emit->binning_pass ? emit->prog->bs : emit->prog->vs; - } - return emit->vs; + if (!emit->vs) { + emit->vs = emit->binning_pass ? emit->prog->bs : emit->prog->vs; + } + return emit->vs; } static inline const struct ir3_shader_variant * fd3_emit_get_fp(struct fd3_emit *emit) { - if (!emit->fs) { - if (emit->binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fs = {}; - emit->fs = &binning_fs; - } else { - emit->fs = emit->prog->fs; - } - } - return emit->fs; + if (!emit->fs) { + if (emit->binning_pass) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct ir3_shader_variant binning_fs = {}; + emit->fs = &binning_fs; + } else { + emit->fs = emit->prog->fs; + } + } + return emit->fs; } -void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit) assert_dt; +void fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, + struct fd3_emit *emit) assert_dt; void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd3_emit *emit) assert_dt; + struct fd3_emit *emit) assert_dt; -void fd3_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt; +void fd3_emit_restore(struct fd_batch *batch, + struct fd_ringbuffer *ring) assert_dt; void fd3_emit_init_screen(struct pipe_screen *pscreen); void fd3_emit_init(struct pipe_context *pctx); @@ -99,19 +101,19 @@ void fd3_emit_init(struct pipe_context *pctx); static inline void fd3_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) { - __OUT_IB(ring, true, target); + __OUT_IB(ring, true, target); } static inline void -fd3_emit_cache_flush(struct fd_batch *batch, struct fd_ringbuffer *ring) - assert_dt +fd3_emit_cache_flush(struct fd_batch *batch, + struct fd_ringbuffer *ring) assert_dt { - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); - OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); - OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | - A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | - A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_UCHE_CACHE_INVALIDATE0_REG, 2); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE0_REG_ADDR(0)); + OUT_RING(ring, A3XX_UCHE_CACHE_INVALIDATE1_REG_ADDR(0) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_OPCODE(INVALIDATE) | + A3XX_UCHE_CACHE_INVALIDATE1_REG_ENTIRE_CACHE); } #endif /* FD3_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.c b/src/gallium/drivers/freedreno/a3xx/fd3_format.c index a6eed2e..8306b53 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_format.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.c @@ -32,42 +32,36 @@ */ struct fd3_format { - enum a3xx_vtx_fmt vtx; - enum a3xx_tex_fmt tex; - enum a3xx_color_fmt rb; - enum a3xx_color_swap swap; - boolean present; + enum a3xx_vtx_fmt vtx; + enum a3xx_tex_fmt tex; + enum a3xx_color_fmt rb; + enum a3xx_color_swap swap; + boolean present; }; /* vertex + texture */ -#define VT(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT_ ## fmt, \ - .tex = TFMT_ ## fmt, \ - .rb = RB_ ## rbfmt, \ - .swap = swapfmt \ - } +#define VT(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT_##fmt, \ + .tex = TFMT_##fmt, \ + .rb = RB_##rbfmt, \ + .swap = swapfmt} /* 
texture-only */ -#define _T(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT_NONE, \ - .tex = TFMT_ ## fmt, \ - .rb = RB_ ## rbfmt, \ - .swap = swapfmt \ - } +#define _T(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT_NONE, \ + .tex = TFMT_##fmt, \ + .rb = RB_##rbfmt, \ + .swap = swapfmt} /* vertex-only */ -#define V_(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT_ ## fmt, \ - .tex = TFMT_NONE, \ - .rb = RB_ ## rbfmt, \ - .swap = swapfmt \ - } +#define V_(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT_##fmt, \ + .tex = TFMT_NONE, \ + .rb = RB_##rbfmt, \ + .swap = swapfmt} /* clang-format off */ static struct fd3_format formats[PIPE_FORMAT_COUNT] = { @@ -294,80 +288,90 @@ static struct fd3_format formats[PIPE_FORMAT_COUNT] = { enum a3xx_vtx_fmt fd3_pipe2vtx(enum pipe_format format) { - if (!formats[format].present) - return VFMT_NONE; - return formats[format].vtx; + if (!formats[format].present) + return VFMT_NONE; + return formats[format].vtx; } enum a3xx_tex_fmt fd3_pipe2tex(enum pipe_format format) { - if (!formats[format].present) - return TFMT_NONE; - return formats[format].tex; + if (!formats[format].present) + return TFMT_NONE; + return formats[format].tex; } enum a3xx_color_fmt fd3_pipe2color(enum pipe_format format) { - if (!formats[format].present) - return RB_NONE; - return formats[format].rb; + if (!formats[format].present) + return RB_NONE; + return formats[format].rb; } enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format) { - if (!formats[format].present) - return WZYX; - return formats[format].swap; + if (!formats[format].present) + return WZYX; + return formats[format].swap; } enum a3xx_color_fmt fd3_fs_output_format(enum pipe_format format) { - if (util_format_is_srgb(format)) - return RB_R16G16B16A16_FLOAT; - switch (format) { - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R11G11B10_FLOAT: - return RB_R16G16B16A16_FLOAT; - case PIPE_FORMAT_L8_UNORM: - return RB_R8G8B8A8_UNORM; - default: - return fd3_pipe2color(format); - } + if (util_format_is_srgb(format)) + return RB_R16G16B16A16_FLOAT; + switch (format) { + case PIPE_FORMAT_R16_FLOAT: + case PIPE_FORMAT_R16G16_FLOAT: + case PIPE_FORMAT_R11G11B10_FLOAT: + return RB_R16G16B16A16_FLOAT; + case PIPE_FORMAT_L8_UNORM: + return RB_R8G8B8A8_UNORM; + default: + return fd3_pipe2color(format); + } } static inline enum a3xx_tex_swiz tex_swiz(unsigned swiz) { - switch (swiz) { - default: - case PIPE_SWIZZLE_X: return A3XX_TEX_X; - case PIPE_SWIZZLE_Y: return A3XX_TEX_Y; - case PIPE_SWIZZLE_Z: return A3XX_TEX_Z; - case PIPE_SWIZZLE_W: return A3XX_TEX_W; - case PIPE_SWIZZLE_0: return A3XX_TEX_ZERO; - case PIPE_SWIZZLE_1: return A3XX_TEX_ONE; - } + switch (swiz) { + default: + case PIPE_SWIZZLE_X: + return A3XX_TEX_X; + case PIPE_SWIZZLE_Y: + return A3XX_TEX_Y; + case PIPE_SWIZZLE_Z: + return A3XX_TEX_Z; + case PIPE_SWIZZLE_W: + return A3XX_TEX_W; + case PIPE_SWIZZLE_0: + return A3XX_TEX_ZERO; + case PIPE_SWIZZLE_1: + return A3XX_TEX_ONE; + } } uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a) + unsigned swizzle_b, unsigned swizzle_a) { - const struct util_format_description *desc = - util_format_description(format); - unsigned char swiz[4] = { - swizzle_r, swizzle_g, swizzle_b, swizzle_a, - }, rswiz[4]; - - util_format_compose_swizzles(desc->swizzle, 
swiz, rswiz); - - return A3XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) | - A3XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) | - A3XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) | - A3XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3])); + const struct util_format_description *desc = util_format_description(format); + unsigned char swiz[4] = + { + swizzle_r, + swizzle_g, + swizzle_b, + swizzle_a, + }, + rswiz[4]; + + util_format_compose_swizzles(desc->swizzle, swiz, rswiz); + + return A3XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) | + A3XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) | + A3XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) | + A3XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3])); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_format.h b/src/gallium/drivers/freedreno/a3xx/fd3_format.h index 229ed56..a057df0 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_format.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_format.h @@ -37,6 +37,7 @@ enum a3xx_color_fmt fd3_fs_output_format(enum pipe_format format); enum a3xx_color_swap fd3_pipe2swap(enum pipe_format format); uint32_t fd3_tex_swiz(enum pipe_format format, unsigned swizzle_r, - unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); + unsigned swizzle_g, unsigned swizzle_b, + unsigned swizzle_a); #endif /* FD3_FORMAT_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index cb44406..afa409e 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -25,1068 +25,1072 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_draw.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd3_gmem.h" #include "fd3_context.h" #include "fd3_emit.h" -#include "fd3_program.h" #include "fd3_format.h" +#include "fd3_gmem.h" +#include "fd3_program.h" #include "fd3_zsa.h" static void -fd3_gmem_emit_set_prog(struct fd_context *ctx, struct fd3_emit *emit, struct fd_program_stateobj *prog) +fd3_gmem_emit_set_prog(struct fd_context *ctx, struct fd3_emit *emit, + struct fd_program_stateobj *prog) { - emit->skip_consts = true; - emit->key.vs = prog->vs; - emit->key.fs = prog->fs; - emit->prog = fd3_program_state(ir3_cache_lookup(ctx->shader_cache, &emit->key, &ctx->debug)); - /* reset the fd3_emit_get_*p cache */ - emit->vs = NULL; - emit->fs = NULL; + emit->skip_consts = true; + emit->key.vs = prog->vs; + emit->key.fs = prog->fs; + emit->prog = fd3_program_state( + ir3_cache_lookup(ctx->shader_cache, &emit->key, &ctx->debug)); + /* reset the fd3_emit_get_*p cache */ + emit->vs = NULL; + emit->fs = NULL; } static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, const uint32_t *bases, uint32_t bin_w, - bool decode_srgb) + struct pipe_surface **bufs, const uint32_t *bases, uint32_t bin_w, + bool decode_srgb) { - enum a3xx_tile_mode tile_mode; - unsigned i; - - for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) { - enum pipe_format pformat = 0; - enum a3xx_color_fmt format = 0; - enum a3xx_color_swap swap = WZYX; - bool srgb = false; - struct fd_resource *rsc = NULL; - uint32_t stride = 0; - uint32_t base = 0; - uint32_t offset = 0; - - if (bin_w) { - tile_mode = TILE_32X32; - } else { - tile_mode = LINEAR; - } - - if ((i < nr_bufs) && bufs[i]) { - struct pipe_surface *psurf = bufs[i]; - - 
rsc = fd_resource(psurf->texture); - pformat = psurf->format; - /* In case we're drawing to Z32F_S8, the "color" actually goes to - * the stencil - */ - if (rsc->stencil) { - rsc = rsc->stencil; - pformat = rsc->b.b.format; - if (bases) - bases++; - } - format = fd3_pipe2color(pformat); - if (decode_srgb) - srgb = util_format_is_srgb(pformat); - else - pformat = util_format_linear(pformat); - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - swap = rsc->layout.tile_mode ? WZYX : fd3_pipe2swap(pformat); - - if (bin_w) { - stride = bin_w << fdl_cpp_shift(&rsc->layout); - - if (bases) { - base = bases[i]; - } - } else { - stride = fd_resource_pitch(rsc, psurf->u.tex.level); - tile_mode = rsc->layout.tile_mode; - } - } else if (i < nr_bufs && bases) { - base = bases[i]; - } - - OUT_PKT0(ring, REG_A3XX_RB_MRT_BUF_INFO(i), 2); - OUT_RING(ring, A3XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | - A3XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | - A3XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | - A3XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | - COND(srgb, A3XX_RB_MRT_BUF_INFO_COLOR_SRGB)); - if (bin_w || (i >= nr_bufs) || !bufs[i]) { - OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base)); - } else { - OUT_RELOC(ring, rsc->bo, offset, 0, -1); - } - - OUT_PKT0(ring, REG_A3XX_SP_FS_IMAGE_OUTPUT_REG(i), 1); - OUT_RING(ring, COND((i < nr_bufs) && bufs[i], - A3XX_SP_FS_IMAGE_OUTPUT_REG_MRTFORMAT( - fd3_fs_output_format(pformat)))); - } + enum a3xx_tile_mode tile_mode; + unsigned i; + + for (i = 0; i < A3XX_MAX_RENDER_TARGETS; i++) { + enum pipe_format pformat = 0; + enum a3xx_color_fmt format = 0; + enum a3xx_color_swap swap = WZYX; + bool srgb = false; + struct fd_resource *rsc = NULL; + uint32_t stride = 0; + uint32_t base = 0; + uint32_t offset = 0; + + if (bin_w) { + tile_mode = TILE_32X32; + } else { + tile_mode = LINEAR; + } + + if ((i < nr_bufs) && bufs[i]) { + struct pipe_surface *psurf = bufs[i]; + + rsc = fd_resource(psurf->texture); + pformat = psurf->format; + /* In case we're drawing to Z32F_S8, the "color" actually goes to + * the stencil + */ + if (rsc->stencil) { + rsc = rsc->stencil; + pformat = rsc->b.b.format; + if (bases) + bases++; + } + format = fd3_pipe2color(pformat); + if (decode_srgb) + srgb = util_format_is_srgb(pformat); + else + pformat = util_format_linear(pformat); + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + offset = fd_resource_offset(rsc, psurf->u.tex.level, + psurf->u.tex.first_layer); + swap = rsc->layout.tile_mode ? 
WZYX : fd3_pipe2swap(pformat); + + if (bin_w) { + stride = bin_w << fdl_cpp_shift(&rsc->layout); + + if (bases) { + base = bases[i]; + } + } else { + stride = fd_resource_pitch(rsc, psurf->u.tex.level); + tile_mode = rsc->layout.tile_mode; + } + } else if (i < nr_bufs && bases) { + base = bases[i]; + } + + OUT_PKT0(ring, REG_A3XX_RB_MRT_BUF_INFO(i), 2); + OUT_RING(ring, A3XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | + A3XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | + A3XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | + A3XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | + COND(srgb, A3XX_RB_MRT_BUF_INFO_COLOR_SRGB)); + if (bin_w || (i >= nr_bufs) || !bufs[i]) { + OUT_RING(ring, A3XX_RB_MRT_BUF_BASE_COLOR_BUF_BASE(base)); + } else { + OUT_RELOC(ring, rsc->bo, offset, 0, -1); + } + + OUT_PKT0(ring, REG_A3XX_SP_FS_IMAGE_OUTPUT_REG(i), 1); + OUT_RING(ring, COND((i < nr_bufs) && bufs[i], + A3XX_SP_FS_IMAGE_OUTPUT_REG_MRTFORMAT( + fd3_fs_output_format(pformat)))); + } } static bool use_hw_binning(struct fd_batch *batch) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - - /* workaround: combining scissor optimization and hw binning - * seems problematic. Seems like we end up with a mismatch - * between binning pass and rendering pass, wrt. where the hw - * thinks the vertices belong. And the blob driver doesn't - * seem to implement anything like scissor optimization, so - * not entirely sure what I might be missing. - * - * But scissor optimization is mainly for window managers, - * which don't have many vertices (and therefore doesn't - * benefit much from binning pass). - * - * So for now just disable binning if scissor optimization is - * used. - */ - if (gmem->minx || gmem->miny) - return false; - - if ((gmem->maxpw * gmem->maxph) > 32) - return false; - - if ((gmem->maxpw > 15) || (gmem->maxph > 15)) - return false; - - return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + + /* workaround: combining scissor optimization and hw binning + * seems problematic. Seems like we end up with a mismatch + * between binning pass and rendering pass, wrt. where the hw + * thinks the vertices belong. And the blob driver doesn't + * seem to implement anything like scissor optimization, so + * not entirely sure what I might be missing. + * + * But scissor optimization is mainly for window managers, + * which don't have many vertices (and therefore doesn't + * benefit much from binning pass). + * + * So for now just disable binning if scissor optimization is + * used. + */ + if (gmem->minx || gmem->miny) + return false; + + if ((gmem->maxpw * gmem->maxph) > 32) + return false; + + if ((gmem->maxpw > 15) || (gmem->maxph > 15)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); } /* workaround for (hlsq?) 
lockup with hw binning on a3xx patchlevel 0 */ static void update_vsc_pipe(struct fd_batch *batch); static void -emit_binning_workaround(struct fd_batch *batch) - assert_dt +emit_binning_workaround(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - struct fd3_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->solid_vbuf_state, - .key = { - .vs = ctx->solid_prog.vs, - .fs = ctx->solid_prog.fs, - }, - }; - - fd3_gmem_emit_set_prog(ctx, &emit, &ctx->solid_prog); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(0)); - OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) | - A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | - A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER)); - - OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); - OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | - A3XX_RB_COPY_CONTROL_MODE(0) | - A3XX_RB_COPY_CONTROL_GMEM_BASE(0)); - OUT_RELOC(ring, fd_resource(ctx->solid_vbuf)->bo, 0x20, 0, -1); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128)); - OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) | - A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) | - A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) | - A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | - A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); - - fd3_program_emit(ring, &emit, 0, NULL); - fd3_emit_vertex_bufs(ring, &emit); - - OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4); - OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | - A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | - A3XX_HLSQ_CONTROL_0_REG_RESERVED2 | - A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); - OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | - A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); - OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); - OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */ - - OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1); - OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) | - A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20)); - - OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | - A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | - A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); - - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); - - OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); - OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | - A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | - A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0)); - - OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); - OUT_RING(ring, 0); /* VFD_INDEX_MIN */ - OUT_RING(ring, 2); /* VFD_INDEX_MAX */ - OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ - OUT_RING(ring, 0); /* VFD_INDEX_OFFSET 
*/ - - OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | - A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | - A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1)); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) | - A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | - A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) | - A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0)); - - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE | - A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE | - A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE | - A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE | - A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE); - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); - OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | - A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); - - OUT_PKT3(ring, CP_DRAW_INDX_2, 5); - OUT_RING(ring, 0x00000000); /* viz query info. */ - OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE, - INDEX_SIZE_32_BIT, IGNORE_VISIBILITY, 0)); - OUT_RING(ring, 2); /* NumIndices */ - OUT_RING(ring, 2); - OUT_RING(ring, 1); - fd_reset_wfi(batch); - - OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1); - OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS)); - - OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); - - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); - OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | - A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, 0x00000000); + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + struct fd3_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->solid_vbuf_state, + .key = + { + .vs = ctx->solid_prog.vs, + .fs = ctx->solid_prog.fs, + }, + }; + + fd3_gmem_emit_set_prog(ctx, &emit, &ctx->solid_prog); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(0)); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(32) | + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); + OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | + A3XX_RB_COPY_CONTROL_MODE(0) | + A3XX_RB_COPY_CONTROL_GMEM_BASE(0)); + OUT_RELOC(ring, fd_resource(ctx->solid_vbuf)->bo, 0x20, 0, + -1); /* RB_COPY_DEST_BASE */ 
+ OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(128)); + OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(LINEAR) | + A3XX_RB_COPY_DEST_INFO_FORMAT(RB_R8G8B8A8_UNORM) | + A3XX_RB_COPY_DEST_INFO_SWAP(WZYX) | + A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | + A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + fd3_program_emit(ring, &emit, 0, NULL); + fd3_emit_vertex_bufs(ring, &emit); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 4); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | + A3XX_HLSQ_CONTROL_0_REG_RESERVED2 | + A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | + A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); + OUT_RING(ring, 0); /* HLSQ_CONTROL_3_REG */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(0x20) | + A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(0x20)); + + OUT_PKT0(ring, REG_A3XX_RB_MSAA_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MSAA_CONTROL_DISABLE | + A3XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE) | + A3XX_RB_MSAA_CONTROL_SAMPLE_MASK(0xffff)); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0.0)); + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, + A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(1)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(1)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(31) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(0)); + + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(1.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + 
OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE | + A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE | + A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_GB_CLIP_ADJ, 1); + OUT_RING(ring, A3XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | + A3XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); + + OUT_PKT3(ring, CP_DRAW_INDX_2, 5); + OUT_RING(ring, 0x00000000); /* viz query info. */ + OUT_RING(ring, DRAW(DI_PT_RECTLIST, DI_SRC_SEL_IMMEDIATE, INDEX_SIZE_32_BIT, + IGNORE_VISIBILITY, 0)); + OUT_RING(ring, 2); /* NumIndices */ + OUT_RING(ring, 2); + OUT_RING(ring, 1); + fd_reset_wfi(batch); + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 1); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS)); + + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); + + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); + OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x00000000); } /* transfer from gmem to system memory (ie. normal RAM) */ static void emit_gmem2mem_surf(struct fd_batch *batch, - enum adreno_rb_copy_control_mode mode, - bool stencil, - uint32_t base, struct pipe_surface *psurf) + enum adreno_rb_copy_control_mode mode, bool stencil, + uint32_t base, struct pipe_surface *psurf) { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_resource *rsc = fd_resource(psurf->texture); - enum pipe_format format = psurf->format; - - if (!rsc->valid) - return; - - if (stencil) { - rsc = rsc->stencil; - format = rsc->b.b.format; - } - - uint32_t offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - uint32_t pitch = fd_resource_pitch(rsc, psurf->u.tex.level); - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); - OUT_RING(ring, A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | - A3XX_RB_COPY_CONTROL_MODE(mode) | - A3XX_RB_COPY_CONTROL_GMEM_BASE(base) | - COND(format == PIPE_FORMAT_Z32_FLOAT || - format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, - A3XX_RB_COPY_CONTROL_DEPTH32_RESOLVE)); - - OUT_RELOC(ring, rsc->bo, offset, 0, -1); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(pitch)); - OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(rsc->layout.tile_mode) | - A3XX_RB_COPY_DEST_INFO_FORMAT(fd3_pipe2color(format)) | - A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | - A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | - A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(format))); - - fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 2, 0, INDEX_SIZE_IGN, 0, 0, NULL); + struct fd_ringbuffer *ring = batch->gmem; + struct fd_resource *rsc = fd_resource(psurf->texture); + enum pipe_format format = psurf->format; + + if (!rsc->valid) + return; + + if (stencil) { + rsc = rsc->stencil; + format = rsc->b.b.format; + } + + uint32_t offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + uint32_t pitch = fd_resource_pitch(rsc, psurf->u.tex.level); + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + OUT_PKT0(ring, REG_A3XX_RB_COPY_CONTROL, 4); + OUT_RING(ring, 
A3XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | + A3XX_RB_COPY_CONTROL_MODE(mode) | + A3XX_RB_COPY_CONTROL_GMEM_BASE(base) | + COND(format == PIPE_FORMAT_Z32_FLOAT || + format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT, + A3XX_RB_COPY_CONTROL_DEPTH32_RESOLVE)); + + OUT_RELOC(ring, rsc->bo, offset, 0, -1); /* RB_COPY_DEST_BASE */ + OUT_RING(ring, A3XX_RB_COPY_DEST_PITCH_PITCH(pitch)); + OUT_RING(ring, A3XX_RB_COPY_DEST_INFO_TILE(rsc->layout.tile_mode) | + A3XX_RB_COPY_DEST_INFO_FORMAT(fd3_pipe2color(format)) | + A3XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | + A3XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | + A3XX_RB_COPY_DEST_INFO_SWAP(fd3_pipe2swap(format))); + + fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 0, INDEX_SIZE_IGN, 0, 0, NULL); } static void -fd3_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd3_emit_tile_gmem2mem(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd3_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->solid_vbuf_state, - .key = { - .vs = ctx->solid_prog.vs, - .fs = ctx->solid_prog.fs, - } - }; - int i; - - emit.prog = fd3_program_state(ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); - - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); - - OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); - OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | - A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | - A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); - - OUT_PKT0(ring, REG_A3XX_RB_STENCILREFMASK, 2); - OUT_RING(ring, 0xff000000 | - A3XX_RB_STENCILREFMASK_STENCILREF(0) | - A3XX_RB_STENCILREFMASK_STENCILMASK(0) | - A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); - OUT_RING(ring, 0xff000000 | - A3XX_RB_STENCILREFMASK_STENCILREF(0) | - A3XX_RB_STENCILREFMASK_STENCILMASK(0) | - A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ - - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET((float)pfb->width/2.0 - 0.5)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE((float)pfb->width/2.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET((float)pfb->height/2.0 - 0.5)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(-(float)pfb->height/2.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(0)); - - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | - A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | - A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(batch->gmem_state->bin_w)); - - OUT_PKT0(ring, 
REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); - - OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | - A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | - A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) | - A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1)); - - OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); - OUT_RING(ring, 0); /* VFD_INDEX_MIN */ - OUT_RING(ring, 2); /* VFD_INDEX_MAX */ - OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ - OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - - fd3_program_emit(ring, &emit, 0, NULL); - fd3_emit_vertex_bufs(ring, &emit); - - if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - if (!rsc->stencil || batch->resolve & FD_BUFFER_DEPTH) - emit_gmem2mem_surf(batch, RB_COPY_DEPTH_STENCIL, false, - gmem->zsbuf_base[0], pfb->zsbuf); - if (rsc->stencil && batch->resolve & FD_BUFFER_STENCIL) - emit_gmem2mem_surf(batch, RB_COPY_DEPTH_STENCIL, true, - gmem->zsbuf_base[1], pfb->zsbuf); - } - - if (batch->resolve & FD_BUFFER_COLOR) { - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) - continue; - emit_gmem2mem_surf(batch, RB_COPY_RESOLVE, false, - gmem->cbuf_base[i], pfb->cbufs[i]); - } - } - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + struct fd_context *ctx = batch->ctx; + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd3_emit emit = {.debug = &ctx->debug, + .vtx = &ctx->solid_vbuf_state, + .key = { + .vs = ctx->solid_prog.vs, + .fs = ctx->solid_prog.fs, + }}; + int i; + + emit.prog = fd3_program_state( + ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | + A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCILREFMASK, 2); + OUT_RING(ring, 0xff000000 | A3XX_RB_STENCILREFMASK_STENCILREF(0) | + A3XX_RB_STENCILREFMASK_STENCILMASK(0) | + A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | A3XX_RB_STENCILREFMASK_STENCILREF(0) | + A3XX_RB_STENCILREFMASK_STENCILMASK(0) | + 
A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_CL_CLIP_CNTL */ + + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET((float)pfb->width / 2.0 - 0.5)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE((float)pfb->width / 2.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET((float)pfb->height / 2.0 - 0.5)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(-(float)pfb->height / 2.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(0)); + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(batch->gmem_state->bin_w)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, + A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(0) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1)); + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + + fd3_program_emit(ring, &emit, 0, NULL); + fd3_emit_vertex_bufs(ring, &emit); + + if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + if (!rsc->stencil || batch->resolve & FD_BUFFER_DEPTH) + emit_gmem2mem_surf(batch, RB_COPY_DEPTH_STENCIL, false, + gmem->zsbuf_base[0], pfb->zsbuf); + if (rsc->stencil && batch->resolve & FD_BUFFER_STENCIL) + emit_gmem2mem_surf(batch, RB_COPY_DEPTH_STENCIL, true, + gmem->zsbuf_base[1], pfb->zsbuf); + } + + if (batch->resolve & FD_BUFFER_COLOR) { + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_gmem2mem_surf(batch, RB_COPY_RESOLVE, false, gmem->cbuf_base[i], + pfb->cbufs[i]); + } + } + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); } /* transfer from system memory to gmem */ static void emit_mem2gmem_surf(struct fd_batch *batch, const uint32_t bases[], - 
struct pipe_surface **psurf, uint32_t bufs, uint32_t bin_w) + struct pipe_surface **psurf, uint32_t bufs, uint32_t bin_w) { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_surface *zsbufs[2]; - - assert(bufs > 0); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(bufs - 1)); - - emit_mrt(ring, bufs, psurf, bases, bin_w, false); - - if (psurf[0] && (psurf[0]->format == PIPE_FORMAT_Z32_FLOAT || - psurf[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { - /* Depth is stored as unorm in gmem, so we have to write it in using a - * special blit shader which writes depth. - */ - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, (A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z | - A3XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE | - A3XX_RB_DEPTH_CONTROL_Z_ENABLE | - A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE | - A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS))); - - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); - OUT_RING(ring, A3XX_RB_DEPTH_INFO_DEPTH_BASE(bases[0]) | - A3XX_RB_DEPTH_INFO_DEPTH_FORMAT(DEPTHX_32)); - OUT_RING(ring, A3XX_RB_DEPTH_PITCH(4 * batch->gmem_state->bin_w)); - - if (psurf[0]->format == PIPE_FORMAT_Z32_FLOAT) { - OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(0), 1); - OUT_RING(ring, 0); - } else { - /* The gmem_restore_tex logic will put the first buffer's stencil - * as color. Supply it with the proper information to make that - * happen. - */ - zsbufs[0] = zsbufs[1] = psurf[0]; - psurf = zsbufs; - bufs = 2; - } - } else { - OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1); - OUT_RING(ring, A3XX_SP_FS_OUTPUT_REG_MRT(bufs - 1)); - } - - fd3_emit_gmem_restore_tex(ring, psurf, bufs); - - fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 2, 0, INDEX_SIZE_IGN, 0, 0, NULL); + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_surface *zsbufs[2]; + + assert(bufs > 0); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(bufs - 1)); + + emit_mrt(ring, bufs, psurf, bases, bin_w, false); + + if (psurf[0] && (psurf[0]->format == PIPE_FORMAT_Z32_FLOAT || + psurf[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { + /* Depth is stored as unorm in gmem, so we have to write it in using a + * special blit shader which writes depth. + */ + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, (A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z | + A3XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE | + A3XX_RB_DEPTH_CONTROL_Z_ENABLE | + A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE | + A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS))); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); + OUT_RING(ring, A3XX_RB_DEPTH_INFO_DEPTH_BASE(bases[0]) | + A3XX_RB_DEPTH_INFO_DEPTH_FORMAT(DEPTHX_32)); + OUT_RING(ring, A3XX_RB_DEPTH_PITCH(4 * batch->gmem_state->bin_w)); + + if (psurf[0]->format == PIPE_FORMAT_Z32_FLOAT) { + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(0), 1); + OUT_RING(ring, 0); + } else { + /* The gmem_restore_tex logic will put the first buffer's stencil + * as color. Supply it with the proper information to make that + * happen. 
+ */ + zsbufs[0] = zsbufs[1] = psurf[0]; + psurf = zsbufs; + bufs = 2; + } + } else { + OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1); + OUT_RING(ring, A3XX_SP_FS_OUTPUT_REG_MRT(bufs - 1)); + } + + fd3_emit_gmem_restore_tex(ring, psurf, bufs); + + fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 0, INDEX_SIZE_IGN, 0, 0, NULL); } static void -fd3_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd3_emit_tile_mem2gmem(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd3_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->blit_vbuf_state, - .sprite_coord_enable = 1, - }; - /* NOTE: They all use the same VP, this is for vtx bufs. */ - fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); - - float x0, y0, x1, y1; - unsigned bin_w = tile->bin_w; - unsigned bin_h = tile->bin_h; - unsigned i; - - /* write texture coordinates to vertexbuf: */ - x0 = ((float)tile->xoff) / ((float)pfb->width); - x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); - y0 = ((float)tile->yoff) / ((float)pfb->height); - y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); - - OUT_PKT3(ring, CP_MEM_WRITE, 5); - OUT_RELOC(ring, fd_resource(ctx->blit_texcoord_vbuf)->bo, 0, 0, 0); - OUT_RING(ring, fui(x0)); - OUT_RING(ring, fui(y0)); - OUT_RING(ring, fui(x1)); - OUT_RING(ring, fui(y1)); - - fd3_emit_cache_flush(batch, ring); - - for (i = 0; i < 4; i++) { - OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) | - A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | - A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); - - OUT_PKT0(ring, REG_A3XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, A3XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | - A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(FACTOR_ZERO) | - A3XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(FACTOR_ONE) | - A3XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | - A3XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO)); - } - - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_ALWAYS) | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w)); - - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_LESS)); - - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); - OUT_RING(ring, 0); - OUT_RING(ring, 0); - - OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER); /* GRAS_CL_CLIP_CNTL */ - - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET((float)bin_w/2.0 - 0.5)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE((float)bin_w/2.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET((float)bin_h/2.0 - 0.5)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(-(float)bin_h/2.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); - OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | - A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); - OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(bin_w - 1) | - A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(bin_h - 1)); - - 
OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | - A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(bin_w - 1) | - A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(bin_h - 1)); - - OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); - OUT_RING(ring, 0x2 | - A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_ALWAYS) | - A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_ALWAYS) | - A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | - A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); - - OUT_PKT0(ring, REG_A3XX_RB_STENCIL_INFO, 2); - OUT_RING(ring, 0); /* RB_STENCIL_INFO */ - OUT_RING(ring, 0); /* RB_STENCIL_PITCH */ - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); - - OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(2) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | - A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); - - OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); - OUT_RING(ring, 0); /* VFD_INDEX_MIN */ - OUT_RING(ring, 2); /* VFD_INDEX_MAX */ - OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ - OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - - fd3_emit_vertex_bufs(ring, &emit); - - /* for gmem pitch/base calculations, we need to use the non- - * truncated tile sizes: - */ - bin_w = gmem->bin_w; - bin_h = gmem->bin_h; - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { - fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[pfb->nr_cbufs - 1]); - fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); - emit_mem2gmem_surf(batch, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w); - } - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - if (pfb->zsbuf->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && - pfb->zsbuf->format != PIPE_FORMAT_Z32_FLOAT) { - /* Non-float can use a regular color write. It's split over 8-bit - * components, so half precision is always sufficient. 
- */ - fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); - } else { - /* Float depth needs special blit shader that writes depth */ - if (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) - fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_z); - else - fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_zs); - } - fd3_program_emit(ring, &emit, 1, &pfb->zsbuf); - emit_mem2gmem_surf(batch, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); - } - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd3_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->blit_vbuf_state, + .sprite_coord_enable = 1, + }; + /* NOTE: They all use the same VP, this is for vtx bufs. */ + fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); + + float x0, y0, x1, y1; + unsigned bin_w = tile->bin_w; + unsigned bin_h = tile->bin_h; + unsigned i; + + /* write texture coordinates to vertexbuf: */ + x0 = ((float)tile->xoff) / ((float)pfb->width); + x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); + y0 = ((float)tile->yoff) / ((float)pfb->height); + y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); + + OUT_PKT3(ring, CP_MEM_WRITE, 5); + OUT_RELOC(ring, fd_resource(ctx->blit_texcoord_vbuf)->bo, 0, 0, 0); + OUT_RING(ring, fui(x0)); + OUT_RING(ring, fui(y0)); + OUT_RING(ring, fui(x1)); + OUT_RING(ring, fui(y1)); + + fd3_emit_cache_flush(batch, ring); + + for (i = 0; i < 4; i++) { + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) | + A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | + A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); + + OUT_PKT0(ring, REG_A3XX_RB_MRT_BLEND_CONTROL(i), 1); + OUT_RING( + ring, + A3XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) | + A3XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | + A3XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(FACTOR_ZERO) | + A3XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(FACTOR_ONE) | + A3XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | + A3XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO)); + } + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_ALWAYS) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w)); + + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A3XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_LESS)); + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + + OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, + A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER); /* GRAS_CL_CLIP_CNTL */ + + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_GRAS_CL_VPORT_XOFFSET, 6); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XOFFSET((float)bin_w / 2.0 - 0.5)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_XSCALE((float)bin_w / 2.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YOFFSET((float)bin_h / 2.0 - 0.5)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_YSCALE(-(float)bin_h / 2.0)); + OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZOFFSET(0.0)); + 
OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(1.0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(bin_w - 1) | + A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(bin_h - 1)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(bin_w - 1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(bin_h - 1)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, 0x2 | A3XX_RB_STENCIL_CONTROL_FUNC(FUNC_ALWAYS) | + A3XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_ALWAYS) | + A3XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_INFO, 2); + OUT_RING(ring, 0); /* RB_STENCIL_INFO */ + OUT_RING(ring, 0); /* RB_STENCIL_PITCH */ + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + OUT_PKT0(ring, REG_A3XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, + A3XX_PC_PRIM_VTX_CNTL_STRIDE_IN_VPC(2) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES) | + A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, REG_A3XX_VFD_INDEX_MIN, 4); + OUT_RING(ring, 0); /* VFD_INDEX_MIN */ + OUT_RING(ring, 2); /* VFD_INDEX_MAX */ + OUT_RING(ring, 0); /* VFD_INSTANCEID_OFFSET */ + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + + fd3_emit_vertex_bufs(ring, &emit); + + /* for gmem pitch/base calculations, we need to use the non- + * truncated tile sizes: + */ + bin_w = gmem->bin_w; + bin_h = gmem->bin_h; + + if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { + fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[pfb->nr_cbufs - 1]); + fd3_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); + emit_mem2gmem_surf(batch, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, + bin_w); + } + + if (fd_gmem_needs_restore(batch, tile, + FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + if (pfb->zsbuf->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && + pfb->zsbuf->format != PIPE_FORMAT_Z32_FLOAT) { + /* Non-float can use a regular color write. It's split over 8-bit + * components, so half precision is always sufficient. 
+ */ + fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); + } else { + /* Float depth needs special blit shader that writes depth */ + if (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) + fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_z); + else + fd3_gmem_emit_set_prog(ctx, &emit, &ctx->blit_zs); + } + fd3_program_emit(ring, &emit, 1, &pfb->zsbuf); + emit_mem2gmem_surf(batch, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); + } + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); } static void patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode) { - unsigned i; - for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); - *patch->cs = patch->val | DRAW(0, 0, 0, vismode, 0); - } - util_dynarray_clear(&batch->draw_patches); + unsigned i; + for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val | DRAW(0, 0, 0, vismode, 0); + } + util_dynarray_clear(&batch->draw_patches); } static void patch_rbrc(struct fd_batch *batch, uint32_t val) { - unsigned i; - for (i = 0; i < fd_patch_num_elements(&batch->rbrc_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->rbrc_patches, i); - *patch->cs = patch->val | val; - } - util_dynarray_clear(&batch->rbrc_patches); + unsigned i; + for (i = 0; i < fd_patch_num_elements(&batch->rbrc_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->rbrc_patches, i); + *patch->cs = patch->val | val; + } + util_dynarray_clear(&batch->rbrc_patches); } /* for rendering directly to system memory: */ static void -fd3_emit_sysmem_prep(struct fd_batch *batch) - assert_dt +fd3_emit_sysmem_prep(struct fd_batch *batch) assert_dt { - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring = batch->gmem; - uint32_t i, pitch = 0; - - for (i = 0; i < pfb->nr_cbufs; i++) { - struct pipe_surface *psurf = pfb->cbufs[i]; - if (!psurf) - continue; - struct fd_resource *rsc = fd_resource(psurf->texture); - pitch = fd_resource_pitch(rsc, psurf->u.tex.level) / rsc->layout.cpp; - } - - fd3_emit_restore(batch, ring); - - OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); - OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | - A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true); - - /* setup scissor/offset for current tile: */ - OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | - A3XX_RB_WINDOW_OFFSET_Y(0)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | - A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | - A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_GMEM_BYPASS | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); 
- - patch_draws(batch, IGNORE_VISIBILITY); - patch_rbrc(batch, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring = batch->gmem; + uint32_t i, pitch = 0; + + for (i = 0; i < pfb->nr_cbufs; i++) { + struct pipe_surface *psurf = pfb->cbufs[i]; + if (!psurf) + continue; + struct fd_resource *rsc = fd_resource(psurf->texture); + pitch = fd_resource_pitch(rsc, psurf->u.tex.level) / rsc->layout.cpp; + } + + fd3_emit_restore(batch, ring); + + OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(0) | A3XX_RB_WINDOW_OFFSET_Y(0)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_GMEM_BYPASS | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); + + patch_draws(batch, IGNORE_VISIBILITY); + patch_rbrc(batch, A3XX_RB_RENDER_CONTROL_BIN_WIDTH(pitch)); } static void -update_vsc_pipe(struct fd_batch *batch) - assert_dt +update_vsc_pipe(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd3_context *fd3_ctx = fd3_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; - int i; - - OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ - - for (i = 0; i < 8; i++) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - - if (!ctx->vsc_pipe_bo[i]) { - ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, 0x40000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); - } - - OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3); - OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) | - A3XX_VSC_PIPE_CONFIG_Y(pipe->y) | - A3XX_VSC_PIPE_CONFIG_W(pipe->w) | - A3XX_VSC_PIPE_CONFIG_H(pipe->h)); - OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0, 0); /* VSC_PIPE[i].DATA_ADDRESS */ - OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - 32); /* VSC_PIPE[i].DATA_LENGTH */ - } + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_ringbuffer *ring = batch->gmem; + int i; + + OUT_PKT0(ring, REG_A3XX_VSC_SIZE_ADDRESS, 1); + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + + for (i = 0; i < 8; i++) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + + if (!ctx->vsc_pipe_bo[i]) { + ctx->vsc_pipe_bo[i] = fd_bo_new( + ctx->dev, 0x40000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); + } + + OUT_PKT0(ring, REG_A3XX_VSC_PIPE(i), 3); + OUT_RING(ring, A3XX_VSC_PIPE_CONFIG_X(pipe->x) | + A3XX_VSC_PIPE_CONFIG_Y(pipe->y) | + A3XX_VSC_PIPE_CONFIG_W(pipe->w) | + A3XX_VSC_PIPE_CONFIG_H(pipe->h)); + OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0, + 0); /* VSC_PIPE[i].DATA_ADDRESS */ + OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - + 32); /* VSC_PIPE[i].DATA_LENGTH */ + } } 
static void -emit_binning_pass(struct fd_batch *batch) - assert_dt +emit_binning_pass(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring = batch->gmem; - int i; - - uint32_t x1 = gmem->minx; - uint32_t y1 = gmem->miny; - uint32_t x2 = gmem->minx + gmem->width - 1; - uint32_t y2 = gmem->miny + gmem->height - 1; - - if (ctx->screen->gpu_id == 320) { - emit_binning_workaround(batch); - fd_wfi(batch, ring); - OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); - OUT_RING(ring, 0x00007fff); - } - - OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); - OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); - OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | - A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - - OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w)); - - /* setup scissor/offset for whole screen: */ - OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(x1) | - A3XX_RB_WINDOW_OFFSET_Y(y1)); - - OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); - OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | - A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | - A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(0)); - - for (i = 0; i < 4; i++) { - OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | - A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | - A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0)); - } - - OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); - OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(1) | - A3XX_PC_VSTREAM_CONTROL_N(0)); - - /* emit IB to binning drawcmds: */ - fd3_emit_ib(ring, batch->binning); - fd_reset_wfi(batch); - - fd_wfi(batch, ring); - - /* and then put stuff back the way it was: */ - - OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); - OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE | - A3XX_SP_SP_CTRL_REG_CONSTMODE(1) | - A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | - A3XX_SP_SP_CTRL_REG_L0MODE(0)); - - OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(pfb->nr_cbufs - 1)); - OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | - A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | - 
A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w)); - - fd_event_write(batch, ring, CACHE_FLUSH); - fd_wfi(batch, ring); - - if (ctx->screen->gpu_id == 320) { - /* dummy-draw workaround: */ - OUT_PKT3(ring, CP_DRAW_INDX, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, IGNORE_VISIBILITY, 0)); - OUT_RING(ring, 0); /* NumIndices */ - fd_reset_wfi(batch); - } - - OUT_PKT3(ring, CP_NOP, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - fd_wfi(batch, ring); - - if (ctx->screen->gpu_id == 320) { - emit_binning_workaround(batch); - } + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring = batch->gmem; + int i; + + uint32_t x1 = gmem->minx; + uint32_t y1 = gmem->miny; + uint32_t x2 = gmem->minx + gmem->width - 1; + uint32_t y2 = gmem->miny + gmem->height - 1; + + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(batch); + fd_wfi(batch, ring); + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00007fff); + } + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, A3XX_VSC_BIN_CONTROL_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + OUT_PKT0(ring, REG_A3XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w)); + + /* setup scissor/offset for whole screen: */ + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(x1) | A3XX_RB_WINDOW_OFFSET_Y(y1)); + + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, A3XX_RB_LRZ_VSC_CONTROL_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(0)); + + for (i = 0; i < 4; i++) { + OUT_PKT0(ring, REG_A3XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A3XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | + A3XX_RB_MRT_CONTROL_DITHER_MODE(DITHER_DISABLE) | + A3XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0)); + } + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, + A3XX_PC_VSTREAM_CONTROL_SIZE(1) | A3XX_PC_VSTREAM_CONTROL_N(0)); + + /* emit IB to binning drawcmds: */ + fd3_emit_ib(ring, batch->binning); + fd_reset_wfi(batch); + + fd_wfi(batch, ring); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A3XX_VSC_BIN_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, A3XX_SP_SP_CTRL_REG_RESOLVE | + A3XX_SP_SP_CTRL_REG_CONSTMODE(1) | + A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | + A3XX_SP_SP_CTRL_REG_L0MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_LRZ_VSC_CONTROL, 1); + OUT_RING(ring, 0x00000000); + + 
OUT_PKT0(ring, REG_A3XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A3XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A3XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 2); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(pfb->nr_cbufs - 1)); + OUT_RING(ring, A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(FUNC_NEVER) | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w)); + + fd_event_write(batch, ring, CACHE_FLUSH); + fd_wfi(batch, ring); + + if (ctx->screen->gpu_id == 320) { + /* dummy-draw workaround: */ + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, INDEX_SIZE_IGN, + IGNORE_VISIBILITY, 0)); + OUT_RING(ring, 0); /* NumIndices */ + fd_reset_wfi(batch); + } + + OUT_PKT3(ring, CP_NOP, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + fd_wfi(batch, ring); + + if (ctx->screen->gpu_id == 320) { + emit_binning_workaround(batch); + } } /* before first tile */ static void -fd3_emit_tile_init(struct fd_batch *batch) - assert_dt +fd3_emit_tile_init(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - uint32_t rb_render_control; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + uint32_t rb_render_control; - fd3_emit_restore(batch, ring); + fd3_emit_restore(batch, ring); - /* note: use gmem->bin_w/h, the bin_w/h parameters may be truncated - * at the right and bottom edge tiles - */ - OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); - OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | - A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + /* note: use gmem->bin_w/h, the bin_w/h parameters may be truncated + * at the right and bottom edge tiles + */ + OUT_PKT0(ring, REG_A3XX_VSC_BIN_SIZE, 1); + OUT_RING(ring, A3XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A3XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); - update_vsc_pipe(batch); + update_vsc_pipe(batch); - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); - OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | - A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A3XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A3XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A3XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - if (use_hw_binning(batch)) { - /* emit hw binning pass: */ - emit_binning_pass(batch); + if (use_hw_binning(batch)) { + /* emit hw binning pass: */ + emit_binning_pass(batch); - patch_draws(batch, USE_VISIBILITY); - } else { - patch_draws(batch, IGNORE_VISIBILITY); - } + patch_draws(batch, USE_VISIBILITY); + } else { + patch_draws(batch, IGNORE_VISIBILITY); + } - rb_render_control = A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | - A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w); + rb_render_control = A3XX_RB_RENDER_CONTROL_ENABLE_GMEM | + A3XX_RB_RENDER_CONTROL_BIN_WIDTH(gmem->bin_w); - patch_rbrc(batch, rb_render_control); + patch_rbrc(batch, rb_render_control); } /* before mem2gmem */ static void fd3_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer 
*ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | - A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); + OUT_PKT0(ring, REG_A3XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A3XX_RB_MODE_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A3XX_RB_MODE_CONTROL_MARB_CACHE_SPLIT_MODE | + A3XX_RB_MODE_CONTROL_MRT(MAX2(1, pfb->nr_cbufs) - 1)); } /* before IB to rendering cmds: */ static void -fd3_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd3_emit_tile_renderprep(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd3_context *fd3_ctx = fd3_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - uint32_t x1 = tile->xoff; - uint32_t y1 = tile->yoff; - uint32_t x2 = tile->xoff + tile->bin_w - 1; - uint32_t y2 = tile->yoff + tile->bin_h - 1; - - uint32_t reg; - - OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); - reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); - if (pfb->zsbuf) { - reg |= A3XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); - } - OUT_RING(ring, reg); - if (pfb->zsbuf) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - OUT_RING(ring, A3XX_RB_DEPTH_PITCH(gmem->bin_w << - fdl_cpp_shift(&rsc->layout))); - if (rsc->stencil) { - OUT_PKT0(ring, REG_A3XX_RB_STENCIL_INFO, 2); - OUT_RING(ring, A3XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); - OUT_RING(ring, A3XX_RB_STENCIL_PITCH(gmem->bin_w << - fdl_cpp_shift(&rsc->stencil->layout))); - } - } else { - OUT_RING(ring, 0x00000000); - } - - if (use_hw_binning(batch)) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; - struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; - - assert(pipe->w && pipe->h); - - fd_event_write(batch, ring, HLSQ_FLUSH); - fd_wfi(batch, ring); - - OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); - OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | - A3XX_PC_VSTREAM_CONTROL_N(tile->n)); - - - OUT_PKT3(ring, CP_SET_BIN_DATA, 2); - OUT_RELOC(ring, pipe_bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ - (tile->p * 4), 0, 0); - } else { - OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); - OUT_RING(ring, 0x00000000); - } - - OUT_PKT3(ring, CP_SET_BIN, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); - OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); - - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, true); - - /* setup scissor/offset for current tile: */ - OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(tile->xoff) | - A3XX_RB_WINDOW_OFFSET_Y(tile->yoff)); - - OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | - A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | - A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + struct fd_context *ctx = batch->ctx; + struct fd3_context *fd3_ctx = fd3_context(ctx); + struct fd_ringbuffer *ring = 
batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + uint32_t x1 = tile->xoff; + uint32_t y1 = tile->yoff; + uint32_t x2 = tile->xoff + tile->bin_w - 1; + uint32_t y2 = tile->yoff + tile->bin_h - 1; + + uint32_t reg; + + OUT_PKT0(ring, REG_A3XX_RB_DEPTH_INFO, 2); + reg = A3XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]); + if (pfb->zsbuf) { + reg |= A3XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd_pipe2depth(pfb->zsbuf->format)); + } + OUT_RING(ring, reg); + if (pfb->zsbuf) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + OUT_RING(ring, + A3XX_RB_DEPTH_PITCH(gmem->bin_w << fdl_cpp_shift(&rsc->layout))); + if (rsc->stencil) { + OUT_PKT0(ring, REG_A3XX_RB_STENCIL_INFO, 2); + OUT_RING(ring, A3XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); + OUT_RING(ring, A3XX_RB_STENCIL_PITCH(gmem->bin_w << fdl_cpp_shift( + &rsc->stencil->layout))); + } + } else { + OUT_RING(ring, 0x00000000); + } + + if (use_hw_binning(batch)) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; + + assert(pipe->w && pipe->h); + + fd_event_write(batch, ring, HLSQ_FLUSH); + fd_wfi(batch, ring); + + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A3XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A3XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOC(ring, pipe_bo, 0, 0, + 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- + VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + + OUT_PKT3(ring, CP_SET_BIN, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); + OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, + true); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A3XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A3XX_RB_WINDOW_OFFSET_X(tile->xoff) | + A3XX_RB_WINDOW_OFFSET_Y(tile->yoff)); + + OUT_PKT0(ring, REG_A3XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A3XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A3XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A3XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); } void -fd3_gmem_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd3_gmem_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - - ctx->emit_sysmem_prep = fd3_emit_sysmem_prep; - ctx->emit_tile_init = fd3_emit_tile_init; - ctx->emit_tile_prep = fd3_emit_tile_prep; - ctx->emit_tile_mem2gmem = fd3_emit_tile_mem2gmem; - ctx->emit_tile_renderprep = fd3_emit_tile_renderprep; - ctx->emit_tile_gmem2mem = fd3_emit_tile_gmem2mem; + struct fd_context *ctx = fd_context(pctx); + + ctx->emit_sysmem_prep = fd3_emit_sysmem_prep; + ctx->emit_tile_init = fd3_emit_tile_init; + ctx->emit_tile_prep = fd3_emit_tile_prep; + ctx->emit_tile_mem2gmem = fd3_emit_tile_mem2gmem; + ctx->emit_tile_renderprep = fd3_emit_tile_renderprep; + ctx->emit_tile_gmem2mem = fd3_emit_tile_gmem2mem; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index c2545ac..9d85f79 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ 
-25,465 +25,467 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/format/u_format.h" +#include "util/u_string.h" #include "freedreno_program.h" -#include "fd3_program.h" #include "fd3_emit.h" -#include "fd3_texture.h" #include "fd3_format.h" +#include "fd3_program.h" +#include "fd3_texture.h" bool fd3_needs_manual_clipping(const struct ir3_shader *shader, - const struct pipe_rasterizer_state *rast) + const struct pipe_rasterizer_state *rast) { - uint64_t outputs = ir3_shader_outputs(shader); + uint64_t outputs = ir3_shader_outputs(shader); - return (!rast->depth_clip_near || - util_bitcount(rast->clip_plane_enable) > 6 || - outputs & ((1ULL << VARYING_SLOT_CLIP_VERTEX) | - (1ULL << VARYING_SLOT_CLIP_DIST0) | - (1ULL << VARYING_SLOT_CLIP_DIST1))); + return (!rast->depth_clip_near || + util_bitcount(rast->clip_plane_enable) > 6 || + outputs & ((1ULL << VARYING_SLOT_CLIP_VERTEX) | + (1ULL << VARYING_SLOT_CLIP_DIST0) | + (1ULL << VARYING_SLOT_CLIP_DIST1))); } - static void emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { - const struct ir3_info *si = &so->info; - enum adreno_state_block sb; - enum adreno_state_src src; - uint32_t i, sz, *bin; - - if (so->type == MESA_SHADER_VERTEX) { - sb = SB_VERT_SHADER; - } else { - sb = SB_FRAG_SHADER; - } - - if (FD_DBG(DIRECT)) { - sz = si->sizedwords; - src = SS_DIRECT; - bin = fd_bo_map(so->bo); - } else { - sz = 0; - src = SS_INDIRECT; - bin = NULL; - } - - OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz); - OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) | - CP_LOAD_STATE_0_STATE_SRC(src) | - CP_LOAD_STATE_0_STATE_BLOCK(sb) | - CP_LOAD_STATE_0_NUM_UNIT(so->instrlen)); - if (bin) { - OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER)); - } else { - OUT_RELOC(ring, so->bo, 0, - CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0); - } - for (i = 0; i < sz; i++) { - OUT_RING(ring, bin[i]); - } + const struct ir3_info *si = &so->info; + enum adreno_state_block sb; + enum adreno_state_src src; + uint32_t i, sz, *bin; + + if (so->type == MESA_SHADER_VERTEX) { + sb = SB_VERT_SHADER; + } else { + sb = SB_FRAG_SHADER; + } + + if (FD_DBG(DIRECT)) { + sz = si->sizedwords; + src = SS_DIRECT; + bin = fd_bo_map(so->bo); + } else { + sz = 0; + src = SS_INDIRECT; + bin = NULL; + } + + OUT_PKT3(ring, CP_LOAD_STATE, 2 + sz); + OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(0) | CP_LOAD_STATE_0_STATE_SRC(src) | + CP_LOAD_STATE_0_STATE_BLOCK(sb) | + CP_LOAD_STATE_0_NUM_UNIT(so->instrlen)); + if (bin) { + OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER)); + } else { + OUT_RELOC(ring, so->bo, 0, CP_LOAD_STATE_1_STATE_TYPE(ST_SHADER), 0); + } + for (i = 0; i < sz; i++) { + OUT_RING(ring, bin[i]); + } } void -fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, - int nr, struct pipe_surface **bufs) +fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, int nr, + struct pipe_surface **bufs) { - const struct ir3_shader_variant *vp, *fp; - const struct ir3_info *vsi, *fsi; - enum a3xx_instrbuffermode fpbuffer, vpbuffer; - uint32_t fpbuffersz, vpbuffersz, fsoff; - uint32_t pos_regid, posz_regid, psize_regid; - uint32_t ij_regid[4], face_regid, coord_regid, zwcoord_regid; - uint32_t color_regid[4] = {0}; - int constmode; - int i, j; - - debug_assert(nr <= ARRAY_SIZE(color_regid)); - - vp = fd3_emit_get_vp(emit); 
- fp = fd3_emit_get_fp(emit); - - vsi = &vp->info; - fsi = &fp->info; - - fpbuffer = BUFFER; - vpbuffer = BUFFER; - fpbuffersz = fp->instrlen; - vpbuffersz = vp->instrlen; - - /* - * Decide whether to use BUFFER or CACHE mode for VS and FS. It - * appears like 256 is the hard limit, but when the combined size - * exceeds 128 then blob will try to keep FS in BUFFER mode and - * switch to CACHE for VS until VS is too large. The blob seems - * to switch FS out of BUFFER mode at slightly under 128. But - * a bit fuzzy on the decision tree, so use slightly conservative - * limits. - * - * TODO check if these thresholds for BUFFER vs CACHE mode are the - * same for all a3xx or whether we need to consider the gpuid - */ - - if ((fpbuffersz + vpbuffersz) > 128) { - if (fpbuffersz < 112) { - /* FP:BUFFER VP:CACHE */ - vpbuffer = CACHE; - vpbuffersz = 256 - fpbuffersz; - } else if (vpbuffersz < 112) { - /* FP:CACHE VP:BUFFER */ - fpbuffer = CACHE; - fpbuffersz = 256 - vpbuffersz; - } else { - /* FP:CACHE VP:CACHE */ - vpbuffer = fpbuffer = CACHE; - vpbuffersz = fpbuffersz = 192; - } - } - - if (fpbuffer == BUFFER) { - fsoff = 128 - fpbuffersz; - } else { - fsoff = 256 - fpbuffersz; - } - - /* seems like vs->constlen + fs->constlen > 256, then CONSTMODE=1 */ - constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0; - - pos_regid = ir3_find_output_regid(vp, VARYING_SLOT_POS); - posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH); - psize_regid = ir3_find_output_regid(vp, VARYING_SLOT_PSIZ); - if (fp->color0_mrt) { - color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = - ir3_find_output_regid(fp, FRAG_RESULT_COLOR); - } else { - color_regid[0] = ir3_find_output_regid(fp, FRAG_RESULT_DATA0); - color_regid[1] = ir3_find_output_regid(fp, FRAG_RESULT_DATA1); - color_regid[2] = ir3_find_output_regid(fp, FRAG_RESULT_DATA2); - color_regid[3] = ir3_find_output_regid(fp, FRAG_RESULT_DATA3); - } - - face_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRONT_FACE); - coord_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRAG_COORD); - zwcoord_regid = (coord_regid == regid(63,0)) ? regid(63,0) : (coord_regid + 2); - ij_regid[0] = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); - ij_regid[1] = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL); - ij_regid[2] = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID); - ij_regid[3] = ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID); - - /* adjust regids for alpha output formats. there is no alpha render - * format, so it's just treated like red - */ - for (i = 0; i < nr; i++) - if (util_format_is_alpha(pipe_surface_format(bufs[i]))) - color_regid[i] += 3; - - /* we could probably divide this up into things that need to be - * emitted if frag-prog is dirty vs if vert-prog is dirty.. - */ - - OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6); - OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | - A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | - A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | - /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe - * flush some caches? I think we only need to set those - * bits if we have updated const or shader.. 
- */ - A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART | - A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); - OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | - A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | - A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(coord_regid) | - A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(zwcoord_regid)); - OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31) | - A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID(face_regid)); - OUT_RING(ring, - A3XX_HLSQ_CONTROL_3_REG_IJPERSPCENTERREGID(ij_regid[0]) | - A3XX_HLSQ_CONTROL_3_REG_IJNONPERSPCENTERREGID(ij_regid[1]) | - A3XX_HLSQ_CONTROL_3_REG_IJPERSPCENTROIDREGID(ij_regid[2]) | - A3XX_HLSQ_CONTROL_3_REG_IJNONPERSPCENTROIDREGID(ij_regid[3])); - OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) | - A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) | - A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz)); - OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) | - A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) | - A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz)); - - OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); - OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) | - COND(emit->binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) | - A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | - A3XX_SP_SP_CTRL_REG_L0MODE(0)); - - OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1); - OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen)); - - OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); - OUT_RING(ring, A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) | - COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) | - A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | - A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | - A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | - A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | - A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz)); - OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) | - A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) | - A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen - 1, 0))); - OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | - A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | - A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in)); - - struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, vp, fp, false); - - for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { - uint32_t reg = 0; - - OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1); - - reg |= A3XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); - reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); - j++; - - reg |= A3XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); - reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); - j++; - - OUT_RING(ring, reg); - } - - for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { - uint32_t reg = 0; - - OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1); - - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8); - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8); - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8); - reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8); - - OUT_RING(ring, reg); - } - - OUT_PKT0(ring, REG_A3XX_SP_VS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(0) | - A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); - OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - - if (emit->binning_pass) { - OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, 
A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1); - OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); - } else { - OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) | - COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) | - A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | - A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | - A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP | - A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | - A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(fp->need_pixlod, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | - A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz)); - OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | - A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->sysval_in) | - A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT(MAX2(fp->constlen - 1, 0)) | - A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET( - MAX2(128, vp->constlen)) | - A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff)); - OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ - } - - OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1); - OUT_RING(ring, - COND(fp->writes_pos, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | - A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid) | - A3XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr) - 1)); - - OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4); - for (i = 0; i < 4; i++) { - uint32_t mrt_reg = A3XX_SP_FS_MRT_REG_REGID(color_regid[i]) | - COND(color_regid[i] & HALF_REG_ID, A3XX_SP_FS_MRT_REG_HALF_PRECISION); - - if (i < nr) { - enum pipe_format fmt = pipe_surface_format(bufs[i]); - mrt_reg |= COND(util_format_is_pure_uint(fmt), A3XX_SP_FS_MRT_REG_UINT) | - COND(util_format_is_pure_sint(fmt), A3XX_SP_FS_MRT_REG_SINT); - } - OUT_RING(ring, mrt_reg); - } - - if (emit->binning_pass) { - OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); - OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) | - A3XX_VPC_ATTR_LMSIZE(1) | - COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE)); - OUT_RING(ring, 0x00000000); - } else { - uint32_t vinterp[4], flatshade[2], vpsrepl[4]; - - memset(vinterp, 0, sizeof(vinterp)); - memset(flatshade, 0, sizeof(flatshade)); - memset(vpsrepl, 0, sizeof(vpsrepl)); - - /* figure out VARYING_INTERP / FLAT_SHAD register values: */ - for (j = -1; (j = ir3_next_varying(fp, j)) < (int)fp->inputs_count; ) { - /* NOTE: varyings are packed, so if compmask is 0xb - * then first, third, and fourth component occupy - * three consecutive varying slots: - */ - unsigned compmask = fp->inputs[j].compmask; - - uint32_t inloc = fp->inputs[j].inloc; - - if (fp->inputs[j].flat || - (fp->inputs[j].rasterflat && emit->rasterflat)) { - uint32_t loc = inloc; - - for (i = 0; i < 4; i++) { - if (compmask & (1 << i)) { - vinterp[loc / 16] |= FLAT << ((loc % 16) * 2); - flatshade[loc / 32] |= 1 << (loc % 32); - loc++; - } - } - } - - bool coord_mode = emit->sprite_coord_mode; - if (ir3_point_sprite(fp, j, emit->sprite_coord_enable, &coord_mode)) { - /* mask is two 2-bit fields, where: - * '01' -> S - * '10' -> T - * '11' -> 1 - T (flip mode) - */ - unsigned mask = coord_mode ? 
0b1101 : 0b1001; - uint32_t loc = inloc; - if (compmask & 0x1) { - vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x2) { - vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x4) { - /* .z <- 0.0f */ - vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x8) { - /* .w <- 1.0f */ - vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); - loc++; - } - } - } - - OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); - OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | - A3XX_VPC_ATTR_THRDASSIGN(1) | - A3XX_VPC_ATTR_LMSIZE(1) | - COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE)); - OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | - A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); - OUT_RING(ring, vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ - OUT_RING(ring, vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ - OUT_RING(ring, vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ - OUT_RING(ring, vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ - - OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); - OUT_RING(ring, vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ - OUT_RING(ring, vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ - OUT_RING(ring, vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ - OUT_RING(ring, vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ - - OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); - OUT_RING(ring, flatshade[0]); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ - OUT_RING(ring, flatshade[1]); /* SP_FS_FLAT_SHAD_MODE_REG_1 */ - } - - if (vpbuffer == BUFFER) - emit_shader(ring, vp); - - OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ - - if (!emit->binning_pass) { - if (fpbuffer == BUFFER) - emit_shader(ring, fp); - - OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); - OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ - } + const struct ir3_shader_variant *vp, *fp; + const struct ir3_info *vsi, *fsi; + enum a3xx_instrbuffermode fpbuffer, vpbuffer; + uint32_t fpbuffersz, vpbuffersz, fsoff; + uint32_t pos_regid, posz_regid, psize_regid; + uint32_t ij_regid[4], face_regid, coord_regid, zwcoord_regid; + uint32_t color_regid[4] = {0}; + int constmode; + int i, j; + + debug_assert(nr <= ARRAY_SIZE(color_regid)); + + vp = fd3_emit_get_vp(emit); + fp = fd3_emit_get_fp(emit); + + vsi = &vp->info; + fsi = &fp->info; + + fpbuffer = BUFFER; + vpbuffer = BUFFER; + fpbuffersz = fp->instrlen; + vpbuffersz = vp->instrlen; + + /* + * Decide whether to use BUFFER or CACHE mode for VS and FS. It + * appears like 256 is the hard limit, but when the combined size + * exceeds 128 then blob will try to keep FS in BUFFER mode and + * switch to CACHE for VS until VS is too large. The blob seems + * to switch FS out of BUFFER mode at slightly under 128. But + * a bit fuzzy on the decision tree, so use slightly conservative + * limits. 
+ * + * TODO check if these thresholds for BUFFER vs CACHE mode are the + * same for all a3xx or whether we need to consider the gpuid + */ + + if ((fpbuffersz + vpbuffersz) > 128) { + if (fpbuffersz < 112) { + /* FP:BUFFER VP:CACHE */ + vpbuffer = CACHE; + vpbuffersz = 256 - fpbuffersz; + } else if (vpbuffersz < 112) { + /* FP:CACHE VP:BUFFER */ + fpbuffer = CACHE; + fpbuffersz = 256 - vpbuffersz; + } else { + /* FP:CACHE VP:CACHE */ + vpbuffer = fpbuffer = CACHE; + vpbuffersz = fpbuffersz = 192; + } + } + + if (fpbuffer == BUFFER) { + fsoff = 128 - fpbuffersz; + } else { + fsoff = 256 - fpbuffersz; + } + + /* seems like vs->constlen + fs->constlen > 256, then CONSTMODE=1 */ + constmode = ((vp->constlen + fp->constlen) > 256) ? 1 : 0; + + pos_regid = ir3_find_output_regid(vp, VARYING_SLOT_POS); + posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH); + psize_regid = ir3_find_output_regid(vp, VARYING_SLOT_PSIZ); + if (fp->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + ir3_find_output_regid(fp, FRAG_RESULT_COLOR); + } else { + color_regid[0] = ir3_find_output_regid(fp, FRAG_RESULT_DATA0); + color_regid[1] = ir3_find_output_regid(fp, FRAG_RESULT_DATA1); + color_regid[2] = ir3_find_output_regid(fp, FRAG_RESULT_DATA2); + color_regid[3] = ir3_find_output_regid(fp, FRAG_RESULT_DATA3); + } + + face_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRONT_FACE); + coord_regid = ir3_find_sysval_regid(fp, SYSTEM_VALUE_FRAG_COORD); + zwcoord_regid = + (coord_regid == regid(63, 0)) ? regid(63, 0) : (coord_regid + 2); + ij_regid[0] = + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL); + ij_regid[1] = + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_PIXEL); + ij_regid[2] = + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_PERSP_CENTROID); + ij_regid[3] = + ir3_find_sysval_regid(fp, SYSTEM_VALUE_BARYCENTRIC_LINEAR_CENTROID); + + /* adjust regids for alpha output formats. there is no alpha render + * format, so it's just treated like red + */ + for (i = 0; i < nr; i++) + if (util_format_is_alpha(pipe_surface_format(bufs[i]))) + color_regid[i] += 3; + + /* we could probably divide this up into things that need to be + * emitted if frag-prog is dirty vs if vert-prog is dirty.. + */ + + OUT_PKT0(ring, REG_A3XX_HLSQ_CONTROL_0_REG, 6); + OUT_RING(ring, A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(FOUR_QUADS) | + A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | + A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | + /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe + * flush some caches? I think we only need to set those + * bits if we have updated const or shader.. 
+ */ + A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART | + A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); + OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | + A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | + A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(coord_regid) | + A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(zwcoord_regid)); + OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31) | + A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID(face_regid)); + OUT_RING(ring, + A3XX_HLSQ_CONTROL_3_REG_IJPERSPCENTERREGID(ij_regid[0]) | + A3XX_HLSQ_CONTROL_3_REG_IJNONPERSPCENTERREGID(ij_regid[1]) | + A3XX_HLSQ_CONTROL_3_REG_IJPERSPCENTROIDREGID(ij_regid[2]) | + A3XX_HLSQ_CONTROL_3_REG_IJNONPERSPCENTROIDREGID(ij_regid[3])); + OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) | + A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(0) | + A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(vpbuffersz)); + OUT_RING(ring, A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(fp->constlen) | + A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(128) | + A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(fpbuffersz)); + + OUT_PKT0(ring, REG_A3XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, A3XX_SP_SP_CTRL_REG_CONSTMODE(constmode) | + COND(emit->binning_pass, A3XX_SP_SP_CTRL_REG_BINNING) | + A3XX_SP_SP_CTRL_REG_SLEEPMODE(1) | + A3XX_SP_SP_CTRL_REG_L0MODE(0)); + + OUT_PKT0(ring, REG_A3XX_SP_VS_LENGTH_REG, 1); + OUT_RING(ring, A3XX_SP_VS_LENGTH_REG_SHADERLENGTH(vp->instrlen)); + + OUT_PKT0(ring, REG_A3XX_SP_VS_CTRL_REG0, 3); + OUT_RING(ring, + A3XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(vpbuffer) | + COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) | + A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | + A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | + A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | + A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | + A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz)); + OUT_RING(ring, + A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) | + A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) | + A3XX_SP_VS_CTRL_REG1_CONSTFOOTPRINT(MAX2(vp->constlen - 1, 0))); + OUT_RING(ring, A3XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | + A3XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | + A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(fp->varying_in)); + + struct ir3_shader_linkage l = {0}; + ir3_link_shaders(&l, vp, fp, false); + + for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { + uint32_t reg = 0; + + OUT_PKT0(ring, REG_A3XX_SP_VS_OUT_REG(i), 1); + + reg |= A3XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); + reg |= A3XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); + j++; + + reg |= A3XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); + reg |= A3XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); + j++; + + OUT_RING(ring, reg); + } + + for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { + uint32_t reg = 0; + + OUT_PKT0(ring, REG_A3XX_SP_VS_VPC_DST_REG(i), 1); + + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8); + reg |= A3XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8); + + OUT_RING(ring, reg); + } + + OUT_PKT0(ring, REG_A3XX_SP_VS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(0) | + A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + OUT_RELOC(ring, vp->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ + + if (emit->binning_pass) { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, 
A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(BUFFER)); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 1); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(128) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(0)); + } else { + OUT_PKT0(ring, REG_A3XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, A3XX_SP_FS_LENGTH_REG_SHADERLENGTH(fp->instrlen)); + + OUT_PKT0(ring, REG_A3XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, + A3XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(fpbuffer) | + COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) | + A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | + A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | + A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP | + A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | + A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(fp->need_pixlod, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | + A3XX_SP_FS_CTRL_REG0_LENGTH(fpbuffersz)); + OUT_RING(ring, A3XX_SP_FS_CTRL_REG1_CONSTLENGTH(fp->constlen) | + A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(fp->sysval_in) | + A3XX_SP_FS_CTRL_REG1_CONSTFOOTPRINT( + MAX2(fp->constlen - 1, 0)) | + A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(63)); + + OUT_PKT0(ring, REG_A3XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET( + MAX2(128, vp->constlen)) | + A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(fsoff)); + OUT_RELOC(ring, fp->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } + + OUT_PKT0(ring, REG_A3XX_SP_FS_OUTPUT_REG, 1); + OUT_RING(ring, COND(fp->writes_pos, A3XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | + A3XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid) | + A3XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr) - 1)); + + OUT_PKT0(ring, REG_A3XX_SP_FS_MRT_REG(0), 4); + for (i = 0; i < 4; i++) { + uint32_t mrt_reg = + A3XX_SP_FS_MRT_REG_REGID(color_regid[i]) | + COND(color_regid[i] & HALF_REG_ID, A3XX_SP_FS_MRT_REG_HALF_PRECISION); + + if (i < nr) { + enum pipe_format fmt = pipe_surface_format(bufs[i]); + mrt_reg |= + COND(util_format_is_pure_uint(fmt), A3XX_SP_FS_MRT_REG_UINT) | + COND(util_format_is_pure_sint(fmt), A3XX_SP_FS_MRT_REG_SINT); + } + OUT_RING(ring, mrt_reg); + } + + if (emit->binning_pass) { + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_THRDASSIGN(1) | A3XX_VPC_ATTR_LMSIZE(1) | + COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE)); + OUT_RING(ring, 0x00000000); + } else { + uint32_t vinterp[4], flatshade[2], vpsrepl[4]; + + memset(vinterp, 0, sizeof(vinterp)); + memset(flatshade, 0, sizeof(flatshade)); + memset(vpsrepl, 0, sizeof(vpsrepl)); + + /* figure out VARYING_INTERP / FLAT_SHAD register values: */ + for (j = -1; (j = ir3_next_varying(fp, j)) < (int)fp->inputs_count;) { + /* NOTE: varyings are packed, so if compmask is 0xb + * then first, third, and fourth component occupy + * three consecutive varying slots: + */ + unsigned compmask = fp->inputs[j].compmask; + + uint32_t inloc = fp->inputs[j].inloc; + + if (fp->inputs[j].flat || + (fp->inputs[j].rasterflat && emit->rasterflat)) { + uint32_t loc = inloc; + + for (i = 0; i < 4; i++) { + if (compmask & (1 << i)) { + vinterp[loc / 16] |= FLAT << ((loc % 16) * 2); + flatshade[loc / 32] |= 1 << (loc % 32); + loc++; + } + } + } + + bool coord_mode = emit->sprite_coord_mode; + if (ir3_point_sprite(fp, j, emit->sprite_coord_enable, &coord_mode)) { + /* mask is two 2-bit fields, where: + * '01' -> S + * '10' -> T + * '11' -> 1 - T (flip mode) + */ + unsigned mask = coord_mode ? 
0b1101 : 0b1001; + uint32_t loc = inloc; + if (compmask & 0x1) { + vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x2) { + vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x4) { + /* .z <- 0.0f */ + vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x8) { + /* .w <- 1.0f */ + vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); + loc++; + } + } + } + + OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2); + OUT_RING(ring, A3XX_VPC_ATTR_TOTALATTR(fp->total_in) | + A3XX_VPC_ATTR_THRDASSIGN(1) | A3XX_VPC_ATTR_LMSIZE(1) | + COND(vp->writes_psize, A3XX_VPC_ATTR_PSIZE)); + OUT_RING(ring, A3XX_VPC_PACK_NUMFPNONPOSVAR(fp->total_in) | + A3XX_VPC_PACK_NUMNONPOSVSVAR(fp->total_in)); + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_INTERP_MODE(0), 4); + OUT_RING(ring, vinterp[0]); /* VPC_VARYING_INTERP[0].MODE */ + OUT_RING(ring, vinterp[1]); /* VPC_VARYING_INTERP[1].MODE */ + OUT_RING(ring, vinterp[2]); /* VPC_VARYING_INTERP[2].MODE */ + OUT_RING(ring, vinterp[3]); /* VPC_VARYING_INTERP[3].MODE */ + + OUT_PKT0(ring, REG_A3XX_VPC_VARYING_PS_REPL_MODE(0), 4); + OUT_RING(ring, vpsrepl[0]); /* VPC_VARYING_PS_REPL[0].MODE */ + OUT_RING(ring, vpsrepl[1]); /* VPC_VARYING_PS_REPL[1].MODE */ + OUT_RING(ring, vpsrepl[2]); /* VPC_VARYING_PS_REPL[2].MODE */ + OUT_RING(ring, vpsrepl[3]); /* VPC_VARYING_PS_REPL[3].MODE */ + + OUT_PKT0(ring, REG_A3XX_SP_FS_FLAT_SHAD_MODE_REG_0, 2); + OUT_RING(ring, flatshade[0]); /* SP_FS_FLAT_SHAD_MODE_REG_0 */ + OUT_RING(ring, flatshade[1]); /* SP_FS_FLAT_SHAD_MODE_REG_1 */ + } + + if (vpbuffer == BUFFER) + emit_shader(ring, vp); + + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + + if (!emit->binning_pass) { + if (fpbuffer == BUFFER) + emit_shader(ring, fp); + + OUT_PKT0(ring, REG_A3XX_VFD_PERFCOUNTER0_SELECT, 1); + OUT_RING(ring, 0x00000000); /* VFD_PERFCOUNTER0_SELECT */ + } } static struct ir3_program_state * fd3_program_create(void *data, struct ir3_shader_variant *bs, - struct ir3_shader_variant *vs, - struct ir3_shader_variant *hs, - struct ir3_shader_variant *ds, - struct ir3_shader_variant *gs, - struct ir3_shader_variant *fs, - const struct ir3_shader_key *key) - in_dt + struct ir3_shader_variant *vs, struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, struct ir3_shader_variant *gs, + struct ir3_shader_variant *fs, + const struct ir3_shader_key *key) in_dt { - struct fd_context *ctx = fd_context(data); - struct fd3_program_state *state = CALLOC_STRUCT(fd3_program_state); + struct fd_context *ctx = fd_context(data); + struct fd3_program_state *state = CALLOC_STRUCT(fd3_program_state); - tc_assert_driver_thread(ctx->tc); + tc_assert_driver_thread(ctx->tc); - state->bs = bs; - state->vs = vs; - state->fs = fs; + state->bs = bs; + state->vs = vs; + state->fs = fs; - return &state->base; + return &state->base; } static void fd3_program_destroy(void *data, struct ir3_program_state *state) { - struct fd3_program_state *so = fd3_program_state(state); - free(so); + struct fd3_program_state *so = fd3_program_state(state); + free(so); } static const struct ir3_cache_funcs cache_funcs = { - .create_state = fd3_program_create, - .destroy_state = fd3_program_destroy, + .create_state = fd3_program_create, + .destroy_state = fd3_program_destroy, }; void fd3_prog_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->shader_cache = 
ir3_cache_create(&cache_funcs, ctx); - ir3_prog_init(pctx); - fd_prog_init(pctx); + ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); + ir3_prog_init(pctx); + fd_prog_init(pctx); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h index 3393330..cefe192 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h @@ -36,24 +36,24 @@ struct fd3_emit; struct fd3_program_state { - struct ir3_program_state base; - struct ir3_shader_variant *bs; /* VS for when emit->binning */ - struct ir3_shader_variant *vs; - struct ir3_shader_variant *fs; /* FS for when !emit->binning */ + struct ir3_program_state base; + struct ir3_shader_variant *bs; /* VS for when emit->binning */ + struct ir3_shader_variant *vs; + struct ir3_shader_variant *fs; /* FS for when !emit->binning */ }; static inline struct fd3_program_state * fd3_program_state(struct ir3_program_state *state) { - return (struct fd3_program_state *)state; + return (struct fd3_program_state *)state; } -void fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, - int nr, struct pipe_surface **bufs); +void fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, int nr, + struct pipe_surface **bufs); void fd3_prog_init(struct pipe_context *pctx); bool fd3_needs_manual_clipping(const struct ir3_shader *, - const struct pipe_rasterizer_state *); + const struct pipe_rasterizer_state *); #endif /* FD3_PROGRAM_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_query.c b/src/gallium/drivers/freedreno/a3xx/fd3_query.c index 35a99e8..0bb32bf 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_query.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_query.c @@ -24,17 +24,16 @@ * Rob Clark */ -#include "freedreno_query_hw.h" #include "freedreno_batch.h" #include "freedreno_context.h" +#include "freedreno_query_hw.h" #include "freedreno_util.h" -#include "fd3_query.h" #include "fd3_format.h" - +#include "fd3_query.h" struct fd_rb_samp_ctrs { - uint64_t ctr[16]; + uint64_t ctr[16]; }; /* @@ -47,104 +46,103 @@ struct fd_rb_samp_ctrs { static struct fd_hw_sample * occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_hw_sample *samp = - fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); - - /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of - * HW_QUERY_BASE_REG register: - */ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A3XX_RB_SAMPLE_COUNT_ADDR) | 0x80000000); - OUT_RING(ring, HW_QUERY_BASE_REG); - OUT_RING(ring, samp->offset); - - OUT_PKT0(ring, REG_A3XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A3XX_RB_SAMPLE_COUNT_CONTROL_COPY); - - OUT_PKT3(ring, CP_DRAW_INDX, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, USE_VISIBILITY, 0)); - OUT_RING(ring, 0); /* NumIndices */ - - fd_event_write(batch, ring, ZPASS_DONE); - - OUT_PKT0(ring, REG_A3XX_RBBM_PERFCTR_CTL, 1); - OUT_RING(ring, A3XX_RBBM_PERFCTR_CTL_ENABLE); - - OUT_PKT0(ring, REG_A3XX_VBIF_PERF_CNT_EN, 1); - OUT_RING(ring, A3XX_VBIF_PERF_CNT_EN_CNT0 | - A3XX_VBIF_PERF_CNT_EN_CNT1 | - A3XX_VBIF_PERF_CNT_EN_PWRCNT0 | - A3XX_VBIF_PERF_CNT_EN_PWRCNT1 | - A3XX_VBIF_PERF_CNT_EN_PWRCNT2); - - return samp; + struct fd_hw_sample *samp = + fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); + + /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of + * HW_QUERY_BASE_REG register: + */ + OUT_PKT3(ring, 
CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A3XX_RB_SAMPLE_COUNT_ADDR) | 0x80000000); + OUT_RING(ring, HW_QUERY_BASE_REG); + OUT_RING(ring, samp->offset); + + OUT_PKT0(ring, REG_A3XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A3XX_RB_SAMPLE_COUNT_CONTROL_COPY); + + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, + INDEX_SIZE_IGN, USE_VISIBILITY, 0)); + OUT_RING(ring, 0); /* NumIndices */ + + fd_event_write(batch, ring, ZPASS_DONE); + + OUT_PKT0(ring, REG_A3XX_RBBM_PERFCTR_CTL, 1); + OUT_RING(ring, A3XX_RBBM_PERFCTR_CTL_ENABLE); + + OUT_PKT0(ring, REG_A3XX_VBIF_PERF_CNT_EN, 1); + OUT_RING(ring, A3XX_VBIF_PERF_CNT_EN_CNT0 | A3XX_VBIF_PERF_CNT_EN_CNT1 | + A3XX_VBIF_PERF_CNT_EN_PWRCNT0 | + A3XX_VBIF_PERF_CNT_EN_PWRCNT1 | + A3XX_VBIF_PERF_CNT_EN_PWRCNT2); + + return samp; } static uint64_t count_samples(const struct fd_rb_samp_ctrs *start, - const struct fd_rb_samp_ctrs *end) + const struct fd_rb_samp_ctrs *end) { - uint64_t n = 0; - unsigned i; + uint64_t n = 0; + unsigned i; - /* not quite sure what all of these are, possibly different - * counters for each MRT render target: - */ - for (i = 0; i < 16; i += 4) - n += end->ctr[i] - start->ctr[i]; + /* not quite sure what all of these are, possibly different + * counters for each MRT render target: + */ + for (i = 0; i < 16; i += 4) + n += end->ctr[i] - start->ctr[i]; - return n; + return n; } static void -occlusion_counter_accumulate_result(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result) +occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start, + const void *end, + union pipe_query_result *result) { - uint64_t n = count_samples(start, end); - result->u64 += n; + uint64_t n = count_samples(start, end); + result->u64 += n; } static void -occlusion_predicate_accumulate_result(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result) +occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start, + const void *end, + union pipe_query_result *result) { - uint64_t n = count_samples(start, end); - result->b |= (n > 0); + uint64_t n = count_samples(start, end); + result->b |= (n > 0); } static const struct fd_hw_sample_provider occlusion_counter = { - .query_type = PIPE_QUERY_OCCLUSION_COUNTER, - .get_sample = occlusion_get_sample, - .accumulate_result = occlusion_counter_accumulate_result, + .query_type = PIPE_QUERY_OCCLUSION_COUNTER, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_counter_accumulate_result, }; static const struct fd_hw_sample_provider occlusion_predicate = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, - .get_sample = occlusion_get_sample, - .accumulate_result = occlusion_predicate_accumulate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_predicate_accumulate_result, }; static const struct fd_hw_sample_provider occlusion_predicate_conservative = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, - .get_sample = occlusion_get_sample, - .accumulate_result = occlusion_predicate_accumulate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_predicate_accumulate_result, }; -void fd3_query_context_init(struct pipe_context *pctx) - disable_thread_safety_analysis +void +fd3_query_context_init(struct pipe_context *pctx) 
disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->create_query = fd_hw_create_query; - ctx->query_prepare = fd_hw_query_prepare; - ctx->query_prepare_tile = fd_hw_query_prepare_tile; - ctx->query_update_batch = fd_hw_query_update_batch; + ctx->create_query = fd_hw_create_query; + ctx->query_prepare = fd_hw_query_prepare; + ctx->query_prepare_tile = fd_hw_query_prepare_tile; + ctx->query_update_batch = fd_hw_query_update_batch; - fd_hw_query_register_provider(pctx, &occlusion_counter); - fd_hw_query_register_provider(pctx, &occlusion_predicate); - fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative); + fd_hw_query_register_provider(pctx, &occlusion_counter); + fd_hw_query_register_provider(pctx, &occlusion_predicate); + fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c index 324998a..481efc3 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c @@ -24,80 +24,79 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd3_rasterizer.h" #include "fd3_context.h" #include "fd3_format.h" +#include "fd3_rasterizer.h" void * fd3_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) + const struct pipe_rasterizer_state *cso) { - struct fd3_rasterizer_stateobj *so; - float psize_min, psize_max; + struct fd3_rasterizer_stateobj *so; + float psize_min, psize_max; - so = CALLOC_STRUCT(fd3_rasterizer_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd3_rasterizer_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - if (cso->point_size_per_vertex) { - psize_min = util_get_min_point_size(cso); - psize_max = 4092; - } else { - /* Force the point size to be as if the vertex output was disabled. */ - psize_min = cso->point_size; - psize_max = cso->point_size; - } + if (cso->point_size_per_vertex) { + psize_min = util_get_min_point_size(cso); + psize_max = 4092; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = cso->point_size; + psize_max = cso->point_size; + } -/* - if (cso->line_stipple_enable) { - ??? TODO line stipple - } - TODO cso->half_pixel_center - if (cso->multisample) - TODO -*/ - so->gras_cl_clip_cntl = - COND(cso->clip_halfz, A3XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z); - so->gras_su_point_minmax = - A3XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | - A3XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); - so->gras_su_point_size = A3XX_GRAS_SU_POINT_SIZE(cso->point_size); - so->gras_su_poly_offset_scale = - A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale); - so->gras_su_poly_offset_offset = - A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f); + /* + if (cso->line_stipple_enable) { + ??? 
TODO line stipple + } + TODO cso->half_pixel_center + if (cso->multisample) + TODO + */ + so->gras_cl_clip_cntl = + COND(cso->clip_halfz, A3XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z); + so->gras_su_point_minmax = A3XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | + A3XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); + so->gras_su_point_size = A3XX_GRAS_SU_POINT_SIZE(cso->point_size); + so->gras_su_poly_offset_scale = + A3XX_GRAS_SU_POLY_OFFSET_SCALE_VAL(cso->offset_scale); + so->gras_su_poly_offset_offset = + A3XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f); - so->gras_su_mode_control = - A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0); + so->gras_su_mode_control = + A3XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width / 2.0); - so->pc_prim_vtx_cntl = - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) | - A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); + so->pc_prim_vtx_cntl = A3XX_PC_PRIM_VTX_CNTL_POLYMODE_FRONT_PTYPE( + fd_polygon_mode(cso->fill_front)) | + A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE( + fd_polygon_mode(cso->fill_back)); - if (cso->fill_front != PIPE_POLYGON_MODE_FILL || - cso->fill_back != PIPE_POLYGON_MODE_FILL) - so->pc_prim_vtx_cntl |= A3XX_PC_PRIM_VTX_CNTL_POLYMODE_ENABLE; + if (cso->fill_front != PIPE_POLYGON_MODE_FILL || + cso->fill_back != PIPE_POLYGON_MODE_FILL) + so->pc_prim_vtx_cntl |= A3XX_PC_PRIM_VTX_CNTL_POLYMODE_ENABLE; - if (cso->cull_face & PIPE_FACE_FRONT) - so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_CULL_FRONT; - if (cso->cull_face & PIPE_FACE_BACK) - so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_CULL_BACK; - if (!cso->front_ccw) - so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_FRONT_CW; - if (!cso->flatshade_first) - so->pc_prim_vtx_cntl |= A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST; + if (cso->cull_face & PIPE_FACE_FRONT) + so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_CULL_FRONT; + if (cso->cull_face & PIPE_FACE_BACK) + so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_CULL_BACK; + if (!cso->front_ccw) + so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_FRONT_CW; + if (!cso->flatshade_first) + so->pc_prim_vtx_cntl |= A3XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST; - if (cso->offset_tri) - so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET; - if (!cso->depth_clip_near) - so->gras_cl_clip_cntl |= A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE; + if (cso->offset_tri) + so->gras_su_mode_control |= A3XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET; + if (!cso->depth_clip_near) + so->gras_cl_clip_cntl |= A3XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE; - return so; + return so; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h index 1daaeee..5daa2d1 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.h @@ -27,28 +27,28 @@ #ifndef FD3_RASTERIZER_H_ #define FD3_RASTERIZER_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" struct fd3_rasterizer_stateobj { - struct pipe_rasterizer_state base; - uint32_t gras_su_point_minmax; - uint32_t gras_su_point_size; - uint32_t gras_su_poly_offset_scale; - uint32_t gras_su_poly_offset_offset; + struct pipe_rasterizer_state base; + uint32_t gras_su_point_minmax; + uint32_t gras_su_point_size; + uint32_t gras_su_poly_offset_scale; + uint32_t gras_su_poly_offset_offset; - uint32_t gras_su_mode_control; - uint32_t gras_cl_clip_cntl; - uint32_t pc_prim_vtx_cntl; + uint32_t 
gras_su_mode_control; + uint32_t gras_cl_clip_cntl; + uint32_t pc_prim_vtx_cntl; }; static inline struct fd3_rasterizer_stateobj * fd3_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { - return (struct fd3_rasterizer_stateobj *)rast; + return (struct fd3_rasterizer_stateobj *)rast; } -void * fd3_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso); +void *fd3_rasterizer_state_create(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso); #endif /* FD3_RASTERIZER_H_ */ diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_resource.c b/src/gallium/drivers/freedreno/a3xx/fd3_resource.c index 96aa34c..b5c7f7f 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_resource.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_resource.c @@ -26,95 +26,96 @@ #include "fd3_format.h" static uint32_t -setup_slices(struct fd_resource *rsc, uint32_t alignment, enum pipe_format format) +setup_slices(struct fd_resource *rsc, uint32_t alignment, + enum pipe_format format) { - struct pipe_resource *prsc = &rsc->b.b; - uint32_t level, size = 0; - uint32_t width0 = prsc->width0; - - if (rsc->layout.tile_mode && prsc->target != PIPE_TEXTURE_CUBE) - width0 = util_next_power_of_two(width0); - - /* 32 pixel alignment */ - fdl_set_pitchalign(&rsc->layout, fdl_cpp_shift(&rsc->layout) + 5); - - for (level = 0; level <= prsc->last_level; level++) { - struct fdl_slice *slice = fd_resource_slice(rsc, level); - uint32_t pitch = fdl_pitch(&rsc->layout, level); - uint32_t height = u_minify(prsc->height0, level); - if (rsc->layout.tile_mode) { - height = align(height, 4); - if (prsc->target != PIPE_TEXTURE_CUBE) - height = util_next_power_of_two(height); - } - - uint32_t nblocksy = util_format_get_nblocksy(format, height); - - slice->offset = size; - /* 1d array and 2d array textures must all have the same layer size - * for each miplevel on a3xx. 3d textures can have different layer - * sizes for high levels, but the hw auto-sizer is buggy (or at least - * different than what this code does), so as soon as the layer size - * range gets into range, we stop reducing it. - */ - if (prsc->target == PIPE_TEXTURE_3D && ( - level == 1 || - (level > 1 && fd_resource_slice(rsc, level - 1)->size0 > 0xf000))) - slice->size0 = align(nblocksy * pitch, alignment); - else if (level == 0 || alignment == 1) - slice->size0 = align(nblocksy * pitch, alignment); - else - slice->size0 = fd_resource_slice(rsc, level - 1)->size0; - - size += slice->size0 * u_minify(prsc->depth0, level) * prsc->array_size; - } - - return size; + struct pipe_resource *prsc = &rsc->b.b; + uint32_t level, size = 0; + uint32_t width0 = prsc->width0; + + if (rsc->layout.tile_mode && prsc->target != PIPE_TEXTURE_CUBE) + width0 = util_next_power_of_two(width0); + + /* 32 pixel alignment */ + fdl_set_pitchalign(&rsc->layout, fdl_cpp_shift(&rsc->layout) + 5); + + for (level = 0; level <= prsc->last_level; level++) { + struct fdl_slice *slice = fd_resource_slice(rsc, level); + uint32_t pitch = fdl_pitch(&rsc->layout, level); + uint32_t height = u_minify(prsc->height0, level); + if (rsc->layout.tile_mode) { + height = align(height, 4); + if (prsc->target != PIPE_TEXTURE_CUBE) + height = util_next_power_of_two(height); + } + + uint32_t nblocksy = util_format_get_nblocksy(format, height); + + slice->offset = size; + /* 1d array and 2d array textures must all have the same layer size + * for each miplevel on a3xx. 
3d textures can have different layer + * sizes for high levels, but the hw auto-sizer is buggy (or at least + * different than what this code does), so as soon as the layer size + * range gets into range, we stop reducing it. + */ + if (prsc->target == PIPE_TEXTURE_3D && + (level == 1 || + (level > 1 && fd_resource_slice(rsc, level - 1)->size0 > 0xf000))) + slice->size0 = align(nblocksy * pitch, alignment); + else if (level == 0 || alignment == 1) + slice->size0 = align(nblocksy * pitch, alignment); + else + slice->size0 = fd_resource_slice(rsc, level - 1)->size0; + + size += slice->size0 * u_minify(prsc->depth0, level) * prsc->array_size; + } + + return size; } uint32_t fd3_setup_slices(struct fd_resource *rsc) { - uint32_t alignment; - - switch (rsc->b.b.target) { - case PIPE_TEXTURE_3D: - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - alignment = 4096; - break; - default: - alignment = 1; - break; - } - - return setup_slices(rsc, alignment, rsc->b.b.format); + uint32_t alignment; + + switch (rsc->b.b.target) { + case PIPE_TEXTURE_3D: + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + alignment = 4096; + break; + default: + alignment = 1; + break; + } + + return setup_slices(rsc, alignment, rsc->b.b.format); } static bool ok_format(enum pipe_format pfmt) { - enum a3xx_color_fmt fmt = fd3_pipe2color(pfmt); + enum a3xx_color_fmt fmt = fd3_pipe2color(pfmt); - if (fmt == RB_NONE) - return false; + if (fmt == RB_NONE) + return false; - switch (pfmt) { - case PIPE_FORMAT_R8_UINT: - case PIPE_FORMAT_R8_SINT: - case PIPE_FORMAT_Z32_FLOAT: - return false; - default: - break; - } + switch (pfmt) { + case PIPE_FORMAT_R8_UINT: + case PIPE_FORMAT_R8_SINT: + case PIPE_FORMAT_Z32_FLOAT: + return false; + default: + break; + } - return true; + return true; } unsigned fd3_tile_mode(const struct pipe_resource *tmpl) { - if (ok_format(tmpl->format)) - return TILE_4X4; - return LINEAR; + if (ok_format(tmpl->format)) + return TILE_4X4; + return LINEAR; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c index dce4259..f0095bc 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c @@ -27,90 +27,84 @@ #include "pipe/p_screen.h" #include "util/format/u_format.h" -#include "fd3_screen.h" #include "fd3_context.h" -#include "fd3_format.h" #include "fd3_emit.h" +#include "fd3_format.h" #include "fd3_resource.h" +#include "fd3_screen.h" #include "ir3/ir3_compiler.h" static bool fd3_screen_is_format_supported(struct pipe_screen *pscreen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - unsigned retval = 0; + unsigned retval = 0; - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - (sample_count > 1)) { /* TODO add MSAA */ - DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", - util_format_name(format), target, sample_count, usage); - return false; - } + if ((target >= PIPE_MAX_TEXTURE_TYPES) || + (sample_count > 1)) { /* TODO add MSAA */ + DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", + util_format_name(format), target, sample_count, usage); + return false; + } - if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) - return false; + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; - 
if ((usage & PIPE_BIND_VERTEX_BUFFER) && - (fd3_pipe2vtx(format) != VFMT_NONE)) { - retval |= PIPE_BIND_VERTEX_BUFFER; - } + if ((usage & PIPE_BIND_VERTEX_BUFFER) && + (fd3_pipe2vtx(format) != VFMT_NONE)) { + retval |= PIPE_BIND_VERTEX_BUFFER; + } - if ((usage & PIPE_BIND_SAMPLER_VIEW) && - (fd3_pipe2tex(format) != TFMT_NONE)) { - retval |= PIPE_BIND_SAMPLER_VIEW; - } + if ((usage & PIPE_BIND_SAMPLER_VIEW) && + (fd3_pipe2tex(format) != TFMT_NONE)) { + retval |= PIPE_BIND_SAMPLER_VIEW; + } - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_BLENDABLE)) && - (fd3_pipe2color(format) != RB_NONE) && - (fd3_pipe2tex(format) != TFMT_NONE)) { - retval |= usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED); - if (!util_format_is_pure_integer(format)) - retval |= usage & PIPE_BIND_BLENDABLE; - } + if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) && + (fd3_pipe2color(format) != RB_NONE) && + (fd3_pipe2tex(format) != TFMT_NONE)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED); + if (!util_format_is_pure_integer(format)) + retval |= usage & PIPE_BIND_BLENDABLE; + } - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (fd_pipe2depth(format) != (enum adreno_rb_depth_format)~0) && - (fd3_pipe2tex(format) != TFMT_NONE)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } + if ((usage & PIPE_BIND_DEPTH_STENCIL) && + (fd_pipe2depth(format) != (enum adreno_rb_depth_format) ~0) && + (fd3_pipe2tex(format) != TFMT_NONE)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } - if ((usage & PIPE_BIND_INDEX_BUFFER) && - (fd_pipe2index(format) != (enum pc_di_index_size)~0)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } + if ((usage & PIPE_BIND_INDEX_BUFFER) && + (fd_pipe2index(format) != (enum pc_di_index_size) ~0)) { + retval |= PIPE_BIND_INDEX_BUFFER; + } - if (retval != usage) { - DBG("not supported: format=%s, target=%d, sample_count=%d, " - "usage=%x, retval=%x", util_format_name(format), - target, sample_count, usage, retval); - } + if (retval != usage) { + DBG("not supported: format=%s, target=%d, sample_count=%d, " + "usage=%x, retval=%x", + util_format_name(format), target, sample_count, usage, retval); + } - return retval == usage; + return retval == usage; } void fd3_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = A3XX_MAX_RENDER_TARGETS; - pscreen->context_create = fd3_context_create; - pscreen->is_format_supported = fd3_screen_is_format_supported; - fd3_emit_init_screen(pscreen); - ir3_screen_init(pscreen); + struct fd_screen *screen = fd_screen(pscreen); + screen->max_rts = A3XX_MAX_RENDER_TARGETS; + pscreen->context_create = fd3_context_create; + pscreen->is_format_supported = fd3_screen_is_format_supported; + fd3_emit_init_screen(pscreen); + ir3_screen_init(pscreen); - screen->setup_slices = fd3_setup_slices; - if (FD_DBG(TTILE)) - screen->tile_mode = fd3_tile_mode; + screen->setup_slices = fd3_setup_slices; + if (FD_DBG(TTILE)) + screen->tile_mode = fd3_tile_mode; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c index 3518dac..51c5459 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c @@ -25,204 +25,199 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" 
-#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "fd3_texture.h" #include "fd3_format.h" +#include "fd3_texture.h" static enum a3xx_tex_clamp tex_clamp(unsigned wrap, bool *needs_border) { - switch (wrap) { - case PIPE_TEX_WRAP_REPEAT: - return A3XX_TEX_REPEAT; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return A3XX_TEX_CLAMP_TO_EDGE; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - *needs_border = true; - return A3XX_TEX_CLAMP_TO_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - /* only works for PoT.. need to emulate otherwise! */ - return A3XX_TEX_MIRROR_CLAMP; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return A3XX_TEX_MIRROR_REPEAT; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - /* these two we could perhaps emulate, but we currently - * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP - */ - default: - DBG("invalid wrap: %u", wrap); - return 0; - } + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return A3XX_TEX_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return A3XX_TEX_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + *needs_border = true; + return A3XX_TEX_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + /* only works for PoT.. need to emulate otherwise! */ + return A3XX_TEX_MIRROR_CLAMP; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return A3XX_TEX_MIRROR_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + /* these two we could perhaps emulate, but we currently + * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP + */ + default: + DBG("invalid wrap: %u", wrap); + return 0; + } } static enum a3xx_tex_filter tex_filter(unsigned filter, bool aniso) { - switch (filter) { - case PIPE_TEX_FILTER_NEAREST: - return A3XX_TEX_NEAREST; - case PIPE_TEX_FILTER_LINEAR: - return aniso ? A3XX_TEX_ANISO : A3XX_TEX_LINEAR; - default: - DBG("invalid filter: %u", filter); - return 0; - } + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: + return A3XX_TEX_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + return aniso ? 
A3XX_TEX_ANISO : A3XX_TEX_LINEAR; + default: + DBG("invalid filter: %u", filter); + return 0; + } } static void * fd3_sampler_state_create(struct pipe_context *pctx, - const struct pipe_sampler_state *cso) + const struct pipe_sampler_state *cso) { - struct fd3_sampler_stateobj *so = CALLOC_STRUCT(fd3_sampler_stateobj); - unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); - bool miplinear = false; - - if (!so) - return NULL; - - if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - miplinear = true; - - so->base = *cso; - - so->needs_border = false; - so->texsamp0 = - COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) | - COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) | - COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) | - A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | - A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | - A3XX_TEX_SAMP_0_ANISO(aniso) | - A3XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | - A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | - A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); - - if (cso->compare_mode) - so->texsamp0 |= A3XX_TEX_SAMP_0_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ - - so->texsamp1 = A3XX_TEX_SAMP_1_LOD_BIAS(cso->lod_bias); - - if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { - so->texsamp1 |= - A3XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | - A3XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); - } else { - /* If we're not doing mipmap filtering, we still need a slightly > 0 - * LOD clamp so the HW can decide between min and mag filtering of - * level 0. - */ - so->texsamp1 |= - A3XX_TEX_SAMP_1_MIN_LOD(MIN2(cso->min_lod, 0.125)) | - A3XX_TEX_SAMP_1_MAX_LOD(MIN2(cso->max_lod, 0.125)); - } - - return so; + struct fd3_sampler_stateobj *so = CALLOC_STRUCT(fd3_sampler_stateobj); + unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); + bool miplinear = false; + + if (!so) + return NULL; + + if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) + miplinear = true; + + so->base = *cso; + + so->needs_border = false; + so->texsamp0 = + COND(!cso->normalized_coords, A3XX_TEX_SAMP_0_UNNORM_COORDS) | + COND(!cso->seamless_cube_map, A3XX_TEX_SAMP_0_CUBEMAPSEAMLESSFILTOFF) | + COND(miplinear, A3XX_TEX_SAMP_0_MIPFILTER_LINEAR) | + A3XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | + A3XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | + A3XX_TEX_SAMP_0_ANISO(aniso) | + A3XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | + A3XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | + A3XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); + + if (cso->compare_mode) + so->texsamp0 |= + A3XX_TEX_SAMP_0_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ + + so->texsamp1 = A3XX_TEX_SAMP_1_LOD_BIAS(cso->lod_bias); + + if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + so->texsamp1 |= A3XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | + A3XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); + } else { + /* If we're not doing mipmap filtering, we still need a slightly > 0 + * LOD clamp so the HW can decide between min and mag filtering of + * level 0. 
+ */ + so->texsamp1 |= A3XX_TEX_SAMP_1_MIN_LOD(MIN2(cso->min_lod, 0.125)) | + A3XX_TEX_SAMP_1_MAX_LOD(MIN2(cso->max_lod, 0.125)); + } + + return so; } static enum a3xx_tex_type tex_type(unsigned target) { - switch (target) { - default: - assert(0); - case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return A3XX_TEX_1D; - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_2D_ARRAY: - return A3XX_TEX_2D; - case PIPE_TEXTURE_3D: - return A3XX_TEX_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return A3XX_TEX_CUBE; - } + switch (target) { + default: + assert(0); + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return A3XX_TEX_1D; + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + return A3XX_TEX_2D; + case PIPE_TEXTURE_3D: + return A3XX_TEX_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return A3XX_TEX_CUBE; + } } static struct pipe_sampler_view * fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, - const struct pipe_sampler_view *cso) + const struct pipe_sampler_view *cso) { - struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view); - struct fd_resource *rsc = fd_resource(prsc); - unsigned lvl; - - if (!so) - return NULL; - - so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; - so->base.reference.count = 1; - so->base.context = pctx; - - so->texconst0 = - A3XX_TEX_CONST_0_TILE_MODE(rsc->layout.tile_mode) | - A3XX_TEX_CONST_0_TYPE(tex_type(prsc->target)) | - A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(cso->format)) | - fd3_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, - cso->swizzle_b, cso->swizzle_a); - - if (prsc->target == PIPE_BUFFER || util_format_is_pure_integer(cso->format)) - so->texconst0 |= A3XX_TEX_CONST_0_NOCONVERT; - if (util_format_is_srgb(cso->format)) - so->texconst0 |= A3XX_TEX_CONST_0_SRGB; - - if (prsc->target == PIPE_BUFFER) { - lvl = 0; - so->texconst1 = - A3XX_TEX_CONST_1_WIDTH(cso->u.buf.size / util_format_get_blocksize(cso->format)) | - A3XX_TEX_CONST_1_HEIGHT(1); - } else { - unsigned miplevels; - - lvl = fd_sampler_first_level(cso); - miplevels = fd_sampler_last_level(cso) - lvl; - - so->texconst0 |= A3XX_TEX_CONST_0_MIPLVLS(miplevels); - so->texconst1 = - A3XX_TEX_CONST_1_PITCHALIGN(rsc->layout.pitchalign - 4) | - A3XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | - A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); - } - /* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */ - struct fdl_slice *slice = fd_resource_slice(rsc, lvl); - so->texconst2 = - A3XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); - switch (prsc->target) { - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - so->texconst3 = - A3XX_TEX_CONST_3_DEPTH(prsc->array_size - 1) | - A3XX_TEX_CONST_3_LAYERSZ1(slice->size0); - break; - case PIPE_TEXTURE_3D: - so->texconst3 = - A3XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) | - A3XX_TEX_CONST_3_LAYERSZ1(slice->size0); - so->texconst3 |= A3XX_TEX_CONST_3_LAYERSZ2( - fd_resource_slice(rsc, prsc->last_level)->size0); - break; - default: - so->texconst3 = 0x00000000; - break; - } - - return &so->base; + struct fd3_pipe_sampler_view *so = CALLOC_STRUCT(fd3_pipe_sampler_view); + struct fd_resource *rsc = fd_resource(prsc); + unsigned lvl; + + if (!so) + return NULL; + + so->base = *cso; + pipe_reference(NULL, &prsc->reference); + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texconst0 
= A3XX_TEX_CONST_0_TILE_MODE(rsc->layout.tile_mode) | + A3XX_TEX_CONST_0_TYPE(tex_type(prsc->target)) | + A3XX_TEX_CONST_0_FMT(fd3_pipe2tex(cso->format)) | + fd3_tex_swiz(cso->format, cso->swizzle_r, cso->swizzle_g, + cso->swizzle_b, cso->swizzle_a); + + if (prsc->target == PIPE_BUFFER || util_format_is_pure_integer(cso->format)) + so->texconst0 |= A3XX_TEX_CONST_0_NOCONVERT; + if (util_format_is_srgb(cso->format)) + so->texconst0 |= A3XX_TEX_CONST_0_SRGB; + + if (prsc->target == PIPE_BUFFER) { + lvl = 0; + so->texconst1 = + A3XX_TEX_CONST_1_WIDTH(cso->u.buf.size / + util_format_get_blocksize(cso->format)) | + A3XX_TEX_CONST_1_HEIGHT(1); + } else { + unsigned miplevels; + + lvl = fd_sampler_first_level(cso); + miplevels = fd_sampler_last_level(cso) - lvl; + + so->texconst0 |= A3XX_TEX_CONST_0_MIPLVLS(miplevels); + so->texconst1 = A3XX_TEX_CONST_1_PITCHALIGN(rsc->layout.pitchalign - 4) | + A3XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | + A3XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); + } + /* when emitted, A3XX_TEX_CONST_2_INDX() must be OR'd in: */ + struct fdl_slice *slice = fd_resource_slice(rsc, lvl); + so->texconst2 = A3XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); + switch (prsc->target) { + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + so->texconst3 = A3XX_TEX_CONST_3_DEPTH(prsc->array_size - 1) | + A3XX_TEX_CONST_3_LAYERSZ1(slice->size0); + break; + case PIPE_TEXTURE_3D: + so->texconst3 = A3XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) | + A3XX_TEX_CONST_3_LAYERSZ1(slice->size0); + so->texconst3 |= A3XX_TEX_CONST_3_LAYERSZ2( + fd_resource_slice(rsc, prsc->last_level)->size0); + break; + default: + so->texconst3 = 0x00000000; + break; + } + + return &so->base; } void fd3_texture_init(struct pipe_context *pctx) { - pctx->create_sampler_state = fd3_sampler_state_create; - pctx->bind_sampler_states = fd_sampler_states_bind; - pctx->create_sampler_view = fd3_sampler_view_create; - pctx->set_sampler_views = fd_set_sampler_views; + pctx->create_sampler_state = fd3_sampler_state_create; + pctx->bind_sampler_states = fd_sampler_states_bind; + pctx->create_sampler_view = fd3_sampler_view_create; + pctx->set_sampler_views = fd_set_sampler_views; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h index ddc4dfd..297a209 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.h @@ -29,37 +29,37 @@ #include "pipe/p_context.h" -#include "freedreno_texture.h" #include "freedreno_resource.h" +#include "freedreno_texture.h" #include "fd3_context.h" #include "fd3_format.h" struct fd3_sampler_stateobj { - struct pipe_sampler_state base; - uint32_t texsamp0, texsamp1; - bool needs_border; + struct pipe_sampler_state base; + uint32_t texsamp0, texsamp1; + bool needs_border; }; static inline struct fd3_sampler_stateobj * fd3_sampler_stateobj(struct pipe_sampler_state *samp) { - return (struct fd3_sampler_stateobj *)samp; + return (struct fd3_sampler_stateobj *)samp; } struct fd3_pipe_sampler_view { - struct pipe_sampler_view base; - uint32_t texconst0, texconst1, texconst2, texconst3; + struct pipe_sampler_view base; + uint32_t texconst0, texconst1, texconst2, texconst3; }; static inline struct fd3_pipe_sampler_view * fd3_pipe_sampler_view(struct pipe_sampler_view *pview) { - return (struct fd3_pipe_sampler_view *)pview; + return (struct fd3_pipe_sampler_view *)pview; } unsigned fd3_get_const_idx(struct fd_context *ctx, - struct fd_texture_stateobj 
*tex, unsigned samp_id); + struct fd_texture_stateobj *tex, unsigned samp_id); void fd3_texture_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c index 6eeed64..7cbb287 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.c @@ -24,79 +24,75 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd3_zsa.h" #include "fd3_context.h" #include "fd3_format.h" +#include "fd3_zsa.h" void * fd3_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso) + const struct pipe_depth_stencil_alpha_state *cso) { - struct fd3_zsa_stateobj *so; + struct fd3_zsa_stateobj *so; - so = CALLOC_STRUCT(fd3_zsa_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd3_zsa_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - so->rb_depth_control |= - A3XX_RB_DEPTH_CONTROL_ZFUNC(cso->depth_func); /* maps 1:1 */ + so->rb_depth_control |= + A3XX_RB_DEPTH_CONTROL_ZFUNC(cso->depth_func); /* maps 1:1 */ - if (cso->depth_enabled) - so->rb_depth_control |= - A3XX_RB_DEPTH_CONTROL_Z_ENABLE | - A3XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE; + if (cso->depth_enabled) + so->rb_depth_control |= + A3XX_RB_DEPTH_CONTROL_Z_ENABLE | A3XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE; - if (cso->depth_writemask) - so->rb_depth_control |= A3XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE; + if (cso->depth_writemask) + so->rb_depth_control |= A3XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE; - if (cso->stencil[0].enabled) { - const struct pipe_stencil_state *s = &cso->stencil[0]; + if (cso->stencil[0].enabled) { + const struct pipe_stencil_state *s = &cso->stencil[0]; - so->rb_stencil_control |= - A3XX_RB_STENCIL_CONTROL_STENCIL_READ | - A3XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A3XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ - A3XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | - A3XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | - A3XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); - so->rb_stencilrefmask |= - 0xff000000 | /* ??? */ - A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | - A3XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); + so->rb_stencil_control |= + A3XX_RB_STENCIL_CONTROL_STENCIL_READ | + A3XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | + A3XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ + A3XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | + A3XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | + A3XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); + so->rb_stencilrefmask |= + 0xff000000 | /* ??? */ + A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | + A3XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); - if (cso->stencil[1].enabled) { - const struct pipe_stencil_state *bs = &cso->stencil[1]; + if (cso->stencil[1].enabled) { + const struct pipe_stencil_state *bs = &cso->stencil[1]; - so->rb_stencil_control |= - A3XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A3XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ - A3XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | - A3XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | - A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); - so->rb_stencilrefmask_bf |= - 0xff000000 | /* ??? 
*/ - A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(bs->writemask) | - A3XX_RB_STENCILREFMASK_STENCILMASK(bs->valuemask); - } - } + so->rb_stencil_control |= + A3XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | + A3XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ + A3XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | + A3XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | + A3XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); + so->rb_stencilrefmask_bf |= + 0xff000000 | /* ??? */ + A3XX_RB_STENCILREFMASK_STENCILWRITEMASK(bs->writemask) | + A3XX_RB_STENCILREFMASK_STENCILMASK(bs->valuemask); + } + } - if (cso->alpha_enabled) { - so->rb_render_control = - A3XX_RB_RENDER_CONTROL_ALPHA_TEST | - A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); - so->rb_alpha_ref = - A3XX_RB_ALPHA_REF_UINT(cso->alpha_ref_value * 255.0) | - A3XX_RB_ALPHA_REF_FLOAT(cso->alpha_ref_value); - so->rb_depth_control |= - A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; - } + if (cso->alpha_enabled) { + so->rb_render_control = + A3XX_RB_RENDER_CONTROL_ALPHA_TEST | + A3XX_RB_RENDER_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); + so->rb_alpha_ref = A3XX_RB_ALPHA_REF_UINT(cso->alpha_ref_value * 255.0) | + A3XX_RB_ALPHA_REF_FLOAT(cso->alpha_ref_value); + so->rb_depth_control |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; + } - return so; + return so; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h index 8dbbfc8..e69b02f 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_zsa.h @@ -27,29 +27,28 @@ #ifndef FD3_ZSA_H_ #define FD3_ZSA_H_ - -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd3_zsa_stateobj { - struct pipe_depth_stencil_alpha_state base; - uint32_t rb_render_control; - uint32_t rb_alpha_ref; - uint32_t rb_depth_control; - uint32_t rb_stencil_control; - uint32_t rb_stencilrefmask; - uint32_t rb_stencilrefmask_bf; + struct pipe_depth_stencil_alpha_state base; + uint32_t rb_render_control; + uint32_t rb_alpha_ref; + uint32_t rb_depth_control; + uint32_t rb_stencil_control; + uint32_t rb_stencilrefmask; + uint32_t rb_stencilrefmask_bf; }; static inline struct fd3_zsa_stateobj * fd3_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { - return (struct fd3_zsa_stateobj *)zsa; + return (struct fd3_zsa_stateobj *)zsa; } -void * fd3_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso); +void *fd3_zsa_state_create(struct pipe_context *pctx, + const struct pipe_depth_stencil_alpha_state *cso); #endif /* FD3_ZSA_H_ */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c index 9b067e0..d6f62ec 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.c @@ -26,8 +26,8 @@ #include "pipe/p_state.h" #include "util/u_blend.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" #include "fd4_blend.h" #include "fd4_context.h" @@ -36,83 +36,89 @@ static enum a3xx_rb_blend_opcode blend_func(unsigned func) { - switch (func) { - case PIPE_BLEND_ADD: - return BLEND_DST_PLUS_SRC; - case PIPE_BLEND_MIN: - return BLEND_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return BLEND_MAX_DST_SRC; - case PIPE_BLEND_SUBTRACT: - return BLEND_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return BLEND_DST_MINUS_SRC; - default: - DBG("invalid blend func: %x", func); - return 0; - } + 
switch (func) { + case PIPE_BLEND_ADD: + return BLEND_DST_PLUS_SRC; + case PIPE_BLEND_MIN: + return BLEND_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return BLEND_MAX_DST_SRC; + case PIPE_BLEND_SUBTRACT: + return BLEND_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return BLEND_DST_MINUS_SRC; + default: + DBG("invalid blend func: %x", func); + return 0; + } } void * fd4_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso) + const struct pipe_blend_state *cso) { - struct fd4_blend_stateobj *so; - enum a3xx_rop_code rop = ROP_COPY; - bool reads_dest = false; - unsigned i, mrt_blend = 0; - - if (cso->logicop_enable) { - rop = cso->logicop_func; /* maps 1:1 */ - reads_dest = util_logicop_reads_dest(cso->logicop_func); - } - - so = CALLOC_STRUCT(fd4_blend_stateobj); - if (!so) - return NULL; - - so->base = *cso; - - for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { - const struct pipe_rt_blend_state *rt; - - if (cso->independent_blend_enable) - rt = &cso->rt[i]; - else - rt = &cso->rt[0]; - - so->rb_mrt[i].blend_control = - A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) | - A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | - A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | - A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - - so->rb_mrt[i].control = - A4XX_RB_MRT_CONTROL_ROP_CODE(rop) | - COND(cso->logicop_enable, A4XX_RB_MRT_CONTROL_ROP_ENABLE) | - A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); - - if (rt->blend_enable) { - so->rb_mrt[i].control |= - A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE | - A4XX_RB_MRT_CONTROL_BLEND | - A4XX_RB_MRT_CONTROL_BLEND2; - mrt_blend |= (1 << i); - } - - if (reads_dest) { - so->rb_mrt[i].control |= A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE; - mrt_blend |= (1 << i); - } - - if (cso->dither) - so->rb_mrt[i].buf_info |= A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS); - } - - so->rb_fs_output = A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend) | - COND(cso->independent_blend_enable, A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND); - - return so; + struct fd4_blend_stateobj *so; + enum a3xx_rop_code rop = ROP_COPY; + bool reads_dest = false; + unsigned i, mrt_blend = 0; + + if (cso->logicop_enable) { + rop = cso->logicop_func; /* maps 1:1 */ + reads_dest = util_logicop_reads_dest(cso->logicop_func); + } + + so = CALLOC_STRUCT(fd4_blend_stateobj); + if (!so) + return NULL; + + so->base = *cso; + + for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { + const struct pipe_rt_blend_state *rt; + + if (cso->independent_blend_enable) + rt = &cso->rt[i]; + else + rt = &cso->rt[0]; + + so->rb_mrt[i].blend_control = + A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR( + fd_blend_factor(rt->rgb_src_factor)) | + A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | + A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR( + fd_blend_factor(rt->rgb_dst_factor)) | + A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR( + fd_blend_factor(rt->alpha_src_factor)) | + A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE( + blend_func(rt->alpha_func)) | + A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR( + fd_blend_factor(rt->alpha_dst_factor)); + + so->rb_mrt[i].control = + A4XX_RB_MRT_CONTROL_ROP_CODE(rop) | + COND(cso->logicop_enable, A4XX_RB_MRT_CONTROL_ROP_ENABLE) | + A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); + + if 
(rt->blend_enable) { + so->rb_mrt[i].control |= A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE | + A4XX_RB_MRT_CONTROL_BLEND | + A4XX_RB_MRT_CONTROL_BLEND2; + mrt_blend |= (1 << i); + } + + if (reads_dest) { + so->rb_mrt[i].control |= A4XX_RB_MRT_CONTROL_READ_DEST_ENABLE; + mrt_blend |= (1 << i); + } + + if (cso->dither) + so->rb_mrt[i].buf_info |= + A4XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS); + } + + so->rb_fs_output = + A4XX_RB_FS_OUTPUT_ENABLE_BLEND(mrt_blend) | + COND(cso->independent_blend_enable, A4XX_RB_FS_OUTPUT_INDEPENDENT_BLEND); + + return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h index 74364ce..8aa0bc7 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_blend.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_blend.h @@ -27,28 +27,28 @@ #ifndef FD4_BLEND_H_ #define FD4_BLEND_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd4_blend_stateobj { - struct pipe_blend_state base; - struct { - uint32_t control; - uint32_t buf_info; - uint32_t blend_control; - } rb_mrt[A4XX_MAX_RENDER_TARGETS]; - uint32_t rb_fs_output; + struct pipe_blend_state base; + struct { + uint32_t control; + uint32_t buf_info; + uint32_t blend_control; + } rb_mrt[A4XX_MAX_RENDER_TARGETS]; + uint32_t rb_fs_output; }; static inline struct fd4_blend_stateobj * fd4_blend_stateobj(struct pipe_blend_state *blend) { - return (struct fd4_blend_stateobj *)blend; + return (struct fd4_blend_stateobj *)blend; } -void * fd4_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso); +void *fd4_blend_state_create(struct pipe_context *pctx, + const struct pipe_blend_state *cso); #endif /* FD4_BLEND_H_ */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c index e64fd11..dcfc2a2 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -26,8 +26,8 @@ #include "freedreno_query_hw.h" -#include "fd4_context.h" #include "fd4_blend.h" +#include "fd4_context.h" #include "fd4_draw.h" #include "fd4_emit.h" #include "fd4_gmem.h" @@ -38,25 +38,24 @@ #include "fd4_zsa.h" static void -fd4_context_destroy(struct pipe_context *pctx) - in_dt +fd4_context_destroy(struct pipe_context *pctx) in_dt { - struct fd4_context *fd4_ctx = fd4_context(fd_context(pctx)); + struct fd4_context *fd4_ctx = fd4_context(fd_context(pctx)); - u_upload_destroy(fd4_ctx->border_color_uploader); - pipe_resource_reference(&fd4_ctx->border_color_buf, NULL); + u_upload_destroy(fd4_ctx->border_color_uploader); + pipe_resource_reference(&fd4_ctx->border_color_buf, NULL); - fd_context_destroy(pctx); + fd_context_destroy(pctx); - fd_bo_del(fd4_ctx->vs_pvt_mem); - fd_bo_del(fd4_ctx->fs_pvt_mem); - fd_bo_del(fd4_ctx->vsc_size_mem); + fd_bo_del(fd4_ctx->vs_pvt_mem); + fd_bo_del(fd4_ctx->fs_pvt_mem); + fd_bo_del(fd4_ctx->vsc_size_mem); - fd_context_cleanup_common_vbos(&fd4_ctx->base); + fd_context_cleanup_common_vbos(&fd4_ctx->base); - fd_hw_query_fini(pctx); + fd_hw_query_fini(pctx); - free(fd4_ctx); + free(fd4_ctx); } /* clang-format off */ @@ -73,55 +72,55 @@ static const uint8_t primtypes[] = { /* clang-format on */ struct pipe_context * -fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) - in_dt +fd4_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags) in_dt { - struct fd_screen *screen = fd_screen(pscreen); - struct fd4_context *fd4_ctx = 
CALLOC_STRUCT(fd4_context); - struct pipe_context *pctx; + struct fd_screen *screen = fd_screen(pscreen); + struct fd4_context *fd4_ctx = CALLOC_STRUCT(fd4_context); + struct pipe_context *pctx; - if (!fd4_ctx) - return NULL; + if (!fd4_ctx) + return NULL; - pctx = &fd4_ctx->base.base; - pctx->screen = pscreen; + pctx = &fd4_ctx->base.base; + pctx->screen = pscreen; - fd4_ctx->base.dev = fd_device_ref(screen->dev); - fd4_ctx->base.screen = fd_screen(pscreen); - fd4_ctx->base.last.key = &fd4_ctx->last_key; + fd4_ctx->base.dev = fd_device_ref(screen->dev); + fd4_ctx->base.screen = fd_screen(pscreen); + fd4_ctx->base.last.key = &fd4_ctx->last_key; - pctx->destroy = fd4_context_destroy; - pctx->create_blend_state = fd4_blend_state_create; - pctx->create_rasterizer_state = fd4_rasterizer_state_create; - pctx->create_depth_stencil_alpha_state = fd4_zsa_state_create; + pctx->destroy = fd4_context_destroy; + pctx->create_blend_state = fd4_blend_state_create; + pctx->create_rasterizer_state = fd4_rasterizer_state_create; + pctx->create_depth_stencil_alpha_state = fd4_zsa_state_create; - fd4_draw_init(pctx); - fd4_gmem_init(pctx); - fd4_texture_init(pctx); - fd4_prog_init(pctx); - fd4_emit_init(pctx); + fd4_draw_init(pctx); + fd4_gmem_init(pctx); + fd4_texture_init(pctx); + fd4_prog_init(pctx); + fd4_emit_init(pctx); - pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv, flags); - if (!pctx) - return NULL; + pctx = fd_context_init(&fd4_ctx->base, pscreen, primtypes, priv, flags); + if (!pctx) + return NULL; - fd_hw_query_init(pctx); + fd_hw_query_init(pctx); - fd4_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vs_pvt"); + fd4_ctx->vs_pvt_mem = + fd_bo_new(screen->dev, 0x2000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vs_pvt"); - fd4_ctx->fs_pvt_mem = fd_bo_new(screen->dev, 0x2000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "fs_pvt"); + fd4_ctx->fs_pvt_mem = + fd_bo_new(screen->dev, 0x2000, DRM_FREEDRENO_GEM_TYPE_KMEM, "fs_pvt"); - fd4_ctx->vsc_size_mem = fd_bo_new(screen->dev, 0x1000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_size"); + fd4_ctx->vsc_size_mem = + fd_bo_new(screen->dev, 0x1000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_size"); - fd_context_setup_common_vbos(&fd4_ctx->base); + fd_context_setup_common_vbos(&fd4_ctx->base); - fd4_query_context_init(pctx); + fd4_query_context_init(pctx); - fd4_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0, - PIPE_USAGE_STREAM, 0); + fd4_ctx->border_color_uploader = + u_upload_create(pctx, 4096, 0, PIPE_USAGE_STREAM, 0); - return pctx; + return pctx; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h index 95d0402..7cf0b00 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h @@ -34,34 +34,34 @@ #include "ir3/ir3_shader.h" struct fd4_context { - struct fd_context base; + struct fd_context base; - struct fd_bo *vs_pvt_mem, *fs_pvt_mem; + struct fd_bo *vs_pvt_mem, *fs_pvt_mem; - /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We - * could combine it with another allocation. - * - * (upper area used as scratch bo.. see fd4_query) - */ - struct fd_bo *vsc_size_mem; + /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We + * could combine it with another allocation. + * + * (upper area used as scratch bo.. 
see fd4_query) + */ + struct fd_bo *vsc_size_mem; - struct u_upload_mgr *border_color_uploader; - struct pipe_resource *border_color_buf; + struct u_upload_mgr *border_color_uploader; + struct pipe_resource *border_color_buf; - /* bitmask of samplers which need astc srgb workaround: */ - uint16_t vastc_srgb, fastc_srgb; + /* bitmask of samplers which need astc srgb workaround: */ + uint16_t vastc_srgb, fastc_srgb; - /* storage for ctx->last.key: */ - struct ir3_shader_key last_key; + /* storage for ctx->last.key: */ + struct ir3_shader_key last_key; }; static inline struct fd4_context * fd4_context(struct fd_context *ctx) { - return (struct fd4_context *)ctx; + return (struct fd4_context *)ctx; } -struct pipe_context * -fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); +struct pipe_context *fd4_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags); #endif /* FD4_CONTEXT_H_ */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index 37b24c1..57bdea8 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -25,150 +25,148 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" #include "util/u_prim.h" +#include "util/u_string.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd4_draw.h" #include "fd4_context.h" +#include "fd4_draw.h" #include "fd4_emit.h" -#include "fd4_program.h" #include "fd4_format.h" +#include "fd4_program.h" #include "fd4_zsa.h" - static void draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd4_emit *emit, unsigned index_offset) - assert_dt + struct fd4_emit *emit, unsigned index_offset) assert_dt { - const struct pipe_draw_info *info = emit->info; - enum pc_di_primtype primtype = ctx->primtypes[info->mode]; + const struct pipe_draw_info *info = emit->info; + enum pc_di_primtype primtype = ctx->primtypes[info->mode]; - fd4_emit_state(ctx, ring, emit); + fd4_emit_state(ctx, ring, emit); - if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) - fd4_emit_vertex_bufs(ring, emit); + if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) + fd4_emit_vertex_bufs(ring, emit); - OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2); - OUT_RING(ring, info->index_size ? info->index_bias : emit->draw->start); /* VFD_INDEX_OFFSET */ - OUT_RING(ring, info->start_instance); /* ??? UNKNOWN_2209 */ + OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2); + OUT_RING(ring, info->index_size ? info->index_bias + : emit->draw->start); /* VFD_INDEX_OFFSET */ + OUT_RING(ring, info->start_instance); /* ??? UNKNOWN_2209 */ - OUT_PKT0(ring, REG_A4XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ - info->restart_index : 0xffffffff); + OUT_PKT0(ring, REG_A4XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ + info->restart_index + : 0xffffffff); - /* points + psize -> spritelist: */ - if (ctx->rasterizer->point_size_per_vertex && - fd4_emit_get_vp(emit)->writes_psize && - (info->mode == PIPE_PRIM_POINTS)) - primtype = DI_PT_POINTLIST_PSIZE; + /* points + psize -> spritelist: */ + if (ctx->rasterizer->point_size_per_vertex && + fd4_emit_get_vp(emit)->writes_psize && (info->mode == PIPE_PRIM_POINTS)) + primtype = DI_PT_POINTLIST_PSIZE; - fd4_draw_emit(ctx->batch, ring, primtype, - emit->binning_pass ? 
IGNORE_VISIBILITY : USE_VISIBILITY, - info, emit->indirect, emit->draw, index_offset); + fd4_draw_emit(ctx->batch, ring, primtype, + emit->binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY, info, + emit->indirect, emit->draw, index_offset); } static bool fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count *draw, - unsigned index_offset) - in_dt + unsigned index_offset) in_dt { - struct fd4_context *fd4_ctx = fd4_context(ctx); - struct fd4_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->vtx, - .info = info, - .indirect = indirect, - .draw = draw, - .key = { - .vs = ctx->prog.vs, - .fs = ctx->prog.fs, - .key = { - .rasterflat = ctx->rasterizer->flatshade, - .ucp_enables = ctx->rasterizer->clip_plane_enable, - .has_per_samp = fd4_ctx->fastc_srgb || fd4_ctx->vastc_srgb, - .vastc_srgb = fd4_ctx->vastc_srgb, - .fastc_srgb = fd4_ctx->fastc_srgb, - }, - }, - .rasterflat = ctx->rasterizer->flatshade, - .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, - .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, - }; - - if (info->mode != PIPE_PRIM_MAX && - !indirect && - !info->primitive_restart && - !u_trim_pipe_prim(info->mode, (unsigned*)&draw->count)) - return false; - - ir3_fixup_shader_state(&ctx->base, &emit.key.key); - - enum fd_dirty_3d_state dirty = ctx->dirty; - - emit.prog = fd4_program_state(ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); - - /* bail if compile failed: */ - if (!emit.prog) - return false; - - const struct ir3_shader_variant *vp = fd4_emit_get_vp(&emit); - const struct ir3_shader_variant *fp = fd4_emit_get_fp(&emit); - - ir3_update_max_tf_vtx(ctx, vp); - - /* do regular pass first: */ - - if (unlikely(ctx->stats_users > 0)) { - ctx->stats.vs_regs += ir3_shader_halfregs(vp); - ctx->stats.fs_regs += ir3_shader_halfregs(fp); - } - - emit.binning_pass = false; - emit.dirty = dirty; - - struct fd_ringbuffer *ring = ctx->batch->draw; - - if (ctx->rasterizer->rasterizer_discard) { - fd_wfi(ctx->batch, ring); - OUT_PKT3(ring, CP_REG_RMW, 3); - OUT_RING(ring, REG_A4XX_RB_RENDER_CONTROL); - OUT_RING(ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); - OUT_RING(ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); - } - - draw_impl(ctx, ctx->batch->draw, &emit, index_offset); - - if (ctx->rasterizer->rasterizer_discard) { - fd_wfi(ctx->batch, ring); - OUT_PKT3(ring, CP_REG_RMW, 3); - OUT_RING(ring, REG_A4XX_RB_RENDER_CONTROL); - OUT_RING(ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); - OUT_RING(ring, 0); - } - - /* and now binning pass: */ - emit.binning_pass = true; - emit.dirty = dirty & ~(FD_DIRTY_BLEND); - emit.vs = NULL; /* we changed key so need to refetch vs */ - emit.fs = NULL; - draw_impl(ctx, ctx->batch->binning, &emit, index_offset); - - fd_context_all_clean(ctx); - - return true; + struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd4_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->vtx, + .info = info, + .indirect = indirect, + .draw = draw, + .key = + { + .vs = ctx->prog.vs, + .fs = ctx->prog.fs, + .key = + { + .rasterflat = ctx->rasterizer->flatshade, + .ucp_enables = ctx->rasterizer->clip_plane_enable, + .has_per_samp = fd4_ctx->fastc_srgb || fd4_ctx->vastc_srgb, + .vastc_srgb = fd4_ctx->vastc_srgb, + .fastc_srgb = fd4_ctx->fastc_srgb, + }, + }, + .rasterflat = ctx->rasterizer->flatshade, + .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, + .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, + }; + + if 
(info->mode != PIPE_PRIM_MAX && !indirect && !info->primitive_restart && + !u_trim_pipe_prim(info->mode, (unsigned *)&draw->count)) + return false; + + ir3_fixup_shader_state(&ctx->base, &emit.key.key); + + enum fd_dirty_3d_state dirty = ctx->dirty; + + emit.prog = fd4_program_state( + ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); + + /* bail if compile failed: */ + if (!emit.prog) + return false; + + const struct ir3_shader_variant *vp = fd4_emit_get_vp(&emit); + const struct ir3_shader_variant *fp = fd4_emit_get_fp(&emit); + + ir3_update_max_tf_vtx(ctx, vp); + + /* do regular pass first: */ + + if (unlikely(ctx->stats_users > 0)) { + ctx->stats.vs_regs += ir3_shader_halfregs(vp); + ctx->stats.fs_regs += ir3_shader_halfregs(fp); + } + + emit.binning_pass = false; + emit.dirty = dirty; + + struct fd_ringbuffer *ring = ctx->batch->draw; + + if (ctx->rasterizer->rasterizer_discard) { + fd_wfi(ctx->batch, ring); + OUT_PKT3(ring, CP_REG_RMW, 3); + OUT_RING(ring, REG_A4XX_RB_RENDER_CONTROL); + OUT_RING(ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + OUT_RING(ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + } + + draw_impl(ctx, ctx->batch->draw, &emit, index_offset); + + if (ctx->rasterizer->rasterizer_discard) { + fd_wfi(ctx->batch, ring); + OUT_PKT3(ring, CP_REG_RMW, 3); + OUT_RING(ring, REG_A4XX_RB_RENDER_CONTROL); + OUT_RING(ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + OUT_RING(ring, 0); + } + + /* and now binning pass: */ + emit.binning_pass = true; + emit.dirty = dirty & ~(FD_DIRTY_BLEND); + emit.vs = NULL; /* we changed key so need to refetch vs */ + emit.fs = NULL; + draw_impl(ctx, ctx->batch->binning, &emit, index_offset); + + fd_context_all_clean(ctx); + + return true; } void -fd4_draw_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd4_draw_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->draw_vbo = fd4_draw_vbo; + struct fd_context *ctx = fd_context(pctx); + ctx->draw_vbo = fd4_draw_vbo; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h index d7520e1..29e8470 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h @@ -35,118 +35,114 @@ void fd4_draw_init(struct pipe_context *pctx); /* draw packet changed on a4xx, so cannot reuse one from a2xx/a3xx.. 
*/ -static inline uint32_t DRAW4(enum pc_di_primtype prim_type, - enum pc_di_src_sel source_select, enum a4xx_index_size index_size, - enum pc_di_vis_cull_mode vis_cull_mode) +static inline uint32_t +DRAW4(enum pc_di_primtype prim_type, enum pc_di_src_sel source_select, + enum a4xx_index_size index_size, enum pc_di_vis_cull_mode vis_cull_mode) { - return CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(prim_type) | - CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(source_select) | - CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) | - CP_DRAW_INDX_OFFSET_0_VIS_CULL(vis_cull_mode); + return CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(prim_type) | + CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(source_select) | + CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) | + CP_DRAW_INDX_OFFSET_0_VIS_CULL(vis_cull_mode); } static inline void fd4_draw(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum pc_di_primtype primtype, - enum pc_di_vis_cull_mode vismode, - enum pc_di_src_sel src_sel, uint32_t count, - uint32_t instances, enum a4xx_index_size idx_type, - uint32_t max_indices, uint32_t idx_offset, - struct pipe_resource *idx_buffer) + enum pc_di_primtype primtype, enum pc_di_vis_cull_mode vismode, + enum pc_di_src_sel src_sel, uint32_t count, uint32_t instances, + enum a4xx_index_size idx_type, uint32_t max_indices, + uint32_t idx_offset, struct pipe_resource *idx_buffer) { - /* for debug after a lock up, write a unique counter value - * to scratch7 for each draw, to make it easier to match up - * register dumps to cmdstream. The combination of IB - * (scratch6) and DRAW is enough to "triangulate" the - * particular draw that caused lockup. - */ - emit_marker(ring, 7); - - OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, idx_buffer ? 6 : 3); - if (vismode == USE_VISIBILITY) { - /* leave vis mode blank for now, it will be patched up when - * we know if we are binning or not - */ - OUT_RINGP(ring, DRAW4(primtype, src_sel, idx_type, 0), - &batch->draw_patches); - } else { - OUT_RING(ring, DRAW4(primtype, src_sel, idx_type, vismode)); - } - OUT_RING(ring, instances); /* NumInstances */ - OUT_RING(ring, count); /* NumIndices */ - if (idx_buffer) { - OUT_RING(ring, 0x0); /* XXX */ - OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0); - OUT_RING (ring, max_indices); - } - - emit_marker(ring, 7); - - fd_reset_wfi(batch); + /* for debug after a lock up, write a unique counter value + * to scratch7 for each draw, to make it easier to match up + * register dumps to cmdstream. The combination of IB + * (scratch6) and DRAW is enough to "triangulate" the + * particular draw that caused lockup. + */ + emit_marker(ring, 7); + + OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, idx_buffer ? 
6 : 3); + if (vismode == USE_VISIBILITY) { + /* leave vis mode blank for now, it will be patched up when + * we know if we are binning or not + */ + OUT_RINGP(ring, DRAW4(primtype, src_sel, idx_type, 0), + &batch->draw_patches); + } else { + OUT_RING(ring, DRAW4(primtype, src_sel, idx_type, vismode)); + } + OUT_RING(ring, instances); /* NumInstances */ + OUT_RING(ring, count); /* NumIndices */ + if (idx_buffer) { + OUT_RING(ring, 0x0); /* XXX */ + OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0); + OUT_RING(ring, max_indices); + } + + emit_marker(ring, 7); + + fd_reset_wfi(batch); } static inline void fd4_draw_emit(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum pc_di_primtype primtype, - enum pc_di_vis_cull_mode vismode, - const struct pipe_draw_info *info, + enum pc_di_primtype primtype, enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draw, - unsigned index_offset) + const struct pipe_draw_start_count *draw, unsigned index_offset) { - struct pipe_resource *idx_buffer = NULL; - enum a4xx_index_size idx_type; - enum pc_di_src_sel src_sel; - uint32_t idx_size, idx_offset; - - if (indirect && indirect->buffer) { - struct fd_resource *ind = fd_resource(indirect->buffer); - - emit_marker(ring, 7); - - if (info->index_size) { - struct pipe_resource *idx = info->index.resource; - - OUT_PKT3(ring, CP_DRAW_INDX_INDIRECT, 4); - OUT_RINGP(ring, DRAW4(primtype, DI_SRC_SEL_DMA, - fd4_size2indextype(info->index_size), 0), - &batch->draw_patches); - OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0); - OUT_RING(ring, A4XX_CP_DRAW_INDX_INDIRECT_2_INDX_SIZE( - idx->width0 - index_offset)); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - } else { - OUT_PKT3(ring, CP_DRAW_INDIRECT, 2); - OUT_RINGP(ring, DRAW4(primtype, DI_SRC_SEL_AUTO_INDEX, 0, 0), - &batch->draw_patches); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - } - - emit_marker(ring, 7); - fd_reset_wfi(batch); - - return; - } - - if (info->index_size) { - assert(!info->has_user_indices); - - idx_buffer = info->index.resource; - idx_type = fd4_size2indextype(info->index_size); - idx_size = info->index_size * draw->count; - idx_offset = index_offset + draw->start * info->index_size; - src_sel = DI_SRC_SEL_DMA; - } else { - idx_buffer = NULL; - idx_type = INDEX4_SIZE_32_BIT; - idx_size = 0; - idx_offset = 0; - src_sel = DI_SRC_SEL_AUTO_INDEX; - } - - fd4_draw(batch, ring, primtype, vismode, src_sel, - draw->count, info->instance_count, - idx_type, idx_size, idx_offset, idx_buffer); + struct pipe_resource *idx_buffer = NULL; + enum a4xx_index_size idx_type; + enum pc_di_src_sel src_sel; + uint32_t idx_size, idx_offset; + + if (indirect && indirect->buffer) { + struct fd_resource *ind = fd_resource(indirect->buffer); + + emit_marker(ring, 7); + + if (info->index_size) { + struct pipe_resource *idx = info->index.resource; + + OUT_PKT3(ring, CP_DRAW_INDX_INDIRECT, 4); + OUT_RINGP(ring, + DRAW4(primtype, DI_SRC_SEL_DMA, + fd4_size2indextype(info->index_size), 0), + &batch->draw_patches); + OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0); + OUT_RING(ring, A4XX_CP_DRAW_INDX_INDIRECT_2_INDX_SIZE(idx->width0 - + index_offset)); + OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); + } else { + OUT_PKT3(ring, CP_DRAW_INDIRECT, 2); + OUT_RINGP(ring, DRAW4(primtype, DI_SRC_SEL_AUTO_INDEX, 0, 0), + &batch->draw_patches); + OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); + } + + emit_marker(ring, 7); + 
fd_reset_wfi(batch); + + return; + } + + if (info->index_size) { + assert(!info->has_user_indices); + + idx_buffer = info->index.resource; + idx_type = fd4_size2indextype(info->index_size); + idx_size = info->index_size * draw->count; + idx_offset = index_offset + draw->start * info->index_size; + src_sel = DI_SRC_SEL_DMA; + } else { + idx_buffer = NULL; + idx_type = INDEX4_SIZE_32_BIT; + idx_size = 0; + idx_offset = 0; + src_sel = DI_SRC_SEL_AUTO_INDEX; + } + + fd4_draw(batch, ring, primtype, vismode, src_sel, draw->count, + info->instance_count, idx_type, idx_size, idx_offset, idx_buffer); } #endif /* FD4_DRAW_H_ */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index dee4920..e9f71cd 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -25,26 +25,26 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_helpers.h" #include "util/format/u_format.h" +#include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "util/u_viewport.h" -#include "freedreno_resource.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" -#include "fd4_emit.h" #include "fd4_blend.h" #include "fd4_context.h" +#include "fd4_emit.h" +#include "fd4_format.h" #include "fd4_program.h" #include "fd4_rasterizer.h" #include "fd4_texture.h" -#include "fd4_format.h" #include "fd4_zsa.h" #define emit_const_user fd4_emit_const_user -#define emit_const_bo fd4_emit_const_bo +#define emit_const_bo fd4_emit_const_bo #include "ir3_const.h" /* regid: base const register @@ -53,214 +53,211 @@ */ static void fd4_emit_const_user(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, uint32_t sizedwords, - const uint32_t *dwords) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t sizedwords, const uint32_t *dwords) { - emit_const_asserts(ring, v, regid, sizedwords); - - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sizedwords); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | - CP_LOAD_STATE4_0_NUM_UNIT(sizedwords/4)); - OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); - for (int i = 0; i < sizedwords; i++) - OUT_RING(ring, dwords[i]); + emit_const_asserts(ring, v, regid, sizedwords); + + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sizedwords); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid / 4) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | + CP_LOAD_STATE4_0_NUM_UNIT(sizedwords / 4)); + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); + for (int i = 0; i < sizedwords; i++) + OUT_RING(ring, dwords[i]); } static void -fd4_emit_const_bo(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, - uint32_t regid, uint32_t offset, uint32_t sizedwords, - struct fd_bo *bo) +fd4_emit_const_bo(struct fd_ringbuffer *ring, + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t offset, uint32_t sizedwords, struct fd_bo *bo) { - uint32_t dst_off = regid / 4; - assert(dst_off % 4 == 0); - uint32_t num_unit = sizedwords / 4; - assert(num_unit % 4 == 0); - - emit_const_asserts(ring, v, regid, sizedwords); - - OUT_PKT3(ring, CP_LOAD_STATE4, 2); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(dst_off) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_INDIRECT) | 
- CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | - CP_LOAD_STATE4_0_NUM_UNIT(num_unit)); - OUT_RELOC(ring, bo, offset, - CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS), 0); + uint32_t dst_off = regid / 4; + assert(dst_off % 4 == 0); + uint32_t num_unit = sizedwords / 4; + assert(num_unit % 4 == 0); + + emit_const_asserts(ring, v, regid, sizedwords); + + OUT_PKT3(ring, CP_LOAD_STATE4, 2); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(dst_off) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_INDIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | + CP_LOAD_STATE4_0_NUM_UNIT(num_unit)); + OUT_RELOC(ring, bo, offset, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS), 0); } static void fd4_emit_const_ptrs(struct fd_ringbuffer *ring, gl_shader_stage type, - uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets) + uint32_t regid, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - uint32_t anum = align(num, 4); - uint32_t i; - - debug_assert((regid % 4) == 0); - - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + anum); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) | - CP_LOAD_STATE4_0_NUM_UNIT(anum/4)); - OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); - - for (i = 0; i < num; i++) { - if (bos[i]) { - OUT_RELOC(ring, bos[i], offsets[i], 0, 0); - } else { - OUT_RING(ring, 0xbad00000 | (i << 16)); - } - } - - for (; i < anum; i++) - OUT_RING(ring, 0xffffffff); + uint32_t anum = align(num, 4); + uint32_t i; + + debug_assert((regid % 4) == 0); + + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + anum); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid / 4) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) | + CP_LOAD_STATE4_0_NUM_UNIT(anum / 4)); + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); + + for (i = 0; i < num; i++) { + if (bos[i]) { + OUT_RELOC(ring, bos[i], offsets[i], 0, 0); + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); + } + } + + for (; i < anum; i++) + OUT_RING(ring, 0xffffffff); } static bool is_stateobj(struct fd_ringbuffer *ring) { - return false; + return false; } static void -emit_const_ptrs(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t dst_offset, - uint32_t num, struct fd_bo **bos, uint32_t *offsets) +emit_const_ptrs(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, + uint32_t dst_offset, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - /* TODO inline this */ - assert(dst_offset + num <= v->constlen * 4); - fd4_emit_const_ptrs(ring, v->type, dst_offset, num, bos, offsets); + /* TODO inline this */ + assert(dst_offset + num <= v->constlen * 4); + fd4_emit_const_ptrs(ring, v->type, dst_offset, num, bos, offsets); } static void emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum a4xx_state_block sb, struct fd_texture_stateobj *tex, - const struct ir3_shader_variant *v) + enum a4xx_state_block sb, struct fd_texture_stateobj *tex, + const struct ir3_shader_variant *v) { - static const uint32_t bcolor_reg[] = { - [SB4_VS_TEX] = REG_A4XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, - [SB4_FS_TEX] = REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, - }; - struct fd4_context *fd4_ctx = fd4_context(ctx); - bool needs_border = false; - unsigned i; - - if (tex->num_samplers > 0) { - int num_samplers; - - /* not sure if this is an a420.0 workaround, but we seem - * to need to emit 
these in pairs.. emit a final dummy - * entry if odd # of samplers: - */ - num_samplers = align(tex->num_samplers, 2); - - /* output sampler state: */ - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * num_samplers)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(num_samplers)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - for (i = 0; i < tex->num_samplers; i++) { - static const struct fd4_sampler_stateobj dummy_sampler = {}; - const struct fd4_sampler_stateobj *sampler = tex->samplers[i] ? - fd4_sampler_stateobj(tex->samplers[i]) : - &dummy_sampler; - OUT_RING(ring, sampler->texsamp0); - OUT_RING(ring, sampler->texsamp1); - - needs_border |= sampler->needs_border; - } - - for (; i < num_samplers; i++) { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - } - - if (tex->num_textures > 0) { - unsigned num_textures = tex->num_textures + v->astc_srgb.count; - - /* emit texture state: */ - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (8 * num_textures)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(num_textures)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - for (i = 0; i < tex->num_textures; i++) { - static const struct fd4_pipe_sampler_view dummy_view = {}; - const struct fd4_pipe_sampler_view *view = tex->textures[i] ? - fd4_pipe_sampler_view(tex->textures[i]) : - &dummy_view; - - OUT_RING(ring, view->texconst0); - OUT_RING(ring, view->texconst1); - OUT_RING(ring, view->texconst2); - OUT_RING(ring, view->texconst3); - if (view->base.texture) { - struct fd_resource *rsc = fd_resource(view->base.texture); - if (view->base.format == PIPE_FORMAT_X32_S8X24_UINT) - rsc = rsc->stencil; - OUT_RELOC(ring, rsc->bo, view->offset, view->texconst4, 0); - } else { - OUT_RING(ring, 0x00000000); - } - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - - for (i = 0; i < v->astc_srgb.count; i++) { - static const struct fd4_pipe_sampler_view dummy_view = {}; - const struct fd4_pipe_sampler_view *view; - unsigned idx = v->astc_srgb.orig_idx[i]; - - view = tex->textures[idx] ? 
- fd4_pipe_sampler_view(tex->textures[idx]) : - &dummy_view; - - debug_assert(view->texconst0 & A4XX_TEX_CONST_0_SRGB); - - OUT_RING(ring, view->texconst0 & ~A4XX_TEX_CONST_0_SRGB); - OUT_RING(ring, view->texconst1); - OUT_RING(ring, view->texconst2); - OUT_RING(ring, view->texconst3); - if (view->base.texture) { - struct fd_resource *rsc = fd_resource(view->base.texture); - OUT_RELOC(ring, rsc->bo, view->offset, view->texconst4, 0); - } else { - OUT_RING(ring, 0x00000000); - } - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - } else { - debug_assert(v->astc_srgb.count == 0); - } - - if (needs_border) { - unsigned off; - void *ptr; - - u_upload_alloc(fd4_ctx->border_color_uploader, - 0, BORDER_COLOR_UPLOAD_SIZE, - BORDER_COLOR_UPLOAD_SIZE, &off, - &fd4_ctx->border_color_buf, - &ptr); - - fd_setup_border_colors(tex, ptr, 0); - OUT_PKT0(ring, bcolor_reg[sb], 1); - OUT_RELOC(ring, fd_resource(fd4_ctx->border_color_buf)->bo, off, 0, 0); - - u_upload_unmap(fd4_ctx->border_color_uploader); - } + static const uint32_t bcolor_reg[] = { + [SB4_VS_TEX] = REG_A4XX_TPL1_TP_VS_BORDER_COLOR_BASE_ADDR, + [SB4_FS_TEX] = REG_A4XX_TPL1_TP_FS_BORDER_COLOR_BASE_ADDR, + }; + struct fd4_context *fd4_ctx = fd4_context(ctx); + bool needs_border = false; + unsigned i; + + if (tex->num_samplers > 0) { + int num_samplers; + + /* not sure if this is an a420.0 workaround, but we seem + * to need to emit these in pairs.. emit a final dummy + * entry if odd # of samplers: + */ + num_samplers = align(tex->num_samplers, 2); + + /* output sampler state: */ + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * num_samplers)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(num_samplers)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + for (i = 0; i < tex->num_samplers; i++) { + static const struct fd4_sampler_stateobj dummy_sampler = {}; + const struct fd4_sampler_stateobj *sampler = + tex->samplers[i] ? fd4_sampler_stateobj(tex->samplers[i]) + : &dummy_sampler; + OUT_RING(ring, sampler->texsamp0); + OUT_RING(ring, sampler->texsamp1); + + needs_border |= sampler->needs_border; + } + + for (; i < num_samplers; i++) { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } + + if (tex->num_textures > 0) { + unsigned num_textures = tex->num_textures + v->astc_srgb.count; + + /* emit texture state: */ + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (8 * num_textures)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(num_textures)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + for (i = 0; i < tex->num_textures; i++) { + static const struct fd4_pipe_sampler_view dummy_view = {}; + const struct fd4_pipe_sampler_view *view = + tex->textures[i] ? 
fd4_pipe_sampler_view(tex->textures[i]) + : &dummy_view; + + OUT_RING(ring, view->texconst0); + OUT_RING(ring, view->texconst1); + OUT_RING(ring, view->texconst2); + OUT_RING(ring, view->texconst3); + if (view->base.texture) { + struct fd_resource *rsc = fd_resource(view->base.texture); + if (view->base.format == PIPE_FORMAT_X32_S8X24_UINT) + rsc = rsc->stencil; + OUT_RELOC(ring, rsc->bo, view->offset, view->texconst4, 0); + } else { + OUT_RING(ring, 0x00000000); + } + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + + for (i = 0; i < v->astc_srgb.count; i++) { + static const struct fd4_pipe_sampler_view dummy_view = {}; + const struct fd4_pipe_sampler_view *view; + unsigned idx = v->astc_srgb.orig_idx[i]; + + view = tex->textures[idx] ? fd4_pipe_sampler_view(tex->textures[idx]) + : &dummy_view; + + debug_assert(view->texconst0 & A4XX_TEX_CONST_0_SRGB); + + OUT_RING(ring, view->texconst0 & ~A4XX_TEX_CONST_0_SRGB); + OUT_RING(ring, view->texconst1); + OUT_RING(ring, view->texconst2); + OUT_RING(ring, view->texconst3); + if (view->base.texture) { + struct fd_resource *rsc = fd_resource(view->base.texture); + OUT_RELOC(ring, rsc->bo, view->offset, view->texconst4, 0); + } else { + OUT_RING(ring, 0x00000000); + } + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } else { + debug_assert(v->astc_srgb.count == 0); + } + + if (needs_border) { + unsigned off; + void *ptr; + + u_upload_alloc(fd4_ctx->border_color_uploader, 0, + BORDER_COLOR_UPLOAD_SIZE, BORDER_COLOR_UPLOAD_SIZE, &off, + &fd4_ctx->border_color_buf, &ptr); + + fd_setup_border_colors(tex, ptr, 0); + OUT_PKT0(ring, bcolor_reg[sb], 1); + OUT_RELOC(ring, fd_resource(fd4_ctx->border_color_buf)->bo, off, 0, 0); + + u_upload_unmap(fd4_ctx->border_color_uploader); + } } /* emit texture state for mem->gmem restore operation.. eventually it would @@ -269,501 +266,509 @@ emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, */ void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs) + struct pipe_surface **bufs) { - unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS]; - int i; - - for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { - mrt_comp[i] = (i < nr_bufs) ? 
0xf : 0; - } - - /* output sampler state: */ - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * nr_bufs)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(SB4_FS_TEX) | - CP_LOAD_STATE4_0_NUM_UNIT(nr_bufs)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - for (i = 0; i < nr_bufs; i++) { - OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) | - A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) | - A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) | - A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) | - A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT)); - OUT_RING(ring, 0x00000000); - } - - /* emit texture state: */ - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (8 * nr_bufs)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(SB4_FS_TEX) | - CP_LOAD_STATE4_0_NUM_UNIT(nr_bufs)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - for (i = 0; i < nr_bufs; i++) { - if (bufs[i]) { - struct fd_resource *rsc = fd_resource(bufs[i]->texture); - enum pipe_format format = fd_gmem_restore_format(bufs[i]->format); - - /* The restore blit_zs shader expects stencil in sampler 0, - * and depth in sampler 1 - */ - if (rsc->stencil && (i == 0)) { - rsc = rsc->stencil; - format = fd_gmem_restore_format(rsc->b.b.format); - } - - /* note: PIPE_BUFFER disallowed for surfaces */ - unsigned lvl = bufs[i]->u.tex.level; - unsigned offset = fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer); - - /* z32 restore is accomplished using depth write. If there is - * no stencil component (ie. PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) - * then no render target: - * - * (The same applies for z32_s8x24, since for stencil sampler - * state the above 'if' will replace 'format' with s8) - */ - if ((format == PIPE_FORMAT_Z32_FLOAT) || - (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) - mrt_comp[i] = 0; - - debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer); - - OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | - A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | - fd4_tex_swiz(format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); - OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) | - A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height)); - OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl))); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } else { - OUT_RING(ring, A4XX_TEX_CONST_0_FMT(0) | - A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | - A4XX_TEX_CONST_0_SWIZ_X(A4XX_TEX_ONE) | - A4XX_TEX_CONST_0_SWIZ_Y(A4XX_TEX_ONE) | - A4XX_TEX_CONST_0_SWIZ_Z(A4XX_TEX_ONE) | - A4XX_TEX_CONST_0_SWIZ_W(A4XX_TEX_ONE)); - OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(0) | - A4XX_TEX_CONST_1_HEIGHT(0)); - OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(0)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - } - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - 
A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS]; + int i; + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = (i < nr_bufs) ? 0xf : 0; + } + + /* output sampler state: */ + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (2 * nr_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(SB4_FS_TEX) | + CP_LOAD_STATE4_0_NUM_UNIT(nr_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + for (i = 0; i < nr_bufs; i++) { + OUT_RING(ring, A4XX_TEX_SAMP_0_XY_MAG(A4XX_TEX_NEAREST) | + A4XX_TEX_SAMP_0_XY_MIN(A4XX_TEX_NEAREST) | + A4XX_TEX_SAMP_0_WRAP_S(A4XX_TEX_CLAMP_TO_EDGE) | + A4XX_TEX_SAMP_0_WRAP_T(A4XX_TEX_CLAMP_TO_EDGE) | + A4XX_TEX_SAMP_0_WRAP_R(A4XX_TEX_REPEAT)); + OUT_RING(ring, 0x00000000); + } + + /* emit texture state: */ + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + (8 * nr_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(SB4_FS_TEX) | + CP_LOAD_STATE4_0_NUM_UNIT(nr_bufs)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + for (i = 0; i < nr_bufs; i++) { + if (bufs[i]) { + struct fd_resource *rsc = fd_resource(bufs[i]->texture); + enum pipe_format format = fd_gmem_restore_format(bufs[i]->format); + + /* The restore blit_zs shader expects stencil in sampler 0, + * and depth in sampler 1 + */ + if (rsc->stencil && (i == 0)) { + rsc = rsc->stencil; + format = fd_gmem_restore_format(rsc->b.b.format); + } + + /* note: PIPE_BUFFER disallowed for surfaces */ + unsigned lvl = bufs[i]->u.tex.level; + unsigned offset = + fd_resource_offset(rsc, lvl, bufs[i]->u.tex.first_layer); + + /* z32 restore is accomplished using depth write. If there is + * no stencil component (ie. 
PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) + * then no render target: + * + * (The same applies for z32_s8x24, since for stencil sampler + * state the above 'if' will replace 'format' with s8) + */ + if ((format == PIPE_FORMAT_Z32_FLOAT) || + (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) + mrt_comp[i] = 0; + + debug_assert(bufs[i]->u.tex.first_layer == bufs[i]->u.tex.last_layer); + + OUT_RING(ring, A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | + A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | + fd4_tex_swiz(format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); + OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(bufs[i]->width) | + A4XX_TEX_CONST_1_HEIGHT(bufs[i]->height)); + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl))); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } else { + OUT_RING(ring, A4XX_TEX_CONST_0_FMT(0) | + A4XX_TEX_CONST_0_TYPE(A4XX_TEX_2D) | + A4XX_TEX_CONST_0_SWIZ_X(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_Y(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_Z(A4XX_TEX_ONE) | + A4XX_TEX_CONST_0_SWIZ_W(A4XX_TEX_ONE)); + OUT_RING(ring, A4XX_TEX_CONST_1_WIDTH(0) | A4XX_TEX_CONST_1_HEIGHT(0)); + OUT_RING(ring, A4XX_TEX_CONST_2_PITCH(0)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); } void fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) { - int32_t i, j, last = -1; - uint32_t total_in = 0; - const struct fd_vertex_state *vtx = emit->vtx; - const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); - unsigned vertex_regid = regid(63, 0); - unsigned instance_regid = regid(63, 0); - unsigned vtxcnt_regid = regid(63, 0); - - /* Note that sysvals come *after* normal inputs: */ - for (i = 0; i < vp->inputs_count; i++) { - if (!vp->inputs[i].compmask) - continue; - if (vp->inputs[i].sysval) { - switch(vp->inputs[i].slot) { - case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: - vertex_regid = vp->inputs[i].regid; - break; - case SYSTEM_VALUE_INSTANCE_ID: - instance_regid = vp->inputs[i].regid; - break; - case SYSTEM_VALUE_VERTEX_CNT: - vtxcnt_regid = vp->inputs[i].regid; - break; - default: - unreachable("invalid system value"); - break; - } - } else if (i < vtx->vtx->num_elements) { - last = i; - } - } - - for (i = 0, j = 0; i <= last; i++) { - assert(!vp->inputs[i].sysval); - if (vp->inputs[i].compmask) { - struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; - const struct pipe_vertex_buffer *vb = - &vtx->vertexbuf.vb[elem->vertex_buffer_index]; - struct fd_resource *rsc = fd_resource(vb->buffer.resource); - enum pipe_format pfmt = elem->src_format; - enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt); - bool switchnext = (i != last) || - (vertex_regid != regid(63, 0)) || - (instance_regid != regid(63, 0)) || - (vtxcnt_regid != regid(63, 0)); - bool isint = util_format_is_pure_integer(pfmt); - uint32_t fs = util_format_get_blocksize(pfmt); - uint32_t off = vb->buffer_offset + elem->src_offset; - uint32_t size = 
fd_bo_size(rsc->bo) - off; - debug_assert(fmt != VFMT4_NONE); + int32_t i, j, last = -1; + uint32_t total_in = 0; + const struct fd_vertex_state *vtx = emit->vtx; + const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); + unsigned vertex_regid = regid(63, 0); + unsigned instance_regid = regid(63, 0); + unsigned vtxcnt_regid = regid(63, 0); + + /* Note that sysvals come *after* normal inputs: */ + for (i = 0; i < vp->inputs_count; i++) { + if (!vp->inputs[i].compmask) + continue; + if (vp->inputs[i].sysval) { + switch (vp->inputs[i].slot) { + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: + vertex_regid = vp->inputs[i].regid; + break; + case SYSTEM_VALUE_INSTANCE_ID: + instance_regid = vp->inputs[i].regid; + break; + case SYSTEM_VALUE_VERTEX_CNT: + vtxcnt_regid = vp->inputs[i].regid; + break; + default: + unreachable("invalid system value"); + break; + } + } else if (i < vtx->vtx->num_elements) { + last = i; + } + } + + for (i = 0, j = 0; i <= last; i++) { + assert(!vp->inputs[i].sysval); + if (vp->inputs[i].compmask) { + struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; + const struct pipe_vertex_buffer *vb = + &vtx->vertexbuf.vb[elem->vertex_buffer_index]; + struct fd_resource *rsc = fd_resource(vb->buffer.resource); + enum pipe_format pfmt = elem->src_format; + enum a4xx_vtx_fmt fmt = fd4_pipe2vtx(pfmt); + bool switchnext = (i != last) || (vertex_regid != regid(63, 0)) || + (instance_regid != regid(63, 0)) || + (vtxcnt_regid != regid(63, 0)); + bool isint = util_format_is_pure_integer(pfmt); + uint32_t fs = util_format_get_blocksize(pfmt); + uint32_t off = vb->buffer_offset + elem->src_offset; + uint32_t size = fd_bo_size(rsc->bo) - off; + debug_assert(fmt != VFMT4_NONE); #ifdef DEBUG - /* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 - */ - if (off > fd_bo_size(rsc->bo)) - continue; + /* see + * dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 + */ + if (off > fd_bo_size(rsc->bo)) + continue; #endif - OUT_PKT0(ring, REG_A4XX_VFD_FETCH(j), 4); - OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) | - A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) | - COND(elem->instance_divisor, A4XX_VFD_FETCH_INSTR_0_INSTANCED) | - COND(switchnext, A4XX_VFD_FETCH_INSTR_0_SWITCHNEXT)); - OUT_RELOC(ring, rsc->bo, off, 0, 0); - OUT_RING(ring, A4XX_VFD_FETCH_INSTR_2_SIZE(size)); - OUT_RING(ring, A4XX_VFD_FETCH_INSTR_3_STEPRATE(MAX2(1, elem->instance_divisor))); - - OUT_PKT0(ring, REG_A4XX_VFD_DECODE_INSTR(j), 1); - OUT_RING(ring, A4XX_VFD_DECODE_INSTR_CONSTFILL | - A4XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) | - A4XX_VFD_DECODE_INSTR_FORMAT(fmt) | - A4XX_VFD_DECODE_INSTR_SWAP(fd4_pipe2swap(pfmt)) | - A4XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) | - A4XX_VFD_DECODE_INSTR_SHIFTCNT(fs) | - A4XX_VFD_DECODE_INSTR_LASTCOMPVALID | - COND(isint, A4XX_VFD_DECODE_INSTR_INT) | - COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT)); - - total_in += util_bitcount(vp->inputs[i].compmask); - j++; - } - } - - /* hw doesn't like to be configured for zero vbo's, it seems: */ - if (last < 0) { - /* just recycle the shader bo, we just need to point to *something* - * valid: - */ - struct fd_bo *dummy_vbo = vp->bo; - bool switchnext = (vertex_regid != regid(63, 0)) || - (instance_regid != regid(63, 0)) || - (vtxcnt_regid != regid(63, 0)); - - OUT_PKT0(ring, REG_A4XX_VFD_FETCH(0), 4); - OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(0) | - A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(0) | - COND(switchnext, 
A4XX_VFD_FETCH_INSTR_0_SWITCHNEXT)); - OUT_RELOC(ring, dummy_vbo, 0, 0, 0); - OUT_RING(ring, A4XX_VFD_FETCH_INSTR_2_SIZE(1)); - OUT_RING(ring, A4XX_VFD_FETCH_INSTR_3_STEPRATE(1)); - - OUT_PKT0(ring, REG_A4XX_VFD_DECODE_INSTR(0), 1); - OUT_RING(ring, A4XX_VFD_DECODE_INSTR_CONSTFILL | - A4XX_VFD_DECODE_INSTR_WRITEMASK(0x1) | - A4XX_VFD_DECODE_INSTR_FORMAT(VFMT4_8_UNORM) | - A4XX_VFD_DECODE_INSTR_SWAP(XYZW) | - A4XX_VFD_DECODE_INSTR_REGID(regid(0,0)) | - A4XX_VFD_DECODE_INSTR_SHIFTCNT(1) | - A4XX_VFD_DECODE_INSTR_LASTCOMPVALID | - COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT)); - - total_in = 1; - j = 1; - } - - OUT_PKT0(ring, REG_A4XX_VFD_CONTROL_0, 5); - OUT_RING(ring, A4XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) | - 0xa0000 | /* XXX */ - A4XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) | - A4XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j)); - OUT_RING(ring, A4XX_VFD_CONTROL_1_MAXSTORAGE(129) | // XXX - A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | - A4XX_VFD_CONTROL_1_REGID4INST(instance_regid)); - OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_2 */ - OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid)); - OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_4 */ - - /* cache invalidate, otherwise vertex fetch could see - * stale vbo contents: - */ - OUT_PKT0(ring, REG_A4XX_UCHE_INVALIDATE0, 2); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000012); + OUT_PKT0(ring, REG_A4XX_VFD_FETCH(j), 4); + OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) | + A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) | + COND(elem->instance_divisor, + A4XX_VFD_FETCH_INSTR_0_INSTANCED) | + COND(switchnext, A4XX_VFD_FETCH_INSTR_0_SWITCHNEXT)); + OUT_RELOC(ring, rsc->bo, off, 0, 0); + OUT_RING(ring, A4XX_VFD_FETCH_INSTR_2_SIZE(size)); + OUT_RING(ring, A4XX_VFD_FETCH_INSTR_3_STEPRATE( + MAX2(1, elem->instance_divisor))); + + OUT_PKT0(ring, REG_A4XX_VFD_DECODE_INSTR(j), 1); + OUT_RING(ring, + A4XX_VFD_DECODE_INSTR_CONSTFILL | + A4XX_VFD_DECODE_INSTR_WRITEMASK(vp->inputs[i].compmask) | + A4XX_VFD_DECODE_INSTR_FORMAT(fmt) | + A4XX_VFD_DECODE_INSTR_SWAP(fd4_pipe2swap(pfmt)) | + A4XX_VFD_DECODE_INSTR_REGID(vp->inputs[i].regid) | + A4XX_VFD_DECODE_INSTR_SHIFTCNT(fs) | + A4XX_VFD_DECODE_INSTR_LASTCOMPVALID | + COND(isint, A4XX_VFD_DECODE_INSTR_INT) | + COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT)); + + total_in += util_bitcount(vp->inputs[i].compmask); + j++; + } + } + + /* hw doesn't like to be configured for zero vbo's, it seems: */ + if (last < 0) { + /* just recycle the shader bo, we just need to point to *something* + * valid: + */ + struct fd_bo *dummy_vbo = vp->bo; + bool switchnext = (vertex_regid != regid(63, 0)) || + (instance_regid != regid(63, 0)) || + (vtxcnt_regid != regid(63, 0)); + + OUT_PKT0(ring, REG_A4XX_VFD_FETCH(0), 4); + OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(0) | + A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(0) | + COND(switchnext, A4XX_VFD_FETCH_INSTR_0_SWITCHNEXT)); + OUT_RELOC(ring, dummy_vbo, 0, 0, 0); + OUT_RING(ring, A4XX_VFD_FETCH_INSTR_2_SIZE(1)); + OUT_RING(ring, A4XX_VFD_FETCH_INSTR_3_STEPRATE(1)); + + OUT_PKT0(ring, REG_A4XX_VFD_DECODE_INSTR(0), 1); + OUT_RING(ring, A4XX_VFD_DECODE_INSTR_CONSTFILL | + A4XX_VFD_DECODE_INSTR_WRITEMASK(0x1) | + A4XX_VFD_DECODE_INSTR_FORMAT(VFMT4_8_UNORM) | + A4XX_VFD_DECODE_INSTR_SWAP(XYZW) | + A4XX_VFD_DECODE_INSTR_REGID(regid(0, 0)) | + A4XX_VFD_DECODE_INSTR_SHIFTCNT(1) | + A4XX_VFD_DECODE_INSTR_LASTCOMPVALID | + COND(switchnext, A4XX_VFD_DECODE_INSTR_SWITCHNEXT)); + + total_in = 1; + j = 1; + } + + OUT_PKT0(ring, REG_A4XX_VFD_CONTROL_0, 5); + OUT_RING(ring, 
A4XX_VFD_CONTROL_0_TOTALATTRTOVS(total_in) | + 0xa0000 | /* XXX */ + A4XX_VFD_CONTROL_0_STRMDECINSTRCNT(j) | + A4XX_VFD_CONTROL_0_STRMFETCHINSTRCNT(j)); + OUT_RING(ring, A4XX_VFD_CONTROL_1_MAXSTORAGE(129) | // XXX + A4XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | + A4XX_VFD_CONTROL_1_REGID4INST(instance_regid)); + OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_2 */ + OUT_RING(ring, A4XX_VFD_CONTROL_3_REGID_VTXCNT(vtxcnt_regid)); + OUT_RING(ring, 0x00000000); /* XXX VFD_CONTROL_4 */ + + /* cache invalidate, otherwise vertex fetch could see + * stale vbo contents: + */ + OUT_PKT0(ring, REG_A4XX_UCHE_INVALIDATE0, 2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000012); } void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd4_emit *emit) + struct fd4_emit *emit) { - const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); - const struct ir3_shader_variant *fp = fd4_emit_get_fp(emit); - const enum fd_dirty_3d_state dirty = emit->dirty; - - emit_marker(ring, 5); - - if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->binning_pass) { - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; - - for (unsigned i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { - mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0; - } - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) { - struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - uint32_t rb_alpha_control = zsa->rb_alpha_control; - - if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0]))) - rb_alpha_control &= ~A4XX_RB_ALPHA_CONTROL_ALPHA_TEST; - - OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1); - OUT_RING(ring, rb_alpha_control); - - OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2); - OUT_RING(ring, zsa->rb_stencil_control); - OUT_RING(ring, zsa->rb_stencil_control2); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { - struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); - struct pipe_stencil_ref *sr = &ctx->stencil_ref; - - OUT_PKT0(ring, REG_A4XX_RB_STENCILREFMASK, 2); - OUT_RING(ring, zsa->rb_stencilrefmask | - A4XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); - OUT_RING(ring, zsa->rb_stencilrefmask_bf | - A4XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1])); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { - struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); - bool fragz = fp->no_earlyz | fp->has_kill | fp->writes_pos; - bool clamp = !ctx->rasterizer->depth_clip_near; - - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, zsa->rb_depth_control | - COND(clamp, A4XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE) | - COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE) | - COND(fragz && fp->fragcoord_compmask != 0, - A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS)); - - /* maybe this register/bitfield needs a better name.. 
this - * appears to be just disabling early-z - */ - OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); - OUT_RING(ring, zsa->gras_alpha_control | - COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE) | - COND(fragz && fp->fragcoord_compmask != 0, - A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS)); - } - - if (dirty & FD_DIRTY_RASTERIZER) { - struct fd4_rasterizer_stateobj *rasterizer = - fd4_rasterizer_stateobj(ctx->rasterizer); - - OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1); - OUT_RING(ring, rasterizer->gras_su_mode_control | - A4XX_GRAS_SU_MODE_CONTROL_RENDERING_PASS); - - OUT_PKT0(ring, REG_A4XX_GRAS_SU_POINT_MINMAX, 2); - OUT_RING(ring, rasterizer->gras_su_point_minmax); - OUT_RING(ring, rasterizer->gras_su_point_size); - - OUT_PKT0(ring, REG_A4XX_GRAS_SU_POLY_OFFSET_SCALE, 3); - OUT_RING(ring, rasterizer->gras_su_poly_offset_scale); - OUT_RING(ring, rasterizer->gras_su_poly_offset_offset); - OUT_RING(ring, rasterizer->gras_su_poly_offset_clamp); - - OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, rasterizer->gras_cl_clip_cntl); - } - - /* NOTE: since primitive_restart is not actually part of any - * state object, we need to make sure that we always emit - * PRIM_VTX_CNTL.. either that or be more clever and detect - * when it changes. - */ - if (emit->info) { - const struct pipe_draw_info *info = emit->info; - struct fd4_rasterizer_stateobj *rast = - fd4_rasterizer_stateobj(ctx->rasterizer); - uint32_t val = rast->pc_prim_vtx_cntl; - - if (info->index_size && info->primitive_restart) - val |= A4XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART; - - val |= COND(vp->writes_psize, A4XX_PC_PRIM_VTX_CNTL_PSIZE); - - if (fp->total_in > 0) { - uint32_t varout = align(fp->total_in, 16) / 16; - if (varout > 1) - varout = align(varout, 2); - val |= A4XX_PC_PRIM_VTX_CNTL_VAROUT(varout); - } - - OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 2); - OUT_RING(ring, val); - OUT_RING(ring, rast->pc_prim_vtx_cntl2); - } - - /* NOTE: scissor enabled bit is part of rasterizer state: */ - if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) { - struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2); - OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(scissor->maxx - 1) | - A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(scissor->maxy - 1)); - OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(scissor->minx) | - A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(scissor->miny)); - - ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, scissor->minx); - ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, scissor->miny); - ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); - ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); - } - - if (dirty & FD_DIRTY_VIEWPORT) { - fd_wfi(ctx->batch, ring); - OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0(ctx->viewport.translate[0])); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0(ctx->viewport.scale[0])); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0(ctx->viewport.translate[1])); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(ctx->viewport.scale[1])); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(ctx->viewport.translate[2])); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2])); - } - - if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_FRAMEBUFFER)) { - float zmin, zmax; - int depth = 24; - if (ctx->batch->framebuffer.zsbuf) { - depth = util_format_get_component_bits( - 
pipe_surface_format(ctx->batch->framebuffer.zsbuf), - UTIL_FORMAT_COLORSPACE_ZS, 0); - } - util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz, - &zmin, &zmax); - - OUT_PKT0(ring, REG_A4XX_RB_VPORT_Z_CLAMP(0), 2); - if (depth == 32) { - OUT_RING(ring, fui(zmin)); - OUT_RING(ring, fui(zmax)); - } else if (depth == 16) { - OUT_RING(ring, (uint32_t)(zmin * 0xffff)); - OUT_RING(ring, (uint32_t)(zmax * 0xffff)); - } else { - OUT_RING(ring, (uint32_t)(zmin * 0xffffff)); - OUT_RING(ring, (uint32_t)(zmax * 0xffffff)); - } - } - - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) { - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - unsigned n = pfb->nr_cbufs; - /* if we have depth/stencil, we need at least on MRT: */ - if (pfb->zsbuf) - n = MAX2(1, n); - fd4_program_emit(ring, emit, n, pfb->cbufs); - } - - if (!emit->skip_consts) { /* evil hack to deal sanely with clear path */ - ir3_emit_vs_consts(vp, ring, ctx, emit->info, emit->indirect, emit->draw); - if (!emit->binning_pass) - ir3_emit_fs_consts(fp, ring, ctx); - } - - if ((dirty & FD_DIRTY_BLEND)) { - struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend); - uint32_t i; - - for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { - enum pipe_format format = pipe_surface_format( - ctx->batch->framebuffer.cbufs[i]); - bool is_int = util_format_is_pure_integer(format); - bool has_alpha = util_format_has_alpha(format); - uint32_t control = blend->rb_mrt[i].control; - - if (is_int) { - control &= A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; - control |= A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); - } - - if (!has_alpha) { - control &= ~A4XX_RB_MRT_CONTROL_BLEND2; - } - - OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, control); - - OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend->rb_mrt[i].blend_control); - } - - OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); - OUT_RING(ring, blend->rb_fs_output | - A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff)); - } - - if (dirty & FD_DIRTY_BLEND_COLOR) { - struct pipe_blend_color *bcolor = &ctx->blend_color; - - OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8); - OUT_RING(ring, A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]) | - A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 0xff) | - A4XX_RB_BLEND_RED_SINT(bcolor->color[0] * 0x7f)); - OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0])); - OUT_RING(ring, A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]) | - A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 0xff) | - A4XX_RB_BLEND_GREEN_SINT(bcolor->color[1] * 0x7f)); - OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[1])); - OUT_RING(ring, A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]) | - A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 0xff) | - A4XX_RB_BLEND_BLUE_SINT(bcolor->color[2] * 0x7f)); - OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2])); - OUT_RING(ring, A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]) | - A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 0xff) | - A4XX_RB_BLEND_ALPHA_SINT(bcolor->color[3] * 0x7f)); - OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); - } - - if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) - emit_textures(ctx, ring, SB4_VS_TEX, &ctx->tex[PIPE_SHADER_VERTEX], vp); - - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) - emit_textures(ctx, ring, SB4_FS_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT], fp); + const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd4_emit_get_fp(emit); + const enum fd_dirty_3d_state dirty = emit->dirty; + + emit_marker(ring, 5); + + if 
((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->binning_pass) { + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; + + for (unsigned i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0; + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) { + struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + uint32_t rb_alpha_control = zsa->rb_alpha_control; + + if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0]))) + rb_alpha_control &= ~A4XX_RB_ALPHA_CONTROL_ALPHA_TEST; + + OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1); + OUT_RING(ring, rb_alpha_control); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2); + OUT_RING(ring, zsa->rb_stencil_control); + OUT_RING(ring, zsa->rb_stencil_control2); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { + struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); + struct pipe_stencil_ref *sr = &ctx->stencil_ref; + + OUT_PKT0(ring, REG_A4XX_RB_STENCILREFMASK, 2); + OUT_RING(ring, zsa->rb_stencilrefmask | + A4XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); + OUT_RING(ring, zsa->rb_stencilrefmask_bf | + A4XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1])); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { + struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); + bool fragz = fp->no_earlyz | fp->has_kill | fp->writes_pos; + bool clamp = !ctx->rasterizer->depth_clip_near; + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, zsa->rb_depth_control | + COND(clamp, A4XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE) | + COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE) | + COND(fragz && fp->fragcoord_compmask != 0, + A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS)); + + /* maybe this register/bitfield needs a better name.. 
this + * appears to be just disabling early-z + */ + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, zsa->gras_alpha_control | + COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE) | + COND(fragz && fp->fragcoord_compmask != 0, + A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS)); + } + + if (dirty & FD_DIRTY_RASTERIZER) { + struct fd4_rasterizer_stateobj *rasterizer = + fd4_rasterizer_stateobj(ctx->rasterizer); + + OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, rasterizer->gras_su_mode_control | + A4XX_GRAS_SU_MODE_CONTROL_RENDERING_PASS); + + OUT_PKT0(ring, REG_A4XX_GRAS_SU_POINT_MINMAX, 2); + OUT_RING(ring, rasterizer->gras_su_point_minmax); + OUT_RING(ring, rasterizer->gras_su_point_size); + + OUT_PKT0(ring, REG_A4XX_GRAS_SU_POLY_OFFSET_SCALE, 3); + OUT_RING(ring, rasterizer->gras_su_poly_offset_scale); + OUT_RING(ring, rasterizer->gras_su_poly_offset_offset); + OUT_RING(ring, rasterizer->gras_su_poly_offset_clamp); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, rasterizer->gras_cl_clip_cntl); + } + + /* NOTE: since primitive_restart is not actually part of any + * state object, we need to make sure that we always emit + * PRIM_VTX_CNTL.. either that or be more clever and detect + * when it changes. + */ + if (emit->info) { + const struct pipe_draw_info *info = emit->info; + struct fd4_rasterizer_stateobj *rast = + fd4_rasterizer_stateobj(ctx->rasterizer); + uint32_t val = rast->pc_prim_vtx_cntl; + + if (info->index_size && info->primitive_restart) + val |= A4XX_PC_PRIM_VTX_CNTL_PRIMITIVE_RESTART; + + val |= COND(vp->writes_psize, A4XX_PC_PRIM_VTX_CNTL_PSIZE); + + if (fp->total_in > 0) { + uint32_t varout = align(fp->total_in, 16) / 16; + if (varout > 1) + varout = align(varout, 2); + val |= A4XX_PC_PRIM_VTX_CNTL_VAROUT(varout); + } + + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 2); + OUT_RING(ring, val); + OUT_RING(ring, rast->pc_prim_vtx_cntl2); + } + + /* NOTE: scissor enabled bit is part of rasterizer state: */ + if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) { + struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2); + OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(scissor->maxx - 1) | + A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(scissor->maxy - 1)); + OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(scissor->minx) | + A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(scissor->miny)); + + ctx->batch->max_scissor.minx = + MIN2(ctx->batch->max_scissor.minx, scissor->minx); + ctx->batch->max_scissor.miny = + MIN2(ctx->batch->max_scissor.miny, scissor->miny); + ctx->batch->max_scissor.maxx = + MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); + ctx->batch->max_scissor.maxy = + MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + fd_wfi(ctx->batch, ring); + OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0(ctx->viewport.translate[0])); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0(ctx->viewport.scale[0])); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0(ctx->viewport.translate[1])); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(ctx->viewport.scale[1])); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(ctx->viewport.translate[2])); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2])); + } + + if (dirty & + (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_FRAMEBUFFER)) { + float zmin, zmax; + int depth = 24; + if (ctx->batch->framebuffer.zsbuf) { + depth = 
util_format_get_component_bits( + pipe_surface_format(ctx->batch->framebuffer.zsbuf), + UTIL_FORMAT_COLORSPACE_ZS, 0); + } + util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz, + &zmin, &zmax); + + OUT_PKT0(ring, REG_A4XX_RB_VPORT_Z_CLAMP(0), 2); + if (depth == 32) { + OUT_RING(ring, fui(zmin)); + OUT_RING(ring, fui(zmax)); + } else if (depth == 16) { + OUT_RING(ring, (uint32_t)(zmin * 0xffff)); + OUT_RING(ring, (uint32_t)(zmax * 0xffff)); + } else { + OUT_RING(ring, (uint32_t)(zmin * 0xffffff)); + OUT_RING(ring, (uint32_t)(zmax * 0xffffff)); + } + } + + if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) { + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + unsigned n = pfb->nr_cbufs; + /* if we have depth/stencil, we need at least on MRT: */ + if (pfb->zsbuf) + n = MAX2(1, n); + fd4_program_emit(ring, emit, n, pfb->cbufs); + } + + if (!emit->skip_consts) { /* evil hack to deal sanely with clear path */ + ir3_emit_vs_consts(vp, ring, ctx, emit->info, emit->indirect, emit->draw); + if (!emit->binning_pass) + ir3_emit_fs_consts(fp, ring, ctx); + } + + if ((dirty & FD_DIRTY_BLEND)) { + struct fd4_blend_stateobj *blend = fd4_blend_stateobj(ctx->blend); + uint32_t i; + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + enum pipe_format format = + pipe_surface_format(ctx->batch->framebuffer.cbufs[i]); + bool is_int = util_format_is_pure_integer(format); + bool has_alpha = util_format_has_alpha(format); + uint32_t control = blend->rb_mrt[i].control; + + if (is_int) { + control &= A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; + control |= A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); + } + + if (!has_alpha) { + control &= ~A4XX_RB_MRT_CONTROL_BLEND2; + } + + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, control); + + OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1); + OUT_RING(ring, blend->rb_mrt[i].blend_control); + } + + OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); + OUT_RING(ring, + blend->rb_fs_output | A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff)); + } + + if (dirty & FD_DIRTY_BLEND_COLOR) { + struct pipe_blend_color *bcolor = &ctx->blend_color; + + OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8); + OUT_RING(ring, A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]) | + A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 0xff) | + A4XX_RB_BLEND_RED_SINT(bcolor->color[0] * 0x7f)); + OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0])); + OUT_RING(ring, A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]) | + A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 0xff) | + A4XX_RB_BLEND_GREEN_SINT(bcolor->color[1] * 0x7f)); + OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[1])); + OUT_RING(ring, A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]) | + A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 0xff) | + A4XX_RB_BLEND_BLUE_SINT(bcolor->color[2] * 0x7f)); + OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2])); + OUT_RING(ring, A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]) | + A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 0xff) | + A4XX_RB_BLEND_ALPHA_SINT(bcolor->color[3] * 0x7f)); + OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); + } + + if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) + emit_textures(ctx, ring, SB4_VS_TEX, &ctx->tex[PIPE_SHADER_VERTEX], vp); + + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) + emit_textures(ctx, ring, SB4_FS_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT], fp); } /* emit setup at begin of new cmdstream buffer (don't rely on previous @@ -772,181 +777,177 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, void 
fd4_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_context *ctx = batch->ctx; - struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_context *ctx = batch->ctx; + struct fd4_context *fd4_ctx = fd4_context(ctx); - OUT_PKT0(ring, REG_A4XX_RBBM_PERFCTR_CTL, 1); - OUT_RING(ring, 0x00000001); + OUT_PKT0(ring, REG_A4XX_RBBM_PERFCTR_CTL, 1); + OUT_RING(ring, 0x00000001); - OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_GRAS_DEBUG_ECO_CONTROL, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1); - OUT_RING(ring, 0x00000006); + OUT_PKT0(ring, REG_A4XX_SP_MODE_CONTROL, 1); + OUT_RING(ring, 0x00000006); - OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1); - OUT_RING(ring, 0x0000003a); + OUT_PKT0(ring, REG_A4XX_TPL1_TP_MODE_CONTROL, 1); + OUT_RING(ring, 0x0000003a); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1); - OUT_RING(ring, 0x00000001); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_0D01, 1); + OUT_RING(ring, 0x00000001); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E42, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_0E42, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UCHE_CACHE_WAYS_VFD, 1); - OUT_RING(ring, 0x00000007); + OUT_PKT0(ring, REG_A4XX_UCHE_CACHE_WAYS_VFD, 1); + OUT_RING(ring, 0x00000007); - OUT_PKT0(ring, REG_A4XX_UCHE_CACHE_MODE_CONTROL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UCHE_CACHE_MODE_CONTROL, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UCHE_INVALIDATE0, 2); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000012); + OUT_PKT0(ring, REG_A4XX_UCHE_INVALIDATE0, 2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000012); - OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_HLSQ_MODE_CONTROL, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1); - OUT_RING(ring, 0x00000006); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC5, 1); + OUT_RING(ring, 0x00000006); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC6, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_0CC6, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC2, 1); - OUT_RING(ring, 0x00040000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_0EC2, 1); + OUT_RING(ring, 0x00040000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2001, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2001, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); - OUT_RING(ring, 0x00001000); + OUT_PKT3(ring, CP_INVALIDATE_STATE, 1); + OUT_RING(ring, 0x00001000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4); - OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) | - A4XX_RB_BLEND_RED_FLOAT(0.0)); - OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(0) | - A4XX_RB_BLEND_GREEN_FLOAT(0.0)); - OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(0) | - A4XX_RB_BLEND_BLUE_FLOAT(0.0)); - OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) | - A4XX_RB_BLEND_ALPHA_FLOAT(1.0)); + OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4); + OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) | A4XX_RB_BLEND_RED_FLOAT(0.0)); + OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(0) | A4XX_RB_BLEND_GREEN_FLOAT(0.0)); + OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(0) | A4XX_RB_BLEND_BLUE_FLOAT(0.0)); + OUT_RING(ring, + A4XX_RB_BLEND_ALPHA_UINT(0x7fff) | A4XX_RB_BLEND_ALPHA_FLOAT(1.0)); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 
1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2153, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2153, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2154, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2154, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2155, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2155, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2156, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2156, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2157, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_2157, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_21C3, 1); - OUT_RING(ring, 0x0000001d); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_21C3, 1); + OUT_RING(ring, 0x0000001d); - OUT_PKT0(ring, REG_A4XX_PC_GS_PARAM, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_PC_GS_PARAM, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_21E6, 1); - OUT_RING(ring, 0x00000001); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_21E6, 1); + OUT_RING(ring, 0x00000001); - OUT_PKT0(ring, REG_A4XX_PC_HS_PARAM, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_PC_HS_PARAM, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_22D7, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_UNKNOWN_22D7, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_TPL1_TP_TEX_OFFSET, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT0(ring, REG_A4XX_TPL1_TP_TEX_OFFSET, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_TPL1_TP_TEX_COUNT, 1); - OUT_RING(ring, A4XX_TPL1_TP_TEX_COUNT_VS(16) | - A4XX_TPL1_TP_TEX_COUNT_HS(0) | - A4XX_TPL1_TP_TEX_COUNT_DS(0) | - A4XX_TPL1_TP_TEX_COUNT_GS(0)); + OUT_PKT0(ring, REG_A4XX_TPL1_TP_TEX_COUNT, 1); + OUT_RING(ring, A4XX_TPL1_TP_TEX_COUNT_VS(16) | A4XX_TPL1_TP_TEX_COUNT_HS(0) | + A4XX_TPL1_TP_TEX_COUNT_DS(0) | + A4XX_TPL1_TP_TEX_COUNT_GS(0)); - OUT_PKT0(ring, REG_A4XX_TPL1_TP_FS_TEX_COUNT, 1); - OUT_RING(ring, 16); + OUT_PKT0(ring, REG_A4XX_TPL1_TP_FS_TEX_COUNT, 1); + OUT_RING(ring, 16); - /* we don't use this yet.. probably best to disable.. */ - OUT_PKT3(ring, CP_SET_DRAW_STATE, 2); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + /* we don't use this yet.. probably best to disable.. 
*/ + OUT_PKT3(ring, CP_SET_DRAW_STATE, 2); + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_PKT0(ring, REG_A4XX_SP_VS_PVT_MEM_PARAM, 2); - OUT_RING(ring, 0x08000001); /* SP_VS_PVT_MEM_PARAM */ - OUT_RELOC(ring, fd4_ctx->vs_pvt_mem, 0,0,0); /* SP_VS_PVT_MEM_ADDR */ + OUT_PKT0(ring, REG_A4XX_SP_VS_PVT_MEM_PARAM, 2); + OUT_RING(ring, 0x08000001); /* SP_VS_PVT_MEM_PARAM */ + OUT_RELOC(ring, fd4_ctx->vs_pvt_mem, 0, 0, 0); /* SP_VS_PVT_MEM_ADDR */ - OUT_PKT0(ring, REG_A4XX_SP_FS_PVT_MEM_PARAM, 2); - OUT_RING(ring, 0x08000001); /* SP_FS_PVT_MEM_PARAM */ - OUT_RELOC(ring, fd4_ctx->fs_pvt_mem, 0,0,0); /* SP_FS_PVT_MEM_ADDR */ + OUT_PKT0(ring, REG_A4XX_SP_FS_PVT_MEM_PARAM, 2); + OUT_RING(ring, 0x08000001); /* SP_FS_PVT_MEM_PARAM */ + OUT_RELOC(ring, fd4_ctx->fs_pvt_mem, 0, 0, 0); /* SP_FS_PVT_MEM_ADDR */ - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - OUT_PKT0(ring, REG_A4XX_RB_MSAA_CONTROL, 1); - OUT_RING(ring, A4XX_RB_MSAA_CONTROL_DISABLE | - A4XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE)); + OUT_PKT0(ring, REG_A4XX_RB_MSAA_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MSAA_CONTROL_DISABLE | + A4XX_RB_MSAA_CONTROL_SAMPLES(MSAA_ONE)); - OUT_PKT0(ring, REG_A4XX_GRAS_CL_GB_CLIP_ADJ, 1); - OUT_RING(ring, A4XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | - A4XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); + OUT_PKT0(ring, REG_A4XX_GRAS_CL_GB_CLIP_ADJ, 1); + OUT_RING(ring, A4XX_GRAS_CL_GB_CLIP_ADJ_HORZ(0) | + A4XX_GRAS_CL_GB_CLIP_ADJ_VERT(0)); - OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1); - OUT_RING(ring, A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(FUNC_ALWAYS)); + OUT_PKT0(ring, REG_A4XX_RB_ALPHA_CONTROL, 1); + OUT_RING(ring, A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(FUNC_ALWAYS)); - OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff)); + OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT, 1); + OUT_RING(ring, A4XX_RB_FS_OUTPUT_SAMPLE_MASK(0xffff)); - OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); - OUT_RING(ring, 0x0); + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, 0x0); - fd_hw_query_enable(batch, ring); + fd_hw_query_enable(batch, ring); } static void fd4_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst, - unsigned dst_off, struct pipe_resource *src, unsigned src_off, - unsigned sizedwords) + unsigned dst_off, struct pipe_resource *src, unsigned src_off, + unsigned sizedwords) { - struct fd_bo *src_bo = fd_resource(src)->bo; - struct fd_bo *dst_bo = fd_resource(dst)->bo; - unsigned i; - - for (i = 0; i < sizedwords; i++) { - OUT_PKT3(ring, CP_MEM_TO_MEM, 3); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, dst_bo, dst_off, 0, 0); - OUT_RELOC(ring, src_bo, src_off, 0, 0); - - dst_off += 4; - src_off += 4; - } + struct fd_bo *src_bo = fd_resource(src)->bo; + struct fd_bo *dst_bo = fd_resource(dst)->bo; + unsigned i; + + for (i = 0; i < sizedwords; i++) { + OUT_PKT3(ring, CP_MEM_TO_MEM, 3); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, dst_bo, dst_off, 0, 0); + OUT_RELOC(ring, src_bo, src_off, 0, 0); + + dst_off += 4; 
+ src_off += 4; + } } void fd4_emit_init_screen(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - screen->emit_ib = fd4_emit_ib; - screen->mem_to_mem = fd4_mem_to_mem; + screen->emit_ib = fd4_emit_ib; + screen->mem_to_mem = fd4_mem_to_mem; } void diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h index 64b62bc..d0cebec 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h @@ -29,76 +29,79 @@ #include "pipe/p_context.h" -#include "freedreno_context.h" #include "fd4_format.h" #include "fd4_program.h" +#include "freedreno_context.h" #include "ir3_gallium.h" struct fd_ringbuffer; -void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, - unsigned nr_bufs, struct pipe_surface **bufs); +void fd4_emit_gmem_restore_tex(struct fd_ringbuffer *ring, unsigned nr_bufs, + struct pipe_surface **bufs); /* grouped together emit-state for prog/vertex/state emit: */ struct fd4_emit { - struct pipe_debug_callback *debug; - const struct fd_vertex_state *vtx; - const struct fd4_program_state *prog; - const struct pipe_draw_info *info; - const struct pipe_draw_indirect_info *indirect; - const struct pipe_draw_start_count *draw; - bool binning_pass; - struct ir3_cache_key key; - enum fd_dirty_3d_state dirty; - - uint32_t sprite_coord_enable; /* bitmask */ - bool sprite_coord_mode; - bool rasterflat; - bool no_decode_srgb; - bool skip_consts; - - /* cached to avoid repeated lookups of same variants: */ - const struct ir3_shader_variant *vs, *fs; - /* TODO: other shader stages.. */ + struct pipe_debug_callback *debug; + const struct fd_vertex_state *vtx; + const struct fd4_program_state *prog; + const struct pipe_draw_info *info; + const struct pipe_draw_indirect_info *indirect; + const struct pipe_draw_start_count *draw; + bool binning_pass; + struct ir3_cache_key key; + enum fd_dirty_3d_state dirty; + + uint32_t sprite_coord_enable; /* bitmask */ + bool sprite_coord_mode; + bool rasterflat; + bool no_decode_srgb; + bool skip_consts; + + /* cached to avoid repeated lookups of same variants: */ + const struct ir3_shader_variant *vs, *fs; + /* TODO: other shader stages.. */ }; -static inline enum a4xx_color_fmt fd4_emit_format(struct pipe_surface *surf) +static inline enum a4xx_color_fmt +fd4_emit_format(struct pipe_surface *surf) { - if (!surf) - return 0; - return fd4_pipe2color(surf->format); + if (!surf) + return 0; + return fd4_pipe2color(surf->format); } static inline const struct ir3_shader_variant * fd4_emit_get_vp(struct fd4_emit *emit) { - if (!emit->vs) { - emit->vs = emit->binning_pass ? emit->prog->bs : emit->prog->vs; - } - return emit->vs; + if (!emit->vs) { + emit->vs = emit->binning_pass ? 
emit->prog->bs : emit->prog->vs; + } + return emit->vs; } static inline const struct ir3_shader_variant * fd4_emit_get_fp(struct fd4_emit *emit) { - if (!emit->fs) { - if (emit->binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fs = {}; - emit->fs = &binning_fs; - } else { - emit->fs = emit->prog->fs; - } - } - return emit->fs; + if (!emit->fs) { + if (emit->binning_pass) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct ir3_shader_variant binning_fs = {}; + emit->fs = &binning_fs; + } else { + emit->fs = emit->prog->fs; + } + } + return emit->fs; } -void fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) assert_dt; +void fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, + struct fd4_emit *emit) assert_dt; void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd4_emit *emit) assert_dt; + struct fd4_emit *emit) assert_dt; -void fd4_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt; +void fd4_emit_restore(struct fd_batch *batch, + struct fd_ringbuffer *ring) assert_dt; void fd4_emit_init_screen(struct pipe_screen *pscreen); void fd4_emit_init(struct pipe_context *pctx); @@ -106,7 +109,7 @@ void fd4_emit_init(struct pipe_context *pctx); static inline void fd4_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) { - __OUT_IB(ring, true, target); + __OUT_IB(ring, true, target); } #endif /* FD4_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.c b/src/gallium/drivers/freedreno/a4xx/fd4_format.c index 6fc340a..b264bbe 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_format.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.c @@ -29,48 +29,41 @@ #include "fd4_format.h" - /* Specifies the table of all the formats and their features. Also supplies * the helpers that look up various data in those tables. 
*/ struct fd4_format { - enum a4xx_vtx_fmt vtx; - enum a4xx_tex_fmt tex; - enum a4xx_color_fmt rb; - enum a3xx_color_swap swap; - boolean present; + enum a4xx_vtx_fmt vtx; + enum a4xx_tex_fmt tex; + enum a4xx_color_fmt rb; + enum a3xx_color_swap swap; + boolean present; }; /* vertex + texture */ -#define VT(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT4_ ## fmt, \ - .tex = TFMT4_ ## fmt, \ - .rb = RB4_ ## rbfmt, \ - .swap = swapfmt \ - } +#define VT(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT4_##fmt, \ + .tex = TFMT4_##fmt, \ + .rb = RB4_##rbfmt, \ + .swap = swapfmt} /* texture-only */ -#define _T(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT4_NONE, \ - .tex = TFMT4_ ## fmt, \ - .rb = RB4_ ## rbfmt, \ - .swap = swapfmt \ - } +#define _T(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT4_NONE, \ + .tex = TFMT4_##fmt, \ + .rb = RB4_##rbfmt, \ + .swap = swapfmt} /* vertex-only */ -#define V_(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT4_ ## fmt, \ - .tex = TFMT4_NONE, \ - .rb = RB4_ ## rbfmt, \ - .swap = swapfmt \ - } +#define V_(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT4_##fmt, \ + .tex = TFMT4_NONE, \ + .rb = RB4_##rbfmt, \ + .swap = swapfmt} /* clang-format off */ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { @@ -340,84 +333,94 @@ static struct fd4_format formats[PIPE_FORMAT_COUNT] = { enum a4xx_vtx_fmt fd4_pipe2vtx(enum pipe_format format) { - if (!formats[format].present) - return VFMT4_NONE; - return formats[format].vtx; + if (!formats[format].present) + return VFMT4_NONE; + return formats[format].vtx; } /* convert pipe format to texture sampler format: */ enum a4xx_tex_fmt fd4_pipe2tex(enum pipe_format format) { - if (!formats[format].present) - return TFMT4_NONE; - return formats[format].tex; + if (!formats[format].present) + return TFMT4_NONE; + return formats[format].tex; } /* convert pipe format to MRT / copydest format used for render-target: */ enum a4xx_color_fmt fd4_pipe2color(enum pipe_format format) { - if (!formats[format].present) - return RB4_NONE; - return formats[format].rb; + if (!formats[format].present) + return RB4_NONE; + return formats[format].rb; } enum a3xx_color_swap fd4_pipe2swap(enum pipe_format format) { - if (!formats[format].present) - return WZYX; - return formats[format].swap; + if (!formats[format].present) + return WZYX; + return formats[format].swap; } enum a4xx_depth_format fd4_pipe2depth(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return DEPTH4_16; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return DEPTH4_24_8; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return DEPTH4_32; - default: - return ~0; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return DEPTH4_16; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return DEPTH4_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTH4_32; + default: + return ~0; + } } static inline enum a4xx_tex_swiz tex_swiz(unsigned swiz) { - switch (swiz) { - default: - case PIPE_SWIZZLE_X: return A4XX_TEX_X; - case PIPE_SWIZZLE_Y: return A4XX_TEX_Y; - case 
PIPE_SWIZZLE_Z: return A4XX_TEX_Z; - case PIPE_SWIZZLE_W: return A4XX_TEX_W; - case PIPE_SWIZZLE_0: return A4XX_TEX_ZERO; - case PIPE_SWIZZLE_1: return A4XX_TEX_ONE; - } + switch (swiz) { + default: + case PIPE_SWIZZLE_X: + return A4XX_TEX_X; + case PIPE_SWIZZLE_Y: + return A4XX_TEX_Y; + case PIPE_SWIZZLE_Z: + return A4XX_TEX_Z; + case PIPE_SWIZZLE_W: + return A4XX_TEX_W; + case PIPE_SWIZZLE_0: + return A4XX_TEX_ZERO; + case PIPE_SWIZZLE_1: + return A4XX_TEX_ONE; + } } uint32_t fd4_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a) + unsigned swizzle_b, unsigned swizzle_a) { - const struct util_format_description *desc = - util_format_description(format); - unsigned char swiz[4] = { - swizzle_r, swizzle_g, swizzle_b, swizzle_a, - }, rswiz[4]; - - util_format_compose_swizzles(desc->swizzle, swiz, rswiz); - - return A4XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) | - A4XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) | - A4XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) | - A4XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3])); + const struct util_format_description *desc = util_format_description(format); + unsigned char swiz[4] = + { + swizzle_r, + swizzle_g, + swizzle_b, + swizzle_a, + }, + rswiz[4]; + + util_format_compose_swizzles(desc->swizzle, swiz, rswiz); + + return A4XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) | + A4XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) | + A4XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) | + A4XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3])); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_format.h b/src/gallium/drivers/freedreno/a4xx/fd4_format.h index 7184af3..2b31dee 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_format.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_format.h @@ -38,6 +38,7 @@ enum a3xx_color_swap fd4_pipe2swap(enum pipe_format format); enum a4xx_depth_format fd4_pipe2depth(enum pipe_format format); uint32_t fd4_tex_swiz(enum pipe_format format, unsigned swizzle_r, - unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); + unsigned swizzle_g, unsigned swizzle_b, + unsigned swizzle_a); #endif /* FD4_UTIL_H_ */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index 32fff9f..157dc1e 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -25,808 +25,810 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_draw.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd4_gmem.h" #include "fd4_context.h" #include "fd4_draw.h" #include "fd4_emit.h" -#include "fd4_program.h" #include "fd4_format.h" +#include "fd4_gmem.h" +#include "fd4_program.h" #include "fd4_zsa.h" static void -fd4_gmem_emit_set_prog(struct fd_context *ctx, struct fd4_emit *emit, struct fd_program_stateobj *prog) +fd4_gmem_emit_set_prog(struct fd_context *ctx, struct fd4_emit *emit, + struct fd_program_stateobj *prog) { - emit->skip_consts = true; - emit->key.vs = prog->vs; - emit->key.fs = prog->fs; - emit->prog = fd4_program_state(ir3_cache_lookup(ctx->shader_cache, &emit->key, &ctx->debug)); - /* reset the fd4_emit_get_*p cache */ - emit->vs = NULL; - emit->fs = NULL; + emit->skip_consts = true; + emit->key.vs = prog->vs; + emit->key.fs = prog->fs; + emit->prog = 
fd4_program_state( + ir3_cache_lookup(ctx->shader_cache, &emit->key, &ctx->debug)); + /* reset the fd4_emit_get_*p cache */ + emit->vs = NULL; + emit->fs = NULL; } static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, const uint32_t *bases, - uint32_t bin_w, bool decode_srgb) + struct pipe_surface **bufs, const uint32_t *bases, uint32_t bin_w, + bool decode_srgb) { - enum a4xx_tile_mode tile_mode; - unsigned i; - - if (bin_w) { - tile_mode = 2; - } else { - tile_mode = TILE4_LINEAR; - } - - for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { - enum a4xx_color_fmt format = 0; - enum a3xx_color_swap swap = WZYX; - bool srgb = false; - struct fd_resource *rsc = NULL; - uint32_t stride = 0; - uint32_t base = 0; - uint32_t offset = 0; - - if ((i < nr_bufs) && bufs[i]) { - struct pipe_surface *psurf = bufs[i]; - enum pipe_format pformat = psurf->format; - - rsc = fd_resource(psurf->texture); - - /* In case we're drawing to Z32F_S8, the "color" actually goes to - * the stencil - */ - if (rsc->stencil) { - rsc = rsc->stencil; - pformat = rsc->b.b.format; - if (bases) - bases++; - } - - format = fd4_pipe2color(pformat); - swap = fd4_pipe2swap(pformat); - - if (decode_srgb) - srgb = util_format_is_srgb(pformat); - else - pformat = util_format_linear(pformat); - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - - if (bin_w) { - stride = bin_w << fdl_cpp_shift(&rsc->layout); - - if (bases) { - base = bases[i]; - } - } else { - stride = fd_resource_pitch(rsc, psurf->u.tex.level); - } - } else if ((i < nr_bufs) && bases) { - base = bases[i]; - } - - OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3); - OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | - A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | - A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | - A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | - COND(srgb, A4XX_RB_MRT_BUF_INFO_COLOR_SRGB)); - if (bin_w || (i >= nr_bufs) || !bufs[i]) { - OUT_RING(ring, base); - OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride)); - } else { - OUT_RELOC(ring, rsc->bo, offset, 0, 0); - /* RB_MRT[i].CONTROL3.STRIDE not emitted by c2d.. - * not sure if we need to skip it for bypass or - * not. 
- */ - OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(0)); - } - } + enum a4xx_tile_mode tile_mode; + unsigned i; + + if (bin_w) { + tile_mode = 2; + } else { + tile_mode = TILE4_LINEAR; + } + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + enum a4xx_color_fmt format = 0; + enum a3xx_color_swap swap = WZYX; + bool srgb = false; + struct fd_resource *rsc = NULL; + uint32_t stride = 0; + uint32_t base = 0; + uint32_t offset = 0; + + if ((i < nr_bufs) && bufs[i]) { + struct pipe_surface *psurf = bufs[i]; + enum pipe_format pformat = psurf->format; + + rsc = fd_resource(psurf->texture); + + /* In case we're drawing to Z32F_S8, the "color" actually goes to + * the stencil + */ + if (rsc->stencil) { + rsc = rsc->stencil; + pformat = rsc->b.b.format; + if (bases) + bases++; + } + + format = fd4_pipe2color(pformat); + swap = fd4_pipe2swap(pformat); + + if (decode_srgb) + srgb = util_format_is_srgb(pformat); + else + pformat = util_format_linear(pformat); + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + offset = fd_resource_offset(rsc, psurf->u.tex.level, + psurf->u.tex.first_layer); + + if (bin_w) { + stride = bin_w << fdl_cpp_shift(&rsc->layout); + + if (bases) { + base = bases[i]; + } + } else { + stride = fd_resource_pitch(rsc, psurf->u.tex.level); + } + } else if ((i < nr_bufs) && bases) { + base = bases[i]; + } + + OUT_PKT0(ring, REG_A4XX_RB_MRT_BUF_INFO(i), 3); + OUT_RING(ring, A4XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | + A4XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | + A4XX_RB_MRT_BUF_INFO_COLOR_BUF_PITCH(stride) | + A4XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | + COND(srgb, A4XX_RB_MRT_BUF_INFO_COLOR_SRGB)); + if (bin_w || (i >= nr_bufs) || !bufs[i]) { + OUT_RING(ring, base); + OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(stride)); + } else { + OUT_RELOC(ring, rsc->bo, offset, 0, 0); + /* RB_MRT[i].CONTROL3.STRIDE not emitted by c2d.. + * not sure if we need to skip it for bypass or + * not. + */ + OUT_RING(ring, A4XX_RB_MRT_CONTROL3_STRIDE(0)); + } + } } static bool use_hw_binning(struct fd_batch *batch) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; - if ((gmem->maxpw * gmem->maxph) > 32) - return false; + if ((gmem->maxpw * gmem->maxph) > 32) + return false; - if ((gmem->maxpw > 15) || (gmem->maxph > 15)) - return false; + if ((gmem->maxpw > 15) || (gmem->maxph > 15)) + return false; - return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); } /* transfer from gmem to system memory (ie. 
normal RAM) */ static void -emit_gmem2mem_surf(struct fd_batch *batch, bool stencil, - uint32_t base, struct pipe_surface *psurf) +emit_gmem2mem_surf(struct fd_batch *batch, bool stencil, uint32_t base, + struct pipe_surface *psurf) { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_resource *rsc = fd_resource(psurf->texture); - enum pipe_format pformat = psurf->format; - uint32_t offset, pitch; - - if (!rsc->valid) - return; - - if (stencil) { - debug_assert(rsc->stencil); - rsc = rsc->stencil; - pformat = rsc->b.b.format; - } - - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - pitch = fd_resource_pitch(rsc, psurf->u.tex.level); - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - OUT_PKT0(ring, REG_A4XX_RB_COPY_CONTROL, 4); - OUT_RING(ring, A4XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | - A4XX_RB_COPY_CONTROL_MODE(RB_COPY_RESOLVE) | - A4XX_RB_COPY_CONTROL_GMEM_BASE(base)); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ - OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(pitch)); - OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) | - A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) | - A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | - A4XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | - A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(pformat))); - - fd4_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX4_SIZE_8_BIT, 0, 0, NULL); + struct fd_ringbuffer *ring = batch->gmem; + struct fd_resource *rsc = fd_resource(psurf->texture); + enum pipe_format pformat = psurf->format; + uint32_t offset, pitch; + + if (!rsc->valid) + return; + + if (stencil) { + debug_assert(rsc->stencil); + rsc = rsc->stencil; + pformat = rsc->b.b.format; + } + + offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + pitch = fd_resource_pitch(rsc, psurf->u.tex.level); + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + OUT_PKT0(ring, REG_A4XX_RB_COPY_CONTROL, 4); + OUT_RING(ring, A4XX_RB_COPY_CONTROL_MSAA_RESOLVE(MSAA_ONE) | + A4XX_RB_COPY_CONTROL_MODE(RB_COPY_RESOLVE) | + A4XX_RB_COPY_CONTROL_GMEM_BASE(base)); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_COPY_DEST_BASE */ + OUT_RING(ring, A4XX_RB_COPY_DEST_PITCH_PITCH(pitch)); + OUT_RING(ring, A4XX_RB_COPY_DEST_INFO_TILE(TILE4_LINEAR) | + A4XX_RB_COPY_DEST_INFO_FORMAT(fd4_pipe2color(pformat)) | + A4XX_RB_COPY_DEST_INFO_COMPONENT_ENABLE(0xf) | + A4XX_RB_COPY_DEST_INFO_ENDIAN(ENDIAN_NONE) | + A4XX_RB_COPY_DEST_INFO_SWAP(fd4_pipe2swap(pformat))); + + fd4_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX4_SIZE_8_BIT, 0, 0, NULL); } static void -fd4_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd4_emit_tile_gmem2mem(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd4_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->solid_vbuf_state, - }; - fd4_gmem_emit_set_prog(ctx, &emit, &ctx->solid_prog); - - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); - - OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2); - OUT_RING(ring, A4XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | - A4XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | - 
A4XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | - A4XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); - OUT_RING(ring, 0x00000000); /* RB_STENCIL_CONTROL2 */ - - OUT_PKT0(ring, REG_A4XX_RB_STENCILREFMASK, 2); - OUT_RING(ring, 0xff000000 | - A4XX_RB_STENCILREFMASK_STENCILREF(0) | - A4XX_RB_STENCILREFMASK_STENCILMASK(0) | - A4XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); - OUT_RING(ring, 0xff000000 | - A4XX_RB_STENCILREFMASK_BF_STENCILREF(0) | - A4XX_RB_STENCILREFMASK_BF_STENCILMASK(0) | - A4XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); - - OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); - - fd_wfi(batch, ring); - - OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, 0x80000); /* GRAS_CL_CLIP_CNTL */ - - OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0((float)pfb->width/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0((float)pfb->width/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0((float)pfb->height/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-(float)pfb->height/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(0.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(1.0)); - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | - 0xa); /* XXX */ - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(1)); - - OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); - - OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); - OUT_RING(ring, 0x00000002); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2); - OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) | - A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1)); - OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | - A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); - - OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2); - OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - OUT_RING(ring, 0); /* ??? 
UNKNOWN_2209 */ - - fd4_program_emit(ring, &emit, 0, NULL); - fd4_emit_vertex_bufs(ring, &emit); - - if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) - emit_gmem2mem_surf(batch, false, gmem->zsbuf_base[0], pfb->zsbuf); - if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) - emit_gmem2mem_surf(batch, true, gmem->zsbuf_base[1], pfb->zsbuf); - } - - if (batch->resolve & FD_BUFFER_COLOR) { - unsigned i; - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) - continue; - emit_gmem2mem_surf(batch, false, gmem->cbuf_base[i], pfb->cbufs[i]); - } - } - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd4_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->solid_vbuf_state, + }; + fd4_gmem_emit_set_prog(ctx, &emit, &ctx->solid_prog); + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_NEVER)); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2); + OUT_RING(ring, A4XX_RB_STENCIL_CONTROL_FUNC(FUNC_NEVER) | + A4XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_NEVER) | + A4XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + OUT_RING(ring, 0x00000000); /* RB_STENCIL_CONTROL2 */ + + OUT_PKT0(ring, REG_A4XX_RB_STENCILREFMASK, 2); + OUT_RING(ring, 0xff000000 | A4XX_RB_STENCILREFMASK_STENCILREF(0) | + A4XX_RB_STENCILREFMASK_STENCILMASK(0) | + A4XX_RB_STENCILREFMASK_STENCILWRITEMASK(0xff)); + OUT_RING(ring, 0xff000000 | A4XX_RB_STENCILREFMASK_BF_STENCILREF(0) | + A4XX_RB_STENCILREFMASK_BF_STENCILMASK(0) | + A4XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(0xff)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0)); + + fd_wfi(batch, ring); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x80000); /* GRAS_CL_CLIP_CNTL */ + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0((float)pfb->width / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0((float)pfb->width / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0((float)pfb->height / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-(float)pfb->height / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(0.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(1.0)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | 0xa); /* XXX */ + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RESOLVE_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + + OUT_PKT0(ring, 
REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, 0x00000002); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2); + OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) | + A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1)); + OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); + + OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2); + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + OUT_RING(ring, 0); /* ??? UNKNOWN_2209 */ + + fd4_program_emit(ring, &emit, 0, NULL); + fd4_emit_vertex_bufs(ring, &emit); + + if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) + emit_gmem2mem_surf(batch, false, gmem->zsbuf_base[0], pfb->zsbuf); + if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) + emit_gmem2mem_surf(batch, true, gmem->zsbuf_base[1], pfb->zsbuf); + } + + if (batch->resolve & FD_BUFFER_COLOR) { + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_gmem2mem_surf(batch, false, gmem->cbuf_base[i], pfb->cbufs[i]); + } + } + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); } /* transfer from system memory to gmem */ static void emit_mem2gmem_surf(struct fd_batch *batch, const uint32_t *bases, - struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w) + struct pipe_surface **bufs, uint32_t nr_bufs, uint32_t bin_w) { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_surface *zsbufs[2]; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_surface *zsbufs[2]; - emit_mrt(ring, nr_bufs, bufs, bases, bin_w, false); + emit_mrt(ring, nr_bufs, bufs, bases, bin_w, false); - if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { - /* The gmem_restore_tex logic will put the first buffer's stencil - * as color. Supply it with the proper information to make that - * happen. - */ - zsbufs[0] = zsbufs[1] = bufs[0]; - bufs = zsbufs; - nr_bufs = 2; - } + if (bufs[0] && (bufs[0]->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)) { + /* The gmem_restore_tex logic will put the first buffer's stencil + * as color. Supply it with the proper information to make that + * happen. + */ + zsbufs[0] = zsbufs[1] = bufs[0]; + bufs = zsbufs; + nr_bufs = 2; + } - fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs); + fd4_emit_gmem_restore_tex(ring, nr_bufs, bufs); - fd4_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, - DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX4_SIZE_8_BIT, 0, 0, NULL); + fd4_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX4_SIZE_8_BIT, 0, 0, NULL); } static void -fd4_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd4_emit_tile_mem2gmem(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd4_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->blit_vbuf_state, - .sprite_coord_enable = 1, - .no_decode_srgb = true, - }; - /* NOTE: They all use the same VP, this is for vtx bufs. 
*/ - fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); - - unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; - float x0, y0, x1, y1; - unsigned bin_w = tile->bin_w; - unsigned bin_h = tile->bin_h; - unsigned i; - - /* write texture coordinates to vertexbuf: */ - x0 = ((float)tile->xoff) / ((float)pfb->width); - x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); - y0 = ((float)tile->yoff) / ((float)pfb->height); - y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); - - OUT_PKT3(ring, CP_MEM_WRITE, 5); - OUT_RELOC(ring, fd_resource(ctx->blit_texcoord_vbuf)->bo, 0, 0, 0); - OUT_RING(ring, fui(x0)); - OUT_RING(ring, fui(y0)); - OUT_RING(ring, fui(x1)); - OUT_RING(ring, fui(y1)); - - for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { - mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0; - - OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) | - A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); - - OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | - A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(FACTOR_ZERO) | - A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(FACTOR_ONE) | - A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | - A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO)); - } - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, 0x8); /* XXX RB_RENDER_CONTROL */ - - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_LESS)); - - OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, 0x280000); /* XXX GRAS_CL_CLIP_CNTL */ - - OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0) | - A4XX_GRAS_SU_MODE_CONTROL_RENDERING_PASS); - - OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0((float)bin_w/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0((float)bin_w/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0((float)bin_h/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-(float)bin_h/2.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(0.0)); - OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(1.0)); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2); - OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(bin_w - 1) | - A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(bin_h - 1)); - OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | - A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | - A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(bin_w - 1) | - A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(bin_h - 1)); - - OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | - A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); - - OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2); - OUT_RING(ring, 
A4XX_RB_STENCIL_CONTROL_FUNC(FUNC_ALWAYS) | - A4XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_ALWAYS) | - A4XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | - A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); - OUT_RING(ring, 0x00000000); /* RB_STENCIL_CONTROL2 */ - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(1)); - - OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST | - A4XX_PC_PRIM_VTX_CNTL_VAROUT(1)); - - OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2); - OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ - OUT_RING(ring, 0); /* ??? UNKNOWN_2209 */ - - fd4_emit_vertex_bufs(ring, &emit); - - /* for gmem pitch/base calculations, we need to use the non- - * truncated tile sizes: - */ - bin_w = gmem->bin_w; - bin_h = gmem->bin_h; - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { - fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[pfb->nr_cbufs - 1]); - fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); - emit_mem2gmem_surf(batch, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, bin_w); - } - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - switch (pfb->zsbuf->format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - case PIPE_FORMAT_Z32_FLOAT: - if (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) - fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_z); - else - fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_zs); - - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); - OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_Z_ENABLE | - A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE | - A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS) | - A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE); - - OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE); - - OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); - OUT_RING(ring, 0x80000); /* GRAS_CL_CLIP_CNTL */ - - break; - default: - /* Non-float can use a regular color write. It's split over 8-bit - * components, so half precision is always sufficient. - */ - fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); - break; - } - fd4_program_emit(ring, &emit, 1, &pfb->zsbuf); - emit_mem2gmem_surf(batch, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); - } - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | - A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | - 0x00010000); /* XXX */ + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd4_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->blit_vbuf_state, + .sprite_coord_enable = 1, + .no_decode_srgb = true, + }; + /* NOTE: They all use the same VP, this is for vtx bufs. 
*/ + fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); + + unsigned char mrt_comp[A4XX_MAX_RENDER_TARGETS] = {0}; + float x0, y0, x1, y1; + unsigned bin_w = tile->bin_w; + unsigned bin_h = tile->bin_h; + unsigned i; + + /* write texture coordinates to vertexbuf: */ + x0 = ((float)tile->xoff) / ((float)pfb->width); + x1 = ((float)tile->xoff + bin_w) / ((float)pfb->width); + y0 = ((float)tile->yoff) / ((float)pfb->height); + y1 = ((float)tile->yoff + bin_h) / ((float)pfb->height); + + OUT_PKT3(ring, CP_MEM_WRITE, 5); + OUT_RELOC(ring, fd_resource(ctx->blit_texcoord_vbuf)->bo, 0, 0, 0); + OUT_RING(ring, fui(x0)); + OUT_RING(ring, fui(y0)); + OUT_RING(ring, fui(x1)); + OUT_RING(ring, fui(y1)); + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 0xf : 0; + + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY) | + A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); + + OUT_PKT0(ring, REG_A4XX_RB_MRT_BLEND_CONTROL(i), 1); + OUT_RING( + ring, + A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(FACTOR_ONE) | + A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | + A4XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(FACTOR_ZERO) | + A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(FACTOR_ONE) | + A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(BLEND_DST_PLUS_SRC) | + A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(FACTOR_ZERO)); + } + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A4XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A4XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A4XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A4XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A4XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A4XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A4XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); /* XXX RB_RENDER_CONTROL */ + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_LESS)); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x280000); /* XXX GRAS_CL_CLIP_CNTL */ + + OUT_PKT0(ring, REG_A4XX_GRAS_SU_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(0) | + A4XX_GRAS_SU_MODE_CONTROL_RENDERING_PASS); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_VPORT_XOFFSET_0, 6); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_XOFFSET_0((float)bin_w / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_XSCALE_0((float)bin_w / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_YOFFSET_0((float)bin_h / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-(float)bin_h / 2.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZOFFSET_0(0.0)); + OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(1.0)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_WINDOW_SCISSOR_BR, 2); + OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_BR_X(bin_w - 1) | + A4XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(bin_h - 1)); + OUT_RING(ring, A4XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A4XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(bin_w - 1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(bin_h - 1)); + + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_CONTROL, 2); + OUT_RING(ring, 
A4XX_RB_STENCIL_CONTROL_FUNC(FUNC_ALWAYS) | + A4XX_RB_STENCIL_CONTROL_FAIL(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZPASS(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZFAIL(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_FUNC_BF(FUNC_ALWAYS) | + A4XX_RB_STENCIL_CONTROL_FAIL_BF(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZPASS_BF(STENCIL_KEEP) | + A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(STENCIL_KEEP)); + OUT_RING(ring, 0x00000000); /* RB_STENCIL_CONTROL2 */ + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(1)); + + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST | + A4XX_PC_PRIM_VTX_CNTL_VAROUT(1)); + + OUT_PKT0(ring, REG_A4XX_VFD_INDEX_OFFSET, 2); + OUT_RING(ring, 0); /* VFD_INDEX_OFFSET */ + OUT_RING(ring, 0); /* ??? UNKNOWN_2209 */ + + fd4_emit_vertex_bufs(ring, &emit); + + /* for gmem pitch/base calculations, we need to use the non- + * truncated tile sizes: + */ + bin_w = gmem->bin_w; + bin_h = gmem->bin_h; + + if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { + fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[pfb->nr_cbufs - 1]); + fd4_program_emit(ring, &emit, pfb->nr_cbufs, pfb->cbufs); + emit_mem2gmem_surf(batch, gmem->cbuf_base, pfb->cbufs, pfb->nr_cbufs, + bin_w); + } + + if (fd_gmem_needs_restore(batch, tile, + FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + switch (pfb->zsbuf->format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + case PIPE_FORMAT_Z32_FLOAT: + if (pfb->zsbuf->format == PIPE_FORMAT_Z32_FLOAT) + fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_z); + else + fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_zs); + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); + OUT_RING(ring, A4XX_RB_DEPTH_CONTROL_Z_ENABLE | + A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE | + A4XX_RB_DEPTH_CONTROL_ZFUNC(FUNC_ALWAYS) | + A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_CL_CLIP_CNTL, 1); + OUT_RING(ring, 0x80000); /* GRAS_CL_CLIP_CNTL */ + + break; + default: + /* Non-float can use a regular color write. It's split over 8-bit + * components, so half precision is always sufficient. 
+ */ + fd4_gmem_emit_set_prog(ctx, &emit, &ctx->blit_prog[0]); + break; + } + fd4_program_emit(ring, &emit, 1, &pfb->zsbuf); + emit_mem2gmem_surf(batch, gmem->zsbuf_base, &pfb->zsbuf, 1, bin_w); + } + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | + 0x00010000); /* XXX */ } static void patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode) { - unsigned i; - for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); - *patch->cs = patch->val | DRAW4(0, 0, 0, vismode); - } - util_dynarray_clear(&batch->draw_patches); + unsigned i; + for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val | DRAW4(0, 0, 0, vismode); + } + util_dynarray_clear(&batch->draw_patches); } /* for rendering directly to system memory: */ static void -fd4_emit_sysmem_prep(struct fd_batch *batch) - assert_dt +fd4_emit_sysmem_prep(struct fd_batch *batch) assert_dt { - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring = batch->gmem; - fd4_emit_restore(batch, ring); + fd4_emit_restore(batch, ring); - OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); - OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | - A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL, 0, true); - /* setup scissor/offset for current tile: */ - OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); - OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(0) | - A4XX_RB_BIN_OFFSET_Y(0)); + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); + OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(0) | A4XX_RB_BIN_OFFSET_Y(0)); - OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | - A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | - A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(0) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(0)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(pfb->width - 1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(pfb->height - 1)); - OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(0) | - A4XX_RB_MODE_CONTROL_HEIGHT(0) | - 0x00c00000); /* XXX */ + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(0) | + A4XX_RB_MODE_CONTROL_HEIGHT(0) | 0x00c00000); /* XXX */ - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, 0x8); + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); - patch_draws(batch, IGNORE_VISIBILITY); + patch_draws(batch, IGNORE_VISIBILITY); } static void 
-update_vsc_pipe(struct fd_batch *batch) - assert_dt +update_vsc_pipe(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd4_context *fd4_ctx = fd4_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; - int i; - - OUT_PKT0(ring, REG_A4XX_VSC_SIZE_ADDRESS, 1); - OUT_RELOC(ring, fd4_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ - - OUT_PKT0(ring, REG_A4XX_VSC_PIPE_CONFIG_REG(0), 8); - for (i = 0; i < 8; i++) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - OUT_RING(ring, A4XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | - A4XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | - A4XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | - A4XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); - } - - OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_ADDRESS_REG(0), 8); - for (i = 0; i < 8; i++) { - if (!ctx->vsc_pipe_bo[i]) { - ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, 0x40000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); - } - OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i] */ - } - - OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_LENGTH_REG(0), 8); - for (i = 0; i < 8; i++) { - OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - 32); /* VSC_PIPE_DATA_LENGTH[i] */ - } + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_ringbuffer *ring = batch->gmem; + int i; + + OUT_PKT0(ring, REG_A4XX_VSC_SIZE_ADDRESS, 1); + OUT_RELOC(ring, fd4_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS */ + + OUT_PKT0(ring, REG_A4XX_VSC_PIPE_CONFIG_REG(0), 8); + for (i = 0; i < 8; i++) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + OUT_RING(ring, A4XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | + A4XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | + A4XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | + A4XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); + } + + OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_ADDRESS_REG(0), 8); + for (i = 0; i < 8; i++) { + if (!ctx->vsc_pipe_bo[i]) { + ctx->vsc_pipe_bo[i] = fd_bo_new( + ctx->dev, 0x40000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); + } + OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0, + 0); /* VSC_PIPE_DATA_ADDRESS[i] */ + } + + OUT_PKT0(ring, REG_A4XX_VSC_PIPE_DATA_LENGTH_REG(0), 8); + for (i = 0; i < 8; i++) { + OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - + 32); /* VSC_PIPE_DATA_LENGTH[i] */ + } } static void -emit_binning_pass(struct fd_batch *batch) - assert_dt +emit_binning_pass(struct fd_batch *batch) assert_dt { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring = batch->gmem; - int i; - - uint32_t x1 = gmem->minx; - uint32_t y1 = gmem->miny; - uint32_t x2 = gmem->minx + gmem->width - 1; - uint32_t y2 = gmem->miny + gmem->height - 1; - - OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); - OUT_RING(ring, A4XX_PC_BINNING_COMMAND_BINNING_ENABLE); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); - OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | - A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - - /* setup scissor/offset for whole screen: */ - OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); - OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(x1) | - A4XX_RB_BIN_OFFSET_Y(y1)); - - OUT_PKT0(ring, 
REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | - A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | - A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); - - for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { - OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | - A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); - } - - /* emit IB to binning drawcmds: */ - fd4_emit_ib(ring, batch->binning); - - fd_reset_wfi(batch); - fd_wfi(batch, ring); - - /* and then put stuff back the way it was: */ - - OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); - OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | - A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | - A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | - A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); - - fd_event_write(batch, ring, CACHE_FLUSH); - fd_wfi(batch, ring); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring = batch->gmem; + int i; + + uint32_t x1 = gmem->minx; + uint32_t y1 = gmem->miny; + uint32_t x2 = gmem->minx + gmem->width - 1; + uint32_t y2 = gmem->miny + gmem->height - 1; + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, A4XX_PC_BINNING_COMMAND_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + /* setup scissor/offset for whole screen: */ + OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); + OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(x1) | A4XX_RB_BIN_OFFSET_Y(y1)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | + A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); + } + + /* emit IB to binning drawcmds: */ + fd4_emit_ib(ring, batch->binning); + + fd_reset_wfi(batch); + fd_wfi(batch, ring); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + fd_event_write(batch, ring, CACHE_FLUSH); + fd_wfi(batch, ring); } /* before first tile */ static void -fd4_emit_tile_init(struct fd_batch *batch) - assert_dt +fd4_emit_tile_init(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; - 
fd4_emit_restore(batch, ring); + fd4_emit_restore(batch, ring); - OUT_PKT0(ring, REG_A4XX_VSC_BIN_SIZE, 1); - OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | - A4XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + OUT_PKT0(ring, REG_A4XX_VSC_BIN_SIZE, 1); + OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A4XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); - update_vsc_pipe(batch); + update_vsc_pipe(batch); - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); - OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | - A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); - if (use_hw_binning(batch)) { - OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | - A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); + if (use_hw_binning(batch)) { + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, A4XX_RB_RENDER_CONTROL_BINNING_PASS | - A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | - 0x8); + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A4XX_RB_RENDER_CONTROL_BINNING_PASS | + A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | 0x8); - /* emit hw binning pass: */ - emit_binning_pass(batch); + /* emit hw binning pass: */ + emit_binning_pass(batch); - patch_draws(batch, USE_VISIBILITY); - } else { - patch_draws(batch, IGNORE_VISIBILITY); - } + patch_draws(batch, USE_VISIBILITY); + } else { + patch_draws(batch, IGNORE_VISIBILITY); + } - OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); - OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | - A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | - A4XX_RB_MODE_CONTROL_ENABLE_GMEM); + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | + A4XX_RB_MODE_CONTROL_ENABLE_GMEM); } /* before mem2gmem */ static void fd4_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - - if (pfb->zsbuf) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - uint32_t cpp = rsc->layout.cpp; - - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); - OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) | - A4XX_RB_DEPTH_INFO_DEPTH_FORMAT(fd4_pipe2depth(pfb->zsbuf->format))); - OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w)); - OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w)); - - OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); - if (rsc->stencil) { - OUT_RING(ring, A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL | - A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); - OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->layout.cpp * gmem->bin_w)); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - } else { - OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); - OUT_RING(ring, 0); /* RB_STENCIL_INFO */ - OUT_RING(ring, 0); /* RB_STENCIL_PITCH */ - } - - OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1); - if 
(pfb->zsbuf) { - OUT_RING(ring, A4XX_GRAS_DEPTH_CONTROL_FORMAT( - fd4_pipe2depth(pfb->zsbuf->format))); - } else { - OUT_RING(ring, A4XX_GRAS_DEPTH_CONTROL_FORMAT(DEPTH4_NONE)); - } + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + + if (pfb->zsbuf) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + uint32_t cpp = rsc->layout.cpp; + + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); + OUT_RING(ring, A4XX_RB_DEPTH_INFO_DEPTH_BASE(gmem->zsbuf_base[0]) | + A4XX_RB_DEPTH_INFO_DEPTH_FORMAT( + fd4_pipe2depth(pfb->zsbuf->format))); + OUT_RING(ring, A4XX_RB_DEPTH_PITCH(cpp * gmem->bin_w)); + OUT_RING(ring, A4XX_RB_DEPTH_PITCH2(cpp * gmem->bin_w)); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); + if (rsc->stencil) { + OUT_RING(ring, + A4XX_RB_STENCIL_INFO_SEPARATE_STENCIL | + A4XX_RB_STENCIL_INFO_STENCIL_BASE(gmem->zsbuf_base[1])); + OUT_RING(ring, A4XX_RB_STENCIL_PITCH(rsc->stencil->layout.cpp * + gmem->bin_w)); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } else { + OUT_PKT0(ring, REG_A4XX_RB_DEPTH_INFO, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_RB_STENCIL_INFO, 2); + OUT_RING(ring, 0); /* RB_STENCIL_INFO */ + OUT_RING(ring, 0); /* RB_STENCIL_PITCH */ + } + + OUT_PKT0(ring, REG_A4XX_GRAS_DEPTH_CONTROL, 1); + if (pfb->zsbuf) { + OUT_RING(ring, A4XX_GRAS_DEPTH_CONTROL_FORMAT( + fd4_pipe2depth(pfb->zsbuf->format))); + } else { + OUT_RING(ring, A4XX_GRAS_DEPTH_CONTROL_FORMAT(DEPTH4_NONE)); + } } /* before IB to rendering cmds: */ static void -fd4_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd4_emit_tile_renderprep(struct fd_batch *batch, + const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd4_context *fd4_ctx = fd4_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - uint32_t x1 = tile->xoff; - uint32_t y1 = tile->yoff; - uint32_t x2 = tile->xoff + tile->bin_w - 1; - uint32_t y2 = tile->yoff + tile->bin_h - 1; - - if (use_hw_binning(batch)) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; - struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; - - assert(pipe->w && pipe->h); - - fd_event_write(batch, ring, HLSQ_FLUSH); - fd_wfi(batch, ring); - - OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); - OUT_RING(ring, A4XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | - A4XX_PC_VSTREAM_CONTROL_N(tile->n)); - - OUT_PKT3(ring, CP_SET_BIN_DATA, 2); - OUT_RELOC(ring, pipe_bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ - OUT_RELOC(ring, fd4_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ - (tile->p * 4), 0, 0); - } else { - OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); - OUT_RING(ring, 0x00000000); - } - - OUT_PKT3(ring, CP_SET_BIN, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); - OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); - - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, true); - - /* setup scissor/offset for current tile: */ - OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); - OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(tile->xoff) | - A4XX_RB_BIN_OFFSET_Y(tile->yoff)); - - OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); - OUT_RING(ring, 
A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | - A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | - A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RING(ring, 0x8); + struct fd_context *ctx = batch->ctx; + struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + uint32_t x1 = tile->xoff; + uint32_t y1 = tile->yoff; + uint32_t x2 = tile->xoff + tile->bin_w - 1; + uint32_t y2 = tile->yoff + tile->bin_h - 1; + + if (use_hw_binning(batch)) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; + + assert(pipe->w && pipe->h); + + fd_event_write(batch, ring, HLSQ_FLUSH); + fd_wfi(batch, ring); + + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A4XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A4XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOC(ring, pipe_bo, 0, 0, + 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd4_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- + VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + + OUT_PKT3(ring, CP_SET_BIN, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); + OUT_RING(ring, CP_SET_BIN_2_X2(x2) | CP_SET_BIN_2_Y2(y2)); + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem->cbuf_base, gmem->bin_w, + true); + + /* setup scissor/offset for current tile: */ + OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); + OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(tile->xoff) | + A4XX_RB_BIN_OFFSET_Y(tile->yoff)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); } void -fd4_gmem_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd4_gmem_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - - ctx->emit_sysmem_prep = fd4_emit_sysmem_prep; - ctx->emit_tile_init = fd4_emit_tile_init; - ctx->emit_tile_prep = fd4_emit_tile_prep; - ctx->emit_tile_mem2gmem = fd4_emit_tile_mem2gmem; - ctx->emit_tile_renderprep = fd4_emit_tile_renderprep; - ctx->emit_tile_gmem2mem = fd4_emit_tile_gmem2mem; + struct fd_context *ctx = fd_context(pctx); + + ctx->emit_sysmem_prep = fd4_emit_sysmem_prep; + ctx->emit_tile_init = fd4_emit_tile_init; + ctx->emit_tile_prep = fd4_emit_tile_prep; + ctx->emit_tile_mem2gmem = fd4_emit_tile_mem2gmem; + ctx->emit_tile_renderprep = fd4_emit_tile_renderprep; + ctx->emit_tile_gmem2mem = fd4_emit_tile_gmem2mem; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 62e0d4c..bfee652 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -25,555 +25,573 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include 
"freedreno_program.h" -#include "fd4_program.h" #include "fd4_emit.h" -#include "fd4_texture.h" #include "fd4_format.h" +#include "fd4_program.h" +#include "fd4_texture.h" static void emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { - const struct ir3_info *si = &so->info; - enum a4xx_state_block sb = fd4_stage2shadersb(so->type); - enum a4xx_state_src src; - uint32_t i, sz, *bin; - - if (FD_DBG(DIRECT)) { - sz = si->sizedwords; - src = SS4_DIRECT; - bin = fd_bo_map(so->bo); - } else { - sz = 0; - src = SS4_INDIRECT; - bin = NULL; - } - - OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sz); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(src) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen)); - if (bin) { - OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER)); - } else { - OUT_RELOC(ring, so->bo, 0, - CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0); - } - - /* for how clever coverity is, it is sometimes rather dull, and - * doesn't realize that the only case where bin==NULL, sz==0: - */ - assume(bin || (sz == 0)); - - for (i = 0; i < sz; i++) { - OUT_RING(ring, bin[i]); - } + const struct ir3_info *si = &so->info; + enum a4xx_state_block sb = fd4_stage2shadersb(so->type); + enum a4xx_state_src src; + uint32_t i, sz, *bin; + + if (FD_DBG(DIRECT)) { + sz = si->sizedwords; + src = SS4_DIRECT; + bin = fd_bo_map(so->bo); + } else { + sz = 0; + src = SS4_INDIRECT; + bin = NULL; + } + + OUT_PKT3(ring, CP_LOAD_STATE4, 2 + sz); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(src) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen)); + if (bin) { + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER)); + } else { + OUT_RELOC(ring, so->bo, 0, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0); + } + + /* for how clever coverity is, it is sometimes rather dull, and + * doesn't realize that the only case where bin==NULL, sz==0: + */ + assume(bin || (sz == 0)); + + for (i = 0; i < sz; i++) { + OUT_RING(ring, bin[i]); + } } struct stage { - const struct ir3_shader_variant *v; - const struct ir3_info *i; - /* const sizes are in units of 4 * vec4 */ - uint8_t constoff; - uint8_t constlen; - /* instr sizes are in units of 16 instructions */ - uint8_t instroff; - uint8_t instrlen; + const struct ir3_shader_variant *v; + const struct ir3_info *i; + /* const sizes are in units of 4 * vec4 */ + uint8_t constoff; + uint8_t constlen; + /* instr sizes are in units of 16 instructions */ + uint8_t instroff; + uint8_t instrlen; }; -enum { - VS = 0, - FS = 1, - HS = 2, - DS = 3, - GS = 4, - MAX_STAGES -}; +enum { VS = 0, FS = 1, HS = 2, DS = 3, GS = 4, MAX_STAGES }; static void setup_stages(struct fd4_emit *emit, struct stage *s) { - unsigned i; - - s[VS].v = fd4_emit_get_vp(emit); - s[FS].v = fd4_emit_get_fp(emit); - - s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ - - for (i = 0; i < MAX_STAGES; i++) { - if (s[i].v) { - s[i].i = &s[i].v->info; - /* constlen is in units of 4 * vec4: */ - assert(s[i].v->constlen % 4 == 0); - s[i].constlen = s[i].v->constlen / 4; - /* instrlen is already in units of 16 instr.. 
although - * probably we should ditch that and not make the compiler - * care about instruction group size of a3xx vs a4xx - */ - s[i].instrlen = s[i].v->instrlen; - } else { - s[i].i = NULL; - s[i].constlen = 0; - s[i].instrlen = 0; - } - } - - /* NOTE: at least for gles2, blob partitions VS at bottom of const - * space and FS taking entire remaining space. We probably don't - * need to do that the same way, but for now mimic what the blob - * does to make it easier to diff against register values from blob - * - * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders - * is run from external memory. - */ - if ((s[VS].instrlen + s[FS].instrlen) > 64) { - /* prioritize FS for internal memory: */ - if (s[FS].instrlen < 64) { - /* if FS can fit, kick VS out to external memory: */ - s[VS].instrlen = 0; - } else if (s[VS].instrlen < 64) { - /* otherwise if VS can fit, kick out FS: */ - s[FS].instrlen = 0; - } else { - /* neither can fit, run both from external memory: */ - s[VS].instrlen = 0; - s[FS].instrlen = 0; - } - } - s[VS].constlen = 66; - s[FS].constlen = 128 - s[VS].constlen; - s[VS].instroff = 0; - s[VS].constoff = 0; - s[FS].instroff = 64 - s[FS].instrlen; - s[FS].constoff = s[VS].constlen; - s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff; - s[HS].constoff = s[DS].constoff = s[GS].constoff = s[FS].constoff; + unsigned i; + + s[VS].v = fd4_emit_get_vp(emit); + s[FS].v = fd4_emit_get_fp(emit); + + s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ + + for (i = 0; i < MAX_STAGES; i++) { + if (s[i].v) { + s[i].i = &s[i].v->info; + /* constlen is in units of 4 * vec4: */ + assert(s[i].v->constlen % 4 == 0); + s[i].constlen = s[i].v->constlen / 4; + /* instrlen is already in units of 16 instr.. although + * probably we should ditch that and not make the compiler + * care about instruction group size of a3xx vs a4xx + */ + s[i].instrlen = s[i].v->instrlen; + } else { + s[i].i = NULL; + s[i].constlen = 0; + s[i].instrlen = 0; + } + } + + /* NOTE: at least for gles2, blob partitions VS at bottom of const + * space and FS taking entire remaining space. We probably don't + * need to do that the same way, but for now mimic what the blob + * does to make it easier to diff against register values from blob + * + * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders + * is run from external memory. 
+ */ + if ((s[VS].instrlen + s[FS].instrlen) > 64) { + /* prioritize FS for internal memory: */ + if (s[FS].instrlen < 64) { + /* if FS can fit, kick VS out to external memory: */ + s[VS].instrlen = 0; + } else if (s[VS].instrlen < 64) { + /* otherwise if VS can fit, kick out FS: */ + s[FS].instrlen = 0; + } else { + /* neither can fit, run both from external memory: */ + s[VS].instrlen = 0; + s[FS].instrlen = 0; + } + } + s[VS].constlen = 66; + s[FS].constlen = 128 - s[VS].constlen; + s[VS].instroff = 0; + s[VS].constoff = 0; + s[FS].instroff = 64 - s[FS].instrlen; + s[FS].constoff = s[VS].constlen; + s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff; + s[HS].constoff = s[DS].constoff = s[GS].constoff = s[FS].constoff; } void -fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, - int nr, struct pipe_surface **bufs) +fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, int nr, + struct pipe_surface **bufs) { - struct stage s[MAX_STAGES]; - uint32_t pos_regid, posz_regid, psize_regid, color_regid[8]; - uint32_t face_regid, coord_regid, zwcoord_regid, ij_regid[IJ_COUNT]; - enum a3xx_threadsize fssz; - int constmode; - int i, j; - - debug_assert(nr <= ARRAY_SIZE(color_regid)); - - if (emit->binning_pass) - nr = 0; - - setup_stages(emit, s); - - fssz = (s[FS].i->double_threadsize) ? FOUR_QUADS : TWO_QUADS; - - /* blob seems to always use constmode currently: */ - constmode = 1; - - pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS); - if (pos_regid == regid(63, 0)) { - /* hw dislikes when there is no position output, which can - * happen for transform-feedback vertex shaders. Just tell - * the hw to use r0.x, with whatever random value is there: - */ - pos_regid = regid(0, 0); - } - posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH); - psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ); - if (s[FS].v->color0_mrt) { - color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = - color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = - ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR); - } else { - color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0); - color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1); - color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2); - color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3); - color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4); - color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5); - color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6); - color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7); - } - - face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); - coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); - zwcoord_regid = (coord_regid == regid(63,0)) ? regid(63,0) : (coord_regid + 2); - for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) - ij_regid[i] = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - - /* we could probably divide this up into things that need to be - * emitted if frag-prog is dirty vs if vert-prog is dirty.. 
- */ - - OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1); - OUT_RING(ring, 0x00000003); - - OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5); - OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) | - A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | - A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | - /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe - * flush some caches? I think we only need to set those - * bits if we have updated const or shader.. - */ - A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART | - A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); - OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | - A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | - A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) | - A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid)); - OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) | - 0x3f3f000 | /* XXX */ - A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid)); - /* XXX left out centroid/sample for now */ - OUT_RING(ring, A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | - A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | - A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | - A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); - OUT_RING(ring, 0x00fcfcfc); /* XXX HLSQ_CONTROL_4 */ - - OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5); - OUT_RING(ring, A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) | - A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) | - A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) | - A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff)); - OUT_RING(ring, A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) | - A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) | - A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) | - A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff)); - OUT_RING(ring, A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) | - A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) | - A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) | - A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff)); - OUT_RING(ring, A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) | - A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) | - A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) | - A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff)); - OUT_RING(ring, A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) | - A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) | - A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) | - A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff)); - - OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1); - OUT_RING(ring, 0x140010 | /* XXX */ - COND(emit->binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS)); - - OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1); - OUT_RING(ring, 0x7f | /* XXX */ - COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) | - COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) | - COND(s[VS].instrlen && s[FS].instrlen, - A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER)); - - OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1); - OUT_RING(ring, s[VS].v->instrlen); /* SP_VS_LENGTH_REG */ - - OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3); - OUT_RING(ring, A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | - A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) | - A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) | - A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | - A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | - 
A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | - COND(s[VS].v->need_pixlod, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE)); - OUT_RING(ring, A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) | - A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in)); - OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | - A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | - A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in)); - - struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, s[VS].v, s[FS].v, false); - - for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { - uint32_t reg = 0; - - OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1); - - reg |= A4XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); - reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); - j++; - - reg |= A4XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); - reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); - j++; - - OUT_RING(ring, reg); - } - - for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { - uint32_t reg = 0; - - OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1); - - reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8); - reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8); - reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8); - reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8); - - OUT_RING(ring, reg); - } - - OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) | - A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff)); - OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - - if (emit->binning_pass) { - OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */ - - OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | - A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) | - A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) | - A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | - 0x80000000); - - OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | - A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); - OUT_RING(ring, 0x00000000); - } else { - OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ - - OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | - A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | - A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | - A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(s[FS].v->need_pixlod, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | - 0x80000000 | /* XXX */ - COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | - COND(s[FS].v->fragcoord_compmask != 0, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); - - OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | - A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); - OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ - } - - 
OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1); - OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) | - A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff)); - - OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1); - OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) | - A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff)); - - OUT_PKT0(ring, REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1); - OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) | - A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff)); - - OUT_PKT0(ring, REG_A4XX_GRAS_CNTL, 1); - OUT_RING(ring, - CONDREG(face_regid, A4XX_GRAS_CNTL_IJ_PERSP) | - CONDREG(zwcoord_regid, A4XX_GRAS_CNTL_IJ_PERSP) | - CONDREG(ij_regid[IJ_PERSP_PIXEL], A4XX_GRAS_CNTL_IJ_PERSP) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_GRAS_CNTL_IJ_LINEAR) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A4XX_GRAS_CNTL_IJ_PERSP)); - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1); - OUT_RING(ring, A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) | - CONDREG(ij_regid[IJ_PERSP_PIXEL], A4XX_RB_RENDER_CONTROL2_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A4XX_RB_RENDER_CONTROL2_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_RB_RENDER_CONTROL2_SIZE) | - COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) | - COND(s[FS].v->fragcoord_compmask != 0, - A4XX_RB_RENDER_CONTROL2_COORD_MASK(s[FS].v->fragcoord_compmask))); - - OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(nr) | - COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); - - OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(nr) | - COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | - A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); - - OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8); - for (i = 0; i < 8; i++) { - enum a4xx_color_fmt format = 0; - bool srgb = false; - if (i < nr) { - format = fd4_emit_format(bufs[i]); - if (bufs[i] && !emit->no_decode_srgb) - srgb = util_format_is_srgb(bufs[i]->format); - } - OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) | - A4XX_SP_FS_MRT_REG_MRTFORMAT(format) | - COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) | - COND(color_regid[i] & HALF_REG_ID, - A4XX_SP_FS_MRT_REG_HALF_PRECISION)); - } - - if (emit->binning_pass) { - OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); - OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) | - 0x40000000 | /* XXX */ - COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE)); - OUT_RING(ring, 0x00000000); - } else { - uint32_t vinterp[8], vpsrepl[8]; - - memset(vinterp, 0, sizeof(vinterp)); - memset(vpsrepl, 0, sizeof(vpsrepl)); - - /* looks like we need to do int varyings in the frag - * shader on a4xx (no flatshad reg? 
or a420.0 bug?): - * - * (sy)(ss)nop - * (sy)ldlv.u32 r0.x,l[r0.x], 1 - * ldlv.u32 r0.y,l[r0.x+1], 1 - * (ss)bary.f (ei)r63.x, 0, r0.x - * (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x - * (rpt5)nop - * sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0 - * - * Possibly on later a4xx variants we'll be able to use - * something like the code below instead of workaround - * in the shader: - */ - /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ - for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) { - /* NOTE: varyings are packed, so if compmask is 0xb - * then first, third, and fourth component occupy - * three consecutive varying slots: - */ - unsigned compmask = s[FS].v->inputs[j].compmask; - - uint32_t inloc = s[FS].v->inputs[j].inloc; - - if (s[FS].v->inputs[j].flat || - (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { - uint32_t loc = inloc; - - for (i = 0; i < 4; i++) { - if (compmask & (1 << i)) { - vinterp[loc / 16] |= 1 << ((loc % 16) * 2); - //flatshade[loc / 32] |= 1 << (loc % 32); - loc++; - } - } - } - - bool coord_mode = emit->sprite_coord_mode; - if (ir3_point_sprite(s[FS].v, j, emit->sprite_coord_enable, &coord_mode)) { - /* mask is two 2-bit fields, where: - * '01' -> S - * '10' -> T - * '11' -> 1 - T (flip mode) - */ - unsigned mask = coord_mode ? 0b1101 : 0b1001; - uint32_t loc = inloc; - if (compmask & 0x1) { - vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x2) { - vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x4) { - /* .z <- 0.0f */ - vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x8) { - /* .w <- 1.0f */ - vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); - loc++; - } - } - } - - OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); - OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) | - A4XX_VPC_ATTR_THRDASSIGN(1) | - COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) | - 0x40000000 | /* XXX */ - COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE)); - OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) | - A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in)); - - OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8); - for (i = 0; i < 8; i++) - OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ - - OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8); - for (i = 0; i < 8; i++) - OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ - } - - if (s[VS].instrlen) - emit_shader(ring, s[VS].v); - - if (!emit->binning_pass) - if (s[FS].instrlen) - emit_shader(ring, s[FS].v); + struct stage s[MAX_STAGES]; + uint32_t pos_regid, posz_regid, psize_regid, color_regid[8]; + uint32_t face_regid, coord_regid, zwcoord_regid, ij_regid[IJ_COUNT]; + enum a3xx_threadsize fssz; + int constmode; + int i, j; + + debug_assert(nr <= ARRAY_SIZE(color_regid)); + + if (emit->binning_pass) + nr = 0; + + setup_stages(emit, s); + + fssz = (s[FS].i->double_threadsize) ? FOUR_QUADS : TWO_QUADS; + + /* blob seems to always use constmode currently: */ + constmode = 1; + + pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS); + if (pos_regid == regid(63, 0)) { + /* hw dislikes when there is no position output, which can + * happen for transform-feedback vertex shaders. 
Just tell + * the hw to use r0.x, with whatever random value is there: + */ + pos_regid = regid(0, 0); + } + posz_regid = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH); + psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ); + if (s[FS].v->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = + ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR); + } else { + color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0); + color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1); + color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2); + color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3); + color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4); + color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5); + color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6); + color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7); + } + + face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); + coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); + zwcoord_regid = + (coord_regid == regid(63, 0)) ? regid(63, 0) : (coord_regid + 2); + for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) + ij_regid[i] = ir3_find_sysval_regid( + s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); + + /* we could probably divide this up into things that need to be + * emitted if frag-prog is dirty vs if vert-prog is dirty.. + */ + + OUT_PKT0(ring, REG_A4XX_HLSQ_UPDATE_CONTROL, 1); + OUT_RING(ring, 0x00000003); + + OUT_PKT0(ring, REG_A4XX_HLSQ_CONTROL_0_REG, 5); + OUT_RING(ring, A4XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) | + A4XX_HLSQ_CONTROL_0_REG_CONSTMODE(constmode) | + A4XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE | + /* NOTE: I guess SHADERRESTART and CONSTFULLUPDATE maybe + * flush some caches? I think we only need to set those + * bits if we have updated const or shader.. 
+ */ + A4XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART | + A4XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); + OUT_RING(ring, A4XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | + A4XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | + A4XX_HLSQ_CONTROL_1_REG_COORDREGID(coord_regid) | + A4XX_HLSQ_CONTROL_1_REG_ZWCOORDREGID(zwcoord_regid)); + OUT_RING(ring, A4XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(63) | + 0x3f3f000 | /* XXX */ + A4XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid)); + /* XXX left out centroid/sample for now */ + OUT_RING( + ring, + A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | + A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | + A4XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID( + ij_regid[IJ_PERSP_CENTROID]) | + A4XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID( + ij_regid[IJ_LINEAR_CENTROID])); + OUT_RING(ring, 0x00fcfcfc); /* XXX HLSQ_CONTROL_4 */ + + OUT_PKT0(ring, REG_A4XX_HLSQ_VS_CONTROL_REG, 5); + OUT_RING(ring, + A4XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(s[VS].constlen) | + A4XX_HLSQ_VS_CONTROL_REG_CONSTOBJECTOFFSET(s[VS].constoff) | + A4XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(s[VS].instrlen) | + A4XX_HLSQ_VS_CONTROL_REG_SHADEROBJOFFSET(s[VS].instroff)); + OUT_RING(ring, + A4XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(s[FS].constlen) | + A4XX_HLSQ_FS_CONTROL_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(s[FS].instrlen) | + A4XX_HLSQ_FS_CONTROL_REG_SHADEROBJOFFSET(s[FS].instroff)); + OUT_RING(ring, + A4XX_HLSQ_HS_CONTROL_REG_CONSTLENGTH(s[HS].constlen) | + A4XX_HLSQ_HS_CONTROL_REG_CONSTOBJECTOFFSET(s[HS].constoff) | + A4XX_HLSQ_HS_CONTROL_REG_INSTRLENGTH(s[HS].instrlen) | + A4XX_HLSQ_HS_CONTROL_REG_SHADEROBJOFFSET(s[HS].instroff)); + OUT_RING(ring, + A4XX_HLSQ_DS_CONTROL_REG_CONSTLENGTH(s[DS].constlen) | + A4XX_HLSQ_DS_CONTROL_REG_CONSTOBJECTOFFSET(s[DS].constoff) | + A4XX_HLSQ_DS_CONTROL_REG_INSTRLENGTH(s[DS].instrlen) | + A4XX_HLSQ_DS_CONTROL_REG_SHADEROBJOFFSET(s[DS].instroff)); + OUT_RING(ring, + A4XX_HLSQ_GS_CONTROL_REG_CONSTLENGTH(s[GS].constlen) | + A4XX_HLSQ_GS_CONTROL_REG_CONSTOBJECTOFFSET(s[GS].constoff) | + A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(s[GS].instrlen) | + A4XX_HLSQ_GS_CONTROL_REG_SHADEROBJOFFSET(s[GS].instroff)); + + OUT_PKT0(ring, REG_A4XX_SP_SP_CTRL_REG, 1); + OUT_RING(ring, + 0x140010 | /* XXX */ + COND(emit->binning_pass, A4XX_SP_SP_CTRL_REG_BINNING_PASS)); + + OUT_PKT0(ring, REG_A4XX_SP_INSTR_CACHE_CTRL, 1); + OUT_RING(ring, 0x7f | /* XXX */ + COND(s[VS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_VS_BUFFER) | + COND(s[FS].instrlen, A4XX_SP_INSTR_CACHE_CTRL_FS_BUFFER) | + COND(s[VS].instrlen && s[FS].instrlen, + A4XX_SP_INSTR_CACHE_CTRL_INSTR_BUFFER)); + + OUT_PKT0(ring, REG_A4XX_SP_VS_LENGTH_REG, 1); + OUT_RING(ring, s[VS].v->instrlen); /* SP_VS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_VS_CTRL_REG0, 3); + OUT_RING( + ring, + A4XX_SP_VS_CTRL_REG0_THREADMODE(MULTI) | + A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) | + A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) | + A4XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | + A4XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | + A4XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | + COND(s[VS].v->need_pixlod, A4XX_SP_VS_CTRL_REG0_PIXLODENABLE)); + OUT_RING(ring, + A4XX_SP_VS_CTRL_REG1_CONSTLENGTH(s[VS].constlen) | + A4XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(s[VS].v->total_in)); + OUT_RING(ring, A4XX_SP_VS_PARAM_REG_POSREGID(pos_regid) | + A4XX_SP_VS_PARAM_REG_PSIZEREGID(psize_regid) | + A4XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(s[FS].v->varying_in)); + + struct ir3_shader_linkage l = {0}; + 
ir3_link_shaders(&l, s[VS].v, s[FS].v, false); + + for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { + uint32_t reg = 0; + + OUT_PKT0(ring, REG_A4XX_SP_VS_OUT_REG(i), 1); + + reg |= A4XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); + reg |= A4XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); + j++; + + reg |= A4XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); + reg |= A4XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); + j++; + + OUT_RING(ring, reg); + } + + for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { + uint32_t reg = 0; + + OUT_PKT0(ring, REG_A4XX_SP_VS_VPC_DST_REG(i), 1); + + reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc + 8); + reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc + 8); + reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc + 8); + reg |= A4XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc + 8); + + OUT_RING(ring, reg); + } + + OUT_PKT0(ring, REG_A4XX_SP_VS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[VS].constoff) | + A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff)); + OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ + + if (emit->binning_pass) { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, + A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE); + OUT_RING(ring, + A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | 0x80000000); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, + A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); + OUT_RING(ring, 0x00000000); + } else { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING( + ring, + A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(s[FS].v->need_pixlod, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); + OUT_RING(ring, + A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000 | /* XXX */ + COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | + COND(s[FS].v->fragcoord_compmask != 0, + A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, + A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); + OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } + + OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1); + OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) | + A4XX_SP_HS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[HS].instroff)); + + OUT_PKT0(ring, REG_A4XX_SP_DS_OBJ_OFFSET_REG, 1); + OUT_RING(ring, A4XX_SP_DS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[DS].constoff) | + A4XX_SP_DS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[DS].instroff)); + + OUT_PKT0(ring, 
REG_A4XX_SP_GS_OBJ_OFFSET_REG, 1); + OUT_RING(ring, A4XX_SP_GS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[GS].constoff) | + A4XX_SP_GS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[GS].instroff)); + + OUT_PKT0(ring, REG_A4XX_GRAS_CNTL, 1); + OUT_RING(ring, + CONDREG(face_regid, A4XX_GRAS_CNTL_IJ_PERSP) | + CONDREG(zwcoord_regid, A4XX_GRAS_CNTL_IJ_PERSP) | + CONDREG(ij_regid[IJ_PERSP_PIXEL], A4XX_GRAS_CNTL_IJ_PERSP) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_GRAS_CNTL_IJ_LINEAR) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], A4XX_GRAS_CNTL_IJ_PERSP)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL2, 1); + OUT_RING( + ring, + A4XX_RB_RENDER_CONTROL2_MSAA_SAMPLES(0) | + CONDREG(ij_regid[IJ_PERSP_PIXEL], + A4XX_RB_RENDER_CONTROL2_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], + A4XX_RB_RENDER_CONTROL2_IJ_PERSP_CENTROID) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A4XX_RB_RENDER_CONTROL2_SIZE) | + COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) | + COND(s[FS].v->fragcoord_compmask != 0, + A4XX_RB_RENDER_CONTROL2_COORD_MASK(s[FS].v->fragcoord_compmask))); + + OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); + OUT_RING(ring, + A4XX_RB_FS_OUTPUT_REG_MRT(nr) | + COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); + OUT_RING(ring, + A4XX_SP_FS_OUTPUT_REG_MRT(nr) | + COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | + A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); + + OUT_PKT0(ring, REG_A4XX_SP_FS_MRT_REG(0), 8); + for (i = 0; i < 8; i++) { + enum a4xx_color_fmt format = 0; + bool srgb = false; + if (i < nr) { + format = fd4_emit_format(bufs[i]); + if (bufs[i] && !emit->no_decode_srgb) + srgb = util_format_is_srgb(bufs[i]->format); + } + OUT_RING(ring, A4XX_SP_FS_MRT_REG_REGID(color_regid[i]) | + A4XX_SP_FS_MRT_REG_MRTFORMAT(format) | + COND(srgb, A4XX_SP_FS_MRT_REG_COLOR_SRGB) | + COND(color_regid[i] & HALF_REG_ID, + A4XX_SP_FS_MRT_REG_HALF_PRECISION)); + } + + if (emit->binning_pass) { + OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); + OUT_RING(ring, A4XX_VPC_ATTR_THRDASSIGN(1) | 0x40000000 | /* XXX */ + COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE)); + OUT_RING(ring, 0x00000000); + } else { + uint32_t vinterp[8], vpsrepl[8]; + + memset(vinterp, 0, sizeof(vinterp)); + memset(vpsrepl, 0, sizeof(vpsrepl)); + + /* looks like we need to do int varyings in the frag + * shader on a4xx (no flatshad reg? 
or a420.0 bug?): + * + * (sy)(ss)nop + * (sy)ldlv.u32 r0.x,l[r0.x], 1 + * ldlv.u32 r0.y,l[r0.x+1], 1 + * (ss)bary.f (ei)r63.x, 0, r0.x + * (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x + * (rpt5)nop + * sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0 + * + * Possibly on later a4xx variants we'll be able to use + * something like the code below instead of workaround + * in the shader: + */ + /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ + for (j = -1; + (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count;) { + /* NOTE: varyings are packed, so if compmask is 0xb + * then first, third, and fourth component occupy + * three consecutive varying slots: + */ + unsigned compmask = s[FS].v->inputs[j].compmask; + + uint32_t inloc = s[FS].v->inputs[j].inloc; + + if (s[FS].v->inputs[j].flat || + (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { + uint32_t loc = inloc; + + for (i = 0; i < 4; i++) { + if (compmask & (1 << i)) { + vinterp[loc / 16] |= 1 << ((loc % 16) * 2); + // flatshade[loc / 32] |= 1 << (loc % 32); + loc++; + } + } + } + + bool coord_mode = emit->sprite_coord_mode; + if (ir3_point_sprite(s[FS].v, j, emit->sprite_coord_enable, + &coord_mode)) { + /* mask is two 2-bit fields, where: + * '01' -> S + * '10' -> T + * '11' -> 1 - T (flip mode) + */ + unsigned mask = coord_mode ? 0b1101 : 0b1001; + uint32_t loc = inloc; + if (compmask & 0x1) { + vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x2) { + vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x4) { + /* .z <- 0.0f */ + vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x8) { + /* .w <- 1.0f */ + vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); + loc++; + } + } + } + + OUT_PKT0(ring, REG_A4XX_VPC_ATTR, 2); + OUT_RING(ring, A4XX_VPC_ATTR_TOTALATTR(s[FS].v->total_in) | + A4XX_VPC_ATTR_THRDASSIGN(1) | + COND(s[FS].v->total_in > 0, A4XX_VPC_ATTR_ENABLE) | + 0x40000000 | /* XXX */ + COND(s[VS].v->writes_psize, A4XX_VPC_ATTR_PSIZE)); + OUT_RING(ring, A4XX_VPC_PACK_NUMFPNONPOSVAR(s[FS].v->total_in) | + A4XX_VPC_PACK_NUMNONPOSVSVAR(s[FS].v->total_in)); + + OUT_PKT0(ring, REG_A4XX_VPC_VARYING_INTERP_MODE(0), 8); + for (i = 0; i < 8; i++) + OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ + + OUT_PKT0(ring, REG_A4XX_VPC_VARYING_PS_REPL_MODE(0), 8); + for (i = 0; i < 8; i++) + OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ + } + + if (s[VS].instrlen) + emit_shader(ring, s[VS].v); + + if (!emit->binning_pass) + if (s[FS].instrlen) + emit_shader(ring, s[FS].v); } static struct ir3_program_state * fd4_program_create(void *data, struct ir3_shader_variant *bs, - struct ir3_shader_variant *vs, - struct ir3_shader_variant *hs, - struct ir3_shader_variant *ds, - struct ir3_shader_variant *gs, - struct ir3_shader_variant *fs, - const struct ir3_shader_key *key) - in_dt + struct ir3_shader_variant *vs, struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, struct ir3_shader_variant *gs, + struct ir3_shader_variant *fs, + const struct ir3_shader_key *key) in_dt { - struct fd_context *ctx = fd_context(data); - struct fd4_program_state *state = CALLOC_STRUCT(fd4_program_state); + struct fd_context *ctx = fd_context(data); + struct fd4_program_state *state = CALLOC_STRUCT(fd4_program_state); - tc_assert_driver_thread(ctx->tc); + tc_assert_driver_thread(ctx->tc); - state->bs = bs; - state->vs = vs; - state->fs = fs; + state->bs = bs; + state->vs = vs; + state->fs = fs; - return &state->base; + return 
&state->base; } static void fd4_program_destroy(void *data, struct ir3_program_state *state) { - struct fd4_program_state *so = fd4_program_state(state); - free(so); + struct fd4_program_state *so = fd4_program_state(state); + free(so); } static const struct ir3_cache_funcs cache_funcs = { - .create_state = fd4_program_create, - .destroy_state = fd4_program_destroy, + .create_state = fd4_program_create, + .destroy_state = fd4_program_destroy, }; void fd4_prog_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); - ir3_prog_init(pctx); - fd_prog_init(pctx); + ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); + ir3_prog_init(pctx); + fd_prog_init(pctx); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h index 790adc9..7fcb0c7 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h @@ -36,20 +36,20 @@ struct fd4_emit; struct fd4_program_state { - struct ir3_program_state base; - struct ir3_shader_variant *bs; /* VS for when emit->binning */ - struct ir3_shader_variant *vs; - struct ir3_shader_variant *fs; /* FS for when !emit->binning */ + struct ir3_program_state base; + struct ir3_shader_variant *bs; /* VS for when emit->binning */ + struct ir3_shader_variant *vs; + struct ir3_shader_variant *fs; /* FS for when !emit->binning */ }; static inline struct fd4_program_state * fd4_program_state(struct ir3_program_state *state) { - return (struct fd4_program_state *)state; + return (struct fd4_program_state *)state; } -void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, - int nr, struct pipe_surface **bufs); +void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, int nr, + struct pipe_surface **bufs); void fd4_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index 0802e58..5545572 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -24,18 +24,17 @@ * Rob Clark */ -#include "freedreno_query_hw.h" #include "freedreno_context.h" +#include "freedreno_query_hw.h" #include "freedreno_util.h" -#include "fd4_query.h" #include "fd4_context.h" #include "fd4_draw.h" #include "fd4_format.h" - +#include "fd4_query.h" struct fd_rb_samp_ctrs { - uint64_t ctr[16]; + uint64_t ctr[16]; }; /* @@ -48,57 +47,56 @@ struct fd_rb_samp_ctrs { static struct fd_hw_sample * occlusion_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_hw_sample *samp = - fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); - - /* low bits of sample addr should be zero (since they are control - * flags in RB_SAMPLE_COUNT_CONTROL): - */ - debug_assert((samp->offset & 0x3) == 0); - - /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of - * HW_QUERY_BASE_REG register: - */ - OUT_PKT3(ring, CP_SET_CONSTANT, 3); - OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); - OUT_RING(ring, HW_QUERY_BASE_REG); - OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | - samp->offset); - - OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); - OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, - INDEX4_SIZE_32_BIT, USE_VISIBILITY)); - OUT_RING(ring, 1); /* NumInstances */ - OUT_RING(ring, 0); /* NumIndices */ - - fd_event_write(batch, ring, ZPASS_DONE); 
- - return samp; + struct fd_hw_sample *samp = + fd_hw_sample_init(batch, sizeof(struct fd_rb_samp_ctrs)); + + /* low bits of sample addr should be zero (since they are control + * flags in RB_SAMPLE_COUNT_CONTROL): + */ + debug_assert((samp->offset & 0x3) == 0); + + /* Set RB_SAMPLE_COUNT_ADDR to samp->offset plus value of + * HW_QUERY_BASE_REG register: + */ + OUT_PKT3(ring, CP_SET_CONSTANT, 3); + OUT_RING(ring, CP_REG(REG_A4XX_RB_SAMPLE_COUNT_CONTROL) | 0x80000000); + OUT_RING(ring, HW_QUERY_BASE_REG); + OUT_RING(ring, A4XX_RB_SAMPLE_COUNT_CONTROL_COPY | samp->offset); + + OUT_PKT3(ring, CP_DRAW_INDX_OFFSET, 3); + OUT_RING(ring, DRAW4(DI_PT_POINTLIST_PSIZE, DI_SRC_SEL_AUTO_INDEX, + INDEX4_SIZE_32_BIT, USE_VISIBILITY)); + OUT_RING(ring, 1); /* NumInstances */ + OUT_RING(ring, 0); /* NumIndices */ + + fd_event_write(batch, ring, ZPASS_DONE); + + return samp; } static uint64_t count_samples(const struct fd_rb_samp_ctrs *start, - const struct fd_rb_samp_ctrs *end) + const struct fd_rb_samp_ctrs *end) { - return end->ctr[0] - start->ctr[0]; + return end->ctr[0] - start->ctr[0]; } static void -occlusion_counter_accumulate_result(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result) +occlusion_counter_accumulate_result(struct fd_context *ctx, const void *start, + const void *end, + union pipe_query_result *result) { - uint64_t n = count_samples(start, end); - result->u64 += n; + uint64_t n = count_samples(start, end); + result->u64 += n; } static void -occlusion_predicate_accumulate_result(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result) +occlusion_predicate_accumulate_result(struct fd_context *ctx, const void *start, + const void *end, + union pipe_query_result *result) { - uint64_t n = count_samples(start, end); - result->b |= (n > 0); + uint64_t n = count_samples(start, end); + result->b |= (n > 0); } /* @@ -109,161 +107,159 @@ occlusion_predicate_accumulate_result(struct fd_context *ctx, */ static void -time_elapsed_enable(struct fd_context *ctx, struct fd_ringbuffer *ring) - assert_dt +time_elapsed_enable(struct fd_context *ctx, + struct fd_ringbuffer *ring) assert_dt { - /* Right now, the assignment of countable to counter register is - * just hard coded. If we start exposing more countables than we - * have counters, we will need to be more clever. - */ - struct fd_batch *batch = fd_context_batch_locked(ctx); - fd_wfi(batch, ring); - OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1); - OUT_RING(ring, CP_ALWAYS_COUNT); - fd_batch_unlock_submit(batch); - fd_batch_reference(&batch, NULL); + /* Right now, the assignment of countable to counter register is + * just hard coded. If we start exposing more countables than we + * have counters, we will need to be more clever. 
+ */ + struct fd_batch *batch = fd_context_batch_locked(ctx); + fd_wfi(batch, ring); + OUT_PKT0(ring, REG_A4XX_CP_PERFCTR_CP_SEL_0, 1); + OUT_RING(ring, CP_ALWAYS_COUNT); + fd_batch_unlock_submit(batch); + fd_batch_reference(&batch, NULL); } static struct fd_hw_sample * -time_elapsed_get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring) - assert_dt +time_elapsed_get_sample(struct fd_batch *batch, + struct fd_ringbuffer *ring) assert_dt { - struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t)); - - /* use unused part of vsc_size_mem as scratch space, to avoid - * extra allocation: - */ - struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem; - const int sample_off = 128; - const int addr_off = sample_off + 8; - - debug_assert(batch->ctx->screen->max_freq > 0); - - /* Basic issue is that we need to read counter value to a relative - * destination (with per-tile offset) rather than absolute dest - * addr. But there is no pm4 packet that can do that. This is - * where it would be *really* nice if we could write our own fw - * since afaict implementing the sort of packet we need would be - * trivial. - * - * Instead, we: - * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer - * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer - * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base - * address to the per-sample offset in the scratch buffer - * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3 - * to CP_ME_NRT_ADDR - * (5) CP_MEM_TO_REG's to copy saved counter value from scratch - * buffer to CP_ME_NRT_DATA to trigger the write out to query - * result buffer - * - * Straightforward, right? - * - * Maybe could swap the order of things in the scratch buffer to - * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one - * shot, but that's really just polishing a turd.. - */ - - fd_wfi(batch, ring); - - /* copy sample counter _LO and _HI to scratch: */ - OUT_PKT3(ring, CP_REG_TO_MEM, 2); - OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | - CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */ - OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); - - /* ok... here we really *would* like to use the CP_SET_CONSTANT - * mode which can add a constant to value in reg2 and write to - * reg1... *but* that only works for banked/context registers, - * and CP_ME_NRT_DATA isn't one of those.. 
so we need to do some - * CP math to the scratch buffer instead: - * - * (note first 8 bytes are counter value, use offset 0x8 for - * address calculation) - */ - - /* per-sample offset to scratch bo: */ - OUT_PKT3(ring, CP_MEM_WRITE, 2); - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); - OUT_RING(ring, samp->offset); - - /* now add to that the per-tile base: */ - OUT_PKT3(ring, CP_REG_TO_MEM, 2); - OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | - CP_REG_TO_MEM_0_ACCUMULATE | - CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */ - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); - - /* now copy that back to CP_ME_NRT_ADDR: */ - OUT_PKT3(ring, CP_MEM_TO_REG, 2); - OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR); - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); - - /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA - * to trigger the write to result buffer - */ - OUT_PKT3(ring, CP_MEM_TO_REG, 2); - OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); - OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); - - /* and again to get the value of the _HI reg from scratch: */ - OUT_PKT3(ring, CP_MEM_TO_REG, 2); - OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); - OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0); - - /* Sigh.. */ - - return samp; + struct fd_hw_sample *samp = fd_hw_sample_init(batch, sizeof(uint64_t)); + + /* use unused part of vsc_size_mem as scratch space, to avoid + * extra allocation: + */ + struct fd_bo *scratch_bo = fd4_context(batch->ctx)->vsc_size_mem; + const int sample_off = 128; + const int addr_off = sample_off + 8; + + debug_assert(batch->ctx->screen->max_freq > 0); + + /* Basic issue is that we need to read counter value to a relative + * destination (with per-tile offset) rather than absolute dest + * addr. But there is no pm4 packet that can do that. This is + * where it would be *really* nice if we could write our own fw + * since afaict implementing the sort of packet we need would be + * trivial. + * + * Instead, we: + * (1) CP_REG_TO_MEM to do a 64b copy of counter to scratch buffer + * (2) CP_MEM_WRITE to write per-sample offset to scratch buffer + * (3) CP_REG_TO_MEM w/ accumulate flag to add the per-tile base + * address to the per-sample offset in the scratch buffer + * (4) CP_MEM_TO_REG to copy resulting address from steps #2 and #3 + * to CP_ME_NRT_ADDR + * (5) CP_MEM_TO_REG's to copy saved counter value from scratch + * buffer to CP_ME_NRT_DATA to trigger the write out to query + * result buffer + * + * Straightforward, right? + * + * Maybe could swap the order of things in the scratch buffer to + * put address first, and copy back to CP_ME_NRT_ADDR+DATA in one + * shot, but that's really just polishing a turd.. + */ + + fd_wfi(batch, ring); + + /* copy sample counter _LO and _HI to scratch: */ + OUT_PKT3(ring, CP_REG_TO_MEM, 2); + OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | + CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_CNT(2)); /* write 2 regs to mem */ + OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); + + /* ok... here we really *would* like to use the CP_SET_CONSTANT + * mode which can add a constant to value in reg2 and write to + * reg1... *but* that only works for banked/context registers, + * and CP_ME_NRT_DATA isn't one of those.. 
so we need to do some + * CP math to the scratch buffer instead: + * + * (note first 8 bytes are counter value, use offset 0x8 for + * address calculation) + */ + + /* per-sample offset to scratch bo: */ + OUT_PKT3(ring, CP_MEM_WRITE, 2); + OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + OUT_RING(ring, samp->offset); + + /* now add to that the per-tile base: */ + OUT_PKT3(ring, CP_REG_TO_MEM, 2); + OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | + CP_REG_TO_MEM_0_ACCUMULATE | + CP_REG_TO_MEM_0_CNT(0)); /* readback 1 regs */ + OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + + /* now copy that back to CP_ME_NRT_ADDR: */ + OUT_PKT3(ring, CP_MEM_TO_REG, 2); + OUT_RING(ring, REG_A4XX_CP_ME_NRT_ADDR); + OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + + /* and finally, copy sample from scratch buffer to CP_ME_NRT_DATA + * to trigger the write to result buffer + */ + OUT_PKT3(ring, CP_MEM_TO_REG, 2); + OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); + OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); + + /* and again to get the value of the _HI reg from scratch: */ + OUT_PKT3(ring, CP_MEM_TO_REG, 2); + OUT_RING(ring, REG_A4XX_CP_ME_NRT_DATA); + OUT_RELOC(ring, scratch_bo, sample_off + 0x4, 0, 0); + + /* Sigh.. */ + + return samp; } static void -time_elapsed_accumulate_result(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result) +time_elapsed_accumulate_result(struct fd_context *ctx, const void *start, + const void *end, union pipe_query_result *result) { - uint64_t n = *(uint64_t *)end - *(uint64_t *)start; - /* max_freq is in Hz, convert cycle count to ns: */ - result->u64 += n * 1000000000 / ctx->screen->max_freq; + uint64_t n = *(uint64_t *)end - *(uint64_t *)start; + /* max_freq is in Hz, convert cycle count to ns: */ + result->u64 += n * 1000000000 / ctx->screen->max_freq; } static void -timestamp_accumulate_result(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result) +timestamp_accumulate_result(struct fd_context *ctx, const void *start, + const void *end, union pipe_query_result *result) { - /* just return the value from fist tile: */ - if (result->u64 != 0) - return; - uint64_t n = *(uint64_t *)start; - /* max_freq is in Hz, convert cycle count to ns: */ - result->u64 = n * 1000000000 / ctx->screen->max_freq; + /* just return the value from fist tile: */ + if (result->u64 != 0) + return; + uint64_t n = *(uint64_t *)start; + /* max_freq is in Hz, convert cycle count to ns: */ + result->u64 = n * 1000000000 / ctx->screen->max_freq; } static const struct fd_hw_sample_provider occlusion_counter = { - .query_type = PIPE_QUERY_OCCLUSION_COUNTER, - .get_sample = occlusion_get_sample, - .accumulate_result = occlusion_counter_accumulate_result, + .query_type = PIPE_QUERY_OCCLUSION_COUNTER, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_counter_accumulate_result, }; static const struct fd_hw_sample_provider occlusion_predicate = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, - .get_sample = occlusion_get_sample, - .accumulate_result = occlusion_predicate_accumulate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_predicate_accumulate_result, }; static const struct fd_hw_sample_provider occlusion_predicate_conservative = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, - .get_sample = occlusion_get_sample, - .accumulate_result = occlusion_predicate_accumulate_result, + .query_type = 
PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, + .get_sample = occlusion_get_sample, + .accumulate_result = occlusion_predicate_accumulate_result, }; static const struct fd_hw_sample_provider time_elapsed = { - .query_type = PIPE_QUERY_TIME_ELAPSED, - .always = true, - .enable = time_elapsed_enable, - .get_sample = time_elapsed_get_sample, - .accumulate_result = time_elapsed_accumulate_result, + .query_type = PIPE_QUERY_TIME_ELAPSED, + .always = true, + .enable = time_elapsed_enable, + .get_sample = time_elapsed_get_sample, + .accumulate_result = time_elapsed_accumulate_result, }; /* NOTE: timestamp query isn't going to give terribly sensible results @@ -273,26 +269,26 @@ static const struct fd_hw_sample_provider time_elapsed = { * kind of good enough. */ static const struct fd_hw_sample_provider timestamp = { - .query_type = PIPE_QUERY_TIMESTAMP, - .always = true, - .enable = time_elapsed_enable, - .get_sample = time_elapsed_get_sample, - .accumulate_result = timestamp_accumulate_result, + .query_type = PIPE_QUERY_TIMESTAMP, + .always = true, + .enable = time_elapsed_enable, + .get_sample = time_elapsed_get_sample, + .accumulate_result = timestamp_accumulate_result, }; -void fd4_query_context_init(struct pipe_context *pctx) - disable_thread_safety_analysis +void +fd4_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - - ctx->create_query = fd_hw_create_query; - ctx->query_prepare = fd_hw_query_prepare; - ctx->query_prepare_tile = fd_hw_query_prepare_tile; - ctx->query_update_batch = fd_hw_query_update_batch; - - fd_hw_query_register_provider(pctx, &occlusion_counter); - fd_hw_query_register_provider(pctx, &occlusion_predicate); - fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative); - fd_hw_query_register_provider(pctx, &time_elapsed); - fd_hw_query_register_provider(pctx, ×tamp); + struct fd_context *ctx = fd_context(pctx); + + ctx->create_query = fd_hw_create_query; + ctx->query_prepare = fd_hw_query_prepare; + ctx->query_prepare_tile = fd_hw_query_prepare_tile; + ctx->query_update_batch = fd_hw_query_update_batch; + + fd_hw_query_register_provider(pctx, &occlusion_counter); + fd_hw_query_register_provider(pctx, &occlusion_predicate); + fd_hw_query_register_provider(pctx, &occlusion_predicate_conservative); + fd_hw_query_register_provider(pctx, &time_elapsed); + fd_hw_query_register_provider(pctx, ×tamp); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c index 8d674de..7212157 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c @@ -24,84 +24,83 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd4_rasterizer.h" #include "fd4_context.h" #include "fd4_format.h" +#include "fd4_rasterizer.h" void * fd4_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) + const struct pipe_rasterizer_state *cso) { - struct fd4_rasterizer_stateobj *so; - float psize_min, psize_max; + struct fd4_rasterizer_stateobj *so; + float psize_min, psize_max; - so = CALLOC_STRUCT(fd4_rasterizer_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd4_rasterizer_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - if (cso->point_size_per_vertex) { - psize_min = util_get_min_point_size(cso); - psize_max = 4092; - } else { - /* Force 
the point size to be as if the vertex output was disabled. */ - psize_min = cso->point_size; - psize_max = cso->point_size; - } + if (cso->point_size_per_vertex) { + psize_min = util_get_min_point_size(cso); + psize_max = 4092; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = cso->point_size; + psize_max = cso->point_size; + } -/* - if (cso->line_stipple_enable) { - ??? TODO line stipple - } - TODO cso->half_pixel_center - if (cso->multisample) - TODO -*/ - so->gras_cl_clip_cntl = 0x80000; /* ??? */ - so->gras_su_point_minmax = - A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | - A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); - so->gras_su_point_size = A4XX_GRAS_SU_POINT_SIZE(cso->point_size); - so->gras_su_poly_offset_scale = - A4XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); - so->gras_su_poly_offset_offset = - A4XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f); - so->gras_su_poly_offset_clamp = - A4XX_GRAS_SU_POLY_OFFSET_CLAMP(cso->offset_clamp); + /* + if (cso->line_stipple_enable) { + ??? TODO line stipple + } + TODO cso->half_pixel_center + if (cso->multisample) + TODO + */ + so->gras_cl_clip_cntl = 0x80000; /* ??? */ + so->gras_su_point_minmax = A4XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | + A4XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); + so->gras_su_point_size = A4XX_GRAS_SU_POINT_SIZE(cso->point_size); + so->gras_su_poly_offset_scale = + A4XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); + so->gras_su_poly_offset_offset = + A4XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units * 2.0f); + so->gras_su_poly_offset_clamp = + A4XX_GRAS_SU_POLY_OFFSET_CLAMP(cso->offset_clamp); - so->gras_su_mode_control = - A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width/2.0); - so->pc_prim_vtx_cntl2 = - A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) | - A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); + so->gras_su_mode_control = + A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(cso->line_width / 2.0); + so->pc_prim_vtx_cntl2 = A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE( + fd_polygon_mode(cso->fill_front)) | + A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE( + fd_polygon_mode(cso->fill_back)); - if (cso->fill_front != PIPE_POLYGON_MODE_FILL || - cso->fill_back != PIPE_POLYGON_MODE_FILL) - so->pc_prim_vtx_cntl2 |= A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE; + if (cso->fill_front != PIPE_POLYGON_MODE_FILL || + cso->fill_back != PIPE_POLYGON_MODE_FILL) + so->pc_prim_vtx_cntl2 |= A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_ENABLE; - if (cso->cull_face & PIPE_FACE_FRONT) - so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_CULL_FRONT; - if (cso->cull_face & PIPE_FACE_BACK) - so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_CULL_BACK; - if (!cso->front_ccw) - so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_FRONT_CW; - if (!cso->flatshade_first) - so->pc_prim_vtx_cntl |= A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST; + if (cso->cull_face & PIPE_FACE_FRONT) + so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_CULL_FRONT; + if (cso->cull_face & PIPE_FACE_BACK) + so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_CULL_BACK; + if (!cso->front_ccw) + so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_FRONT_CW; + if (!cso->flatshade_first) + so->pc_prim_vtx_cntl |= A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST; - if (cso->offset_tri) - so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET; + if (cso->offset_tri) + so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET; - if (!cso->depth_clip_near) - 
so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE | - A4XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE; - if (cso->clip_halfz) - so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z; + if (!cso->depth_clip_near) + so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE | + A4XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE; + if (cso->clip_halfz) + so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z; - return so; + return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h index b9019c3..300e1a3 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.h @@ -27,30 +27,30 @@ #ifndef FD4_RASTERIZER_H_ #define FD4_RASTERIZER_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" struct fd4_rasterizer_stateobj { - struct pipe_rasterizer_state base; - uint32_t gras_su_point_minmax; - uint32_t gras_su_point_size; - uint32_t gras_su_poly_offset_scale; - uint32_t gras_su_poly_offset_offset; - uint32_t gras_su_poly_offset_clamp; + struct pipe_rasterizer_state base; + uint32_t gras_su_point_minmax; + uint32_t gras_su_point_size; + uint32_t gras_su_poly_offset_scale; + uint32_t gras_su_poly_offset_offset; + uint32_t gras_su_poly_offset_clamp; - uint32_t gras_su_mode_control; - uint32_t gras_cl_clip_cntl; - uint32_t pc_prim_vtx_cntl; - uint32_t pc_prim_vtx_cntl2; + uint32_t gras_su_mode_control; + uint32_t gras_cl_clip_cntl; + uint32_t pc_prim_vtx_cntl; + uint32_t pc_prim_vtx_cntl2; }; static inline struct fd4_rasterizer_stateobj * fd4_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { - return (struct fd4_rasterizer_stateobj *)rast; + return (struct fd4_rasterizer_stateobj *)rast; } -void * fd4_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso); +void *fd4_rasterizer_state_create(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso); #endif /* FD4_RASTERIZER_H_ */ diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_resource.c b/src/gallium/drivers/freedreno/a4xx/fd4_resource.c index b5b4c0c..782031b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_resource.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_resource.c @@ -30,54 +30,54 @@ uint32_t fd4_setup_slices(struct fd_resource *rsc) { - struct pipe_resource *prsc = &rsc->b.b; - enum pipe_format format = prsc->format; - uint32_t level, size = 0; - uint32_t width = prsc->width0; - uint32_t height = prsc->height0; - uint32_t depth = prsc->depth0; - /* in layer_first layout, the level (slice) contains just one - * layer (since in fact the layer contains the slices) - */ - uint32_t layers_in_level, alignment; + struct pipe_resource *prsc = &rsc->b.b; + enum pipe_format format = prsc->format; + uint32_t level, size = 0; + uint32_t width = prsc->width0; + uint32_t height = prsc->height0; + uint32_t depth = prsc->depth0; + /* in layer_first layout, the level (slice) contains just one + * layer (since in fact the layer contains the slices) + */ + uint32_t layers_in_level, alignment; - if (prsc->target == PIPE_TEXTURE_3D) { - rsc->layout.layer_first = false; - layers_in_level = prsc->array_size; - alignment = 4096; - } else { - rsc->layout.layer_first = true; - layers_in_level = 1; - alignment = 1; - } + if (prsc->target == PIPE_TEXTURE_3D) { + rsc->layout.layer_first = false; + layers_in_level = prsc->array_size; + alignment = 4096; + } else { + rsc->layout.layer_first = true; + layers_in_level = 
1; + alignment = 1; + } - /* 32 pixel alignment */ - fdl_set_pitchalign(&rsc->layout, fdl_cpp_shift(&rsc->layout) + 5); + /* 32 pixel alignment */ + fdl_set_pitchalign(&rsc->layout, fdl_cpp_shift(&rsc->layout) + 5); - for (level = 0; level <= prsc->last_level; level++) { - struct fdl_slice *slice = fd_resource_slice(rsc, level); - uint32_t pitch = fdl_pitch(&rsc->layout, level); - uint32_t nblocksy = util_format_get_nblocksy(format, height); + for (level = 0; level <= prsc->last_level; level++) { + struct fdl_slice *slice = fd_resource_slice(rsc, level); + uint32_t pitch = fdl_pitch(&rsc->layout, level); + uint32_t nblocksy = util_format_get_nblocksy(format, height); - slice->offset = size; + slice->offset = size; - /* 3d textures can have different layer sizes for high levels, but the - * hw auto-sizer is buggy (or at least different than what this code - * does), so as soon as the layer size range gets into range, we stop - * reducing it. - */ - if (prsc->target == PIPE_TEXTURE_3D && - (level > 1 && fd_resource_slice(rsc, level - 1)->size0 <= 0xf000)) - slice->size0 = fd_resource_slice(rsc, level - 1)->size0; - else - slice->size0 = align(nblocksy * pitch, alignment); + /* 3d textures can have different layer sizes for high levels, but the + * hw auto-sizer is buggy (or at least different than what this code + * does), so as soon as the layer size range gets into range, we stop + * reducing it. + */ + if (prsc->target == PIPE_TEXTURE_3D && + (level > 1 && fd_resource_slice(rsc, level - 1)->size0 <= 0xf000)) + slice->size0 = fd_resource_slice(rsc, level - 1)->size0; + else + slice->size0 = align(nblocksy * pitch, alignment); - size += slice->size0 * depth * layers_in_level; + size += slice->size0 * depth * layers_in_level; - width = u_minify(width, 1); - height = u_minify(height, 1); - depth = u_minify(depth, 1); - } + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); + } - return size; + return size; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index a72143a..a85a326 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -27,91 +27,85 @@ #include "pipe/p_screen.h" #include "util/format/u_format.h" -#include "fd4_screen.h" #include "fd4_context.h" #include "fd4_emit.h" #include "fd4_format.h" #include "fd4_resource.h" +#include "fd4_screen.h" #include "ir3/ir3_compiler.h" static bool fd4_screen_is_format_supported(struct pipe_screen *pscreen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - unsigned retval = 0; + unsigned retval = 0; - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - (sample_count > 1)) { /* TODO add MSAA */ - DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", - util_format_name(format), target, sample_count, usage); - return false; - } + if ((target >= PIPE_MAX_TEXTURE_TYPES) || + (sample_count > 1)) { /* TODO add MSAA */ + DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", + util_format_name(format), target, sample_count, usage); + return false; + } - if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) - return false; + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; - if ((usage & PIPE_BIND_VERTEX_BUFFER) && 
- (fd4_pipe2vtx(format) != VFMT4_NONE)) { - retval |= PIPE_BIND_VERTEX_BUFFER; - } + if ((usage & PIPE_BIND_VERTEX_BUFFER) && + (fd4_pipe2vtx(format) != VFMT4_NONE)) { + retval |= PIPE_BIND_VERTEX_BUFFER; + } - if ((usage & PIPE_BIND_SAMPLER_VIEW) && - (fd4_pipe2tex(format) != TFMT4_NONE) && - (target == PIPE_BUFFER || - util_format_get_blocksize(format) != 12)) { - retval |= PIPE_BIND_SAMPLER_VIEW; - } + if ((usage & PIPE_BIND_SAMPLER_VIEW) && + (fd4_pipe2tex(format) != TFMT4_NONE) && + (target == PIPE_BUFFER || util_format_get_blocksize(format) != 12)) { + retval |= PIPE_BIND_SAMPLER_VIEW; + } - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED)) && - (fd4_pipe2color(format) != RB4_NONE) && - (fd4_pipe2tex(format) != TFMT4_NONE)) { - retval |= usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED); - } + if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) && + (fd4_pipe2color(format) != RB4_NONE) && + (fd4_pipe2tex(format) != TFMT4_NONE)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED); + } - /* For ARB_framebuffer_no_attachments: */ - if ((usage & PIPE_BIND_RENDER_TARGET) && (format == PIPE_FORMAT_NONE)) { - retval |= usage & PIPE_BIND_RENDER_TARGET; - } + /* For ARB_framebuffer_no_attachments: */ + if ((usage & PIPE_BIND_RENDER_TARGET) && (format == PIPE_FORMAT_NONE)) { + retval |= usage & PIPE_BIND_RENDER_TARGET; + } - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (fd4_pipe2depth(format) != (enum a4xx_depth_format)~0) && - (fd4_pipe2tex(format) != TFMT4_NONE)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } + if ((usage & PIPE_BIND_DEPTH_STENCIL) && + (fd4_pipe2depth(format) != (enum a4xx_depth_format) ~0) && + (fd4_pipe2tex(format) != TFMT4_NONE)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } - if ((usage & PIPE_BIND_INDEX_BUFFER) && - (fd_pipe2index(format) != (enum pc_di_index_size)~0)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } + if ((usage & PIPE_BIND_INDEX_BUFFER) && + (fd_pipe2index(format) != (enum pc_di_index_size) ~0)) { + retval |= PIPE_BIND_INDEX_BUFFER; + } - if (retval != usage) { - DBG("not supported: format=%s, target=%d, sample_count=%d, " - "usage=%x, retval=%x", util_format_name(format), - target, sample_count, usage, retval); - } + if (retval != usage) { + DBG("not supported: format=%s, target=%d, sample_count=%d, " + "usage=%x, retval=%x", + util_format_name(format), target, sample_count, usage, retval); + } - return retval == usage; + return retval == usage; } void fd4_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = A4XX_MAX_RENDER_TARGETS; - screen->setup_slices = fd4_setup_slices; - pscreen->context_create = fd4_context_create; - pscreen->is_format_supported = fd4_screen_is_format_supported; - fd4_emit_init_screen(pscreen); - ir3_screen_init(pscreen); + struct fd_screen *screen = fd_screen(pscreen); + screen->max_rts = A4XX_MAX_RENDER_TARGETS; + screen->setup_slices = fd4_setup_slices; + pscreen->context_create = fd4_context_create; + pscreen->is_format_supported = fd4_screen_is_format_supported; + fd4_emit_init_screen(pscreen); + ir3_screen_init(pscreen); } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c index cbabff6..a1ff744 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c +++ 
b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c @@ -25,264 +25,257 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "fd4_texture.h" #include "fd4_format.h" +#include "fd4_texture.h" static enum a4xx_tex_clamp tex_clamp(unsigned wrap, bool *needs_border) { - switch (wrap) { - case PIPE_TEX_WRAP_REPEAT: - return A4XX_TEX_REPEAT; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return A4XX_TEX_CLAMP_TO_EDGE; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - *needs_border = true; - return A4XX_TEX_CLAMP_TO_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - /* only works for PoT.. need to emulate otherwise! */ - return A4XX_TEX_MIRROR_CLAMP; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return A4XX_TEX_MIRROR_REPEAT; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - /* these two we could perhaps emulate, but we currently - * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP - */ - default: - DBG("invalid wrap: %u", wrap); - return 0; - } + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return A4XX_TEX_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return A4XX_TEX_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + *needs_border = true; + return A4XX_TEX_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + /* only works for PoT.. need to emulate otherwise! */ + return A4XX_TEX_MIRROR_CLAMP; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return A4XX_TEX_MIRROR_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + /* these two we could perhaps emulate, but we currently + * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP + */ + default: + DBG("invalid wrap: %u", wrap); + return 0; + } } static enum a4xx_tex_filter tex_filter(unsigned filter, bool aniso) { - switch (filter) { - case PIPE_TEX_FILTER_NEAREST: - return A4XX_TEX_NEAREST; - case PIPE_TEX_FILTER_LINEAR: - return aniso ? A4XX_TEX_ANISO : A4XX_TEX_LINEAR; - default: - DBG("invalid filter: %u", filter); - return 0; - } + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: + return A4XX_TEX_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + return aniso ? 
A4XX_TEX_ANISO : A4XX_TEX_LINEAR; + default: + DBG("invalid filter: %u", filter); + return 0; + } } static void * fd4_sampler_state_create(struct pipe_context *pctx, - const struct pipe_sampler_state *cso) + const struct pipe_sampler_state *cso) { - struct fd4_sampler_stateobj *so = CALLOC_STRUCT(fd4_sampler_stateobj); - unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); - bool miplinear = false; - - if (!so) - return NULL; - - if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - miplinear = true; - - so->base = *cso; - - so->needs_border = false; - so->texsamp0 = - COND(miplinear, A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | - A4XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | - A4XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | - A4XX_TEX_SAMP_0_ANISO(aniso) | - A4XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | - A4XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | - A4XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); - - so->texsamp1 = -// COND(miplinear, A4XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) | - COND(!cso->seamless_cube_map, A4XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | - COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS); - - if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { - so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); - so->texsamp1 |= - A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | - A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); - } - - if (cso->compare_mode) - so->texsamp1 |= A4XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ - - return so; + struct fd4_sampler_stateobj *so = CALLOC_STRUCT(fd4_sampler_stateobj); + unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); + bool miplinear = false; + + if (!so) + return NULL; + + if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) + miplinear = true; + + so->base = *cso; + + so->needs_border = false; + so->texsamp0 = + COND(miplinear, A4XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | + A4XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | + A4XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | + A4XX_TEX_SAMP_0_ANISO(aniso) | + A4XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | + A4XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | + A4XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); + + so->texsamp1 = + // COND(miplinear, A4XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) | + COND(!cso->seamless_cube_map, A4XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | + COND(!cso->normalized_coords, A4XX_TEX_SAMP_1_UNNORM_COORDS); + + if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + so->texsamp0 |= A4XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); + so->texsamp1 |= A4XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | + A4XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); + } + + if (cso->compare_mode) + so->texsamp1 |= + A4XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ + + return so; } - static enum a4xx_tex_type tex_type(unsigned target) { - switch (target) { - default: - assert(0); - case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return A4XX_TEX_1D; - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_2D_ARRAY: - return A4XX_TEX_2D; - case PIPE_TEXTURE_3D: - return A4XX_TEX_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return A4XX_TEX_CUBE; - } + switch (target) { + default: + assert(0); + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return A4XX_TEX_1D; + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D: + 
case PIPE_TEXTURE_2D_ARRAY: + return A4XX_TEX_2D; + case PIPE_TEXTURE_3D: + return A4XX_TEX_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return A4XX_TEX_CUBE; + } } static bool use_astc_srgb_workaround(struct pipe_context *pctx, enum pipe_format format) { - return (fd_screen(pctx->screen)->gpu_id == 420) && - (util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_ASTC); + return (fd_screen(pctx->screen)->gpu_id == 420) && + (util_format_description(format)->layout == UTIL_FORMAT_LAYOUT_ASTC); } static struct pipe_sampler_view * fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, - const struct pipe_sampler_view *cso) + const struct pipe_sampler_view *cso) { - struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view); - struct fd_resource *rsc = fd_resource(prsc); - enum pipe_format format = cso->format; - unsigned lvl, layers = 0; - - if (!so) - return NULL; - - if (format == PIPE_FORMAT_X32_S8X24_UINT) { - rsc = rsc->stencil; - format = rsc->b.b.format; - } - - so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; - so->base.reference.count = 1; - so->base.context = pctx; - - so->texconst0 = - A4XX_TEX_CONST_0_TYPE(tex_type(cso->target)) | - A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | - fd4_tex_swiz(format, cso->swizzle_r, cso->swizzle_g, - cso->swizzle_b, cso->swizzle_a); - - if (util_format_is_srgb(format)) { - if (use_astc_srgb_workaround(pctx, format)) - so->astc_srgb = true; - so->texconst0 |= A4XX_TEX_CONST_0_SRGB; - } - - if (cso->target == PIPE_BUFFER) { - unsigned elements = cso->u.buf.size / util_format_get_blocksize(format); - - lvl = 0; - so->texconst1 = - A4XX_TEX_CONST_1_WIDTH(elements) | - A4XX_TEX_CONST_1_HEIGHT(1); - so->texconst2 = - A4XX_TEX_CONST_2_PITCH(elements * rsc->layout.cpp); - so->offset = cso->u.buf.offset; - } else { - unsigned miplevels; - - lvl = fd_sampler_first_level(cso); - miplevels = fd_sampler_last_level(cso) - lvl; - layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; - - so->texconst0 |= A4XX_TEX_CONST_0_MIPLVLS(miplevels); - so->texconst1 = - A4XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | - A4XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); - so->texconst2 = - A4XX_TEX_CONST_2_PITCHALIGN(rsc->layout.pitchalign - 5) | - A4XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); - so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); - } - - /* NOTE: since we sample z24s8 using 8888_UINT format, the swizzle - * we get isn't quite right. Use SWAP(XYZW) as a cheap and cheerful - * way to re-arrange things so stencil component is where the swiz - * expects. - * - * Note that gallium expects stencil sampler to return (s,s,s,s) - * which isn't quite true. To make that happen we'd have to massage - * the swizzle. But in practice only the .x component is used. 
- */ - if (format == PIPE_FORMAT_X24S8_UINT) - so->texconst2 |= A4XX_TEX_CONST_2_SWAP(XYZW); - - switch (cso->target) { - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - so->texconst3 = - A4XX_TEX_CONST_3_DEPTH(layers) | - A4XX_TEX_CONST_3_LAYERSZ(rsc->layout.layer_size); - break; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - so->texconst3 = - A4XX_TEX_CONST_3_DEPTH(layers / 6) | - A4XX_TEX_CONST_3_LAYERSZ(rsc->layout.layer_size); - break; - case PIPE_TEXTURE_3D: - so->texconst3 = - A4XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) | - A4XX_TEX_CONST_3_LAYERSZ(fd_resource_slice(rsc, lvl)->size0); - so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ( - fd_resource_slice(rsc, prsc->last_level)->size0); - break; - default: - so->texconst3 = 0x00000000; - break; - } - - return &so->base; + struct fd4_pipe_sampler_view *so = CALLOC_STRUCT(fd4_pipe_sampler_view); + struct fd_resource *rsc = fd_resource(prsc); + enum pipe_format format = cso->format; + unsigned lvl, layers = 0; + + if (!so) + return NULL; + + if (format == PIPE_FORMAT_X32_S8X24_UINT) { + rsc = rsc->stencil; + format = rsc->b.b.format; + } + + so->base = *cso; + pipe_reference(NULL, &prsc->reference); + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texconst0 = A4XX_TEX_CONST_0_TYPE(tex_type(cso->target)) | + A4XX_TEX_CONST_0_FMT(fd4_pipe2tex(format)) | + fd4_tex_swiz(format, cso->swizzle_r, cso->swizzle_g, + cso->swizzle_b, cso->swizzle_a); + + if (util_format_is_srgb(format)) { + if (use_astc_srgb_workaround(pctx, format)) + so->astc_srgb = true; + so->texconst0 |= A4XX_TEX_CONST_0_SRGB; + } + + if (cso->target == PIPE_BUFFER) { + unsigned elements = cso->u.buf.size / util_format_get_blocksize(format); + + lvl = 0; + so->texconst1 = + A4XX_TEX_CONST_1_WIDTH(elements) | A4XX_TEX_CONST_1_HEIGHT(1); + so->texconst2 = A4XX_TEX_CONST_2_PITCH(elements * rsc->layout.cpp); + so->offset = cso->u.buf.offset; + } else { + unsigned miplevels; + + lvl = fd_sampler_first_level(cso); + miplevels = fd_sampler_last_level(cso) - lvl; + layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; + + so->texconst0 |= A4XX_TEX_CONST_0_MIPLVLS(miplevels); + so->texconst1 = A4XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | + A4XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); + so->texconst2 = A4XX_TEX_CONST_2_PITCHALIGN(rsc->layout.pitchalign - 5) | + A4XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); + so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); + } + + /* NOTE: since we sample z24s8 using 8888_UINT format, the swizzle + * we get isn't quite right. Use SWAP(XYZW) as a cheap and cheerful + * way to re-arrange things so stencil component is where the swiz + * expects. + * + * Note that gallium expects stencil sampler to return (s,s,s,s) + * which isn't quite true. To make that happen we'd have to massage + * the swizzle. But in practice only the .x component is used. 
+ */ + if (format == PIPE_FORMAT_X24S8_UINT) + so->texconst2 |= A4XX_TEX_CONST_2_SWAP(XYZW); + + switch (cso->target) { + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + so->texconst3 = A4XX_TEX_CONST_3_DEPTH(layers) | + A4XX_TEX_CONST_3_LAYERSZ(rsc->layout.layer_size); + break; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + so->texconst3 = A4XX_TEX_CONST_3_DEPTH(layers / 6) | + A4XX_TEX_CONST_3_LAYERSZ(rsc->layout.layer_size); + break; + case PIPE_TEXTURE_3D: + so->texconst3 = + A4XX_TEX_CONST_3_DEPTH(u_minify(prsc->depth0, lvl)) | + A4XX_TEX_CONST_3_LAYERSZ(fd_resource_slice(rsc, lvl)->size0); + so->texconst4 = A4XX_TEX_CONST_4_LAYERSZ( + fd_resource_slice(rsc, prsc->last_level)->size0); + break; + default: + so->texconst3 = 0x00000000; + break; + } + + return &so->base; } static void fd4_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, - unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) + unsigned start, unsigned nr, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) { - struct fd_context *ctx = fd_context(pctx); - struct fd4_context *fd4_ctx = fd4_context(ctx); - uint16_t astc_srgb = 0; - unsigned i; - - for (i = 0; i < nr; i++) { - if (views[i]) { - struct fd4_pipe_sampler_view *view = - fd4_pipe_sampler_view(views[i]); - if (view->astc_srgb) - astc_srgb |= (1 << i); - } - } - - fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, views); - - if (shader == PIPE_SHADER_FRAGMENT) { - fd4_ctx->fastc_srgb = astc_srgb; - } else if (shader == PIPE_SHADER_VERTEX) { - fd4_ctx->vastc_srgb = astc_srgb; - } + struct fd_context *ctx = fd_context(pctx); + struct fd4_context *fd4_ctx = fd4_context(ctx); + uint16_t astc_srgb = 0; + unsigned i; + + for (i = 0; i < nr; i++) { + if (views[i]) { + struct fd4_pipe_sampler_view *view = fd4_pipe_sampler_view(views[i]); + if (view->astc_srgb) + astc_srgb |= (1 << i); + } + } + + fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, + views); + + if (shader == PIPE_SHADER_FRAGMENT) { + fd4_ctx->fastc_srgb = astc_srgb; + } else if (shader == PIPE_SHADER_VERTEX) { + fd4_ctx->vastc_srgb = astc_srgb; + } } void fd4_texture_init(struct pipe_context *pctx) { - pctx->create_sampler_state = fd4_sampler_state_create; - pctx->bind_sampler_states = fd_sampler_states_bind; - pctx->create_sampler_view = fd4_sampler_view_create; - pctx->set_sampler_views = fd4_set_sampler_views; + pctx->create_sampler_state = fd4_sampler_state_create; + pctx->bind_sampler_states = fd_sampler_states_bind; + pctx->create_sampler_view = fd4_sampler_view_create; + pctx->set_sampler_views = fd4_set_sampler_views; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h index 42b5d2d..402f07c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.h @@ -29,39 +29,39 @@ #include "pipe/p_context.h" -#include "freedreno_texture.h" #include "freedreno_resource.h" +#include "freedreno_texture.h" #include "fd4_context.h" #include "fd4_format.h" struct fd4_sampler_stateobj { - struct pipe_sampler_state base; - uint32_t texsamp0, texsamp1; - bool needs_border; + struct pipe_sampler_state base; + uint32_t texsamp0, texsamp1; + bool needs_border; }; static inline struct fd4_sampler_stateobj * fd4_sampler_stateobj(struct pipe_sampler_state *samp) { - return (struct fd4_sampler_stateobj *)samp; + return (struct 
fd4_sampler_stateobj *)samp; } struct fd4_pipe_sampler_view { - struct pipe_sampler_view base; - uint32_t texconst0, texconst1, texconst2, texconst3, texconst4; - uint32_t offset; - bool astc_srgb; + struct pipe_sampler_view base; + uint32_t texconst0, texconst1, texconst2, texconst3, texconst4; + uint32_t offset; + bool astc_srgb; }; static inline struct fd4_pipe_sampler_view * fd4_pipe_sampler_view(struct pipe_sampler_view *pview) { - return (struct fd4_pipe_sampler_view *)pview; + return (struct fd4_pipe_sampler_view *)pview; } unsigned fd4_get_const_idx(struct fd_context *ctx, - struct fd_texture_stateobj *tex, unsigned samp_id); + struct fd_texture_stateobj *tex, unsigned samp_id); void fd4_texture_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c index fd5c631..07c18d0 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c @@ -24,82 +24,77 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd4_zsa.h" #include "fd4_context.h" #include "fd4_format.h" +#include "fd4_zsa.h" void * fd4_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso) + const struct pipe_depth_stencil_alpha_state *cso) { - struct fd4_zsa_stateobj *so; + struct fd4_zsa_stateobj *so; - so = CALLOC_STRUCT(fd4_zsa_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd4_zsa_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - so->rb_depth_control |= - A4XX_RB_DEPTH_CONTROL_ZFUNC(cso->depth_func); /* maps 1:1 */ + so->rb_depth_control |= + A4XX_RB_DEPTH_CONTROL_ZFUNC(cso->depth_func); /* maps 1:1 */ - if (cso->depth_enabled) - so->rb_depth_control |= - A4XX_RB_DEPTH_CONTROL_Z_ENABLE | - A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE; + if (cso->depth_enabled) + so->rb_depth_control |= + A4XX_RB_DEPTH_CONTROL_Z_ENABLE | A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE; - if (cso->depth_writemask) - so->rb_depth_control |= A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE; + if (cso->depth_writemask) + so->rb_depth_control |= A4XX_RB_DEPTH_CONTROL_Z_WRITE_ENABLE; - if (cso->stencil[0].enabled) { - const struct pipe_stencil_state *s = &cso->stencil[0]; + if (cso->stencil[0].enabled) { + const struct pipe_stencil_state *s = &cso->stencil[0]; - so->rb_stencil_control |= - A4XX_RB_STENCIL_CONTROL_STENCIL_READ | - A4XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A4XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ - A4XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | - A4XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | - A4XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); - so->rb_stencil_control2 |= - A4XX_RB_STENCIL_CONTROL2_STENCIL_BUFFER; - so->rb_stencilrefmask |= - 0xff000000 | /* ??? */ - A4XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | - A4XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); + so->rb_stencil_control |= + A4XX_RB_STENCIL_CONTROL_STENCIL_READ | + A4XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | + A4XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ + A4XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | + A4XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | + A4XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); + so->rb_stencil_control2 |= A4XX_RB_STENCIL_CONTROL2_STENCIL_BUFFER; + so->rb_stencilrefmask |= + 0xff000000 | /* ??? 
*/ + A4XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | + A4XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); - if (cso->stencil[1].enabled) { - const struct pipe_stencil_state *bs = &cso->stencil[1]; + if (cso->stencil[1].enabled) { + const struct pipe_stencil_state *bs = &cso->stencil[1]; - so->rb_stencil_control |= - A4XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A4XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ - A4XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | - A4XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | - A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); - so->rb_stencilrefmask_bf |= - 0xff000000 | /* ??? */ - A4XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(bs->writemask) | - A4XX_RB_STENCILREFMASK_BF_STENCILMASK(bs->valuemask); - } - } + so->rb_stencil_control |= + A4XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | + A4XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ + A4XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | + A4XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | + A4XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); + so->rb_stencilrefmask_bf |= + 0xff000000 | /* ??? */ + A4XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(bs->writemask) | + A4XX_RB_STENCILREFMASK_BF_STENCILMASK(bs->valuemask); + } + } - if (cso->alpha_enabled) { - uint32_t ref = cso->alpha_ref_value * 255.0; - so->gras_alpha_control = - A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE; - so->rb_alpha_control = - A4XX_RB_ALPHA_CONTROL_ALPHA_TEST | - A4XX_RB_ALPHA_CONTROL_ALPHA_REF(ref) | - A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); - so->rb_depth_control |= - A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; - } + if (cso->alpha_enabled) { + uint32_t ref = cso->alpha_ref_value * 255.0; + so->gras_alpha_control = A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE; + so->rb_alpha_control = + A4XX_RB_ALPHA_CONTROL_ALPHA_TEST | + A4XX_RB_ALPHA_CONTROL_ALPHA_REF(ref) | + A4XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); + so->rb_depth_control |= A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; + } - return so; + return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h index a9939be..07d90f2 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h @@ -27,30 +27,29 @@ #ifndef FD4_ZSA_H_ #define FD4_ZSA_H_ - -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd4_zsa_stateobj { - struct pipe_depth_stencil_alpha_state base; - uint32_t gras_alpha_control; - uint32_t rb_alpha_control; - uint32_t rb_depth_control; - uint32_t rb_stencil_control; - uint32_t rb_stencil_control2; - uint32_t rb_stencilrefmask; - uint32_t rb_stencilrefmask_bf; + struct pipe_depth_stencil_alpha_state base; + uint32_t gras_alpha_control; + uint32_t rb_alpha_control; + uint32_t rb_depth_control; + uint32_t rb_stencil_control; + uint32_t rb_stencil_control2; + uint32_t rb_stencilrefmask; + uint32_t rb_stencilrefmask_bf; }; static inline struct fd4_zsa_stateobj * fd4_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { - return (struct fd4_zsa_stateobj *)zsa; + return (struct fd4_zsa_stateobj *)zsa; } -void * fd4_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso); +void *fd4_zsa_state_create(struct pipe_context *pctx, + const struct pipe_depth_stencil_alpha_state *cso); #endif /* FD4_ZSA_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blend.c 
b/src/gallium/drivers/freedreno/a5xx/fd5_blend.c index c98d7c5..582013f 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_blend.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_blend.c @@ -26,8 +26,8 @@ #include "pipe/p_state.h" #include "util/u_blend.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" #include "fd5_blend.h" #include "fd5_context.h" @@ -37,90 +37,99 @@ static enum a3xx_rb_blend_opcode blend_func(unsigned func) { - switch (func) { - case PIPE_BLEND_ADD: - return BLEND_DST_PLUS_SRC; - case PIPE_BLEND_MIN: - return BLEND_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return BLEND_MAX_DST_SRC; - case PIPE_BLEND_SUBTRACT: - return BLEND_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return BLEND_DST_MINUS_SRC; - default: - DBG("invalid blend func: %x", func); - return 0; - } + switch (func) { + case PIPE_BLEND_ADD: + return BLEND_DST_PLUS_SRC; + case PIPE_BLEND_MIN: + return BLEND_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return BLEND_MAX_DST_SRC; + case PIPE_BLEND_SUBTRACT: + return BLEND_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return BLEND_DST_MINUS_SRC; + default: + DBG("invalid blend func: %x", func); + return 0; + } } void * fd5_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso) + const struct pipe_blend_state *cso) { - struct fd5_blend_stateobj *so; - enum a3xx_rop_code rop = ROP_COPY; - bool reads_dest = false; - unsigned i, mrt_blend = 0; - - if (cso->logicop_enable) { - rop = cso->logicop_func; /* maps 1:1 */ - reads_dest = util_logicop_reads_dest(cso->logicop_func); - } - - so = CALLOC_STRUCT(fd5_blend_stateobj); - if (!so) - return NULL; - - so->base = *cso; - - so->lrz_write = true; /* unless blend enabled for any MRT */ - - for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { - const struct pipe_rt_blend_state *rt; - - if (cso->independent_blend_enable) - rt = &cso->rt[i]; - else - rt = &cso->rt[0]; - - so->rb_mrt[i].blend_control = - A5XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(fd_blend_factor(rt->rgb_src_factor)) | - A5XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | - A5XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(fd_blend_factor(rt->rgb_dst_factor)) | - A5XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(fd_blend_factor(rt->alpha_src_factor)) | - A5XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(blend_func(rt->alpha_func)) | - A5XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(fd_blend_factor(rt->alpha_dst_factor)); - - so->rb_mrt[i].control = - A5XX_RB_MRT_CONTROL_ROP_CODE(rop) | - COND(cso->logicop_enable, A5XX_RB_MRT_CONTROL_ROP_ENABLE) | - A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); - - if (rt->blend_enable) { - so->rb_mrt[i].control |= -// A5XX_RB_MRT_CONTROL_READ_DEST_ENABLE | - A5XX_RB_MRT_CONTROL_BLEND | - A5XX_RB_MRT_CONTROL_BLEND2; - mrt_blend |= (1 << i); - so->lrz_write = false; - } - - if (reads_dest) { -// so->rb_mrt[i].control |= A5XX_RB_MRT_CONTROL_READ_DEST_ENABLE; - mrt_blend |= (1 << i); - } - -// if (cso->dither) -// so->rb_mrt[i].buf_info |= A5XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS); - } - - so->rb_blend_cntl = A5XX_RB_BLEND_CNTL_ENABLE_BLEND(mrt_blend) | - COND(cso->alpha_to_coverage, A5XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE) | - COND(cso->independent_blend_enable, A5XX_RB_BLEND_CNTL_INDEPENDENT_BLEND); - so->sp_blend_cntl = A5XX_SP_BLEND_CNTL_UNK8 | - COND(cso->alpha_to_coverage, A5XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE) | - COND(mrt_blend, A5XX_SP_BLEND_CNTL_ENABLED); - - return so; + struct fd5_blend_stateobj *so; + enum a3xx_rop_code rop = ROP_COPY; + bool reads_dest = 
false; + unsigned i, mrt_blend = 0; + + if (cso->logicop_enable) { + rop = cso->logicop_func; /* maps 1:1 */ + reads_dest = util_logicop_reads_dest(cso->logicop_func); + } + + so = CALLOC_STRUCT(fd5_blend_stateobj); + if (!so) + return NULL; + + so->base = *cso; + + so->lrz_write = true; /* unless blend enabled for any MRT */ + + for (i = 0; i < ARRAY_SIZE(so->rb_mrt); i++) { + const struct pipe_rt_blend_state *rt; + + if (cso->independent_blend_enable) + rt = &cso->rt[i]; + else + rt = &cso->rt[0]; + + so->rb_mrt[i].blend_control = + A5XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR( + fd_blend_factor(rt->rgb_src_factor)) | + A5XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(blend_func(rt->rgb_func)) | + A5XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR( + fd_blend_factor(rt->rgb_dst_factor)) | + A5XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR( + fd_blend_factor(rt->alpha_src_factor)) | + A5XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE( + blend_func(rt->alpha_func)) | + A5XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR( + fd_blend_factor(rt->alpha_dst_factor)); + + so->rb_mrt[i].control = + A5XX_RB_MRT_CONTROL_ROP_CODE(rop) | + COND(cso->logicop_enable, A5XX_RB_MRT_CONTROL_ROP_ENABLE) | + A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE(rt->colormask); + + if (rt->blend_enable) { + so->rb_mrt[i].control |= + // A5XX_RB_MRT_CONTROL_READ_DEST_ENABLE + //| + A5XX_RB_MRT_CONTROL_BLEND | A5XX_RB_MRT_CONTROL_BLEND2; + mrt_blend |= (1 << i); + so->lrz_write = false; + } + + if (reads_dest) { + // so->rb_mrt[i].control |= + //A5XX_RB_MRT_CONTROL_READ_DEST_ENABLE; + mrt_blend |= (1 << i); + } + + // if (cso->dither) + // so->rb_mrt[i].buf_info |= + //A5XX_RB_MRT_BUF_INFO_DITHER_MODE(DITHER_ALWAYS); + } + + so->rb_blend_cntl = + A5XX_RB_BLEND_CNTL_ENABLE_BLEND(mrt_blend) | + COND(cso->alpha_to_coverage, A5XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE) | + COND(cso->independent_blend_enable, A5XX_RB_BLEND_CNTL_INDEPENDENT_BLEND); + so->sp_blend_cntl = + A5XX_SP_BLEND_CNTL_UNK8 | + COND(cso->alpha_to_coverage, A5XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE) | + COND(mrt_blend, A5XX_SP_BLEND_CNTL_ENABLED); + + return so; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blend.h b/src/gallium/drivers/freedreno/a5xx/fd5_blend.h index 10cbbaa..7f1802f 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_blend.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_blend.h @@ -27,31 +27,31 @@ #ifndef FD5_BLEND_H_ #define FD5_BLEND_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd5_blend_stateobj { - struct pipe_blend_state base; - - struct { - uint32_t control; - uint32_t buf_info; - uint32_t blend_control; - } rb_mrt[A5XX_MAX_RENDER_TARGETS]; - uint32_t rb_blend_cntl; - uint32_t sp_blend_cntl; - bool lrz_write; + struct pipe_blend_state base; + + struct { + uint32_t control; + uint32_t buf_info; + uint32_t blend_control; + } rb_mrt[A5XX_MAX_RENDER_TARGETS]; + uint32_t rb_blend_cntl; + uint32_t sp_blend_cntl; + bool lrz_write; }; static inline struct fd5_blend_stateobj * fd5_blend_stateobj(struct pipe_blend_state *blend) { - return (struct fd5_blend_stateobj *)blend; + return (struct fd5_blend_stateobj *)blend; } -void * fd5_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso); +void *fd5_blend_state_create(struct pipe_context *pctx, + const struct pipe_blend_state *cso); #endif /* FD5_BLEND_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c b/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c index 4af11c7..85ab446 100644 --- 
a/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_blitter.c @@ -28,8 +28,8 @@ #include "freedreno_resource.h" #include "fd5_blitter.h" -#include "fd5_format.h" #include "fd5_emit.h" +#include "fd5_format.h" /* Make sure none of the requested dimensions extend beyond the size of the * resource. Not entirely sure why this happens, but sometimes it does, and @@ -39,9 +39,9 @@ static bool ok_dims(const struct pipe_resource *r, const struct pipe_box *b, int lvl) { - return (b->x >= 0) && (b->x + b->width <= u_minify(r->width0, lvl)) && - (b->y >= 0) && (b->y + b->height <= u_minify(r->height0, lvl)) && - (b->z >= 0) && (b->z + b->depth <= u_minify(r->depth0, lvl)); + return (b->x >= 0) && (b->x + b->width <= u_minify(r->width0, lvl)) && + (b->y >= 0) && (b->y + b->height <= u_minify(r->height0, lvl)) && + (b->z >= 0) && (b->z + b->depth <= u_minify(r->depth0, lvl)); } /* Not sure if format restrictions differ for src and dst, or if @@ -52,136 +52,136 @@ ok_dims(const struct pipe_resource *r, const struct pipe_box *b, int lvl) static bool ok_format(enum pipe_format fmt) { - if (util_format_is_compressed(fmt)) - return false; - - switch (fmt) { - case PIPE_FORMAT_R10G10B10A2_SSCALED: - case PIPE_FORMAT_R10G10B10A2_SNORM: - case PIPE_FORMAT_B10G10R10A2_USCALED: - case PIPE_FORMAT_B10G10R10A2_SSCALED: - case PIPE_FORMAT_B10G10R10A2_SNORM: - case PIPE_FORMAT_R10G10B10A2_UNORM: - case PIPE_FORMAT_R10G10B10A2_USCALED: - case PIPE_FORMAT_B10G10R10A2_UNORM: - case PIPE_FORMAT_R10SG10SB10SA2U_NORM: - case PIPE_FORMAT_B10G10R10A2_UINT: - case PIPE_FORMAT_R10G10B10A2_UINT: - return false; - default: - break; - } - - if (fd5_pipe2color(fmt) == RB5_NONE) - return false; - - return true; + if (util_format_is_compressed(fmt)) + return false; + + switch (fmt) { + case PIPE_FORMAT_R10G10B10A2_SSCALED: + case PIPE_FORMAT_R10G10B10A2_SNORM: + case PIPE_FORMAT_B10G10R10A2_USCALED: + case PIPE_FORMAT_B10G10R10A2_SSCALED: + case PIPE_FORMAT_B10G10R10A2_SNORM: + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_R10G10B10A2_USCALED: + case PIPE_FORMAT_B10G10R10A2_UNORM: + case PIPE_FORMAT_R10SG10SB10SA2U_NORM: + case PIPE_FORMAT_B10G10R10A2_UINT: + case PIPE_FORMAT_R10G10B10A2_UINT: + return false; + default: + break; + } + + if (fd5_pipe2color(fmt) == RB5_NONE) + return false; + + return true; } static bool can_do_blit(const struct pipe_blit_info *info) { - /* I think we can do scaling, but not in z dimension since that would - * require blending.. - */ - if (info->dst.box.depth != info->src.box.depth) - return false; + /* I think we can do scaling, but not in z dimension since that would + * require blending.. + */ + if (info->dst.box.depth != info->src.box.depth) + return false; - if (!ok_format(info->dst.format)) - return false; + if (!ok_format(info->dst.format)) + return false; - if (!ok_format(info->src.format)) - return false; + if (!ok_format(info->src.format)) + return false; - /* hw ignores {SRC,DST}_INFO.COLOR_SWAP if {SRC,DST}_INFO.TILE_MODE - * is set (not linear). We can kind of get around that when tiling/ - * untiling by setting both src and dst COLOR_SWAP=WZYX, but that - * means the formats must match: - */ - if ((fd_resource(info->dst.resource)->layout.tile_mode || - fd_resource(info->src.resource)->layout.tile_mode) && - info->dst.format != info->src.format) - return false; + /* hw ignores {SRC,DST}_INFO.COLOR_SWAP if {SRC,DST}_INFO.TILE_MODE + * is set (not linear). 
We can kind of get around that when tiling/ + * untiling by setting both src and dst COLOR_SWAP=WZYX, but that + * means the formats must match: + */ + if ((fd_resource(info->dst.resource)->layout.tile_mode || + fd_resource(info->src.resource)->layout.tile_mode) && + info->dst.format != info->src.format) + return false; - /* until we figure out a few more registers: */ - if ((info->dst.box.width != info->src.box.width) || - (info->dst.box.height != info->src.box.height)) - return false; + /* until we figure out a few more registers: */ + if ((info->dst.box.width != info->src.box.width) || + (info->dst.box.height != info->src.box.height)) + return false; - /* src box can be inverted, which we don't support.. dst box cannot: */ - if ((info->src.box.width < 0) || (info->src.box.height < 0)) - return false; + /* src box can be inverted, which we don't support.. dst box cannot: */ + if ((info->src.box.width < 0) || (info->src.box.height < 0)) + return false; - if (!ok_dims(info->src.resource, &info->src.box, info->src.level)) - return false; + if (!ok_dims(info->src.resource, &info->src.box, info->src.level)) + return false; - if (!ok_dims(info->dst.resource, &info->dst.box, info->dst.level)) - return false; + if (!ok_dims(info->dst.resource, &info->dst.box, info->dst.level)) + return false; - debug_assert(info->dst.box.width >= 0); - debug_assert(info->dst.box.height >= 0); - debug_assert(info->dst.box.depth >= 0); + debug_assert(info->dst.box.width >= 0); + debug_assert(info->dst.box.height >= 0); + debug_assert(info->dst.box.depth >= 0); - if ((info->dst.resource->nr_samples > 1) || - (info->src.resource->nr_samples > 1)) - return false; + if ((info->dst.resource->nr_samples > 1) || + (info->src.resource->nr_samples > 1)) + return false; - if (info->scissor_enable) - return false; + if (info->scissor_enable) + return false; - if (info->window_rectangle_include) - return false; + if (info->window_rectangle_include) + return false; - if (info->render_condition_enable) - return false; + if (info->render_condition_enable) + return false; - if (info->alpha_blend) - return false; + if (info->alpha_blend) + return false; - if (info->filter != PIPE_TEX_FILTER_NEAREST) - return false; + if (info->filter != PIPE_TEX_FILTER_NEAREST) + return false; - if (info->mask != util_format_get_mask(info->src.format)) - return false; + if (info->mask != util_format_get_mask(info->src.format)) + return false; - if (info->mask != util_format_get_mask(info->dst.format)) - return false; + if (info->mask != util_format_get_mask(info->dst.format)) + return false; - return true; + return true; } static void emit_setup(struct fd_ringbuffer *ring) { - OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1); - OUT_RING(ring, 0x00000008); + OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1); + OUT_RING(ring, 0x00000008); - OUT_PKT4(ring, REG_A5XX_UNKNOWN_2100, 1); - OUT_RING(ring, 0x86000000); /* UNKNOWN_2100 */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_2100, 1); + OUT_RING(ring, 0x86000000); /* UNKNOWN_2100 */ - OUT_PKT4(ring, REG_A5XX_UNKNOWN_2180, 1); - OUT_RING(ring, 0x86000000); /* UNKNOWN_2180 */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_2180, 1); + OUT_RING(ring, 0x86000000); /* UNKNOWN_2180 */ - OUT_PKT4(ring, REG_A5XX_UNKNOWN_2184, 1); - OUT_RING(ring, 0x00000009); /* UNKNOWN_2184 */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_2184, 1); + OUT_RING(ring, 0x00000009); /* UNKNOWN_2184 */ - OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); - OUT_RING(ring, A5XX_RB_CNTL_BYPASS); + OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); + OUT_RING(ring, A5XX_RB_CNTL_BYPASS); - OUT_PKT4(ring, 
REG_A5XX_RB_MODE_CNTL, 1); - OUT_RING(ring, 0x00000004); /* RB_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_RB_MODE_CNTL, 1); + OUT_RING(ring, 0x00000004); /* RB_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_SP_MODE_CNTL, 1); - OUT_RING(ring, 0x0000000c); /* SP_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_SP_MODE_CNTL, 1); + OUT_RING(ring, 0x0000000c); /* SP_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_TPL1_MODE_CNTL, 1); - OUT_RING(ring, 0x00000344); /* TPL1_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_TPL1_MODE_CNTL, 1); + OUT_RING(ring, 0x00000344); /* TPL1_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_HLSQ_MODE_CNTL, 1); - OUT_RING(ring, 0x00000002); /* HLSQ_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_HLSQ_MODE_CNTL, 1); + OUT_RING(ring, 0x00000002); /* HLSQ_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); - OUT_RING(ring, 0x00000181); /* GRAS_CL_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); + OUT_RING(ring, 0x00000181); /* GRAS_CL_CNTL */ } /* buffers need to be handled specially since x/width can exceed the bounds @@ -190,297 +190,297 @@ emit_setup(struct fd_ringbuffer *ring) static void emit_blit_buffer(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) { - const struct pipe_box *sbox = &info->src.box; - const struct pipe_box *dbox = &info->dst.box; - struct fd_resource *src, *dst; - unsigned sshift, dshift; - - src = fd_resource(info->src.resource); - dst = fd_resource(info->dst.resource); - - debug_assert(src->layout.cpp == 1); - debug_assert(dst->layout.cpp == 1); - debug_assert(info->src.resource->format == info->dst.resource->format); - debug_assert((sbox->y == 0) && (sbox->height == 1)); - debug_assert((dbox->y == 0) && (dbox->height == 1)); - debug_assert((sbox->z == 0) && (sbox->depth == 1)); - debug_assert((dbox->z == 0) && (dbox->depth == 1)); - debug_assert(sbox->width == dbox->width); - debug_assert(info->src.level == 0); - debug_assert(info->dst.level == 0); - - /* - * Buffers can have dimensions bigger than max width, remap into - * multiple 1d blits to fit within max dimension - * - * Note that blob uses .ARRAY_PITCH=128 for blitting buffers, which - * seems to prevent overfetch related faults. Not quite sure what - * the deal is there. - * - * Low 6 bits of SRC/DST addresses need to be zero (ie. address - * aligned to 64) so we need to shift src/dst x1/x2 to make up the - * difference. On top of already splitting up the blit so width - * isn't > 16k. - * - * We perhaps could do a bit better, if src and dst are aligned but - * in the worst case this means we have to split the copy up into - * 16k (0x4000) minus 64 (0x40). 
- */ - - sshift = sbox->x & 0x3f; - dshift = dbox->x & 0x3f; - - for (unsigned off = 0; off < sbox->width; off += (0x4000 - 0x40)) { - unsigned soff, doff, w, p; - - soff = (sbox->x + off) & ~0x3f; - doff = (dbox->x + off) & ~0x3f; - - w = MIN2(sbox->width - off, (0x4000 - 0x40)); - p = align(w, 64); - - debug_assert((soff + w) <= fd_bo_size(src->bo)); - debug_assert((doff + w) <= fd_bo_size(dst->bo)); - - OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); - OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(BLIT2D)); - - /* - * Emit source: - */ - OUT_PKT4(ring, REG_A5XX_RB_2D_SRC_INFO, 9); - OUT_RING(ring, A5XX_RB_2D_SRC_INFO_COLOR_FORMAT(RB5_R8_UNORM) | - A5XX_RB_2D_SRC_INFO_TILE_MODE(TILE5_LINEAR) | - A5XX_RB_2D_SRC_INFO_COLOR_SWAP(WZYX)); - OUT_RELOC(ring, src->bo, soff, 0, 0); /* RB_2D_SRC_LO/HI */ - OUT_RING(ring, A5XX_RB_2D_SRC_SIZE_PITCH(p) | - A5XX_RB_2D_SRC_SIZE_ARRAY_PITCH(128)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_GRAS_2D_SRC_INFO, 1); - OUT_RING(ring, A5XX_GRAS_2D_SRC_INFO_COLOR_FORMAT(RB5_R8_UNORM) | - A5XX_GRAS_2D_SRC_INFO_COLOR_SWAP(WZYX)); - - /* - * Emit destination: - */ - OUT_PKT4(ring, REG_A5XX_RB_2D_DST_INFO, 9); - OUT_RING(ring, A5XX_RB_2D_DST_INFO_COLOR_FORMAT(RB5_R8_UNORM) | - A5XX_RB_2D_DST_INFO_TILE_MODE(TILE5_LINEAR) | - A5XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ - OUT_RING(ring, A5XX_RB_2D_DST_SIZE_PITCH(p) | - A5XX_RB_2D_DST_SIZE_ARRAY_PITCH(128)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_GRAS_2D_DST_INFO, 1); - OUT_RING(ring, A5XX_GRAS_2D_DST_INFO_COLOR_FORMAT(RB5_R8_UNORM) | - A5XX_GRAS_2D_DST_INFO_COLOR_SWAP(WZYX)); - - /* - * Blit command: - */ - OUT_PKT7(ring, CP_BLIT, 5); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_COPY)); - OUT_RING(ring, CP_BLIT_1_SRC_X1(sshift) | CP_BLIT_1_SRC_Y1(0)); - OUT_RING(ring, CP_BLIT_2_SRC_X2(sshift+w-1) | CP_BLIT_2_SRC_Y2(0)); - OUT_RING(ring, CP_BLIT_3_DST_X1(dshift) | CP_BLIT_3_DST_Y1(0)); - OUT_RING(ring, CP_BLIT_4_DST_X2(dshift+w-1) | CP_BLIT_4_DST_Y2(0)); - - OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); - OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(END2D)); - - OUT_WFI5(ring); - } + const struct pipe_box *sbox = &info->src.box; + const struct pipe_box *dbox = &info->dst.box; + struct fd_resource *src, *dst; + unsigned sshift, dshift; + + src = fd_resource(info->src.resource); + dst = fd_resource(info->dst.resource); + + debug_assert(src->layout.cpp == 1); + debug_assert(dst->layout.cpp == 1); + debug_assert(info->src.resource->format == info->dst.resource->format); + debug_assert((sbox->y == 0) && (sbox->height == 1)); + debug_assert((dbox->y == 0) && (dbox->height == 1)); + debug_assert((sbox->z == 0) && (sbox->depth == 1)); + debug_assert((dbox->z == 0) && (dbox->depth == 1)); + debug_assert(sbox->width == dbox->width); + debug_assert(info->src.level == 0); + debug_assert(info->dst.level == 0); + + /* + * Buffers can have dimensions bigger than max width, remap into + * multiple 1d blits to fit within max dimension + * + * Note that blob uses .ARRAY_PITCH=128 for blitting buffers, which + * seems to prevent overfetch related faults. Not quite sure what + * the deal is there. + * + * Low 6 bits of SRC/DST addresses need to be zero (ie. 
address + * aligned to 64) so we need to shift src/dst x1/x2 to make up the + * difference. On top of already splitting up the blit so width + * isn't > 16k. + * + * We perhaps could do a bit better, if src and dst are aligned but + * in the worst case this means we have to split the copy up into + * 16k (0x4000) minus 64 (0x40). + */ + + sshift = sbox->x & 0x3f; + dshift = dbox->x & 0x3f; + + for (unsigned off = 0; off < sbox->width; off += (0x4000 - 0x40)) { + unsigned soff, doff, w, p; + + soff = (sbox->x + off) & ~0x3f; + doff = (dbox->x + off) & ~0x3f; + + w = MIN2(sbox->width - off, (0x4000 - 0x40)); + p = align(w, 64); + + debug_assert((soff + w) <= fd_bo_size(src->bo)); + debug_assert((doff + w) <= fd_bo_size(dst->bo)); + + OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); + OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(BLIT2D)); + + /* + * Emit source: + */ + OUT_PKT4(ring, REG_A5XX_RB_2D_SRC_INFO, 9); + OUT_RING(ring, A5XX_RB_2D_SRC_INFO_COLOR_FORMAT(RB5_R8_UNORM) | + A5XX_RB_2D_SRC_INFO_TILE_MODE(TILE5_LINEAR) | + A5XX_RB_2D_SRC_INFO_COLOR_SWAP(WZYX)); + OUT_RELOC(ring, src->bo, soff, 0, 0); /* RB_2D_SRC_LO/HI */ + OUT_RING(ring, A5XX_RB_2D_SRC_SIZE_PITCH(p) | + A5XX_RB_2D_SRC_SIZE_ARRAY_PITCH(128)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_GRAS_2D_SRC_INFO, 1); + OUT_RING(ring, A5XX_GRAS_2D_SRC_INFO_COLOR_FORMAT(RB5_R8_UNORM) | + A5XX_GRAS_2D_SRC_INFO_COLOR_SWAP(WZYX)); + + /* + * Emit destination: + */ + OUT_PKT4(ring, REG_A5XX_RB_2D_DST_INFO, 9); + OUT_RING(ring, A5XX_RB_2D_DST_INFO_COLOR_FORMAT(RB5_R8_UNORM) | + A5XX_RB_2D_DST_INFO_TILE_MODE(TILE5_LINEAR) | + A5XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); + OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RING(ring, A5XX_RB_2D_DST_SIZE_PITCH(p) | + A5XX_RB_2D_DST_SIZE_ARRAY_PITCH(128)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_GRAS_2D_DST_INFO, 1); + OUT_RING(ring, A5XX_GRAS_2D_DST_INFO_COLOR_FORMAT(RB5_R8_UNORM) | + A5XX_GRAS_2D_DST_INFO_COLOR_SWAP(WZYX)); + + /* + * Blit command: + */ + OUT_PKT7(ring, CP_BLIT, 5); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_COPY)); + OUT_RING(ring, CP_BLIT_1_SRC_X1(sshift) | CP_BLIT_1_SRC_Y1(0)); + OUT_RING(ring, CP_BLIT_2_SRC_X2(sshift + w - 1) | CP_BLIT_2_SRC_Y2(0)); + OUT_RING(ring, CP_BLIT_3_DST_X1(dshift) | CP_BLIT_3_DST_Y1(0)); + OUT_RING(ring, CP_BLIT_4_DST_X2(dshift + w - 1) | CP_BLIT_4_DST_Y2(0)); + + OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); + OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(END2D)); + + OUT_WFI5(ring); + } } static void emit_blit(struct fd_ringbuffer *ring, const struct pipe_blit_info *info) { - const struct pipe_box *sbox = &info->src.box; - const struct pipe_box *dbox = &info->dst.box; - struct fd_resource *src, *dst; - struct fdl_slice *sslice, *dslice; - enum a5xx_color_fmt sfmt, dfmt; - enum a5xx_tile_mode stile, dtile; - enum a3xx_color_swap sswap, dswap; - unsigned ssize, dsize, spitch, dpitch; - unsigned sx1, sy1, sx2, sy2; - unsigned dx1, dy1, dx2, dy2; - - src = fd_resource(info->src.resource); - dst = fd_resource(info->dst.resource); - - sslice = fd_resource_slice(src, info->src.level); - dslice = fd_resource_slice(dst, info->dst.level); - - sfmt = fd5_pipe2color(info->src.format); - dfmt = fd5_pipe2color(info->dst.format); - - stile = fd_resource_tile_mode(info->src.resource, info->src.level); - dtile = 
fd_resource_tile_mode(info->dst.resource, info->dst.level); - - sswap = fd5_pipe2swap(info->src.format); - dswap = fd5_pipe2swap(info->dst.format); - - spitch = fd_resource_pitch(src, info->src.level); - dpitch = fd_resource_pitch(dst, info->dst.level); - - /* if dtile, then dswap ignored by hw, and likewise if stile then sswap - * ignored by hw.. but in this case we have already rejected the blit - * if src and dst formats differ, so juse use WZYX for both src and - * dst swap mode (so we don't change component order) - */ - if (stile || dtile) { - debug_assert(info->src.format == info->dst.format); - sswap = dswap = WZYX; - } - - sx1 = sbox->x; - sy1 = sbox->y; - sx2 = sbox->x + sbox->width - 1; - sy2 = sbox->y + sbox->height - 1; - - dx1 = dbox->x; - dy1 = dbox->y; - dx2 = dbox->x + dbox->width - 1; - dy2 = dbox->y + dbox->height - 1; - - if (info->src.resource->target == PIPE_TEXTURE_3D) - ssize = sslice->size0; - else - ssize = src->layout.layer_size; - - if (info->dst.resource->target == PIPE_TEXTURE_3D) - dsize = dslice->size0; - else - dsize = dst->layout.layer_size; - - for (unsigned i = 0; i < info->dst.box.depth; i++) { - unsigned soff = fd_resource_offset(src, info->src.level, sbox->z + i); - unsigned doff = fd_resource_offset(dst, info->dst.level, dbox->z + i); - - debug_assert((soff + (sbox->height * spitch)) <= fd_bo_size(src->bo)); - debug_assert((doff + (dbox->height * dpitch)) <= fd_bo_size(dst->bo)); - - OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); - OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(BLIT2D)); - - /* - * Emit source: - */ - OUT_PKT4(ring, REG_A5XX_RB_2D_SRC_INFO, 9); - OUT_RING(ring, A5XX_RB_2D_SRC_INFO_COLOR_FORMAT(sfmt) | - A5XX_RB_2D_SRC_INFO_TILE_MODE(stile) | - A5XX_RB_2D_SRC_INFO_COLOR_SWAP(sswap)); - OUT_RELOC(ring, src->bo, soff, 0, 0); /* RB_2D_SRC_LO/HI */ - OUT_RING(ring, A5XX_RB_2D_SRC_SIZE_PITCH(spitch) | - A5XX_RB_2D_SRC_SIZE_ARRAY_PITCH(ssize)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_GRAS_2D_SRC_INFO, 1); - OUT_RING(ring, A5XX_GRAS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | - A5XX_GRAS_2D_SRC_INFO_TILE_MODE(stile) | - A5XX_GRAS_2D_SRC_INFO_COLOR_SWAP(sswap)); - - /* - * Emit destination: - */ - OUT_PKT4(ring, REG_A5XX_RB_2D_DST_INFO, 9); - OUT_RING(ring, A5XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) | - A5XX_RB_2D_DST_INFO_TILE_MODE(dtile) | - A5XX_RB_2D_DST_INFO_COLOR_SWAP(dswap)); - OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ - OUT_RING(ring, A5XX_RB_2D_DST_SIZE_PITCH(dpitch) | - A5XX_RB_2D_DST_SIZE_ARRAY_PITCH(dsize)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_GRAS_2D_DST_INFO, 1); - OUT_RING(ring, A5XX_GRAS_2D_DST_INFO_COLOR_FORMAT(dfmt) | - A5XX_GRAS_2D_DST_INFO_TILE_MODE(dtile) | - A5XX_GRAS_2D_DST_INFO_COLOR_SWAP(dswap)); - - /* - * Blit command: - */ - OUT_PKT7(ring, CP_BLIT, 5); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_COPY)); - OUT_RING(ring, CP_BLIT_1_SRC_X1(sx1) | CP_BLIT_1_SRC_Y1(sy1)); - OUT_RING(ring, CP_BLIT_2_SRC_X2(sx2) | CP_BLIT_2_SRC_Y2(sy2)); - OUT_RING(ring, CP_BLIT_3_DST_X1(dx1) | CP_BLIT_3_DST_Y1(dy1)); - OUT_RING(ring, CP_BLIT_4_DST_X2(dx2) | CP_BLIT_4_DST_Y2(dy2)); - - OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); - OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(END2D)); - } + const struct pipe_box *sbox = &info->src.box; + const struct pipe_box *dbox = &info->dst.box; + 
struct fd_resource *src, *dst; + struct fdl_slice *sslice, *dslice; + enum a5xx_color_fmt sfmt, dfmt; + enum a5xx_tile_mode stile, dtile; + enum a3xx_color_swap sswap, dswap; + unsigned ssize, dsize, spitch, dpitch; + unsigned sx1, sy1, sx2, sy2; + unsigned dx1, dy1, dx2, dy2; + + src = fd_resource(info->src.resource); + dst = fd_resource(info->dst.resource); + + sslice = fd_resource_slice(src, info->src.level); + dslice = fd_resource_slice(dst, info->dst.level); + + sfmt = fd5_pipe2color(info->src.format); + dfmt = fd5_pipe2color(info->dst.format); + + stile = fd_resource_tile_mode(info->src.resource, info->src.level); + dtile = fd_resource_tile_mode(info->dst.resource, info->dst.level); + + sswap = fd5_pipe2swap(info->src.format); + dswap = fd5_pipe2swap(info->dst.format); + + spitch = fd_resource_pitch(src, info->src.level); + dpitch = fd_resource_pitch(dst, info->dst.level); + + /* if dtile, then dswap ignored by hw, and likewise if stile then sswap + * ignored by hw.. but in this case we have already rejected the blit + * if src and dst formats differ, so juse use WZYX for both src and + * dst swap mode (so we don't change component order) + */ + if (stile || dtile) { + debug_assert(info->src.format == info->dst.format); + sswap = dswap = WZYX; + } + + sx1 = sbox->x; + sy1 = sbox->y; + sx2 = sbox->x + sbox->width - 1; + sy2 = sbox->y + sbox->height - 1; + + dx1 = dbox->x; + dy1 = dbox->y; + dx2 = dbox->x + dbox->width - 1; + dy2 = dbox->y + dbox->height - 1; + + if (info->src.resource->target == PIPE_TEXTURE_3D) + ssize = sslice->size0; + else + ssize = src->layout.layer_size; + + if (info->dst.resource->target == PIPE_TEXTURE_3D) + dsize = dslice->size0; + else + dsize = dst->layout.layer_size; + + for (unsigned i = 0; i < info->dst.box.depth; i++) { + unsigned soff = fd_resource_offset(src, info->src.level, sbox->z + i); + unsigned doff = fd_resource_offset(dst, info->dst.level, dbox->z + i); + + debug_assert((soff + (sbox->height * spitch)) <= fd_bo_size(src->bo)); + debug_assert((doff + (dbox->height * dpitch)) <= fd_bo_size(dst->bo)); + + OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); + OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(BLIT2D)); + + /* + * Emit source: + */ + OUT_PKT4(ring, REG_A5XX_RB_2D_SRC_INFO, 9); + OUT_RING(ring, A5XX_RB_2D_SRC_INFO_COLOR_FORMAT(sfmt) | + A5XX_RB_2D_SRC_INFO_TILE_MODE(stile) | + A5XX_RB_2D_SRC_INFO_COLOR_SWAP(sswap)); + OUT_RELOC(ring, src->bo, soff, 0, 0); /* RB_2D_SRC_LO/HI */ + OUT_RING(ring, A5XX_RB_2D_SRC_SIZE_PITCH(spitch) | + A5XX_RB_2D_SRC_SIZE_ARRAY_PITCH(ssize)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_GRAS_2D_SRC_INFO, 1); + OUT_RING(ring, A5XX_GRAS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | + A5XX_GRAS_2D_SRC_INFO_TILE_MODE(stile) | + A5XX_GRAS_2D_SRC_INFO_COLOR_SWAP(sswap)); + + /* + * Emit destination: + */ + OUT_PKT4(ring, REG_A5XX_RB_2D_DST_INFO, 9); + OUT_RING(ring, A5XX_RB_2D_DST_INFO_COLOR_FORMAT(dfmt) | + A5XX_RB_2D_DST_INFO_TILE_MODE(dtile) | + A5XX_RB_2D_DST_INFO_COLOR_SWAP(dswap)); + OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RING(ring, A5XX_RB_2D_DST_SIZE_PITCH(dpitch) | + A5XX_RB_2D_DST_SIZE_ARRAY_PITCH(dsize)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_GRAS_2D_DST_INFO, 1); + OUT_RING(ring, A5XX_GRAS_2D_DST_INFO_COLOR_FORMAT(dfmt) | + 
A5XX_GRAS_2D_DST_INFO_TILE_MODE(dtile) | + A5XX_GRAS_2D_DST_INFO_COLOR_SWAP(dswap)); + + /* + * Blit command: + */ + OUT_PKT7(ring, CP_BLIT, 5); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_COPY)); + OUT_RING(ring, CP_BLIT_1_SRC_X1(sx1) | CP_BLIT_1_SRC_Y1(sy1)); + OUT_RING(ring, CP_BLIT_2_SRC_X2(sx2) | CP_BLIT_2_SRC_Y2(sy2)); + OUT_RING(ring, CP_BLIT_3_DST_X1(dx1) | CP_BLIT_3_DST_Y1(dy1)); + OUT_RING(ring, CP_BLIT_4_DST_X2(dx2) | CP_BLIT_4_DST_Y2(dy2)); + + OUT_PKT7(ring, CP_SET_RENDER_MODE, 1); + OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(END2D)); + } } bool -fd5_blitter_blit(struct fd_context *ctx, const struct pipe_blit_info *info) - assert_dt +fd5_blitter_blit(struct fd_context *ctx, + const struct pipe_blit_info *info) assert_dt { - struct fd_batch *batch; + struct fd_batch *batch; - if (!can_do_blit(info)) { - return false; - } + if (!can_do_blit(info)) { + return false; + } - batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); + batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); - fd_batch_update_queries(batch); + fd_batch_update_queries(batch); - emit_setup(batch->draw); + emit_setup(batch->draw); - if ((info->src.resource->target == PIPE_BUFFER) && - (info->dst.resource->target == PIPE_BUFFER)) { - assert(fd_resource(info->src.resource)->layout.tile_mode == TILE5_LINEAR); - assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE5_LINEAR); - emit_blit_buffer(batch->draw, info); - } else { - /* I don't *think* we need to handle blits between buffer <-> !buffer */ - debug_assert(info->src.resource->target != PIPE_BUFFER); - debug_assert(info->dst.resource->target != PIPE_BUFFER); - emit_blit(batch->draw, info); - } + if ((info->src.resource->target == PIPE_BUFFER) && + (info->dst.resource->target == PIPE_BUFFER)) { + assert(fd_resource(info->src.resource)->layout.tile_mode == TILE5_LINEAR); + assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE5_LINEAR); + emit_blit_buffer(batch->draw, info); + } else { + /* I don't *think* we need to handle blits between buffer <-> !buffer */ + debug_assert(info->src.resource->target != PIPE_BUFFER); + debug_assert(info->dst.resource->target != PIPE_BUFFER); + emit_blit(batch->draw, info); + } - fd_resource(info->dst.resource)->valid = true; - batch->needs_flush = true; + fd_resource(info->dst.resource)->valid = true; + batch->needs_flush = true; - fd_batch_flush(batch); - fd_batch_reference(&batch, NULL); + fd_batch_flush(batch); + fd_batch_reference(&batch, NULL); - /* Acc query state will have been dirtied by our fd_batch_update_queries, so - * the ctx->batch may need to turn its queries back on. - */ - ctx->update_active_queries = true; + /* Acc query state will have been dirtied by our fd_batch_update_queries, so + * the ctx->batch may need to turn its queries back on. 
+ */ + ctx->update_active_queries = true; - return true; + return true; } unsigned fd5_tile_mode(const struct pipe_resource *tmpl) { - /* basically just has to be a format we can blit, so uploads/downloads - * via linear staging buffer works: - */ - if (ok_format(tmpl->format)) - return TILE5_3; + /* basically just has to be a format we can blit, so uploads/downloads + * via linear staging buffer works: + */ + if (ok_format(tmpl->format)) + return TILE5_3; - return TILE5_LINEAR; + return TILE5_LINEAR; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blitter.h b/src/gallium/drivers/freedreno/a5xx/fd5_blitter.h index 69a071c..ade2c32 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_blitter.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_blitter.h @@ -31,7 +31,8 @@ #include "freedreno_context.h" -bool fd5_blitter_blit(struct fd_context *ctx, const struct pipe_blit_info *info); +bool fd5_blitter_blit(struct fd_context *ctx, + const struct pipe_blit_info *info); unsigned fd5_tile_mode(const struct pipe_resource *tmpl); #endif /* FD5_BLIT_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c index f8b75c9..668da2e 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c @@ -32,160 +32,167 @@ #include "fd5_context.h" #include "fd5_emit.h" - /* maybe move to fd5_program? */ static void cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v) { - const struct ir3_info *i = &v->info; - enum a3xx_threadsize thrsz = i->double_threadsize ? FOUR_QUADS : TWO_QUADS; - unsigned instrlen = v->instrlen; - - /* if shader is more than 32*16 instructions, don't preload it. Similar - * to the combined restriction of 64*16 for VS+FS - */ - if (instrlen > 32) - instrlen = 0; - - OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1); - OUT_RING(ring, 0x00000000); /* SP_SP_CNTL */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 1); - OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS) | - A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(thrsz) | - 0x00000880 /* XXX */); - - OUT_PKT4(ring, REG_A5XX_SP_CS_CTRL_REG0, 1); - OUT_RING(ring, A5XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) | - A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) | - A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) | - A5XX_SP_CS_CTRL_REG0_BRANCHSTACK(0x3) | // XXX need to figure this out somehow.. 
- 0x6 /* XXX */); - - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1); - OUT_RING(ring, A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET(0) | - A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET(0) | - A5XX_HLSQ_CS_CONFIG_ENABLED); - - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL, 1); - OUT_RING(ring, A5XX_HLSQ_CS_CNTL_INSTRLEN(instrlen) | - COND(v->has_ssbo, A5XX_HLSQ_CS_CNTL_SSBO_ENABLE)); - - OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1); - OUT_RING(ring, A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET(0) | - A5XX_SP_CS_CONFIG_SHADEROBJOFFSET(0) | - A5XX_SP_CS_CONFIG_ENABLED); - - assert(v->constlen % 4 == 0); - unsigned constlen = v->constlen / 4; - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 2); - OUT_RING(ring, constlen); /* HLSQ_CS_CONSTLEN */ - OUT_RING(ring, instrlen); /* HLSQ_CS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_SP_CS_OBJ_START_LO, 2); - OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); - OUT_RING(ring, 0x1f00000); - - uint32_t local_invocation_id, work_group_id; - local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); - work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID); - - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL_0, 2); - OUT_RING(ring, A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) | - A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) | - A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - OUT_RING(ring, 0x1); /* HLSQ_CS_CNTL_1 */ - - if (instrlen > 0) - fd5_emit_shader(ring, v); + const struct ir3_info *i = &v->info; + enum a3xx_threadsize thrsz = i->double_threadsize ? FOUR_QUADS : TWO_QUADS; + unsigned instrlen = v->instrlen; + + /* if shader is more than 32*16 instructions, don't preload it. Similar + * to the combined restriction of 64*16 for VS+FS + */ + if (instrlen > 32) + instrlen = 0; + + OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1); + OUT_RING(ring, 0x00000000); /* SP_SP_CNTL */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 1); + OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(TWO_QUADS) | + A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(thrsz) | + 0x00000880 /* XXX */); + + OUT_PKT4(ring, REG_A5XX_SP_CS_CTRL_REG0, 1); + OUT_RING(ring, + A5XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) | + A5XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) | + A5XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) | + A5XX_SP_CS_CTRL_REG0_BRANCHSTACK( + 0x3) | // XXX need to figure this out somehow.. 
+ 0x6 /* XXX */); + + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1); + OUT_RING(ring, A5XX_HLSQ_CS_CONFIG_CONSTOBJECTOFFSET(0) | + A5XX_HLSQ_CS_CONFIG_SHADEROBJOFFSET(0) | + A5XX_HLSQ_CS_CONFIG_ENABLED); + + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL, 1); + OUT_RING(ring, A5XX_HLSQ_CS_CNTL_INSTRLEN(instrlen) | + COND(v->has_ssbo, A5XX_HLSQ_CS_CNTL_SSBO_ENABLE)); + + OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1); + OUT_RING(ring, A5XX_SP_CS_CONFIG_CONSTOBJECTOFFSET(0) | + A5XX_SP_CS_CONFIG_SHADEROBJOFFSET(0) | + A5XX_SP_CS_CONFIG_ENABLED); + + assert(v->constlen % 4 == 0); + unsigned constlen = v->constlen / 4; + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 2); + OUT_RING(ring, constlen); /* HLSQ_CS_CONSTLEN */ + OUT_RING(ring, instrlen); /* HLSQ_CS_INSTRLEN */ + + OUT_PKT4(ring, REG_A5XX_SP_CS_OBJ_START_LO, 2); + OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); + OUT_RING(ring, 0x1f00000); + + uint32_t local_invocation_id, work_group_id; + local_invocation_id = + ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); + work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID); + + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL_0, 2); + OUT_RING(ring, A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A5XX_HLSQ_CS_CNTL_0_UNK0(regid(63, 0)) | + A5XX_HLSQ_CS_CNTL_0_UNK1(regid(63, 0)) | + A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + OUT_RING(ring, 0x1); /* HLSQ_CS_CNTL_1 */ + + if (instrlen > 0) + fd5_emit_shader(ring, v); } static void -fd5_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) - assert_dt +fd5_launch_grid(struct fd_context *ctx, + const struct pipe_grid_info *info) assert_dt { - struct ir3_shader_key key = {}; - struct ir3_shader_variant *v; - struct fd_ringbuffer *ring = ctx->batch->draw; - unsigned nglobal = 0; - - v = ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug); - if (!v) - return; - - if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG) - cs_program_emit(ring, v); - - fd5_emit_cs_state(ctx, ring, v); - fd5_emit_cs_consts(v, ring, ctx, info); - - u_foreach_bit(i, ctx->global_bindings.enabled_mask) - nglobal++; - - if (nglobal > 0) { - /* global resources don't otherwise get an OUT_RELOC(), since - * the raw ptr address is emitted ir ir3_emit_cs_consts(). - * So to make the kernel aware that these buffers are referenced - * by the batch, emit dummy reloc's as part of a no-op packet - * payload: - */ - OUT_PKT7(ring, CP_NOP, 2 * nglobal); - u_foreach_bit(i, ctx->global_bindings.enabled_mask) { - struct pipe_resource *prsc = ctx->global_bindings.buf[i]; - OUT_RELOC(ring, fd_resource(prsc)->bo, 0, 0, 0); - } - } - - const unsigned *local_size = info->block; // v->shader->nir->info->cs.local_size; - const unsigned *num_groups = info->grid; - /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ - const unsigned work_dim = info->work_dim ? 
info->work_dim : 3; - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_NDRANGE_0, 7); - OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) | - A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) | - A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | - A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1)); - OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0])); - OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */ - OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1])); - OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */ - OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2])); - OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_KERNEL_GROUP_X, 3); - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */ - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ - - if (info->indirect) { - struct fd_resource *rsc = fd_resource(info->indirect); - - fd5_emit_flush(ctx, ring); - - OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */ - OUT_RING(ring, A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); - } else { - OUT_PKT7(ring, CP_EXEC_CS, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); - OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); - OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); - } + struct ir3_shader_key key = {}; + struct ir3_shader_variant *v; + struct fd_ringbuffer *ring = ctx->batch->draw; + unsigned nglobal = 0; + + v = + ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug); + if (!v) + return; + + if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG) + cs_program_emit(ring, v); + + fd5_emit_cs_state(ctx, ring, v); + fd5_emit_cs_consts(v, ring, ctx, info); + + u_foreach_bit (i, ctx->global_bindings.enabled_mask) + nglobal++; + + if (nglobal > 0) { + /* global resources don't otherwise get an OUT_RELOC(), since + * the raw ptr address is emitted ir ir3_emit_cs_consts(). + * So to make the kernel aware that these buffers are referenced + * by the batch, emit dummy reloc's as part of a no-op packet + * payload: + */ + OUT_PKT7(ring, CP_NOP, 2 * nglobal); + u_foreach_bit (i, ctx->global_bindings.enabled_mask) { + struct pipe_resource *prsc = ctx->global_bindings.buf[i]; + OUT_RELOC(ring, fd_resource(prsc)->bo, 0, 0, 0); + } + } + + const unsigned *local_size = + info->block; // v->shader->nir->info->cs.local_size; + const unsigned *num_groups = info->grid; + /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ + const unsigned work_dim = info->work_dim ? 
info->work_dim : 3; + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_NDRANGE_0, 7); + OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) | + A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) | + A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | + A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1)); + OUT_RING(ring, + A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0])); + OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */ + OUT_RING(ring, + A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1])); + OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */ + OUT_RING(ring, + A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2])); + OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_KERNEL_GROUP_X, 3); + OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */ + OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ + OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ + + if (info->indirect) { + struct fd_resource *rsc = fd_resource(info->indirect); + + fd5_emit_flush(ctx, ring); + + OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */ + OUT_RING(ring, + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); + } else { + OUT_PKT7(ring, CP_EXEC_CS, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); + OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); + OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); + } } void -fd5_compute_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd5_compute_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->launch_grid = fd5_launch_grid; - pctx->create_compute_state = ir3_shader_compute_state_create; - pctx->delete_compute_state = ir3_shader_state_delete; + struct fd_context *ctx = fd_context(pctx); + ctx->launch_grid = fd5_launch_grid; + pctx->create_compute_state = ir3_shader_compute_state_create; + pctx->delete_compute_state = ir3_shader_state_delete; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.c b/src/gallium/drivers/freedreno/a5xx/fd5_context.c index bd92c5f..763efc8 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_context.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.c @@ -26,10 +26,10 @@ #include "freedreno_query_acc.h" -#include "fd5_context.h" #include "fd5_blend.h" #include "fd5_blitter.h" #include "fd5_compute.h" +#include "fd5_context.h" #include "fd5_draw.h" #include "fd5_emit.h" #include "fd5_gmem.h" @@ -40,22 +40,21 @@ #include "fd5_zsa.h" static void -fd5_context_destroy(struct pipe_context *pctx) - in_dt +fd5_context_destroy(struct pipe_context *pctx) in_dt { - struct fd5_context *fd5_ctx = fd5_context(fd_context(pctx)); + struct fd5_context *fd5_ctx = fd5_context(fd_context(pctx)); - u_upload_destroy(fd5_ctx->border_color_uploader); - pipe_resource_reference(&fd5_ctx->border_color_buf, NULL); + u_upload_destroy(fd5_ctx->border_color_uploader); + pipe_resource_reference(&fd5_ctx->border_color_buf, NULL); - fd_context_destroy(pctx); + fd_context_destroy(pctx); - fd_bo_del(fd5_ctx->vsc_size_mem); - fd_bo_del(fd5_ctx->blit_mem); + fd_bo_del(fd5_ctx->vsc_size_mem); + fd_bo_del(fd5_ctx->blit_mem); - fd_context_cleanup_common_vbos(&fd5_ctx->base); + fd_context_cleanup_common_vbos(&fd5_ctx->base); - free(fd5_ctx); + 
free(fd5_ctx); } /* clang-format off */ @@ -72,56 +71,56 @@ static const uint8_t primtypes[] = { /* clang-format on */ struct pipe_context * -fd5_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) - disable_thread_safety_analysis +fd5_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags) disable_thread_safety_analysis { - struct fd_screen *screen = fd_screen(pscreen); - struct fd5_context *fd5_ctx = CALLOC_STRUCT(fd5_context); - struct pipe_context *pctx; + struct fd_screen *screen = fd_screen(pscreen); + struct fd5_context *fd5_ctx = CALLOC_STRUCT(fd5_context); + struct pipe_context *pctx; - if (!fd5_ctx) - return NULL; + if (!fd5_ctx) + return NULL; - pctx = &fd5_ctx->base.base; - pctx->screen = pscreen; + pctx = &fd5_ctx->base.base; + pctx->screen = pscreen; - fd5_ctx->base.dev = fd_device_ref(screen->dev); - fd5_ctx->base.screen = fd_screen(pscreen); - fd5_ctx->base.last.key = &fd5_ctx->last_key; + fd5_ctx->base.dev = fd_device_ref(screen->dev); + fd5_ctx->base.screen = fd_screen(pscreen); + fd5_ctx->base.last.key = &fd5_ctx->last_key; - pctx->destroy = fd5_context_destroy; - pctx->create_blend_state = fd5_blend_state_create; - pctx->create_rasterizer_state = fd5_rasterizer_state_create; - pctx->create_depth_stencil_alpha_state = fd5_zsa_state_create; + pctx->destroy = fd5_context_destroy; + pctx->create_blend_state = fd5_blend_state_create; + pctx->create_rasterizer_state = fd5_rasterizer_state_create; + pctx->create_depth_stencil_alpha_state = fd5_zsa_state_create; - fd5_draw_init(pctx); - fd5_compute_init(pctx); - fd5_gmem_init(pctx); - fd5_texture_init(pctx); - fd5_prog_init(pctx); - fd5_emit_init(pctx); + fd5_draw_init(pctx); + fd5_compute_init(pctx); + fd5_gmem_init(pctx); + fd5_texture_init(pctx); + fd5_prog_init(pctx); + fd5_emit_init(pctx); - if (!FD_DBG(NOBLIT)) - fd5_ctx->base.blit = fd5_blitter_blit; + if (!FD_DBG(NOBLIT)) + fd5_ctx->base.blit = fd5_blitter_blit; - pctx = fd_context_init(&fd5_ctx->base, pscreen, primtypes, priv, flags); - if (!pctx) - return NULL; + pctx = fd_context_init(&fd5_ctx->base, pscreen, primtypes, priv, flags); + if (!pctx) + return NULL; - util_blitter_set_texture_multisample(fd5_ctx->base.blitter, true); + util_blitter_set_texture_multisample(fd5_ctx->base.blitter, true); - fd5_ctx->vsc_size_mem = fd_bo_new(screen->dev, 0x1000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_size"); + fd5_ctx->vsc_size_mem = + fd_bo_new(screen->dev, 0x1000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_size"); - fd5_ctx->blit_mem = fd_bo_new(screen->dev, 0x1000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "blit"); + fd5_ctx->blit_mem = + fd_bo_new(screen->dev, 0x1000, DRM_FREEDRENO_GEM_TYPE_KMEM, "blit"); - fd_context_setup_common_vbos(&fd5_ctx->base); + fd_context_setup_common_vbos(&fd5_ctx->base); - fd5_query_context_init(pctx); + fd5_query_context_init(pctx); - fd5_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0, - PIPE_USAGE_STREAM, 0); + fd5_ctx->border_color_uploader = + u_upload_create(pctx, 4096, 0, PIPE_USAGE_STREAM, 0); - return pctx; + return pctx; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.h b/src/gallium/drivers/freedreno/a5xx/fd5_context.h index 3da77a6..776380e 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_context.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.h @@ -34,55 +34,55 @@ #include "ir3/ir3_shader.h" struct fd5_context { - struct fd_context base; + struct fd_context base; - /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We - * could combine it with another allocation. 
- */ - struct fd_bo *vsc_size_mem; + /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We + * could combine it with another allocation. + */ + struct fd_bo *vsc_size_mem; - /* TODO not sure what this is for.. probably similar to - * CACHE_FLUSH_TS on kernel side, where value gets written - * to this address synchronized w/ 3d (ie. a way to - * synchronize when the CP is running far ahead) - */ - struct fd_bo *blit_mem; + /* TODO not sure what this is for.. probably similar to + * CACHE_FLUSH_TS on kernel side, where value gets written + * to this address synchronized w/ 3d (ie. a way to + * synchronize when the CP is running far ahead) + */ + struct fd_bo *blit_mem; - struct u_upload_mgr *border_color_uploader; - struct pipe_resource *border_color_buf; + struct u_upload_mgr *border_color_uploader; + struct pipe_resource *border_color_buf; - /* bitmask of samplers which need astc srgb workaround: */ - uint16_t vastc_srgb, fastc_srgb; + /* bitmask of samplers which need astc srgb workaround: */ + uint16_t vastc_srgb, fastc_srgb; - /* storage for ctx->last.key: */ - struct ir3_shader_key last_key; + /* storage for ctx->last.key: */ + struct ir3_shader_key last_key; - /* number of active samples-passed queries: */ - int samples_passed_queries; + /* number of active samples-passed queries: */ + int samples_passed_queries; - /* cached state about current emitted shader program (3d): */ - unsigned max_loc; + /* cached state about current emitted shader program (3d): */ + unsigned max_loc; }; static inline struct fd5_context * fd5_context(struct fd_context *ctx) { - return (struct fd5_context *)ctx; + return (struct fd5_context *)ctx; } -struct pipe_context * -fd5_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); +struct pipe_context *fd5_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags); /* helper for places where we need to stall CP to wait for previous draws: */ static inline void fd5_emit_flush(struct fd_context *ctx, struct fd_ringbuffer *ring) { - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CACHE_FLUSH_TS); - OUT_RELOC(ring, fd5_context(ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */ - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, CACHE_FLUSH_TS); + OUT_RELOC(ring, fd5_context(ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */ + OUT_RING(ring, 0x00000000); - OUT_WFI5(ring); + OUT_WFI5(ring); } #endif /* FD5_CONTEXT_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c index 038519d..b9cbfaa 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c @@ -25,343 +25,341 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" #include "util/u_prim.h" +#include "util/u_string.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd5_draw.h" #include "fd5_context.h" +#include "fd5_draw.h" #include "fd5_emit.h" -#include "fd5_program.h" #include "fd5_format.h" +#include "fd5_program.h" #include "fd5_zsa.h" - static void draw_impl(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd5_emit *emit, unsigned index_offset) - assert_dt + struct fd5_emit *emit, unsigned index_offset) assert_dt { - const struct pipe_draw_info *info = emit->info; - enum pc_di_primtype primtype = ctx->primtypes[info->mode]; + const struct pipe_draw_info *info = emit->info; + enum pc_di_primtype primtype = ctx->primtypes[info->mode]; - 
fd5_emit_state(ctx, ring, emit); + fd5_emit_state(ctx, ring, emit); - if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) - fd5_emit_vertex_bufs(ring, emit); + if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) + fd5_emit_vertex_bufs(ring, emit); - OUT_PKT4(ring, REG_A5XX_VFD_INDEX_OFFSET, 2); - OUT_RING(ring, info->index_size ? info->index_bias : emit->draw->start); /* VFD_INDEX_OFFSET */ - OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ + OUT_PKT4(ring, REG_A5XX_VFD_INDEX_OFFSET, 2); + OUT_RING(ring, info->index_size ? info->index_bias + : emit->draw->start); /* VFD_INDEX_OFFSET */ + OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ - OUT_PKT4(ring, REG_A5XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ - info->restart_index : 0xffffffff); + OUT_PKT4(ring, REG_A5XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, info->primitive_restart ? /* PC_RESTART_INDEX */ + info->restart_index + : 0xffffffff); - fd5_emit_render_cntl(ctx, false, emit->binning_pass); - fd5_draw_emit(ctx->batch, ring, primtype, - emit->binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY, - info, emit->indirect, emit->draw, index_offset); + fd5_emit_render_cntl(ctx, false, emit->binning_pass); + fd5_draw_emit(ctx->batch, ring, primtype, + emit->binning_pass ? IGNORE_VISIBILITY : USE_VISIBILITY, info, + emit->indirect, emit->draw, index_offset); } static bool fd5_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count *draw, - unsigned index_offset) - in_dt + unsigned index_offset) in_dt { - struct fd5_context *fd5_ctx = fd5_context(ctx); - struct fd5_emit emit = { - .debug = &ctx->debug, - .vtx = &ctx->vtx, - .info = info, - .indirect = indirect, - .draw = draw, - .key = { - .vs = ctx->prog.vs, - .fs = ctx->prog.fs, - .key = { - .rasterflat = ctx->rasterizer->flatshade, - .has_per_samp = fd5_ctx->fastc_srgb || fd5_ctx->vastc_srgb, - .vastc_srgb = fd5_ctx->vastc_srgb, - .fastc_srgb = fd5_ctx->fastc_srgb, - }, - }, - .rasterflat = ctx->rasterizer->flatshade, - .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, - .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, - }; - - /* Technically a5xx should not require this, but it avoids a crash in - * piglit 'spec@!opengl 1.1@ppgtt_memory_alignment' due to a draw with - * no VBO bound but a VS that expects an input. The draw is a single - * vertex with PIPE_PRIM_TRIANGLES so the u_trim_pipe_prim() causes it - * to be skipped. 
- */ - if (info->mode != PIPE_PRIM_MAX && - !indirect && - !info->primitive_restart && - !u_trim_pipe_prim(info->mode, (unsigned*)&draw->count)) - return false; - - ir3_fixup_shader_state(&ctx->base, &emit.key.key); - - unsigned dirty = ctx->dirty; - - emit.prog = fd5_program_state(ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); - - /* bail if compile failed: */ - if (!emit.prog) - return false; - - const struct ir3_shader_variant *vp = fd5_emit_get_vp(&emit); - const struct ir3_shader_variant *fp = fd5_emit_get_fp(&emit); - - ir3_update_max_tf_vtx(ctx, vp); - - /* do regular pass first: */ - - if (unlikely(ctx->stats_users > 0)) { - ctx->stats.vs_regs += ir3_shader_halfregs(vp); - ctx->stats.fs_regs += ir3_shader_halfregs(fp); - } - - /* figure out whether we need to disable LRZ write for binning - * pass using draw pass's fp: - */ - emit.no_lrz_write = fp->writes_pos || fp->no_earlyz || fp->has_kill; - - emit.binning_pass = false; - emit.dirty = dirty; - - draw_impl(ctx, ctx->batch->draw, &emit, index_offset); - - /* and now binning pass: */ - emit.binning_pass = true; - emit.dirty = dirty & ~(FD_DIRTY_BLEND); - emit.vs = NULL; /* we changed key so need to refetch vp */ - emit.fs = NULL; - draw_impl(ctx, ctx->batch->binning, &emit, index_offset); - - if (emit.streamout_mask) { - struct fd_ringbuffer *ring = ctx->batch->draw; - - for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { - if (emit.streamout_mask & (1 << i)) { - fd5_event_write(ctx->batch, ring, FLUSH_SO_0 + i, false); - } - } - } - - fd_context_all_clean(ctx); - - return true; + struct fd5_context *fd5_ctx = fd5_context(ctx); + struct fd5_emit emit = { + .debug = &ctx->debug, + .vtx = &ctx->vtx, + .info = info, + .indirect = indirect, + .draw = draw, + .key = + { + .vs = ctx->prog.vs, + .fs = ctx->prog.fs, + .key = + { + .rasterflat = ctx->rasterizer->flatshade, + .has_per_samp = fd5_ctx->fastc_srgb || fd5_ctx->vastc_srgb, + .vastc_srgb = fd5_ctx->vastc_srgb, + .fastc_srgb = fd5_ctx->fastc_srgb, + }, + }, + .rasterflat = ctx->rasterizer->flatshade, + .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, + .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, + }; + + /* Technically a5xx should not require this, but it avoids a crash in + * piglit 'spec@!opengl 1.1@ppgtt_memory_alignment' due to a draw with + * no VBO bound but a VS that expects an input. The draw is a single + * vertex with PIPE_PRIM_TRIANGLES so the u_trim_pipe_prim() causes it + * to be skipped. 
+ */ + if (info->mode != PIPE_PRIM_MAX && !indirect && !info->primitive_restart && + !u_trim_pipe_prim(info->mode, (unsigned *)&draw->count)) + return false; + + ir3_fixup_shader_state(&ctx->base, &emit.key.key); + + unsigned dirty = ctx->dirty; + + emit.prog = fd5_program_state( + ir3_cache_lookup(ctx->shader_cache, &emit.key, &ctx->debug)); + + /* bail if compile failed: */ + if (!emit.prog) + return false; + + const struct ir3_shader_variant *vp = fd5_emit_get_vp(&emit); + const struct ir3_shader_variant *fp = fd5_emit_get_fp(&emit); + + ir3_update_max_tf_vtx(ctx, vp); + + /* do regular pass first: */ + + if (unlikely(ctx->stats_users > 0)) { + ctx->stats.vs_regs += ir3_shader_halfregs(vp); + ctx->stats.fs_regs += ir3_shader_halfregs(fp); + } + + /* figure out whether we need to disable LRZ write for binning + * pass using draw pass's fp: + */ + emit.no_lrz_write = fp->writes_pos || fp->no_earlyz || fp->has_kill; + + emit.binning_pass = false; + emit.dirty = dirty; + + draw_impl(ctx, ctx->batch->draw, &emit, index_offset); + + /* and now binning pass: */ + emit.binning_pass = true; + emit.dirty = dirty & ~(FD_DIRTY_BLEND); + emit.vs = NULL; /* we changed key so need to refetch vp */ + emit.fs = NULL; + draw_impl(ctx, ctx->batch->binning, &emit, index_offset); + + if (emit.streamout_mask) { + struct fd_ringbuffer *ring = ctx->batch->draw; + + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + if (emit.streamout_mask & (1 << i)) { + fd5_event_write(ctx->batch, ring, FLUSH_SO_0 + i, false); + } + } + } + + fd_context_all_clean(ctx); + + return true; } -static bool is_z32(enum pipe_format format) +static bool +is_z32(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - case PIPE_FORMAT_Z32_UNORM: - case PIPE_FORMAT_Z32_FLOAT: - return true; - default: - return false; - } + switch (format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + case PIPE_FORMAT_Z32_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + return true; + default: + return false; + } } static void fd5_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) { - struct fd_ringbuffer *ring; - uint32_t clear = util_pack_z(PIPE_FORMAT_Z16_UNORM, depth); + struct fd_ringbuffer *ring; + uint32_t clear = util_pack_z(PIPE_FORMAT_Z16_UNORM, depth); - ring = fd_batch_get_prologue(batch); + ring = fd_batch_get_prologue(batch); - OUT_WFI5(ring); + OUT_WFI5(ring); - OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x10000000); + OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); + OUT_RING(ring, 0x10000000); - OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); - OUT_RING(ring, 0x20fffff); + OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); + OUT_RING(ring, 0x20fffff); - OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1); - OUT_RING(ring, A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(0.0) | - COND(zsbuf->b.b.nr_samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE)); + OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1); + OUT_RING(ring, + A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(0.0) | + COND(zsbuf->b.b.nr_samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE)); - OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); - OUT_RING(ring, 0x00000181); + OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); + OUT_RING(ring, 0x00000181); - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5); - OUT_RING(ring, 
A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(RB5_R16_UNORM) | - A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(TILE5_LINEAR) | - A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX)); - OUT_RING(ring, A5XX_RB_MRT_PITCH(zsbuf->lrz_pitch * 2)); - OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(fd_bo_size(zsbuf->lrz))); - OUT_RELOC(ring, zsbuf->lrz, 0x1000, 0, 0); + OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5); + OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(RB5_R16_UNORM) | + A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(TILE5_LINEAR) | + A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX)); + OUT_RING(ring, A5XX_RB_MRT_PITCH(zsbuf->lrz_pitch * 2)); + OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(fd_bo_size(zsbuf->lrz))); + OUT_RELOC(ring, zsbuf->lrz, 0x1000, 0, 0); - OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_RB_DEST_MSAA_CNTL, 1); - OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE)); + OUT_PKT4(ring, REG_A5XX_RB_DEST_MSAA_CNTL, 1); + OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE)); - OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); - OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0)); + OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); + OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0)); - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); - OUT_RING(ring, A5XX_RB_CLEAR_CNTL_FAST_CLEAR | - A5XX_RB_CLEAR_CNTL_MASK(0xf)); + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); + OUT_RING(ring, A5XX_RB_CLEAR_CNTL_FAST_CLEAR | A5XX_RB_CLEAR_CNTL_MASK(0xf)); - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 1); - OUT_RING(ring, clear); /* RB_CLEAR_COLOR_DW0 */ + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 1); + OUT_RING(ring, clear); /* RB_CLEAR_COLOR_DW0 */ - OUT_PKT4(ring, REG_A5XX_VSC_RESOLVE_CNTL, 2); - OUT_RING(ring, A5XX_VSC_RESOLVE_CNTL_X(zsbuf->lrz_width) | - A5XX_VSC_RESOLVE_CNTL_Y(zsbuf->lrz_height)); - OUT_RING(ring, 0x00000000); // XXX UNKNOWN_0CDE + OUT_PKT4(ring, REG_A5XX_VSC_RESOLVE_CNTL, 2); + OUT_RING(ring, A5XX_VSC_RESOLVE_CNTL_X(zsbuf->lrz_width) | + A5XX_VSC_RESOLVE_CNTL_Y(zsbuf->lrz_height)); + OUT_RING(ring, 0x00000000); // XXX UNKNOWN_0CDE - OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); - OUT_RING(ring, A5XX_RB_CNTL_BYPASS); + OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); + OUT_RING(ring, A5XX_RB_CNTL_BYPASS); - OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(0) | - A5XX_RB_RESOLVE_CNTL_1_Y(0)); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(zsbuf->lrz_width - 1) | - A5XX_RB_RESOLVE_CNTL_2_Y(zsbuf->lrz_height - 1)); + OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(0) | A5XX_RB_RESOLVE_CNTL_1_Y(0)); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(zsbuf->lrz_width - 1) | + A5XX_RB_RESOLVE_CNTL_2_Y(zsbuf->lrz_height - 1)); - fd5_emit_blit(batch, ring); + fd5_emit_blit(batch, ring); } static bool fd5_clear(struct fd_context *ctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) - assert_dt + const union pipe_color_union *color, double depth, + unsigned stencil) assert_dt { - struct fd_ringbuffer *ring = ctx->batch->draw; - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - - if ((buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) && - is_z32(pfb->zsbuf->format)) - return false; - - fd5_emit_render_cntl(ctx, true, false); - - if (buffers & PIPE_CLEAR_COLOR) { - for (int i = 0; i < pfb->nr_cbufs; i++) { - union util_color uc = {0}; - - if (!pfb->cbufs[i]) - continue; - - if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) - continue; - - enum 
pipe_format pfmt = pfb->cbufs[i]->format; - - // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP?? - union pipe_color_union swapped; - switch (fd5_pipe2swap(pfmt)) { - case WZYX: - swapped.ui[0] = color->ui[0]; - swapped.ui[1] = color->ui[1]; - swapped.ui[2] = color->ui[2]; - swapped.ui[3] = color->ui[3]; - break; - case WXYZ: - swapped.ui[2] = color->ui[0]; - swapped.ui[1] = color->ui[1]; - swapped.ui[0] = color->ui[2]; - swapped.ui[3] = color->ui[3]; - break; - case ZYXW: - swapped.ui[3] = color->ui[0]; - swapped.ui[0] = color->ui[1]; - swapped.ui[1] = color->ui[2]; - swapped.ui[2] = color->ui[3]; - break; - case XYZW: - swapped.ui[3] = color->ui[0]; - swapped.ui[2] = color->ui[1]; - swapped.ui[1] = color->ui[2]; - swapped.ui[0] = color->ui[3]; - break; - } - - util_pack_color_union(pfmt, &uc, &swapped); - - OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); - OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0 + i)); - - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); - OUT_RING(ring, A5XX_RB_CLEAR_CNTL_FAST_CLEAR | - A5XX_RB_CLEAR_CNTL_MASK(0xf)); - - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 4); - OUT_RING(ring, uc.ui[0]); /* RB_CLEAR_COLOR_DW0 */ - OUT_RING(ring, uc.ui[1]); /* RB_CLEAR_COLOR_DW1 */ - OUT_RING(ring, uc.ui[2]); /* RB_CLEAR_COLOR_DW2 */ - OUT_RING(ring, uc.ui[3]); /* RB_CLEAR_COLOR_DW3 */ - - fd5_emit_blit(ctx->batch, ring); - } - } - - if (pfb->zsbuf && (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { - uint32_t clear = - util_pack_z_stencil(pfb->zsbuf->format, depth, stencil); - uint32_t mask = 0; - - if (buffers & PIPE_CLEAR_DEPTH) - mask |= 0x1; - - if (buffers & PIPE_CLEAR_STENCIL) - mask |= 0x2; - - OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); - OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_ZS)); - - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); - OUT_RING(ring, A5XX_RB_CLEAR_CNTL_FAST_CLEAR | - A5XX_RB_CLEAR_CNTL_MASK(mask)); - - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 1); - OUT_RING(ring, clear); /* RB_CLEAR_COLOR_DW0 */ - - fd5_emit_blit(ctx->batch, ring); - - if (pfb->zsbuf && (buffers & PIPE_CLEAR_DEPTH)) { - struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture); - if (zsbuf->lrz) { - zsbuf->lrz_valid = true; - fd5_clear_lrz(ctx->batch, zsbuf, depth); - } - } - } - - /* disable fast clear to not interfere w/ gmem->mem, etc.. */ - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); - OUT_RING(ring, 0x00000000); /* RB_CLEAR_CNTL */ - - return true; + struct fd_ringbuffer *ring = ctx->batch->draw; + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + + if ((buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) && + is_z32(pfb->zsbuf->format)) + return false; + + fd5_emit_render_cntl(ctx, true, false); + + if (buffers & PIPE_CLEAR_COLOR) { + for (int i = 0; i < pfb->nr_cbufs; i++) { + union util_color uc = {0}; + + if (!pfb->cbufs[i]) + continue; + + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; + + enum pipe_format pfmt = pfb->cbufs[i]->format; + + // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP?? 
+ union pipe_color_union swapped; + switch (fd5_pipe2swap(pfmt)) { + case WZYX: + swapped.ui[0] = color->ui[0]; + swapped.ui[1] = color->ui[1]; + swapped.ui[2] = color->ui[2]; + swapped.ui[3] = color->ui[3]; + break; + case WXYZ: + swapped.ui[2] = color->ui[0]; + swapped.ui[1] = color->ui[1]; + swapped.ui[0] = color->ui[2]; + swapped.ui[3] = color->ui[3]; + break; + case ZYXW: + swapped.ui[3] = color->ui[0]; + swapped.ui[0] = color->ui[1]; + swapped.ui[1] = color->ui[2]; + swapped.ui[2] = color->ui[3]; + break; + case XYZW: + swapped.ui[3] = color->ui[0]; + swapped.ui[2] = color->ui[1]; + swapped.ui[1] = color->ui[2]; + swapped.ui[0] = color->ui[3]; + break; + } + + util_pack_color_union(pfmt, &uc, &swapped); + + OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); + OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_MRT0 + i)); + + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); + OUT_RING(ring, + A5XX_RB_CLEAR_CNTL_FAST_CLEAR | A5XX_RB_CLEAR_CNTL_MASK(0xf)); + + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 4); + OUT_RING(ring, uc.ui[0]); /* RB_CLEAR_COLOR_DW0 */ + OUT_RING(ring, uc.ui[1]); /* RB_CLEAR_COLOR_DW1 */ + OUT_RING(ring, uc.ui[2]); /* RB_CLEAR_COLOR_DW2 */ + OUT_RING(ring, uc.ui[3]); /* RB_CLEAR_COLOR_DW3 */ + + fd5_emit_blit(ctx->batch, ring); + } + } + + if (pfb->zsbuf && (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { + uint32_t clear = util_pack_z_stencil(pfb->zsbuf->format, depth, stencil); + uint32_t mask = 0; + + if (buffers & PIPE_CLEAR_DEPTH) + mask |= 0x1; + + if (buffers & PIPE_CLEAR_STENCIL) + mask |= 0x2; + + OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); + OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(BLIT_ZS)); + + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); + OUT_RING(ring, + A5XX_RB_CLEAR_CNTL_FAST_CLEAR | A5XX_RB_CLEAR_CNTL_MASK(mask)); + + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_COLOR_DW0, 1); + OUT_RING(ring, clear); /* RB_CLEAR_COLOR_DW0 */ + + fd5_emit_blit(ctx->batch, ring); + + if (pfb->zsbuf && (buffers & PIPE_CLEAR_DEPTH)) { + struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture); + if (zsbuf->lrz) { + zsbuf->lrz_valid = true; + fd5_clear_lrz(ctx->batch, zsbuf, depth); + } + } + } + + /* disable fast clear to not interfere w/ gmem->mem, etc.. 
*/ + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); + OUT_RING(ring, 0x00000000); /* RB_CLEAR_CNTL */ + + return true; } void -fd5_draw_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd5_draw_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->draw_vbo = fd5_draw_vbo; - ctx->clear = fd5_clear; + struct fd_context *ctx = fd_context(pctx); + ctx->draw_vbo = fd5_draw_vbo; + ctx->clear = fd5_clear; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.h b/src/gallium/drivers/freedreno/a5xx/fd5_draw.h index 47577fd..59fa77c 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.h @@ -41,107 +41,103 @@ void fd5_draw_init(struct pipe_context *pctx); static inline void fd5_draw(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum pc_di_primtype primtype, - enum pc_di_vis_cull_mode vismode, - enum pc_di_src_sel src_sel, uint32_t count, - uint32_t instances, enum a4xx_index_size idx_type, - uint32_t max_indices, uint32_t idx_offset, - struct pipe_resource *idx_buffer) + enum pc_di_primtype primtype, enum pc_di_vis_cull_mode vismode, + enum pc_di_src_sel src_sel, uint32_t count, uint32_t instances, + enum a4xx_index_size idx_type, uint32_t max_indices, + uint32_t idx_offset, struct pipe_resource *idx_buffer) { - /* for debug after a lock up, write a unique counter value - * to scratch7 for each draw, to make it easier to match up - * register dumps to cmdstream. The combination of IB - * (scratch6) and DRAW is enough to "triangulate" the - * particular draw that caused lockup. - */ - emit_marker5(ring, 7); - - OUT_PKT7(ring, CP_DRAW_INDX_OFFSET, idx_buffer ? 7 : 3); - if (vismode == USE_VISIBILITY) { - /* leave vis mode blank for now, it will be patched up when - * we know if we are binning or not - */ - OUT_RINGP(ring, DRAW4(primtype, src_sel, idx_type, 0), - &batch->draw_patches); - } else { - OUT_RING(ring, DRAW4(primtype, src_sel, idx_type, vismode)); - } - OUT_RING(ring, instances); /* NumInstances */ - OUT_RING(ring, count); /* NumIndices */ - if (idx_buffer) { - OUT_RING(ring, 0x0); /* XXX */ - OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0); - OUT_RING (ring, max_indices); - } - - emit_marker5(ring, 7); - - fd_reset_wfi(batch); + /* for debug after a lock up, write a unique counter value + * to scratch7 for each draw, to make it easier to match up + * register dumps to cmdstream. The combination of IB + * (scratch6) and DRAW is enough to "triangulate" the + * particular draw that caused lockup. + */ + emit_marker5(ring, 7); + + OUT_PKT7(ring, CP_DRAW_INDX_OFFSET, idx_buffer ? 
7 : 3); + if (vismode == USE_VISIBILITY) { + /* leave vis mode blank for now, it will be patched up when + * we know if we are binning or not + */ + OUT_RINGP(ring, DRAW4(primtype, src_sel, idx_type, 0), + &batch->draw_patches); + } else { + OUT_RING(ring, DRAW4(primtype, src_sel, idx_type, vismode)); + } + OUT_RING(ring, instances); /* NumInstances */ + OUT_RING(ring, count); /* NumIndices */ + if (idx_buffer) { + OUT_RING(ring, 0x0); /* XXX */ + OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0); + OUT_RING(ring, max_indices); + } + + emit_marker5(ring, 7); + + fd_reset_wfi(batch); } static inline void fd5_draw_emit(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum pc_di_primtype primtype, - enum pc_di_vis_cull_mode vismode, - const struct pipe_draw_info *info, + enum pc_di_primtype primtype, enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draw, - unsigned index_offset) + const struct pipe_draw_start_count *draw, unsigned index_offset) { - struct pipe_resource *idx_buffer = NULL; - enum a4xx_index_size idx_type; - enum pc_di_src_sel src_sel; - uint32_t max_indices, idx_offset; - - if (indirect && indirect->buffer) { - struct fd_resource *ind = fd_resource(indirect->buffer); - - emit_marker5(ring, 7); - - if (info->index_size) { - struct pipe_resource *idx = info->index.resource; - max_indices = idx->width0 / info->index_size; - - OUT_PKT7(ring, CP_DRAW_INDX_INDIRECT, 6); - OUT_RINGP(ring, DRAW4(primtype, DI_SRC_SEL_DMA, - fd4_size2indextype(info->index_size), 0), - &batch->draw_patches); - OUT_RELOC(ring, fd_resource(idx)->bo, - index_offset, 0, 0); - OUT_RING(ring, A5XX_CP_DRAW_INDX_INDIRECT_3_MAX_INDICES(max_indices)); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - } else { - OUT_PKT7(ring, CP_DRAW_INDIRECT, 3); - OUT_RINGP(ring, DRAW4(primtype, DI_SRC_SEL_AUTO_INDEX, 0, 0), - &batch->draw_patches); - OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); - } - - emit_marker5(ring, 7); - fd_reset_wfi(batch); - - return; - } - - if (info->index_size) { - assert(!info->has_user_indices); - - idx_buffer = info->index.resource; - idx_type = fd4_size2indextype(info->index_size); - max_indices = idx_buffer->width0 / info->index_size; - idx_offset = index_offset + draw->start * info->index_size; - src_sel = DI_SRC_SEL_DMA; - } else { - idx_buffer = NULL; - idx_type = INDEX4_SIZE_32_BIT; - max_indices = 0; - idx_offset = 0; - src_sel = DI_SRC_SEL_AUTO_INDEX; - } - - fd5_draw(batch, ring, primtype, vismode, src_sel, - draw->count, info->instance_count, - idx_type, max_indices, idx_offset, idx_buffer); + struct pipe_resource *idx_buffer = NULL; + enum a4xx_index_size idx_type; + enum pc_di_src_sel src_sel; + uint32_t max_indices, idx_offset; + + if (indirect && indirect->buffer) { + struct fd_resource *ind = fd_resource(indirect->buffer); + + emit_marker5(ring, 7); + + if (info->index_size) { + struct pipe_resource *idx = info->index.resource; + max_indices = idx->width0 / info->index_size; + + OUT_PKT7(ring, CP_DRAW_INDX_INDIRECT, 6); + OUT_RINGP(ring, + DRAW4(primtype, DI_SRC_SEL_DMA, + fd4_size2indextype(info->index_size), 0), + &batch->draw_patches); + OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0); + OUT_RING(ring, A5XX_CP_DRAW_INDX_INDIRECT_3_MAX_INDICES(max_indices)); + OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); + } else { + OUT_PKT7(ring, CP_DRAW_INDIRECT, 3); + OUT_RINGP(ring, DRAW4(primtype, DI_SRC_SEL_AUTO_INDEX, 0, 0), + 
&batch->draw_patches); + OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0); + } + + emit_marker5(ring, 7); + fd_reset_wfi(batch); + + return; + } + + if (info->index_size) { + assert(!info->has_user_indices); + + idx_buffer = info->index.resource; + idx_type = fd4_size2indextype(info->index_size); + max_indices = idx_buffer->width0 / info->index_size; + idx_offset = index_offset + draw->start * info->index_size; + src_sel = DI_SRC_SEL_DMA; + } else { + idx_buffer = NULL; + idx_type = INDEX4_SIZE_32_BIT; + max_indices = 0; + idx_offset = 0; + src_sel = DI_SRC_SEL_AUTO_INDEX; + } + + fd5_draw(batch, ring, primtype, vismode, src_sel, draw->count, + info->instance_count, idx_type, max_indices, idx_offset, + idx_buffer); } #endif /* FD5_DRAW_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c index 9e4ef8e..dc04d42 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c @@ -25,29 +25,29 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_helpers.h" #include "util/format/u_format.h" +#include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "util/u_viewport.h" -#include "freedreno_resource.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" -#include "fd5_emit.h" #include "fd5_blend.h" #include "fd5_blitter.h" #include "fd5_context.h" +#include "fd5_emit.h" +#include "fd5_format.h" #include "fd5_image.h" #include "fd5_program.h" #include "fd5_rasterizer.h" -#include "fd5_texture.h" #include "fd5_screen.h" -#include "fd5_format.h" +#include "fd5_texture.h" #include "fd5_zsa.h" #define emit_const_user fd5_emit_const_user -#define emit_const_bo fd5_emit_const_bo +#define emit_const_bo fd5_emit_const_bo #include "ir3_const.h" /* regid: base const register @@ -56,97 +56,99 @@ */ static void fd5_emit_const_user(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, uint32_t sizedwords, - const uint32_t *dwords) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t sizedwords, const uint32_t *dwords) { - emit_const_asserts(ring, v, regid, sizedwords); - - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + sizedwords); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | - CP_LOAD_STATE4_0_NUM_UNIT(sizedwords/4)); - OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - for (int i = 0; i < sizedwords; i++) - OUT_RING(ring, ((uint32_t *)dwords)[i]); + emit_const_asserts(ring, v, regid, sizedwords); + + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + sizedwords); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid / 4) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | + CP_LOAD_STATE4_0_NUM_UNIT(sizedwords / 4)); + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + for (int i = 0; i < sizedwords; i++) + OUT_RING(ring, ((uint32_t *)dwords)[i]); } static void -fd5_emit_const_bo(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, - uint32_t regid, uint32_t offset, uint32_t sizedwords, struct fd_bo *bo) +fd5_emit_const_bo(struct fd_ringbuffer *ring, + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t 
offset, uint32_t sizedwords, struct fd_bo *bo) { - uint32_t dst_off = regid / 4; - assert(dst_off % 4 == 0); - uint32_t num_unit = sizedwords / 4; - assert(num_unit % 4 == 0); - - emit_const_asserts(ring, v, regid, sizedwords); - - OUT_PKT7(ring, CP_LOAD_STATE4, 3); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(dst_off) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_INDIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | - CP_LOAD_STATE4_0_NUM_UNIT(num_unit)); - OUT_RELOC(ring, bo, offset, - CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS), 0); + uint32_t dst_off = regid / 4; + assert(dst_off % 4 == 0); + uint32_t num_unit = sizedwords / 4; + assert(num_unit % 4 == 0); + + emit_const_asserts(ring, v, regid, sizedwords); + + OUT_PKT7(ring, CP_LOAD_STATE4, 3); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(dst_off) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_INDIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(v->type)) | + CP_LOAD_STATE4_0_NUM_UNIT(num_unit)); + OUT_RELOC(ring, bo, offset, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS), 0); } static void fd5_emit_const_ptrs(struct fd_ringbuffer *ring, gl_shader_stage type, - uint32_t regid, uint32_t num, struct fd_bo **bos, uint32_t *offsets) + uint32_t regid, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - uint32_t anum = align(num, 2); - uint32_t i; - - debug_assert((regid % 4) == 0); - - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * anum)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid/4) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) | - CP_LOAD_STATE4_0_NUM_UNIT(anum/2)); - OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - - for (i = 0; i < num; i++) { - if (bos[i]) { - OUT_RELOC(ring, bos[i], offsets[i], 0, 0); - } else { - OUT_RING(ring, 0xbad00000 | (i << 16)); - OUT_RING(ring, 0xbad00000 | (i << 16)); - } - } - - for (; i < anum; i++) { - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0xffffffff); - } + uint32_t anum = align(num, 2); + uint32_t i; + + debug_assert((regid % 4) == 0); + + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (2 * anum)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(regid / 4) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(fd4_stage2shadersb(type)) | + CP_LOAD_STATE4_0_NUM_UNIT(anum / 2)); + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + + for (i = 0; i < num; i++) { + if (bos[i]) { + OUT_RELOC(ring, bos[i], offsets[i], 0, 0); + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); + OUT_RING(ring, 0xbad00000 | (i << 16)); + } + } + + for (; i < anum; i++) { + OUT_RING(ring, 0xffffffff); + OUT_RING(ring, 0xffffffff); + } } static bool is_stateobj(struct fd_ringbuffer *ring) { - return false; + return false; } static void -emit_const_ptrs(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t dst_offset, - uint32_t num, struct fd_bo **bos, uint32_t *offsets) +emit_const_ptrs(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, + uint32_t dst_offset, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - /* TODO inline this */ - assert(dst_offset + num <= v->constlen * 4); - fd5_emit_const_ptrs(ring, v->type, dst_offset, num, bos, offsets); + /* TODO inline this */ + assert(dst_offset + num <= v->constlen * 4); + fd5_emit_const_ptrs(ring, v->type, dst_offset, num, bos, offsets); } void -fd5_emit_cs_consts(const 
struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, const struct pipe_grid_info *info) +fd5_emit_cs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) { - ir3_emit_cs_consts(v, ring, ctx, info); + ir3_emit_cs_consts(v, ring, ctx, info); } /* Border color layout is diff from a4xx/a5xx.. if it turns out to be @@ -156,735 +158,755 @@ fd5_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *rin */ struct PACKED bcolor_entry { - uint32_t fp32[4]; - uint16_t ui16[4]; - int16_t si16[4]; - - uint16_t fp16[4]; - uint16_t rgb565; - uint16_t rgb5a1; - uint16_t rgba4; - uint8_t __pad0[2]; - uint8_t ui8[4]; - int8_t si8[4]; - uint32_t rgb10a2; - uint32_t z24; /* also s8? */ - - uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */ - uint8_t __pad1[24]; + uint32_t fp32[4]; + uint16_t ui16[4]; + int16_t si16[4]; + + uint16_t fp16[4]; + uint16_t rgb565; + uint16_t rgb5a1; + uint16_t rgba4; + uint8_t __pad0[2]; + uint8_t ui8[4]; + int8_t si8[4]; + uint32_t rgb10a2; + uint32_t z24; /* also s8? */ + + uint16_t + srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */ + uint8_t __pad1[24]; }; -#define FD5_BORDER_COLOR_SIZE 0x60 -#define FD5_BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * FD5_BORDER_COLOR_SIZE) +#define FD5_BORDER_COLOR_SIZE 0x60 +#define FD5_BORDER_COLOR_UPLOAD_SIZE \ + (2 * PIPE_MAX_SAMPLERS * FD5_BORDER_COLOR_SIZE) static void -setup_border_colors(struct fd_texture_stateobj *tex, struct bcolor_entry *entries) +setup_border_colors(struct fd_texture_stateobj *tex, + struct bcolor_entry *entries) { - unsigned i, j; - STATIC_ASSERT(sizeof(struct bcolor_entry) == FD5_BORDER_COLOR_SIZE); - - for (i = 0; i < tex->num_samplers; i++) { - struct bcolor_entry *e = &entries[i]; - struct pipe_sampler_state *sampler = tex->samplers[i]; - union pipe_color_union *bc; - - if (!sampler) - continue; - - bc = &sampler->border_color; - - /* - * XXX HACK ALERT XXX - * - * The border colors need to be swizzled in a particular - * format-dependent order. Even though samplers don't know about - * formats, we can assume that with a GL state tracker, there's a - * 1:1 correspondence between sampler and texture. Take advantage - * of that knowledge. - */ - if ((i >= tex->num_textures) || !tex->textures[i]) - continue; - - enum pipe_format format = tex->textures[i]->format; - const struct util_format_description *desc = - util_format_description(format); - - e->rgb565 = 0; - e->rgb5a1 = 0; - e->rgba4 = 0; - e->rgb10a2 = 0; - e->z24 = 0; - - for (j = 0; j < 4; j++) { - int c = desc->swizzle[j]; - int cd = c; - - /* - * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the - * stencil border color value in bc->ui[0] but according - * to desc->swizzle and desc->channel, the .x component - * is NONE and the stencil value is in the y component. - * Meanwhile the hardware wants this in the .x componetn. 
- */ - if ((format == PIPE_FORMAT_X24S8_UINT) || - (format == PIPE_FORMAT_X32_S8X24_UINT)) { - if (j == 0) { - c = 1; - cd = 0; - } else { - continue; - } - } - - if (c >= 4) - continue; - - if (desc->channel[c].pure_integer) { - uint16_t clamped; - switch (desc->channel[c].size) { - case 2: - assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); - clamped = CLAMP(bc->ui[j], 0, 0x3); - break; - case 8: - if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) - clamped = CLAMP(bc->i[j], -128, 127); - else - clamped = CLAMP(bc->ui[j], 0, 255); - break; - case 10: - assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); - clamped = CLAMP(bc->ui[j], 0, 0x3ff); - break; - case 16: - if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) - clamped = CLAMP(bc->i[j], -32768, 32767); - else - clamped = CLAMP(bc->ui[j], 0, 65535); - break; - default: - assert(!"Unexpected bit size"); - case 32: - clamped = 0; - break; - } - e->fp32[cd] = bc->ui[j]; - e->fp16[cd] = clamped; - } else { - float f = bc->f[j]; - float f_u = CLAMP(f, 0, 1); - float f_s = CLAMP(f, -1, 1); - - e->fp32[c] = fui(f); - e->fp16[c] = _mesa_float_to_half(f); - e->srgb[c] = _mesa_float_to_half(f_u); - e->ui16[c] = f_u * 0xffff; - e->si16[c] = f_s * 0x7fff; - e->ui8[c] = f_u * 0xff; - e->si8[c] = f_s * 0x7f; - if (c == 1) - e->rgb565 |= (int)(f_u * 0x3f) << 5; - else if (c < 3) - e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0); - if (c == 3) - e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0; - else - e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5); - if (c == 3) - e->rgb10a2 |= (int)(f_u * 0x3) << 30; - else - e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10); - e->rgba4 |= (int)(f_u * 0xf) << (c * 4); - if (c == 0) - e->z24 = f_u * 0xffffff; - } - } + unsigned i, j; + STATIC_ASSERT(sizeof(struct bcolor_entry) == FD5_BORDER_COLOR_SIZE); + + for (i = 0; i < tex->num_samplers; i++) { + struct bcolor_entry *e = &entries[i]; + struct pipe_sampler_state *sampler = tex->samplers[i]; + union pipe_color_union *bc; + + if (!sampler) + continue; + + bc = &sampler->border_color; + + /* + * XXX HACK ALERT XXX + * + * The border colors need to be swizzled in a particular + * format-dependent order. Even though samplers don't know about + * formats, we can assume that with a GL state tracker, there's a + * 1:1 correspondence between sampler and texture. Take advantage + * of that knowledge. + */ + if ((i >= tex->num_textures) || !tex->textures[i]) + continue; + + enum pipe_format format = tex->textures[i]->format; + const struct util_format_description *desc = + util_format_description(format); + + e->rgb565 = 0; + e->rgb5a1 = 0; + e->rgba4 = 0; + e->rgb10a2 = 0; + e->z24 = 0; + + for (j = 0; j < 4; j++) { + int c = desc->swizzle[j]; + int cd = c; + + /* + * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the + * stencil border color value in bc->ui[0] but according + * to desc->swizzle and desc->channel, the .x component + * is NONE and the stencil value is in the y component. + * Meanwhile the hardware wants this in the .x componetn. 
+ */ + if ((format == PIPE_FORMAT_X24S8_UINT) || + (format == PIPE_FORMAT_X32_S8X24_UINT)) { + if (j == 0) { + c = 1; + cd = 0; + } else { + continue; + } + } + + if (c >= 4) + continue; + + if (desc->channel[c].pure_integer) { + uint16_t clamped; + switch (desc->channel[c].size) { + case 2: + assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); + clamped = CLAMP(bc->ui[j], 0, 0x3); + break; + case 8: + if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) + clamped = CLAMP(bc->i[j], -128, 127); + else + clamped = CLAMP(bc->ui[j], 0, 255); + break; + case 10: + assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); + clamped = CLAMP(bc->ui[j], 0, 0x3ff); + break; + case 16: + if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) + clamped = CLAMP(bc->i[j], -32768, 32767); + else + clamped = CLAMP(bc->ui[j], 0, 65535); + break; + default: + assert(!"Unexpected bit size"); + case 32: + clamped = 0; + break; + } + e->fp32[cd] = bc->ui[j]; + e->fp16[cd] = clamped; + } else { + float f = bc->f[j]; + float f_u = CLAMP(f, 0, 1); + float f_s = CLAMP(f, -1, 1); + + e->fp32[c] = fui(f); + e->fp16[c] = _mesa_float_to_half(f); + e->srgb[c] = _mesa_float_to_half(f_u); + e->ui16[c] = f_u * 0xffff; + e->si16[c] = f_s * 0x7fff; + e->ui8[c] = f_u * 0xff; + e->si8[c] = f_s * 0x7f; + if (c == 1) + e->rgb565 |= (int)(f_u * 0x3f) << 5; + else if (c < 3) + e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0); + if (c == 3) + e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0; + else + e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5); + if (c == 3) + e->rgb10a2 |= (int)(f_u * 0x3) << 30; + else + e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10); + e->rgba4 |= (int)(f_u * 0xf) << (c * 4); + if (c == 0) + e->z24 = f_u * 0xffffff; + } + } #ifdef DEBUG - memset(&e->__pad0, 0, sizeof(e->__pad0)); - memset(&e->__pad1, 0, sizeof(e->__pad1)); + memset(&e->__pad0, 0, sizeof(e->__pad0)); + memset(&e->__pad1, 0, sizeof(e->__pad1)); #endif - } + } } static void -emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) - assert_dt +emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) assert_dt { - struct fd5_context *fd5_ctx = fd5_context(ctx); - struct bcolor_entry *entries; - unsigned off; - void *ptr; + struct fd5_context *fd5_ctx = fd5_context(ctx); + struct bcolor_entry *entries; + unsigned off; + void *ptr; - STATIC_ASSERT(sizeof(struct bcolor_entry) == FD5_BORDER_COLOR_SIZE); + STATIC_ASSERT(sizeof(struct bcolor_entry) == FD5_BORDER_COLOR_SIZE); - u_upload_alloc(fd5_ctx->border_color_uploader, - 0, FD5_BORDER_COLOR_UPLOAD_SIZE, - FD5_BORDER_COLOR_UPLOAD_SIZE, &off, - &fd5_ctx->border_color_buf, - &ptr); + u_upload_alloc(fd5_ctx->border_color_uploader, 0, + FD5_BORDER_COLOR_UPLOAD_SIZE, FD5_BORDER_COLOR_UPLOAD_SIZE, + &off, &fd5_ctx->border_color_buf, &ptr); - entries = ptr; + entries = ptr; - setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]); - setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT], - &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]); + setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]); + setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT], + &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]); - OUT_PKT4(ring, REG_A5XX_TPL1_TP_BORDER_COLOR_BASE_ADDR_LO, 2); - OUT_RELOC(ring, fd_resource(fd5_ctx->border_color_buf)->bo, off, 0, 0); + OUT_PKT4(ring, REG_A5XX_TPL1_TP_BORDER_COLOR_BASE_ADDR_LO, 2); + OUT_RELOC(ring, fd_resource(fd5_ctx->border_color_buf)->bo, off, 0, 0); - u_upload_unmap(fd5_ctx->border_color_uploader); + 
u_upload_unmap(fd5_ctx->border_color_uploader); } static bool emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum a4xx_state_block sb, struct fd_texture_stateobj *tex) - assert_dt + enum a4xx_state_block sb, + struct fd_texture_stateobj *tex) assert_dt { - bool needs_border = false; - unsigned bcolor_offset = (sb == SB4_FS_TEX) ? ctx->tex[PIPE_SHADER_VERTEX].num_samplers : 0; - unsigned i; - - if (tex->num_samplers > 0) { - /* output sampler state: */ - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (4 * tex->num_samplers)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(tex->num_samplers)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - for (i = 0; i < tex->num_samplers; i++) { - static const struct fd5_sampler_stateobj dummy_sampler = {}; - const struct fd5_sampler_stateobj *sampler = tex->samplers[i] ? - fd5_sampler_stateobj(tex->samplers[i]) : - &dummy_sampler; - OUT_RING(ring, sampler->texsamp0); - OUT_RING(ring, sampler->texsamp1); - OUT_RING(ring, sampler->texsamp2 | - A5XX_TEX_SAMP_2_BCOLOR_OFFSET(bcolor_offset)); - OUT_RING(ring, sampler->texsamp3); - - needs_border |= sampler->needs_border; - } - } - - if (tex->num_textures > 0) { - unsigned num_textures = tex->num_textures; - - /* emit texture state: */ - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (12 * num_textures)); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(num_textures)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - for (i = 0; i < tex->num_textures; i++) { - static const struct fd5_pipe_sampler_view dummy_view = {}; - const struct fd5_pipe_sampler_view *view = tex->textures[i] ? - fd5_pipe_sampler_view(tex->textures[i]) : - &dummy_view; - enum a5xx_tile_mode tile_mode = TILE5_LINEAR; - - if (view->base.texture) - tile_mode = fd_resource(view->base.texture)->layout.tile_mode; - - OUT_RING(ring, view->texconst0 | - A5XX_TEX_CONST_0_TILE_MODE(tile_mode)); - OUT_RING(ring, view->texconst1); - OUT_RING(ring, view->texconst2); - OUT_RING(ring, view->texconst3); - if (view->base.texture) { - struct fd_resource *rsc = fd_resource(view->base.texture); - if (view->base.format == PIPE_FORMAT_X32_S8X24_UINT) - rsc = rsc->stencil; - OUT_RELOC(ring, rsc->bo, view->offset, - (uint64_t)view->texconst5 << 32, 0); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, view->texconst5); - } - OUT_RING(ring, view->texconst6); - OUT_RING(ring, view->texconst7); - OUT_RING(ring, view->texconst8); - OUT_RING(ring, view->texconst9); - OUT_RING(ring, view->texconst10); - OUT_RING(ring, view->texconst11); - } - } - - return needs_border; + bool needs_border = false; + unsigned bcolor_offset = + (sb == SB4_FS_TEX) ? 
ctx->tex[PIPE_SHADER_VERTEX].num_samplers : 0; + unsigned i; + + if (tex->num_samplers > 0) { + /* output sampler state: */ + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (4 * tex->num_samplers)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(tex->num_samplers)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + for (i = 0; i < tex->num_samplers; i++) { + static const struct fd5_sampler_stateobj dummy_sampler = {}; + const struct fd5_sampler_stateobj *sampler = + tex->samplers[i] ? fd5_sampler_stateobj(tex->samplers[i]) + : &dummy_sampler; + OUT_RING(ring, sampler->texsamp0); + OUT_RING(ring, sampler->texsamp1); + OUT_RING(ring, sampler->texsamp2 | + A5XX_TEX_SAMP_2_BCOLOR_OFFSET(bcolor_offset)); + OUT_RING(ring, sampler->texsamp3); + + needs_border |= sampler->needs_border; + } + } + + if (tex->num_textures > 0) { + unsigned num_textures = tex->num_textures; + + /* emit texture state: */ + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + (12 * num_textures)); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(num_textures)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + for (i = 0; i < tex->num_textures; i++) { + static const struct fd5_pipe_sampler_view dummy_view = {}; + const struct fd5_pipe_sampler_view *view = + tex->textures[i] ? fd5_pipe_sampler_view(tex->textures[i]) + : &dummy_view; + enum a5xx_tile_mode tile_mode = TILE5_LINEAR; + + if (view->base.texture) + tile_mode = fd_resource(view->base.texture)->layout.tile_mode; + + OUT_RING(ring, + view->texconst0 | A5XX_TEX_CONST_0_TILE_MODE(tile_mode)); + OUT_RING(ring, view->texconst1); + OUT_RING(ring, view->texconst2); + OUT_RING(ring, view->texconst3); + if (view->base.texture) { + struct fd_resource *rsc = fd_resource(view->base.texture); + if (view->base.format == PIPE_FORMAT_X32_S8X24_UINT) + rsc = rsc->stencil; + OUT_RELOC(ring, rsc->bo, view->offset, + (uint64_t)view->texconst5 << 32, 0); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, view->texconst5); + } + OUT_RING(ring, view->texconst6); + OUT_RING(ring, view->texconst7); + OUT_RING(ring, view->texconst8); + OUT_RING(ring, view->texconst9); + OUT_RING(ring, view->texconst10); + OUT_RING(ring, view->texconst11); + } + } + + return needs_border; } static void emit_ssbos(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum a4xx_state_block sb, struct fd_shaderbuf_stateobj *so, - const struct ir3_shader_variant *v) + enum a4xx_state_block sb, struct fd_shaderbuf_stateobj *so, + const struct ir3_shader_variant *v) { - unsigned count = util_last_bit(so->enabled_mask); - - for (unsigned i = 0; i < count; i++) { - OUT_PKT7(ring, CP_LOAD_STATE4, 5); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(i) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(1)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(1) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - - struct pipe_shader_buffer *buf = &so->sb[i]; - unsigned sz = buf->buffer_size; - - /* width is in dwords, overflows into height: */ - sz /= 4; - - OUT_RING(ring, A5XX_SSBO_1_0_WIDTH(sz)); - OUT_RING(ring, 
A5XX_SSBO_1_1_HEIGHT(sz >> 16)); - - OUT_PKT7(ring, CP_LOAD_STATE4, 5); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(i) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(1)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(2) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - - if (buf->buffer) { - struct fd_resource *rsc = fd_resource(buf->buffer); - OUT_RELOC(ring, rsc->bo, buf->buffer_offset, 0, 0); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - } + unsigned count = util_last_bit(so->enabled_mask); + + for (unsigned i = 0; i < count; i++) { + OUT_PKT7(ring, CP_LOAD_STATE4, 5); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(i) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(1)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(1) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + + struct pipe_shader_buffer *buf = &so->sb[i]; + unsigned sz = buf->buffer_size; + + /* width is in dwords, overflows into height: */ + sz /= 4; + + OUT_RING(ring, A5XX_SSBO_1_0_WIDTH(sz)); + OUT_RING(ring, A5XX_SSBO_1_1_HEIGHT(sz >> 16)); + + OUT_PKT7(ring, CP_LOAD_STATE4, 5); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(i) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(1)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(2) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + + if (buf->buffer) { + struct fd_resource *rsc = fd_resource(buf->buffer); + OUT_RELOC(ring, rsc->bo, buf->buffer_offset, 0, 0); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + } } void fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit) { - int32_t i, j; - const struct fd_vertex_state *vtx = emit->vtx; - const struct ir3_shader_variant *vp = fd5_emit_get_vp(emit); - - for (i = 0, j = 0; i <= vp->inputs_count; i++) { - if (vp->inputs[i].sysval) - continue; - if (vp->inputs[i].compmask) { - struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; - const struct pipe_vertex_buffer *vb = - &vtx->vertexbuf.vb[elem->vertex_buffer_index]; - struct fd_resource *rsc = fd_resource(vb->buffer.resource); - enum pipe_format pfmt = elem->src_format; - enum a5xx_vtx_fmt fmt = fd5_pipe2vtx(pfmt); - bool isint = util_format_is_pure_integer(pfmt); - uint32_t off = vb->buffer_offset + elem->src_offset; - uint32_t size = fd_bo_size(rsc->bo) - off; - debug_assert(fmt != VFMT5_NONE); + int32_t i, j; + const struct fd_vertex_state *vtx = emit->vtx; + const struct ir3_shader_variant *vp = fd5_emit_get_vp(emit); + + for (i = 0, j = 0; i <= vp->inputs_count; i++) { + if (vp->inputs[i].sysval) + continue; + if (vp->inputs[i].compmask) { + struct pipe_vertex_element *elem = &vtx->vtx->pipe[i]; + const struct pipe_vertex_buffer *vb = + &vtx->vertexbuf.vb[elem->vertex_buffer_index]; + struct fd_resource *rsc = fd_resource(vb->buffer.resource); + enum pipe_format pfmt = elem->src_format; + enum a5xx_vtx_fmt fmt = fd5_pipe2vtx(pfmt); + bool isint = util_format_is_pure_integer(pfmt); + uint32_t off = vb->buffer_offset + elem->src_offset; + uint32_t size = fd_bo_size(rsc->bo) - off; + debug_assert(fmt != VFMT5_NONE); #ifdef DEBUG - /* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 - */ - if (off > fd_bo_size(rsc->bo)) - continue; + /* see + * 
dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10 + */ + if (off > fd_bo_size(rsc->bo)) + continue; #endif - OUT_PKT4(ring, REG_A5XX_VFD_FETCH(j), 4); - OUT_RELOC(ring, rsc->bo, off, 0, 0); - OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */ - OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */ - - OUT_PKT4(ring, REG_A5XX_VFD_DECODE(j), 2); - OUT_RING(ring, A5XX_VFD_DECODE_INSTR_IDX(j) | - A5XX_VFD_DECODE_INSTR_FORMAT(fmt) | - COND(elem->instance_divisor, A5XX_VFD_DECODE_INSTR_INSTANCED) | - A5XX_VFD_DECODE_INSTR_SWAP(fd5_pipe2swap(pfmt)) | - A5XX_VFD_DECODE_INSTR_UNK30 | - COND(!isint, A5XX_VFD_DECODE_INSTR_FLOAT)); - OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */ - - OUT_PKT4(ring, REG_A5XX_VFD_DEST_CNTL(j), 1); - OUT_RING(ring, A5XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vp->inputs[i].compmask) | - A5XX_VFD_DEST_CNTL_INSTR_REGID(vp->inputs[i].regid)); - - j++; - } - } - - OUT_PKT4(ring, REG_A5XX_VFD_CONTROL_0, 1); - OUT_RING(ring, A5XX_VFD_CONTROL_0_VTXCNT(j)); + OUT_PKT4(ring, REG_A5XX_VFD_FETCH(j), 4); + OUT_RELOC(ring, rsc->bo, off, 0, 0); + OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */ + OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */ + + OUT_PKT4(ring, REG_A5XX_VFD_DECODE(j), 2); + OUT_RING( + ring, + A5XX_VFD_DECODE_INSTR_IDX(j) | A5XX_VFD_DECODE_INSTR_FORMAT(fmt) | + COND(elem->instance_divisor, A5XX_VFD_DECODE_INSTR_INSTANCED) | + A5XX_VFD_DECODE_INSTR_SWAP(fd5_pipe2swap(pfmt)) | + A5XX_VFD_DECODE_INSTR_UNK30 | + COND(!isint, A5XX_VFD_DECODE_INSTR_FLOAT)); + OUT_RING( + ring, + MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */ + + OUT_PKT4(ring, REG_A5XX_VFD_DEST_CNTL(j), 1); + OUT_RING(ring, + A5XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vp->inputs[i].compmask) | + A5XX_VFD_DEST_CNTL_INSTR_REGID(vp->inputs[i].regid)); + + j++; + } + } + + OUT_PKT4(ring, REG_A5XX_VFD_CONTROL_0, 1); + OUT_RING(ring, A5XX_VFD_CONTROL_0_VTXCNT(j)); } void fd5_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd5_emit *emit) + struct fd5_emit *emit) { - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - const struct ir3_shader_variant *vp = fd5_emit_get_vp(emit); - const struct ir3_shader_variant *fp = fd5_emit_get_fp(emit); - const enum fd_dirty_3d_state dirty = emit->dirty; - bool needs_border = false; - - emit_marker5(ring, 5); - - if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->binning_pass) { - unsigned char mrt_comp[A5XX_MAX_RENDER_TARGETS] = {0}; - - for (unsigned i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) { - mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 
0xf : 0; - } - - OUT_PKT4(ring, REG_A5XX_RB_RENDER_COMPONENTS, 1); - OUT_RING(ring, A5XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | - A5XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | - A5XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | - A5XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | - A5XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | - A5XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | - A5XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | - A5XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) { - struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); - uint32_t rb_alpha_control = zsa->rb_alpha_control; - - if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0]))) - rb_alpha_control &= ~A5XX_RB_ALPHA_CONTROL_ALPHA_TEST; - - OUT_PKT4(ring, REG_A5XX_RB_ALPHA_CONTROL, 1); - OUT_RING(ring, rb_alpha_control); - - OUT_PKT4(ring, REG_A5XX_RB_STENCIL_CONTROL, 1); - OUT_RING(ring, zsa->rb_stencil_control); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_BLEND | FD_DIRTY_PROG)) { - struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend); - struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); - - if (pfb->zsbuf) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - uint32_t gras_lrz_cntl = zsa->gras_lrz_cntl; - - if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid) - gras_lrz_cntl = 0; - else if (emit->binning_pass && blend->lrz_write && zsa->lrz_write) - gras_lrz_cntl |= A5XX_GRAS_LRZ_CNTL_LRZ_WRITE; - - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, gras_lrz_cntl); - } - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { - struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); - struct pipe_stencil_ref *sr = &ctx->stencil_ref; - - OUT_PKT4(ring, REG_A5XX_RB_STENCILREFMASK, 2); - OUT_RING(ring, zsa->rb_stencilrefmask | - A5XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); - OUT_RING(ring, zsa->rb_stencilrefmask_bf | - A5XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1])); - } - - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { - struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); - bool fragz = fp->no_earlyz || fp->has_kill || zsa->base.alpha_enabled || fp->writes_pos; - - OUT_PKT4(ring, REG_A5XX_RB_DEPTH_CNTL, 1); - OUT_RING(ring, zsa->rb_depth_cntl); - - OUT_PKT4(ring, REG_A5XX_RB_DEPTH_PLANE_CNTL, 1); - OUT_RING(ring, COND(fragz, A5XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z) | - COND(fragz && fp->fragcoord_compmask != 0, - A5XX_RB_DEPTH_PLANE_CNTL_UNK1)); - - OUT_PKT4(ring, REG_A5XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); - OUT_RING(ring, COND(fragz, A5XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z) | - COND(fragz && fp->fragcoord_compmask != 0, - A5XX_GRAS_SU_DEPTH_PLANE_CNTL_UNK1)); - } - - /* NOTE: scissor enabled bit is part of rasterizer state: */ - if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) { - struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - - OUT_PKT4(ring, REG_A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0, 2); - OUT_RING(ring, A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(scissor->minx) | - A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(scissor->miny)); - OUT_RING(ring, A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(scissor->maxx - 1) | - A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(scissor->maxy - 1)); - - OUT_PKT4(ring, REG_A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0, 2); - OUT_RING(ring, A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(scissor->minx) | - A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(scissor->miny)); - OUT_RING(ring, A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(scissor->maxx - 1) | - A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(scissor->maxy - 
1)); - - ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, scissor->minx); - ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, scissor->miny); - ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); - ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); - } - - if (dirty & FD_DIRTY_VIEWPORT) { - fd_wfi(ctx->batch, ring); - OUT_PKT4(ring, REG_A5XX_GRAS_CL_VPORT_XOFFSET_0, 6); - OUT_RING(ring, A5XX_GRAS_CL_VPORT_XOFFSET_0(ctx->viewport.translate[0])); - OUT_RING(ring, A5XX_GRAS_CL_VPORT_XSCALE_0(ctx->viewport.scale[0])); - OUT_RING(ring, A5XX_GRAS_CL_VPORT_YOFFSET_0(ctx->viewport.translate[1])); - OUT_RING(ring, A5XX_GRAS_CL_VPORT_YSCALE_0(ctx->viewport.scale[1])); - OUT_RING(ring, A5XX_GRAS_CL_VPORT_ZOFFSET_0(ctx->viewport.translate[2])); - OUT_RING(ring, A5XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2])); - } - - if (dirty & FD_DIRTY_PROG) - fd5_program_emit(ctx, ring, emit); - - if (dirty & FD_DIRTY_RASTERIZER) { - struct fd5_rasterizer_stateobj *rasterizer = - fd5_rasterizer_stateobj(ctx->rasterizer); - - OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1); - OUT_RING(ring, rasterizer->gras_su_cntl | - COND(pfb->samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE)); - - OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2); - OUT_RING(ring, rasterizer->gras_su_point_minmax); - OUT_RING(ring, rasterizer->gras_su_point_size); - - OUT_PKT4(ring, REG_A5XX_GRAS_SU_POLY_OFFSET_SCALE, 3); - OUT_RING(ring, rasterizer->gras_su_poly_offset_scale); - OUT_RING(ring, rasterizer->gras_su_poly_offset_offset); - OUT_RING(ring, rasterizer->gras_su_poly_offset_clamp); - - OUT_PKT4(ring, REG_A5XX_PC_RASTER_CNTL, 1); - OUT_RING(ring, rasterizer->pc_raster_cntl); - - OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); - OUT_RING(ring, rasterizer->gras_cl_clip_cntl); - } - - /* note: must come after program emit.. because there is some overlap - * in registers, ex. PC_PRIMITIVE_CNTL and we rely on some cached - * values from fd5_program_emit() to avoid having to re-emit the prog - * every time rast state changes. - * - * Since the primitive restart state is not part of a tracked object, we - * re-emit this register every time. 
- */ - if (emit->info && ctx->rasterizer) { - struct fd5_rasterizer_stateobj *rasterizer = - fd5_rasterizer_stateobj(ctx->rasterizer); - unsigned max_loc = fd5_context(ctx)->max_loc; - - OUT_PKT4(ring, REG_A5XX_PC_PRIMITIVE_CNTL, 1); - OUT_RING(ring, rasterizer->pc_primitive_cntl | - A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC(max_loc) | - COND(emit->info->primitive_restart && emit->info->index_size, - A5XX_PC_PRIMITIVE_CNTL_PRIMITIVE_RESTART)); - } - - if (dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { - uint32_t posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH); - unsigned nr = pfb->nr_cbufs; - - if (emit->binning_pass) - nr = 0; - else if (ctx->rasterizer->rasterizer_discard) - nr = 0; - - OUT_PKT4(ring, REG_A5XX_RB_FS_OUTPUT_CNTL, 1); - OUT_RING(ring, A5XX_RB_FS_OUTPUT_CNTL_MRT(nr) | - COND(fp->writes_pos, A5XX_RB_FS_OUTPUT_CNTL_FRAG_WRITES_Z)); - - OUT_PKT4(ring, REG_A5XX_SP_FS_OUTPUT_CNTL, 1); - OUT_RING(ring, A5XX_SP_FS_OUTPUT_CNTL_MRT(nr) | - A5XX_SP_FS_OUTPUT_CNTL_DEPTH_REGID(posz_regid) | - A5XX_SP_FS_OUTPUT_CNTL_SAMPLEMASK_REGID(regid(63, 0))); - } - - ir3_emit_vs_consts(vp, ring, ctx, emit->info, emit->indirect, emit->draw); - if (!emit->binning_pass) - ir3_emit_fs_consts(fp, ring, ctx); - - struct ir3_stream_output_info *info = &vp->shader->stream_output; - if (info->num_outputs) { - struct fd_streamout_stateobj *so = &ctx->streamout; - - for (unsigned i = 0; i < so->num_targets; i++) { - struct fd_stream_output_target *target = fd_stream_output_target(so->targets[i]); - - if (!target) - continue; - - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(i), 3); - /* VPC_SO[i].BUFFER_BASE_LO: */ - OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0); - OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset); - - struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo; - - if (so->reset & (1 << i)) { - assert(so->offsets[i] == 0); - - OUT_PKT7(ring, CP_MEM_WRITE, 3); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - OUT_RING(ring, target->base.buffer_offset); - - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(i), 1); - OUT_RING(ring, target->base.buffer_offset); - } else { - OUT_PKT7(ring, CP_MEM_TO_REG, 3); - OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A5XX_VPC_SO_BUFFER_OFFSET(i)) | - CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 | - CP_MEM_TO_REG_0_CNT(0)); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - } - - // After a draw HW would write the new offset to offset_bo - OUT_PKT4(ring, REG_A5XX_VPC_SO_FLUSH_BASE_LO(i), 2); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - - so->reset &= ~(1 << i); - - emit->streamout_mask |= (1 << i); - } - } - - if (dirty & FD_DIRTY_BLEND) { - struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend); - uint32_t i; - - for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) { - enum pipe_format format = pipe_surface_format(pfb->cbufs[i]); - bool is_int = util_format_is_pure_integer(format); - bool has_alpha = util_format_has_alpha(format); - uint32_t control = blend->rb_mrt[i].control; - - if (is_int) { - control &= A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; - control |= A5XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); - } - - if (!has_alpha) { - control &= ~A5XX_RB_MRT_CONTROL_BLEND2; - } - - OUT_PKT4(ring, REG_A5XX_RB_MRT_CONTROL(i), 1); - OUT_RING(ring, control); - - OUT_PKT4(ring, REG_A5XX_RB_MRT_BLEND_CONTROL(i), 1); - OUT_RING(ring, blend->rb_mrt[i].blend_control); - } - - OUT_PKT4(ring, REG_A5XX_SP_BLEND_CNTL, 1); - OUT_RING(ring, blend->sp_blend_cntl); - } - - if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK)) { - struct 
fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend); - - OUT_PKT4(ring, REG_A5XX_RB_BLEND_CNTL, 1); - OUT_RING(ring, blend->rb_blend_cntl | - A5XX_RB_BLEND_CNTL_SAMPLE_MASK(ctx->sample_mask)); - } - - if (dirty & FD_DIRTY_BLEND_COLOR) { - struct pipe_blend_color *bcolor = &ctx->blend_color; - - OUT_PKT4(ring, REG_A5XX_RB_BLEND_RED, 8); - OUT_RING(ring, A5XX_RB_BLEND_RED_FLOAT(bcolor->color[0]) | - A5XX_RB_BLEND_RED_UINT(bcolor->color[0] * 0xff) | - A5XX_RB_BLEND_RED_SINT(bcolor->color[0] * 0x7f)); - OUT_RING(ring, A5XX_RB_BLEND_RED_F32(bcolor->color[0])); - OUT_RING(ring, A5XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]) | - A5XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 0xff) | - A5XX_RB_BLEND_GREEN_SINT(bcolor->color[1] * 0x7f)); - OUT_RING(ring, A5XX_RB_BLEND_RED_F32(bcolor->color[1])); - OUT_RING(ring, A5XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]) | - A5XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 0xff) | - A5XX_RB_BLEND_BLUE_SINT(bcolor->color[2] * 0x7f)); - OUT_RING(ring, A5XX_RB_BLEND_BLUE_F32(bcolor->color[2])); - OUT_RING(ring, A5XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]) | - A5XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 0xff) | - A5XX_RB_BLEND_ALPHA_SINT(bcolor->color[3] * 0x7f)); - OUT_RING(ring, A5XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); - } - - if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) { - needs_border |= emit_textures(ctx, ring, SB4_VS_TEX, - &ctx->tex[PIPE_SHADER_VERTEX]); - OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1); - OUT_RING(ring, ctx->tex[PIPE_SHADER_VERTEX].num_textures); - } - - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) { - needs_border |= emit_textures(ctx, ring, SB4_FS_TEX, - &ctx->tex[PIPE_SHADER_FRAGMENT]); - } - - OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1); - OUT_RING(ring, ctx->shaderimg[PIPE_SHADER_FRAGMENT].enabled_mask ? - ~0 : ctx->tex[PIPE_SHADER_FRAGMENT].num_textures); - - OUT_PKT4(ring, REG_A5XX_TPL1_CS_TEX_COUNT, 1); - OUT_RING(ring, 0); - - if (needs_border) - emit_border_color(ctx, ring); - - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_SSBO) - emit_ssbos(ctx, ring, SB4_SSBO, &ctx->shaderbuf[PIPE_SHADER_FRAGMENT], fp); - - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_IMAGE) - fd5_emit_images(ctx, ring, PIPE_SHADER_FRAGMENT, fp); + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + const struct ir3_shader_variant *vp = fd5_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd5_emit_get_fp(emit); + const enum fd_dirty_3d_state dirty = emit->dirty; + bool needs_border = false; + + emit_marker5(ring, 5); + + if ((dirty & FD_DIRTY_FRAMEBUFFER) && !emit->binning_pass) { + unsigned char mrt_comp[A5XX_MAX_RENDER_TARGETS] = {0}; + + for (unsigned i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) { + mrt_comp[i] = ((i < pfb->nr_cbufs) && pfb->cbufs[i]) ? 
0xf : 0; + } + + OUT_PKT4(ring, REG_A5XX_RB_RENDER_COMPONENTS, 1); + OUT_RING(ring, A5XX_RB_RENDER_COMPONENTS_RT0(mrt_comp[0]) | + A5XX_RB_RENDER_COMPONENTS_RT1(mrt_comp[1]) | + A5XX_RB_RENDER_COMPONENTS_RT2(mrt_comp[2]) | + A5XX_RB_RENDER_COMPONENTS_RT3(mrt_comp[3]) | + A5XX_RB_RENDER_COMPONENTS_RT4(mrt_comp[4]) | + A5XX_RB_RENDER_COMPONENTS_RT5(mrt_comp[5]) | + A5XX_RB_RENDER_COMPONENTS_RT6(mrt_comp[6]) | + A5XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) { + struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); + uint32_t rb_alpha_control = zsa->rb_alpha_control; + + if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0]))) + rb_alpha_control &= ~A5XX_RB_ALPHA_CONTROL_ALPHA_TEST; + + OUT_PKT4(ring, REG_A5XX_RB_ALPHA_CONTROL, 1); + OUT_RING(ring, rb_alpha_control); + + OUT_PKT4(ring, REG_A5XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, zsa->rb_stencil_control); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_BLEND | FD_DIRTY_PROG)) { + struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend); + struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); + + if (pfb->zsbuf) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + uint32_t gras_lrz_cntl = zsa->gras_lrz_cntl; + + if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid) + gras_lrz_cntl = 0; + else if (emit->binning_pass && blend->lrz_write && zsa->lrz_write) + gras_lrz_cntl |= A5XX_GRAS_LRZ_CNTL_LRZ_WRITE; + + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); + OUT_RING(ring, gras_lrz_cntl); + } + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_STENCIL_REF)) { + struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); + struct pipe_stencil_ref *sr = &ctx->stencil_ref; + + OUT_PKT4(ring, REG_A5XX_RB_STENCILREFMASK, 2); + OUT_RING(ring, zsa->rb_stencilrefmask | + A5XX_RB_STENCILREFMASK_STENCILREF(sr->ref_value[0])); + OUT_RING(ring, zsa->rb_stencilrefmask_bf | + A5XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1])); + } + + if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { + struct fd5_zsa_stateobj *zsa = fd5_zsa_stateobj(ctx->zsa); + bool fragz = fp->no_earlyz || fp->has_kill || zsa->base.alpha_enabled || + fp->writes_pos; + + OUT_PKT4(ring, REG_A5XX_RB_DEPTH_CNTL, 1); + OUT_RING(ring, zsa->rb_depth_cntl); + + OUT_PKT4(ring, REG_A5XX_RB_DEPTH_PLANE_CNTL, 1); + OUT_RING(ring, COND(fragz, A5XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z) | + COND(fragz && fp->fragcoord_compmask != 0, + A5XX_RB_DEPTH_PLANE_CNTL_UNK1)); + + OUT_PKT4(ring, REG_A5XX_GRAS_SU_DEPTH_PLANE_CNTL, 1); + OUT_RING(ring, COND(fragz, A5XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z) | + COND(fragz && fp->fragcoord_compmask != 0, + A5XX_GRAS_SU_DEPTH_PLANE_CNTL_UNK1)); + } + + /* NOTE: scissor enabled bit is part of rasterizer state: */ + if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) { + struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); + + OUT_PKT4(ring, REG_A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0, 2); + OUT_RING(ring, A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(scissor->minx) | + A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(scissor->miny)); + OUT_RING(ring, A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(scissor->maxx - 1) | + A5XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(scissor->maxy - 1)); + + OUT_PKT4(ring, REG_A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0, 2); + OUT_RING(ring, A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(scissor->minx) | + A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(scissor->miny)); + OUT_RING(ring, + A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(scissor->maxx - 1) | + A5XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(scissor->maxy - 
1)); + + ctx->batch->max_scissor.minx = + MIN2(ctx->batch->max_scissor.minx, scissor->minx); + ctx->batch->max_scissor.miny = + MIN2(ctx->batch->max_scissor.miny, scissor->miny); + ctx->batch->max_scissor.maxx = + MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); + ctx->batch->max_scissor.maxy = + MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + fd_wfi(ctx->batch, ring); + OUT_PKT4(ring, REG_A5XX_GRAS_CL_VPORT_XOFFSET_0, 6); + OUT_RING(ring, A5XX_GRAS_CL_VPORT_XOFFSET_0(ctx->viewport.translate[0])); + OUT_RING(ring, A5XX_GRAS_CL_VPORT_XSCALE_0(ctx->viewport.scale[0])); + OUT_RING(ring, A5XX_GRAS_CL_VPORT_YOFFSET_0(ctx->viewport.translate[1])); + OUT_RING(ring, A5XX_GRAS_CL_VPORT_YSCALE_0(ctx->viewport.scale[1])); + OUT_RING(ring, A5XX_GRAS_CL_VPORT_ZOFFSET_0(ctx->viewport.translate[2])); + OUT_RING(ring, A5XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2])); + } + + if (dirty & FD_DIRTY_PROG) + fd5_program_emit(ctx, ring, emit); + + if (dirty & FD_DIRTY_RASTERIZER) { + struct fd5_rasterizer_stateobj *rasterizer = + fd5_rasterizer_stateobj(ctx->rasterizer); + + OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1); + OUT_RING(ring, rasterizer->gras_su_cntl | + COND(pfb->samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE)); + + OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2); + OUT_RING(ring, rasterizer->gras_su_point_minmax); + OUT_RING(ring, rasterizer->gras_su_point_size); + + OUT_PKT4(ring, REG_A5XX_GRAS_SU_POLY_OFFSET_SCALE, 3); + OUT_RING(ring, rasterizer->gras_su_poly_offset_scale); + OUT_RING(ring, rasterizer->gras_su_poly_offset_offset); + OUT_RING(ring, rasterizer->gras_su_poly_offset_clamp); + + OUT_PKT4(ring, REG_A5XX_PC_RASTER_CNTL, 1); + OUT_RING(ring, rasterizer->pc_raster_cntl); + + OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); + OUT_RING(ring, rasterizer->gras_cl_clip_cntl); + } + + /* note: must come after program emit.. because there is some overlap + * in registers, ex. PC_PRIMITIVE_CNTL and we rely on some cached + * values from fd5_program_emit() to avoid having to re-emit the prog + * every time rast state changes. + * + * Since the primitive restart state is not part of a tracked object, we + * re-emit this register every time. 
+ */ + if (emit->info && ctx->rasterizer) { + struct fd5_rasterizer_stateobj *rasterizer = + fd5_rasterizer_stateobj(ctx->rasterizer); + unsigned max_loc = fd5_context(ctx)->max_loc; + + OUT_PKT4(ring, REG_A5XX_PC_PRIMITIVE_CNTL, 1); + OUT_RING(ring, + rasterizer->pc_primitive_cntl | + A5XX_PC_PRIMITIVE_CNTL_STRIDE_IN_VPC(max_loc) | + COND(emit->info->primitive_restart && emit->info->index_size, + A5XX_PC_PRIMITIVE_CNTL_PRIMITIVE_RESTART)); + } + + if (dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) { + uint32_t posz_regid = ir3_find_output_regid(fp, FRAG_RESULT_DEPTH); + unsigned nr = pfb->nr_cbufs; + + if (emit->binning_pass) + nr = 0; + else if (ctx->rasterizer->rasterizer_discard) + nr = 0; + + OUT_PKT4(ring, REG_A5XX_RB_FS_OUTPUT_CNTL, 1); + OUT_RING(ring, + A5XX_RB_FS_OUTPUT_CNTL_MRT(nr) | + COND(fp->writes_pos, A5XX_RB_FS_OUTPUT_CNTL_FRAG_WRITES_Z)); + + OUT_PKT4(ring, REG_A5XX_SP_FS_OUTPUT_CNTL, 1); + OUT_RING(ring, A5XX_SP_FS_OUTPUT_CNTL_MRT(nr) | + A5XX_SP_FS_OUTPUT_CNTL_DEPTH_REGID(posz_regid) | + A5XX_SP_FS_OUTPUT_CNTL_SAMPLEMASK_REGID(regid(63, 0))); + } + + ir3_emit_vs_consts(vp, ring, ctx, emit->info, emit->indirect, emit->draw); + if (!emit->binning_pass) + ir3_emit_fs_consts(fp, ring, ctx); + + struct ir3_stream_output_info *info = &vp->shader->stream_output; + if (info->num_outputs) { + struct fd_streamout_stateobj *so = &ctx->streamout; + + for (unsigned i = 0; i < so->num_targets; i++) { + struct fd_stream_output_target *target = + fd_stream_output_target(so->targets[i]); + + if (!target) + continue; + + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(i), 3); + /* VPC_SO[i].BUFFER_BASE_LO: */ + OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0); + OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset); + + struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo; + + if (so->reset & (1 << i)) { + assert(so->offsets[i] == 0); + + OUT_PKT7(ring, CP_MEM_WRITE, 3); + OUT_RELOC(ring, offset_bo, 0, 0, 0); + OUT_RING(ring, target->base.buffer_offset); + + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(i), 1); + OUT_RING(ring, target->base.buffer_offset); + } else { + OUT_PKT7(ring, CP_MEM_TO_REG, 3); + OUT_RING(ring, + CP_MEM_TO_REG_0_REG(REG_A5XX_VPC_SO_BUFFER_OFFSET(i)) | + CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 | + CP_MEM_TO_REG_0_CNT(0)); + OUT_RELOC(ring, offset_bo, 0, 0, 0); + } + + // After a draw HW would write the new offset to offset_bo + OUT_PKT4(ring, REG_A5XX_VPC_SO_FLUSH_BASE_LO(i), 2); + OUT_RELOC(ring, offset_bo, 0, 0, 0); + + so->reset &= ~(1 << i); + + emit->streamout_mask |= (1 << i); + } + } + + if (dirty & FD_DIRTY_BLEND) { + struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend); + uint32_t i; + + for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) { + enum pipe_format format = pipe_surface_format(pfb->cbufs[i]); + bool is_int = util_format_is_pure_integer(format); + bool has_alpha = util_format_has_alpha(format); + uint32_t control = blend->rb_mrt[i].control; + + if (is_int) { + control &= A5XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK; + control |= A5XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY); + } + + if (!has_alpha) { + control &= ~A5XX_RB_MRT_CONTROL_BLEND2; + } + + OUT_PKT4(ring, REG_A5XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, control); + + OUT_PKT4(ring, REG_A5XX_RB_MRT_BLEND_CONTROL(i), 1); + OUT_RING(ring, blend->rb_mrt[i].blend_control); + } + + OUT_PKT4(ring, REG_A5XX_SP_BLEND_CNTL, 1); + OUT_RING(ring, blend->sp_blend_cntl); + } + + if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK)) { 
+ struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend); + + OUT_PKT4(ring, REG_A5XX_RB_BLEND_CNTL, 1); + OUT_RING(ring, blend->rb_blend_cntl | + A5XX_RB_BLEND_CNTL_SAMPLE_MASK(ctx->sample_mask)); + } + + if (dirty & FD_DIRTY_BLEND_COLOR) { + struct pipe_blend_color *bcolor = &ctx->blend_color; + + OUT_PKT4(ring, REG_A5XX_RB_BLEND_RED, 8); + OUT_RING(ring, A5XX_RB_BLEND_RED_FLOAT(bcolor->color[0]) | + A5XX_RB_BLEND_RED_UINT(bcolor->color[0] * 0xff) | + A5XX_RB_BLEND_RED_SINT(bcolor->color[0] * 0x7f)); + OUT_RING(ring, A5XX_RB_BLEND_RED_F32(bcolor->color[0])); + OUT_RING(ring, A5XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]) | + A5XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 0xff) | + A5XX_RB_BLEND_GREEN_SINT(bcolor->color[1] * 0x7f)); + OUT_RING(ring, A5XX_RB_BLEND_RED_F32(bcolor->color[1])); + OUT_RING(ring, A5XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]) | + A5XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 0xff) | + A5XX_RB_BLEND_BLUE_SINT(bcolor->color[2] * 0x7f)); + OUT_RING(ring, A5XX_RB_BLEND_BLUE_F32(bcolor->color[2])); + OUT_RING(ring, A5XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]) | + A5XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 0xff) | + A5XX_RB_BLEND_ALPHA_SINT(bcolor->color[3] * 0x7f)); + OUT_RING(ring, A5XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); + } + + if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_TEX) { + needs_border |= + emit_textures(ctx, ring, SB4_VS_TEX, &ctx->tex[PIPE_SHADER_VERTEX]); + OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1); + OUT_RING(ring, ctx->tex[PIPE_SHADER_VERTEX].num_textures); + } + + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_TEX) { + needs_border |= + emit_textures(ctx, ring, SB4_FS_TEX, &ctx->tex[PIPE_SHADER_FRAGMENT]); + } + + OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1); + OUT_RING(ring, ctx->shaderimg[PIPE_SHADER_FRAGMENT].enabled_mask + ? 
~0 + : ctx->tex[PIPE_SHADER_FRAGMENT].num_textures); + + OUT_PKT4(ring, REG_A5XX_TPL1_CS_TEX_COUNT, 1); + OUT_RING(ring, 0); + + if (needs_border) + emit_border_color(ctx, ring); + + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_SSBO) + emit_ssbos(ctx, ring, SB4_SSBO, &ctx->shaderbuf[PIPE_SHADER_FRAGMENT], + fp); + + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_IMAGE) + fd5_emit_images(ctx, ring, PIPE_SHADER_FRAGMENT, fp); } void fd5_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *cp) + struct ir3_shader_variant *cp) { - enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE]; + enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE]; - if (dirty & FD_DIRTY_SHADER_TEX) { - bool needs_border = false; - needs_border |= emit_textures(ctx, ring, SB4_CS_TEX, - &ctx->tex[PIPE_SHADER_COMPUTE]); + if (dirty & FD_DIRTY_SHADER_TEX) { + bool needs_border = false; + needs_border |= + emit_textures(ctx, ring, SB4_CS_TEX, &ctx->tex[PIPE_SHADER_COMPUTE]); - if (needs_border) - emit_border_color(ctx, ring); + if (needs_border) + emit_border_color(ctx, ring); - OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A5XX_TPL1_HS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A5XX_TPL1_HS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A5XX_TPL1_DS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A5XX_TPL1_DS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A5XX_TPL1_GS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A5XX_TPL1_GS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1); - OUT_RING(ring, 0); - } + OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 1); + OUT_RING(ring, 0); + } - OUT_PKT4(ring, REG_A5XX_TPL1_CS_TEX_COUNT, 1); - OUT_RING(ring, ctx->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask ? - ~0 : ctx->tex[PIPE_SHADER_COMPUTE].num_textures); + OUT_PKT4(ring, REG_A5XX_TPL1_CS_TEX_COUNT, 1); + OUT_RING(ring, ctx->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask + ? 
~0 + : ctx->tex[PIPE_SHADER_COMPUTE].num_textures); - if (dirty & FD_DIRTY_SHADER_SSBO) - emit_ssbos(ctx, ring, SB4_CS_SSBO, &ctx->shaderbuf[PIPE_SHADER_COMPUTE], cp); + if (dirty & FD_DIRTY_SHADER_SSBO) + emit_ssbos(ctx, ring, SB4_CS_SSBO, &ctx->shaderbuf[PIPE_SHADER_COMPUTE], + cp); - if (dirty & FD_DIRTY_SHADER_IMAGE) - fd5_emit_images(ctx, ring, PIPE_SHADER_COMPUTE, cp); + if (dirty & FD_DIRTY_SHADER_IMAGE) + fd5_emit_images(ctx, ring, PIPE_SHADER_COMPUTE, cp); } /* emit setup at begin of new cmdstream buffer (don't rely on previous @@ -893,263 +915,263 @@ fd5_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, void fd5_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - fd5_set_render_mode(ctx, ring, BYPASS); - fd5_cache_flush(batch, ring); + fd5_set_render_mode(ctx, ring, BYPASS); + fd5_cache_flush(batch, ring); - OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); - OUT_RING(ring, 0xfffff); + OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); + OUT_RING(ring, 0xfffff); -/* -t7 opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords) -0000000500024048: 70d08003 00000000 001c5000 00000005 -t7 opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords) -0000000500024058: 70d08003 00000010 001c7000 00000005 + /* + t7 opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords) + 0000000500024048: 70d08003 00000000 001c5000 00000005 + t7 opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords) + 0000000500024058: 70d08003 00000010 001c7000 00000005 -t7 opcode: CP_WAIT_FOR_IDLE (26) (1 dwords) -0000000500024068: 70268000 -*/ + t7 opcode: CP_WAIT_FOR_IDLE (26) (1 dwords) + 0000000500024068: 70268000 + */ - OUT_PKT4(ring, REG_A5XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, 0xffffffff); + OUT_PKT4(ring, REG_A5XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, 0xffffffff); - OUT_PKT4(ring, REG_A5XX_PC_RASTER_CNTL, 1); - OUT_RING(ring, 0x00000012); + OUT_PKT4(ring, REG_A5XX_PC_RASTER_CNTL, 1); + OUT_RING(ring, 0x00000012); - OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2); - OUT_RING(ring, A5XX_GRAS_SU_POINT_MINMAX_MIN(1.0) | - A5XX_GRAS_SU_POINT_MINMAX_MAX(4092.0)); - OUT_RING(ring, A5XX_GRAS_SU_POINT_SIZE(0.5)); + OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2); + OUT_RING(ring, A5XX_GRAS_SU_POINT_MINMAX_MIN(1.0) | + A5XX_GRAS_SU_POINT_MINMAX_MAX(4092.0)); + OUT_RING(ring, A5XX_GRAS_SU_POINT_SIZE(0.5)); - OUT_PKT4(ring, REG_A5XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_SU_CONSERVATIVE_RAS_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_SU_CONSERVATIVE_RAS_CNTL */ - OUT_PKT4(ring, REG_A5XX_GRAS_SC_SCREEN_SCISSOR_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_SC_SCREEN_SCISSOR_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_SC_SCREEN_SCISSOR_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_SC_SCREEN_SCISSOR_CNTL */ - OUT_PKT4(ring, REG_A5XX_SP_VS_CONFIG_MAX_CONST, 1); - OUT_RING(ring, 0); /* SP_VS_CONFIG_MAX_CONST */ + OUT_PKT4(ring, REG_A5XX_SP_VS_CONFIG_MAX_CONST, 1); + OUT_RING(ring, 0); /* SP_VS_CONFIG_MAX_CONST */ - OUT_PKT4(ring, REG_A5XX_SP_FS_CONFIG_MAX_CONST, 1); - OUT_RING(ring, 0); /* SP_FS_CONFIG_MAX_CONST */ + OUT_PKT4(ring, REG_A5XX_SP_FS_CONFIG_MAX_CONST, 1); + OUT_RING(ring, 0); /* SP_FS_CONFIG_MAX_CONST */ - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E292, 2); - OUT_RING(ring, 0x00000000); /* UNKNOWN_E292 */ - OUT_RING(ring, 0x00000000); /* UNKNOWN_E293 */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E292, 2); + OUT_RING(ring, 0x00000000); /* UNKNOWN_E292 */ + OUT_RING(ring, 
0x00000000); /* UNKNOWN_E293 */ - OUT_PKT4(ring, REG_A5XX_RB_MODE_CNTL, 1); - OUT_RING(ring, 0x00000044); /* RB_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_RB_MODE_CNTL, 1); + OUT_RING(ring, 0x00000044); /* RB_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_RB_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0x00100000); /* RB_DBG_ECO_CNTL */ + OUT_PKT4(ring, REG_A5XX_RB_DBG_ECO_CNTL, 1); + OUT_RING(ring, 0x00100000); /* RB_DBG_ECO_CNTL */ - OUT_PKT4(ring, REG_A5XX_VFD_MODE_CNTL, 1); - OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_VFD_MODE_CNTL, 1); + OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_PC_MODE_CNTL, 1); - OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_PC_MODE_CNTL, 1); + OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_SP_MODE_CNTL, 1); - OUT_RING(ring, 0x0000001e); /* SP_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_SP_MODE_CNTL, 1); + OUT_RING(ring, 0x0000001e); /* SP_MODE_CNTL */ - if (ctx->screen->gpu_id == 540) { - OUT_PKT4(ring, REG_A5XX_SP_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0x800); /* SP_DBG_ECO_CNTL */ + if (ctx->screen->gpu_id == 540) { + OUT_PKT4(ring, REG_A5XX_SP_DBG_ECO_CNTL, 1); + OUT_RING(ring, 0x800); /* SP_DBG_ECO_CNTL */ - OUT_PKT4(ring, REG_A5XX_HLSQ_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0x0); + OUT_PKT4(ring, REG_A5XX_HLSQ_DBG_ECO_CNTL, 1); + OUT_RING(ring, 0x0); - OUT_PKT4(ring, REG_A5XX_VPC_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0x800400); - } else { - OUT_PKT4(ring, REG_A5XX_SP_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0x40000800); /* SP_DBG_ECO_CNTL */ - } + OUT_PKT4(ring, REG_A5XX_VPC_DBG_ECO_CNTL, 1); + OUT_RING(ring, 0x800400); + } else { + OUT_PKT4(ring, REG_A5XX_SP_DBG_ECO_CNTL, 1); + OUT_RING(ring, 0x40000800); /* SP_DBG_ECO_CNTL */ + } - OUT_PKT4(ring, REG_A5XX_TPL1_MODE_CNTL, 1); - OUT_RING(ring, 0x00000544); /* TPL1_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_TPL1_MODE_CNTL, 1); + OUT_RING(ring, 0x00000544); /* TPL1_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_HLSQ_TIMEOUT_THRESHOLD_0, 2); - OUT_RING(ring, 0x00000080); /* HLSQ_TIMEOUT_THRESHOLD_0 */ - OUT_RING(ring, 0x00000000); /* HLSQ_TIMEOUT_THRESHOLD_1 */ + OUT_PKT4(ring, REG_A5XX_HLSQ_TIMEOUT_THRESHOLD_0, 2); + OUT_RING(ring, 0x00000080); /* HLSQ_TIMEOUT_THRESHOLD_0 */ + OUT_RING(ring, 0x00000000); /* HLSQ_TIMEOUT_THRESHOLD_1 */ - OUT_PKT4(ring, REG_A5XX_VPC_DBG_ECO_CNTL, 1); - OUT_RING(ring, 0x00000400); /* VPC_DBG_ECO_CNTL */ + OUT_PKT4(ring, REG_A5XX_VPC_DBG_ECO_CNTL, 1); + OUT_RING(ring, 0x00000400); /* VPC_DBG_ECO_CNTL */ - OUT_PKT4(ring, REG_A5XX_HLSQ_MODE_CNTL, 1); - OUT_RING(ring, 0x00000001); /* HLSQ_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_HLSQ_MODE_CNTL, 1); + OUT_RING(ring, 0x00000001); /* HLSQ_MODE_CNTL */ - OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1); - OUT_RING(ring, 0x00000000); /* VPC_MODE_CNTL */ + OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1); + OUT_RING(ring, 0x00000000); /* VPC_MODE_CNTL */ - /* we don't use this yet.. probably best to disable.. */ - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + /* we don't use this yet.. probably best to disable.. 
*/ + OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - OUT_PKT4(ring, REG_A5XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_SU_CONSERVATIVE_RAS_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_SU_CONSERVATIVE_RAS_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_SU_CONSERVATIVE_RAS_CNTL */ - OUT_PKT4(ring, REG_A5XX_GRAS_SC_BIN_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_SC_BIN_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_SC_BIN_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_SC_BIN_CNTL */ - OUT_PKT4(ring, REG_A5XX_GRAS_SC_BIN_CNTL, 1); - OUT_RING(ring, 0x00000000); /* GRAS_SC_BIN_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_SC_BIN_CNTL, 1); + OUT_RING(ring, 0x00000000); /* GRAS_SC_BIN_CNTL */ - OUT_PKT4(ring, REG_A5XX_VPC_FS_PRIMITIVEID_CNTL, 1); - OUT_RING(ring, 0x000000ff); /* VPC_FS_PRIMITIVEID_CNTL */ + OUT_PKT4(ring, REG_A5XX_VPC_FS_PRIMITIVEID_CNTL, 1); + OUT_RING(ring, 0x000000ff); /* VPC_FS_PRIMITIVEID_CNTL */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, A5XX_VPC_SO_OVERRIDE_SO_DISABLE); + OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, A5XX_VPC_SO_OVERRIDE_SO_DISABLE); - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(0), 3); - OUT_RING(ring, 0x00000000); /* VPC_SO_BUFFER_BASE_LO_0 */ - OUT_RING(ring, 0x00000000); /* VPC_SO_BUFFER_BASE_HI_0 */ - OUT_RING(ring, 0x00000000); /* VPC_SO_BUFFER_SIZE_0 */ + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(0), 3); + OUT_RING(ring, 0x00000000); /* VPC_SO_BUFFER_BASE_LO_0 */ + OUT_RING(ring, 0x00000000); /* VPC_SO_BUFFER_BASE_HI_0 */ + OUT_RING(ring, 0x00000000); /* VPC_SO_BUFFER_SIZE_0 */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_FLUSH_BASE_LO(0), 2); - OUT_RING(ring, 0x00000000); /* VPC_SO_FLUSH_BASE_LO_0 */ - OUT_RING(ring, 0x00000000); /* VPC_SO_FLUSH_BASE_HI_0 */ + OUT_PKT4(ring, REG_A5XX_VPC_SO_FLUSH_BASE_LO(0), 2); + OUT_RING(ring, 0x00000000); /* VPC_SO_FLUSH_BASE_LO_0 */ + OUT_RING(ring, 0x00000000); /* VPC_SO_FLUSH_BASE_HI_0 */ - OUT_PKT4(ring, REG_A5XX_PC_GS_PARAM, 1); - OUT_RING(ring, 0x00000000); /* PC_GS_PARAM */ + OUT_PKT4(ring, REG_A5XX_PC_GS_PARAM, 1); + OUT_RING(ring, 0x00000000); /* PC_GS_PARAM */ - OUT_PKT4(ring, REG_A5XX_PC_HS_PARAM, 1); - OUT_RING(ring, 0x00000000); /* PC_HS_PARAM */ + OUT_PKT4(ring, REG_A5XX_PC_HS_PARAM, 1); + OUT_RING(ring, 0x00000000); /* PC_HS_PARAM */ - OUT_PKT4(ring, REG_A5XX_TPL1_TP_FS_ROTATION_CNTL, 1); - OUT_RING(ring, 0x00000000); /* TPL1_TP_FS_ROTATION_CNTL */ + OUT_PKT4(ring, REG_A5XX_TPL1_TP_FS_ROTATION_CNTL, 1); + OUT_RING(ring, 0x00000000); /* TPL1_TP_FS_ROTATION_CNTL */ - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E004, 1); - OUT_RING(ring, 0x00000000); /* UNKNOWN_E004 */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E004, 1); + OUT_RING(ring, 0x00000000); /* UNKNOWN_E004 */ - OUT_PKT4(ring, REG_A5XX_GRAS_SU_LAYERED, 1); - OUT_RING(ring, 0x00000000); /* GRAS_SU_LAYERED */ + OUT_PKT4(ring, REG_A5XX_GRAS_SU_LAYERED, 1); + OUT_RING(ring, 0x00000000); /* GRAS_SU_LAYERED */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUF_CNTL, 1); - OUT_RING(ring, 0x00000000); /* VPC_SO_BUF_CNTL */ + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUF_CNTL, 1); + OUT_RING(ring, 0x00000000); /* VPC_SO_BUF_CNTL */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(0), 1); - OUT_RING(ring, 0x00000000); /* UNKNOWN_E2AB */ + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(0), 1); + OUT_RING(ring, 0x00000000); /* 
UNKNOWN_E2AB */ - OUT_PKT4(ring, REG_A5XX_PC_GS_LAYERED, 1); - OUT_RING(ring, 0x00000000); /* PC_GS_LAYERED */ + OUT_PKT4(ring, REG_A5XX_PC_GS_LAYERED, 1); + OUT_RING(ring, 0x00000000); /* PC_GS_LAYERED */ - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5AB, 1); - OUT_RING(ring, 0x00000000); /* UNKNOWN_E5AB */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5AB, 1); + OUT_RING(ring, 0x00000000); /* UNKNOWN_E5AB */ - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5C2, 1); - OUT_RING(ring, 0x00000000); /* UNKNOWN_E5C2 */ + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5C2, 1); + OUT_RING(ring, 0x00000000); /* UNKNOWN_E5C2 */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(1), 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(1), 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(1), 6); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(2), 6); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(1), 6); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(2), 6); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(3), 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5DB, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(3), 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5DB, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_SP_HS_CTRL_REG0, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_SP_HS_CTRL_REG0, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_SP_GS_CTRL_REG0, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_SP_GS_CTRL_REG0, 1); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 2); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_TPL1_FS_TEX_COUNT, 2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7C0, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7C5, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7C0, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, 
REG_A5XX_UNKNOWN_E7C5, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7CA, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7CF, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7D4, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7D9, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); - OUT_RING(ring, 0x00000000); + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7CA, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7CF, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7D4, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_UNKNOWN_E7D9, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); + OUT_RING(ring, 0x00000000); } static void fd5_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst, - unsigned dst_off, struct pipe_resource *src, unsigned src_off, - unsigned sizedwords) + unsigned dst_off, struct pipe_resource *src, unsigned src_off, + unsigned sizedwords) { - struct fd_bo *src_bo = fd_resource(src)->bo; - struct fd_bo *dst_bo = fd_resource(dst)->bo; - unsigned i; - - for (i = 0; i < sizedwords; i++) { - OUT_PKT7(ring, CP_MEM_TO_MEM, 5); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, dst_bo, dst_off, 0, 0); - OUT_RELOC(ring, src_bo, src_off, 0, 0); - - dst_off += 4; - src_off += 4; - } + struct fd_bo *src_bo = fd_resource(src)->bo; + struct fd_bo *dst_bo = fd_resource(dst)->bo; + unsigned i; + + for (i = 0; i < sizedwords; i++) { + OUT_PKT7(ring, CP_MEM_TO_MEM, 5); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, dst_bo, dst_off, 0, 0); + OUT_RELOC(ring, src_bo, src_off, 0, 0); + + dst_off += 4; + src_off += 4; + } } void fd5_emit_init_screen(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->emit_ib = fd5_emit_ib; - screen->mem_to_mem = fd5_mem_to_mem; + struct fd_screen *screen = fd_screen(pscreen); + screen->emit_ib = fd5_emit_ib; + screen->mem_to_mem = fd5_mem_to_mem; } void diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_emit.h b/src/gallium/drivers/freedreno/a5xx/fd5_emit.h index 024e5a5..ef6ced5 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.h @@ -29,186 +29,191 @@ #include "pipe/p_context.h" -#include "freedreno_batch.h" -#include "freedreno_context.h" #include "fd5_context.h" #include "fd5_format.h" #include "fd5_program.h" #include "fd5_screen.h" +#include "freedreno_batch.h" +#include "freedreno_context.h" #include "ir3_gallium.h" struct fd_ringbuffer; /* grouped together emit-state for prog/vertex/state emit: */ struct fd5_emit { - struct pipe_debug_callback *debug; - const struct fd_vertex_state *vtx; - const struct fd5_program_state *prog; - const struct pipe_draw_info *info; - const struct pipe_draw_indirect_info *indirect; - const struct pipe_draw_start_count *draw; - bool binning_pass; - struct ir3_cache_key key; - enum fd_dirty_3d_state 
dirty; - - uint32_t sprite_coord_enable; /* bitmask */ - bool sprite_coord_mode; - bool rasterflat; - - /* in binning pass, we don't have real frag shader, so we - * don't know if real draw disqualifies lrz write. So just - * figure that out up-front and stash it in the emit. - */ - bool no_lrz_write; - - /* cached to avoid repeated lookups of same variants: */ - const struct ir3_shader_variant *vs, *fs; - /* TODO: other shader stages.. */ - - unsigned streamout_mask; + struct pipe_debug_callback *debug; + const struct fd_vertex_state *vtx; + const struct fd5_program_state *prog; + const struct pipe_draw_info *info; + const struct pipe_draw_indirect_info *indirect; + const struct pipe_draw_start_count *draw; + bool binning_pass; + struct ir3_cache_key key; + enum fd_dirty_3d_state dirty; + + uint32_t sprite_coord_enable; /* bitmask */ + bool sprite_coord_mode; + bool rasterflat; + + /* in binning pass, we don't have real frag shader, so we + * don't know if real draw disqualifies lrz write. So just + * figure that out up-front and stash it in the emit. + */ + bool no_lrz_write; + + /* cached to avoid repeated lookups of same variants: */ + const struct ir3_shader_variant *vs, *fs; + /* TODO: other shader stages.. */ + + unsigned streamout_mask; }; -static inline enum a5xx_color_fmt fd5_emit_format(struct pipe_surface *surf) +static inline enum a5xx_color_fmt +fd5_emit_format(struct pipe_surface *surf) { - if (!surf) - return 0; - return fd5_pipe2color(surf->format); + if (!surf) + return 0; + return fd5_pipe2color(surf->format); } static inline const struct ir3_shader_variant * fd5_emit_get_vp(struct fd5_emit *emit) { - if (!emit->vs) { - /* We use nonbinning VS during binning when TFB is enabled because that - * is what has all the outputs that might be involved in TFB. - */ - if (emit->binning_pass && !emit->prog->vs->shader->stream_output.num_outputs) - emit->vs = emit->prog->bs; - else - emit->vs = emit->prog->vs; - } - return emit->vs; + if (!emit->vs) { + /* We use nonbinning VS during binning when TFB is enabled because that + * is what has all the outputs that might be involved in TFB. 
+ */ + if (emit->binning_pass && + !emit->prog->vs->shader->stream_output.num_outputs) + emit->vs = emit->prog->bs; + else + emit->vs = emit->prog->vs; + } + return emit->vs; } static inline const struct ir3_shader_variant * fd5_emit_get_fp(struct fd5_emit *emit) { - if (!emit->fs) { - if (emit->binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fs = {}; - emit->fs = &binning_fs; - } else { - emit->fs = emit->prog->fs; - } - } - return emit->fs; + if (!emit->fs) { + if (emit->binning_pass) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct ir3_shader_variant binning_fs = {}; + emit->fs = &binning_fs; + } else { + emit->fs = emit->prog->fs; + } + } + return emit->fs; } static inline void -fd5_cache_flush(struct fd_batch *batch, struct fd_ringbuffer *ring) - assert_dt +fd5_cache_flush(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt { - fd_reset_wfi(batch); - OUT_PKT4(ring, REG_A5XX_UCHE_CACHE_INVALIDATE_MIN_LO, 5); - OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MIN_LO */ - OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MIN_HI */ - OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MAX_LO */ - OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MAX_HI */ - OUT_RING(ring, 0x00000012); /* UCHE_CACHE_INVALIDATE */ - fd_wfi(batch, ring); + fd_reset_wfi(batch); + OUT_PKT4(ring, REG_A5XX_UCHE_CACHE_INVALIDATE_MIN_LO, 5); + OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MIN_LO */ + OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MIN_HI */ + OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MAX_LO */ + OUT_RING(ring, 0x00000000); /* UCHE_CACHE_INVALIDATE_MAX_HI */ + OUT_RING(ring, 0x00000012); /* UCHE_CACHE_INVALIDATE */ + fd_wfi(batch, ring); } static inline void fd5_set_render_mode(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum render_mode_cmd mode) + enum render_mode_cmd mode) { - /* TODO add preemption support, gmem bypass, etc */ - emit_marker5(ring, 7); - OUT_PKT7(ring, CP_SET_RENDER_MODE, 5); - OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(mode)); - OUT_RING(ring, 0x00000000); /* ADDR_LO */ - OUT_RING(ring, 0x00000000); /* ADDR_HI */ - OUT_RING(ring, COND(mode == GMEM, CP_SET_RENDER_MODE_3_GMEM_ENABLE) | - COND(mode == BINNING, CP_SET_RENDER_MODE_3_VSC_ENABLE)); - OUT_RING(ring, 0x00000000); - emit_marker5(ring, 7); + /* TODO add preemption support, gmem bypass, etc */ + emit_marker5(ring, 7); + OUT_PKT7(ring, CP_SET_RENDER_MODE, 5); + OUT_RING(ring, CP_SET_RENDER_MODE_0_MODE(mode)); + OUT_RING(ring, 0x00000000); /* ADDR_LO */ + OUT_RING(ring, 0x00000000); /* ADDR_HI */ + OUT_RING(ring, COND(mode == GMEM, CP_SET_RENDER_MODE_3_GMEM_ENABLE) | + COND(mode == BINNING, CP_SET_RENDER_MODE_3_VSC_ENABLE)); + OUT_RING(ring, 0x00000000); + emit_marker5(ring, 7); } static inline void fd5_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum vgt_event_type evt, bool timestamp) + enum vgt_event_type evt, bool timestamp) { - OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 4 : 1); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt)); - if (timestamp) { - OUT_RELOC(ring, fd5_context(batch->ctx)->blit_mem, 0, 0, 0); /* ADDR_LO/HI */ - OUT_RING(ring, 0x00000000); - } + OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 
4 : 1); + OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt)); + if (timestamp) { + OUT_RELOC(ring, fd5_context(batch->ctx)->blit_mem, 0, 0, + 0); /* ADDR_LO/HI */ + OUT_RING(ring, 0x00000000); + } } static inline void fd5_emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring) { - emit_marker5(ring, 7); - fd5_event_write(batch, ring, BLIT, true); - emit_marker5(ring, 7); + emit_marker5(ring, 7); + fd5_event_write(batch, ring, BLIT, true); + emit_marker5(ring, 7); } static inline void -fd5_emit_render_cntl(struct fd_context *ctx, bool blit, bool binning) - assert_dt +fd5_emit_render_cntl(struct fd_context *ctx, bool blit, bool binning) assert_dt { - struct fd_ringbuffer *ring = binning ? ctx->batch->binning : ctx->batch->draw; - - /* TODO eventually this partially depends on the pfb state, ie. - * which of the cbuf(s)/zsbuf has an UBWC flag buffer.. that part - * we could probably cache and just regenerate if framebuffer - * state is dirty (or something like that).. - * - * Other bits seem to depend on query state, like if samples-passed - * query is active. - */ - bool samples_passed = (fd5_context(ctx)->samples_passed_queries > 0); - OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1); - OUT_RING(ring, 0x00000000 | /* RB_RENDER_CNTL */ - COND(binning, A5XX_RB_RENDER_CNTL_BINNING_PASS) | - COND(binning, A5XX_RB_RENDER_CNTL_DISABLE_COLOR_PIPE) | - COND(samples_passed, A5XX_RB_RENDER_CNTL_SAMPLES_PASSED) | - COND(!blit, 0x8)); - - OUT_PKT4(ring, REG_A5XX_GRAS_SC_CNTL, 1); - OUT_RING(ring, 0x00000008 | /* GRAS_SC_CNTL */ - COND(binning, A5XX_GRAS_SC_CNTL_BINNING_PASS) | - COND(samples_passed, A5XX_GRAS_SC_CNTL_SAMPLES_PASSED)); + struct fd_ringbuffer *ring = + binning ? ctx->batch->binning : ctx->batch->draw; + + /* TODO eventually this partially depends on the pfb state, ie. + * which of the cbuf(s)/zsbuf has an UBWC flag buffer.. that part + * we could probably cache and just regenerate if framebuffer + * state is dirty (or something like that).. + * + * Other bits seem to depend on query state, like if samples-passed + * query is active. + */ + bool samples_passed = (fd5_context(ctx)->samples_passed_queries > 0); + OUT_PKT4(ring, REG_A5XX_RB_RENDER_CNTL, 1); + OUT_RING(ring, 0x00000000 | /* RB_RENDER_CNTL */ + COND(binning, A5XX_RB_RENDER_CNTL_BINNING_PASS) | + COND(binning, A5XX_RB_RENDER_CNTL_DISABLE_COLOR_PIPE) | + COND(samples_passed, A5XX_RB_RENDER_CNTL_SAMPLES_PASSED) | + COND(!blit, 0x8)); + + OUT_PKT4(ring, REG_A5XX_GRAS_SC_CNTL, 1); + OUT_RING(ring, 0x00000008 | /* GRAS_SC_CNTL */ + COND(binning, A5XX_GRAS_SC_CNTL_BINNING_PASS) | + COND(samples_passed, A5XX_GRAS_SC_CNTL_SAMPLES_PASSED)); } static inline void fd5_emit_lrz_flush(struct fd_batch *batch, struct fd_ringbuffer *ring) { - /* TODO I think the extra writes to GRAS_LRZ_CNTL are probably - * a workaround and not needed on all a5xx. - */ - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, A5XX_GRAS_LRZ_CNTL_ENABLE); + /* TODO I think the extra writes to GRAS_LRZ_CNTL are probably + * a workaround and not needed on all a5xx. 
+ */ + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); + OUT_RING(ring, A5XX_GRAS_LRZ_CNTL_ENABLE); - fd5_event_write(batch, ring, LRZ_FLUSH, false); + fd5_event_write(batch, ring, LRZ_FLUSH, false); - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, 0x0); + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_CNTL, 1); + OUT_RING(ring, 0x0); } -void fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd5_emit *emit) assert_dt; +void fd5_emit_vertex_bufs(struct fd_ringbuffer *ring, + struct fd5_emit *emit) assert_dt; void fd5_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd5_emit *emit) assert_dt; + struct fd5_emit *emit) assert_dt; void fd5_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *cp) assert_dt; -void fd5_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, const struct pipe_grid_info *info) assert_dt; + struct ir3_shader_variant *cp) assert_dt; +void fd5_emit_cs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) assert_dt; -void fd5_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt; +void fd5_emit_restore(struct fd_batch *batch, + struct fd_ringbuffer *ring) assert_dt; void fd5_emit_init_screen(struct pipe_screen *pscreen); void fd5_emit_init(struct pipe_context *pctx); @@ -216,15 +221,15 @@ void fd5_emit_init(struct pipe_context *pctx); static inline void fd5_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) { - /* for debug after a lock up, write a unique counter value - * to scratch6 for each IB, to make it easier to match up - * register dumps to cmdstream. The combination of IB and - * DRAW (scratch7) is enough to "triangulate" the particular - * draw that caused lockup. - */ - emit_marker5(ring, 6); - __OUT_IB5(ring, target); - emit_marker5(ring, 6); + /* for debug after a lock up, write a unique counter value + * to scratch6 for each IB, to make it easier to match up + * register dumps to cmdstream. The combination of IB and + * DRAW (scratch7) is enough to "triangulate" the particular + * draw that caused lockup. + */ + emit_marker5(ring, 6); + __OUT_IB5(ring, target); + emit_marker5(ring, 6); } #endif /* FD5_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_format.c b/src/gallium/drivers/freedreno/a5xx/fd5_format.c index 847c379..c4f12a3 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_format.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_format.c @@ -29,48 +29,41 @@ #include "fd5_format.h" - /* Specifies the table of all the formats and their features. Also supplies * the helpers that look up various data in those tables. 
*/ struct fd5_format { - enum a5xx_vtx_fmt vtx; - enum a5xx_tex_fmt tex; - enum a5xx_color_fmt rb; - enum a3xx_color_swap swap; - boolean present; + enum a5xx_vtx_fmt vtx; + enum a5xx_tex_fmt tex; + enum a5xx_color_fmt rb; + enum a3xx_color_swap swap; + boolean present; }; /* vertex + texture */ -#define VT(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT5_ ## fmt, \ - .tex = TFMT5_ ## fmt, \ - .rb = RB5_ ## rbfmt, \ - .swap = swapfmt \ - } +#define VT(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT5_##fmt, \ + .tex = TFMT5_##fmt, \ + .rb = RB5_##rbfmt, \ + .swap = swapfmt} /* texture-only */ -#define _T(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT5_NONE, \ - .tex = TFMT5_ ## fmt, \ - .rb = RB5_ ## rbfmt, \ - .swap = swapfmt \ - } +#define _T(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT5_NONE, \ + .tex = TFMT5_##fmt, \ + .rb = RB5_##rbfmt, \ + .swap = swapfmt} /* vertex-only */ -#define V_(pipe, fmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = VFMT5_ ## fmt, \ - .tex = TFMT5_NONE, \ - .rb = RB5_ ## rbfmt, \ - .swap = swapfmt \ - } +#define V_(pipe, fmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = VFMT5_##fmt, \ + .tex = TFMT5_NONE, \ + .rb = RB5_##rbfmt, \ + .swap = swapfmt} /* clang-format off */ static struct fd5_format formats[PIPE_FORMAT_COUNT] = { @@ -343,84 +336,94 @@ static struct fd5_format formats[PIPE_FORMAT_COUNT] = { enum a5xx_vtx_fmt fd5_pipe2vtx(enum pipe_format format) { - if (!formats[format].present) - return VFMT5_NONE; - return formats[format].vtx; + if (!formats[format].present) + return VFMT5_NONE; + return formats[format].vtx; } /* convert pipe format to texture sampler format: */ enum a5xx_tex_fmt fd5_pipe2tex(enum pipe_format format) { - if (!formats[format].present) - return TFMT5_NONE; - return formats[format].tex; + if (!formats[format].present) + return TFMT5_NONE; + return formats[format].tex; } /* convert pipe format to MRT / copydest format used for render-target: */ enum a5xx_color_fmt fd5_pipe2color(enum pipe_format format) { - if (!formats[format].present) - return RB5_NONE; - return formats[format].rb; + if (!formats[format].present) + return RB5_NONE; + return formats[format].rb; } enum a3xx_color_swap fd5_pipe2swap(enum pipe_format format) { - if (!formats[format].present) - return WZYX; - return formats[format].swap; + if (!formats[format].present) + return WZYX; + return formats[format].swap; } enum a5xx_depth_format fd5_pipe2depth(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return DEPTH5_16; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return DEPTH5_24_8; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return DEPTH5_32; - default: - return ~0; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return DEPTH5_16; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return DEPTH5_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTH5_32; + default: + return ~0; + } } static inline enum a5xx_tex_swiz tex_swiz(unsigned swiz) { - switch (swiz) { - default: - case PIPE_SWIZZLE_X: return A5XX_TEX_X; - case PIPE_SWIZZLE_Y: return A5XX_TEX_Y; - case 
PIPE_SWIZZLE_Z: return A5XX_TEX_Z; - case PIPE_SWIZZLE_W: return A5XX_TEX_W; - case PIPE_SWIZZLE_0: return A5XX_TEX_ZERO; - case PIPE_SWIZZLE_1: return A5XX_TEX_ONE; - } + switch (swiz) { + default: + case PIPE_SWIZZLE_X: + return A5XX_TEX_X; + case PIPE_SWIZZLE_Y: + return A5XX_TEX_Y; + case PIPE_SWIZZLE_Z: + return A5XX_TEX_Z; + case PIPE_SWIZZLE_W: + return A5XX_TEX_W; + case PIPE_SWIZZLE_0: + return A5XX_TEX_ZERO; + case PIPE_SWIZZLE_1: + return A5XX_TEX_ONE; + } } uint32_t fd5_tex_swiz(enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a) + unsigned swizzle_b, unsigned swizzle_a) { - const struct util_format_description *desc = - util_format_description(format); - unsigned char swiz[4] = { - swizzle_r, swizzle_g, swizzle_b, swizzle_a, - }, rswiz[4]; - - util_format_compose_swizzles(desc->swizzle, swiz, rswiz); - - return A5XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) | - A5XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) | - A5XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) | - A5XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3])); + const struct util_format_description *desc = util_format_description(format); + unsigned char swiz[4] = + { + swizzle_r, + swizzle_g, + swizzle_b, + swizzle_a, + }, + rswiz[4]; + + util_format_compose_swizzles(desc->swizzle, swiz, rswiz); + + return A5XX_TEX_CONST_0_SWIZ_X(tex_swiz(rswiz[0])) | + A5XX_TEX_CONST_0_SWIZ_Y(tex_swiz(rswiz[1])) | + A5XX_TEX_CONST_0_SWIZ_Z(tex_swiz(rswiz[2])) | + A5XX_TEX_CONST_0_SWIZ_W(tex_swiz(rswiz[3])); } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_format.h b/src/gallium/drivers/freedreno/a5xx/fd5_format.h index f662455..2f5e69b 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_format.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_format.h @@ -38,6 +38,7 @@ enum a3xx_color_swap fd5_pipe2swap(enum pipe_format format); enum a5xx_depth_format fd5_pipe2depth(enum pipe_format format); uint32_t fd5_tex_swiz(enum pipe_format format, unsigned swizzle_r, - unsigned swizzle_g, unsigned swizzle_b, unsigned swizzle_a); + unsigned swizzle_g, unsigned swizzle_b, + unsigned swizzle_a); #endif /* FD5_UTIL_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c index c3c33df..7aa5e97 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c @@ -25,795 +25,786 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_draw.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd5_gmem.h" #include "fd5_context.h" #include "fd5_draw.h" #include "fd5_emit.h" -#include "fd5_program.h" #include "fd5_format.h" +#include "fd5_gmem.h" +#include "fd5_program.h" #include "fd5_zsa.h" static void emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, - struct pipe_surface **bufs, const struct fd_gmem_stateobj *gmem) + struct pipe_surface **bufs, const struct fd_gmem_stateobj *gmem) { - enum a5xx_tile_mode tile_mode; - unsigned i; - - for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) { - enum a5xx_color_fmt format = 0; - enum a3xx_color_swap swap = WZYX; - bool srgb = false, sint = false, uint = false; - struct fd_resource *rsc = NULL; - struct fdl_slice *slice = NULL; - uint32_t stride = 0; - uint32_t size = 0; - uint32_t base = 0; - uint32_t offset = 0; - - if (gmem) 
{ - tile_mode = TILE5_2; - } else { - tile_mode = TILE5_LINEAR; - } - - if ((i < nr_bufs) && bufs[i]) { - struct pipe_surface *psurf = bufs[i]; - enum pipe_format pformat = psurf->format; - - rsc = fd_resource(psurf->texture); - - slice = fd_resource_slice(rsc, psurf->u.tex.level); - format = fd5_pipe2color(pformat); - swap = fd5_pipe2swap(pformat); - srgb = util_format_is_srgb(pformat); - sint = util_format_is_pure_sint(pformat); - uint = util_format_is_pure_uint(pformat); - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - - if (gmem) { - stride = gmem->bin_w * gmem->cbuf_cpp[i]; - size = stride * gmem->bin_h; - base = gmem->cbuf_base[i]; - } else { - stride = fd_resource_pitch(rsc, psurf->u.tex.level); - size = slice->size0; - - tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); - } - } - - OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(i), 5); - OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | - A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | - A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | - COND(gmem, 0x800) | /* XXX 0x1000 for RECTLIST clear, 0x0 for BLIT.. */ - COND(srgb, A5XX_RB_MRT_BUF_INFO_COLOR_SRGB)); - OUT_RING(ring, A5XX_RB_MRT_PITCH(stride)); - OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(size)); - if (gmem || (i >= nr_bufs) || !bufs[i]) { - OUT_RING(ring, base); /* RB_MRT[i].BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_MRT[i].BASE_HI */ - } else { - debug_assert((offset + size) <= fd_bo_size(rsc->bo)); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* BASE_LO/HI */ - } - - OUT_PKT4(ring, REG_A5XX_SP_FS_MRT_REG(i), 1); - OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format) | - COND(sint, A5XX_SP_FS_MRT_REG_COLOR_SINT) | - COND(uint, A5XX_SP_FS_MRT_REG_COLOR_UINT) | - COND(srgb, A5XX_SP_FS_MRT_REG_COLOR_SRGB)); - - /* when we support UBWC, these would be the system memory - * addr/pitch/etc: - */ - OUT_PKT4(ring, REG_A5XX_RB_MRT_FLAG_BUFFER(i), 4); - OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ - OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ - OUT_RING(ring, A5XX_RB_MRT_FLAG_BUFFER_PITCH(0)); - OUT_RING(ring, A5XX_RB_MRT_FLAG_BUFFER_ARRAY_PITCH(0)); - } + enum a5xx_tile_mode tile_mode; + unsigned i; + + for (i = 0; i < A5XX_MAX_RENDER_TARGETS; i++) { + enum a5xx_color_fmt format = 0; + enum a3xx_color_swap swap = WZYX; + bool srgb = false, sint = false, uint = false; + struct fd_resource *rsc = NULL; + struct fdl_slice *slice = NULL; + uint32_t stride = 0; + uint32_t size = 0; + uint32_t base = 0; + uint32_t offset = 0; + + if (gmem) { + tile_mode = TILE5_2; + } else { + tile_mode = TILE5_LINEAR; + } + + if ((i < nr_bufs) && bufs[i]) { + struct pipe_surface *psurf = bufs[i]; + enum pipe_format pformat = psurf->format; + + rsc = fd_resource(psurf->texture); + + slice = fd_resource_slice(rsc, psurf->u.tex.level); + format = fd5_pipe2color(pformat); + swap = fd5_pipe2swap(pformat); + srgb = util_format_is_srgb(pformat); + sint = util_format_is_pure_sint(pformat); + uint = util_format_is_pure_uint(pformat); + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + offset = fd_resource_offset(rsc, psurf->u.tex.level, + psurf->u.tex.first_layer); + + if (gmem) { + stride = gmem->bin_w * gmem->cbuf_cpp[i]; + size = stride * gmem->bin_h; + base = gmem->cbuf_base[i]; + } else { + stride = fd_resource_pitch(rsc, psurf->u.tex.level); + size = slice->size0; + + tile_mode = + fd_resource_tile_mode(psurf->texture, 
psurf->u.tex.level); + } + } + + OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(i), 5); + OUT_RING( + ring, + A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | + A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(tile_mode) | + A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(swap) | + COND(gmem, + 0x800) | /* XXX 0x1000 for RECTLIST clear, 0x0 for BLIT.. */ + COND(srgb, A5XX_RB_MRT_BUF_INFO_COLOR_SRGB)); + OUT_RING(ring, A5XX_RB_MRT_PITCH(stride)); + OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(size)); + if (gmem || (i >= nr_bufs) || !bufs[i]) { + OUT_RING(ring, base); /* RB_MRT[i].BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_MRT[i].BASE_HI */ + } else { + debug_assert((offset + size) <= fd_bo_size(rsc->bo)); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* BASE_LO/HI */ + } + + OUT_PKT4(ring, REG_A5XX_SP_FS_MRT_REG(i), 1); + OUT_RING(ring, A5XX_SP_FS_MRT_REG_COLOR_FORMAT(format) | + COND(sint, A5XX_SP_FS_MRT_REG_COLOR_SINT) | + COND(uint, A5XX_SP_FS_MRT_REG_COLOR_UINT) | + COND(srgb, A5XX_SP_FS_MRT_REG_COLOR_SRGB)); + + /* when we support UBWC, these would be the system memory + * addr/pitch/etc: + */ + OUT_PKT4(ring, REG_A5XX_RB_MRT_FLAG_BUFFER(i), 4); + OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ + OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ + OUT_RING(ring, A5XX_RB_MRT_FLAG_BUFFER_PITCH(0)); + OUT_RING(ring, A5XX_RB_MRT_FLAG_BUFFER_ARRAY_PITCH(0)); + } } static void emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf, - const struct fd_gmem_stateobj *gmem) + const struct fd_gmem_stateobj *gmem) { - if (zsbuf) { - struct fd_resource *rsc = fd_resource(zsbuf->texture); - enum a5xx_depth_format fmt = fd5_pipe2depth(zsbuf->format); - uint32_t cpp = rsc->layout.cpp; - uint32_t stride = 0; - uint32_t size = 0; - - if (gmem) { - stride = cpp * gmem->bin_w; - size = stride * gmem->bin_h; - } else { - stride = fd_resource_pitch(rsc, 0); - size = fd_resource_slice(rsc, 0)->size0; - } - - OUT_PKT4(ring, REG_A5XX_RB_DEPTH_BUFFER_INFO, 5); - OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(fmt)); - if (gmem) { - OUT_RING(ring, gmem->zsbuf_base[0]); /* RB_DEPTH_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ - } else { - OUT_RELOC(ring, rsc->bo, 0, 0, 0); /* RB_DEPTH_BUFFER_BASE_LO/HI */ - } - OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_PITCH(stride)); - OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_ARRAY_PITCH(size)); - - OUT_PKT4(ring, REG_A5XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); - OUT_RING(ring, A5XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(fmt)); - - OUT_PKT4(ring, REG_A5XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3); - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_PITCH */ - - if (rsc->lrz) { - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3); - OUT_RELOC(ring, rsc->lrz, 0x1000, 0, 0); - OUT_RING(ring, A5XX_GRAS_LRZ_BUFFER_PITCH(rsc->lrz_pitch)); - - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2); - OUT_RELOC(ring, rsc->lrz, 0, 0, 0); - } else { - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ - - OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - - if (rsc->stencil) { - if (gmem) { - stride = 1 * gmem->bin_w; - size = stride * gmem->bin_h; - } else { - stride = fd_resource_pitch(rsc->stencil, 0); - size = fd_resource_slice(rsc->stencil, 
0)->size0; - } - - OUT_PKT4(ring, REG_A5XX_RB_STENCIL_INFO, 5); - OUT_RING(ring, A5XX_RB_STENCIL_INFO_SEPARATE_STENCIL); - if (gmem) { - OUT_RING(ring, gmem->zsbuf_base[1]); /* RB_STENCIL_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_STENCIL_BASE_HI */ - } else { - OUT_RELOC(ring, rsc->stencil->bo, 0, 0, 0); /* RB_STENCIL_BASE_LO/HI */ - } - OUT_RING(ring, A5XX_RB_STENCIL_PITCH(stride)); - OUT_RING(ring, A5XX_RB_STENCIL_ARRAY_PITCH(size)); - } else { - OUT_PKT4(ring, REG_A5XX_RB_STENCIL_INFO, 1); - OUT_RING(ring, 0x00000000); /* RB_STENCIL_INFO */ - } - } else { - OUT_PKT4(ring, REG_A5XX_RB_DEPTH_BUFFER_INFO, 5); - OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH5_NONE)); - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ - - OUT_PKT4(ring, REG_A5XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); - OUT_RING(ring, A5XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH5_NONE)); - - OUT_PKT4(ring, REG_A5XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3); - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_PITCH */ - - OUT_PKT4(ring, REG_A5XX_RB_STENCIL_INFO, 1); - OUT_RING(ring, 0x00000000); /* RB_STENCIL_INFO */ - } + if (zsbuf) { + struct fd_resource *rsc = fd_resource(zsbuf->texture); + enum a5xx_depth_format fmt = fd5_pipe2depth(zsbuf->format); + uint32_t cpp = rsc->layout.cpp; + uint32_t stride = 0; + uint32_t size = 0; + + if (gmem) { + stride = cpp * gmem->bin_w; + size = stride * gmem->bin_h; + } else { + stride = fd_resource_pitch(rsc, 0); + size = fd_resource_slice(rsc, 0)->size0; + } + + OUT_PKT4(ring, REG_A5XX_RB_DEPTH_BUFFER_INFO, 5); + OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(fmt)); + if (gmem) { + OUT_RING(ring, gmem->zsbuf_base[0]); /* RB_DEPTH_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ + } else { + OUT_RELOC(ring, rsc->bo, 0, 0, 0); /* RB_DEPTH_BUFFER_BASE_LO/HI */ + } + OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_PITCH(stride)); + OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_ARRAY_PITCH(size)); + + OUT_PKT4(ring, REG_A5XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); + OUT_RING(ring, A5XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(fmt)); + + OUT_PKT4(ring, REG_A5XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3); + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_PITCH */ + + if (rsc->lrz) { + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3); + OUT_RELOC(ring, rsc->lrz, 0x1000, 0, 0); + OUT_RING(ring, A5XX_GRAS_LRZ_BUFFER_PITCH(rsc->lrz_pitch)); + + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2); + OUT_RELOC(ring, rsc->lrz, 0, 0, 0); + } else { + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_BUFFER_BASE_LO, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ + + OUT_PKT4(ring, REG_A5XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO, 2); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + + if (rsc->stencil) { + if (gmem) { + stride = 1 * gmem->bin_w; + size = stride * gmem->bin_h; + } else { + stride = fd_resource_pitch(rsc->stencil, 0); + size = fd_resource_slice(rsc->stencil, 0)->size0; + } + + OUT_PKT4(ring, REG_A5XX_RB_STENCIL_INFO, 5); + OUT_RING(ring, 
A5XX_RB_STENCIL_INFO_SEPARATE_STENCIL); + if (gmem) { + OUT_RING(ring, gmem->zsbuf_base[1]); /* RB_STENCIL_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_STENCIL_BASE_HI */ + } else { + OUT_RELOC(ring, rsc->stencil->bo, 0, 0, + 0); /* RB_STENCIL_BASE_LO/HI */ + } + OUT_RING(ring, A5XX_RB_STENCIL_PITCH(stride)); + OUT_RING(ring, A5XX_RB_STENCIL_ARRAY_PITCH(size)); + } else { + OUT_PKT4(ring, REG_A5XX_RB_STENCIL_INFO, 1); + OUT_RING(ring, 0x00000000); /* RB_STENCIL_INFO */ + } + } else { + OUT_PKT4(ring, REG_A5XX_RB_DEPTH_BUFFER_INFO, 5); + OUT_RING(ring, A5XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH5_NONE)); + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ + + OUT_PKT4(ring, REG_A5XX_GRAS_SU_DEPTH_BUFFER_INFO, 1); + OUT_RING(ring, A5XX_GRAS_SU_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH5_NONE)); + + OUT_PKT4(ring, REG_A5XX_RB_DEPTH_FLAG_BUFFER_BASE_LO, 3); + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_PITCH */ + + OUT_PKT4(ring, REG_A5XX_RB_STENCIL_INFO, 1); + OUT_RING(ring, 0x00000000); /* RB_STENCIL_INFO */ + } } static bool use_hw_binning(struct fd_batch *batch) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; - if ((gmem->maxpw * gmem->maxph) > 32) - return false; + if ((gmem->maxpw * gmem->maxph) > 32) + return false; - if ((gmem->maxpw > 15) || (gmem->maxph > 15)) - return false; + if ((gmem->maxpw > 15) || (gmem->maxph > 15)) + return false; - return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2) && - (batch->num_draws > 0); + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2) && + (batch->num_draws > 0); } static void patch_draws(struct fd_batch *batch, enum pc_di_vis_cull_mode vismode) { - unsigned i; - for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); - *patch->cs = patch->val | DRAW4(0, 0, 0, vismode); - } - util_dynarray_clear(&batch->draw_patches); + unsigned i; + for (i = 0; i < fd_patch_num_elements(&batch->draw_patches); i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->draw_patches, i); + *patch->cs = patch->val | DRAW4(0, 0, 0, vismode); + } + util_dynarray_clear(&batch->draw_patches); } static void -update_vsc_pipe(struct fd_batch *batch) - assert_dt +update_vsc_pipe(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd5_context *fd5_ctx = fd5_context(ctx); - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - int i; - - OUT_PKT4(ring, REG_A5XX_VSC_BIN_SIZE, 3); - OUT_RING(ring, A5XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | - A5XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); - OUT_RELOC(ring, fd5_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */ - - OUT_PKT4(ring, REG_A5XX_UNKNOWN_0BC5, 2); - OUT_RING(ring, 0x00000000); /* UNKNOWN_0BC5 */ - OUT_RING(ring, 0x00000000); /* UNKNOWN_0BC6 */ - - OUT_PKT4(ring, REG_A5XX_VSC_PIPE_CONFIG_REG(0), 16); - for (i = 0; i < 16; i++) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - OUT_RING(ring, A5XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | - A5XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | - A5XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | - 
A5XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); - } - - OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_ADDRESS_LO(0), 32); - for (i = 0; i < 16; i++) { - if (!ctx->vsc_pipe_bo[i]) { - ctx->vsc_pipe_bo[i] = fd_bo_new(ctx->dev, 0x20000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); - } - OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0, 0); /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */ - } - - OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_LENGTH_REG(0), 16); - for (i = 0; i < 16; i++) { - OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - 32); /* VSC_PIPE_DATA_LENGTH[i] */ - } + struct fd_context *ctx = batch->ctx; + struct fd5_context *fd5_ctx = fd5_context(ctx); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + int i; + + OUT_PKT4(ring, REG_A5XX_VSC_BIN_SIZE, 3); + OUT_RING(ring, A5XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | + A5XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + OUT_RELOC(ring, fd5_ctx->vsc_size_mem, 0, 0, 0); /* VSC_SIZE_ADDRESS_LO/HI */ + + OUT_PKT4(ring, REG_A5XX_UNKNOWN_0BC5, 2); + OUT_RING(ring, 0x00000000); /* UNKNOWN_0BC5 */ + OUT_RING(ring, 0x00000000); /* UNKNOWN_0BC6 */ + + OUT_PKT4(ring, REG_A5XX_VSC_PIPE_CONFIG_REG(0), 16); + for (i = 0; i < 16; i++) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + OUT_RING(ring, A5XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | + A5XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | + A5XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | + A5XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); + } + + OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_ADDRESS_LO(0), 32); + for (i = 0; i < 16; i++) { + if (!ctx->vsc_pipe_bo[i]) { + ctx->vsc_pipe_bo[i] = fd_bo_new( + ctx->dev, 0x20000, DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_pipe[%u]", i); + } + OUT_RELOC(ring, ctx->vsc_pipe_bo[i], 0, 0, + 0); /* VSC_PIPE_DATA_ADDRESS[i].LO/HI */ + } + + OUT_PKT4(ring, REG_A5XX_VSC_PIPE_DATA_LENGTH_REG(0), 16); + for (i = 0; i < 16; i++) { + OUT_RING(ring, fd_bo_size(ctx->vsc_pipe_bo[i]) - + 32); /* VSC_PIPE_DATA_LENGTH[i] */ + } } static void -emit_binning_pass(struct fd_batch *batch) - assert_dt +emit_binning_pass(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; - uint32_t x1 = gmem->minx; - uint32_t y1 = gmem->miny; - uint32_t x2 = gmem->minx + gmem->width - 1; - uint32_t y2 = gmem->miny + gmem->height - 1; + uint32_t x1 = gmem->minx; + uint32_t y1 = gmem->miny; + uint32_t x2 = gmem->minx + gmem->width - 1; + uint32_t y2 = gmem->miny + gmem->height - 1; - fd5_set_render_mode(batch->ctx, ring, BINNING); + fd5_set_render_mode(batch->ctx, ring, BINNING); - OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); - OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) | - A5XX_RB_CNTL_HEIGHT(gmem->bin_h)); + OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); + OUT_RING(ring, + A5XX_RB_CNTL_WIDTH(gmem->bin_w) | A5XX_RB_CNTL_HEIGHT(gmem->bin_h)); - OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | - A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | - A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); + OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | + A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | + A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); - OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(x1) | - A5XX_RB_RESOLVE_CNTL_1_Y(y1)); - 
OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) | - A5XX_RB_RESOLVE_CNTL_2_Y(y2)); + OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(x1) | A5XX_RB_RESOLVE_CNTL_1_Y(y1)); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) | A5XX_RB_RESOLVE_CNTL_2_Y(y2)); - update_vsc_pipe(batch); + update_vsc_pipe(batch); - OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1); - OUT_RING(ring, A5XX_VPC_MODE_CNTL_BINNING_PASS); + OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1); + OUT_RING(ring, A5XX_VPC_MODE_CNTL_BINNING_PASS); - fd5_event_write(batch, ring, UNK_2C, false); + fd5_event_write(batch, ring, UNK_2C, false); - OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(0) | - A5XX_RB_WINDOW_OFFSET_Y(0)); + OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(0) | A5XX_RB_WINDOW_OFFSET_Y(0)); - /* emit IB to binning drawcmds: */ - fd5_emit_ib(ring, batch->binning); + /* emit IB to binning drawcmds: */ + fd5_emit_ib(ring, batch->binning); - fd_reset_wfi(batch); + fd_reset_wfi(batch); - fd5_event_write(batch, ring, UNK_2D, false); + fd5_event_write(batch, ring, UNK_2D, false); - fd5_event_write(batch, ring, CACHE_FLUSH_TS, true); + fd5_event_write(batch, ring, CACHE_FLUSH_TS, true); - // TODO CP_COND_WRITE's for all the vsc buffers (check for overflow??) + // TODO CP_COND_WRITE's for all the vsc buffers (check for overflow??) - fd_wfi(batch, ring); + fd_wfi(batch, ring); - OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1); - OUT_RING(ring, 0x0); + OUT_PKT4(ring, REG_A5XX_VPC_MODE_CNTL, 1); + OUT_RING(ring, 0x0); } /* before first tile */ static void -fd5_emit_tile_init(struct fd_batch *batch) - assert_dt +fd5_emit_tile_init(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - fd5_emit_restore(batch, ring); + fd5_emit_restore(batch, ring); - if (batch->prologue) - fd5_emit_ib(ring, batch->prologue); + if (batch->prologue) + fd5_emit_ib(ring, batch->prologue); - fd5_emit_lrz_flush(batch, ring); + fd5_emit_lrz_flush(batch, ring); - OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); - OUT_RING(ring, 0x00000080); /* GRAS_CL_CNTL */ + OUT_PKT4(ring, REG_A5XX_GRAS_CL_CNTL, 1); + OUT_RING(ring, 0x00000080); /* GRAS_CL_CNTL */ - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1); - OUT_RING(ring, 0x00000003); /* PC_POWER_CNTL */ + OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1); + OUT_RING(ring, 0x00000003); /* PC_POWER_CNTL */ - OUT_PKT4(ring, REG_A5XX_VFD_POWER_CNTL, 1); - OUT_RING(ring, 0x00000003); /* VFD_POWER_CNTL */ + OUT_PKT4(ring, REG_A5XX_VFD_POWER_CNTL, 1); + OUT_RING(ring, 0x00000003); /* VFD_POWER_CNTL */ - /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ - fd_wfi(batch, ring); - OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x7c13c080); /* RB_CCU_CNTL */ + /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ + fd_wfi(batch, ring); + OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); + OUT_RING(ring, 0x7c13c080); /* RB_CCU_CNTL */ - emit_zs(ring, pfb->zsbuf, batch->gmem_state); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, batch->gmem_state); + emit_zs(ring, pfb->zsbuf, batch->gmem_state); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, batch->gmem_state); - /* Enable stream output for the first pass (likely the binning). 
*/ - OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, 0); + /* Enable stream output for the first pass (likely the binning). */ + OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, 0); - if (use_hw_binning(batch)) { - emit_binning_pass(batch); + if (use_hw_binning(batch)) { + emit_binning_pass(batch); - /* Disable stream output after binning, since each VS output should get - * streamed out once. - */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, A5XX_VPC_SO_OVERRIDE_SO_DISABLE); + /* Disable stream output after binning, since each VS output should get + * streamed out once. + */ + OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, A5XX_VPC_SO_OVERRIDE_SO_DISABLE); - fd5_emit_lrz_flush(batch, ring); - patch_draws(batch, USE_VISIBILITY); - } else { - patch_draws(batch, IGNORE_VISIBILITY); - } + fd5_emit_lrz_flush(batch, ring); + patch_draws(batch, USE_VISIBILITY); + } else { + patch_draws(batch, IGNORE_VISIBILITY); + } - fd5_set_render_mode(batch->ctx, ring, GMEM); + fd5_set_render_mode(batch->ctx, ring, GMEM); - /* XXX If we're in gmem mode but not doing HW binning, then after the first - * tile we should disable stream output (fd6_gmem.c doesn't do that either). - */ + /* XXX If we're in gmem mode but not doing HW binning, then after the first + * tile we should disable stream output (fd6_gmem.c doesn't do that either). + */ } /* before mem2gmem */ static void -fd5_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) - assert_dt +fd5_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) assert_dt { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd5_context *fd5_ctx = fd5_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; - - uint32_t x1 = tile->xoff; - uint32_t y1 = tile->yoff; - uint32_t x2 = tile->xoff + tile->bin_w - 1; - uint32_t y2 = tile->yoff + tile->bin_h - 1; - - OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | - A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); - OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | - A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); - - OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(x1) | - A5XX_RB_RESOLVE_CNTL_1_Y(y1)); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) | - A5XX_RB_RESOLVE_CNTL_2_Y(y2)); - - if (use_hw_binning(batch)) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; - struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; - - OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); - - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x0); - - OUT_PKT7(ring, CP_SET_BIN_DATA5, 5); - OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) | - CP_SET_BIN_DATA5_0_VSC_N(tile->n)); - OUT_RELOC(ring, pipe_bo, 0, 0, 0); /* VSC_PIPE[p].DATA_ADDRESS */ - OUT_RELOC(ring, fd5_ctx->vsc_size_mem, /* VSC_SIZE_ADDRESS + (p * 4) */ - (tile->p * 4), 0, 0); - } else { - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); - } - - OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(x1) | - A5XX_RB_WINDOW_OFFSET_Y(y1)); + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd5_context *fd5_ctx = fd5_context(ctx); + struct fd_ringbuffer *ring = batch->gmem; + + uint32_t x1 = tile->xoff; + uint32_t y1 = tile->yoff; + uint32_t x2 = tile->xoff + tile->bin_w - 1; + uint32_t y2 = tile->yoff + tile->bin_h - 1; + + 
OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(x1) | + A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(x2) | + A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(y2)); + + OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(x1) | A5XX_RB_RESOLVE_CNTL_1_Y(y1)); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(x2) | A5XX_RB_RESOLVE_CNTL_2_Y(y2)); + + if (use_hw_binning(batch)) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + struct fd_bo *pipe_bo = ctx->vsc_pipe_bo[tile->p]; + + OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); + + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x0); + + OUT_PKT7(ring, CP_SET_BIN_DATA5, 5); + OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) | + CP_SET_BIN_DATA5_0_VSC_N(tile->n)); + OUT_RELOC(ring, pipe_bo, 0, 0, 0); /* VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOC(ring, fd5_ctx->vsc_size_mem, /* VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x1); + } + + OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(x1) | A5XX_RB_WINDOW_OFFSET_Y(y1)); } - /* * transfer from system memory to gmem */ static void emit_mem2gmem_surf(struct fd_batch *batch, uint32_t base, - struct pipe_surface *psurf, enum a5xx_blit_buf buf) + struct pipe_surface *psurf, enum a5xx_blit_buf buf) { - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_resource *rsc = fd_resource(psurf->texture); - uint32_t stride, size; - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - if (buf == BLIT_S) - rsc = rsc->stencil; - - if ((buf == BLIT_ZS) || (buf == BLIT_S)) { - // XXX hack import via BLIT_MRT0 instead of BLIT_ZS, since I don't - // know otherwise how to go from linear in sysmem to tiled in gmem. - // possibly we want to flip this around gmem2mem and keep depth - // tiled in sysmem (and fixup sampler state to assume tiled).. this - // might be required for doing depth/stencil in bypass mode? 
- struct fdl_slice *slice = fd_resource_slice(rsc, 0); - enum a5xx_color_fmt format = - fd5_pipe2color(fd_gmem_restore_format(rsc->b.b.format)); - - OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5); - OUT_RING(ring, A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | - A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(rsc->layout.tile_mode) | - A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX)); - OUT_RING(ring, A5XX_RB_MRT_PITCH(fd_resource_pitch(rsc, 0))); - OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(slice->size0)); - OUT_RELOC(ring, rsc->bo, 0, 0, 0); /* BASE_LO/HI */ - - buf = BLIT_MRT0; - } - - stride = gmem->bin_w << fdl_cpp_shift(&rsc->layout); - size = stride * gmem->bin_h; - - OUT_PKT4(ring, REG_A5XX_RB_BLIT_FLAG_DST_LO, 4); - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_LO */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_HI */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_PITCH */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_ARRAY_PITCH */ - - OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_3, 5); - OUT_RING(ring, 0x00000000); /* RB_RESOLVE_CNTL_3 */ - OUT_RING(ring, base); /* RB_BLIT_DST_LO */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_DST_HI */ - OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(stride)); - OUT_RING(ring, A5XX_RB_BLIT_DST_ARRAY_PITCH(size)); - - OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); - OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(buf)); - - fd5_emit_blit(batch, ring); + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_resource *rsc = fd_resource(psurf->texture); + uint32_t stride, size; + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + if (buf == BLIT_S) + rsc = rsc->stencil; + + if ((buf == BLIT_ZS) || (buf == BLIT_S)) { + // XXX hack import via BLIT_MRT0 instead of BLIT_ZS, since I don't + // know otherwise how to go from linear in sysmem to tiled in gmem. + // possibly we want to flip this around gmem2mem and keep depth + // tiled in sysmem (and fixup sampler state to assume tiled).. this + // might be required for doing depth/stencil in bypass mode? 
+ struct fdl_slice *slice = fd_resource_slice(rsc, 0); + enum a5xx_color_fmt format = + fd5_pipe2color(fd_gmem_restore_format(rsc->b.b.format)); + + OUT_PKT4(ring, REG_A5XX_RB_MRT_BUF_INFO(0), 5); + OUT_RING(ring, + A5XX_RB_MRT_BUF_INFO_COLOR_FORMAT(format) | + A5XX_RB_MRT_BUF_INFO_COLOR_TILE_MODE(rsc->layout.tile_mode) | + A5XX_RB_MRT_BUF_INFO_COLOR_SWAP(WZYX)); + OUT_RING(ring, A5XX_RB_MRT_PITCH(fd_resource_pitch(rsc, 0))); + OUT_RING(ring, A5XX_RB_MRT_ARRAY_PITCH(slice->size0)); + OUT_RELOC(ring, rsc->bo, 0, 0, 0); /* BASE_LO/HI */ + + buf = BLIT_MRT0; + } + + stride = gmem->bin_w << fdl_cpp_shift(&rsc->layout); + size = stride * gmem->bin_h; + + OUT_PKT4(ring, REG_A5XX_RB_BLIT_FLAG_DST_LO, 4); + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_LO */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_HI */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_PITCH */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_ARRAY_PITCH */ + + OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_3, 5); + OUT_RING(ring, 0x00000000); /* RB_RESOLVE_CNTL_3 */ + OUT_RING(ring, base); /* RB_BLIT_DST_LO */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_DST_HI */ + OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(stride)); + OUT_RING(ring, A5XX_RB_BLIT_DST_ARRAY_PITCH(size)); + + OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); + OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(buf)); + + fd5_emit_blit(batch, ring); } static void fd5_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - /* - * setup mrt and zs with system memory base addresses: - */ - - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL); -// emit_zs(ring, pfb->zsbuf, NULL); - - OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); - OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) | - A5XX_RB_CNTL_HEIGHT(gmem->bin_h) | - A5XX_RB_CNTL_BYPASS); - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { - unsigned i; - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i))) - continue; - emit_mem2gmem_surf(batch, gmem->cbuf_base[i], - pfb->cbufs[i], BLIT_MRT0 + i); - } - } - - if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - - if (!rsc->stencil || fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH)) - emit_mem2gmem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf, BLIT_ZS); - if (rsc->stencil && fd_gmem_needs_restore(batch, tile, FD_BUFFER_STENCIL)) - emit_mem2gmem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_S); - } + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + /* + * setup mrt and zs with system memory base addresses: + */ + + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL); + // emit_zs(ring, pfb->zsbuf, NULL); + + OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); + OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) | + A5XX_RB_CNTL_HEIGHT(gmem->bin_h) | A5XX_RB_CNTL_BYPASS); + + if (fd_gmem_needs_restore(batch, tile, FD_BUFFER_COLOR)) { + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_mem2gmem_surf(batch, gmem->cbuf_base[i], pfb->cbufs[i], + BLIT_MRT0 + i); + } + } + + if (fd_gmem_needs_restore(batch, tile, + FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + 
struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + + if (!rsc->stencil || fd_gmem_needs_restore(batch, tile, FD_BUFFER_DEPTH)) + emit_mem2gmem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf, BLIT_ZS); + if (rsc->stencil && fd_gmem_needs_restore(batch, tile, FD_BUFFER_STENCIL)) + emit_mem2gmem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_S); + } } - /* before IB to rendering cmds: */ static void fd5_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); - OUT_RING(ring, A5XX_RB_CNTL_WIDTH(gmem->bin_w) | - A5XX_RB_CNTL_HEIGHT(gmem->bin_h)); - - emit_zs(ring, pfb->zsbuf, gmem); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem); - - enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples); - - OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE)); - - OUT_PKT4(ring, REG_A5XX_RB_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE)); - - - OUT_PKT4(ring, REG_A5XX_GRAS_SC_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE)); + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); + OUT_RING(ring, + A5XX_RB_CNTL_WIDTH(gmem->bin_w) | A5XX_RB_CNTL_HEIGHT(gmem->bin_h)); + + emit_zs(ring, pfb->zsbuf, gmem); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem); + + enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples); + + OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(samples)); + OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(samples) | + COND(samples == MSAA_ONE, + A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE)); + + OUT_PKT4(ring, REG_A5XX_RB_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); + OUT_RING(ring, + A5XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | + COND(samples == MSAA_ONE, A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE)); + + OUT_PKT4(ring, REG_A5XX_GRAS_SC_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(samples)); + OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(samples) | + COND(samples == MSAA_ONE, + A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE)); } - /* * transfer from gmem to system memory (ie. 
normal RAM) */ static void emit_gmem2mem_surf(struct fd_batch *batch, uint32_t base, - struct pipe_surface *psurf, enum a5xx_blit_buf buf) + struct pipe_surface *psurf, enum a5xx_blit_buf buf) { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_resource *rsc = fd_resource(psurf->texture); - struct fdl_slice *slice; - bool tiled; - uint32_t offset, pitch; + struct fd_ringbuffer *ring = batch->gmem; + struct fd_resource *rsc = fd_resource(psurf->texture); + struct fdl_slice *slice; + bool tiled; + uint32_t offset, pitch; - if (!rsc->valid) - return; + if (!rsc->valid) + return; - if (buf == BLIT_S) - rsc = rsc->stencil; + if (buf == BLIT_S) + rsc = rsc->stencil; - slice = fd_resource_slice(rsc, psurf->u.tex.level); - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - pitch = fd_resource_pitch(rsc, psurf->u.tex.level); + slice = fd_resource_slice(rsc, psurf->u.tex.level); + offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + pitch = fd_resource_pitch(rsc, psurf->u.tex.level); - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - OUT_PKT4(ring, REG_A5XX_RB_BLIT_FLAG_DST_LO, 4); - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_LO */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_HI */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_PITCH */ - OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_ARRAY_PITCH */ + OUT_PKT4(ring, REG_A5XX_RB_BLIT_FLAG_DST_LO, 4); + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_LO */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_HI */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_PITCH */ + OUT_RING(ring, 0x00000000); /* RB_BLIT_FLAG_DST_ARRAY_PITCH */ - tiled = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); + tiled = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); - OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_3, 5); - OUT_RING(ring, 0x00000004 | /* XXX RB_RESOLVE_CNTL_3 */ - COND(tiled, A5XX_RB_RESOLVE_CNTL_3_TILED)); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_BLIT_DST_LO/HI */ - OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(pitch)); - OUT_RING(ring, A5XX_RB_BLIT_DST_ARRAY_PITCH(slice->size0)); + OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_3, 5); + OUT_RING(ring, 0x00000004 | /* XXX RB_RESOLVE_CNTL_3 */ + COND(tiled, A5XX_RB_RESOLVE_CNTL_3_TILED)); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_BLIT_DST_LO/HI */ + OUT_RING(ring, A5XX_RB_BLIT_DST_PITCH(pitch)); + OUT_RING(ring, A5XX_RB_BLIT_DST_ARRAY_PITCH(slice->size0)); - OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); - OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(buf)); + OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1); + OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(buf)); -// bool msaa_resolve = pfb->samples > 1; - bool msaa_resolve = false; - OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); - OUT_RING(ring, COND(msaa_resolve, A5XX_RB_CLEAR_CNTL_MSAA_RESOLVE)); + // bool msaa_resolve = pfb->samples > 1; + bool msaa_resolve = false; + OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1); + OUT_RING(ring, COND(msaa_resolve, A5XX_RB_CLEAR_CNTL_MSAA_RESOLVE)); - fd5_emit_blit(batch, ring); + fd5_emit_blit(batch, ring); } static void fd5_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - - if (!rsc->stencil || 
(batch->resolve & FD_BUFFER_DEPTH)) - emit_gmem2mem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf, BLIT_ZS); - if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) - emit_gmem2mem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_S); - } - - if (batch->resolve & FD_BUFFER_COLOR) { - unsigned i; - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) - continue; - emit_gmem2mem_surf(batch, gmem->cbuf_base[i], - pfb->cbufs[i], BLIT_MRT0 + i); - } - } + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + + if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) + emit_gmem2mem_surf(batch, gmem->zsbuf_base[0], pfb->zsbuf, BLIT_ZS); + if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) + emit_gmem2mem_surf(batch, gmem->zsbuf_base[1], pfb->zsbuf, BLIT_S); + } + + if (batch->resolve & FD_BUFFER_COLOR) { + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_gmem2mem_surf(batch, gmem->cbuf_base[i], pfb->cbufs[i], + BLIT_MRT0 + i); + } + } } static void -fd5_emit_tile_fini(struct fd_batch *batch) - assert_dt +fd5_emit_tile_fini(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; + struct fd_ringbuffer *ring = batch->gmem; - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - fd5_emit_lrz_flush(batch, ring); + fd5_emit_lrz_flush(batch, ring); - fd5_cache_flush(batch, ring); - fd5_set_render_mode(batch->ctx, ring, BYPASS); + fd5_cache_flush(batch, ring); + fd5_set_render_mode(batch->ctx, ring, BYPASS); } static void -fd5_emit_sysmem_prep(struct fd_batch *batch) - assert_dt +fd5_emit_sysmem_prep(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; + struct fd_ringbuffer *ring = batch->gmem; - fd5_emit_restore(batch, ring); + fd5_emit_restore(batch, ring); - fd5_emit_lrz_flush(batch, ring); + fd5_emit_lrz_flush(batch, ring); - if (batch->prologue) - fd5_emit_ib(ring, batch->prologue); + if (batch->prologue) + fd5_emit_ib(ring, batch->prologue); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - fd5_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); + fd5_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); - OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1); - OUT_RING(ring, 0x00000003); /* PC_POWER_CNTL */ + OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1); + OUT_RING(ring, 0x00000003); /* PC_POWER_CNTL */ - OUT_PKT4(ring, REG_A5XX_VFD_POWER_CNTL, 1); - OUT_RING(ring, 0x00000003); /* VFD_POWER_CNTL */ + OUT_PKT4(ring, REG_A5XX_VFD_POWER_CNTL, 1); + OUT_RING(ring, 0x00000003); /* VFD_POWER_CNTL */ - /* 0x10000000 for BYPASS.. 0x7c13c080 for GMEM: */ - fd_wfi(batch, ring); - OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); - OUT_RING(ring, 0x10000000); /* RB_CCU_CNTL */ + /* 0x10000000 for BYPASS.. 
0x7c13c080 for GMEM: */ + fd_wfi(batch, ring); + OUT_PKT4(ring, REG_A5XX_RB_CCU_CNTL, 1); + OUT_RING(ring, 0x10000000); /* RB_CCU_CNTL */ - OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); - OUT_RING(ring, A5XX_RB_CNTL_WIDTH(0) | - A5XX_RB_CNTL_HEIGHT(0) | - A5XX_RB_CNTL_BYPASS); + OUT_PKT4(ring, REG_A5XX_RB_CNTL, 1); + OUT_RING(ring, A5XX_RB_CNTL_WIDTH(0) | A5XX_RB_CNTL_HEIGHT(0) | + A5XX_RB_CNTL_BYPASS); - /* remaining setup below here does not apply to blit/compute: */ - if (batch->nondraw) - return; + /* remaining setup below here does not apply to blit/compute: */ + if (batch->nondraw) + return; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); - OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | - A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); - OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) | - A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1)); + OUT_PKT4(ring, REG_A5XX_GRAS_SC_WINDOW_SCISSOR_TL, 2); + OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_TL_X(0) | + A5XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(0)); + OUT_RING(ring, A5XX_GRAS_SC_WINDOW_SCISSOR_BR_X(pfb->width - 1) | + A5XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(pfb->height - 1)); - OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(0) | - A5XX_RB_RESOLVE_CNTL_1_Y(0)); - OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(pfb->width - 1) | - A5XX_RB_RESOLVE_CNTL_2_Y(pfb->height - 1)); + OUT_PKT4(ring, REG_A5XX_RB_RESOLVE_CNTL_1, 2); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_1_X(0) | A5XX_RB_RESOLVE_CNTL_1_Y(0)); + OUT_RING(ring, A5XX_RB_RESOLVE_CNTL_2_X(pfb->width - 1) | + A5XX_RB_RESOLVE_CNTL_2_Y(pfb->height - 1)); - OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(0) | - A5XX_RB_WINDOW_OFFSET_Y(0)); + OUT_PKT4(ring, REG_A5XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A5XX_RB_WINDOW_OFFSET_X(0) | A5XX_RB_WINDOW_OFFSET_Y(0)); - /* Enable stream output, since there's no binning pass to put it in. */ - OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); - OUT_RING(ring, 0); + /* Enable stream output, since there's no binning pass to put it in. 
*/ + OUT_PKT4(ring, REG_A5XX_VPC_SO_OVERRIDE, 1); + OUT_RING(ring, 0); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x1); - patch_draws(batch, IGNORE_VISIBILITY); + patch_draws(batch, IGNORE_VISIBILITY); - emit_zs(ring, pfb->zsbuf, NULL); - emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL); + emit_zs(ring, pfb->zsbuf, NULL); + emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL); - OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE)); - OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) | - A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE); + OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE)); + OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) | + A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE); - OUT_PKT4(ring, REG_A5XX_RB_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE)); - OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) | - A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE); + OUT_PKT4(ring, REG_A5XX_RB_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE)); + OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) | + A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE); - OUT_PKT4(ring, REG_A5XX_GRAS_SC_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE)); - OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) | - A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE); + OUT_PKT4(ring, REG_A5XX_GRAS_SC_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE)); + OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) | + A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE); } static void fd5_emit_sysmem_fini(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; + struct fd_ringbuffer *ring = batch->gmem; - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - fd5_emit_lrz_flush(batch, ring); + fd5_emit_lrz_flush(batch, ring); - fd5_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd5_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd5_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd5_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); } void -fd5_gmem_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd5_gmem_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - - ctx->emit_tile_init = fd5_emit_tile_init; - ctx->emit_tile_prep = fd5_emit_tile_prep; - ctx->emit_tile_mem2gmem = fd5_emit_tile_mem2gmem; - ctx->emit_tile_renderprep = fd5_emit_tile_renderprep; - ctx->emit_tile_gmem2mem = fd5_emit_tile_gmem2mem; - ctx->emit_tile_fini = fd5_emit_tile_fini; - ctx->emit_sysmem_prep = fd5_emit_sysmem_prep; - ctx->emit_sysmem_fini = fd5_emit_sysmem_fini; + struct fd_context *ctx = fd_context(pctx); + + ctx->emit_tile_init = fd5_emit_tile_init; + ctx->emit_tile_prep = fd5_emit_tile_prep; + ctx->emit_tile_mem2gmem = fd5_emit_tile_mem2gmem; + ctx->emit_tile_renderprep = fd5_emit_tile_renderprep; + ctx->emit_tile_gmem2mem = fd5_emit_tile_gmem2mem; + ctx->emit_tile_fini = fd5_emit_tile_fini; + ctx->emit_sysmem_prep = fd5_emit_sysmem_prep; + ctx->emit_sysmem_fini = fd5_emit_sysmem_fini; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_image.c b/src/gallium/drivers/freedreno/a5xx/fd5_image.c index 
6c2bb49..f420169 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_image.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_image.c @@ -26,181 +26,184 @@ #include "pipe/p_state.h" -#include "freedreno_resource.h" -#include "fd5_image.h" #include "fd5_format.h" +#include "fd5_image.h" #include "fd5_texture.h" +#include "freedreno_resource.h" static enum a4xx_state_block texsb[] = { - [PIPE_SHADER_COMPUTE] = SB4_CS_TEX, - [PIPE_SHADER_FRAGMENT] = SB4_FS_TEX, + [PIPE_SHADER_COMPUTE] = SB4_CS_TEX, + [PIPE_SHADER_FRAGMENT] = SB4_FS_TEX, }; static enum a4xx_state_block imgsb[] = { - [PIPE_SHADER_COMPUTE] = SB4_CS_SSBO, - [PIPE_SHADER_FRAGMENT] = SB4_SSBO, + [PIPE_SHADER_COMPUTE] = SB4_CS_SSBO, + [PIPE_SHADER_FRAGMENT] = SB4_SSBO, }; struct fd5_image { - enum pipe_format pfmt; - enum a5xx_tex_fmt fmt; - enum a5xx_tex_type type; - bool srgb; - uint32_t cpp; - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t pitch; - uint32_t array_pitch; - struct fd_bo *bo; - uint32_t offset; - bool buffer; + enum pipe_format pfmt; + enum a5xx_tex_fmt fmt; + enum a5xx_tex_type type; + bool srgb; + uint32_t cpp; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t pitch; + uint32_t array_pitch; + struct fd_bo *bo; + uint32_t offset; + bool buffer; }; -static void translate_image(struct fd5_image *img, struct pipe_image_view *pimg) +static void +translate_image(struct fd5_image *img, struct pipe_image_view *pimg) { - enum pipe_format format = pimg->format; - struct pipe_resource *prsc = pimg->resource; - struct fd_resource *rsc = fd_resource(prsc); - - if (!pimg->resource) { - memset(img, 0, sizeof(*img)); - return; - } - - img->pfmt = format; - img->fmt = fd5_pipe2tex(format); - img->type = fd5_tex_type(prsc->target); - img->srgb = util_format_is_srgb(format); - img->cpp = rsc->layout.cpp; - img->bo = rsc->bo; - - /* Treat cube textures as 2d-array: */ - if (img->type == A5XX_TEX_CUBE) - img->type = A5XX_TEX_2D; - - if (prsc->target == PIPE_BUFFER) { - img->buffer = true; - img->offset = pimg->u.buf.offset; - img->pitch = 0; - img->array_pitch = 0; - - /* size is encoded with low 15b in WIDTH and high bits in - * HEIGHT, in units of elements: - */ - unsigned sz = pimg->u.buf.size / util_format_get_blocksize(format); - img->width = sz & MASK(15); - img->height = sz >> 15; - img->depth = 0; - } else { - img->buffer = false; - - unsigned lvl = pimg->u.tex.level; - img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer); - img->pitch = fd_resource_pitch(rsc, lvl); - - img->width = u_minify(prsc->width0, lvl); - img->height = u_minify(prsc->height0, lvl); - - unsigned layers = pimg->u.tex.last_layer - pimg->u.tex.first_layer + 1; - - switch (prsc->target) { - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_2D: - img->array_pitch = rsc->layout.layer_size; - img->depth = 1; - break; - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - img->array_pitch = rsc->layout.layer_size; - img->depth = layers; - break; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - img->array_pitch = rsc->layout.layer_size; - img->depth = layers; - break; - case PIPE_TEXTURE_3D: - img->array_pitch = fd_resource_slice(rsc, lvl)->size0; - img->depth = u_minify(prsc->depth0, lvl); - break; - default: - img->array_pitch = 0; - img->depth = 0; - break; - } - } + enum pipe_format format = pimg->format; + struct pipe_resource *prsc = pimg->resource; + struct fd_resource *rsc = fd_resource(prsc); + + if (!pimg->resource) { + memset(img, 0, sizeof(*img)); + return; + } + + 
img->pfmt = format; + img->fmt = fd5_pipe2tex(format); + img->type = fd5_tex_type(prsc->target); + img->srgb = util_format_is_srgb(format); + img->cpp = rsc->layout.cpp; + img->bo = rsc->bo; + + /* Treat cube textures as 2d-array: */ + if (img->type == A5XX_TEX_CUBE) + img->type = A5XX_TEX_2D; + + if (prsc->target == PIPE_BUFFER) { + img->buffer = true; + img->offset = pimg->u.buf.offset; + img->pitch = 0; + img->array_pitch = 0; + + /* size is encoded with low 15b in WIDTH and high bits in + * HEIGHT, in units of elements: + */ + unsigned sz = pimg->u.buf.size / util_format_get_blocksize(format); + img->width = sz & MASK(15); + img->height = sz >> 15; + img->depth = 0; + } else { + img->buffer = false; + + unsigned lvl = pimg->u.tex.level; + img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer); + img->pitch = fd_resource_pitch(rsc, lvl); + + img->width = u_minify(prsc->width0, lvl); + img->height = u_minify(prsc->height0, lvl); + + unsigned layers = pimg->u.tex.last_layer - pimg->u.tex.first_layer + 1; + + switch (prsc->target) { + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_2D: + img->array_pitch = rsc->layout.layer_size; + img->depth = 1; + break; + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + img->array_pitch = rsc->layout.layer_size; + img->depth = layers; + break; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + img->array_pitch = rsc->layout.layer_size; + img->depth = layers; + break; + case PIPE_TEXTURE_3D: + img->array_pitch = fd_resource_slice(rsc, lvl)->size0; + img->depth = u_minify(prsc->depth0, lvl); + break; + default: + img->array_pitch = 0; + img->depth = 0; + break; + } + } } -static void emit_image_tex(struct fd_ringbuffer *ring, unsigned slot, - struct fd5_image *img, enum pipe_shader_type shader) +static void +emit_image_tex(struct fd_ringbuffer *ring, unsigned slot, struct fd5_image *img, + enum pipe_shader_type shader) { - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 12); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(texsb[shader]) | - CP_LOAD_STATE4_0_NUM_UNIT(1)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - - OUT_RING(ring, A5XX_TEX_CONST_0_FMT(img->fmt) | - fd5_tex_swiz(img->pfmt, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W) | - COND(img->srgb, A5XX_TEX_CONST_0_SRGB)); - OUT_RING(ring, A5XX_TEX_CONST_1_WIDTH(img->width) | - A5XX_TEX_CONST_1_HEIGHT(img->height)); - OUT_RING(ring, - COND(img->buffer, A5XX_TEX_CONST_2_UNK4 | A5XX_TEX_CONST_2_UNK31) | - A5XX_TEX_CONST_2_TYPE(img->type) | - A5XX_TEX_CONST_2_PITCH(img->pitch)); - OUT_RING(ring, A5XX_TEX_CONST_3_ARRAY_PITCH(img->array_pitch)); - if (img->bo) { - OUT_RELOC(ring, img->bo, img->offset, - (uint64_t)A5XX_TEX_CONST_5_DEPTH(img->depth) << 32, 0); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, A5XX_TEX_CONST_5_DEPTH(img->depth)); - } - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 12); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(texsb[shader]) | + CP_LOAD_STATE4_0_NUM_UNIT(1)); + OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(ST4_CONSTANTS) | + CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, 
CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + + OUT_RING(ring, A5XX_TEX_CONST_0_FMT(img->fmt) | + fd5_tex_swiz(img->pfmt, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, + PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W) | + COND(img->srgb, A5XX_TEX_CONST_0_SRGB)); + OUT_RING(ring, A5XX_TEX_CONST_1_WIDTH(img->width) | + A5XX_TEX_CONST_1_HEIGHT(img->height)); + OUT_RING(ring, + COND(img->buffer, A5XX_TEX_CONST_2_UNK4 | A5XX_TEX_CONST_2_UNK31) | + A5XX_TEX_CONST_2_TYPE(img->type) | + A5XX_TEX_CONST_2_PITCH(img->pitch)); + OUT_RING(ring, A5XX_TEX_CONST_3_ARRAY_PITCH(img->array_pitch)); + if (img->bo) { + OUT_RELOC(ring, img->bo, img->offset, + (uint64_t)A5XX_TEX_CONST_5_DEPTH(img->depth) << 32, 0); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, A5XX_TEX_CONST_5_DEPTH(img->depth)); + } + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); } -static void emit_image_ssbo(struct fd_ringbuffer *ring, unsigned slot, - struct fd5_image *img, enum pipe_shader_type shader) +static void +emit_image_ssbo(struct fd_ringbuffer *ring, unsigned slot, + struct fd5_image *img, enum pipe_shader_type shader) { - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 2); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(imgsb[shader]) | - CP_LOAD_STATE4_0_NUM_UNIT(1)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(1) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - OUT_RING(ring, A5XX_SSBO_1_0_FMT(img->fmt) | - A5XX_SSBO_1_0_WIDTH(img->width)); - OUT_RING(ring, A5XX_SSBO_1_1_HEIGHT(img->height) | - A5XX_SSBO_1_1_DEPTH(img->depth)); - - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 2); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | - CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | - CP_LOAD_STATE4_0_STATE_BLOCK(imgsb[shader]) | - CP_LOAD_STATE4_0_NUM_UNIT(1)); - OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(2) | - CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - if (img->bo) { - OUT_RELOC(ring, img->bo, img->offset, 0, 0); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 2); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(imgsb[shader]) | + CP_LOAD_STATE4_0_NUM_UNIT(1)); + OUT_RING(ring, + CP_LOAD_STATE4_1_STATE_TYPE(1) | CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + OUT_RING(ring, + A5XX_SSBO_1_0_FMT(img->fmt) | A5XX_SSBO_1_0_WIDTH(img->width)); + OUT_RING(ring, A5XX_SSBO_1_1_HEIGHT(img->height) | + A5XX_SSBO_1_1_DEPTH(img->depth)); + + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 2); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) | + CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) | + CP_LOAD_STATE4_0_STATE_BLOCK(imgsb[shader]) | + CP_LOAD_STATE4_0_NUM_UNIT(1)); + OUT_RING(ring, + CP_LOAD_STATE4_1_STATE_TYPE(2) | CP_LOAD_STATE4_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + if (img->bo) { + OUT_RELOC(ring, img->bo, img->offset, 0, 0); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } } /* Emit required "SSBO" and sampler state. 
The sampler state is used by the @@ -209,19 +212,21 @@ static void emit_image_ssbo(struct fd_ringbuffer *ring, unsigned slot, */ void fd5_emit_images(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum pipe_shader_type shader, const struct ir3_shader_variant *v) + enum pipe_shader_type shader, + const struct ir3_shader_variant *v) { - struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader]; - unsigned enabled_mask = so->enabled_mask; - const struct ir3_ibo_mapping *m = &v->image_mapping; + struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader]; + unsigned enabled_mask = so->enabled_mask; + const struct ir3_ibo_mapping *m = &v->image_mapping; - while (enabled_mask) { - unsigned index = u_bit_scan(&enabled_mask); - struct fd5_image img; + while (enabled_mask) { + unsigned index = u_bit_scan(&enabled_mask); + struct fd5_image img; - translate_image(&img, &so->si[index]); + translate_image(&img, &so->si[index]); - emit_image_tex(ring, m->image_to_tex[index] + m->tex_base, &img, shader); - emit_image_ssbo(ring, v->shader->nir->info.num_ssbos + index, &img, shader); - } + emit_image_tex(ring, m->image_to_tex[index] + m->tex_base, &img, shader); + emit_image_ssbo(ring, v->shader->nir->info.num_ssbos + index, &img, + shader); + } } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_image.h b/src/gallium/drivers/freedreno/a5xx/fd5_image.h index f756782..b23d482c 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_image.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_image.h @@ -31,6 +31,7 @@ struct ir3_shader_variant; void fd5_emit_images(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum pipe_shader_type shader, const struct ir3_shader_variant *v); + enum pipe_shader_type shader, + const struct ir3_shader_variant *v); #endif /* FD5_IMAGE_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c index f29c3fe..06b41ba 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c @@ -25,61 +25,60 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/format/u_format.h" #include "util/bitset.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_program.h" -#include "fd5_program.h" #include "fd5_emit.h" -#include "fd5_texture.h" #include "fd5_format.h" +#include "fd5_program.h" +#include "fd5_texture.h" #include "ir3_cache.h" void fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) { - const struct ir3_info *si = &so->info; - enum a4xx_state_block sb = fd4_stage2shadersb(so->type); - enum a4xx_state_src src; - uint32_t i, sz, *bin; - - if (FD_DBG(DIRECT)) { - sz = si->sizedwords; - src = SS4_DIRECT; - bin = fd_bo_map(so->bo); - } else { - sz = 0; - src = SS4_INDIRECT; - bin = NULL; - } - - OUT_PKT7(ring, CP_LOAD_STATE4, 3 + sz); - OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | - CP_LOAD_STATE4_0_STATE_SRC(src) | - CP_LOAD_STATE4_0_STATE_BLOCK(sb) | - CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen)); - if (bin) { - OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | - CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER)); - OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); - } else { - OUT_RELOC(ring, so->bo, 0, - CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0); - } - - /* for how clever coverity is, it is sometimes rather dull, and - * doesn't realize that the only case where bin==NULL, sz==0: - */ 
- assume(bin || (sz == 0)); - - for (i = 0; i < sz; i++) { - OUT_RING(ring, bin[i]); - } + const struct ir3_info *si = &so->info; + enum a4xx_state_block sb = fd4_stage2shadersb(so->type); + enum a4xx_state_src src; + uint32_t i, sz, *bin; + + if (FD_DBG(DIRECT)) { + sz = si->sizedwords; + src = SS4_DIRECT; + bin = fd_bo_map(so->bo); + } else { + sz = 0; + src = SS4_INDIRECT; + bin = NULL; + } + + OUT_PKT7(ring, CP_LOAD_STATE4, 3 + sz); + OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(0) | + CP_LOAD_STATE4_0_STATE_SRC(src) | + CP_LOAD_STATE4_0_STATE_BLOCK(sb) | + CP_LOAD_STATE4_0_NUM_UNIT(so->instrlen)); + if (bin) { + OUT_RING(ring, CP_LOAD_STATE4_1_EXT_SRC_ADDR(0) | + CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER)); + OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0)); + } else { + OUT_RELOC(ring, so->bo, 0, CP_LOAD_STATE4_1_STATE_TYPE(ST4_SHADER), 0); + } + + /* for how clever coverity is, it is sometimes rather dull, and + * doesn't realize that the only case where bin==NULL, sz==0: + */ + assume(bin || (sz == 0)); + + for (i = 0; i < sz; i++) { + OUT_RING(ring, bin[i]); + } } /* TODO maybe some of this we could pre-compute once rather than having @@ -87,636 +86,651 @@ fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so) */ static void emit_stream_out(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, - struct ir3_shader_linkage *l) + struct ir3_shader_linkage *l) { - const struct ir3_stream_output_info *strmout = &v->shader->stream_output; - unsigned ncomp[PIPE_MAX_SO_BUFFERS] = {0}; - unsigned prog[align(l->max_loc, 2) / 2]; - - memset(prog, 0, sizeof(prog)); - - for (unsigned i = 0; i < strmout->num_outputs; i++) { - const struct ir3_stream_output *out = &strmout->output[i]; - unsigned k = out->register_index; - unsigned idx; - - ncomp[out->output_buffer] += out->num_components; - - /* linkage map sorted by order frag shader wants things, so - * a bit less ideal here.. 
- */ - for (idx = 0; idx < l->cnt; idx++) - if (l->var[idx].regid == v->outputs[k].regid) - break; - - debug_assert(idx < l->cnt); - - for (unsigned j = 0; j < out->num_components; j++) { - unsigned c = j + out->start_component; - unsigned loc = l->var[idx].loc + c; - unsigned off = j + out->dst_offset; /* in dwords */ - - if (loc & 1) { - prog[loc/2] |= A5XX_VPC_SO_PROG_B_EN | - A5XX_VPC_SO_PROG_B_BUF(out->output_buffer) | - A5XX_VPC_SO_PROG_B_OFF(off * 4); - } else { - prog[loc/2] |= A5XX_VPC_SO_PROG_A_EN | - A5XX_VPC_SO_PROG_A_BUF(out->output_buffer) | - A5XX_VPC_SO_PROG_A_OFF(off * 4); - } - } - } - - OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * ARRAY_SIZE(prog))); - OUT_RING(ring, REG_A5XX_VPC_SO_BUF_CNTL); - OUT_RING(ring, A5XX_VPC_SO_BUF_CNTL_ENABLE | - COND(ncomp[0] > 0, A5XX_VPC_SO_BUF_CNTL_BUF0) | - COND(ncomp[1] > 0, A5XX_VPC_SO_BUF_CNTL_BUF1) | - COND(ncomp[2] > 0, A5XX_VPC_SO_BUF_CNTL_BUF2) | - COND(ncomp[3] > 0, A5XX_VPC_SO_BUF_CNTL_BUF3)); - OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(0)); - OUT_RING(ring, ncomp[0]); - OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(1)); - OUT_RING(ring, ncomp[1]); - OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(2)); - OUT_RING(ring, ncomp[2]); - OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(3)); - OUT_RING(ring, ncomp[3]); - OUT_RING(ring, REG_A5XX_VPC_SO_CNTL); - OUT_RING(ring, A5XX_VPC_SO_CNTL_ENABLE); - for (unsigned i = 0; i < ARRAY_SIZE(prog); i++) { - OUT_RING(ring, REG_A5XX_VPC_SO_PROG); - OUT_RING(ring, prog[i]); - } + const struct ir3_stream_output_info *strmout = &v->shader->stream_output; + unsigned ncomp[PIPE_MAX_SO_BUFFERS] = {0}; + unsigned prog[align(l->max_loc, 2) / 2]; + + memset(prog, 0, sizeof(prog)); + + for (unsigned i = 0; i < strmout->num_outputs; i++) { + const struct ir3_stream_output *out = &strmout->output[i]; + unsigned k = out->register_index; + unsigned idx; + + ncomp[out->output_buffer] += out->num_components; + + /* linkage map sorted by order frag shader wants things, so + * a bit less ideal here.. 
+ */ + for (idx = 0; idx < l->cnt; idx++) + if (l->var[idx].regid == v->outputs[k].regid) + break; + + debug_assert(idx < l->cnt); + + for (unsigned j = 0; j < out->num_components; j++) { + unsigned c = j + out->start_component; + unsigned loc = l->var[idx].loc + c; + unsigned off = j + out->dst_offset; /* in dwords */ + + if (loc & 1) { + prog[loc / 2] |= A5XX_VPC_SO_PROG_B_EN | + A5XX_VPC_SO_PROG_B_BUF(out->output_buffer) | + A5XX_VPC_SO_PROG_B_OFF(off * 4); + } else { + prog[loc / 2] |= A5XX_VPC_SO_PROG_A_EN | + A5XX_VPC_SO_PROG_A_BUF(out->output_buffer) | + A5XX_VPC_SO_PROG_A_OFF(off * 4); + } + } + } + + OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * ARRAY_SIZE(prog))); + OUT_RING(ring, REG_A5XX_VPC_SO_BUF_CNTL); + OUT_RING(ring, A5XX_VPC_SO_BUF_CNTL_ENABLE | + COND(ncomp[0] > 0, A5XX_VPC_SO_BUF_CNTL_BUF0) | + COND(ncomp[1] > 0, A5XX_VPC_SO_BUF_CNTL_BUF1) | + COND(ncomp[2] > 0, A5XX_VPC_SO_BUF_CNTL_BUF2) | + COND(ncomp[3] > 0, A5XX_VPC_SO_BUF_CNTL_BUF3)); + OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(0)); + OUT_RING(ring, ncomp[0]); + OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(1)); + OUT_RING(ring, ncomp[1]); + OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(2)); + OUT_RING(ring, ncomp[2]); + OUT_RING(ring, REG_A5XX_VPC_SO_NCOMP(3)); + OUT_RING(ring, ncomp[3]); + OUT_RING(ring, REG_A5XX_VPC_SO_CNTL); + OUT_RING(ring, A5XX_VPC_SO_CNTL_ENABLE); + for (unsigned i = 0; i < ARRAY_SIZE(prog); i++) { + OUT_RING(ring, REG_A5XX_VPC_SO_PROG); + OUT_RING(ring, prog[i]); + } } struct stage { - const struct ir3_shader_variant *v; - const struct ir3_info *i; - /* const sizes are in units of 4 * vec4 */ - uint8_t constoff; - uint8_t constlen; - /* instr sizes are in units of 16 instructions */ - uint8_t instroff; - uint8_t instrlen; + const struct ir3_shader_variant *v; + const struct ir3_info *i; + /* const sizes are in units of 4 * vec4 */ + uint8_t constoff; + uint8_t constlen; + /* instr sizes are in units of 16 instructions */ + uint8_t instroff; + uint8_t instrlen; }; -enum { - VS = 0, - FS = 1, - HS = 2, - DS = 3, - GS = 4, - MAX_STAGES -}; +enum { VS = 0, FS = 1, HS = 2, DS = 3, GS = 4, MAX_STAGES }; static void setup_stages(struct fd5_emit *emit, struct stage *s) { - unsigned i; - - s[VS].v = fd5_emit_get_vp(emit); - s[FS].v = fd5_emit_get_fp(emit); - - s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ - - for (i = 0; i < MAX_STAGES; i++) { - if (s[i].v) { - s[i].i = &s[i].v->info; - /* constlen is in units of 4 * vec4: */ - assert(s[i].v->constlen % 4 == 0); - s[i].constlen = s[i].v->constlen / 4; - /* instrlen is already in units of 16 instr.. although - * probably we should ditch that and not make the compiler - * care about instruction group size of a3xx vs a5xx - */ - s[i].instrlen = s[i].v->instrlen; - } else { - s[i].i = NULL; - s[i].constlen = 0; - s[i].instrlen = 0; - } - } - - /* NOTE: at least for gles2, blob partitions VS at bottom of const - * space and FS taking entire remaining space. We probably don't - * need to do that the same way, but for now mimic what the blob - * does to make it easier to diff against register values from blob - * - * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders - * is run from external memory. 
- */ - if ((s[VS].instrlen + s[FS].instrlen) > 64) { - /* prioritize FS for internal memory: */ - if (s[FS].instrlen < 64) { - /* if FS can fit, kick VS out to external memory: */ - s[VS].instrlen = 0; - } else if (s[VS].instrlen < 64) { - /* otherwise if VS can fit, kick out FS: */ - s[FS].instrlen = 0; - } else { - /* neither can fit, run both from external memory: */ - s[VS].instrlen = 0; - s[FS].instrlen = 0; - } - } - - unsigned constoff = 0; - for (i = 0; i < MAX_STAGES; i++) { - s[i].constoff = constoff; - constoff += s[i].constlen; - } - - s[VS].instroff = 0; - s[FS].instroff = 64 - s[FS].instrlen; - s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff; + unsigned i; + + s[VS].v = fd5_emit_get_vp(emit); + s[FS].v = fd5_emit_get_fp(emit); + + s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ + + for (i = 0; i < MAX_STAGES; i++) { + if (s[i].v) { + s[i].i = &s[i].v->info; + /* constlen is in units of 4 * vec4: */ + assert(s[i].v->constlen % 4 == 0); + s[i].constlen = s[i].v->constlen / 4; + /* instrlen is already in units of 16 instr.. although + * probably we should ditch that and not make the compiler + * care about instruction group size of a3xx vs a5xx + */ + s[i].instrlen = s[i].v->instrlen; + } else { + s[i].i = NULL; + s[i].constlen = 0; + s[i].instrlen = 0; + } + } + + /* NOTE: at least for gles2, blob partitions VS at bottom of const + * space and FS taking entire remaining space. We probably don't + * need to do that the same way, but for now mimic what the blob + * does to make it easier to diff against register values from blob + * + * NOTE: if VS.instrlen + FS.instrlen > 64, then one or both shaders + * is run from external memory. + */ + if ((s[VS].instrlen + s[FS].instrlen) > 64) { + /* prioritize FS for internal memory: */ + if (s[FS].instrlen < 64) { + /* if FS can fit, kick VS out to external memory: */ + s[VS].instrlen = 0; + } else if (s[VS].instrlen < 64) { + /* otherwise if VS can fit, kick out FS: */ + s[FS].instrlen = 0; + } else { + /* neither can fit, run both from external memory: */ + s[VS].instrlen = 0; + s[FS].instrlen = 0; + } + } + + unsigned constoff = 0; + for (i = 0; i < MAX_STAGES; i++) { + s[i].constoff = constoff; + constoff += s[i].constlen; + } + + s[VS].instroff = 0; + s[FS].instroff = 64 - s[FS].instrlen; + s[HS].instroff = s[DS].instroff = s[GS].instroff = s[FS].instroff; } static inline uint32_t next_regid(uint32_t reg, uint32_t increment) { - if (VALIDREG(reg)) - return reg + increment; - else - return regid(63,0); + if (VALIDREG(reg)) + return reg + increment; + else + return regid(63, 0); } void fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd5_emit *emit) + struct fd5_emit *emit) { - struct stage s[MAX_STAGES]; - uint32_t pos_regid, psize_regid, color_regid[8]; - uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid, samp_mask_regid; - uint32_t ij_regid[IJ_COUNT], vertex_regid, instance_regid, clip0_regid, clip1_regid; - enum a3xx_threadsize fssz; - uint8_t psize_loc = ~0; - int i, j; - - setup_stages(emit, s); - - bool do_streamout = (s[VS].v->shader->stream_output.num_outputs > 0); - uint8_t clip_mask = s[VS].v->clip_mask, cull_mask = s[VS].v->cull_mask; - uint8_t clip_cull_mask = clip_mask | cull_mask; - - fssz = (s[FS].i->double_threadsize) ? 
FOUR_QUADS : TWO_QUADS; - - pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_CLIP_DIST1); - vertex_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); - instance_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_INSTANCE_ID); - - if (s[FS].v->color0_mrt) { - color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = - color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = - ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR); - } else { - color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0); - color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1); - color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2); - color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3); - color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4); - color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5); - color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6); - color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7); - } - - samp_id_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID); - samp_mask_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN); - face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); - coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); - zwcoord_regid = next_regid(coord_regid, 2); - for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) - ij_regid[i] = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - - /* we could probably divide this up into things that need to be - * emitted if frag-prog is dirty vs if vert-prog is dirty.. 
- */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONFIG, 5); - OUT_RING(ring, A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) | - A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET(s[VS].instroff) | - COND(s[VS].v, A5XX_HLSQ_VS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET(s[FS].constoff) | - A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET(s[FS].instroff) | - COND(s[FS].v, A5XX_HLSQ_FS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET(s[HS].constoff) | - A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET(s[HS].instroff) | - COND(s[HS].v, A5XX_HLSQ_HS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET(s[DS].constoff) | - A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET(s[DS].instroff) | - COND(s[DS].v, A5XX_HLSQ_DS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET(s[GS].constoff) | - A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET(s[GS].instroff) | - COND(s[GS].v, A5XX_HLSQ_GS_CONFIG_ENABLED)); - - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CNTL, 5); - OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen) | - COND(s[VS].v && s[VS].v->has_ssbo, A5XX_HLSQ_VS_CNTL_SSBO_ENABLE)); - OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen) | - COND(s[FS].v && s[FS].v->has_ssbo, A5XX_HLSQ_FS_CNTL_SSBO_ENABLE)); - OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen) | - COND(s[HS].v && s[HS].v->has_ssbo, A5XX_HLSQ_HS_CNTL_SSBO_ENABLE)); - OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen) | - COND(s[DS].v && s[DS].v->has_ssbo, A5XX_HLSQ_DS_CNTL_SSBO_ENABLE)); - OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen) | - COND(s[GS].v && s[GS].v->has_ssbo, A5XX_HLSQ_GS_CNTL_SSBO_ENABLE)); - - OUT_PKT4(ring, REG_A5XX_SP_VS_CONFIG, 5); - OUT_RING(ring, A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) | - A5XX_SP_VS_CONFIG_SHADEROBJOFFSET(s[VS].instroff) | - COND(s[VS].v, A5XX_SP_VS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET(s[FS].constoff) | - A5XX_SP_FS_CONFIG_SHADEROBJOFFSET(s[FS].instroff) | - COND(s[FS].v, A5XX_SP_FS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET(s[HS].constoff) | - A5XX_SP_HS_CONFIG_SHADEROBJOFFSET(s[HS].instroff) | - COND(s[HS].v, A5XX_SP_HS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET(s[DS].constoff) | - A5XX_SP_DS_CONFIG_SHADEROBJOFFSET(s[DS].instroff) | - COND(s[DS].v, A5XX_SP_DS_CONFIG_ENABLED)); - OUT_RING(ring, A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET(s[GS].constoff) | - A5XX_SP_GS_CONFIG_SHADEROBJOFFSET(s[GS].instroff) | - COND(s[GS].v, A5XX_SP_GS_CONFIG_ENABLED)); - - OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONSTLEN, 2); - OUT_RING(ring, s[VS].constlen); /* HLSQ_VS_CONSTLEN */ - OUT_RING(ring, s[VS].instrlen); /* HLSQ_VS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_FS_CONSTLEN, 2); - OUT_RING(ring, s[FS].constlen); /* HLSQ_FS_CONSTLEN */ - OUT_RING(ring, s[FS].instrlen); /* HLSQ_FS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_HS_CONSTLEN, 2); - OUT_RING(ring, s[HS].constlen); /* HLSQ_HS_CONSTLEN */ - OUT_RING(ring, s[HS].instrlen); /* HLSQ_HS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_DS_CONSTLEN, 2); - OUT_RING(ring, s[DS].constlen); /* HLSQ_DS_CONSTLEN */ - OUT_RING(ring, s[DS].instrlen); /* HLSQ_DS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_GS_CONSTLEN, 2); - OUT_RING(ring, s[GS].constlen); /* HLSQ_GS_CONSTLEN */ - OUT_RING(ring, s[GS].instrlen); /* HLSQ_GS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 
2); - OUT_RING(ring, 0x00000000); /* HLSQ_CS_CONSTLEN */ - OUT_RING(ring, 0x00000000); /* HLSQ_CS_INSTRLEN */ - - OUT_PKT4(ring, REG_A5XX_SP_VS_CTRL_REG0, 1); - OUT_RING(ring, A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) | - A5XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) | - 0x6 | /* XXX seems to be always set? */ - A5XX_SP_VS_CTRL_REG0_BRANCHSTACK(s[VS].v->branchstack) | - COND(s[VS].v->need_pixlod, A5XX_SP_VS_CTRL_REG0_PIXLODENABLE)); - - /* If we have streamout, link against the real FS in the binning program, - * rather than the dummy FS used for binning pass state, to ensure the - * OUTLOC's match. Depending on whether we end up doing sysmem or gmem, the - * actual streamout could happen with either the binning pass or draw pass - * program, but the same streamout stateobj is used in either case: - */ - const struct ir3_shader_variant *link_fs = s[FS].v; - if (do_streamout && emit->binning_pass) - link_fs = emit->prog->fs; - struct ir3_shader_linkage l = {0}; - ir3_link_shaders(&l, s[VS].v, link_fs, true); - - uint8_t clip0_loc = l.clip0_loc; - uint8_t clip1_loc = l.clip1_loc; - - OUT_PKT4(ring, REG_A5XX_VPC_VAR_DISABLE(0), 4); - OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ - OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ - OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ - OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ - - /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ - ir3_link_stream_out(&l, s[VS].v); - - /* a5xx appends pos/psize to end of the linkage map: */ - if (VALIDREG(pos_regid)) - ir3_link_add(&l, pos_regid, 0xf, l.max_loc); - - if (VALIDREG(psize_regid)) { - psize_loc = l.max_loc; - ir3_link_add(&l, psize_regid, 0x1, l.max_loc); - } - - /* Handle the case where clip/cull distances aren't read by the FS. Make - * sure to avoid adding an output with an empty writemask if the user - * disables all the clip distances in the API so that the slot is unused. - */ - if (clip0_loc == 0xff && VALIDREG(clip0_regid) && (clip_cull_mask & 0xf) != 0) { - clip0_loc = l.max_loc; - ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc); - } - - if (clip1_loc == 0xff && VALIDREG(clip1_regid) && (clip_cull_mask >> 4) != 0) { - clip1_loc = l.max_loc; - ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc); - } - - /* If we have stream-out, we use the full shader for binning - * pass, rather than the optimized binning pass one, so that we - * have all the varying outputs available for xfb. 
So streamout - * state should always be derived from the non-binning pass - * program: - */ - if (do_streamout && !emit->binning_pass) - emit_stream_out(ring, s[VS].v, &l); - - for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { - uint32_t reg = 0; - - OUT_PKT4(ring, REG_A5XX_SP_VS_OUT_REG(i), 1); - - reg |= A5XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); - reg |= A5XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); - j++; - - reg |= A5XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); - reg |= A5XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); - j++; - - OUT_RING(ring, reg); - } - - for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { - uint32_t reg = 0; - - OUT_PKT4(ring, REG_A5XX_SP_VS_VPC_DST_REG(i), 1); - - reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); - reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); - reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); - reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); - - OUT_RING(ring, reg); - } - - OUT_PKT4(ring, REG_A5XX_SP_VS_OBJ_START_LO, 2); - OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_LO/HI */ - - if (s[VS].instrlen) - fd5_emit_shader(ring, s[VS].v); - - // TODO depending on other bits in this reg (if any) set somewhere else? - OUT_PKT4(ring, REG_A5XX_PC_PRIM_VTX_CNTL, 1); - OUT_RING(ring, COND(s[VS].v->writes_psize, A5XX_PC_PRIM_VTX_CNTL_PSIZE)); - - OUT_PKT4(ring, REG_A5XX_SP_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A5XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt)); - - OUT_PKT4(ring, REG_A5XX_VPC_CNTL_0, 1); - OUT_RING(ring, A5XX_VPC_CNTL_0_STRIDE_IN_VPC(l.max_loc) | - COND(s[FS].v->total_in > 0, A5XX_VPC_CNTL_0_VARYING) | - 0x10000); // XXX - - fd5_context(ctx)->max_loc = l.max_loc; - - if (emit->binning_pass) { - OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2); - OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_LO */ - OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_HI */ - } else { - OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2); - OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_LO/HI */ - } - - OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 5); - OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) | - A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(TWO_QUADS) | - 0x00000880); /* XXX HLSQ_CONTROL_0 */ - OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63)); - OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | - A5XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | - A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(samp_mask_regid) | - A5XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); - OUT_RING(ring, - A5XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | - A5XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | - A5XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | - A5XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); - OUT_RING(ring, A5XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | - A5XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | - A5XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | - A5XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); - - OUT_PKT4(ring, REG_A5XX_SP_FS_CTRL_REG0, 1); - OUT_RING(ring, COND(s[FS].v->total_in > 0, A5XX_SP_FS_CTRL_REG0_VARYING) | - 0x40006 | /* XXX set pretty much everywhere */ - A5XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | - A5XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | - A5XX_SP_FS_CTRL_REG0_BRANCHSTACK(s[FS].v->branchstack) | - COND(s[FS].v->need_pixlod, A5XX_SP_FS_CTRL_REG0_PIXLODENABLE)); - - 
OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); - OUT_RING(ring, 0x020fffff); /* XXX */ - - OUT_PKT4(ring, REG_A5XX_VPC_GS_SIV_CNTL, 1); - OUT_RING(ring, 0x0000ffff); /* XXX */ - - OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1); - OUT_RING(ring, 0x00000010); /* XXX */ - - /* XXX: missing enable bits for per-sample bary linear centroid and IJ_PERSP_SIZE - * (should be identical to a6xx) - */ - - OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1); - OUT_RING(ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A5XX_GRAS_CNTL_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A5XX_GRAS_CNTL_IJ_PERSP_CENTROID) | - COND(s[FS].v->fragcoord_compmask != 0, - A5XX_GRAS_CNTL_COORD_MASK(s[FS].v->fragcoord_compmask) | - A5XX_GRAS_CNTL_SIZE) | - COND(s[FS].v->frag_face, A5XX_GRAS_CNTL_SIZE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_GRAS_CNTL_SIZE)); - - OUT_PKT4(ring, REG_A5XX_RB_RENDER_CONTROL0, 2); - OUT_RING(ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A5XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A5XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | - COND(s[FS].v->fragcoord_compmask != 0, - A5XX_RB_RENDER_CONTROL0_COORD_MASK(s[FS].v->fragcoord_compmask) | - A5XX_RB_RENDER_CONTROL0_SIZE) | - COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL0_SIZE) | - CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_RB_RENDER_CONTROL0_SIZE)); - OUT_RING(ring, - CONDREG(samp_mask_regid, A5XX_RB_RENDER_CONTROL1_SAMPLEMASK) | - COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL1_FACENESS) | - CONDREG(samp_id_regid, A5XX_RB_RENDER_CONTROL1_SAMPLEID)); - - OUT_PKT4(ring, REG_A5XX_SP_FS_OUTPUT_REG(0), 8); - for (i = 0; i < 8; i++) { - OUT_RING(ring, A5XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | - COND(color_regid[i] & HALF_REG_ID, A5XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); - } - - - OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1); - OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) | - A5XX_VPC_PACK_PSIZELOC(psize_loc)); - - if (!emit->binning_pass) { - uint32_t vinterp[8], vpsrepl[8]; - - memset(vinterp, 0, sizeof(vinterp)); - memset(vpsrepl, 0, sizeof(vpsrepl)); - - /* looks like we need to do int varyings in the frag - * shader on a5xx (no flatshad reg? or a420.0 bug?): - * - * (sy)(ss)nop - * (sy)ldlv.u32 r0.x,l[r0.x], 1 - * ldlv.u32 r0.y,l[r0.x+1], 1 - * (ss)bary.f (ei)r63.x, 0, r0.x - * (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x - * (rpt5)nop - * sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0 - * - * Possibly on later a5xx variants we'll be able to use - * something like the code below instead of workaround - * in the shader: - */ - /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ - for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) { - /* NOTE: varyings are packed, so if compmask is 0xb - * then first, third, and fourth component occupy - * three consecutive varying slots: - */ - unsigned compmask = s[FS].v->inputs[j].compmask; - - uint32_t inloc = s[FS].v->inputs[j].inloc; - - if (s[FS].v->inputs[j].flat || - (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { - uint32_t loc = inloc; - - for (i = 0; i < 4; i++) { - if (compmask & (1 << i)) { - vinterp[loc / 16] |= 1 << ((loc % 16) * 2); - //flatshade[loc / 32] |= 1 << (loc % 32); - loc++; - } - } - } - - bool coord_mode = emit->sprite_coord_mode; - if (ir3_point_sprite(s[FS].v, j, emit->sprite_coord_enable, &coord_mode)) { - /* mask is two 2-bit fields, where: - * '01' -> S - * '10' -> T - * '11' -> 1 - T (flip mode) - */ - unsigned mask = coord_mode ? 
0b1101 : 0b1001; - uint32_t loc = inloc; - if (compmask & 0x1) { - vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x2) { - vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x4) { - /* .z <- 0.0f */ - vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x8) { - /* .w <- 1.0f */ - vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); - loc++; - } - } - } - - OUT_PKT4(ring, REG_A5XX_VPC_VARYING_INTERP_MODE(0), 8); - for (i = 0; i < 8; i++) - OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ - - OUT_PKT4(ring, REG_A5XX_VPC_VARYING_PS_REPL_MODE(0), 8); - for (i = 0; i < 8; i++) - OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ - } - - OUT_PKT4(ring, REG_A5XX_GRAS_VS_CL_CNTL, 1); - OUT_RING(ring, A5XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | - A5XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); - - OUT_PKT4(ring, REG_A5XX_VPC_CLIP_CNTL, 1); - OUT_RING(ring, A5XX_VPC_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A5XX_VPC_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A5XX_VPC_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); - - OUT_PKT4(ring, REG_A5XX_PC_CLIP_CNTL, 1); - OUT_RING(ring, A5XX_PC_CLIP_CNTL_CLIP_MASK(clip_mask)); - - if (!emit->binning_pass) - if (s[FS].instrlen) - fd5_emit_shader(ring, s[FS].v); - - OUT_PKT4(ring, REG_A5XX_VFD_CONTROL_1, 5); - OUT_RING(ring, A5XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | - A5XX_VFD_CONTROL_1_REGID4INST(instance_regid) | - 0xfc0000); - OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_2 */ - OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_3 */ - OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ - OUT_RING(ring, 0x00000000); /* VFD_CONTROL_5 */ + struct stage s[MAX_STAGES]; + uint32_t pos_regid, psize_regid, color_regid[8]; + uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid, + samp_mask_regid; + uint32_t ij_regid[IJ_COUNT], vertex_regid, instance_regid, clip0_regid, + clip1_regid; + enum a3xx_threadsize fssz; + uint8_t psize_loc = ~0; + int i, j; + + setup_stages(emit, s); + + bool do_streamout = (s[VS].v->shader->stream_output.num_outputs > 0); + uint8_t clip_mask = s[VS].v->clip_mask, cull_mask = s[VS].v->cull_mask; + uint8_t clip_cull_mask = clip_mask | cull_mask; + + fssz = (s[FS].i->double_threadsize) ? 
FOUR_QUADS : TWO_QUADS; + + pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS); + psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ); + clip0_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_CLIP_DIST0); + clip1_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_CLIP_DIST1); + vertex_regid = + ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); + instance_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_INSTANCE_ID); + + if (s[FS].v->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = + ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR); + } else { + color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0); + color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1); + color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2); + color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3); + color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4); + color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5); + color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6); + color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7); + } + + samp_id_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID); + samp_mask_regid = + ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN); + face_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE); + coord_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD); + zwcoord_regid = next_regid(coord_regid, 2); + for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) + ij_regid[i] = ir3_find_sysval_regid( + s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); + + /* we could probably divide this up into things that need to be + * emitted if frag-prog is dirty vs if vert-prog is dirty.. 
+ */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONFIG, 5); + OUT_RING(ring, A5XX_HLSQ_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) | + A5XX_HLSQ_VS_CONFIG_SHADEROBJOFFSET(s[VS].instroff) | + COND(s[VS].v, A5XX_HLSQ_VS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_HLSQ_FS_CONFIG_CONSTOBJECTOFFSET(s[FS].constoff) | + A5XX_HLSQ_FS_CONFIG_SHADEROBJOFFSET(s[FS].instroff) | + COND(s[FS].v, A5XX_HLSQ_FS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_HLSQ_HS_CONFIG_CONSTOBJECTOFFSET(s[HS].constoff) | + A5XX_HLSQ_HS_CONFIG_SHADEROBJOFFSET(s[HS].instroff) | + COND(s[HS].v, A5XX_HLSQ_HS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_HLSQ_DS_CONFIG_CONSTOBJECTOFFSET(s[DS].constoff) | + A5XX_HLSQ_DS_CONFIG_SHADEROBJOFFSET(s[DS].instroff) | + COND(s[DS].v, A5XX_HLSQ_DS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_HLSQ_GS_CONFIG_CONSTOBJECTOFFSET(s[GS].constoff) | + A5XX_HLSQ_GS_CONFIG_SHADEROBJOFFSET(s[GS].instroff) | + COND(s[GS].v, A5XX_HLSQ_GS_CONFIG_ENABLED)); + + OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONFIG, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CNTL, 5); + OUT_RING(ring, A5XX_HLSQ_VS_CNTL_INSTRLEN(s[VS].instrlen) | + COND(s[VS].v && s[VS].v->has_ssbo, + A5XX_HLSQ_VS_CNTL_SSBO_ENABLE)); + OUT_RING(ring, A5XX_HLSQ_FS_CNTL_INSTRLEN(s[FS].instrlen) | + COND(s[FS].v && s[FS].v->has_ssbo, + A5XX_HLSQ_FS_CNTL_SSBO_ENABLE)); + OUT_RING(ring, A5XX_HLSQ_HS_CNTL_INSTRLEN(s[HS].instrlen) | + COND(s[HS].v && s[HS].v->has_ssbo, + A5XX_HLSQ_HS_CNTL_SSBO_ENABLE)); + OUT_RING(ring, A5XX_HLSQ_DS_CNTL_INSTRLEN(s[DS].instrlen) | + COND(s[DS].v && s[DS].v->has_ssbo, + A5XX_HLSQ_DS_CNTL_SSBO_ENABLE)); + OUT_RING(ring, A5XX_HLSQ_GS_CNTL_INSTRLEN(s[GS].instrlen) | + COND(s[GS].v && s[GS].v->has_ssbo, + A5XX_HLSQ_GS_CNTL_SSBO_ENABLE)); + + OUT_PKT4(ring, REG_A5XX_SP_VS_CONFIG, 5); + OUT_RING(ring, A5XX_SP_VS_CONFIG_CONSTOBJECTOFFSET(s[VS].constoff) | + A5XX_SP_VS_CONFIG_SHADEROBJOFFSET(s[VS].instroff) | + COND(s[VS].v, A5XX_SP_VS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_SP_FS_CONFIG_CONSTOBJECTOFFSET(s[FS].constoff) | + A5XX_SP_FS_CONFIG_SHADEROBJOFFSET(s[FS].instroff) | + COND(s[FS].v, A5XX_SP_FS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_SP_HS_CONFIG_CONSTOBJECTOFFSET(s[HS].constoff) | + A5XX_SP_HS_CONFIG_SHADEROBJOFFSET(s[HS].instroff) | + COND(s[HS].v, A5XX_SP_HS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_SP_DS_CONFIG_CONSTOBJECTOFFSET(s[DS].constoff) | + A5XX_SP_DS_CONFIG_SHADEROBJOFFSET(s[DS].instroff) | + COND(s[DS].v, A5XX_SP_DS_CONFIG_ENABLED)); + OUT_RING(ring, A5XX_SP_GS_CONFIG_CONSTOBJECTOFFSET(s[GS].constoff) | + A5XX_SP_GS_CONFIG_SHADEROBJOFFSET(s[GS].instroff) | + COND(s[GS].v, A5XX_SP_GS_CONFIG_ENABLED)); + + OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A5XX_HLSQ_VS_CONSTLEN, 2); + OUT_RING(ring, s[VS].constlen); /* HLSQ_VS_CONSTLEN */ + OUT_RING(ring, s[VS].instrlen); /* HLSQ_VS_INSTRLEN */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_FS_CONSTLEN, 2); + OUT_RING(ring, s[FS].constlen); /* HLSQ_FS_CONSTLEN */ + OUT_RING(ring, s[FS].instrlen); /* HLSQ_FS_INSTRLEN */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_HS_CONSTLEN, 2); + OUT_RING(ring, s[HS].constlen); /* HLSQ_HS_CONSTLEN */ + OUT_RING(ring, s[HS].instrlen); /* HLSQ_HS_INSTRLEN */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_DS_CONSTLEN, 2); + OUT_RING(ring, s[DS].constlen); /* HLSQ_DS_CONSTLEN */ + OUT_RING(ring, s[DS].instrlen); /* HLSQ_DS_INSTRLEN */ + + OUT_PKT4(ring, REG_A5XX_HLSQ_GS_CONSTLEN, 2); + OUT_RING(ring, s[GS].constlen); /* HLSQ_GS_CONSTLEN */ + OUT_RING(ring, s[GS].instrlen); /* HLSQ_GS_INSTRLEN */ + + OUT_PKT4(ring, 
REG_A5XX_HLSQ_CS_CONSTLEN, 2); + OUT_RING(ring, 0x00000000); /* HLSQ_CS_CONSTLEN */ + OUT_RING(ring, 0x00000000); /* HLSQ_CS_INSTRLEN */ + + OUT_PKT4(ring, REG_A5XX_SP_VS_CTRL_REG0, 1); + OUT_RING(ring, + A5XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(s[VS].i->max_half_reg + 1) | + A5XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) | + 0x6 | /* XXX seems to be always set? */ + A5XX_SP_VS_CTRL_REG0_BRANCHSTACK(s[VS].v->branchstack) | + COND(s[VS].v->need_pixlod, A5XX_SP_VS_CTRL_REG0_PIXLODENABLE)); + + /* If we have streamout, link against the real FS in the binning program, + * rather than the dummy FS used for binning pass state, to ensure the + * OUTLOC's match. Depending on whether we end up doing sysmem or gmem, the + * actual streamout could happen with either the binning pass or draw pass + * program, but the same streamout stateobj is used in either case: + */ + const struct ir3_shader_variant *link_fs = s[FS].v; + if (do_streamout && emit->binning_pass) + link_fs = emit->prog->fs; + struct ir3_shader_linkage l = {0}; + ir3_link_shaders(&l, s[VS].v, link_fs, true); + + uint8_t clip0_loc = l.clip0_loc; + uint8_t clip1_loc = l.clip1_loc; + + OUT_PKT4(ring, REG_A5XX_VPC_VAR_DISABLE(0), 4); + OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ + OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ + OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ + OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ + + /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ + ir3_link_stream_out(&l, s[VS].v); + + /* a5xx appends pos/psize to end of the linkage map: */ + if (VALIDREG(pos_regid)) + ir3_link_add(&l, pos_regid, 0xf, l.max_loc); + + if (VALIDREG(psize_regid)) { + psize_loc = l.max_loc; + ir3_link_add(&l, psize_regid, 0x1, l.max_loc); + } + + /* Handle the case where clip/cull distances aren't read by the FS. Make + * sure to avoid adding an output with an empty writemask if the user + * disables all the clip distances in the API so that the slot is unused. + */ + if (clip0_loc == 0xff && VALIDREG(clip0_regid) && + (clip_cull_mask & 0xf) != 0) { + clip0_loc = l.max_loc; + ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc); + } + + if (clip1_loc == 0xff && VALIDREG(clip1_regid) && + (clip_cull_mask >> 4) != 0) { + clip1_loc = l.max_loc; + ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc); + } + + /* If we have stream-out, we use the full shader for binning + * pass, rather than the optimized binning pass one, so that we + * have all the varying outputs available for xfb. 
So streamout + * state should always be derived from the non-binning pass + * program: + */ + if (do_streamout && !emit->binning_pass) + emit_stream_out(ring, s[VS].v, &l); + + for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) { + uint32_t reg = 0; + + OUT_PKT4(ring, REG_A5XX_SP_VS_OUT_REG(i), 1); + + reg |= A5XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); + reg |= A5XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); + j++; + + reg |= A5XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); + reg |= A5XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); + j++; + + OUT_RING(ring, reg); + } + + for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) { + uint32_t reg = 0; + + OUT_PKT4(ring, REG_A5XX_SP_VS_VPC_DST_REG(i), 1); + + reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); + reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); + reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); + reg |= A5XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); + + OUT_RING(ring, reg); + } + + OUT_PKT4(ring, REG_A5XX_SP_VS_OBJ_START_LO, 2); + OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_LO/HI */ + + if (s[VS].instrlen) + fd5_emit_shader(ring, s[VS].v); + + // TODO depending on other bits in this reg (if any) set somewhere else? + OUT_PKT4(ring, REG_A5XX_PC_PRIM_VTX_CNTL, 1); + OUT_RING(ring, COND(s[VS].v->writes_psize, A5XX_PC_PRIM_VTX_CNTL_PSIZE)); + + OUT_PKT4(ring, REG_A5XX_SP_PRIMITIVE_CNTL, 1); + OUT_RING(ring, A5XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt)); + + OUT_PKT4(ring, REG_A5XX_VPC_CNTL_0, 1); + OUT_RING(ring, A5XX_VPC_CNTL_0_STRIDE_IN_VPC(l.max_loc) | + COND(s[FS].v->total_in > 0, A5XX_VPC_CNTL_0_VARYING) | + 0x10000); // XXX + + fd5_context(ctx)->max_loc = l.max_loc; + + if (emit->binning_pass) { + OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2); + OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_LO */ + OUT_RING(ring, 0x00000000); /* SP_FS_OBJ_START_HI */ + } else { + OUT_PKT4(ring, REG_A5XX_SP_FS_OBJ_START_LO, 2); + OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_LO/HI */ + } + + OUT_PKT4(ring, REG_A5XX_HLSQ_CONTROL_0_REG, 5); + OUT_RING(ring, A5XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(fssz) | + A5XX_HLSQ_CONTROL_0_REG_CSTHREADSIZE(TWO_QUADS) | + 0x00000880); /* XXX HLSQ_CONTROL_0 */ + OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63)); + OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | + A5XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | + A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(samp_mask_regid) | + A5XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); + OUT_RING( + ring, + A5XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | + A5XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | + A5XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID( + ij_regid[IJ_PERSP_CENTROID]) | + A5XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID( + ij_regid[IJ_LINEAR_CENTROID])); + OUT_RING( + ring, + A5XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | + A5XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | + A5XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | + A5XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); + + OUT_PKT4(ring, REG_A5XX_SP_FS_CTRL_REG0, 1); + OUT_RING( + ring, + COND(s[FS].v->total_in > 0, A5XX_SP_FS_CTRL_REG0_VARYING) | + 0x40006 | /* XXX set pretty much everywhere */ + A5XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A5XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | + A5XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | + A5XX_SP_FS_CTRL_REG0_BRANCHSTACK(s[FS].v->branchstack) | + COND(s[FS].v->need_pixlod, 
A5XX_SP_FS_CTRL_REG0_PIXLODENABLE)); + + OUT_PKT4(ring, REG_A5XX_HLSQ_UPDATE_CNTL, 1); + OUT_RING(ring, 0x020fffff); /* XXX */ + + OUT_PKT4(ring, REG_A5XX_VPC_GS_SIV_CNTL, 1); + OUT_RING(ring, 0x0000ffff); /* XXX */ + + OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1); + OUT_RING(ring, 0x00000010); /* XXX */ + + /* XXX: missing enable bits for per-sample bary linear centroid and + * IJ_PERSP_SIZE (should be identical to a6xx) + */ + + OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1); + OUT_RING(ring, + CONDREG(ij_regid[IJ_PERSP_PIXEL], A5XX_GRAS_CNTL_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], + A5XX_GRAS_CNTL_IJ_PERSP_CENTROID) | + COND(s[FS].v->fragcoord_compmask != 0, + A5XX_GRAS_CNTL_COORD_MASK(s[FS].v->fragcoord_compmask) | + A5XX_GRAS_CNTL_SIZE) | + COND(s[FS].v->frag_face, A5XX_GRAS_CNTL_SIZE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_GRAS_CNTL_SIZE)); + + OUT_PKT4(ring, REG_A5XX_RB_RENDER_CONTROL0, 2); + OUT_RING( + ring, + CONDREG(ij_regid[IJ_PERSP_PIXEL], + A5XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], + A5XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | + COND(s[FS].v->fragcoord_compmask != 0, + A5XX_RB_RENDER_CONTROL0_COORD_MASK(s[FS].v->fragcoord_compmask) | + A5XX_RB_RENDER_CONTROL0_SIZE) | + COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL0_SIZE) | + CONDREG(ij_regid[IJ_LINEAR_PIXEL], A5XX_RB_RENDER_CONTROL0_SIZE)); + OUT_RING(ring, + CONDREG(samp_mask_regid, A5XX_RB_RENDER_CONTROL1_SAMPLEMASK) | + COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL1_FACENESS) | + CONDREG(samp_id_regid, A5XX_RB_RENDER_CONTROL1_SAMPLEID)); + + OUT_PKT4(ring, REG_A5XX_SP_FS_OUTPUT_REG(0), 8); + for (i = 0; i < 8; i++) { + OUT_RING(ring, A5XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | + COND(color_regid[i] & HALF_REG_ID, + A5XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); + } + + OUT_PKT4(ring, REG_A5XX_VPC_PACK, 1); + OUT_RING(ring, A5XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) | + A5XX_VPC_PACK_PSIZELOC(psize_loc)); + + if (!emit->binning_pass) { + uint32_t vinterp[8], vpsrepl[8]; + + memset(vinterp, 0, sizeof(vinterp)); + memset(vpsrepl, 0, sizeof(vpsrepl)); + + /* looks like we need to do int varyings in the frag + * shader on a5xx (no flatshad reg? or a420.0 bug?): + * + * (sy)(ss)nop + * (sy)ldlv.u32 r0.x,l[r0.x], 1 + * ldlv.u32 r0.y,l[r0.x+1], 1 + * (ss)bary.f (ei)r63.x, 0, r0.x + * (ss)(rpt1)cov.s32f16 hr0.x, (r)r0.x + * (rpt5)nop + * sam (f16)(xyzw)hr0.x, hr0.x, s#0, t#0 + * + * Possibly on later a5xx variants we'll be able to use + * something like the code below instead of workaround + * in the shader: + */ + /* figure out VARYING_INTERP / VARYING_PS_REPL register values: */ + for (j = -1; + (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count;) { + /* NOTE: varyings are packed, so if compmask is 0xb + * then first, third, and fourth component occupy + * three consecutive varying slots: + */ + unsigned compmask = s[FS].v->inputs[j].compmask; + + uint32_t inloc = s[FS].v->inputs[j].inloc; + + if (s[FS].v->inputs[j].flat || + (s[FS].v->inputs[j].rasterflat && emit->rasterflat)) { + uint32_t loc = inloc; + + for (i = 0; i < 4; i++) { + if (compmask & (1 << i)) { + vinterp[loc / 16] |= 1 << ((loc % 16) * 2); + // flatshade[loc / 32] |= 1 << (loc % 32); + loc++; + } + } + } + + bool coord_mode = emit->sprite_coord_mode; + if (ir3_point_sprite(s[FS].v, j, emit->sprite_coord_enable, + &coord_mode)) { + /* mask is two 2-bit fields, where: + * '01' -> S + * '10' -> T + * '11' -> 1 - T (flip mode) + */ + unsigned mask = coord_mode ? 
0b1101 : 0b1001; + uint32_t loc = inloc; + if (compmask & 0x1) { + vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x2) { + vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x4) { + /* .z <- 0.0f */ + vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x8) { + /* .w <- 1.0f */ + vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); + loc++; + } + } + } + + OUT_PKT4(ring, REG_A5XX_VPC_VARYING_INTERP_MODE(0), 8); + for (i = 0; i < 8; i++) + OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ + + OUT_PKT4(ring, REG_A5XX_VPC_VARYING_PS_REPL_MODE(0), 8); + for (i = 0; i < 8; i++) + OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ + } + + OUT_PKT4(ring, REG_A5XX_GRAS_VS_CL_CNTL, 1); + OUT_RING(ring, A5XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | + A5XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); + + OUT_PKT4(ring, REG_A5XX_VPC_CLIP_CNTL, 1); + OUT_RING(ring, A5XX_VPC_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | + A5XX_VPC_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A5XX_VPC_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + + OUT_PKT4(ring, REG_A5XX_PC_CLIP_CNTL, 1); + OUT_RING(ring, A5XX_PC_CLIP_CNTL_CLIP_MASK(clip_mask)); + + if (!emit->binning_pass) + if (s[FS].instrlen) + fd5_emit_shader(ring, s[FS].v); + + OUT_PKT4(ring, REG_A5XX_VFD_CONTROL_1, 5); + OUT_RING(ring, A5XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | + A5XX_VFD_CONTROL_1_REGID4INST(instance_regid) | 0xfc0000); + OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_2 */ + OUT_RING(ring, 0x0000fcfc); /* VFD_CONTROL_3 */ + OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ + OUT_RING(ring, 0x00000000); /* VFD_CONTROL_5 */ } static struct ir3_program_state * fd5_program_create(void *data, struct ir3_shader_variant *bs, - struct ir3_shader_variant *vs, - struct ir3_shader_variant *hs, - struct ir3_shader_variant *ds, - struct ir3_shader_variant *gs, - struct ir3_shader_variant *fs, - const struct ir3_shader_key *key) - in_dt + struct ir3_shader_variant *vs, struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, struct ir3_shader_variant *gs, + struct ir3_shader_variant *fs, + const struct ir3_shader_key *key) in_dt { - struct fd_context *ctx = fd_context(data); - struct fd5_program_state *state = CALLOC_STRUCT(fd5_program_state); + struct fd_context *ctx = fd_context(data); + struct fd5_program_state *state = CALLOC_STRUCT(fd5_program_state); - tc_assert_driver_thread(ctx->tc); + tc_assert_driver_thread(ctx->tc); - state->bs = bs; - state->vs = vs; - state->fs = fs; + state->bs = bs; + state->vs = vs; + state->fs = fs; - return &state->base; + return &state->base; } static void fd5_program_destroy(void *data, struct ir3_program_state *state) { - struct fd5_program_state *so = fd5_program_state(state); - free(so); + struct fd5_program_state *so = fd5_program_state(state); + free(so); } static const struct ir3_cache_funcs cache_funcs = { - .create_state = fd5_program_create, - .destroy_state = fd5_program_destroy, + .create_state = fd5_program_create, + .destroy_state = fd5_program_destroy, }; void fd5_prog_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); - ir3_prog_init(pctx); - fd_prog_init(pctx); + ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); + ir3_prog_init(pctx); + fd_prog_init(pctx); } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.h b/src/gallium/drivers/freedreno/a5xx/fd5_program.h 
index d0fff12..59c499e 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_program.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.h @@ -36,22 +36,23 @@ struct fd5_emit; struct fd5_program_state { - struct ir3_program_state base; - struct ir3_shader_variant *bs; /* VS for when emit->binning */ - struct ir3_shader_variant *vs; - struct ir3_shader_variant *fs; /* FS for when !emit->binning */ + struct ir3_program_state base; + struct ir3_shader_variant *bs; /* VS for when emit->binning */ + struct ir3_shader_variant *vs; + struct ir3_shader_variant *fs; /* FS for when !emit->binning */ }; static inline struct fd5_program_state * fd5_program_state(struct ir3_program_state *state) { - return (struct fd5_program_state *)state; + return (struct fd5_program_state *)state; } -void fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so); +void fd5_emit_shader(struct fd_ringbuffer *ring, + const struct ir3_shader_variant *so); void fd5_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct fd5_emit *emit); + struct fd5_emit *emit); void fd5_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_query.c b/src/gallium/drivers/freedreno/a5xx/fd5_query.c index fc8251e..af04654 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_query.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_query.c @@ -35,21 +35,20 @@ #include "fd5_query.h" struct PACKED fd5_query_sample { - uint64_t start; - uint64_t result; - uint64_t stop; + uint64_t start; + uint64_t result; + uint64_t stop; }; /* offset of a single field of an array of fd5_query_sample: */ -#define query_sample_idx(aq, idx, field) \ - fd_resource((aq)->prsc)->bo, \ - (idx * sizeof(struct fd5_query_sample)) + \ - offsetof(struct fd5_query_sample, field), \ - 0, 0 +#define query_sample_idx(aq, idx, field) \ + fd_resource((aq)->prsc)->bo, \ + (idx * sizeof(struct fd5_query_sample)) + \ + offsetof(struct fd5_query_sample, field), \ + 0, 0 /* offset of a single field of fd5_query_sample: */ -#define query_sample(aq, field) \ - query_sample_idx(aq, 0, field) +#define query_sample(aq, field) query_sample_idx(aq, 0, field) /* * Occlusion Query: @@ -61,98 +60,97 @@ struct PACKED fd5_query_sample { static void occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY); + OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY); - OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2); - OUT_RELOC(ring, query_sample(aq, start)); + OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2); + OUT_RELOC(ring, query_sample(aq, start)); - fd5_event_write(batch, ring, ZPASS_DONE, false); - fd_reset_wfi(batch); + fd5_event_write(batch, ring, ZPASS_DONE, false); + fd_reset_wfi(batch); - fd5_context(batch->ctx)->samples_passed_queries++; + fd5_context(batch->ctx)->samples_passed_queries++; } static void occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - OUT_PKT7(ring, CP_MEM_WRITE, 4); - OUT_RELOC(ring, query_sample(aq, stop)); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0xffffffff); + OUT_PKT7(ring, CP_MEM_WRITE, 4); + OUT_RELOC(ring, query_sample(aq, stop)); + OUT_RING(ring, 0xffffffff); + OUT_RING(ring, 0xffffffff); - OUT_PKT7(ring, 
CP_WAIT_MEM_WRITES, 0); + OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); - OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY); + OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A5XX_RB_SAMPLE_COUNT_CONTROL_COPY); - OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2); - OUT_RELOC(ring, query_sample(aq, stop)); + OUT_PKT4(ring, REG_A5XX_RB_SAMPLE_COUNT_ADDR_LO, 2); + OUT_RELOC(ring, query_sample(aq, stop)); - fd5_event_write(batch, ring, ZPASS_DONE, false); - fd_reset_wfi(batch); + fd5_event_write(batch, ring, ZPASS_DONE, false); + fd_reset_wfi(batch); - OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); - OUT_RING(ring, 0x00000014); // XXX - OUT_RELOC(ring, query_sample(aq, stop)); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0x00000010); // XXX + OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); + OUT_RING(ring, 0x00000014); // XXX + OUT_RELOC(ring, query_sample(aq, stop)); + OUT_RING(ring, 0xffffffff); + OUT_RING(ring, 0xffffffff); + OUT_RING(ring, 0x00000010); // XXX - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample(aq, result)); /* dst */ - OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ - OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ - OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + OUT_RELOC(ring, query_sample(aq, result)); /* dst */ + OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ + OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ + OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ - fd5_context(batch->ctx)->samples_passed_queries--; + fd5_context(batch->ctx)->samples_passed_queries--; } static void occlusion_counter_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd5_query_sample *sp = buf; - result->u64 = sp->result; + struct fd5_query_sample *sp = buf; + result->u64 = sp->result; } static void occlusion_predicate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd5_query_sample *sp = buf; - result->b = !!sp->result; + struct fd5_query_sample *sp = buf; + result->b = !!sp->result; } static const struct fd_acc_sample_provider occlusion_counter = { - .query_type = PIPE_QUERY_OCCLUSION_COUNTER, - .size = sizeof(struct fd5_query_sample), - .resume = occlusion_resume, - .pause = occlusion_pause, - .result = occlusion_counter_result, + .query_type = PIPE_QUERY_OCCLUSION_COUNTER, + .size = sizeof(struct fd5_query_sample), + .resume = occlusion_resume, + .pause = occlusion_pause, + .result = occlusion_counter_result, }; static const struct fd_acc_sample_provider occlusion_predicate = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, - .size = sizeof(struct fd5_query_sample), - .resume = occlusion_resume, - .pause = occlusion_pause, - .result = occlusion_predicate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, + .size = sizeof(struct fd5_query_sample), + .resume = occlusion_resume, + .pause = occlusion_pause, + .result = occlusion_predicate_result, }; static const struct fd_acc_sample_provider occlusion_predicate_conservative = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, - .size = sizeof(struct fd5_query_sample), - .resume = occlusion_resume, - .pause = 
occlusion_pause, - .result = occlusion_predicate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, + .size = sizeof(struct fd5_query_sample), + .resume = occlusion_resume, + .pause = occlusion_pause, + .result = occlusion_predicate_result, }; /* @@ -160,78 +158,75 @@ static const struct fd_acc_sample_provider occlusion_predicate_conservative = { */ static void -timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | - CP_EVENT_WRITE_0_TIMESTAMP); - OUT_RELOC(ring, query_sample(aq, start)); - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, + CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP); + OUT_RELOC(ring, query_sample(aq, start)); + OUT_RING(ring, 0x00000000); - fd_reset_wfi(batch); + fd_reset_wfi(batch); } static void -timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; - - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | - CP_EVENT_WRITE_0_TIMESTAMP); - OUT_RELOC(ring, query_sample(aq, stop)); - OUT_RING(ring, 0x00000000); - - fd_reset_wfi(batch); - fd_wfi(batch, ring); - - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample(aq, result)); /* dst */ - OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ - OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ - OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ + struct fd_ringbuffer *ring = batch->draw; + + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, + CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP); + OUT_RELOC(ring, query_sample(aq, stop)); + OUT_RING(ring, 0x00000000); + + fd_reset_wfi(batch); + fd_wfi(batch, ring); + + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + OUT_RELOC(ring, query_sample(aq, result)); /* dst */ + OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ + OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ + OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ } static uint64_t ticks_to_ns(uint32_t ts) { - /* This is based on the 19.2MHz always-on rbbm timer. - * - * TODO we should probably query this value from kernel.. - */ - return ts * (1000000000 / 19200000); + /* This is based on the 19.2MHz always-on rbbm timer. + * + * TODO we should probably query this value from kernel.. 
+ */ + return ts * (1000000000 / 19200000); } static void time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd5_query_sample *sp = buf; - result->u64 = ticks_to_ns(sp->result); + struct fd5_query_sample *sp = buf; + result->u64 = ticks_to_ns(sp->result); } static void timestamp_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd5_query_sample *sp = buf; - result->u64 = ticks_to_ns(sp->result); + struct fd5_query_sample *sp = buf; + result->u64 = ticks_to_ns(sp->result); } static const struct fd_acc_sample_provider time_elapsed = { - .query_type = PIPE_QUERY_TIME_ELAPSED, - .always = true, - .size = sizeof(struct fd5_query_sample), - .resume = timestamp_resume, - .pause = timestamp_pause, - .result = time_elapsed_accumulate_result, + .query_type = PIPE_QUERY_TIME_ELAPSED, + .always = true, + .size = sizeof(struct fd5_query_sample), + .resume = timestamp_resume, + .pause = timestamp_pause, + .result = time_elapsed_accumulate_result, }; /* NOTE: timestamp query isn't going to give terribly sensible results @@ -242,12 +237,12 @@ static const struct fd_acc_sample_provider time_elapsed = { */ static const struct fd_acc_sample_provider timestamp = { - .query_type = PIPE_QUERY_TIMESTAMP, - .always = true, - .size = sizeof(struct fd5_query_sample), - .resume = timestamp_resume, - .pause = timestamp_pause, - .result = timestamp_accumulate_result, + .query_type = PIPE_QUERY_TIMESTAMP, + .always = true, + .size = sizeof(struct fd5_query_sample), + .resume = timestamp_resume, + .pause = timestamp_pause, + .result = timestamp_accumulate_result, }; /* @@ -260,208 +255,204 @@ static const struct fd_acc_sample_provider timestamp = { */ struct fd_batch_query_entry { - uint8_t gid; /* group-id */ - uint8_t cid; /* countable-id within the group */ + uint8_t gid; /* group-id */ + uint8_t cid; /* countable-id within the group */ }; struct fd_batch_query_data { - struct fd_screen *screen; - unsigned num_query_entries; - struct fd_batch_query_entry query_entries[]; + struct fd_screen *screen; + unsigned num_query_entries; + struct fd_batch_query_entry query_entries[]; }; static void -perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_batch_query_data *data = aq->query_data; - struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; + struct fd_batch_query_data *data = aq->query_data; + struct fd_screen *screen = data->screen; + struct fd_ringbuffer *ring = batch->draw; - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_wfi(batch, ring); + fd_wfi(batch, ring); - /* configure performance counters for the requested queries: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; + /* configure performance counters for the requested queries: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = 
&screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; - debug_assert(counter_idx < g->num_counters); + debug_assert(counter_idx < g->num_counters); - OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1); - OUT_RING(ring, g->countables[entry->cid].selector); - } + OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1); + OUT_RING(ring, g->countables[entry->cid].selector); + } - memset(counters_per_group, 0, sizeof(counters_per_group)); + memset(counters_per_group, 0, sizeof(counters_per_group)); - /* and snapshot the start values */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + /* and snapshot the start values */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; + const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); - OUT_RELOC(ring, query_sample_idx(aq, i, start)); - } + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); + OUT_RELOC(ring, query_sample_idx(aq, i, start)); + } } static void -perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_batch_query_data *data = aq->query_data; - struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; - - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - - fd_wfi(batch, ring); - - /* TODO do we need to bother to turn anything off? */ - - /* snapshot the end values: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); - } - - /* and compute the result: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */ - OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */ - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); /* srcB */ - OUT_RELOC(ring, query_sample_idx(aq, i, start)); /* srcC */ - } + struct fd_batch_query_data *data = aq->query_data; + struct fd_screen *screen = data->screen; + struct fd_ringbuffer *ring = batch->draw; + + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); + + fd_wfi(batch, ring); + + /* TODO do we need to bother to turn anything off? 
*/ + + /* snapshot the end values: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; + const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); + OUT_RELOC(ring, query_sample_idx(aq, i, stop)); + } + + /* and compute the result: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */ + OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */ + OUT_RELOC(ring, query_sample_idx(aq, i, stop)); /* srcB */ + OUT_RELOC(ring, query_sample_idx(aq, i, start)); /* srcC */ + } } static void perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd_batch_query_data *data = aq->query_data; - struct fd5_query_sample *sp = buf; + struct fd_batch_query_data *data = aq->query_data; + struct fd5_query_sample *sp = buf; - for (unsigned i = 0; i < data->num_query_entries; i++) { - result->batch[i].u64 = sp[i].result; - } + for (unsigned i = 0; i < data->num_query_entries; i++) { + result->batch[i].u64 = sp[i].result; + } } static const struct fd_acc_sample_provider perfcntr = { - .query_type = FD_QUERY_FIRST_PERFCNTR, - .always = true, - .resume = perfcntr_resume, - .pause = perfcntr_pause, - .result = perfcntr_accumulate_result, + .query_type = FD_QUERY_FIRST_PERFCNTR, + .always = true, + .resume = perfcntr_resume, + .pause = perfcntr_pause, + .result = perfcntr_accumulate_result, }; static struct pipe_query * -fd5_create_batch_query(struct pipe_context *pctx, - unsigned num_queries, unsigned *query_types) +fd5_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) { - struct fd_context *ctx = fd_context(pctx); - struct fd_screen *screen = ctx->screen; - struct fd_query *q; - struct fd_acc_query *aq; - struct fd_batch_query_data *data; - - data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data, - num_queries * sizeof(data->query_entries[0])); - - data->screen = screen; - data->num_query_entries = num_queries; - - /* validate the requested query_types and ensure we don't try - * to request more query_types of a given group than we have - * counters: - */ - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - - for (unsigned i = 0; i < num_queries; i++) { - unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; - - /* verify valid query_type, ie. is it actually a perfcntr? */ - if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) || - (idx >= screen->num_perfcntr_queries)) { - mesa_loge("invalid batch query query_type: %u", query_types[i]); - goto error; - } - - struct fd_batch_query_entry *entry = &data->query_entries[i]; - struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx]; - - entry->gid = pq->group_id; - - /* the perfcntr_queries[] table flattens all the countables - * for each group in series, ie: - * - * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ... 
- * - * So to find the countable index just step back through the - * table to find the first entry with the same group-id. - */ - while (pq > screen->perfcntr_queries) { - pq--; - if (pq->group_id == entry->gid) - entry->cid++; - } - - if (counters_per_group[entry->gid] >= - screen->perfcntr_groups[entry->gid].num_counters) { - mesa_loge("too many counters for group %u\n", entry->gid); - goto error; - } - - counters_per_group[entry->gid]++; - } - - q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); - aq = fd_acc_query(q); - - /* sample buffer size is based on # of queries: */ - aq->size = num_queries * sizeof(struct fd5_query_sample); - aq->query_data = data; - - return (struct pipe_query *)q; + struct fd_context *ctx = fd_context(pctx); + struct fd_screen *screen = ctx->screen; + struct fd_query *q; + struct fd_acc_query *aq; + struct fd_batch_query_data *data; + + data = CALLOC_VARIANT_LENGTH_STRUCT( + fd_batch_query_data, num_queries * sizeof(data->query_entries[0])); + + data->screen = screen; + data->num_query_entries = num_queries; + + /* validate the requested query_types and ensure we don't try + * to request more query_types of a given group than we have + * counters: + */ + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); + + for (unsigned i = 0; i < num_queries; i++) { + unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; + + /* verify valid query_type, ie. is it actually a perfcntr? */ + if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) || + (idx >= screen->num_perfcntr_queries)) { + mesa_loge("invalid batch query query_type: %u", query_types[i]); + goto error; + } + + struct fd_batch_query_entry *entry = &data->query_entries[i]; + struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx]; + + entry->gid = pq->group_id; + + /* the perfcntr_queries[] table flattens all the countables + * for each group in series, ie: + * + * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ... + * + * So to find the countable index just step back through the + * table to find the first entry with the same group-id. 
+ */ + while (pq > screen->perfcntr_queries) { + pq--; + if (pq->group_id == entry->gid) + entry->cid++; + } + + if (counters_per_group[entry->gid] >= + screen->perfcntr_groups[entry->gid].num_counters) { + mesa_loge("too many counters for group %u\n", entry->gid); + goto error; + } + + counters_per_group[entry->gid]++; + } + + q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); + aq = fd_acc_query(q); + + /* sample buffer size is based on # of queries: */ + aq->size = num_queries * sizeof(struct fd5_query_sample); + aq->query_data = data; + + return (struct pipe_query *)q; error: - free(data); - return NULL; + free(data); + return NULL; } void -fd5_query_context_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd5_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->create_query = fd_acc_create_query; - ctx->query_update_batch = fd_acc_query_update_batch; + ctx->create_query = fd_acc_create_query; + ctx->query_update_batch = fd_acc_query_update_batch; - pctx->create_batch_query = fd5_create_batch_query; + pctx->create_batch_query = fd5_create_batch_query; - fd_acc_query_register_provider(pctx, &occlusion_counter); - fd_acc_query_register_provider(pctx, &occlusion_predicate); - fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative); + fd_acc_query_register_provider(pctx, &occlusion_counter); + fd_acc_query_register_provider(pctx, &occlusion_predicate); + fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative); - fd_acc_query_register_provider(pctx, &time_elapsed); - fd_acc_query_register_provider(pctx, ×tamp); + fd_acc_query_register_provider(pctx, &time_elapsed); + fd_acc_query_register_provider(pctx, ×tamp); } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c index 2bbcbf2..c625c25 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.c @@ -24,75 +24,73 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd5_rasterizer.h" #include "fd5_context.h" #include "fd5_format.h" +#include "fd5_rasterizer.h" void * fd5_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) + const struct pipe_rasterizer_state *cso) { - struct fd5_rasterizer_stateobj *so; - float psize_min, psize_max; + struct fd5_rasterizer_stateobj *so; + float psize_min, psize_max; - so = CALLOC_STRUCT(fd5_rasterizer_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd5_rasterizer_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - if (cso->point_size_per_vertex) { - psize_min = util_get_min_point_size(cso); - psize_max = 4092; - } else { - /* Force the point size to be as if the vertex output was disabled. */ - psize_min = cso->point_size; - psize_max = cso->point_size; - } + if (cso->point_size_per_vertex) { + psize_min = util_get_min_point_size(cso); + psize_max = 4092; + } else { + /* Force the point size to be as if the vertex output was disabled. 
*/ + psize_min = cso->point_size; + psize_max = cso->point_size; + } - so->gras_su_point_minmax = - A5XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | - A5XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); - so->gras_su_point_size = A5XX_GRAS_SU_POINT_SIZE(cso->point_size); - so->gras_su_poly_offset_scale = - A5XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); - so->gras_su_poly_offset_offset = - A5XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units); - so->gras_su_poly_offset_clamp = - A5XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(cso->offset_clamp); + so->gras_su_point_minmax = A5XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) | + A5XX_GRAS_SU_POINT_MINMAX_MAX(psize_max); + so->gras_su_point_size = A5XX_GRAS_SU_POINT_SIZE(cso->point_size); + so->gras_su_poly_offset_scale = + A5XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale); + so->gras_su_poly_offset_offset = + A5XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units); + so->gras_su_poly_offset_clamp = + A5XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(cso->offset_clamp); - so->gras_su_cntl = - A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(cso->line_width/2.0); - so->pc_raster_cntl = - A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE(fd_polygon_mode(cso->fill_front)) | - A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); + so->gras_su_cntl = A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(cso->line_width / 2.0); + so->pc_raster_cntl = + A5XX_PC_RASTER_CNTL_POLYMODE_FRONT_PTYPE( + fd_polygon_mode(cso->fill_front)) | + A5XX_PC_RASTER_CNTL_POLYMODE_BACK_PTYPE(fd_polygon_mode(cso->fill_back)); - if (cso->fill_front != PIPE_POLYGON_MODE_FILL || - cso->fill_back != PIPE_POLYGON_MODE_FILL) - so->pc_raster_cntl |= A5XX_PC_RASTER_CNTL_POLYMODE_ENABLE; + if (cso->fill_front != PIPE_POLYGON_MODE_FILL || + cso->fill_back != PIPE_POLYGON_MODE_FILL) + so->pc_raster_cntl |= A5XX_PC_RASTER_CNTL_POLYMODE_ENABLE; - if (cso->cull_face & PIPE_FACE_FRONT) - so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT; - if (cso->cull_face & PIPE_FACE_BACK) - so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK; - if (!cso->front_ccw) - so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_FRONT_CW; - if (cso->offset_tri) - so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_POLY_OFFSET; + if (cso->cull_face & PIPE_FACE_FRONT) + so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_FRONT; + if (cso->cull_face & PIPE_FACE_BACK) + so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_CULL_BACK; + if (!cso->front_ccw) + so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_FRONT_CW; + if (cso->offset_tri) + so->gras_su_cntl |= A5XX_GRAS_SU_CNTL_POLY_OFFSET; - if (!cso->flatshade_first) - so->pc_primitive_cntl |= A5XX_PC_PRIMITIVE_CNTL_PROVOKING_VTX_LAST; + if (!cso->flatshade_first) + so->pc_primitive_cntl |= A5XX_PC_PRIMITIVE_CNTL_PROVOKING_VTX_LAST; -// if (!cso->depth_clip) -// so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE | -// A5XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE; - if (cso->clip_halfz) - so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CNTL_ZERO_GB_SCALE_Z; + // if (!cso->depth_clip) + // so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE + //| A5XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE; + if (cso->clip_halfz) + so->gras_cl_clip_cntl |= A5XX_GRAS_CL_CNTL_ZERO_GB_SCALE_Z; - return so; + return so; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h index b597581..14079d3 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_rasterizer.h @@ -27,31 +27,31 @@ #ifndef FD5_RASTERIZER_H_ #define FD5_RASTERIZER_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" 
+#include "pipe/p_state.h" struct fd5_rasterizer_stateobj { - struct pipe_rasterizer_state base; - - uint32_t gras_su_point_minmax; - uint32_t gras_su_point_size; - uint32_t gras_su_poly_offset_scale; - uint32_t gras_su_poly_offset_offset; - uint32_t gras_su_poly_offset_clamp; - - uint32_t gras_su_cntl; - uint32_t gras_cl_clip_cntl; - uint32_t pc_primitive_cntl; - uint32_t pc_raster_cntl; + struct pipe_rasterizer_state base; + + uint32_t gras_su_point_minmax; + uint32_t gras_su_point_size; + uint32_t gras_su_poly_offset_scale; + uint32_t gras_su_poly_offset_offset; + uint32_t gras_su_poly_offset_clamp; + + uint32_t gras_su_cntl; + uint32_t gras_cl_clip_cntl; + uint32_t pc_primitive_cntl; + uint32_t pc_raster_cntl; }; static inline struct fd5_rasterizer_stateobj * fd5_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { - return (struct fd5_rasterizer_stateobj *)rast; + return (struct fd5_rasterizer_stateobj *)rast; } -void * fd5_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso); +void *fd5_rasterizer_state_create(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso); #endif /* FD5_RASTERIZER_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_resource.c b/src/gallium/drivers/freedreno/a5xx/fd5_resource.c index c164a75..8a04c3d 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_resource.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_resource.c @@ -29,43 +29,42 @@ static void setup_lrz(struct fd_resource *rsc) { - struct fd_screen *screen = fd_screen(rsc->b.b.screen); - const uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ - unsigned lrz_pitch = align(DIV_ROUND_UP(rsc->b.b.width0, 8), 64); - unsigned lrz_height = DIV_ROUND_UP(rsc->b.b.height0, 8); + struct fd_screen *screen = fd_screen(rsc->b.b.screen); + const uint32_t flags = + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ + unsigned lrz_pitch = align(DIV_ROUND_UP(rsc->b.b.width0, 8), 64); + unsigned lrz_height = DIV_ROUND_UP(rsc->b.b.height0, 8); - /* LRZ buffer is super-sampled: */ - switch (rsc->b.b.nr_samples) { - case 4: - lrz_pitch *= 2; - FALLTHROUGH; - case 2: - lrz_height *= 2; - } + /* LRZ buffer is super-sampled: */ + switch (rsc->b.b.nr_samples) { + case 4: + lrz_pitch *= 2; + FALLTHROUGH; + case 2: + lrz_height *= 2; + } - unsigned size = lrz_pitch * lrz_height * 2; + unsigned size = lrz_pitch * lrz_height * 2; - size += 0x1000; /* for GRAS_LRZ_FAST_CLEAR_BUFFER */ + size += 0x1000; /* for GRAS_LRZ_FAST_CLEAR_BUFFER */ - rsc->lrz_height = lrz_height; - rsc->lrz_width = lrz_pitch; - rsc->lrz_pitch = lrz_pitch; - rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); + rsc->lrz_height = lrz_height; + rsc->lrz_width = lrz_pitch; + rsc->lrz_pitch = lrz_pitch; + rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); } uint32_t fd5_setup_slices(struct fd_resource *rsc) { - struct pipe_resource *prsc = &rsc->b.b; + struct pipe_resource *prsc = &rsc->b.b; - if (FD_DBG(LRZ) && has_depth(rsc->b.b.format)) - setup_lrz(rsc); + if (FD_DBG(LRZ) && has_depth(rsc->b.b.format)) + setup_lrz(rsc); - fdl5_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), - prsc->width0, prsc->height0, prsc->depth0, - prsc->last_level + 1, prsc->array_size, - prsc->target == PIPE_TEXTURE_3D); + fdl5_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), + prsc->width0, prsc->height0, prsc->depth0, prsc->last_level + 1, + prsc->array_size, prsc->target == PIPE_TEXTURE_3D); - return 
rsc->layout.size; + return rsc->layout.size; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c index 7ba2e6a..1393abf 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c @@ -27,112 +27,106 @@ #include "pipe/p_screen.h" #include "util/format/u_format.h" -#include "fd5_screen.h" #include "fd5_blitter.h" #include "fd5_context.h" -#include "fd5_format.h" #include "fd5_emit.h" +#include "fd5_format.h" #include "fd5_resource.h" +#include "fd5_screen.h" #include "ir3/ir3_compiler.h" static bool valid_sample_count(unsigned sample_count) { - switch (sample_count) { - case 0: - case 1: - case 2: - case 4: - return true; - default: - return false; - } + switch (sample_count) { + case 0: + case 1: + case 2: + case 4: + return true; + default: + return false; + } } static bool fd5_screen_is_format_supported(struct pipe_screen *pscreen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - unsigned retval = 0; - - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - !valid_sample_count(sample_count)) { - DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", - util_format_name(format), target, sample_count, usage); - return false; - } - - if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) - return false; - - if ((usage & PIPE_BIND_VERTEX_BUFFER) && - (fd5_pipe2vtx(format) != VFMT5_NONE)) { - retval |= PIPE_BIND_VERTEX_BUFFER; - } - - if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && - (fd5_pipe2tex(format) != TFMT5_NONE) && - (target == PIPE_BUFFER || - util_format_get_blocksize(format) != 12)) { - retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); - } - - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_COMPUTE_RESOURCE)) && - (fd5_pipe2color(format) != RB5_NONE) && - (fd5_pipe2tex(format) != TFMT5_NONE)) { - retval |= usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_COMPUTE_RESOURCE); - } - - /* For ARB_framebuffer_no_attachments: */ - if ((usage & PIPE_BIND_RENDER_TARGET) && (format == PIPE_FORMAT_NONE)) { - retval |= usage & PIPE_BIND_RENDER_TARGET; - } - - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (fd5_pipe2depth(format) != (enum a5xx_depth_format)~0) && - (fd5_pipe2tex(format) != TFMT5_NONE)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } - - if ((usage & PIPE_BIND_INDEX_BUFFER) && - (fd_pipe2index(format) != (enum pc_di_index_size)~0)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } - - if (retval != usage) { - DBG("not supported: format=%s, target=%d, sample_count=%d, " - "usage=%x, retval=%x", util_format_name(format), - target, sample_count, usage, retval); - } - - return retval == usage; + unsigned retval = 0; + + if ((target >= PIPE_MAX_TEXTURE_TYPES) || + !valid_sample_count(sample_count)) { + DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", + util_format_name(format), target, sample_count, usage); + return false; + } + + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; + + if ((usage & PIPE_BIND_VERTEX_BUFFER) && + (fd5_pipe2vtx(format) != VFMT5_NONE)) { + retval |= PIPE_BIND_VERTEX_BUFFER; + } + + if ((usage & 
(PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && + (fd5_pipe2tex(format) != TFMT5_NONE) && + (target == PIPE_BUFFER || util_format_get_blocksize(format) != 12)) { + retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); + } + + if ((usage & + (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_COMPUTE_RESOURCE)) && + (fd5_pipe2color(format) != RB5_NONE) && + (fd5_pipe2tex(format) != TFMT5_NONE)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | + PIPE_BIND_COMPUTE_RESOURCE); + } + + /* For ARB_framebuffer_no_attachments: */ + if ((usage & PIPE_BIND_RENDER_TARGET) && (format == PIPE_FORMAT_NONE)) { + retval |= usage & PIPE_BIND_RENDER_TARGET; + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && + (fd5_pipe2depth(format) != (enum a5xx_depth_format) ~0) && + (fd5_pipe2tex(format) != TFMT5_NONE)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if ((usage & PIPE_BIND_INDEX_BUFFER) && + (fd_pipe2index(format) != (enum pc_di_index_size) ~0)) { + retval |= PIPE_BIND_INDEX_BUFFER; + } + + if (retval != usage) { + DBG("not supported: format=%s, target=%d, sample_count=%d, " + "usage=%x, retval=%x", + util_format_name(format), target, sample_count, usage, retval); + } + + return retval == usage; } void fd5_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = A5XX_MAX_RENDER_TARGETS; - pscreen->context_create = fd5_context_create; - pscreen->is_format_supported = fd5_screen_is_format_supported; + struct fd_screen *screen = fd_screen(pscreen); + screen->max_rts = A5XX_MAX_RENDER_TARGETS; + pscreen->context_create = fd5_context_create; + pscreen->is_format_supported = fd5_screen_is_format_supported; - screen->setup_slices = fd5_setup_slices; - if (FD_DBG(TTILE)) - screen->tile_mode = fd5_tile_mode; + screen->setup_slices = fd5_setup_slices; + if (FD_DBG(TTILE)) + screen->tile_mode = fd5_tile_mode; - fd5_emit_init_screen(pscreen); - ir3_screen_init(pscreen); + fd5_emit_init_screen(pscreen); + ir3_screen_init(pscreen); } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.h b/src/gallium/drivers/freedreno/a5xx/fd5_screen.h index fa54075..152f300 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.h @@ -38,13 +38,13 @@ void fd5_screen_init(struct pipe_screen *pscreen); static inline void emit_marker5(struct fd_ringbuffer *ring, int scratch_idx) { - extern int32_t marker_cnt; - unsigned reg = REG_A5XX_CP_SCRATCH_REG(scratch_idx); - if (__EMIT_MARKER) { - OUT_WFI5(ring); - OUT_PKT4(ring, reg, 1); - OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); - } + extern int32_t marker_cnt; + unsigned reg = REG_A5XX_CP_SCRATCH_REG(scratch_idx); + if (__EMIT_MARKER) { + OUT_WFI5(ring); + OUT_PKT4(ring, reg, 1); + OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); + } } #endif /* FD5_SCREEN_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_texture.c b/src/gallium/drivers/freedreno/a5xx/fd5_texture.c index 6279038..e1b4e46 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_texture.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_texture.c @@ -25,263 +25,250 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "fd5_texture.h" #include "fd5_format.h" +#include 
"fd5_texture.h" static enum a5xx_tex_clamp tex_clamp(unsigned wrap, bool *needs_border) { - switch (wrap) { - case PIPE_TEX_WRAP_REPEAT: - return A5XX_TEX_REPEAT; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return A5XX_TEX_CLAMP_TO_EDGE; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - *needs_border = true; - return A5XX_TEX_CLAMP_TO_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - /* only works for PoT.. need to emulate otherwise! */ - return A5XX_TEX_MIRROR_CLAMP; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return A5XX_TEX_MIRROR_REPEAT; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - /* these two we could perhaps emulate, but we currently - * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP - */ - default: - DBG("invalid wrap: %u", wrap); - return 0; - } + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return A5XX_TEX_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return A5XX_TEX_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + *needs_border = true; + return A5XX_TEX_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + /* only works for PoT.. need to emulate otherwise! */ + return A5XX_TEX_MIRROR_CLAMP; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return A5XX_TEX_MIRROR_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + /* these two we could perhaps emulate, but we currently + * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP + */ + default: + DBG("invalid wrap: %u", wrap); + return 0; + } } static enum a5xx_tex_filter tex_filter(unsigned filter, bool aniso) { - switch (filter) { - case PIPE_TEX_FILTER_NEAREST: - return A5XX_TEX_NEAREST; - case PIPE_TEX_FILTER_LINEAR: - return aniso ? A5XX_TEX_ANISO : A5XX_TEX_LINEAR; - default: - DBG("invalid filter: %u", filter); - return 0; - } + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: + return A5XX_TEX_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + return aniso ? A5XX_TEX_ANISO : A5XX_TEX_LINEAR; + default: + DBG("invalid filter: %u", filter); + return 0; + } } static void * fd5_sampler_state_create(struct pipe_context *pctx, - const struct pipe_sampler_state *cso) + const struct pipe_sampler_state *cso) { - struct fd5_sampler_stateobj *so = CALLOC_STRUCT(fd5_sampler_stateobj); - unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); - bool miplinear = false; - - if (!so) - return NULL; - - so->base = *cso; - - if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - miplinear = true; - - so->needs_border = false; - so->texsamp0 = - COND(miplinear, A5XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | - A5XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | - A5XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | - A5XX_TEX_SAMP_0_ANISO(aniso) | - A5XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | - A5XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | - A5XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); - - so->texsamp1 = - COND(!cso->seamless_cube_map, A5XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | - COND(!cso->normalized_coords, A5XX_TEX_SAMP_1_UNNORM_COORDS); - - so->texsamp0 |= A5XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); - - if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { - so->texsamp1 |= - A5XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | - A5XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); - } else { - /* If we're not doing mipmap filtering, we still need a slightly > 0 - * LOD clamp so the HW can decide between min and mag filtering of - * level 0. 
- */ - so->texsamp1 |= - A5XX_TEX_SAMP_1_MIN_LOD(MIN2(cso->min_lod, 0.125)) | - A5XX_TEX_SAMP_1_MAX_LOD(MIN2(cso->max_lod, 0.125)); - } - - if (cso->compare_mode) - so->texsamp1 |= A5XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ - - return so; + struct fd5_sampler_stateobj *so = CALLOC_STRUCT(fd5_sampler_stateobj); + unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); + bool miplinear = false; + + if (!so) + return NULL; + + so->base = *cso; + + if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) + miplinear = true; + + so->needs_border = false; + so->texsamp0 = + COND(miplinear, A5XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | + A5XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | + A5XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | + A5XX_TEX_SAMP_0_ANISO(aniso) | + A5XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | + A5XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | + A5XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); + + so->texsamp1 = + COND(!cso->seamless_cube_map, A5XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | + COND(!cso->normalized_coords, A5XX_TEX_SAMP_1_UNNORM_COORDS); + + so->texsamp0 |= A5XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); + + if (cso->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + so->texsamp1 |= A5XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | + A5XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); + } else { + /* If we're not doing mipmap filtering, we still need a slightly > 0 + * LOD clamp so the HW can decide between min and mag filtering of + * level 0. + */ + so->texsamp1 |= A5XX_TEX_SAMP_1_MIN_LOD(MIN2(cso->min_lod, 0.125)) | + A5XX_TEX_SAMP_1_MAX_LOD(MIN2(cso->max_lod, 0.125)); + } + + if (cso->compare_mode) + so->texsamp1 |= + A5XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ + + return so; } static bool use_astc_srgb_workaround(struct pipe_context *pctx, enum pipe_format format) { - return false; // TODO check if this is still needed on a5xx + return false; // TODO check if this is still needed on a5xx } static struct pipe_sampler_view * fd5_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, - const struct pipe_sampler_view *cso) + const struct pipe_sampler_view *cso) { - struct fd5_pipe_sampler_view *so = CALLOC_STRUCT(fd5_pipe_sampler_view); - struct fd_resource *rsc = fd_resource(prsc); - enum pipe_format format = cso->format; - unsigned lvl, layers = 0; - - if (!so) - return NULL; - - if (format == PIPE_FORMAT_X32_S8X24_UINT) { - rsc = rsc->stencil; - format = rsc->b.b.format; - } - - so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; - so->base.reference.count = 1; - so->base.context = pctx; - - so->texconst0 = - A5XX_TEX_CONST_0_FMT(fd5_pipe2tex(format)) | - A5XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) | - fd5_tex_swiz(format, cso->swizzle_r, cso->swizzle_g, - cso->swizzle_b, cso->swizzle_a); - - /* NOTE: since we sample z24s8 using 8888_UINT format, the swizzle - * we get isn't quite right. Use SWAP(XYZW) as a cheap and cheerful - * way to re-arrange things so stencil component is where the swiz - * expects. - * - * Note that gallium expects stencil sampler to return (s,s,s,s) - * which isn't quite true. To make that happen we'd have to massage - * the swizzle. But in practice only the .x component is used. 
- */ - if (format == PIPE_FORMAT_X24S8_UINT) { - so->texconst0 |= A5XX_TEX_CONST_0_SWAP(XYZW); - } - - if (util_format_is_srgb(format)) { - if (use_astc_srgb_workaround(pctx, format)) - so->astc_srgb = true; - so->texconst0 |= A5XX_TEX_CONST_0_SRGB; - } - - if (cso->target == PIPE_BUFFER) { - unsigned elements = cso->u.buf.size / util_format_get_blocksize(format); - - lvl = 0; - so->texconst1 = - A5XX_TEX_CONST_1_WIDTH(elements & MASK(15)) | - A5XX_TEX_CONST_1_HEIGHT(elements >> 15); - so->texconst2 = - A5XX_TEX_CONST_2_UNK4 | - A5XX_TEX_CONST_2_UNK31; - so->offset = cso->u.buf.offset; - } else { - unsigned miplevels; - - lvl = fd_sampler_first_level(cso); - miplevels = fd_sampler_last_level(cso) - lvl; - layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; - - so->texconst0 |= A5XX_TEX_CONST_0_MIPLVLS(miplevels); - so->texconst1 = - A5XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | - A5XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); - so->texconst2 = - A5XX_TEX_CONST_2_PITCHALIGN(rsc->layout.pitchalign - 6) | - A5XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); - so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); - } - - so->texconst2 |= A5XX_TEX_CONST_2_TYPE(fd5_tex_type(cso->target)); - - switch (cso->target) { - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_2D: - so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); - so->texconst5 = - A5XX_TEX_CONST_5_DEPTH(1); - break; - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); - so->texconst5 = - A5XX_TEX_CONST_5_DEPTH(layers); - break; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - so->texconst3 = - A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); - so->texconst5 = - A5XX_TEX_CONST_5_DEPTH(layers / 6); - break; - case PIPE_TEXTURE_3D: - so->texconst3 = - A5XX_TEX_CONST_3_MIN_LAYERSZ( - fd_resource_slice(rsc, prsc->last_level)->size0) | - A5XX_TEX_CONST_3_ARRAY_PITCH(fd_resource_slice(rsc, lvl)->size0); - so->texconst5 = - A5XX_TEX_CONST_5_DEPTH(u_minify(prsc->depth0, lvl)); - break; - default: - so->texconst3 = 0x00000000; - break; - } - - return &so->base; + struct fd5_pipe_sampler_view *so = CALLOC_STRUCT(fd5_pipe_sampler_view); + struct fd_resource *rsc = fd_resource(prsc); + enum pipe_format format = cso->format; + unsigned lvl, layers = 0; + + if (!so) + return NULL; + + if (format == PIPE_FORMAT_X32_S8X24_UINT) { + rsc = rsc->stencil; + format = rsc->b.b.format; + } + + so->base = *cso; + pipe_reference(NULL, &prsc->reference); + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + + so->texconst0 = A5XX_TEX_CONST_0_FMT(fd5_pipe2tex(format)) | + A5XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) | + fd5_tex_swiz(format, cso->swizzle_r, cso->swizzle_g, + cso->swizzle_b, cso->swizzle_a); + + /* NOTE: since we sample z24s8 using 8888_UINT format, the swizzle + * we get isn't quite right. Use SWAP(XYZW) as a cheap and cheerful + * way to re-arrange things so stencil component is where the swiz + * expects. + * + * Note that gallium expects stencil sampler to return (s,s,s,s) + * which isn't quite true. To make that happen we'd have to massage + * the swizzle. But in practice only the .x component is used. 
+ */ + if (format == PIPE_FORMAT_X24S8_UINT) { + so->texconst0 |= A5XX_TEX_CONST_0_SWAP(XYZW); + } + + if (util_format_is_srgb(format)) { + if (use_astc_srgb_workaround(pctx, format)) + so->astc_srgb = true; + so->texconst0 |= A5XX_TEX_CONST_0_SRGB; + } + + if (cso->target == PIPE_BUFFER) { + unsigned elements = cso->u.buf.size / util_format_get_blocksize(format); + + lvl = 0; + so->texconst1 = A5XX_TEX_CONST_1_WIDTH(elements & MASK(15)) | + A5XX_TEX_CONST_1_HEIGHT(elements >> 15); + so->texconst2 = A5XX_TEX_CONST_2_UNK4 | A5XX_TEX_CONST_2_UNK31; + so->offset = cso->u.buf.offset; + } else { + unsigned miplevels; + + lvl = fd_sampler_first_level(cso); + miplevels = fd_sampler_last_level(cso) - lvl; + layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; + + so->texconst0 |= A5XX_TEX_CONST_0_MIPLVLS(miplevels); + so->texconst1 = A5XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | + A5XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); + so->texconst2 = A5XX_TEX_CONST_2_PITCHALIGN(rsc->layout.pitchalign - 6) | + A5XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); + so->offset = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); + } + + so->texconst2 |= A5XX_TEX_CONST_2_TYPE(fd5_tex_type(cso->target)); + + switch (cso->target) { + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_2D: + so->texconst3 = A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); + so->texconst5 = A5XX_TEX_CONST_5_DEPTH(1); + break; + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + so->texconst3 = A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); + so->texconst5 = A5XX_TEX_CONST_5_DEPTH(layers); + break; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + so->texconst3 = A5XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); + so->texconst5 = A5XX_TEX_CONST_5_DEPTH(layers / 6); + break; + case PIPE_TEXTURE_3D: + so->texconst3 = + A5XX_TEX_CONST_3_MIN_LAYERSZ( + fd_resource_slice(rsc, prsc->last_level)->size0) | + A5XX_TEX_CONST_3_ARRAY_PITCH(fd_resource_slice(rsc, lvl)->size0); + so->texconst5 = A5XX_TEX_CONST_5_DEPTH(u_minify(prsc->depth0, lvl)); + break; + default: + so->texconst3 = 0x00000000; + break; + } + + return &so->base; } static void fd5_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, - unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) + unsigned start, unsigned nr, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) { - struct fd_context *ctx = fd_context(pctx); - struct fd5_context *fd5_ctx = fd5_context(ctx); - uint16_t astc_srgb = 0; - unsigned i; - - for (i = 0; i < nr; i++) { - if (views[i]) { - struct fd5_pipe_sampler_view *view = - fd5_pipe_sampler_view(views[i]); - if (view->astc_srgb) - astc_srgb |= (1 << i); - } - } - - fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, views); - - if (shader == PIPE_SHADER_FRAGMENT) { - fd5_ctx->fastc_srgb = astc_srgb; - } else if (shader == PIPE_SHADER_VERTEX) { - fd5_ctx->vastc_srgb = astc_srgb; - } + struct fd_context *ctx = fd_context(pctx); + struct fd5_context *fd5_ctx = fd5_context(ctx); + uint16_t astc_srgb = 0; + unsigned i; + + for (i = 0; i < nr; i++) { + if (views[i]) { + struct fd5_pipe_sampler_view *view = fd5_pipe_sampler_view(views[i]); + if (view->astc_srgb) + astc_srgb |= (1 << i); + } + } + + fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, + views); + + if (shader == PIPE_SHADER_FRAGMENT) { + fd5_ctx->fastc_srgb = astc_srgb; + } else if (shader 
== PIPE_SHADER_VERTEX) { + fd5_ctx->vastc_srgb = astc_srgb; + } } void fd5_texture_init(struct pipe_context *pctx) { - pctx->create_sampler_state = fd5_sampler_state_create; - pctx->bind_sampler_states = fd_sampler_states_bind; - pctx->create_sampler_view = fd5_sampler_view_create; - pctx->set_sampler_views = fd5_set_sampler_views; + pctx->create_sampler_state = fd5_sampler_state_create; + pctx->bind_sampler_states = fd_sampler_states_bind; + pctx->create_sampler_view = fd5_sampler_view_create; + pctx->set_sampler_views = fd5_set_sampler_views; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_texture.h b/src/gallium/drivers/freedreno/a5xx/fd5_texture.h index b45c526..bccef79 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_texture.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_texture.h @@ -29,61 +29,60 @@ #include "pipe/p_context.h" -#include "freedreno_texture.h" #include "freedreno_resource.h" +#include "freedreno_texture.h" #include "fd5_context.h" #include "fd5_format.h" struct fd5_sampler_stateobj { - struct pipe_sampler_state base; - uint32_t texsamp0, texsamp1, texsamp2, texsamp3; - bool needs_border; + struct pipe_sampler_state base; + uint32_t texsamp0, texsamp1, texsamp2, texsamp3; + bool needs_border; }; static inline struct fd5_sampler_stateobj * fd5_sampler_stateobj(struct pipe_sampler_state *samp) { - return (struct fd5_sampler_stateobj *)samp; + return (struct fd5_sampler_stateobj *)samp; } struct fd5_pipe_sampler_view { - struct pipe_sampler_view base; - uint32_t texconst0, texconst1, texconst2, texconst3, texconst5; - uint32_t texconst6, texconst7, texconst8, texconst9, texconst10, texconst11; - uint32_t offset; - bool astc_srgb; + struct pipe_sampler_view base; + uint32_t texconst0, texconst1, texconst2, texconst3, texconst5; + uint32_t texconst6, texconst7, texconst8, texconst9, texconst10, texconst11; + uint32_t offset; + bool astc_srgb; }; static inline struct fd5_pipe_sampler_view * fd5_pipe_sampler_view(struct pipe_sampler_view *pview) { - return (struct fd5_pipe_sampler_view *)pview; + return (struct fd5_pipe_sampler_view *)pview; } void fd5_texture_init(struct pipe_context *pctx); - static inline enum a5xx_tex_type fd5_tex_type(unsigned target) { - switch (target) { - default: - assert(0); - case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return A5XX_TEX_1D; - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_2D_ARRAY: - return A5XX_TEX_2D; - case PIPE_TEXTURE_3D: - return A5XX_TEX_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return A5XX_TEX_CUBE; - } + switch (target) { + default: + assert(0); + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return A5XX_TEX_1D; + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + return A5XX_TEX_2D; + case PIPE_TEXTURE_3D: + return A5XX_TEX_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return A5XX_TEX_CUBE; + } } #endif /* FD5_TEXTURE_H_ */ diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c index 0a990dc..3e08c3b 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c +++ b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.c @@ -24,96 +24,95 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd5_zsa.h" #include "fd5_context.h" #include "fd5_format.h" +#include "fd5_zsa.h" void * fd5_zsa_state_create(struct pipe_context *pctx, - const struct 
pipe_depth_stencil_alpha_state *cso) + const struct pipe_depth_stencil_alpha_state *cso) { - struct fd5_zsa_stateobj *so; - - so = CALLOC_STRUCT(fd5_zsa_stateobj); - if (!so) - return NULL; - - so->base = *cso; - - switch (cso->depth_func) { - case PIPE_FUNC_LESS: - case PIPE_FUNC_LEQUAL: - so->gras_lrz_cntl = A5XX_GRAS_LRZ_CNTL_ENABLE; - break; - - case PIPE_FUNC_GREATER: - case PIPE_FUNC_GEQUAL: - so->gras_lrz_cntl = A5XX_GRAS_LRZ_CNTL_ENABLE | A5XX_GRAS_LRZ_CNTL_GREATER; - break; - - default: - /* LRZ not enabled */ - so->gras_lrz_cntl = 0; - break; - } - - if (!(cso->stencil->enabled || cso->alpha_enabled || !cso->depth_writemask)) - so->lrz_write = true; - - so->rb_depth_cntl |= - A5XX_RB_DEPTH_CNTL_ZFUNC(cso->depth_func); /* maps 1:1 */ - - if (cso->depth_enabled) - so->rb_depth_cntl |= - A5XX_RB_DEPTH_CNTL_Z_ENABLE | - A5XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; - - if (cso->depth_writemask) - so->rb_depth_cntl |= A5XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - - if (cso->stencil[0].enabled) { - const struct pipe_stencil_state *s = &cso->stencil[0]; - - so->rb_stencil_control |= - A5XX_RB_STENCIL_CONTROL_STENCIL_READ | - A5XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A5XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ - A5XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | - A5XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | - A5XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); - so->rb_stencilrefmask |= - A5XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | - A5XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); - - if (cso->stencil[1].enabled) { - const struct pipe_stencil_state *bs = &cso->stencil[1]; - - so->rb_stencil_control |= - A5XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A5XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ - A5XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | - A5XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | - A5XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); - so->rb_stencilrefmask_bf |= - A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(bs->writemask) | - A5XX_RB_STENCILREFMASK_BF_STENCILMASK(bs->valuemask); - } - } - - if (cso->alpha_enabled) { - uint32_t ref = cso->alpha_ref_value * 255.0; - so->rb_alpha_control = - A5XX_RB_ALPHA_CONTROL_ALPHA_TEST | - A5XX_RB_ALPHA_CONTROL_ALPHA_REF(ref) | - A5XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); -// so->rb_depth_control |= -// A5XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; - } - - return so; + struct fd5_zsa_stateobj *so; + + so = CALLOC_STRUCT(fd5_zsa_stateobj); + if (!so) + return NULL; + + so->base = *cso; + + switch (cso->depth_func) { + case PIPE_FUNC_LESS: + case PIPE_FUNC_LEQUAL: + so->gras_lrz_cntl = A5XX_GRAS_LRZ_CNTL_ENABLE; + break; + + case PIPE_FUNC_GREATER: + case PIPE_FUNC_GEQUAL: + so->gras_lrz_cntl = + A5XX_GRAS_LRZ_CNTL_ENABLE | A5XX_GRAS_LRZ_CNTL_GREATER; + break; + + default: + /* LRZ not enabled */ + so->gras_lrz_cntl = 0; + break; + } + + if (!(cso->stencil->enabled || cso->alpha_enabled || !cso->depth_writemask)) + so->lrz_write = true; + + so->rb_depth_cntl |= + A5XX_RB_DEPTH_CNTL_ZFUNC(cso->depth_func); /* maps 1:1 */ + + if (cso->depth_enabled) + so->rb_depth_cntl |= + A5XX_RB_DEPTH_CNTL_Z_ENABLE | A5XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; + + if (cso->depth_writemask) + so->rb_depth_cntl |= A5XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; + + if (cso->stencil[0].enabled) { + const struct pipe_stencil_state *s = &cso->stencil[0]; + + so->rb_stencil_control |= + A5XX_RB_STENCIL_CONTROL_STENCIL_READ | + A5XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | + 
A5XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ + A5XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | + A5XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | + A5XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); + so->rb_stencilrefmask |= + A5XX_RB_STENCILREFMASK_STENCILWRITEMASK(s->writemask) | + A5XX_RB_STENCILREFMASK_STENCILMASK(s->valuemask); + + if (cso->stencil[1].enabled) { + const struct pipe_stencil_state *bs = &cso->stencil[1]; + + so->rb_stencil_control |= + A5XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | + A5XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ + A5XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | + A5XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | + A5XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); + so->rb_stencilrefmask_bf |= + A5XX_RB_STENCILREFMASK_BF_STENCILWRITEMASK(bs->writemask) | + A5XX_RB_STENCILREFMASK_BF_STENCILMASK(bs->valuemask); + } + } + + if (cso->alpha_enabled) { + uint32_t ref = cso->alpha_ref_value * 255.0; + so->rb_alpha_control = + A5XX_RB_ALPHA_CONTROL_ALPHA_TEST | + A5XX_RB_ALPHA_CONTROL_ALPHA_REF(ref) | + A5XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); + // so->rb_depth_control |= + // A5XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; + } + + return so; } diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h index c15ba1a..662b050 100644 --- a/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h +++ b/src/gallium/drivers/freedreno/a5xx/fd5_zsa.h @@ -27,31 +27,30 @@ #ifndef FD5_ZSA_H_ #define FD5_ZSA_H_ - -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" struct fd5_zsa_stateobj { - struct pipe_depth_stencil_alpha_state base; - - uint32_t rb_alpha_control; - uint32_t rb_depth_cntl; - uint32_t rb_stencil_control; - uint32_t rb_stencilrefmask; - uint32_t rb_stencilrefmask_bf; - uint32_t gras_lrz_cntl; - bool lrz_write; + struct pipe_depth_stencil_alpha_state base; + + uint32_t rb_alpha_control; + uint32_t rb_depth_cntl; + uint32_t rb_stencil_control; + uint32_t rb_stencilrefmask; + uint32_t rb_stencilrefmask_bf; + uint32_t gras_lrz_cntl; + bool lrz_write; }; static inline struct fd5_zsa_stateobj * fd5_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { - return (struct fd5_zsa_stateobj *)zsa; + return (struct fd5_zsa_stateobj *)zsa; } -void * fd5_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso); +void *fd5_zsa_state_create(struct pipe_context *pctx, + const struct pipe_depth_stencil_alpha_state *cso); #endif /* FD5_ZSA_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blend.c b/src/gallium/drivers/freedreno/a6xx/fd6_blend.c index b645b88..4475416 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blend.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blend.c @@ -28,8 +28,8 @@ #include "pipe/p_state.h" #include "util/u_blend.h" #include "util/u_dual_blend.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" #include "fd6_blend.h" #include "fd6_context.h" @@ -40,170 +40,170 @@ static enum a3xx_rb_blend_opcode blend_func(unsigned func) { - switch (func) { - case PIPE_BLEND_ADD: - return BLEND_DST_PLUS_SRC; - case PIPE_BLEND_MIN: - return BLEND_MIN_DST_SRC; - case PIPE_BLEND_MAX: - return BLEND_MAX_DST_SRC; - case PIPE_BLEND_SUBTRACT: - return BLEND_SRC_MINUS_DST; - case PIPE_BLEND_REVERSE_SUBTRACT: - return BLEND_DST_MINUS_SRC; - default: - DBG("invalid blend func: %x", func); - return 0; - 
} + switch (func) { + case PIPE_BLEND_ADD: + return BLEND_DST_PLUS_SRC; + case PIPE_BLEND_MIN: + return BLEND_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return BLEND_MAX_DST_SRC; + case PIPE_BLEND_SUBTRACT: + return BLEND_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return BLEND_DST_MINUS_SRC; + default: + DBG("invalid blend func: %x", func); + return 0; + } } struct fd6_blend_variant * -__fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, unsigned sample_mask) +__fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, + unsigned sample_mask) { - const struct pipe_blend_state *cso = &blend->base; - struct fd6_blend_variant *so; - enum a3xx_rop_code rop = ROP_COPY; - bool reads_dest = false; - unsigned mrt_blend = 0; - - if (cso->logicop_enable) { - rop = cso->logicop_func; /* maps 1:1 */ - reads_dest = util_logicop_reads_dest(cso->logicop_func); - } - - so = rzalloc_size(blend, sizeof(*so)); - if (!so) - return NULL; - - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(blend->ctx->pipe, - ((A6XX_MAX_RENDER_TARGETS * 4) + 6) * 4); - so->stateobj = ring; - - for (unsigned i = 0; i <= cso->max_rt; i++) { - const struct pipe_rt_blend_state *rt; - - if (cso->independent_blend_enable) - rt = &cso->rt[i]; - else - rt = &cso->rt[0]; - - OUT_REG(ring, A6XX_RB_MRT_BLEND_CONTROL(i, - .rgb_src_factor = fd_blend_factor(rt->rgb_src_factor), - .rgb_blend_opcode = blend_func(rt->rgb_func), - .rgb_dest_factor = fd_blend_factor(rt->rgb_dst_factor), - .alpha_src_factor = fd_blend_factor(rt->alpha_src_factor), - .alpha_blend_opcode = blend_func(rt->alpha_func), - .alpha_dest_factor = fd_blend_factor(rt->alpha_dst_factor), - )); - - OUT_REG(ring, A6XX_RB_MRT_CONTROL(i, - .rop_code = rop, - .rop_enable = cso->logicop_enable, - .component_enable = rt->colormask, - .blend = rt->blend_enable, - .blend2 = rt->blend_enable, - )); - - if (rt->blend_enable) { - mrt_blend |= (1 << i); - } - - if (reads_dest) { - mrt_blend |= (1 << i); - } - } - - OUT_REG(ring, A6XX_RB_DITHER_CNTL( - .dither_mode_mrt0 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt1 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt2 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt3 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt4 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt5 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt6 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, - .dither_mode_mrt7 = cso->dither ? 
DITHER_ALWAYS : DITHER_DISABLE, - )); - - OUT_REG(ring, A6XX_SP_BLEND_CNTL( - .unk8 = true, - .alpha_to_coverage = cso->alpha_to_coverage, - .enabled = !!mrt_blend, - .dual_color_in_enable = blend->use_dual_src_blend, - )); - - OUT_REG(ring, A6XX_RB_BLEND_CNTL( - .enable_blend = mrt_blend, - .alpha_to_coverage = cso->alpha_to_coverage, - .alpha_to_one = cso->alpha_to_one, - .independent_blend = cso->independent_blend_enable, - .sample_mask = sample_mask, - .dual_color_in_enable = blend->use_dual_src_blend, - )); - - so->sample_mask = sample_mask; - - util_dynarray_append(&blend->variants, struct fd6_blend_variant *, so); - - return so; + const struct pipe_blend_state *cso = &blend->base; + struct fd6_blend_variant *so; + enum a3xx_rop_code rop = ROP_COPY; + bool reads_dest = false; + unsigned mrt_blend = 0; + + if (cso->logicop_enable) { + rop = cso->logicop_func; /* maps 1:1 */ + reads_dest = util_logicop_reads_dest(cso->logicop_func); + } + + so = rzalloc_size(blend, sizeof(*so)); + if (!so) + return NULL; + + struct fd_ringbuffer *ring = fd_ringbuffer_new_object( + blend->ctx->pipe, ((A6XX_MAX_RENDER_TARGETS * 4) + 6) * 4); + so->stateobj = ring; + + for (unsigned i = 0; i <= cso->max_rt; i++) { + const struct pipe_rt_blend_state *rt; + + if (cso->independent_blend_enable) + rt = &cso->rt[i]; + else + rt = &cso->rt[0]; + + OUT_REG(ring, + A6XX_RB_MRT_BLEND_CONTROL( + i, .rgb_src_factor = fd_blend_factor(rt->rgb_src_factor), + .rgb_blend_opcode = blend_func(rt->rgb_func), + .rgb_dest_factor = fd_blend_factor(rt->rgb_dst_factor), + .alpha_src_factor = fd_blend_factor(rt->alpha_src_factor), + .alpha_blend_opcode = blend_func(rt->alpha_func), + .alpha_dest_factor = fd_blend_factor(rt->alpha_dst_factor), )); + + OUT_REG(ring, A6XX_RB_MRT_CONTROL(i, .rop_code = rop, + .rop_enable = cso->logicop_enable, + .component_enable = rt->colormask, + .blend = rt->blend_enable, + .blend2 = rt->blend_enable, )); + + if (rt->blend_enable) { + mrt_blend |= (1 << i); + } + + if (reads_dest) { + mrt_blend |= (1 << i); + } + } + + OUT_REG( + ring, + A6XX_RB_DITHER_CNTL( + .dither_mode_mrt0 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt1 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt2 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt3 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt4 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt5 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt6 = cso->dither ? DITHER_ALWAYS : DITHER_DISABLE, + .dither_mode_mrt7 = + cso->dither ? 
DITHER_ALWAYS : DITHER_DISABLE, )); + + OUT_REG(ring, A6XX_SP_BLEND_CNTL(.unk8 = true, + .alpha_to_coverage = cso->alpha_to_coverage, + .enabled = !!mrt_blend, + .dual_color_in_enable = + blend->use_dual_src_blend, )); + + OUT_REG( + ring, + A6XX_RB_BLEND_CNTL(.enable_blend = mrt_blend, + .alpha_to_coverage = cso->alpha_to_coverage, + .alpha_to_one = cso->alpha_to_one, + .independent_blend = cso->independent_blend_enable, + .sample_mask = sample_mask, + .dual_color_in_enable = blend->use_dual_src_blend, )); + + so->sample_mask = sample_mask; + + util_dynarray_append(&blend->variants, struct fd6_blend_variant *, so); + + return so; } void * fd6_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso) + const struct pipe_blend_state *cso) { - struct fd6_blend_stateobj *so; - - so = rzalloc_size(NULL, sizeof(*so)); - if (!so) - return NULL; - - so->base = *cso; - so->ctx = fd_context(pctx); - - if (cso->logicop_enable) { - so->reads_dest |= util_logicop_reads_dest(cso->logicop_func); - } - - so->use_dual_src_blend = - cso->rt[0].blend_enable && util_blend_state_is_dual(cso, 0); - - unsigned nr = cso->independent_blend_enable ? cso->max_rt : 0; - for (unsigned i = 0; i <= nr; i++) { - const struct pipe_rt_blend_state *rt = &cso->rt[i]; - - so->reads_dest |= rt->blend_enable; - - /* From the PoV of LRZ, having masked color channels is - * the same as having blend enabled, in that the draw will - * care about the fragments from an earlier draw. - * - * NOTE we actually don't care about masked color channels - * that don't actually exist in the render target, but we - * don't know the render target format here to determine - * that. It is probably not worth worrying about, but if - * we find a game/benchmark that goes out of it's way to - * mask off non-existent channels, we should fixup the - * pipe_blend_state to give us more info. - */ - if (rt->blend_enable || (rt->colormask != 0xf)) { - so->reads_dest = true; - } - } - - util_dynarray_init(&so->variants, so); - - return so; + struct fd6_blend_stateobj *so; + + so = rzalloc_size(NULL, sizeof(*so)); + if (!so) + return NULL; + + so->base = *cso; + so->ctx = fd_context(pctx); + + if (cso->logicop_enable) { + so->reads_dest |= util_logicop_reads_dest(cso->logicop_func); + } + + so->use_dual_src_blend = + cso->rt[0].blend_enable && util_blend_state_is_dual(cso, 0); + + unsigned nr = cso->independent_blend_enable ? cso->max_rt : 0; + for (unsigned i = 0; i <= nr; i++) { + const struct pipe_rt_blend_state *rt = &cso->rt[i]; + + so->reads_dest |= rt->blend_enable; + + /* From the PoV of LRZ, having masked color channels is + * the same as having blend enabled, in that the draw will + * care about the fragments from an earlier draw. + * + * NOTE we actually don't care about masked color channels + * that don't actually exist in the render target, but we + * don't know the render target format here to determine + * that. It is probably not worth worrying about, but if + * we find a game/benchmark that goes out of it's way to + * mask off non-existent channels, we should fixup the + * pipe_blend_state to give us more info. 
+ */ + if (rt->blend_enable || (rt->colormask != 0xf)) { + so->reads_dest = true; + } + } + + util_dynarray_init(&so->variants, so); + + return so; } void fd6_blend_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd6_blend_stateobj *so = hwcso; + struct fd6_blend_stateobj *so = hwcso; - util_dynarray_foreach(&so->variants, struct fd6_blend_variant *, vp) { - struct fd6_blend_variant *v = *vp; - fd_ringbuffer_del(v->stateobj); - } + util_dynarray_foreach (&so->variants, struct fd6_blend_variant *, vp) { + struct fd6_blend_variant *v = *vp; + fd_ringbuffer_del(v->stateobj); + } - ralloc_free(so); + ralloc_free(so); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blend.h b/src/gallium/drivers/freedreno/a6xx/fd6_blend.h index 0878cf4..a610bd5 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blend.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blend.h @@ -28,8 +28,8 @@ #ifndef FD6_BLEND_H_ #define FD6_BLEND_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_context.h" #include "freedreno_util.h" @@ -40,52 +40,53 @@ * to change frequently. */ struct fd6_blend_variant { - unsigned sample_mask; - struct fd_ringbuffer *stateobj; + unsigned sample_mask; + struct fd_ringbuffer *stateobj; }; struct fd6_blend_stateobj { - struct pipe_blend_state base; + struct pipe_blend_state base; - bool use_dual_src_blend; + bool use_dual_src_blend; - struct fd_context *ctx; - bool reads_dest; - struct util_dynarray variants; + struct fd_context *ctx; + bool reads_dest; + struct util_dynarray variants; }; static inline struct fd6_blend_stateobj * fd6_blend_stateobj(struct pipe_blend_state *blend) { - return (struct fd6_blend_stateobj *)blend; + return (struct fd6_blend_stateobj *)blend; } -struct fd6_blend_variant * __fd6_setup_blend_variant( - struct fd6_blend_stateobj *blend, unsigned sample_mask); +struct fd6_blend_variant * +__fd6_setup_blend_variant(struct fd6_blend_stateobj *blend, + unsigned sample_mask); static inline struct fd6_blend_variant * -fd6_blend_variant(struct pipe_blend_state *cso, - unsigned nr_samples, unsigned sample_mask) +fd6_blend_variant(struct pipe_blend_state *cso, unsigned nr_samples, + unsigned sample_mask) { - struct fd6_blend_stateobj *blend = fd6_blend_stateobj(cso); - unsigned mask = BITFIELD_MASK(nr_samples); + struct fd6_blend_stateobj *blend = fd6_blend_stateobj(cso); + unsigned mask = BITFIELD_MASK(nr_samples); - util_dynarray_foreach(&blend->variants, struct fd6_blend_variant *, vp) { - struct fd6_blend_variant *v = *vp; + util_dynarray_foreach (&blend->variants, struct fd6_blend_variant *, vp) { + struct fd6_blend_variant *v = *vp; - /* mask out sample-mask bits that we don't care about to avoid - * creating unnecessary variants - */ - if ((mask & v->sample_mask) == (mask & sample_mask)) { - return v; - } - } + /* mask out sample-mask bits that we don't care about to avoid + * creating unnecessary variants + */ + if ((mask & v->sample_mask) == (mask & sample_mask)) { + return v; + } + } - return __fd6_setup_blend_variant(blend, sample_mask); + return __fd6_setup_blend_variant(blend, sample_mask); } -void * fd6_blend_state_create(struct pipe_context *pctx, - const struct pipe_blend_state *cso); +void *fd6_blend_state_create(struct pipe_context *pctx, + const struct pipe_blend_state *cso); void fd6_blend_state_delete(struct pipe_context *, void *hwcso); #endif /* FD6_BLEND_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c index 
97c7c50..25dc028 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.c @@ -25,9 +25,9 @@ * Rob Clark */ -#include "util/u_dump.h" -#include "util/half_float.h" #include "util/format_srgb.h" +#include "util/half_float.h" +#include "util/u_dump.h" #include "freedreno_blitter.h" #include "freedreno_fence.h" @@ -35,76 +35,76 @@ #include "freedreno_tracepoints.h" #include "fd6_blitter.h" -#include "fd6_format.h" #include "fd6_emit.h" +#include "fd6_format.h" #include "fd6_resource.h" static inline enum a6xx_2d_ifmt fd6_ifmt(enum a6xx_format fmt) { - switch (fmt) { - case FMT6_A8_UNORM: - case FMT6_8_UNORM: - case FMT6_8_SNORM: - case FMT6_8_8_UNORM: - case FMT6_8_8_SNORM: - case FMT6_8_8_8_8_UNORM: - case FMT6_8_8_8_X8_UNORM: - case FMT6_8_8_8_8_SNORM: - case FMT6_4_4_4_4_UNORM: - case FMT6_5_5_5_1_UNORM: - case FMT6_5_6_5_UNORM: - return R2D_UNORM8; - - case FMT6_32_UINT: - case FMT6_32_SINT: - case FMT6_32_32_UINT: - case FMT6_32_32_SINT: - case FMT6_32_32_32_32_UINT: - case FMT6_32_32_32_32_SINT: - return R2D_INT32; - - case FMT6_16_UINT: - case FMT6_16_SINT: - case FMT6_16_16_UINT: - case FMT6_16_16_SINT: - case FMT6_16_16_16_16_UINT: - case FMT6_16_16_16_16_SINT: - case FMT6_10_10_10_2_UINT: - return R2D_INT16; - - case FMT6_8_UINT: - case FMT6_8_SINT: - case FMT6_8_8_UINT: - case FMT6_8_8_SINT: - case FMT6_8_8_8_8_UINT: - case FMT6_8_8_8_8_SINT: - case FMT6_Z24_UNORM_S8_UINT: - case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: - return R2D_INT8; - - case FMT6_16_UNORM: - case FMT6_16_SNORM: - case FMT6_16_16_UNORM: - case FMT6_16_16_SNORM: - case FMT6_16_16_16_16_UNORM: - case FMT6_16_16_16_16_SNORM: - case FMT6_32_FLOAT: - case FMT6_32_32_FLOAT: - case FMT6_32_32_32_32_FLOAT: - return R2D_FLOAT32; - - case FMT6_16_FLOAT: - case FMT6_16_16_FLOAT: - case FMT6_16_16_16_16_FLOAT: - case FMT6_11_11_10_FLOAT: - case FMT6_10_10_10_2_UNORM_DEST: - return R2D_FLOAT16; - - default: - unreachable("bad format"); - return 0; - } + switch (fmt) { + case FMT6_A8_UNORM: + case FMT6_8_UNORM: + case FMT6_8_SNORM: + case FMT6_8_8_UNORM: + case FMT6_8_8_SNORM: + case FMT6_8_8_8_8_UNORM: + case FMT6_8_8_8_X8_UNORM: + case FMT6_8_8_8_8_SNORM: + case FMT6_4_4_4_4_UNORM: + case FMT6_5_5_5_1_UNORM: + case FMT6_5_6_5_UNORM: + return R2D_UNORM8; + + case FMT6_32_UINT: + case FMT6_32_SINT: + case FMT6_32_32_UINT: + case FMT6_32_32_SINT: + case FMT6_32_32_32_32_UINT: + case FMT6_32_32_32_32_SINT: + return R2D_INT32; + + case FMT6_16_UINT: + case FMT6_16_SINT: + case FMT6_16_16_UINT: + case FMT6_16_16_SINT: + case FMT6_16_16_16_16_UINT: + case FMT6_16_16_16_16_SINT: + case FMT6_10_10_10_2_UINT: + return R2D_INT16; + + case FMT6_8_UINT: + case FMT6_8_SINT: + case FMT6_8_8_UINT: + case FMT6_8_8_SINT: + case FMT6_8_8_8_8_UINT: + case FMT6_8_8_8_8_SINT: + case FMT6_Z24_UNORM_S8_UINT: + case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: + return R2D_INT8; + + case FMT6_16_UNORM: + case FMT6_16_SNORM: + case FMT6_16_16_UNORM: + case FMT6_16_16_SNORM: + case FMT6_16_16_16_16_UNORM: + case FMT6_16_16_16_16_SNORM: + case FMT6_32_FLOAT: + case FMT6_32_32_FLOAT: + case FMT6_32_32_32_32_FLOAT: + return R2D_FLOAT32; + + case FMT6_16_FLOAT: + case FMT6_16_16_FLOAT: + case FMT6_16_16_16_16_FLOAT: + case FMT6_11_11_10_FLOAT: + case FMT6_10_10_10_2_UNORM_DEST: + return R2D_FLOAT16; + + default: + unreachable("bad format"); + return 0; + } } /* Make sure none of the requested dimensions extend beyond the size of the @@ -115,190 +115,190 @@ fd6_ifmt(enum a6xx_format fmt) static bool 
ok_dims(const struct pipe_resource *r, const struct pipe_box *b, int lvl) { - int last_layer = - r->target == PIPE_TEXTURE_3D ? u_minify(r->depth0, lvl) - : r->array_size; + int last_layer = + r->target == PIPE_TEXTURE_3D ? u_minify(r->depth0, lvl) : r->array_size; - return (b->x >= 0) && (b->x + b->width <= u_minify(r->width0, lvl)) && - (b->y >= 0) && (b->y + b->height <= u_minify(r->height0, lvl)) && - (b->z >= 0) && (b->z + b->depth <= last_layer); + return (b->x >= 0) && (b->x + b->width <= u_minify(r->width0, lvl)) && + (b->y >= 0) && (b->y + b->height <= u_minify(r->height0, lvl)) && + (b->z >= 0) && (b->z + b->depth <= last_layer); } static bool ok_format(enum pipe_format pfmt) { - enum a6xx_format fmt = fd6_pipe2color(pfmt); - - if (util_format_is_compressed(pfmt)) - return true; - - switch (pfmt) { - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_Z32_UNORM: - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - case PIPE_FORMAT_S8_UINT: - return true; - default: - break; - } - - if (fmt == FMT6_NONE) - return false; - - return true; + enum a6xx_format fmt = fd6_pipe2color(pfmt); + + if (util_format_is_compressed(pfmt)) + return true; + + switch (pfmt) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z32_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + case PIPE_FORMAT_S8_UINT: + return true; + default: + break; + } + + if (fmt == FMT6_NONE) + return false; + + return true; } -#define DEBUG_BLIT 0 +#define DEBUG_BLIT 0 #define DEBUG_BLIT_FALLBACK 0 -#define fail_if(cond) \ - do { \ - if (cond) { \ - if (DEBUG_BLIT_FALLBACK) { \ - fprintf(stderr, "falling back: %s for blit:\n", #cond); \ - dump_blit_info(info); \ - } \ - return false; \ - } \ - } while (0) +#define fail_if(cond) \ + do { \ + if (cond) { \ + if (DEBUG_BLIT_FALLBACK) { \ + fprintf(stderr, "falling back: %s for blit:\n", #cond); \ + dump_blit_info(info); \ + } \ + return false; \ + } \ + } while (0) static bool is_ubwc(struct pipe_resource *prsc, unsigned level) { - return fd_resource_ubwc_enabled(fd_resource(prsc), level); + return fd_resource_ubwc_enabled(fd_resource(prsc), level); } static void dump_blit_info(const struct pipe_blit_info *info) { - util_dump_blit_info(stderr, info); - fprintf(stderr, "\ndst resource: "); - util_dump_resource(stderr, info->dst.resource); - if (is_ubwc(info->dst.resource, info->dst.level)) - fprintf(stderr, " (ubwc)"); - fprintf(stderr, "\nsrc resource: "); - util_dump_resource(stderr, info->src.resource); - if (is_ubwc(info->src.resource, info->src.level)) - fprintf(stderr, " (ubwc)"); - fprintf(stderr, "\n"); + util_dump_blit_info(stderr, info); + fprintf(stderr, "\ndst resource: "); + util_dump_resource(stderr, info->dst.resource); + if (is_ubwc(info->dst.resource, info->dst.level)) + fprintf(stderr, " (ubwc)"); + fprintf(stderr, "\nsrc resource: "); + util_dump_resource(stderr, info->src.resource); + if (is_ubwc(info->src.resource, info->src.level)) + fprintf(stderr, " (ubwc)"); + fprintf(stderr, "\n"); } static bool can_do_blit(const struct pipe_blit_info *info) { - /* I think we can do scaling, but not in z dimension since that would - * require blending.. - */ - fail_if(info->dst.box.depth != info->src.box.depth); + /* I think we can do scaling, but not in z dimension since that would + * require blending.. 
+ */ + fail_if(info->dst.box.depth != info->src.box.depth); - /* Fail if unsupported format: */ - fail_if(!ok_format(info->src.format)); - fail_if(!ok_format(info->dst.format)); + /* Fail if unsupported format: */ + fail_if(!ok_format(info->src.format)); + fail_if(!ok_format(info->dst.format)); - debug_assert(!util_format_is_compressed(info->src.format)); - debug_assert(!util_format_is_compressed(info->dst.format)); + debug_assert(!util_format_is_compressed(info->src.format)); + debug_assert(!util_format_is_compressed(info->dst.format)); - fail_if(!ok_dims(info->src.resource, &info->src.box, info->src.level)); + fail_if(!ok_dims(info->src.resource, &info->src.box, info->src.level)); - fail_if(!ok_dims(info->dst.resource, &info->dst.box, info->dst.level)); + fail_if(!ok_dims(info->dst.resource, &info->dst.box, info->dst.level)); - debug_assert(info->dst.box.width >= 0); - debug_assert(info->dst.box.height >= 0); - debug_assert(info->dst.box.depth >= 0); + debug_assert(info->dst.box.width >= 0); + debug_assert(info->dst.box.height >= 0); + debug_assert(info->dst.box.depth >= 0); - fail_if(info->dst.resource->nr_samples > 1); + fail_if(info->dst.resource->nr_samples > 1); - fail_if(info->window_rectangle_include); + fail_if(info->window_rectangle_include); - const struct util_format_description *src_desc = - util_format_description(info->src.format); - const struct util_format_description *dst_desc = - util_format_description(info->dst.format); - const int common_channels = MIN2(src_desc->nr_channels, dst_desc->nr_channels); + const struct util_format_description *src_desc = + util_format_description(info->src.format); + const struct util_format_description *dst_desc = + util_format_description(info->dst.format); + const int common_channels = + MIN2(src_desc->nr_channels, dst_desc->nr_channels); - if (info->mask & PIPE_MASK_RGBA) { - for (int i = 0; i < common_channels; i++) { - fail_if(memcmp(&src_desc->channel[i], - &dst_desc->channel[i], - sizeof(src_desc->channel[0]))); - } - } + if (info->mask & PIPE_MASK_RGBA) { + for (int i = 0; i < common_channels; i++) { + fail_if(memcmp(&src_desc->channel[i], &dst_desc->channel[i], + sizeof(src_desc->channel[0]))); + } + } - fail_if(info->alpha_blend); + fail_if(info->alpha_blend); - return true; + return true; } static void emit_setup(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->draw; - struct fd_screen *screen = batch->ctx->screen; - - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); - fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); - fd6_event_write(batch, ring, PC_CCU_INVALIDATE_DEPTH, false); - - /* normal BLIT_OP_SCALE operation needs bypass RB_CCU_CNTL */ - OUT_WFI5(ring); - OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1); - OUT_RING(ring, A6XX_RB_CCU_CNTL_OFFSET(screen->info.a6xx.ccu_offset_bypass)); + struct fd_ringbuffer *ring = batch->draw; + struct fd_screen *screen = batch->ctx->screen; + + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); + fd6_event_write(batch, ring, PC_CCU_INVALIDATE_DEPTH, false); + + /* normal BLIT_OP_SCALE operation needs bypass RB_CCU_CNTL */ + OUT_WFI5(ring); + OUT_PKT4(ring, REG_A6XX_RB_CCU_CNTL, 1); + OUT_RING(ring, A6XX_RB_CCU_CNTL_OFFSET(screen->info.a6xx.ccu_offset_bypass)); } static void -emit_blit_setup(struct fd_ringbuffer *ring, - enum pipe_format pfmt, bool scissor_enable, 
union pipe_color_union *color) +emit_blit_setup(struct fd_ringbuffer *ring, enum pipe_format pfmt, + bool scissor_enable, union pipe_color_union *color) { - enum a6xx_format fmt = fd6_pipe2color(pfmt); - bool is_srgb = util_format_is_srgb(pfmt); - enum a6xx_2d_ifmt ifmt = fd6_ifmt(fmt); - - if (is_srgb) { - assert(ifmt == R2D_UNORM8); - ifmt = R2D_UNORM8_SRGB; - } - - uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL_MASK(0xf) | - A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(fmt) | - A6XX_RB_2D_BLIT_CNTL_IFMT(ifmt) | - COND(color, A6XX_RB_2D_BLIT_CNTL_SOLID_COLOR) | - COND(scissor_enable, A6XX_RB_2D_BLIT_CNTL_SCISSOR); - - OUT_PKT4(ring, REG_A6XX_RB_2D_BLIT_CNTL, 1); - OUT_RING(ring, blit_cntl); - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); - OUT_RING(ring, blit_cntl); - - if (fmt == FMT6_10_10_10_2_UNORM_DEST) - fmt = FMT6_16_16_16_16_FLOAT; - - /* This register is probably badly named... it seems that it's - * controlling the internal/accumulator format or something like - * that. It's certainly not tied to only the src format. - */ - OUT_PKT4(ring, REG_A6XX_SP_2D_DST_FORMAT, 1); - OUT_RING(ring, A6XX_SP_2D_DST_FORMAT_COLOR_FORMAT(fmt) | - COND(util_format_is_pure_sint(pfmt), - A6XX_SP_2D_DST_FORMAT_SINT) | - COND(util_format_is_pure_uint(pfmt), - A6XX_SP_2D_DST_FORMAT_UINT) | - COND(util_format_is_snorm(pfmt), - A6XX_SP_2D_DST_FORMAT_SINT | - A6XX_SP_2D_DST_FORMAT_NORM) | - COND(util_format_is_unorm(pfmt), -// TODO sometimes blob uses UINT+NORM but dEQP seems unhappy about that -// A6XX_SP_2D_DST_FORMAT_UINT | - A6XX_SP_2D_DST_FORMAT_NORM) | - COND(is_srgb, A6XX_SP_2D_DST_FORMAT_SRGB) | - A6XX_SP_2D_DST_FORMAT_MASK(0xf)); - - OUT_PKT4(ring, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); - OUT_RING(ring, 0); + enum a6xx_format fmt = fd6_pipe2color(pfmt); + bool is_srgb = util_format_is_srgb(pfmt); + enum a6xx_2d_ifmt ifmt = fd6_ifmt(fmt); + + if (is_srgb) { + assert(ifmt == R2D_UNORM8); + ifmt = R2D_UNORM8_SRGB; + } + + uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL_MASK(0xf) | + A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(fmt) | + A6XX_RB_2D_BLIT_CNTL_IFMT(ifmt) | + COND(color, A6XX_RB_2D_BLIT_CNTL_SOLID_COLOR) | + COND(scissor_enable, A6XX_RB_2D_BLIT_CNTL_SCISSOR); + + OUT_PKT4(ring, REG_A6XX_RB_2D_BLIT_CNTL, 1); + OUT_RING(ring, blit_cntl); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); + OUT_RING(ring, blit_cntl); + + if (fmt == FMT6_10_10_10_2_UNORM_DEST) + fmt = FMT6_16_16_16_16_FLOAT; + + /* This register is probably badly named... it seems that it's + * controlling the internal/accumulator format or something like + * that. It's certainly not tied to only the src format. 
+ */ + OUT_PKT4(ring, REG_A6XX_SP_2D_DST_FORMAT, 1); + OUT_RING( + ring, + A6XX_SP_2D_DST_FORMAT_COLOR_FORMAT(fmt) | + COND(util_format_is_pure_sint(pfmt), A6XX_SP_2D_DST_FORMAT_SINT) | + COND(util_format_is_pure_uint(pfmt), A6XX_SP_2D_DST_FORMAT_UINT) | + COND(util_format_is_snorm(pfmt), + A6XX_SP_2D_DST_FORMAT_SINT | A6XX_SP_2D_DST_FORMAT_NORM) | + COND(util_format_is_unorm(pfmt), + // TODO sometimes blob uses UINT+NORM but dEQP seems unhappy about + // that + // A6XX_SP_2D_DST_FORMAT_UINT + //| + A6XX_SP_2D_DST_FORMAT_NORM) | + COND(is_srgb, A6XX_SP_2D_DST_FORMAT_SRGB) | + A6XX_SP_2D_DST_FORMAT_MASK(0xf)); + + OUT_PKT4(ring, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); + OUT_RING(ring, 0); } /* buffers need to be handled specially since x/width can exceed the bounds @@ -306,451 +306,458 @@ emit_blit_setup(struct fd_ringbuffer *ring, */ static void emit_blit_buffer(struct fd_context *ctx, struct fd_ringbuffer *ring, - const struct pipe_blit_info *info) + const struct pipe_blit_info *info) { - const struct pipe_box *sbox = &info->src.box; - const struct pipe_box *dbox = &info->dst.box; - struct fd_resource *src, *dst; - unsigned sshift, dshift; - - if (DEBUG_BLIT) { - fprintf(stderr, "buffer blit: "); - dump_blit_info(info); - } - - src = fd_resource(info->src.resource); - dst = fd_resource(info->dst.resource); - - debug_assert(src->layout.cpp == 1); - debug_assert(dst->layout.cpp == 1); - debug_assert(info->src.resource->format == info->dst.resource->format); - debug_assert((sbox->y == 0) && (sbox->height == 1)); - debug_assert((dbox->y == 0) && (dbox->height == 1)); - debug_assert((sbox->z == 0) && (sbox->depth == 1)); - debug_assert((dbox->z == 0) && (dbox->depth == 1)); - debug_assert(sbox->width == dbox->width); - debug_assert(info->src.level == 0); - debug_assert(info->dst.level == 0); - - /* - * Buffers can have dimensions bigger than max width, remap into - * multiple 1d blits to fit within max dimension - * - * Note that blob uses .ARRAY_PITCH=128 for blitting buffers, which - * seems to prevent overfetch related faults. Not quite sure what - * the deal is there. - * - * Low 6 bits of SRC/DST addresses need to be zero (ie. address - * aligned to 64) so we need to shift src/dst x1/x2 to make up the - * difference. On top of already splitting up the blit so width - * isn't > 16k. - * - * We perhaps could do a bit better, if src and dst are aligned but - * in the worst case this means we have to split the copy up into - * 16k (0x4000) minus 64 (0x40). 
- */ - - sshift = sbox->x & 0x3f; - dshift = dbox->x & 0x3f; - - emit_blit_setup(ring, PIPE_FORMAT_R8_UNORM, false, NULL); - - for (unsigned off = 0; off < sbox->width; off += (0x4000 - 0x40)) { - unsigned soff, doff, w, p; - - soff = (sbox->x + off) & ~0x3f; - doff = (dbox->x + off) & ~0x3f; - - w = MIN2(sbox->width - off, (0x4000 - 0x40)); - p = align(w, 64); - - debug_assert((soff + w) <= fd_bo_size(src->bo)); - debug_assert((doff + w) <= fd_bo_size(dst->bo)); - - /* - * Emit source: - */ - OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(FMT6_8_UNORM) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | - 0x500000); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(sshift + w) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(1)); /* SP_PS_2D_SRC_SIZE */ - OUT_RELOC(ring, src->bo, soff, 0, 0); /* SP_PS_2D_SRC_LO/HI */ - OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(p)); - - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - /* - * Emit destination: - */ - OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); - OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_8_UNORM) | - A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ - OUT_RING(ring, A6XX_RB_2D_DST_PITCH(p)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - /* - * Blit command: - */ - OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(sshift)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(sshift + w - 1)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(0)); - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); - OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dshift) | A6XX_GRAS_2D_DST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dshift + w - 1) | A6XX_GRAS_2D_DST_BR_Y(0)); - - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, 0x3f); - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); - - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ - } + const struct pipe_box *sbox = &info->src.box; + const struct pipe_box *dbox = &info->dst.box; + struct fd_resource *src, *dst; + unsigned sshift, dshift; + + if (DEBUG_BLIT) { + fprintf(stderr, "buffer blit: "); + dump_blit_info(info); + } + + src = fd_resource(info->src.resource); + dst = fd_resource(info->dst.resource); + + debug_assert(src->layout.cpp == 1); + debug_assert(dst->layout.cpp == 1); + debug_assert(info->src.resource->format == info->dst.resource->format); + debug_assert((sbox->y == 0) && (sbox->height == 1)); + debug_assert((dbox->y == 0) && (dbox->height == 1)); + debug_assert((sbox->z == 0) && (sbox->depth == 1)); + debug_assert((dbox->z == 0) && (dbox->depth == 1)); + debug_assert(sbox->width == dbox->width); + debug_assert(info->src.level == 0); + debug_assert(info->dst.level == 0); + + /* + * Buffers can have dimensions bigger than max width, remap into + * multiple 1d blits to fit within max dimension + * + * Note that blob uses .ARRAY_PITCH=128 for blitting buffers, which + * seems to prevent overfetch related faults. 
Not quite sure what + * the deal is there. + * + * Low 6 bits of SRC/DST addresses need to be zero (ie. address + * aligned to 64) so we need to shift src/dst x1/x2 to make up the + * difference. On top of already splitting up the blit so width + * isn't > 16k. + * + * We perhaps could do a bit better, if src and dst are aligned but + * in the worst case this means we have to split the copy up into + * 16k (0x4000) minus 64 (0x40). + */ + + sshift = sbox->x & 0x3f; + dshift = dbox->x & 0x3f; + + emit_blit_setup(ring, PIPE_FORMAT_R8_UNORM, false, NULL); + + for (unsigned off = 0; off < sbox->width; off += (0x4000 - 0x40)) { + unsigned soff, doff, w, p; + + soff = (sbox->x + off) & ~0x3f; + doff = (dbox->x + off) & ~0x3f; + + w = MIN2(sbox->width - off, (0x4000 - 0x40)); + p = align(w, 64); + + debug_assert((soff + w) <= fd_bo_size(src->bo)); + debug_assert((doff + w) <= fd_bo_size(dst->bo)); + + /* + * Emit source: + */ + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); + OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(FMT6_8_UNORM) | + A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(WZYX) | 0x500000); + OUT_RING(ring, + A6XX_SP_PS_2D_SRC_SIZE_WIDTH(sshift + w) | + A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(1)); /* SP_PS_2D_SRC_SIZE */ + OUT_RELOC(ring, src->bo, soff, 0, 0); /* SP_PS_2D_SRC_LO/HI */ + OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(p)); + + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + /* + * Emit destination: + */ + OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); + OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_8_UNORM) | + A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); + OUT_RELOC(ring, dst->bo, doff, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RING(ring, A6XX_RB_2D_DST_PITCH(p)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + /* + * Blit command: + */ + OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(sshift)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(sshift + w - 1)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(0)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(0)); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); + OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dshift) | A6XX_GRAS_2D_DST_TL_Y(0)); + OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dshift + w - 1) | + A6XX_GRAS_2D_DST_BR_Y(0)); + + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, 0x3f); + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); + + OUT_PKT7(ring, CP_BLIT, 1); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ + } } static void -fd6_clear_ubwc(struct fd_batch *batch, struct fd_resource *rsc) - assert_dt +fd6_clear_ubwc(struct fd_batch *batch, struct fd_resource *rsc) assert_dt { - struct fd_ringbuffer *ring = fd_batch_get_prologue(batch); - union pipe_color_union color = {}; - - emit_blit_setup(ring, PIPE_FORMAT_R8_UNORM, false, &color); - - OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 13); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 
0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(0)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(0)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(0)); - - unsigned size = rsc->layout.slices[0].offset; - unsigned offset = 0; - - /* We could be more clever here and realize that we could use a - * larger width if the size is aligned to something more than a - * single page.. or even use a format larger than r8 in those - * cases. But for normal sized textures and even up to 16k x 16k - * at <= 4byte/pixel, we'll only go thru the loop once - */ - const unsigned w = 0x1000; - - /* ubwc size should always be page aligned: */ - assert((size % w) == 0); - - while (size > 0) { - const unsigned h = MIN2(0x4000, size / w); - /* width is already aligned to a suitable pitch: */ - const unsigned p = w; - - /* - * Emit destination: - */ - OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); - OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_8_UNORM) | - A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_2D_DST_LO/HI */ - OUT_RING(ring, A6XX_RB_2D_DST_PITCH(p)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - /* - * Blit command: - */ - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); - OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(w - 1) | A6XX_GRAS_2D_DST_BR_Y(h - 1)); - - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, 0x3f); - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, batch->ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); - - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ - - offset += w * h; - size -= w * h; - } - - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); - fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); - fd6_cache_inv(batch, ring); + struct fd_ringbuffer *ring = fd_batch_get_prologue(batch); + union pipe_color_union color = {}; + + emit_blit_setup(ring, PIPE_FORMAT_R8_UNORM, false, &color); + + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 13); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(0)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(0)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(0)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(0)); + + 
unsigned size = rsc->layout.slices[0].offset; + unsigned offset = 0; + + /* We could be more clever here and realize that we could use a + * larger width if the size is aligned to something more than a + * single page.. or even use a format larger than r8 in those + * cases. But for normal sized textures and even up to 16k x 16k + * at <= 4byte/pixel, we'll only go thru the loop once + */ + const unsigned w = 0x1000; + + /* ubwc size should always be page aligned: */ + assert((size % w) == 0); + + while (size > 0) { + const unsigned h = MIN2(0x4000, size / w); + /* width is already aligned to a suitable pitch: */ + const unsigned p = w; + + /* + * Emit destination: + */ + OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); + OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_8_UNORM) | + A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); + OUT_RELOC(ring, rsc->bo, offset, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RING(ring, A6XX_RB_2D_DST_PITCH(p)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + /* + * Blit command: + */ + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); + OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); + OUT_RING(ring, + A6XX_GRAS_2D_DST_BR_X(w - 1) | A6XX_GRAS_2D_DST_BR_Y(h - 1)); + + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, 0x3f); + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, batch->ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); + + OUT_PKT7(ring, CP_BLIT, 1); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ + + offset += w * h; + size -= w * h; + } + + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + fd6_cache_inv(batch, ring); } static void -emit_blit_dst(struct fd_ringbuffer *ring, struct pipe_resource *prsc, enum pipe_format pfmt, unsigned level, unsigned layer) +emit_blit_dst(struct fd_ringbuffer *ring, struct pipe_resource *prsc, + enum pipe_format pfmt, unsigned level, unsigned layer) { - struct fd_resource *dst = fd_resource(prsc); - enum a6xx_format fmt = fd6_pipe2color(pfmt); - enum a6xx_tile_mode tile = fd_resource_tile_mode(prsc, level); - enum a3xx_color_swap swap = fd6_resource_swap(dst, pfmt); - uint32_t pitch = fd_resource_pitch(dst, level); - bool ubwc_enabled = fd_resource_ubwc_enabled(dst, level); - unsigned off = fd_resource_offset(dst, level, layer); - - if (fmt == FMT6_Z24_UNORM_S8_UINT) - fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - - OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); - OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(fmt) | - A6XX_RB_2D_DST_INFO_TILE_MODE(tile) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(swap) | - COND(util_format_is_srgb(pfmt), A6XX_RB_2D_DST_INFO_SRGB) | - COND(ubwc_enabled, A6XX_RB_2D_DST_INFO_FLAGS)); - OUT_RELOC(ring, dst->bo, off, 0, 0); /* RB_2D_DST_LO/HI */ - OUT_RING(ring, A6XX_RB_2D_DST_PITCH(pitch)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - if (ubwc_enabled) { - OUT_PKT4(ring, REG_A6XX_RB_2D_DST_FLAGS, 6); - fd6_emit_flag_reference(ring, dst, level, layer); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } + struct fd_resource *dst = 
fd_resource(prsc); + enum a6xx_format fmt = fd6_pipe2color(pfmt); + enum a6xx_tile_mode tile = fd_resource_tile_mode(prsc, level); + enum a3xx_color_swap swap = fd6_resource_swap(dst, pfmt); + uint32_t pitch = fd_resource_pitch(dst, level); + bool ubwc_enabled = fd_resource_ubwc_enabled(dst, level); + unsigned off = fd_resource_offset(dst, level, layer); + + if (fmt == FMT6_Z24_UNORM_S8_UINT) + fmt = FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + + OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); + OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(fmt) | + A6XX_RB_2D_DST_INFO_TILE_MODE(tile) | + A6XX_RB_2D_DST_INFO_COLOR_SWAP(swap) | + COND(util_format_is_srgb(pfmt), A6XX_RB_2D_DST_INFO_SRGB) | + COND(ubwc_enabled, A6XX_RB_2D_DST_INFO_FLAGS)); + OUT_RELOC(ring, dst->bo, off, 0, 0); /* RB_2D_DST_LO/HI */ + OUT_RING(ring, A6XX_RB_2D_DST_PITCH(pitch)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + if (ubwc_enabled) { + OUT_PKT4(ring, REG_A6XX_RB_2D_DST_FLAGS, 6); + fd6_emit_flag_reference(ring, dst, level, layer); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } } static void -emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, unsigned layer, unsigned nr_samples) +emit_blit_src(struct fd_ringbuffer *ring, const struct pipe_blit_info *info, + unsigned layer, unsigned nr_samples) { - struct fd_resource *src = fd_resource(info->src.resource); - enum a6xx_format sfmt = fd6_pipe2color(info->src.format); - enum a6xx_tile_mode stile = fd_resource_tile_mode(info->src.resource, info->src.level); - enum a3xx_color_swap sswap = fd6_resource_swap(src, info->src.format); - uint32_t pitch = fd_resource_pitch(src, info->src.level); - bool subwc_enabled = fd_resource_ubwc_enabled(src, info->src.level); - unsigned soff = fd_resource_offset(src, info->src.level, layer); - uint32_t width = u_minify(src->b.b.width0, info->src.level) * nr_samples; - uint32_t height = u_minify(src->b.b.height0, info->src.level); - uint32_t filter = 0; - - if (info->filter == PIPE_TEX_FILTER_LINEAR) - filter = A6XX_SP_PS_2D_SRC_INFO_FILTER; - - enum a3xx_msaa_samples samples = fd_msaa_samples(src->b.b.nr_samples); - - if (sfmt == FMT6_10_10_10_2_UNORM_DEST) - sfmt = FMT6_10_10_10_2_UNORM; - - OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(stile) | - A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(sswap) | - A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) | - COND(samples > MSAA_ONE && (info->mask & PIPE_MASK_RGBA), - A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | - COND(subwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) | - COND(util_format_is_srgb(info->src.format), A6XX_SP_PS_2D_SRC_INFO_SRGB) | - 0x500000 | filter); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(width) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(height)); /* SP_PS_2D_SRC_SIZE */ - OUT_RELOC(ring, src->bo, soff, 0, 0); /* SP_PS_2D_SRC_LO/HI */ - OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(pitch)); - - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - if (subwc_enabled) { - OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_FLAGS, 6); - fd6_emit_flag_reference(ring, src, info->src.level, layer); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } + struct fd_resource *src = fd_resource(info->src.resource); + enum 
a6xx_format sfmt = fd6_pipe2color(info->src.format); + enum a6xx_tile_mode stile = + fd_resource_tile_mode(info->src.resource, info->src.level); + enum a3xx_color_swap sswap = fd6_resource_swap(src, info->src.format); + uint32_t pitch = fd_resource_pitch(src, info->src.level); + bool subwc_enabled = fd_resource_ubwc_enabled(src, info->src.level); + unsigned soff = fd_resource_offset(src, info->src.level, layer); + uint32_t width = u_minify(src->b.b.width0, info->src.level) * nr_samples; + uint32_t height = u_minify(src->b.b.height0, info->src.level); + uint32_t filter = 0; + + if (info->filter == PIPE_TEX_FILTER_LINEAR) + filter = A6XX_SP_PS_2D_SRC_INFO_FILTER; + + enum a3xx_msaa_samples samples = fd_msaa_samples(src->b.b.nr_samples); + + if (sfmt == FMT6_10_10_10_2_UNORM_DEST) + sfmt = FMT6_10_10_10_2_UNORM; + + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); + OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | + A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(stile) | + A6XX_SP_PS_2D_SRC_INFO_COLOR_SWAP(sswap) | + A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) | + COND(samples > MSAA_ONE && (info->mask & PIPE_MASK_RGBA), + A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | + COND(subwc_enabled, A6XX_SP_PS_2D_SRC_INFO_FLAGS) | + COND(util_format_is_srgb(info->src.format), + A6XX_SP_PS_2D_SRC_INFO_SRGB) | + 0x500000 | filter); + OUT_RING(ring, + A6XX_SP_PS_2D_SRC_SIZE_WIDTH(width) | + A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(height)); /* SP_PS_2D_SRC_SIZE */ + OUT_RELOC(ring, src->bo, soff, 0, 0); /* SP_PS_2D_SRC_LO/HI */ + OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(pitch)); + + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + if (subwc_enabled) { + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_FLAGS, 6); + fd6_emit_flag_reference(ring, src, info->src.level, layer); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } } static void -emit_blit_texture(struct fd_context *ctx, - struct fd_ringbuffer *ring, const struct pipe_blit_info *info) +emit_blit_texture(struct fd_context *ctx, struct fd_ringbuffer *ring, + const struct pipe_blit_info *info) { - const struct pipe_box *sbox = &info->src.box; - const struct pipe_box *dbox = &info->dst.box; - struct fd_resource *dst; - int sx1, sy1, sx2, sy2; - int dx1, dy1, dx2, dy2; - - if (DEBUG_BLIT) { - fprintf(stderr, "texture blit: "); - dump_blit_info(info); - } - - dst = fd_resource(info->dst.resource); - - uint32_t nr_samples = fd_resource_nr_samples(&dst->b.b); - - sx1 = sbox->x * nr_samples; - sy1 = sbox->y; - sx2 = (sbox->x + sbox->width) * nr_samples - 1; - sy2 = sbox->y + sbox->height - 1; - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(sx1)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(sx2)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(sy1)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(sy2)); - - dx1 = dbox->x * nr_samples; - dy1 = dbox->y; - dx2 = (dbox->x + dbox->width) * nr_samples - 1; - dy2 = dbox->y + dbox->height - 1; - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); - OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dx1) | A6XX_GRAS_2D_DST_TL_Y(dy1)); - OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dx2) | A6XX_GRAS_2D_DST_BR_Y(dy2)); - - if (info->scissor_enable) { - OUT_PKT4(ring, REG_A6XX_GRAS_2D_RESOLVE_CNTL_1, 2); - OUT_RING(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1_X(info->scissor.minx) | - A6XX_GRAS_2D_RESOLVE_CNTL_1_Y(info->scissor.miny)); - OUT_RING(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1_X(info->scissor.maxx - 1) | - 
A6XX_GRAS_2D_RESOLVE_CNTL_1_Y(info->scissor.maxy - 1)); - } - - emit_blit_setup(ring, info->dst.format, info->scissor_enable, NULL); - - for (unsigned i = 0; i < info->dst.box.depth; i++) { - - emit_blit_src(ring, info, sbox->z + i, nr_samples); - emit_blit_dst(ring, info->dst.resource, info->dst.format, info->dst.level, dbox->z + i); - - /* - * Blit command: - */ - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, 0x3f); - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); - - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ - } + const struct pipe_box *sbox = &info->src.box; + const struct pipe_box *dbox = &info->dst.box; + struct fd_resource *dst; + int sx1, sy1, sx2, sy2; + int dx1, dy1, dx2, dy2; + + if (DEBUG_BLIT) { + fprintf(stderr, "texture blit: "); + dump_blit_info(info); + } + + dst = fd_resource(info->dst.resource); + + uint32_t nr_samples = fd_resource_nr_samples(&dst->b.b); + + sx1 = sbox->x * nr_samples; + sy1 = sbox->y; + sx2 = (sbox->x + sbox->width) * nr_samples - 1; + sy2 = sbox->y + sbox->height - 1; + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(sx1)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(sx2)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(sy1)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(sy2)); + + dx1 = dbox->x * nr_samples; + dy1 = dbox->y; + dx2 = (dbox->x + dbox->width) * nr_samples - 1; + dy2 = dbox->y + dbox->height - 1; + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); + OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(dx1) | A6XX_GRAS_2D_DST_TL_Y(dy1)); + OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(dx2) | A6XX_GRAS_2D_DST_BR_Y(dy2)); + + if (info->scissor_enable) { + OUT_PKT4(ring, REG_A6XX_GRAS_2D_RESOLVE_CNTL_1, 2); + OUT_RING(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1_X(info->scissor.minx) | + A6XX_GRAS_2D_RESOLVE_CNTL_1_Y(info->scissor.miny)); + OUT_RING(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1_X(info->scissor.maxx - 1) | + A6XX_GRAS_2D_RESOLVE_CNTL_1_Y(info->scissor.maxy - 1)); + } + + emit_blit_setup(ring, info->dst.format, info->scissor_enable, NULL); + + for (unsigned i = 0; i < info->dst.box.depth; i++) { + + emit_blit_src(ring, info, sbox->z + i, nr_samples); + emit_blit_dst(ring, info->dst.resource, info->dst.format, info->dst.level, + dbox->z + i); + + /* + * Blit command: + */ + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, 0x3f); + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); + + OUT_PKT7(ring, CP_BLIT, 1); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ + } } static void -emit_clear_color(struct fd_ringbuffer *ring, - enum pipe_format pfmt, union pipe_color_union *color) +emit_clear_color(struct fd_ringbuffer *ring, enum pipe_format pfmt, + union pipe_color_union *color) { - switch (pfmt) { - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X24S8_UINT: { - uint32_t depth_unorm24 = color->f[0] * ((1u << 24) - 1); - uint8_t stencil = color->ui[1]; - color->ui[0] = depth_unorm24 & 0xff; - color->ui[1] = (depth_unorm24 >> 8) & 0xff; - color->ui[2] = (depth_unorm24 >> 16) & 0xff; - color->ui[3] = stencil; - break; - } - default: - break; - } - - OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); - 
switch (fd6_ifmt(fd6_pipe2color(pfmt))) { - case R2D_UNORM8: - case R2D_UNORM8_SRGB: - /* The r2d ifmt is badly named, it also covers the signed case: */ - if (util_format_is_snorm(pfmt)) { - OUT_RING(ring, float_to_byte_tex(color->f[0])); - OUT_RING(ring, float_to_byte_tex(color->f[1])); - OUT_RING(ring, float_to_byte_tex(color->f[2])); - OUT_RING(ring, float_to_byte_tex(color->f[3])); - } else { - OUT_RING(ring, float_to_ubyte(color->f[0])); - OUT_RING(ring, float_to_ubyte(color->f[1])); - OUT_RING(ring, float_to_ubyte(color->f[2])); - OUT_RING(ring, float_to_ubyte(color->f[3])); - } - break; - case R2D_FLOAT16: - OUT_RING(ring, _mesa_float_to_half(color->f[0])); - OUT_RING(ring, _mesa_float_to_half(color->f[1])); - OUT_RING(ring, _mesa_float_to_half(color->f[2])); - OUT_RING(ring, _mesa_float_to_half(color->f[3])); - break; - case R2D_FLOAT32: - case R2D_INT32: - case R2D_INT16: - case R2D_INT8: - default: - OUT_RING(ring, color->ui[0]); - OUT_RING(ring, color->ui[1]); - OUT_RING(ring, color->ui[2]); - OUT_RING(ring, color->ui[3]); - break; - } + switch (pfmt) { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X24S8_UINT: { + uint32_t depth_unorm24 = color->f[0] * ((1u << 24) - 1); + uint8_t stencil = color->ui[1]; + color->ui[0] = depth_unorm24 & 0xff; + color->ui[1] = (depth_unorm24 >> 8) & 0xff; + color->ui[2] = (depth_unorm24 >> 16) & 0xff; + color->ui[3] = stencil; + break; + } + default: + break; + } + + OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); + switch (fd6_ifmt(fd6_pipe2color(pfmt))) { + case R2D_UNORM8: + case R2D_UNORM8_SRGB: + /* The r2d ifmt is badly named, it also covers the signed case: */ + if (util_format_is_snorm(pfmt)) { + OUT_RING(ring, float_to_byte_tex(color->f[0])); + OUT_RING(ring, float_to_byte_tex(color->f[1])); + OUT_RING(ring, float_to_byte_tex(color->f[2])); + OUT_RING(ring, float_to_byte_tex(color->f[3])); + } else { + OUT_RING(ring, float_to_ubyte(color->f[0])); + OUT_RING(ring, float_to_ubyte(color->f[1])); + OUT_RING(ring, float_to_ubyte(color->f[2])); + OUT_RING(ring, float_to_ubyte(color->f[3])); + } + break; + case R2D_FLOAT16: + OUT_RING(ring, _mesa_float_to_half(color->f[0])); + OUT_RING(ring, _mesa_float_to_half(color->f[1])); + OUT_RING(ring, _mesa_float_to_half(color->f[2])); + OUT_RING(ring, _mesa_float_to_half(color->f[3])); + break; + case R2D_FLOAT32: + case R2D_INT32: + case R2D_INT16: + case R2D_INT8: + default: + OUT_RING(ring, color->ui[0]); + OUT_RING(ring, color->ui[1]); + OUT_RING(ring, color->ui[2]); + OUT_RING(ring, color->ui[3]); + break; + } } /** @@ -759,210 +766,213 @@ emit_clear_color(struct fd_ringbuffer *ring, static union pipe_color_union convert_color(enum pipe_format format, union pipe_color_union *pcolor) { - union pipe_color_union color = *pcolor; + union pipe_color_union color = *pcolor; - /* For solid-fill blits, the hw isn't going to convert from - * linear to srgb for us: - */ - if (util_format_is_srgb(format)) { - for (int i = 0; i < 3; i++) - color.f[i] = util_format_linear_to_srgb_float(color.f[i]); - } + /* For solid-fill blits, the hw isn't going to convert from + * linear to srgb for us: + */ + if (util_format_is_srgb(format)) { + for (int i = 0; i < 3; i++) + color.f[i] = util_format_linear_to_srgb_float(color.f[i]); + } - if (util_format_is_snorm(format)) { - for (int i = 0; i < 3; i++) - color.f[i] = CLAMP(color.f[i], -1.0f, 1.0f); - } + if (util_format_is_snorm(format)) { + for (int i = 0; i < 3; i++) + color.f[i] = CLAMP(color.f[i], -1.0f, 1.0f); + } - /* Note 
that float_to_ubyte() already clamps, for the unorm case */ + /* Note that float_to_ubyte() already clamps, for the unorm case */ - return color; + return color; } void -fd6_clear_surface(struct fd_context *ctx, - struct fd_ringbuffer *ring, struct pipe_surface *psurf, - uint32_t width, uint32_t height, union pipe_color_union *color) +fd6_clear_surface(struct fd_context *ctx, struct fd_ringbuffer *ring, + struct pipe_surface *psurf, uint32_t width, uint32_t height, + union pipe_color_union *color) { - if (DEBUG_BLIT) { - fprintf(stderr, "surface clear:\ndst resource: "); - util_dump_resource(stderr, psurf->texture); - fprintf(stderr, "\n"); - } + if (DEBUG_BLIT) { + fprintf(stderr, "surface clear:\ndst resource: "); + util_dump_resource(stderr, psurf->texture); + fprintf(stderr, "\n"); + } - uint32_t nr_samples = fd_resource_nr_samples(psurf->texture); - OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); - OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(width * nr_samples - 1) | - A6XX_GRAS_2D_DST_BR_Y(height - 1)); + uint32_t nr_samples = fd_resource_nr_samples(psurf->texture); + OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); + OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); + OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(width * nr_samples - 1) | + A6XX_GRAS_2D_DST_BR_Y(height - 1)); - union pipe_color_union clear_color = convert_color(psurf->format, color); + union pipe_color_union clear_color = convert_color(psurf->format, color); - emit_clear_color(ring, psurf->format, &clear_color); - emit_blit_setup(ring, psurf->format, false, &clear_color); + emit_clear_color(ring, psurf->format, &clear_color); + emit_blit_setup(ring, psurf->format, false, &clear_color); - for (unsigned i = psurf->u.tex.first_layer; i <= psurf->u.tex.last_layer; i++) { - emit_blit_dst(ring, psurf->texture, psurf->format, psurf->u.tex.level, i); + for (unsigned i = psurf->u.tex.first_layer; i <= psurf->u.tex.last_layer; + i++) { + emit_blit_dst(ring, psurf->texture, psurf->format, psurf->u.tex.level, i); - /* - * Blit command: - */ - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, 0x3f); - OUT_WFI5(ring); + /* + * Blit command: + */ + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, 0x3f); + OUT_WFI5(ring); - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, ctx->screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + OUT_PKT7(ring, CP_BLIT, 1); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); - OUT_WFI5(ring); + OUT_WFI5(ring); - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ - } + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, 0); /* RB_UNKNOWN_8E04 */ + } } void fd6_resolve_tile(struct fd_batch *batch, struct fd_ringbuffer *ring, - uint32_t base, struct pipe_surface *psurf) + uint32_t base, struct pipe_surface *psurf) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - uint64_t gmem_base = batch->ctx->screen->gmem_base + base; - uint32_t gmem_pitch = gmem->bin_w * batch->framebuffer.samples * - util_format_get_blocksize(psurf->format); - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); - OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(psurf->width - 1) | - A6XX_GRAS_2D_DST_BR_Y(psurf->height - 1)); - - OUT_PKT4(ring, 
REG_A6XX_GRAS_2D_SRC_TL_X, 4); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(0)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(psurf->width - 1)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(psurf->height - 1)); - - /* Enable scissor bit, which will take into account the window scissor - * which is set per-tile - */ - emit_blit_setup(ring, psurf->format, true, NULL); - - /* We shouldn't be using GMEM in the layered rendering case: */ - assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - emit_blit_dst(ring, psurf->texture, psurf->format, psurf->u.tex.level, - psurf->u.tex.first_layer); - - enum a6xx_format sfmt = fd6_pipe2color(psurf->format); - enum a3xx_msaa_samples samples = fd_msaa_samples(batch->framebuffer.samples); - - OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | - A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_2) | - A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) | - COND(samples > MSAA_ONE, A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | - COND(util_format_is_srgb(psurf->format), A6XX_SP_PS_2D_SRC_INFO_SRGB) | - A6XX_SP_PS_2D_SRC_INFO_UNK20 | - A6XX_SP_PS_2D_SRC_INFO_UNK22); - OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(psurf->width) | - A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(psurf->height)); - OUT_RING(ring, gmem_base); /* SP_PS_2D_SRC_LO */ - OUT_RING(ring, gmem_base >> 32); /* SP_PS_2D_SRC_HI */ - OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(gmem_pitch)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - /* sync GMEM writes with CACHE. */ - fd6_cache_inv(batch, ring); - - /* Wait for CACHE_INVALIDATE to land */ - fd_wfi(batch, ring); - - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - OUT_WFI5(ring); - - /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to - * sysmem, and we generally assume that GMEM renderpasses leave their - * results in sysmem, so we need to flush manually here. 
- */ - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + uint64_t gmem_base = batch->ctx->screen->gmem_base + base; + uint32_t gmem_pitch = gmem->bin_w * batch->framebuffer.samples * + util_format_get_blocksize(psurf->format); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); + OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); + OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(psurf->width - 1) | + A6XX_GRAS_2D_DST_BR_Y(psurf->height - 1)); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_SRC_TL_X, 4); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_X(0)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_X(psurf->width - 1)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_TL_Y(0)); + OUT_RING(ring, A6XX_GRAS_2D_SRC_BR_Y(psurf->height - 1)); + + /* Enable scissor bit, which will take into account the window scissor + * which is set per-tile + */ + emit_blit_setup(ring, psurf->format, true, NULL); + + /* We shouldn't be using GMEM in the layered rendering case: */ + assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + emit_blit_dst(ring, psurf->texture, psurf->format, psurf->u.tex.level, + psurf->u.tex.first_layer); + + enum a6xx_format sfmt = fd6_pipe2color(psurf->format); + enum a3xx_msaa_samples samples = fd_msaa_samples(batch->framebuffer.samples); + + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 10); + OUT_RING( + ring, + A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(sfmt) | + A6XX_SP_PS_2D_SRC_INFO_TILE_MODE(TILE6_2) | + A6XX_SP_PS_2D_SRC_INFO_SAMPLES(samples) | + COND(samples > MSAA_ONE, A6XX_SP_PS_2D_SRC_INFO_SAMPLES_AVERAGE) | + COND(util_format_is_srgb(psurf->format), A6XX_SP_PS_2D_SRC_INFO_SRGB) | + A6XX_SP_PS_2D_SRC_INFO_UNK20 | A6XX_SP_PS_2D_SRC_INFO_UNK22); + OUT_RING(ring, A6XX_SP_PS_2D_SRC_SIZE_WIDTH(psurf->width) | + A6XX_SP_PS_2D_SRC_SIZE_HEIGHT(psurf->height)); + OUT_RING(ring, gmem_base); /* SP_PS_2D_SRC_LO */ + OUT_RING(ring, gmem_base >> 32); /* SP_PS_2D_SRC_HI */ + OUT_RING(ring, A6XX_SP_PS_2D_SRC_PITCH_PITCH(gmem_pitch)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + /* sync GMEM writes with CACHE. */ + fd6_cache_inv(batch, ring); + + /* Wait for CACHE_INVALIDATE to land */ + fd_wfi(batch, ring); + + OUT_PKT7(ring, CP_BLIT, 1); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + OUT_WFI5(ring); + + /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to + * sysmem, and we generally assume that GMEM renderpasses leave their + * results in sysmem, so we need to flush manually here. 
+ */ + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); } static bool -handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info) - assert_dt +handle_rgba_blit(struct fd_context *ctx, + const struct pipe_blit_info *info) assert_dt { - struct fd_batch *batch; + struct fd_batch *batch; - debug_assert(!(info->mask & PIPE_MASK_ZS)); + debug_assert(!(info->mask & PIPE_MASK_ZS)); - if (!can_do_blit(info)) - return false; + if (!can_do_blit(info)) + return false; - batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); + batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - fd_batch_resource_read(batch, fd_resource(info->src.resource)); - fd_batch_resource_write(batch, fd_resource(info->dst.resource)); + fd_batch_resource_read(batch, fd_resource(info->src.resource)); + fd_batch_resource_write(batch, fd_resource(info->dst.resource)); - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); - ASSERTED bool ret = fd_batch_lock_submit(batch); - assert(ret); + ASSERTED bool ret = fd_batch_lock_submit(batch); + assert(ret); - /* Clearing last_fence must come after the batch dependency tracking - * (resource_read()/resource_write()), as that can trigger a flush, - * re-populating last_fence - */ - fd_fence_ref(&ctx->last_fence, NULL); + /* Clearing last_fence must come after the batch dependency tracking + * (resource_read()/resource_write()), as that can trigger a flush, + * re-populating last_fence + */ + fd_fence_ref(&ctx->last_fence, NULL); - fd_batch_update_queries(batch); + fd_batch_update_queries(batch); - emit_setup(batch); + emit_setup(batch); - trace_start_blit(&batch->trace, info->src.resource->target, info->dst.resource->target); + trace_start_blit(&batch->trace, info->src.resource->target, + info->dst.resource->target); - if ((info->src.resource->target == PIPE_BUFFER) && - (info->dst.resource->target == PIPE_BUFFER)) { - assert(fd_resource(info->src.resource)->layout.tile_mode == TILE6_LINEAR); - assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE6_LINEAR); - emit_blit_buffer(ctx, batch->draw, info); - } else { - /* I don't *think* we need to handle blits between buffer <-> !buffer */ - debug_assert(info->src.resource->target != PIPE_BUFFER); - debug_assert(info->dst.resource->target != PIPE_BUFFER); - emit_blit_texture(ctx, batch->draw, info); - } + if ((info->src.resource->target == PIPE_BUFFER) && + (info->dst.resource->target == PIPE_BUFFER)) { + assert(fd_resource(info->src.resource)->layout.tile_mode == TILE6_LINEAR); + assert(fd_resource(info->dst.resource)->layout.tile_mode == TILE6_LINEAR); + emit_blit_buffer(ctx, batch->draw, info); + } else { + /* I don't *think* we need to handle blits between buffer <-> !buffer */ + debug_assert(info->src.resource->target != PIPE_BUFFER); + debug_assert(info->dst.resource->target != PIPE_BUFFER); + emit_blit_texture(ctx, batch->draw, info); + } - trace_end_blit(&batch->trace); + trace_end_blit(&batch->trace); - fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_DEPTH_TS, true); - fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); - fd6_cache_inv(batch, batch->draw); + fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, batch->draw, PC_CCU_FLUSH_DEPTH_TS, true); + fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); + fd6_cache_inv(batch, batch->draw); - fd_batch_unlock_submit(batch); + 
fd_batch_unlock_submit(batch); - fd_resource(info->dst.resource)->valid = true; - batch->needs_flush = true; + fd_resource(info->dst.resource)->valid = true; + batch->needs_flush = true; - fd_batch_flush(batch); - fd_batch_reference(&batch, NULL); + fd_batch_flush(batch); + fd_batch_reference(&batch, NULL); - /* Acc query state will have been dirtied by our fd_batch_update_queries, so - * the ctx->batch may need to turn its queries back on. - */ - ctx->update_active_queries = true; + /* Acc query state will have been dirtied by our fd_batch_update_queries, so + * the ctx->batch may need to turn its queries back on. + */ + ctx->update_active_queries = true; - return true; + return true; } /** @@ -972,14 +982,14 @@ handle_rgba_blit(struct fd_context *ctx, const struct pipe_blit_info *info) * ourself and never "fail". */ static bool -do_rewritten_blit(struct fd_context *ctx, const struct pipe_blit_info *info) - assert_dt +do_rewritten_blit(struct fd_context *ctx, + const struct pipe_blit_info *info) assert_dt { - bool success = handle_rgba_blit(ctx, info); - if (!success) - success = fd_blitter_blit(ctx, info); - debug_assert(success); /* fallback should never fail! */ - return success; + bool success = handle_rgba_blit(ctx, info); + if (!success) + success = fd_blitter_blit(ctx, info); + debug_assert(success); /* fallback should never fail! */ + return success; } /** @@ -987,172 +997,170 @@ do_rewritten_blit(struct fd_context *ctx, const struct pipe_blit_info *info) * blit into an equivilant format that we can handle */ static bool -handle_zs_blit(struct fd_context *ctx, const struct pipe_blit_info *info) - assert_dt +handle_zs_blit(struct fd_context *ctx, + const struct pipe_blit_info *info) assert_dt { - struct pipe_blit_info blit = *info; - - if (DEBUG_BLIT) { - fprintf(stderr, "---- handle_zs_blit: "); - dump_blit_info(info); - } - - struct fd_resource *src = fd_resource(info->src.resource); - struct fd_resource *dst = fd_resource(info->dst.resource); - - switch (info->dst.format) { - case PIPE_FORMAT_S8_UINT: - debug_assert(info->mask == PIPE_MASK_S); - blit.mask = PIPE_MASK_R; - blit.src.format = PIPE_FORMAT_R8_UINT; - blit.dst.format = PIPE_FORMAT_R8_UINT; - return do_rewritten_blit(ctx, &blit); - - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - if (info->mask & PIPE_MASK_Z) { - blit.mask = PIPE_MASK_R; - blit.src.format = PIPE_FORMAT_R32_FLOAT; - blit.dst.format = PIPE_FORMAT_R32_FLOAT; - do_rewritten_blit(ctx, &blit); - } - - if (info->mask & PIPE_MASK_S) { - blit.mask = PIPE_MASK_R; - blit.src.format = PIPE_FORMAT_R8_UINT; - blit.dst.format = PIPE_FORMAT_R8_UINT; - blit.src.resource = &src->stencil->b.b; - blit.dst.resource = &dst->stencil->b.b; - do_rewritten_blit(ctx, &blit); - } - - return true; - - case PIPE_FORMAT_Z16_UNORM: - blit.mask = PIPE_MASK_R; - blit.src.format = PIPE_FORMAT_R16_UNORM; - blit.dst.format = PIPE_FORMAT_R16_UNORM; - return do_rewritten_blit(ctx, &blit); - - case PIPE_FORMAT_Z32_UNORM: - case PIPE_FORMAT_Z32_FLOAT: - debug_assert(info->mask == PIPE_MASK_Z); - blit.mask = PIPE_MASK_R; - blit.src.format = PIPE_FORMAT_R32_UINT; - blit.dst.format = PIPE_FORMAT_R32_UINT; - return do_rewritten_blit(ctx, &blit); - - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - blit.mask = 0; - if (info->mask & PIPE_MASK_Z) - blit.mask |= PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B; - if (info->mask & PIPE_MASK_S) - blit.mask |= PIPE_MASK_A; - blit.src.format = PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; - blit.dst.format = PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; 
- /* non-UBWC Z24_UNORM_S8_UINT_AS_R8G8B8A8 is broken on a630, fall back to - * 8888_unorm. - */ - if (!ctx->screen->info.a6xx.has_z24uint_s8uint) { - if (!src->layout.ubwc) - blit.src.format = PIPE_FORMAT_RGBA8888_UNORM; - if (!dst->layout.ubwc) - blit.dst.format = PIPE_FORMAT_RGBA8888_UNORM; - } - return fd_blitter_blit(ctx, &blit); - - default: - return false; - } + struct pipe_blit_info blit = *info; + + if (DEBUG_BLIT) { + fprintf(stderr, "---- handle_zs_blit: "); + dump_blit_info(info); + } + + struct fd_resource *src = fd_resource(info->src.resource); + struct fd_resource *dst = fd_resource(info->dst.resource); + + switch (info->dst.format) { + case PIPE_FORMAT_S8_UINT: + debug_assert(info->mask == PIPE_MASK_S); + blit.mask = PIPE_MASK_R; + blit.src.format = PIPE_FORMAT_R8_UINT; + blit.dst.format = PIPE_FORMAT_R8_UINT; + return do_rewritten_blit(ctx, &blit); + + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + if (info->mask & PIPE_MASK_Z) { + blit.mask = PIPE_MASK_R; + blit.src.format = PIPE_FORMAT_R32_FLOAT; + blit.dst.format = PIPE_FORMAT_R32_FLOAT; + do_rewritten_blit(ctx, &blit); + } + + if (info->mask & PIPE_MASK_S) { + blit.mask = PIPE_MASK_R; + blit.src.format = PIPE_FORMAT_R8_UINT; + blit.dst.format = PIPE_FORMAT_R8_UINT; + blit.src.resource = &src->stencil->b.b; + blit.dst.resource = &dst->stencil->b.b; + do_rewritten_blit(ctx, &blit); + } + + return true; + + case PIPE_FORMAT_Z16_UNORM: + blit.mask = PIPE_MASK_R; + blit.src.format = PIPE_FORMAT_R16_UNORM; + blit.dst.format = PIPE_FORMAT_R16_UNORM; + return do_rewritten_blit(ctx, &blit); + + case PIPE_FORMAT_Z32_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + debug_assert(info->mask == PIPE_MASK_Z); + blit.mask = PIPE_MASK_R; + blit.src.format = PIPE_FORMAT_R32_UINT; + blit.dst.format = PIPE_FORMAT_R32_UINT; + return do_rewritten_blit(ctx, &blit); + + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + blit.mask = 0; + if (info->mask & PIPE_MASK_Z) + blit.mask |= PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B; + if (info->mask & PIPE_MASK_S) + blit.mask |= PIPE_MASK_A; + blit.src.format = PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + blit.dst.format = PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8; + /* non-UBWC Z24_UNORM_S8_UINT_AS_R8G8B8A8 is broken on a630, fall back to + * 8888_unorm. 
+ */ + if (!ctx->screen->info.a6xx.has_z24uint_s8uint) { + if (!src->layout.ubwc) + blit.src.format = PIPE_FORMAT_RGBA8888_UNORM; + if (!dst->layout.ubwc) + blit.dst.format = PIPE_FORMAT_RGBA8888_UNORM; + } + return fd_blitter_blit(ctx, &blit); + + default: + return false; + } } static bool -handle_compressed_blit(struct fd_context *ctx, const struct pipe_blit_info *info) - assert_dt +handle_compressed_blit(struct fd_context *ctx, + const struct pipe_blit_info *info) assert_dt { - struct pipe_blit_info blit = *info; + struct pipe_blit_info blit = *info; - if (DEBUG_BLIT) { - fprintf(stderr, "---- handle_compressed_blit: "); - dump_blit_info(info); - } + if (DEBUG_BLIT) { + fprintf(stderr, "---- handle_compressed_blit: "); + dump_blit_info(info); + } - if (info->src.format != info->dst.format) - return fd_blitter_blit(ctx, info); + if (info->src.format != info->dst.format) + return fd_blitter_blit(ctx, info); - if (util_format_get_blocksize(info->src.format) == 8) { - blit.src.format = blit.dst.format = PIPE_FORMAT_R16G16B16A16_UINT; - } else { - debug_assert(util_format_get_blocksize(info->src.format) == 16); - blit.src.format = blit.dst.format = PIPE_FORMAT_R32G32B32A32_UINT; - } + if (util_format_get_blocksize(info->src.format) == 8) { + blit.src.format = blit.dst.format = PIPE_FORMAT_R16G16B16A16_UINT; + } else { + debug_assert(util_format_get_blocksize(info->src.format) == 16); + blit.src.format = blit.dst.format = PIPE_FORMAT_R32G32B32A32_UINT; + } - int bw = util_format_get_blockwidth(info->src.format); - int bh = util_format_get_blockheight(info->src.format); + int bw = util_format_get_blockwidth(info->src.format); + int bh = util_format_get_blockheight(info->src.format); - /* NOTE: x/y *must* be aligned to block boundary (ie. in - * glCompressedTexSubImage2D()) but width/height may not - * be: - */ + /* NOTE: x/y *must* be aligned to block boundary (ie. 
in + * glCompressedTexSubImage2D()) but width/height may not + * be: + */ - debug_assert((blit.src.box.x % bw) == 0); - debug_assert((blit.src.box.y % bh) == 0); + debug_assert((blit.src.box.x % bw) == 0); + debug_assert((blit.src.box.y % bh) == 0); - blit.src.box.x /= bw; - blit.src.box.y /= bh; - blit.src.box.width = DIV_ROUND_UP(blit.src.box.width, bw); - blit.src.box.height = DIV_ROUND_UP(blit.src.box.height, bh); + blit.src.box.x /= bw; + blit.src.box.y /= bh; + blit.src.box.width = DIV_ROUND_UP(blit.src.box.width, bw); + blit.src.box.height = DIV_ROUND_UP(blit.src.box.height, bh); - debug_assert((blit.dst.box.x % bw) == 0); - debug_assert((blit.dst.box.y % bh) == 0); + debug_assert((blit.dst.box.x % bw) == 0); + debug_assert((blit.dst.box.y % bh) == 0); - blit.dst.box.x /= bw; - blit.dst.box.y /= bh; - blit.dst.box.width = DIV_ROUND_UP(blit.dst.box.width, bw); - blit.dst.box.height = DIV_ROUND_UP(blit.dst.box.height, bh); + blit.dst.box.x /= bw; + blit.dst.box.y /= bh; + blit.dst.box.width = DIV_ROUND_UP(blit.dst.box.width, bw); + blit.dst.box.height = DIV_ROUND_UP(blit.dst.box.height, bh); - return do_rewritten_blit(ctx, &blit); + return do_rewritten_blit(ctx, &blit); } static bool -fd6_blit(struct fd_context *ctx, const struct pipe_blit_info *info) - assert_dt +fd6_blit(struct fd_context *ctx, const struct pipe_blit_info *info) assert_dt { - if (info->mask & PIPE_MASK_ZS) - return handle_zs_blit(ctx, info); - if (util_format_is_compressed(info->src.format) || - util_format_is_compressed(info->dst.format)) - return handle_compressed_blit(ctx, info); + if (info->mask & PIPE_MASK_ZS) + return handle_zs_blit(ctx, info); + if (util_format_is_compressed(info->src.format) || + util_format_is_compressed(info->dst.format)) + return handle_compressed_blit(ctx, info); - return handle_rgba_blit(ctx, info); + return handle_rgba_blit(ctx, info); } void -fd6_blitter_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_blitter_init(struct pipe_context *pctx) disable_thread_safety_analysis { - fd_context(pctx)->clear_ubwc = fd6_clear_ubwc; + fd_context(pctx)->clear_ubwc = fd6_clear_ubwc; - if (FD_DBG(NOBLIT)) - return; + if (FD_DBG(NOBLIT)) + return; - fd_context(pctx)->blit = fd6_blit; + fd_context(pctx)->blit = fd6_blit; } unsigned fd6_tile_mode(const struct pipe_resource *tmpl) { - /* if the mipmap level 0 is still too small to be tiled, then don't - * bother pretending: - */ - if (fd_resource_level_linear(tmpl, 0)) - return TILE6_LINEAR; - - /* basically just has to be a format we can blit, so uploads/downloads - * via linear staging buffer works: - */ - if (ok_format(tmpl->format)) - return TILE6_3; - - return TILE6_LINEAR; + /* if the mipmap level 0 is still too small to be tiled, then don't + * bother pretending: + */ + if (fd_resource_level_linear(tmpl, 0)) + return TILE6_LINEAR; + + /* basically just has to be a format we can blit, so uploads/downloads + * via linear staging buffer works: + */ + if (ok_format(tmpl->format)) + return TILE6_3; + + return TILE6_LINEAR; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h index de84239..dd423cf 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_blitter.h @@ -40,10 +40,11 @@ unsigned fd6_tile_mode(const struct pipe_resource *tmpl); * instead of CP_EVENT_WRITE::BLITs */ -void fd6_clear_surface(struct fd_context *ctx, - struct fd_ringbuffer *ring, struct pipe_surface *psurf, - uint32_t width, uint32_t height, 
union pipe_color_union *color) assert_dt; +void fd6_clear_surface(struct fd_context *ctx, struct fd_ringbuffer *ring, + struct pipe_surface *psurf, uint32_t width, + uint32_t height, + union pipe_color_union *color) assert_dt; void fd6_resolve_tile(struct fd_batch *batch, struct fd_ringbuffer *ring, - uint32_t base, struct pipe_surface *psurf) assert_dt; + uint32_t base, struct pipe_surface *psurf) assert_dt; #endif /* FD6_BLIT_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c index 1ba59fd..01e3a5d 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.c @@ -37,162 +37,161 @@ #include "fd6_emit.h" #include "fd6_pack.h" - /* maybe move to fd6_program? */ static void cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *v) assert_dt + struct ir3_shader_variant *v) assert_dt { - const struct ir3_info *i = &v->info; - enum a6xx_threadsize thrsz = i->double_threadsize ? THREAD128 : THREAD64; - - OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD( - .vs_state = true, - .hs_state = true, - .ds_state = true, - .gs_state = true, - .fs_state = true, - .cs_state = true, - .gfx_ibo = true, - .cs_ibo = true, - )); - - OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1); - OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(v->constlen) | - A6XX_HLSQ_CS_CNTL_ENABLED); - - OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2); - OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED | - A6XX_SP_CS_CONFIG_NIBO(v->shader->nir->info.num_ssbos + - v->shader->nir->info.num_images) | - A6XX_SP_CS_CONFIG_NTEX(v->num_samp) | - A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */ - OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */ - - OUT_PKT4(ring, REG_A6XX_SP_CS_CTRL_REG0, 1); - OUT_RING(ring, A6XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) | - A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) | - A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) | - COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) | - A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack)); - - uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); - OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); - OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | - A6XX_SP_CS_UNKNOWN_A9B1_UNK6); - - uint32_t local_invocation_id, work_group_id; - local_invocation_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); - work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID); - - OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2); - OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | - A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | - A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); - - OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2); - OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */ - - if (v->instrlen > 0) - fd6_emit_shader(ctx, ring, v); + const struct ir3_info *i = &v->info; + enum a6xx_threadsize thrsz = i->double_threadsize ? 
THREAD128 : THREAD64; + + OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true, + .ds_state = true, .gs_state = true, + .fs_state = true, .cs_state = true, + .gfx_ibo = true, .cs_ibo = true, )); + + OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL, 1); + OUT_RING(ring, A6XX_HLSQ_CS_CNTL_CONSTLEN(v->constlen) | + A6XX_HLSQ_CS_CNTL_ENABLED); + + OUT_PKT4(ring, REG_A6XX_SP_CS_CONFIG, 2); + OUT_RING(ring, A6XX_SP_CS_CONFIG_ENABLED | + A6XX_SP_CS_CONFIG_NIBO(v->shader->nir->info.num_ssbos + + v->shader->nir->info.num_images) | + A6XX_SP_CS_CONFIG_NTEX(v->num_samp) | + A6XX_SP_CS_CONFIG_NSAMP(v->num_samp)); /* SP_VS_CONFIG */ + OUT_RING(ring, v->instrlen); /* SP_VS_INSTRLEN */ + + OUT_PKT4(ring, REG_A6XX_SP_CS_CTRL_REG0, 1); + OUT_RING(ring, + A6XX_SP_CS_CTRL_REG0_THREADSIZE(thrsz) | + A6XX_SP_CS_CTRL_REG0_FULLREGFOOTPRINT(i->max_reg + 1) | + A6XX_SP_CS_CTRL_REG0_HALFREGFOOTPRINT(i->max_half_reg + 1) | + COND(v->mergedregs, A6XX_SP_CS_CTRL_REG0_MERGEDREGS) | + A6XX_SP_CS_CTRL_REG0_BRANCHSTACK(v->branchstack)); + + uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1); + OUT_PKT4(ring, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1); + OUT_RING(ring, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) | + A6XX_SP_CS_UNKNOWN_A9B1_UNK6); + + uint32_t local_invocation_id, work_group_id; + local_invocation_id = + ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID); + work_group_id = ir3_find_sysval_regid(v, SYSTEM_VALUE_WORK_GROUP_ID); + + OUT_PKT4(ring, REG_A6XX_HLSQ_CS_CNTL_0, 2); + OUT_RING(ring, A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) | + A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); + OUT_RING(ring, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) | + A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz)); + + OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2); + OUT_RELOC(ring, v->bo, 0, 0, 0); /* SP_CS_OBJ_START_LO/HI */ + + if (v->instrlen > 0) + fd6_emit_shader(ctx, ring, v); } static void -fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) - in_dt +fd6_launch_grid(struct fd_context *ctx, const struct pipe_grid_info *info) in_dt { - struct ir3_shader_key key = {}; - struct ir3_shader_variant *v; - struct fd_ringbuffer *ring = ctx->batch->draw; - unsigned nglobal = 0; - - v = ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug); - if (!v) - return; - - if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG) - cs_program_emit(ctx, ring, v); - - fd6_emit_cs_state(ctx, ring, v); - fd6_emit_cs_consts(v, ring, ctx, info); - - u_foreach_bit(i, ctx->global_bindings.enabled_mask) - nglobal++; - - if (nglobal > 0) { - /* global resources don't otherwise get an OUT_RELOC(), since - * the raw ptr address is emitted in ir3_emit_cs_consts(). - * So to make the kernel aware that these buffers are referenced - * by the batch, emit dummy reloc's as part of a no-op packet - * payload: - */ - OUT_PKT7(ring, CP_NOP, 2 * nglobal); - u_foreach_bit(i, ctx->global_bindings.enabled_mask) { - struct pipe_resource *prsc = ctx->global_bindings.buf[i]; - OUT_RELOC(ring, fd_resource(prsc)->bo, 0, 0, 0); - } - } - - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); - - const unsigned *local_size = info->block; // v->shader->nir->info->cs.local_size; - const unsigned *num_groups = info->grid; - /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ - const unsigned work_dim = info->work_dim ? 
info->work_dim : 3; - OUT_PKT4(ring, REG_A6XX_HLSQ_CS_NDRANGE_0, 7); - OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) | - A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) | - A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | - A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1)); - OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0])); - OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */ - OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1])); - OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */ - OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2])); - OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */ - - OUT_PKT4(ring, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3); - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */ - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ - OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ - - trace_grid_info(&ctx->batch->trace, info); - trace_start_compute(&ctx->batch->trace); - - if (info->indirect) { - struct fd_resource *rsc = fd_resource(info->indirect); - - OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */ - OUT_RING(ring, A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | - A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); - } else { - OUT_PKT7(ring, CP_EXEC_CS, 4); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); - OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); - OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); - } - - trace_end_compute(&ctx->batch->trace); - - OUT_WFI5(ring); - - fd6_cache_flush(ctx->batch, ring); + struct ir3_shader_key key = {}; + struct ir3_shader_variant *v; + struct fd_ringbuffer *ring = ctx->batch->draw; + unsigned nglobal = 0; + + v = + ir3_shader_variant(ir3_get_shader(ctx->compute), key, false, &ctx->debug); + if (!v) + return; + + if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG) + cs_program_emit(ctx, ring, v); + + fd6_emit_cs_state(ctx, ring, v); + fd6_emit_cs_consts(v, ring, ctx, info); + + u_foreach_bit (i, ctx->global_bindings.enabled_mask) + nglobal++; + + if (nglobal > 0) { + /* global resources don't otherwise get an OUT_RELOC(), since + * the raw ptr address is emitted in ir3_emit_cs_consts(). + * So to make the kernel aware that these buffers are referenced + * by the batch, emit dummy reloc's as part of a no-op packet + * payload: + */ + OUT_PKT7(ring, CP_NOP, 2 * nglobal); + u_foreach_bit (i, ctx->global_bindings.enabled_mask) { + struct pipe_resource *prsc = ctx->global_bindings.buf[i]; + OUT_RELOC(ring, fd_resource(prsc)->bo, 0, 0, 0); + } + } + + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_COMPUTE)); + + const unsigned *local_size = + info->block; // v->shader->nir->info->cs.local_size; + const unsigned *num_groups = info->grid; + /* for some reason, mesa/st doesn't set info->work_dim, so just assume 3: */ + const unsigned work_dim = info->work_dim ? 
info->work_dim : 3; + OUT_PKT4(ring, REG_A6XX_HLSQ_CS_NDRANGE_0, 7); + OUT_RING(ring, A6XX_HLSQ_CS_NDRANGE_0_KERNELDIM(work_dim) | + A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) | + A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) | + A6XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1)); + OUT_RING(ring, + A6XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0])); + OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */ + OUT_RING(ring, + A6XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1])); + OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */ + OUT_RING(ring, + A6XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2])); + OUT_RING(ring, 0); /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */ + + OUT_PKT4(ring, REG_A6XX_HLSQ_CS_KERNEL_GROUP_X, 3); + OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_X */ + OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Y */ + OUT_RING(ring, 1); /* HLSQ_CS_KERNEL_GROUP_Z */ + + trace_grid_info(&ctx->batch->trace, info); + trace_start_compute(&ctx->batch->trace); + + if (info->indirect) { + struct fd_resource *rsc = fd_resource(info->indirect); + + OUT_PKT7(ring, CP_EXEC_CS_INDIRECT, 4); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, rsc->bo, info->indirect_offset, 0, 0); /* ADDR_LO/HI */ + OUT_RING(ring, + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEX(local_size[0] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEY(local_size[1] - 1) | + A5XX_CP_EXEC_CS_INDIRECT_3_LOCALSIZEZ(local_size[2] - 1)); + } else { + OUT_PKT7(ring, CP_EXEC_CS, 4); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, CP_EXEC_CS_1_NGROUPS_X(info->grid[0])); + OUT_RING(ring, CP_EXEC_CS_2_NGROUPS_Y(info->grid[1])); + OUT_RING(ring, CP_EXEC_CS_3_NGROUPS_Z(info->grid[2])); + } + + trace_end_compute(&ctx->batch->trace); + + OUT_WFI5(ring); + + fd6_cache_flush(ctx->batch, ring); } void -fd6_compute_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_compute_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->launch_grid = fd6_launch_grid; - pctx->create_compute_state = ir3_shader_compute_state_create; - pctx->delete_compute_state = ir3_shader_state_delete; + struct fd_context *ctx = fd_context(pctx); + ctx->launch_grid = fd6_launch_grid; + pctx->create_compute_state = ir3_shader_compute_state_create; + pctx->delete_compute_state = ir3_shader_state_delete; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.c b/src/gallium/drivers/freedreno/a6xx/fd6_const.c index b8b3f19..bb1ecc4 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.c @@ -26,7 +26,7 @@ #include "fd6_pack.h" #define emit_const_user fd6_emit_const_user -#define emit_const_bo fd6_emit_const_bo +#define emit_const_bo fd6_emit_const_bo #include "ir3_const.h" /* regid: base const register @@ -35,362 +35,335 @@ */ void fd6_emit_const_user(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, - uint32_t sizedwords, const uint32_t *dwords) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t sizedwords, const uint32_t *dwords) { - emit_const_asserts(ring, v, regid, sizedwords); - - /* NOTE we cheat a bit here, since we know mesa is aligning - * the size of the user buffer to 16 bytes. And we want to - * cut cycles in a hot path. 
- */ - uint32_t align_sz = align(sizedwords, 4); - - if (fd6_geom_stage(v->type)) { - OUT_PKTBUF(ring, CP_LOAD_STATE6_GEOM, dwords, align_sz, - CP_LOAD_STATE6_0( - .dst_off = regid/4, - .state_type = ST6_CONSTANTS, - .state_src = SS6_DIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = DIV_ROUND_UP(sizedwords, 4) - ), - CP_LOAD_STATE6_1(), - CP_LOAD_STATE6_2() - ); - } else { - OUT_PKTBUF(ring, CP_LOAD_STATE6_FRAG, dwords, align_sz, - CP_LOAD_STATE6_0( - .dst_off = regid/4, - .state_type = ST6_CONSTANTS, - .state_src = SS6_DIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = DIV_ROUND_UP(sizedwords, 4) - ), - CP_LOAD_STATE6_1(), - CP_LOAD_STATE6_2() - ); - } + emit_const_asserts(ring, v, regid, sizedwords); + + /* NOTE we cheat a bit here, since we know mesa is aligning + * the size of the user buffer to 16 bytes. And we want to + * cut cycles in a hot path. + */ + uint32_t align_sz = align(sizedwords, 4); + + if (fd6_geom_stage(v->type)) { + OUT_PKTBUF( + ring, CP_LOAD_STATE6_GEOM, dwords, align_sz, + CP_LOAD_STATE6_0(.dst_off = regid / 4, .state_type = ST6_CONSTANTS, + .state_src = SS6_DIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = DIV_ROUND_UP(sizedwords, 4)), + CP_LOAD_STATE6_1(), CP_LOAD_STATE6_2()); + } else { + OUT_PKTBUF( + ring, CP_LOAD_STATE6_FRAG, dwords, align_sz, + CP_LOAD_STATE6_0(.dst_off = regid / 4, .state_type = ST6_CONSTANTS, + .state_src = SS6_DIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = DIV_ROUND_UP(sizedwords, 4)), + CP_LOAD_STATE6_1(), CP_LOAD_STATE6_2()); + } } void fd6_emit_const_bo(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, - uint32_t offset, uint32_t sizedwords, struct fd_bo *bo) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t offset, uint32_t sizedwords, struct fd_bo *bo) { - uint32_t dst_off = regid / 4; - assert(dst_off % 4 == 0); - uint32_t num_unit = DIV_ROUND_UP(sizedwords, 4); - assert(num_unit % 4 == 0); - - emit_const_asserts(ring, v, regid, sizedwords); - - if (fd6_geom_stage(v->type)) { - OUT_PKT(ring, CP_LOAD_STATE6_GEOM, - CP_LOAD_STATE6_0( - .dst_off = dst_off, - .state_type = ST6_CONSTANTS, - .state_src = SS6_INDIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = num_unit, - ), - CP_LOAD_STATE6_EXT_SRC_ADDR( - .bo = bo, - .bo_offset = offset - ) - ); - } else { - OUT_PKT(ring, CP_LOAD_STATE6_FRAG, - CP_LOAD_STATE6_0( - .dst_off = dst_off, - .state_type = ST6_CONSTANTS, - .state_src = SS6_INDIRECT, - .state_block = fd6_stage2shadersb(v->type), - .num_unit = num_unit, - ), - CP_LOAD_STATE6_EXT_SRC_ADDR( - .bo = bo, - .bo_offset = offset - ) - ); - } + uint32_t dst_off = regid / 4; + assert(dst_off % 4 == 0); + uint32_t num_unit = DIV_ROUND_UP(sizedwords, 4); + assert(num_unit % 4 == 0); + + emit_const_asserts(ring, v, regid, sizedwords); + + if (fd6_geom_stage(v->type)) { + OUT_PKT(ring, CP_LOAD_STATE6_GEOM, + CP_LOAD_STATE6_0(.dst_off = dst_off, .state_type = ST6_CONSTANTS, + .state_src = SS6_INDIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = num_unit, ), + CP_LOAD_STATE6_EXT_SRC_ADDR(.bo = bo, .bo_offset = offset)); + } else { + OUT_PKT(ring, CP_LOAD_STATE6_FRAG, + CP_LOAD_STATE6_0(.dst_off = dst_off, .state_type = ST6_CONSTANTS, + .state_src = SS6_INDIRECT, + .state_block = fd6_stage2shadersb(v->type), + .num_unit = num_unit, ), + CP_LOAD_STATE6_EXT_SRC_ADDR(.bo = bo, .bo_offset = offset)); + } } static bool is_stateobj(struct fd_ringbuffer *ring) { - return true; + return true; } 
static void -emit_const_ptrs(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t dst_offset, - uint32_t num, struct fd_bo **bos, uint32_t *offsets) +emit_const_ptrs(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, + uint32_t dst_offset, uint32_t num, struct fd_bo **bos, + uint32_t *offsets) { - unreachable("shouldn't be called on a6xx"); + unreachable("shouldn't be called on a6xx"); } static void -emit_tess_bos(struct fd_ringbuffer *ring, struct fd6_emit *emit, struct ir3_shader_variant *s) - assert_dt +emit_tess_bos(struct fd_ringbuffer *ring, struct fd6_emit *emit, + struct ir3_shader_variant *s) assert_dt { - struct fd_context *ctx = emit->ctx; - const struct ir3_const_state *const_state = ir3_const_state(s); - const unsigned regid = const_state->offsets.primitive_param * 4 + 4; - uint32_t dwords = 16; - - OUT_PKT7(ring, fd6_stage2opcode(s->type), 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS)| - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) | - CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4)); - OUT_RB(ring, ctx->batch->tess_addrs_constobj); + struct fd_context *ctx = emit->ctx; + const struct ir3_const_state *const_state = ir3_const_state(s); + const unsigned regid = const_state->offsets.primitive_param * 4 + 4; + uint32_t dwords = 16; + + OUT_PKT7(ring, fd6_stage2opcode(s->type), 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid / 4) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(s->type)) | + CP_LOAD_STATE6_0_NUM_UNIT(dwords / 4)); + OUT_RB(ring, ctx->batch->tess_addrs_constobj); } static void emit_stage_tess_consts(struct fd_ringbuffer *ring, struct ir3_shader_variant *v, - uint32_t *params, int num_params) + uint32_t *params, int num_params) { - const struct ir3_const_state *const_state = ir3_const_state(v); - const unsigned regid = const_state->offsets.primitive_param; - int size = MIN2(1 + regid, v->constlen) - regid; - if (size > 0) - fd6_emit_const_user(ring, v, regid * 4, num_params, params); + const struct ir3_const_state *const_state = ir3_const_state(v); + const unsigned regid = const_state->offsets.primitive_param; + int size = MIN2(1 + regid, v->constlen) - regid; + if (size > 0) + fd6_emit_const_user(ring, v, regid * 4, num_params, params); } struct fd_ringbuffer * fd6_build_tess_consts(struct fd6_emit *emit) { - struct fd_context *ctx = emit->ctx; - - struct fd_ringbuffer *constobj = fd_submit_new_ringbuffer( - ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - - /* VS sizes are in bytes since that's what STLW/LDLW use, while the HS - * size is dwords, since that's what LDG/STG use. - */ - unsigned num_vertices = - emit->hs ? 
- emit->info->vertices_per_patch : - emit->gs->shader->nir->info.gs.vertices_in; - - uint32_t vs_params[4] = { - emit->vs->output_size * num_vertices * 4, /* vs primitive stride */ - emit->vs->output_size * 4, /* vs vertex stride */ - 0, - 0 - }; - - emit_stage_tess_consts(constobj, emit->vs, vs_params, ARRAY_SIZE(vs_params)); - - if (emit->hs) { - uint32_t hs_params[4] = { - emit->vs->output_size * num_vertices * 4, /* vs primitive stride */ - emit->vs->output_size * 4, /* vs vertex stride */ - emit->hs->output_size, - emit->info->vertices_per_patch - }; - - emit_stage_tess_consts(constobj, emit->hs, hs_params, ARRAY_SIZE(hs_params)); - emit_tess_bos(constobj, emit, emit->hs); - - if (emit->gs) - num_vertices = emit->gs->shader->nir->info.gs.vertices_in; - - uint32_t ds_params[4] = { - emit->ds->output_size * num_vertices * 4, /* ds primitive stride */ - emit->ds->output_size * 4, /* ds vertex stride */ - emit->hs->output_size, /* hs vertex stride (dwords) */ - emit->hs->shader->nir->info.tess.tcs_vertices_out - }; - - emit_stage_tess_consts(constobj, emit->ds, ds_params, ARRAY_SIZE(ds_params)); - emit_tess_bos(constobj, emit, emit->ds); - } - - if (emit->gs) { - struct ir3_shader_variant *prev; - if (emit->ds) - prev = emit->ds; - else - prev = emit->vs; - - uint32_t gs_params[4] = { - prev->output_size * num_vertices * 4, /* ds primitive stride */ - prev->output_size * 4, /* ds vertex stride */ - 0, - 0, - }; - - num_vertices = emit->gs->shader->nir->info.gs.vertices_in; - emit_stage_tess_consts(constobj, emit->gs, gs_params, ARRAY_SIZE(gs_params)); - } - - return constobj; + struct fd_context *ctx = emit->ctx; + + struct fd_ringbuffer *constobj = fd_submit_new_ringbuffer( + ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); + + /* VS sizes are in bytes since that's what STLW/LDLW use, while the HS + * size is dwords, since that's what LDG/STG use. + */ + unsigned num_vertices = emit->hs + ? 
emit->info->vertices_per_patch + : emit->gs->shader->nir->info.gs.vertices_in; + + uint32_t vs_params[4] = { + emit->vs->output_size * num_vertices * 4, /* vs primitive stride */ + emit->vs->output_size * 4, /* vs vertex stride */ + 0, 0}; + + emit_stage_tess_consts(constobj, emit->vs, vs_params, ARRAY_SIZE(vs_params)); + + if (emit->hs) { + uint32_t hs_params[4] = { + emit->vs->output_size * num_vertices * 4, /* vs primitive stride */ + emit->vs->output_size * 4, /* vs vertex stride */ + emit->hs->output_size, emit->info->vertices_per_patch}; + + emit_stage_tess_consts(constobj, emit->hs, hs_params, + ARRAY_SIZE(hs_params)); + emit_tess_bos(constobj, emit, emit->hs); + + if (emit->gs) + num_vertices = emit->gs->shader->nir->info.gs.vertices_in; + + uint32_t ds_params[4] = { + emit->ds->output_size * num_vertices * 4, /* ds primitive stride */ + emit->ds->output_size * 4, /* ds vertex stride */ + emit->hs->output_size, /* hs vertex stride (dwords) */ + emit->hs->shader->nir->info.tess.tcs_vertices_out}; + + emit_stage_tess_consts(constobj, emit->ds, ds_params, + ARRAY_SIZE(ds_params)); + emit_tess_bos(constobj, emit, emit->ds); + } + + if (emit->gs) { + struct ir3_shader_variant *prev; + if (emit->ds) + prev = emit->ds; + else + prev = emit->vs; + + uint32_t gs_params[4] = { + prev->output_size * num_vertices * 4, /* ds primitive stride */ + prev->output_size * 4, /* ds vertex stride */ + 0, + 0, + }; + + num_vertices = emit->gs->shader->nir->info.gs.vertices_in; + emit_stage_tess_consts(constobj, emit->gs, gs_params, + ARRAY_SIZE(gs_params)); + } + + return constobj; } static void fd6_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - const struct ir3_const_state *const_state = ir3_const_state(v); - int num_ubos = const_state->num_ubos; - - if (!num_ubos) - return; - - OUT_PKT7(ring, fd6_stage2opcode(v->type), 3 + (2 * num_ubos)); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)| - CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(v->type)) | - CP_LOAD_STATE6_0_NUM_UNIT(num_ubos)); - OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); - OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); - - for (int i = 0; i < num_ubos; i++) { - /* NIR constant data is packed into the end of the shader. */ - if (i == const_state->constant_data_ubo) { - int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16); - OUT_RELOC(ring, v->bo, - v->info.constant_data_offset, - (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, - 0); - continue; - } - - struct pipe_constant_buffer *cb = &constbuf->cb[i]; - - /* If we have user pointers (constbuf 0, aka GL uniforms), upload them - * to a buffer now, and save it in the constbuf so that we don't have - * to reupload until they get changed. 
- */ - if (cb->user_buffer) { - struct pipe_context *pctx = &ctx->base; - u_upload_data(pctx->stream_uploader, 0, - cb->buffer_size, - 64, - cb->user_buffer, - &cb->buffer_offset, &cb->buffer); - cb->user_buffer = NULL; - } - - if (cb->buffer) { - int size_vec4s = DIV_ROUND_UP(cb->buffer_size, 16); - OUT_RELOC(ring, fd_resource(cb->buffer)->bo, - cb->buffer_offset, - (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, - 0); - } else { - OUT_RING(ring, 0xbad00000 | (i << 16)); - OUT_RING(ring, A6XX_UBO_1_SIZE(0)); - } - } + const struct ir3_const_state *const_state = ir3_const_state(v); + int num_ubos = const_state->num_ubos; + + if (!num_ubos) + return; + + OUT_PKT7(ring, fd6_stage2opcode(v->type), 3 + (2 * num_ubos)); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(v->type)) | + CP_LOAD_STATE6_0_NUM_UNIT(num_ubos)); + OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0)); + OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0)); + + for (int i = 0; i < num_ubos; i++) { + /* NIR constant data is packed into the end of the shader. */ + if (i == const_state->constant_data_ubo) { + int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16); + OUT_RELOC(ring, v->bo, v->info.constant_data_offset, + (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0); + continue; + } + + struct pipe_constant_buffer *cb = &constbuf->cb[i]; + + /* If we have user pointers (constbuf 0, aka GL uniforms), upload them + * to a buffer now, and save it in the constbuf so that we don't have + * to reupload until they get changed. + */ + if (cb->user_buffer) { + struct pipe_context *pctx = &ctx->base; + u_upload_data(pctx->stream_uploader, 0, cb->buffer_size, 64, + cb->user_buffer, &cb->buffer_offset, &cb->buffer); + cb->user_buffer = NULL; + } + + if (cb->buffer) { + int size_vec4s = DIV_ROUND_UP(cb->buffer_size, 16); + OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, + (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0); + } else { + OUT_RING(ring, 0xbad00000 | (i << 16)); + OUT_RING(ring, A6XX_UBO_1_SIZE(0)); + } + } } static unsigned user_consts_cmdstream_size(struct ir3_shader_variant *v) { - struct ir3_const_state *const_state = ir3_const_state(v); - struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; + struct ir3_const_state *const_state = ir3_const_state(v); + struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state; - if (unlikely(!ubo_state->cmdstream_size)) { - unsigned packets, size; + if (unlikely(!ubo_state->cmdstream_size)) { + unsigned packets, size; - /* pre-calculate size required for userconst stateobj: */ - ir3_user_consts_size(ubo_state, &packets, &size); + /* pre-calculate size required for userconst stateobj: */ + ir3_user_consts_size(ubo_state, &packets, &size); - /* also account for UBO addresses: */ - packets += 1; - size += 2 * const_state->num_ubos; + /* also account for UBO addresses: */ + packets += 1; + size += 2 * const_state->num_ubos; - unsigned sizedwords = (4 * packets) + size; - ubo_state->cmdstream_size = sizedwords * 4; - } + unsigned sizedwords = (4 * packets) + size; + ubo_state->cmdstream_size = sizedwords * 4; + } - return ubo_state->cmdstream_size; + return ubo_state->cmdstream_size; } struct fd_ringbuffer * fd6_build_user_consts(struct fd6_emit *emit) { - static const enum pipe_shader_type types[] = { - PIPE_SHADER_VERTEX, PIPE_SHADER_TESS_CTRL, PIPE_SHADER_TESS_EVAL, - PIPE_SHADER_GEOMETRY, PIPE_SHADER_FRAGMENT, - }; - struct 
ir3_shader_variant *variants[] = { - emit->vs, emit->hs, emit->ds, emit->gs, emit->fs, - }; - struct fd_context *ctx = emit->ctx; - unsigned sz = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(types); i++) { - if (!variants[i]) - continue; - sz += user_consts_cmdstream_size(variants[i]); - } - - struct fd_ringbuffer *constobj = fd_submit_new_ringbuffer( - ctx->batch->submit, sz, FD_RINGBUFFER_STREAMING); - - for (unsigned i = 0; i < ARRAY_SIZE(types); i++) { - if (!variants[i]) - continue; - ir3_emit_user_consts(ctx->screen, variants[i], constobj, &ctx->constbuf[types[i]]); - fd6_emit_ubos(ctx, variants[i], constobj, &ctx->constbuf[types[i]]); - } - - return constobj; + static const enum pipe_shader_type types[] = { + PIPE_SHADER_VERTEX, PIPE_SHADER_TESS_CTRL, PIPE_SHADER_TESS_EVAL, + PIPE_SHADER_GEOMETRY, PIPE_SHADER_FRAGMENT, + }; + struct ir3_shader_variant *variants[] = { + emit->vs, emit->hs, emit->ds, emit->gs, emit->fs, + }; + struct fd_context *ctx = emit->ctx; + unsigned sz = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(types); i++) { + if (!variants[i]) + continue; + sz += user_consts_cmdstream_size(variants[i]); + } + + struct fd_ringbuffer *constobj = + fd_submit_new_ringbuffer(ctx->batch->submit, sz, FD_RINGBUFFER_STREAMING); + + for (unsigned i = 0; i < ARRAY_SIZE(types); i++) { + if (!variants[i]) + continue; + ir3_emit_user_consts(ctx->screen, variants[i], constobj, + &ctx->constbuf[types[i]]); + fd6_emit_ubos(ctx, variants[i], constobj, &ctx->constbuf[types[i]]); + } + + return constobj; } struct fd_ringbuffer * fd6_build_vs_driver_params(struct fd6_emit *emit) { - struct fd_context *ctx = emit->ctx; - struct fd6_context *fd6_ctx = fd6_context(ctx); - const struct ir3_shader_variant *vs = emit->vs; - - if (vs->need_driver_params) { - struct fd_ringbuffer *dpconstobj = fd_submit_new_ringbuffer( - ctx->batch->submit, IR3_DP_VS_COUNT * 4, FD_RINGBUFFER_STREAMING); - ir3_emit_vs_driver_params(vs, dpconstobj, ctx, emit->info, emit->indirect, emit->draw); - fd6_ctx->has_dp_state = true; - return dpconstobj; - } - - fd6_ctx->has_dp_state = false; - return NULL; + struct fd_context *ctx = emit->ctx; + struct fd6_context *fd6_ctx = fd6_context(ctx); + const struct ir3_shader_variant *vs = emit->vs; + + if (vs->need_driver_params) { + struct fd_ringbuffer *dpconstobj = fd_submit_new_ringbuffer( + ctx->batch->submit, IR3_DP_VS_COUNT * 4, FD_RINGBUFFER_STREAMING); + ir3_emit_vs_driver_params(vs, dpconstobj, ctx, emit->info, emit->indirect, + emit->draw); + fd6_ctx->has_dp_state = true; + return dpconstobj; + } + + fd6_ctx->has_dp_state = false; + return NULL; } void fd6_emit_ibo_consts(struct fd6_emit *emit, const struct ir3_shader_variant *v, - enum pipe_shader_type stage, struct fd_ringbuffer *ring) + enum pipe_shader_type stage, struct fd_ringbuffer *ring) { - struct fd_context *ctx = emit->ctx; + struct fd_context *ctx = emit->ctx; - ir3_emit_ssbo_sizes(ctx->screen, v, ring, &ctx->shaderbuf[stage]); - ir3_emit_image_dims(ctx->screen, v, ring, &ctx->shaderimg[stage]); + ir3_emit_ssbo_sizes(ctx->screen, v, ring, &ctx->shaderbuf[stage]); + ir3_emit_image_dims(ctx->screen, v, ring, &ctx->shaderimg[stage]); } void -fd6_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, const struct pipe_grid_info *info) +fd6_emit_cs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) { - ir3_emit_cs_consts(v, ring, ctx, info); - fd6_emit_ubos(ctx, v, ring, 
&ctx->constbuf[PIPE_SHADER_COMPUTE]); + ir3_emit_cs_consts(v, ring, ctx, info); + fd6_emit_ubos(ctx, v, ring, &ctx->constbuf[PIPE_SHADER_COMPUTE]); } void -fd6_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring) +fd6_emit_immediates(struct fd_screen *screen, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - ir3_emit_immediates(screen, v, ring); + ir3_emit_immediates(screen, v, ring); } void fd6_emit_link_map(struct fd_screen *screen, - const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) + const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - ir3_emit_link_map(screen, producer, v, ring); + ir3_emit_link_map(screen, producer, v, ring); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.h b/src/gallium/drivers/freedreno/a6xx/fd6_const.h index 7ab52735..299fd29 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_const.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.h @@ -22,24 +22,29 @@ * SOFTWARE. */ - #ifndef FD6_CONST_H #define FD6_CONST_H #include "fd6_emit.h" -struct fd_ringbuffer * fd6_build_tess_consts(struct fd6_emit *emit) assert_dt; -struct fd_ringbuffer * fd6_build_user_consts(struct fd6_emit *emit) assert_dt; -struct fd_ringbuffer * fd6_build_vs_driver_params(struct fd6_emit *emit) assert_dt; +struct fd_ringbuffer *fd6_build_tess_consts(struct fd6_emit *emit) assert_dt; +struct fd_ringbuffer *fd6_build_user_consts(struct fd6_emit *emit) assert_dt; +struct fd_ringbuffer * +fd6_build_vs_driver_params(struct fd6_emit *emit) assert_dt; -void fd6_emit_ibo_consts(struct fd6_emit *emit, const struct ir3_shader_variant *v, - enum pipe_shader_type stage, struct fd_ringbuffer *ring) assert_dt; -void fd6_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, const struct pipe_grid_info *info) assert_dt; -void fd6_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring) assert_dt; +void fd6_emit_ibo_consts(struct fd6_emit *emit, + const struct ir3_shader_variant *v, + enum pipe_shader_type stage, + struct fd_ringbuffer *ring) assert_dt; +void fd6_emit_cs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) assert_dt; +void fd6_emit_immediates(struct fd_screen *screen, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) assert_dt; void fd6_emit_link_map(struct fd_screen *screen, - const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) assert_dt; + const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) assert_dt; #endif /* FD6_CONST_H */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.c b/src/gallium/drivers/freedreno/a6xx/fd6_context.c index 4d8ad8f..c03b652 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.c @@ -27,10 +27,10 @@ #include "freedreno_query_acc.h" -#include "fd6_context.h" -#include "fd6_compute.h" #include "fd6_blend.h" #include "fd6_blitter.h" +#include "fd6_compute.h" +#include "fd6_context.h" #include "fd6_draw.h" #include "fd6_emit.h" #include "fd6_gmem.h" @@ -42,27 +42,26 @@ #include "fd6_zsa.h" static void -fd6_context_destroy(struct pipe_context *pctx) - in_dt +fd6_context_destroy(struct 
pipe_context *pctx) in_dt { - struct fd6_context *fd6_ctx = fd6_context(fd_context(pctx)); + struct fd6_context *fd6_ctx = fd6_context(fd_context(pctx)); - u_upload_destroy(fd6_ctx->border_color_uploader); - pipe_resource_reference(&fd6_ctx->border_color_buf, NULL); + u_upload_destroy(fd6_ctx->border_color_uploader); + pipe_resource_reference(&fd6_ctx->border_color_buf, NULL); - fd_context_destroy(pctx); + fd_context_destroy(pctx); - if (fd6_ctx->vsc_draw_strm) - fd_bo_del(fd6_ctx->vsc_draw_strm); - if (fd6_ctx->vsc_prim_strm) - fd_bo_del(fd6_ctx->vsc_prim_strm); - fd_bo_del(fd6_ctx->control_mem); + if (fd6_ctx->vsc_draw_strm) + fd_bo_del(fd6_ctx->vsc_draw_strm); + if (fd6_ctx->vsc_prim_strm) + fd_bo_del(fd6_ctx->vsc_prim_strm); + fd_bo_del(fd6_ctx->control_mem); - fd_context_cleanup_common_vbos(&fd6_ctx->base); + fd_context_cleanup_common_vbos(&fd6_ctx->base); - fd6_texture_fini(pctx); + fd6_texture_fini(pctx); - free(fd6_ctx); + free(fd6_ctx); } /* clang-format off */ @@ -85,165 +84,173 @@ static const uint8_t primtypes[] = { static void * fd6_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, - const struct pipe_vertex_element *elements) + const struct pipe_vertex_element *elements) { - struct fd_context *ctx = fd_context(pctx); - - struct fd6_vertex_stateobj *state = CALLOC_STRUCT(fd6_vertex_stateobj); - memcpy(state->base.pipe, elements, sizeof(*elements) * num_elements); - state->base.num_elements = num_elements; - state->stateobj = - fd_ringbuffer_new_object(ctx->pipe, 4 * (num_elements * 2 + 1)); - struct fd_ringbuffer *ring = state->stateobj; - - OUT_PKT4(ring, REG_A6XX_VFD_DECODE(0), 2 * num_elements); - for (int32_t i = 0; i < num_elements; i++) { - const struct pipe_vertex_element *elem = &elements[i]; - enum pipe_format pfmt = elem->src_format; - enum a6xx_format fmt = fd6_pipe2vtx(pfmt); - bool isint = util_format_is_pure_integer(pfmt); - debug_assert(fmt != FMT6_NONE); - - OUT_RING(ring, A6XX_VFD_DECODE_INSTR_IDX(elem->vertex_buffer_index) | - A6XX_VFD_DECODE_INSTR_OFFSET(elem->src_offset) | - A6XX_VFD_DECODE_INSTR_FORMAT(fmt) | - COND(elem->instance_divisor, A6XX_VFD_DECODE_INSTR_INSTANCED) | - A6XX_VFD_DECODE_INSTR_SWAP(fd6_pipe2swap(pfmt)) | - A6XX_VFD_DECODE_INSTR_UNK30 | - COND(!isint, A6XX_VFD_DECODE_INSTR_FLOAT)); - OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */ - } - - return state; + struct fd_context *ctx = fd_context(pctx); + + struct fd6_vertex_stateobj *state = CALLOC_STRUCT(fd6_vertex_stateobj); + memcpy(state->base.pipe, elements, sizeof(*elements) * num_elements); + state->base.num_elements = num_elements; + state->stateobj = + fd_ringbuffer_new_object(ctx->pipe, 4 * (num_elements * 2 + 1)); + struct fd_ringbuffer *ring = state->stateobj; + + OUT_PKT4(ring, REG_A6XX_VFD_DECODE(0), 2 * num_elements); + for (int32_t i = 0; i < num_elements; i++) { + const struct pipe_vertex_element *elem = &elements[i]; + enum pipe_format pfmt = elem->src_format; + enum a6xx_format fmt = fd6_pipe2vtx(pfmt); + bool isint = util_format_is_pure_integer(pfmt); + debug_assert(fmt != FMT6_NONE); + + OUT_RING(ring, A6XX_VFD_DECODE_INSTR_IDX(elem->vertex_buffer_index) | + A6XX_VFD_DECODE_INSTR_OFFSET(elem->src_offset) | + A6XX_VFD_DECODE_INSTR_FORMAT(fmt) | + COND(elem->instance_divisor, + A6XX_VFD_DECODE_INSTR_INSTANCED) | + A6XX_VFD_DECODE_INSTR_SWAP(fd6_pipe2swap(pfmt)) | + A6XX_VFD_DECODE_INSTR_UNK30 | + COND(!isint, A6XX_VFD_DECODE_INSTR_FLOAT)); + OUT_RING(ring, + MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */ + 
} + + return state; } static void fd6_vertex_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd6_vertex_stateobj *so = hwcso; + struct fd6_vertex_stateobj *so = hwcso; - fd_ringbuffer_del(so->stateobj); - FREE(hwcso); + fd_ringbuffer_del(so->stateobj); + FREE(hwcso); } static void setup_state_map(struct fd_context *ctx) { - STATIC_ASSERT(FD6_GROUP_NON_GROUP < 32); - - fd_context_add_map(ctx, FD_DIRTY_VTXSTATE, BIT(FD6_GROUP_VTXSTATE)); - fd_context_add_map(ctx, FD_DIRTY_VTXBUF, BIT(FD6_GROUP_VBO)); - fd_context_add_map(ctx, FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER, BIT(FD6_GROUP_ZSA)); - fd_context_add_map(ctx, FD_DIRTY_ZSA | FD_DIRTY_BLEND | FD_DIRTY_PROG, - BIT(FD6_GROUP_LRZ) | BIT(FD6_GROUP_LRZ_BINNING)); - fd_context_add_map(ctx, FD_DIRTY_PROG, BIT(FD6_GROUP_PROG)); - fd_context_add_map(ctx, FD_DIRTY_RASTERIZER, BIT(FD6_GROUP_RASTERIZER)); - fd_context_add_map(ctx, FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD | - FD_DIRTY_PROG | FD_DIRTY_BLEND_DUAL, - BIT(FD6_GROUP_PROG_FB_RAST)); - fd_context_add_map(ctx, FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK, BIT(FD6_GROUP_BLEND)); - fd_context_add_map(ctx, FD_DIRTY_BLEND_COLOR, BIT(FD6_GROUP_BLEND_COLOR)); - fd_context_add_map(ctx, FD_DIRTY_SSBO | FD_DIRTY_IMAGE | FD_DIRTY_PROG, - BIT(FD6_GROUP_IBO)); - fd_context_add_map(ctx, FD_DIRTY_PROG, BIT(FD6_GROUP_VS_TEX) | BIT(FD6_GROUP_HS_TEX) | - BIT(FD6_GROUP_DS_TEX) | BIT(FD6_GROUP_GS_TEX) | BIT(FD6_GROUP_FS_TEX)); - fd_context_add_map(ctx, FD_DIRTY_PROG | FD_DIRTY_CONST, BIT(FD6_GROUP_CONST)); - fd_context_add_map(ctx, FD_DIRTY_STREAMOUT, BIT(FD6_GROUP_SO)); - - fd_context_add_shader_map(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_TEX, - BIT(FD6_GROUP_VS_TEX)); - fd_context_add_shader_map(ctx, PIPE_SHADER_TESS_CTRL, FD_DIRTY_SHADER_TEX, - BIT(FD6_GROUP_HS_TEX)); - fd_context_add_shader_map(ctx, PIPE_SHADER_TESS_EVAL, FD_DIRTY_SHADER_TEX, - BIT(FD6_GROUP_DS_TEX)); - fd_context_add_shader_map(ctx, PIPE_SHADER_GEOMETRY, FD_DIRTY_SHADER_TEX, - BIT(FD6_GROUP_GS_TEX)); - fd_context_add_shader_map(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_TEX, - BIT(FD6_GROUP_FS_TEX)); - - /* NOTE: scissor enabled bit is part of rasterizer state, but - * fd_rasterizer_state_bind() will mark scissor dirty if needed: - */ - fd_context_add_map(ctx, FD_DIRTY_SCISSOR, BIT(FD6_GROUP_SCISSOR)); - - /* Stuff still emit in IB2 - * - * NOTE: viewport state doesn't seem to change frequently, so possibly - * move it into FD6_GROUP_RASTERIZER? 
- */ - fd_context_add_map(ctx, FD_DIRTY_STENCIL_REF | FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER, - BIT(FD6_GROUP_NON_GROUP)); + STATIC_ASSERT(FD6_GROUP_NON_GROUP < 32); + + fd_context_add_map(ctx, FD_DIRTY_VTXSTATE, BIT(FD6_GROUP_VTXSTATE)); + fd_context_add_map(ctx, FD_DIRTY_VTXBUF, BIT(FD6_GROUP_VBO)); + fd_context_add_map(ctx, FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER, + BIT(FD6_GROUP_ZSA)); + fd_context_add_map(ctx, FD_DIRTY_ZSA | FD_DIRTY_BLEND | FD_DIRTY_PROG, + BIT(FD6_GROUP_LRZ) | BIT(FD6_GROUP_LRZ_BINNING)); + fd_context_add_map(ctx, FD_DIRTY_PROG, BIT(FD6_GROUP_PROG)); + fd_context_add_map(ctx, FD_DIRTY_RASTERIZER, BIT(FD6_GROUP_RASTERIZER)); + fd_context_add_map(ctx, + FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD | + FD_DIRTY_PROG | FD_DIRTY_BLEND_DUAL, + BIT(FD6_GROUP_PROG_FB_RAST)); + fd_context_add_map(ctx, FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK, + BIT(FD6_GROUP_BLEND)); + fd_context_add_map(ctx, FD_DIRTY_BLEND_COLOR, BIT(FD6_GROUP_BLEND_COLOR)); + fd_context_add_map(ctx, FD_DIRTY_SSBO | FD_DIRTY_IMAGE | FD_DIRTY_PROG, + BIT(FD6_GROUP_IBO)); + fd_context_add_map(ctx, FD_DIRTY_PROG, + BIT(FD6_GROUP_VS_TEX) | BIT(FD6_GROUP_HS_TEX) | + BIT(FD6_GROUP_DS_TEX) | BIT(FD6_GROUP_GS_TEX) | + BIT(FD6_GROUP_FS_TEX)); + fd_context_add_map(ctx, FD_DIRTY_PROG | FD_DIRTY_CONST, + BIT(FD6_GROUP_CONST)); + fd_context_add_map(ctx, FD_DIRTY_STREAMOUT, BIT(FD6_GROUP_SO)); + + fd_context_add_shader_map(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_TEX, + BIT(FD6_GROUP_VS_TEX)); + fd_context_add_shader_map(ctx, PIPE_SHADER_TESS_CTRL, FD_DIRTY_SHADER_TEX, + BIT(FD6_GROUP_HS_TEX)); + fd_context_add_shader_map(ctx, PIPE_SHADER_TESS_EVAL, FD_DIRTY_SHADER_TEX, + BIT(FD6_GROUP_DS_TEX)); + fd_context_add_shader_map(ctx, PIPE_SHADER_GEOMETRY, FD_DIRTY_SHADER_TEX, + BIT(FD6_GROUP_GS_TEX)); + fd_context_add_shader_map(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_TEX, + BIT(FD6_GROUP_FS_TEX)); + + /* NOTE: scissor enabled bit is part of rasterizer state, but + * fd_rasterizer_state_bind() will mark scissor dirty if needed: + */ + fd_context_add_map(ctx, FD_DIRTY_SCISSOR, BIT(FD6_GROUP_SCISSOR)); + + /* Stuff still emit in IB2 + * + * NOTE: viewport state doesn't seem to change frequently, so possibly + * move it into FD6_GROUP_RASTERIZER? 
+ */ + fd_context_add_map( + ctx, FD_DIRTY_STENCIL_REF | FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER, + BIT(FD6_GROUP_NON_GROUP)); } struct pipe_context * -fd6_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) - disable_thread_safety_analysis +fd6_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags) disable_thread_safety_analysis { - struct fd_screen *screen = fd_screen(pscreen); - struct fd6_context *fd6_ctx = CALLOC_STRUCT(fd6_context); - struct pipe_context *pctx; + struct fd_screen *screen = fd_screen(pscreen); + struct fd6_context *fd6_ctx = CALLOC_STRUCT(fd6_context); + struct pipe_context *pctx; - if (!fd6_ctx) - return NULL; + if (!fd6_ctx) + return NULL; - pctx = &fd6_ctx->base.base; - pctx->screen = pscreen; + pctx = &fd6_ctx->base.base; + pctx->screen = pscreen; - fd6_ctx->base.dev = fd_device_ref(screen->dev); - fd6_ctx->base.screen = fd_screen(pscreen); - fd6_ctx->base.last.key = &fd6_ctx->last_key; + fd6_ctx->base.dev = fd_device_ref(screen->dev); + fd6_ctx->base.screen = fd_screen(pscreen); + fd6_ctx->base.last.key = &fd6_ctx->last_key; - pctx->destroy = fd6_context_destroy; - pctx->create_blend_state = fd6_blend_state_create; - pctx->create_rasterizer_state = fd6_rasterizer_state_create; - pctx->create_depth_stencil_alpha_state = fd6_zsa_state_create; - pctx->create_vertex_elements_state = fd6_vertex_state_create; + pctx->destroy = fd6_context_destroy; + pctx->create_blend_state = fd6_blend_state_create; + pctx->create_rasterizer_state = fd6_rasterizer_state_create; + pctx->create_depth_stencil_alpha_state = fd6_zsa_state_create; + pctx->create_vertex_elements_state = fd6_vertex_state_create; - fd6_draw_init(pctx); - fd6_compute_init(pctx); - fd6_gmem_init(pctx); - fd6_texture_init(pctx); - fd6_prog_init(pctx); - fd6_emit_init(pctx); - fd6_query_context_init(pctx); + fd6_draw_init(pctx); + fd6_compute_init(pctx); + fd6_gmem_init(pctx); + fd6_texture_init(pctx); + fd6_prog_init(pctx); + fd6_emit_init(pctx); + fd6_query_context_init(pctx); - setup_state_map(&fd6_ctx->base); + setup_state_map(&fd6_ctx->base); - pctx = fd_context_init(&fd6_ctx->base, pscreen, primtypes, priv, flags); - if (!pctx) - return NULL; + pctx = fd_context_init(&fd6_ctx->base, pscreen, primtypes, priv, flags); + if (!pctx) + return NULL; - /* after fd_context_init() to override set_shader_images() */ - fd6_image_init(pctx); + /* after fd_context_init() to override set_shader_images() */ + fd6_image_init(pctx); - util_blitter_set_texture_multisample(fd6_ctx->base.blitter, true); + util_blitter_set_texture_multisample(fd6_ctx->base.blitter, true); - pctx->delete_vertex_elements_state = fd6_vertex_state_delete; + pctx->delete_vertex_elements_state = fd6_vertex_state_delete; - /* fd_context_init overwrites delete_rasterizer_state, so set this - * here. */ - pctx->delete_rasterizer_state = fd6_rasterizer_state_delete; - pctx->delete_blend_state = fd6_blend_state_delete; - pctx->delete_depth_stencil_alpha_state = fd6_zsa_state_delete; + /* fd_context_init overwrites delete_rasterizer_state, so set this + * here. 
*/ + pctx->delete_rasterizer_state = fd6_rasterizer_state_delete; + pctx->delete_blend_state = fd6_blend_state_delete; + pctx->delete_depth_stencil_alpha_state = fd6_zsa_state_delete; - /* initial sizes for VSC buffers (or rather the per-pipe sizes - * which is used to derive entire buffer size: - */ - fd6_ctx->vsc_draw_strm_pitch = 0x440; - fd6_ctx->vsc_prim_strm_pitch = 0x1040; + /* initial sizes for VSC buffers (or rather the per-pipe sizes + * which is used to derive entire buffer size: + */ + fd6_ctx->vsc_draw_strm_pitch = 0x440; + fd6_ctx->vsc_prim_strm_pitch = 0x1040; - fd6_ctx->control_mem = fd_bo_new(screen->dev, 0x1000, - DRM_FREEDRENO_GEM_TYPE_KMEM, "control"); + fd6_ctx->control_mem = + fd_bo_new(screen->dev, 0x1000, DRM_FREEDRENO_GEM_TYPE_KMEM, "control"); - memset(fd_bo_map(fd6_ctx->control_mem), 0, - sizeof(struct fd6_control)); + memset(fd_bo_map(fd6_ctx->control_mem), 0, sizeof(struct fd6_control)); - fd_context_setup_common_vbos(&fd6_ctx->base); + fd_context_setup_common_vbos(&fd6_ctx->base); - fd6_blitter_init(pctx); + fd6_blitter_init(pctx); - fd6_ctx->border_color_uploader = u_upload_create(pctx, 4096, 0, - PIPE_USAGE_STREAM, 0); + fd6_ctx->border_color_uploader = + u_upload_create(pctx, 4096, 0, PIPE_USAGE_STREAM, 0); - return fd_context_init_tc(pctx, flags); + return fd_context_init_tc(pctx, flags); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_context.h b/src/gallium/drivers/freedreno/a6xx/fd6_context.h index 8fbc2da..1487e74 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_context.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_context.h @@ -38,111 +38,108 @@ #include "a6xx.xml.h" struct fd6_lrz_state { - bool enable : 1; - bool write : 1; - bool test : 1; - enum fd_lrz_direction direction : 2; + bool enable : 1; + bool write : 1; + bool test : 1; + enum fd_lrz_direction direction : 2; - /* this comes from the fs program state, rather than zsa: */ - enum a6xx_ztest_mode z_mode : 2; + /* this comes from the fs program state, rather than zsa: */ + enum a6xx_ztest_mode z_mode : 2; }; struct fd6_context { - struct fd_context base; + struct fd_context base; - /* Two buffers related to hw binning / visibility stream (VSC). - * Compared to previous generations - * (1) we cannot specify individual buffers per VSC, instead - * just a pitch and base address - * (2) there is a second smaller buffer.. we also stash - * VSC_BIN_SIZE at end of 2nd buffer. - */ - struct fd_bo *vsc_draw_strm, *vsc_prim_strm; + /* Two buffers related to hw binning / visibility stream (VSC). + * Compared to previous generations + * (1) we cannot specify individual buffers per VSC, instead + * just a pitch and base address + * (2) there is a second smaller buffer.. we also stash + * VSC_BIN_SIZE at end of 2nd buffer. + */ + struct fd_bo *vsc_draw_strm, *vsc_prim_strm; - unsigned vsc_draw_strm_pitch, vsc_prim_strm_pitch; + unsigned vsc_draw_strm_pitch, vsc_prim_strm_pitch; - /* The 'control' mem BO is used for various housekeeping - * functions. See 'struct fd6_control' - */ - struct fd_bo *control_mem; - uint32_t seqno; + /* The 'control' mem BO is used for various housekeeping + * functions. 
See 'struct fd6_control' + */ + struct fd_bo *control_mem; + uint32_t seqno; - struct u_upload_mgr *border_color_uploader; - struct pipe_resource *border_color_buf; + struct u_upload_mgr *border_color_uploader; + struct pipe_resource *border_color_buf; - /* storage for ctx->last.key: */ - struct ir3_shader_key last_key; + /* storage for ctx->last.key: */ + struct ir3_shader_key last_key; - /* Is there current VS driver-param state set? */ - bool has_dp_state; + /* Is there current VS driver-param state set? */ + bool has_dp_state; - /* number of active samples-passed queries: */ - int samples_passed_queries; + /* number of active samples-passed queries: */ + int samples_passed_queries; - /* cached stateobjs to avoid hashtable lookup when not dirty: */ - const struct fd6_program_state *prog; + /* cached stateobjs to avoid hashtable lookup when not dirty: */ + const struct fd6_program_state *prog; - uint16_t tex_seqno; - struct hash_table *tex_cache; + uint16_t tex_seqno; + struct hash_table *tex_cache; - struct { - /* previous binning/draw lrz state, which is a function of multiple - * gallium stateobjs, but doesn't necessarily change as frequently: - */ - struct fd6_lrz_state lrz[2]; - } last; + struct { + /* previous binning/draw lrz state, which is a function of multiple + * gallium stateobjs, but doesn't necessarily change as frequently: + */ + struct fd6_lrz_state lrz[2]; + } last; }; static inline struct fd6_context * fd6_context(struct fd_context *ctx) { - return (struct fd6_context *)ctx; + return (struct fd6_context *)ctx; } -struct pipe_context * -fd6_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags); - +struct pipe_context *fd6_context_create(struct pipe_screen *pscreen, void *priv, + unsigned flags); /* This struct defines the layout of the fd6_context::control buffer: */ struct fd6_control { - uint32_t seqno; /* seqno for async CP_EVENT_WRITE, etc */ - uint32_t _pad0; - volatile uint32_t vsc_overflow; - uint32_t _pad1[5]; - - /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */ - struct { - uint32_t offset; - uint32_t pad[7]; - } flush_base[4]; + uint32_t seqno; /* seqno for async CP_EVENT_WRITE, etc */ + uint32_t _pad0; + volatile uint32_t vsc_overflow; + uint32_t _pad1[5]; + + /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. 
*/ + struct { + uint32_t offset; + uint32_t pad[7]; + } flush_base[4]; }; -#define control_ptr(fd6_ctx, member) \ - (fd6_ctx)->control_mem, offsetof(struct fd6_control, member), 0, 0 - +#define control_ptr(fd6_ctx, member) \ + (fd6_ctx)->control_mem, offsetof(struct fd6_control, member), 0, 0 static inline void emit_marker6(struct fd_ringbuffer *ring, int scratch_idx) { - extern int32_t marker_cnt; - unsigned reg = REG_A6XX_CP_SCRATCH_REG(scratch_idx); - if (__EMIT_MARKER) { - OUT_WFI5(ring); - OUT_PKT4(ring, reg, 1); - OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); - } + extern int32_t marker_cnt; + unsigned reg = REG_A6XX_CP_SCRATCH_REG(scratch_idx); + if (__EMIT_MARKER) { + OUT_WFI5(ring); + OUT_PKT4(ring, reg, 1); + OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); + } } struct fd6_vertex_stateobj { - struct fd_vertex_stateobj base; - struct fd_ringbuffer *stateobj; + struct fd_vertex_stateobj base; + struct fd_ringbuffer *stateobj; }; static inline struct fd6_vertex_stateobj * fd6_vertex_stateobj(void *p) { - return (struct fd6_vertex_stateobj *) p; + return (struct fd6_vertex_stateobj *)p; } - #endif /* FD6_CONTEXT_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_draw.c b/src/gallium/drivers/freedreno/a6xx/fd6_draw.c index aea5654..b21250c 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_draw.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_draw.c @@ -26,531 +26,514 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" #include "util/u_prim.h" +#include "util/u_string.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" -#include "fd6_draw.h" #include "fd6_context.h" +#include "fd6_draw.h" #include "fd6_emit.h" -#include "fd6_program.h" #include "fd6_format.h" +#include "fd6_program.h" #include "fd6_vsc.h" #include "fd6_zsa.h" #include "fd6_pack.h" static void -draw_emit_xfb(struct fd_ringbuffer *ring, - struct CP_DRAW_INDX_OFFSET_0 *draw0, - const struct pipe_draw_info *info, +draw_emit_xfb(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, + const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect) { - struct fd_stream_output_target *target = fd_stream_output_target(indirect->count_from_stream_output); - struct fd_resource *offset = fd_resource(target->offset_buf); - - /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO. - * Plus, for the common case where the counter buffer is written by - * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to - * complete which means we need a WAIT_FOR_ME anyway. - */ - OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); - - OUT_PKT7(ring, CP_DRAW_AUTO, 6); - OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); - OUT_RING(ring, info->instance_count); - OUT_RELOC(ring, offset->bo, 0, 0, 0); - OUT_RING(ring, 0); /* byte counter offset subtraced from the value read from above */ - OUT_RING(ring, target->stride); + struct fd_stream_output_target *target = + fd_stream_output_target(indirect->count_from_stream_output); + struct fd_resource *offset = fd_resource(target->offset_buf); + + /* All known firmware versions do not wait for WFI's with CP_DRAW_AUTO. + * Plus, for the common case where the counter buffer is written by + * vkCmdEndTransformFeedback, we need to wait for the CP_WAIT_MEM_WRITES to + * complete which means we need a WAIT_FOR_ME anyway. 
+ */ + OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); + + OUT_PKT7(ring, CP_DRAW_AUTO, 6); + OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value); + OUT_RING(ring, info->instance_count); + OUT_RELOC(ring, offset->bo, 0, 0, 0); + OUT_RING( + ring, + 0); /* byte counter offset subtraced from the value read from above */ + OUT_RING(ring, target->stride); } static void draw_emit_indirect(struct fd_ringbuffer *ring, - struct CP_DRAW_INDX_OFFSET_0 *draw0, - const struct pipe_draw_info *info, - const struct pipe_draw_indirect_info *indirect, - unsigned index_offset) + struct CP_DRAW_INDX_OFFSET_0 *draw0, + const struct pipe_draw_info *info, + const struct pipe_draw_indirect_info *indirect, + unsigned index_offset) { - struct fd_resource *ind = fd_resource(indirect->buffer); - - if (info->index_size) { - struct pipe_resource *idx = info->index.resource; - unsigned max_indices = (idx->width0 - index_offset) / info->index_size; - - OUT_PKT(ring, CP_DRAW_INDX_INDIRECT, - pack_CP_DRAW_INDX_OFFSET_0(*draw0), - A5XX_CP_DRAW_INDX_INDIRECT_INDX_BASE( - fd_resource(idx)->bo, index_offset), - A5XX_CP_DRAW_INDX_INDIRECT_3(.max_indices = max_indices), - A5XX_CP_DRAW_INDX_INDIRECT_INDIRECT( - ind->bo, indirect->offset) - ); - } else { - OUT_PKT(ring, CP_DRAW_INDIRECT, - pack_CP_DRAW_INDX_OFFSET_0(*draw0), - A5XX_CP_DRAW_INDIRECT_INDIRECT( - ind->bo, indirect->offset) - ); - } + struct fd_resource *ind = fd_resource(indirect->buffer); + + if (info->index_size) { + struct pipe_resource *idx = info->index.resource; + unsigned max_indices = (idx->width0 - index_offset) / info->index_size; + + OUT_PKT(ring, CP_DRAW_INDX_INDIRECT, pack_CP_DRAW_INDX_OFFSET_0(*draw0), + A5XX_CP_DRAW_INDX_INDIRECT_INDX_BASE(fd_resource(idx)->bo, + index_offset), + A5XX_CP_DRAW_INDX_INDIRECT_3(.max_indices = max_indices), + A5XX_CP_DRAW_INDX_INDIRECT_INDIRECT(ind->bo, indirect->offset)); + } else { + OUT_PKT(ring, CP_DRAW_INDIRECT, pack_CP_DRAW_INDX_OFFSET_0(*draw0), + A5XX_CP_DRAW_INDIRECT_INDIRECT(ind->bo, indirect->offset)); + } } static void -draw_emit(struct fd_ringbuffer *ring, - struct CP_DRAW_INDX_OFFSET_0 *draw0, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draw, - unsigned index_offset) +draw_emit(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count *draw, unsigned index_offset) { - if (info->index_size) { - assert(!info->has_user_indices); - - struct pipe_resource *idx_buffer = info->index.resource; - unsigned max_indices = (idx_buffer->width0 - index_offset) / info->index_size; - - OUT_PKT(ring, CP_DRAW_INDX_OFFSET, - pack_CP_DRAW_INDX_OFFSET_0(*draw0), - CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count), - CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count), - CP_DRAW_INDX_OFFSET_3(.first_indx = draw->start), - A5XX_CP_DRAW_INDX_OFFSET_INDX_BASE( - fd_resource(idx_buffer)->bo, index_offset), - A5XX_CP_DRAW_INDX_OFFSET_6(.max_indices = max_indices) - ); - } else { - OUT_PKT(ring, CP_DRAW_INDX_OFFSET, - pack_CP_DRAW_INDX_OFFSET_0(*draw0), - CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count), - CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count) - ); - } + if (info->index_size) { + assert(!info->has_user_indices); + + struct pipe_resource *idx_buffer = info->index.resource; + unsigned max_indices = + (idx_buffer->width0 - index_offset) / info->index_size; + + OUT_PKT(ring, CP_DRAW_INDX_OFFSET, pack_CP_DRAW_INDX_OFFSET_0(*draw0), + CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count), + 
CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count), + CP_DRAW_INDX_OFFSET_3(.first_indx = draw->start), + A5XX_CP_DRAW_INDX_OFFSET_INDX_BASE(fd_resource(idx_buffer)->bo, + index_offset), + A5XX_CP_DRAW_INDX_OFFSET_6(.max_indices = max_indices)); + } else { + OUT_PKT(ring, CP_DRAW_INDX_OFFSET, pack_CP_DRAW_INDX_OFFSET_0(*draw0), + CP_DRAW_INDX_OFFSET_1(.num_instances = info->instance_count), + CP_DRAW_INDX_OFFSET_2(.num_indices = draw->count)); + } } static void -fixup_draw_state(struct fd_context *ctx, struct fd6_emit *emit) - assert_dt +fixup_draw_state(struct fd_context *ctx, struct fd6_emit *emit) assert_dt { - if (ctx->last.dirty || - (ctx->last.primitive_restart != emit->primitive_restart)) { - /* rasterizer state is effected by primitive-restart: */ - fd_context_dirty(ctx, FD_DIRTY_RASTERIZER); - ctx->last.primitive_restart = emit->primitive_restart; - } + if (ctx->last.dirty || + (ctx->last.primitive_restart != emit->primitive_restart)) { + /* rasterizer state is effected by primitive-restart: */ + fd_context_dirty(ctx, FD_DIRTY_RASTERIZER); + ctx->last.primitive_restart = emit->primitive_restart; + } } static bool fd6_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, const struct pipe_draw_start_count *draw, - unsigned index_offset) - assert_dt + unsigned index_offset) assert_dt { - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct shader_info *gs_info = ir3_get_shader_info(ctx->prog.gs); - struct fd6_emit emit = { - .ctx = ctx, - .vtx = &ctx->vtx, - .info = info, - .indirect = indirect, - .draw = draw, - .key = { - .vs = ctx->prog.vs, - .gs = ctx->prog.gs, - .fs = ctx->prog.fs, - .key = { - .rasterflat = ctx->rasterizer->flatshade, - .layer_zero = !gs_info || !(gs_info->outputs_written & VARYING_BIT_LAYER), - .sample_shading = (ctx->min_samples > 1), - .msaa = (ctx->framebuffer.samples > 1), - }, - }, - .rasterflat = ctx->rasterizer->flatshade, - .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, - .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, - .primitive_restart = info->primitive_restart && info->index_size, - }; - - if (!(ctx->prog.vs && ctx->prog.fs)) - return false; - - if (info->mode == PIPE_PRIM_PATCHES) { - emit.key.hs = ctx->prog.hs; - emit.key.ds = ctx->prog.ds; - - if (!(ctx->prog.hs && ctx->prog.ds)) - return false; - - struct shader_info *ds_info = ir3_get_shader_info(emit.key.ds); - emit.key.key.tessellation = ir3_tess_mode(ds_info->tess.primitive_mode); - ctx->gen_dirty |= BIT(FD6_GROUP_PRIMITIVE_PARAMS); - } - - if (emit.key.gs) { - emit.key.key.has_gs = true; - ctx->gen_dirty |= BIT(FD6_GROUP_PRIMITIVE_PARAMS); - } - - if (!(emit.key.hs || emit.key.ds || emit.key.gs || indirect)) - fd6_vsc_update_sizes(ctx->batch, info, draw); - - ir3_fixup_shader_state(&ctx->base, &emit.key.key); - - if (!(ctx->dirty & FD_DIRTY_PROG)) { - emit.prog = fd6_ctx->prog; - } else { - fd6_ctx->prog = fd6_emit_get_prog(&emit); - } - - /* bail if compile failed: */ - if (!fd6_ctx->prog) - return false; - - fixup_draw_state(ctx, &emit); - - /* *after* fixup_shader_state(): */ - emit.dirty = ctx->dirty; - emit.dirty_groups = ctx->gen_dirty; - - emit.bs = fd6_emit_get_prog(&emit)->bs; - emit.vs = fd6_emit_get_prog(&emit)->vs; - emit.hs = fd6_emit_get_prog(&emit)->hs; - emit.ds = fd6_emit_get_prog(&emit)->ds; - emit.gs = fd6_emit_get_prog(&emit)->gs; - emit.fs = fd6_emit_get_prog(&emit)->fs; - - if (emit.vs->need_driver_params || fd6_ctx->has_dp_state) - emit.dirty_groups |= 
BIT(FD6_GROUP_VS_DRIVER_PARAMS); - - /* If we are doing xfb, we need to emit the xfb state on every draw: */ - if (emit.prog->stream_output) - emit.dirty_groups |= BIT(FD6_GROUP_SO); - - if (unlikely(ctx->stats_users > 0)) { - ctx->stats.vs_regs += ir3_shader_halfregs(emit.vs); - ctx->stats.hs_regs += COND(emit.hs, ir3_shader_halfregs(emit.hs)); - ctx->stats.ds_regs += COND(emit.ds, ir3_shader_halfregs(emit.ds)); - ctx->stats.gs_regs += COND(emit.gs, ir3_shader_halfregs(emit.gs)); - ctx->stats.fs_regs += ir3_shader_halfregs(emit.fs); - } - - struct fd_ringbuffer *ring = ctx->batch->draw; - - struct CP_DRAW_INDX_OFFSET_0 draw0 = { - .prim_type = ctx->primtypes[info->mode], - .vis_cull = USE_VISIBILITY, - .gs_enable = !!emit.key.gs, - }; - - if (indirect && indirect->count_from_stream_output) { - draw0.source_select= DI_SRC_SEL_AUTO_XFB; - } else if (info->index_size) { - draw0.source_select = DI_SRC_SEL_DMA; - draw0.index_size = fd4_size2indextype(info->index_size); - } else { - draw0.source_select = DI_SRC_SEL_AUTO_INDEX; - } - - if (info->mode == PIPE_PRIM_PATCHES) { - shader_info *ds_info = &emit.ds->shader->nir->info; - uint32_t factor_stride; - - switch (ds_info->tess.primitive_mode) { - case GL_ISOLINES: - draw0.patch_type = TESS_ISOLINES; - factor_stride = 12; - break; - case GL_TRIANGLES: - draw0.patch_type = TESS_TRIANGLES; - factor_stride = 20; - break; - case GL_QUADS: - draw0.patch_type = TESS_QUADS; - factor_stride = 28; - break; - default: - unreachable("bad tessmode"); - } - - draw0.prim_type = DI_PT_PATCHES0 + info->vertices_per_patch; - draw0.tess_enable = true; - - const unsigned max_count = 2048; - unsigned count; - - /** - * We can cap tessparam/tessfactor buffer sizes at the sub-draw - * limit. But in the indirect-draw case we must assume the worst. - */ - if (indirect && indirect->buffer) { - count = ALIGN_NPOT(max_count, info->vertices_per_patch); - } else { - count = MIN2(max_count, draw->count); - count = ALIGN_NPOT(count, info->vertices_per_patch); - } - - OUT_PKT7(ring, CP_SET_SUBDRAW_SIZE, 1); - OUT_RING(ring, count); - - ctx->batch->tessellation = true; - ctx->batch->tessparam_size = MAX2(ctx->batch->tessparam_size, - emit.hs->output_size * 4 * count); - ctx->batch->tessfactor_size = MAX2(ctx->batch->tessfactor_size, - factor_stride * count); - - if (!ctx->batch->tess_addrs_constobj) { - /* Reserve space for the bo address - we'll write them later in - * setup_tess_buffers(). We need 2 bo address, but indirect - * constant upload needs at least 4 vec4s. - */ - unsigned size = 4 * 16; - - ctx->batch->tess_addrs_constobj = fd_submit_new_ringbuffer( - ctx->batch->submit, size, FD_RINGBUFFER_STREAMING); - - ctx->batch->tess_addrs_constobj->cur += size; - } - } - - uint32_t index_start = info->index_size ? info->index_bias : draw->start; - if (ctx->last.dirty || (ctx->last.index_start != index_start)) { - OUT_PKT4(ring, REG_A6XX_VFD_INDEX_OFFSET, 1); - OUT_RING(ring, index_start); /* VFD_INDEX_OFFSET */ - ctx->last.index_start = index_start; - } - - if (ctx->last.dirty || (ctx->last.instance_start != info->start_instance)) { - OUT_PKT4(ring, REG_A6XX_VFD_INSTANCE_START_OFFSET, 1); - OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ - ctx->last.instance_start = info->start_instance; - } - - uint32_t restart_index = info->primitive_restart ? 
info->restart_index : 0xffffffff; - if (ctx->last.dirty || (ctx->last.restart_index != restart_index)) { - OUT_PKT4(ring, REG_A6XX_PC_RESTART_INDEX, 1); - OUT_RING(ring, restart_index); /* PC_RESTART_INDEX */ - ctx->last.restart_index = restart_index; - } - - // TODO move fd6_emit_streamout.. I think.. - if (emit.dirty_groups) - fd6_emit_state(ring, &emit); - - /* for debug after a lock up, write a unique counter value - * to scratch7 for each draw, to make it easier to match up - * register dumps to cmdstream. The combination of IB - * (scratch6) and DRAW is enough to "triangulate" the - * particular draw that caused lockup. - */ - emit_marker6(ring, 7); - - if (indirect) { - if (indirect->count_from_stream_output) { - draw_emit_xfb(ring, &draw0, info, indirect); - } else { - draw_emit_indirect(ring, &draw0, info, indirect, index_offset); - } - } else { - draw_emit(ring, &draw0, info, draw, index_offset); - } - - emit_marker6(ring, 7); - fd_reset_wfi(ctx->batch); - - if (emit.streamout_mask) { - struct fd_ringbuffer *ring = ctx->batch->draw; - - for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { - if (emit.streamout_mask & (1 << i)) { - fd6_event_write(ctx->batch, ring, FLUSH_SO_0 + i, false); - } - } - } - - fd_context_all_clean(ctx); - - return true; + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct shader_info *gs_info = ir3_get_shader_info(ctx->prog.gs); + struct fd6_emit emit = { + .ctx = ctx, + .vtx = &ctx->vtx, + .info = info, + .indirect = indirect, + .draw = draw, + .key = + { + .vs = ctx->prog.vs, + .gs = ctx->prog.gs, + .fs = ctx->prog.fs, + .key = + { + .rasterflat = ctx->rasterizer->flatshade, + .layer_zero = !gs_info || + !(gs_info->outputs_written & VARYING_BIT_LAYER), + .sample_shading = (ctx->min_samples > 1), + .msaa = (ctx->framebuffer.samples > 1), + }, + }, + .rasterflat = ctx->rasterizer->flatshade, + .sprite_coord_enable = ctx->rasterizer->sprite_coord_enable, + .sprite_coord_mode = ctx->rasterizer->sprite_coord_mode, + .primitive_restart = info->primitive_restart && info->index_size, + }; + + if (!(ctx->prog.vs && ctx->prog.fs)) + return false; + + if (info->mode == PIPE_PRIM_PATCHES) { + emit.key.hs = ctx->prog.hs; + emit.key.ds = ctx->prog.ds; + + if (!(ctx->prog.hs && ctx->prog.ds)) + return false; + + struct shader_info *ds_info = ir3_get_shader_info(emit.key.ds); + emit.key.key.tessellation = ir3_tess_mode(ds_info->tess.primitive_mode); + ctx->gen_dirty |= BIT(FD6_GROUP_PRIMITIVE_PARAMS); + } + + if (emit.key.gs) { + emit.key.key.has_gs = true; + ctx->gen_dirty |= BIT(FD6_GROUP_PRIMITIVE_PARAMS); + } + + if (!(emit.key.hs || emit.key.ds || emit.key.gs || indirect)) + fd6_vsc_update_sizes(ctx->batch, info, draw); + + ir3_fixup_shader_state(&ctx->base, &emit.key.key); + + if (!(ctx->dirty & FD_DIRTY_PROG)) { + emit.prog = fd6_ctx->prog; + } else { + fd6_ctx->prog = fd6_emit_get_prog(&emit); + } + + /* bail if compile failed: */ + if (!fd6_ctx->prog) + return false; + + fixup_draw_state(ctx, &emit); + + /* *after* fixup_shader_state(): */ + emit.dirty = ctx->dirty; + emit.dirty_groups = ctx->gen_dirty; + + emit.bs = fd6_emit_get_prog(&emit)->bs; + emit.vs = fd6_emit_get_prog(&emit)->vs; + emit.hs = fd6_emit_get_prog(&emit)->hs; + emit.ds = fd6_emit_get_prog(&emit)->ds; + emit.gs = fd6_emit_get_prog(&emit)->gs; + emit.fs = fd6_emit_get_prog(&emit)->fs; + + if (emit.vs->need_driver_params || fd6_ctx->has_dp_state) + emit.dirty_groups |= BIT(FD6_GROUP_VS_DRIVER_PARAMS); + + /* If we are doing xfb, we need to emit the xfb state on every draw: */ + if 
(emit.prog->stream_output) + emit.dirty_groups |= BIT(FD6_GROUP_SO); + + if (unlikely(ctx->stats_users > 0)) { + ctx->stats.vs_regs += ir3_shader_halfregs(emit.vs); + ctx->stats.hs_regs += COND(emit.hs, ir3_shader_halfregs(emit.hs)); + ctx->stats.ds_regs += COND(emit.ds, ir3_shader_halfregs(emit.ds)); + ctx->stats.gs_regs += COND(emit.gs, ir3_shader_halfregs(emit.gs)); + ctx->stats.fs_regs += ir3_shader_halfregs(emit.fs); + } + + struct fd_ringbuffer *ring = ctx->batch->draw; + + struct CP_DRAW_INDX_OFFSET_0 draw0 = { + .prim_type = ctx->primtypes[info->mode], + .vis_cull = USE_VISIBILITY, + .gs_enable = !!emit.key.gs, + }; + + if (indirect && indirect->count_from_stream_output) { + draw0.source_select = DI_SRC_SEL_AUTO_XFB; + } else if (info->index_size) { + draw0.source_select = DI_SRC_SEL_DMA; + draw0.index_size = fd4_size2indextype(info->index_size); + } else { + draw0.source_select = DI_SRC_SEL_AUTO_INDEX; + } + + if (info->mode == PIPE_PRIM_PATCHES) { + shader_info *ds_info = &emit.ds->shader->nir->info; + uint32_t factor_stride; + + switch (ds_info->tess.primitive_mode) { + case GL_ISOLINES: + draw0.patch_type = TESS_ISOLINES; + factor_stride = 12; + break; + case GL_TRIANGLES: + draw0.patch_type = TESS_TRIANGLES; + factor_stride = 20; + break; + case GL_QUADS: + draw0.patch_type = TESS_QUADS; + factor_stride = 28; + break; + default: + unreachable("bad tessmode"); + } + + draw0.prim_type = DI_PT_PATCHES0 + info->vertices_per_patch; + draw0.tess_enable = true; + + const unsigned max_count = 2048; + unsigned count; + + /** + * We can cap tessparam/tessfactor buffer sizes at the sub-draw + * limit. But in the indirect-draw case we must assume the worst. + */ + if (indirect && indirect->buffer) { + count = ALIGN_NPOT(max_count, info->vertices_per_patch); + } else { + count = MIN2(max_count, draw->count); + count = ALIGN_NPOT(count, info->vertices_per_patch); + } + + OUT_PKT7(ring, CP_SET_SUBDRAW_SIZE, 1); + OUT_RING(ring, count); + + ctx->batch->tessellation = true; + ctx->batch->tessparam_size = + MAX2(ctx->batch->tessparam_size, emit.hs->output_size * 4 * count); + ctx->batch->tessfactor_size = + MAX2(ctx->batch->tessfactor_size, factor_stride * count); + + if (!ctx->batch->tess_addrs_constobj) { + /* Reserve space for the bo address - we'll write them later in + * setup_tess_buffers(). We need 2 bo address, but indirect + * constant upload needs at least 4 vec4s. + */ + unsigned size = 4 * 16; + + ctx->batch->tess_addrs_constobj = fd_submit_new_ringbuffer( + ctx->batch->submit, size, FD_RINGBUFFER_STREAMING); + + ctx->batch->tess_addrs_constobj->cur += size; + } + } + + uint32_t index_start = info->index_size ? info->index_bias : draw->start; + if (ctx->last.dirty || (ctx->last.index_start != index_start)) { + OUT_PKT4(ring, REG_A6XX_VFD_INDEX_OFFSET, 1); + OUT_RING(ring, index_start); /* VFD_INDEX_OFFSET */ + ctx->last.index_start = index_start; + } + + if (ctx->last.dirty || (ctx->last.instance_start != info->start_instance)) { + OUT_PKT4(ring, REG_A6XX_VFD_INSTANCE_START_OFFSET, 1); + OUT_RING(ring, info->start_instance); /* VFD_INSTANCE_START_OFFSET */ + ctx->last.instance_start = info->start_instance; + } + + uint32_t restart_index = + info->primitive_restart ? info->restart_index : 0xffffffff; + if (ctx->last.dirty || (ctx->last.restart_index != restart_index)) { + OUT_PKT4(ring, REG_A6XX_PC_RESTART_INDEX, 1); + OUT_RING(ring, restart_index); /* PC_RESTART_INDEX */ + ctx->last.restart_index = restart_index; + } + + // TODO move fd6_emit_streamout.. I think.. 
+ if (emit.dirty_groups) + fd6_emit_state(ring, &emit); + + /* for debug after a lock up, write a unique counter value + * to scratch7 for each draw, to make it easier to match up + * register dumps to cmdstream. The combination of IB + * (scratch6) and DRAW is enough to "triangulate" the + * particular draw that caused lockup. + */ + emit_marker6(ring, 7); + + if (indirect) { + if (indirect->count_from_stream_output) { + draw_emit_xfb(ring, &draw0, info, indirect); + } else { + draw_emit_indirect(ring, &draw0, info, indirect, index_offset); + } + } else { + draw_emit(ring, &draw0, info, draw, index_offset); + } + + emit_marker6(ring, 7); + fd_reset_wfi(ctx->batch); + + if (emit.streamout_mask) { + struct fd_ringbuffer *ring = ctx->batch->draw; + + for (unsigned i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + if (emit.streamout_mask & (1 << i)) { + fd6_event_write(ctx->batch, ring, FLUSH_SO_0 + i, false); + } + } + } + + fd_context_all_clean(ctx); + + return true; } static void fd6_clear_lrz(struct fd_batch *batch, struct fd_resource *zsbuf, double depth) { - struct fd_ringbuffer *ring; - struct fd_screen *screen = batch->ctx->screen; - - ring = fd_batch_get_prologue(batch); - - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); - emit_marker6(ring, 7); - - OUT_WFI5(ring); - - OUT_REG(ring, A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_bypass)); - - OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD( - .vs_state = true, - .hs_state = true, - .ds_state = true, - .gs_state = true, - .fs_state = true, - .cs_state = true, - .gfx_ibo = true, - .cs_ibo = true, - .gfx_shared_const = true, - .gfx_bindless = 0x1f, - .cs_bindless = 0x1f - )); - - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); - emit_marker6(ring, 7); - - OUT_PKT4(ring, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); - OUT_RING(ring, 0x0); - - OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 13); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_SP_2D_DST_FORMAT, 1); - OUT_RING(ring, 0x0000f410); - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_2D_BLIT_CNTL_COLOR_FORMAT(FMT6_16_UNORM) | - 0x4f00080); - - OUT_PKT4(ring, REG_A6XX_RB_2D_BLIT_CNTL, 1); - OUT_RING(ring, A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(FMT6_16_UNORM) | - 0x4f00080); - - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); - - OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); - OUT_RING(ring, fui(depth)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); - OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_16_UNORM) | - A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); - OUT_RELOC(ring, zsbuf->lrz, 0, 0, 0); - OUT_RING(ring, A6XX_RB_2D_DST_PITCH(zsbuf->lrz_pitch * 2).value); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - - OUT_REG(ring, - A6XX_GRAS_2D_SRC_TL_X(0), - 
A6XX_GRAS_2D_SRC_BR_X(0), - A6XX_GRAS_2D_SRC_TL_Y(0), - A6XX_GRAS_2D_SRC_BR_Y(0)); - - OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); - OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | - A6XX_GRAS_2D_DST_TL_Y(0)); - OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(zsbuf->lrz_width - 1) | - A6XX_GRAS_2D_DST_BR_Y(zsbuf->lrz_height - 1)); - - fd6_event_write(batch, ring, 0x3f, false); - - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); - - OUT_PKT7(ring, CP_BLIT, 1); - OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); - - OUT_WFI5(ring); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); - OUT_RING(ring, 0x0); /* RB_UNKNOWN_8E04 */ - - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); - fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); - - fd6_cache_inv(batch, ring); + struct fd_ringbuffer *ring; + struct fd_screen *screen = batch->ctx->screen; + + ring = fd_batch_get_prologue(batch); + + emit_marker6(ring, 7); + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); + emit_marker6(ring, 7); + + OUT_WFI5(ring); + + OUT_REG(ring, + A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_bypass)); + + OUT_REG(ring, + A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true, + .ds_state = true, .gs_state = true, + .fs_state = true, .cs_state = true, + .gfx_ibo = true, .cs_ibo = true, + .gfx_shared_const = true, + .gfx_bindless = 0x1f, .cs_bindless = 0x1f)); + + emit_marker6(ring, 7); + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BLIT2DSCALE)); + emit_marker6(ring, 7); + + OUT_PKT4(ring, REG_A6XX_RB_2D_UNKNOWN_8C01, 1); + OUT_RING(ring, 0x0); + + OUT_PKT4(ring, REG_A6XX_SP_PS_2D_SRC_INFO, 13); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A6XX_SP_2D_DST_FORMAT, 1); + OUT_RING(ring, 0x0000f410); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_BLIT_CNTL, 1); + OUT_RING(ring, + A6XX_GRAS_2D_BLIT_CNTL_COLOR_FORMAT(FMT6_16_UNORM) | 0x4f00080); + + OUT_PKT4(ring, REG_A6XX_RB_2D_BLIT_CNTL, 1); + OUT_RING(ring, A6XX_RB_2D_BLIT_CNTL_COLOR_FORMAT(FMT6_16_UNORM) | 0x4f00080); + + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); + + OUT_PKT4(ring, REG_A6XX_RB_2D_SRC_SOLID_C0, 4); + OUT_RING(ring, fui(depth)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A6XX_RB_2D_DST_INFO, 9); + OUT_RING(ring, A6XX_RB_2D_DST_INFO_COLOR_FORMAT(FMT6_16_UNORM) | + A6XX_RB_2D_DST_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_RB_2D_DST_INFO_COLOR_SWAP(WZYX)); + OUT_RELOC(ring, zsbuf->lrz, 0, 0, 0); + OUT_RING(ring, A6XX_RB_2D_DST_PITCH(zsbuf->lrz_pitch * 2).value); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + + OUT_REG(ring, A6XX_GRAS_2D_SRC_TL_X(0), A6XX_GRAS_2D_SRC_BR_X(0), + A6XX_GRAS_2D_SRC_TL_Y(0), A6XX_GRAS_2D_SRC_BR_Y(0)); + + OUT_PKT4(ring, REG_A6XX_GRAS_2D_DST_TL, 2); + OUT_RING(ring, A6XX_GRAS_2D_DST_TL_X(0) | A6XX_GRAS_2D_DST_TL_Y(0)); + 
OUT_RING(ring, A6XX_GRAS_2D_DST_BR_X(zsbuf->lrz_width - 1) | + A6XX_GRAS_2D_DST_BR_Y(zsbuf->lrz_height - 1)); + + fd6_event_write(batch, ring, 0x3f, false); + + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, screen->info.a6xx.magic.RB_UNKNOWN_8E04_blit); + + OUT_PKT7(ring, CP_BLIT, 1); + OUT_RING(ring, CP_BLIT_0_OP(BLIT_OP_SCALE)); + + OUT_WFI5(ring); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_8E04, 1); + OUT_RING(ring, 0x0); /* RB_UNKNOWN_8E04 */ + + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + + fd6_cache_inv(batch, ring); } -static bool is_z32(enum pipe_format format) +static bool +is_z32(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - case PIPE_FORMAT_Z32_UNORM: - case PIPE_FORMAT_Z32_FLOAT: - return true; - default: - return false; - } + switch (format) { + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + case PIPE_FORMAT_Z32_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + return true; + default: + return false; + } } static bool fd6_clear(struct fd_context *ctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) - assert_dt + const union pipe_color_union *color, double depth, + unsigned stencil) assert_dt { - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - const bool has_depth = pfb->zsbuf; - unsigned color_buffers = buffers >> 2; - - /* we need to do multisample clear on 3d pipe, so fallback to u_blitter: */ - if (pfb->samples > 1) - return false; - - /* If we're clearing after draws, fallback to 3D pipe clears. We could - * use blitter clears in the draw batch but then we'd have to patch up the - * gmem offsets. This doesn't seem like a useful thing to optimize for - * however.*/ - if (ctx->batch->num_draws > 0) - return false; - - u_foreach_bit(i, color_buffers) - ctx->batch->clear_color[i] = *color; - if (buffers & PIPE_CLEAR_DEPTH) - ctx->batch->clear_depth = depth; - if (buffers & PIPE_CLEAR_STENCIL) - ctx->batch->clear_stencil = stencil; - - ctx->batch->fast_cleared |= buffers; - - if (has_depth && (buffers & PIPE_CLEAR_DEPTH)) { - struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture); - if (zsbuf->lrz && !is_z32(pfb->zsbuf->format)) { - zsbuf->lrz_valid = true; - zsbuf->lrz_direction = FD_LRZ_UNKNOWN; - fd6_clear_lrz(ctx->batch, zsbuf, depth); - } - } - - return true; + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + const bool has_depth = pfb->zsbuf; + unsigned color_buffers = buffers >> 2; + + /* we need to do multisample clear on 3d pipe, so fallback to u_blitter: */ + if (pfb->samples > 1) + return false; + + /* If we're clearing after draws, fallback to 3D pipe clears. We could + * use blitter clears in the draw batch but then we'd have to patch up the + * gmem offsets. 
This doesn't seem like a useful thing to optimize for + * however.*/ + if (ctx->batch->num_draws > 0) + return false; + + u_foreach_bit (i, color_buffers) + ctx->batch->clear_color[i] = *color; + if (buffers & PIPE_CLEAR_DEPTH) + ctx->batch->clear_depth = depth; + if (buffers & PIPE_CLEAR_STENCIL) + ctx->batch->clear_stencil = stencil; + + ctx->batch->fast_cleared |= buffers; + + if (has_depth && (buffers & PIPE_CLEAR_DEPTH)) { + struct fd_resource *zsbuf = fd_resource(pfb->zsbuf->texture); + if (zsbuf->lrz && !is_z32(pfb->zsbuf->format)) { + zsbuf->lrz_valid = true; + zsbuf->lrz_direction = FD_LRZ_UNKNOWN; + fd6_clear_lrz(ctx->batch, zsbuf, depth); + } + } + + return true; } void -fd6_draw_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_draw_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->draw_vbo = fd6_draw_vbo; - ctx->clear = fd6_clear; + struct fd_context *ctx = fd_context(pctx); + ctx->draw_vbo = fd6_draw_vbo; + ctx->clear = fd6_clear; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c index 2933ad6..88aef02 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.c @@ -26,28 +26,28 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_helpers.h" #include "util/format/u_format.h" +#include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "util/u_viewport.h" +#include "common/freedreno_guardband.h" +#include "freedreno_query_hw.h" #include "freedreno_resource.h" #include "freedreno_state.h" #include "freedreno_tracepoints.h" -#include "freedreno_query_hw.h" -#include "common/freedreno_guardband.h" -#include "fd6_emit.h" #include "fd6_blend.h" #include "fd6_const.h" #include "fd6_context.h" +#include "fd6_emit.h" +#include "fd6_format.h" #include "fd6_image.h" #include "fd6_pack.h" #include "fd6_program.h" #include "fd6_rasterizer.h" #include "fd6_texture.h" -#include "fd6_format.h" #include "fd6_zsa.h" /* Border color layout is diff from a4xx/a5xx.. if it turns out to be @@ -57,421 +57,422 @@ */ struct PACKED bcolor_entry { - uint32_t fp32[4]; - uint16_t ui16[4]; - int16_t si16[4]; - uint16_t fp16[4]; - uint16_t rgb565; - uint16_t rgb5a1; - uint16_t rgba4; - uint8_t __pad0[2]; - uint8_t ui8[4]; - int8_t si8[4]; - uint32_t rgb10a2; - uint32_t z24; /* also s8? */ - uint16_t srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */ - uint8_t __pad1[56]; + uint32_t fp32[4]; + uint16_t ui16[4]; + int16_t si16[4]; + uint16_t fp16[4]; + uint16_t rgb565; + uint16_t rgb5a1; + uint16_t rgba4; + uint8_t __pad0[2]; + uint8_t ui8[4]; + int8_t si8[4]; + uint32_t rgb10a2; + uint32_t z24; /* also s8? 
*/ + uint16_t + srgb[4]; /* appears to duplicate fp16[], but clamped, used for srgb */ + uint8_t __pad1[56]; }; -#define FD6_BORDER_COLOR_SIZE sizeof(struct bcolor_entry) -#define FD6_BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE) +#define FD6_BORDER_COLOR_SIZE sizeof(struct bcolor_entry) +#define FD6_BORDER_COLOR_UPLOAD_SIZE \ + (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE) static void -setup_border_colors(struct fd_texture_stateobj *tex, struct bcolor_entry *entries) +setup_border_colors(struct fd_texture_stateobj *tex, + struct bcolor_entry *entries) { - unsigned i, j; - STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE); - - for (i = 0; i < tex->num_samplers; i++) { - struct bcolor_entry *e = &entries[i]; - struct pipe_sampler_state *sampler = tex->samplers[i]; - union pipe_color_union *bc; - - if (!sampler) - continue; - - bc = &sampler->border_color; - - /* - * XXX HACK ALERT XXX - * - * The border colors need to be swizzled in a particular - * format-dependent order. Even though samplers don't know about - * formats, we can assume that with a GL state tracker, there's a - * 1:1 correspondence between sampler and texture. Take advantage - * of that knowledge. - */ - if ((i >= tex->num_textures) || !tex->textures[i]) - continue; - - struct pipe_sampler_view *view = tex->textures[i]; - enum pipe_format format = view->format; - const struct util_format_description *desc = - util_format_description(format); - - e->rgb565 = 0; - e->rgb5a1 = 0; - e->rgba4 = 0; - e->rgb10a2 = 0; - e->z24 = 0; - - unsigned char swiz[4]; - - fd6_tex_swiz(format, swiz, - view->swizzle_r, view->swizzle_g, - view->swizzle_b, view->swizzle_a); - - for (j = 0; j < 4; j++) { - int c = swiz[j]; - int cd = c; - - /* - * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the - * stencil border color value in bc->ui[0] but according - * to desc->swizzle and desc->channel, the .x/.w component - * is NONE and the stencil value is in the y component. - * Meanwhile the hardware wants this in the .w component - * for x24s8 and the .x component for x32_s8x24. - */ - if ((format == PIPE_FORMAT_X24S8_UINT) || - (format == PIPE_FORMAT_X32_S8X24_UINT)) { - if (j == 0) { - c = 1; - cd = (format == PIPE_FORMAT_X32_S8X24_UINT) ? 0 : 3; - } else { - continue; - } - } - - if (c >= 4) - continue; - - if (desc->channel[c].pure_integer) { - uint16_t clamped; - switch (desc->channel[c].size) { - case 2: - assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); - clamped = CLAMP(bc->ui[j], 0, 0x3); - break; - case 8: - if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) - clamped = CLAMP(bc->i[j], -128, 127); - else - clamped = CLAMP(bc->ui[j], 0, 255); - break; - case 10: - assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); - clamped = CLAMP(bc->ui[j], 0, 0x3ff); - break; - case 16: - if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) - clamped = CLAMP(bc->i[j], -32768, 32767); - else - clamped = CLAMP(bc->ui[j], 0, 65535); - break; - default: - assert(!"Unexpected bit size"); - case 32: - clamped = 0; - break; - } - e->fp32[cd] = bc->ui[j]; - e->fp16[cd] = clamped; - } else { - float f = bc->f[j]; - float f_u = CLAMP(f, 0, 1); - float f_s = CLAMP(f, -1, 1); - - e->fp32[c] = fui(f); - e->fp16[c] = _mesa_float_to_half(f); - e->srgb[c] = _mesa_float_to_half(f_u); - e->ui16[c] = f_u * 0xffff; - e->si16[c] = f_s * 0x7fff; - e->ui8[c] = f_u * 0xff; - e->si8[c] = f_s * 0x7f; - if (c == 1) - e->rgb565 |= (int)(f_u * 0x3f) << 5; - else if (c < 3) - e->rgb565 |= (int)(f_u * 0x1f) << (c ? 
11 : 0); - if (c == 3) - e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0; - else - e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5); - if (c == 3) - e->rgb10a2 |= (int)(f_u * 0x3) << 30; - else - e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10); - e->rgba4 |= (int)(f_u * 0xf) << (c * 4); - if (c == 0) - e->z24 = f_u * 0xffffff; - } - } + unsigned i, j; + STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE); + + for (i = 0; i < tex->num_samplers; i++) { + struct bcolor_entry *e = &entries[i]; + struct pipe_sampler_state *sampler = tex->samplers[i]; + union pipe_color_union *bc; + + if (!sampler) + continue; + + bc = &sampler->border_color; + + /* + * XXX HACK ALERT XXX + * + * The border colors need to be swizzled in a particular + * format-dependent order. Even though samplers don't know about + * formats, we can assume that with a GL state tracker, there's a + * 1:1 correspondence between sampler and texture. Take advantage + * of that knowledge. + */ + if ((i >= tex->num_textures) || !tex->textures[i]) + continue; + + struct pipe_sampler_view *view = tex->textures[i]; + enum pipe_format format = view->format; + const struct util_format_description *desc = + util_format_description(format); + + e->rgb565 = 0; + e->rgb5a1 = 0; + e->rgba4 = 0; + e->rgb10a2 = 0; + e->z24 = 0; + + unsigned char swiz[4]; + + fd6_tex_swiz(format, swiz, view->swizzle_r, view->swizzle_g, + view->swizzle_b, view->swizzle_a); + + for (j = 0; j < 4; j++) { + int c = swiz[j]; + int cd = c; + + /* + * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the + * stencil border color value in bc->ui[0] but according + * to desc->swizzle and desc->channel, the .x/.w component + * is NONE and the stencil value is in the y component. + * Meanwhile the hardware wants this in the .w component + * for x24s8 and the .x component for x32_s8x24. + */ + if ((format == PIPE_FORMAT_X24S8_UINT) || + (format == PIPE_FORMAT_X32_S8X24_UINT)) { + if (j == 0) { + c = 1; + cd = (format == PIPE_FORMAT_X32_S8X24_UINT) ? 0 : 3; + } else { + continue; + } + } + + if (c >= 4) + continue; + + if (desc->channel[c].pure_integer) { + uint16_t clamped; + switch (desc->channel[c].size) { + case 2: + assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); + clamped = CLAMP(bc->ui[j], 0, 0x3); + break; + case 8: + if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) + clamped = CLAMP(bc->i[j], -128, 127); + else + clamped = CLAMP(bc->ui[j], 0, 255); + break; + case 10: + assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED); + clamped = CLAMP(bc->ui[j], 0, 0x3ff); + break; + case 16: + if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED) + clamped = CLAMP(bc->i[j], -32768, 32767); + else + clamped = CLAMP(bc->ui[j], 0, 65535); + break; + default: + assert(!"Unexpected bit size"); + case 32: + clamped = 0; + break; + } + e->fp32[cd] = bc->ui[j]; + e->fp16[cd] = clamped; + } else { + float f = bc->f[j]; + float f_u = CLAMP(f, 0, 1); + float f_s = CLAMP(f, -1, 1); + + e->fp32[c] = fui(f); + e->fp16[c] = _mesa_float_to_half(f); + e->srgb[c] = _mesa_float_to_half(f_u); + e->ui16[c] = f_u * 0xffff; + e->si16[c] = f_s * 0x7fff; + e->ui8[c] = f_u * 0xff; + e->si8[c] = f_s * 0x7f; + if (c == 1) + e->rgb565 |= (int)(f_u * 0x3f) << 5; + else if (c < 3) + e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0); + if (c == 3) + e->rgb5a1 |= (f_u > 0.5) ? 
0x8000 : 0; + else + e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5); + if (c == 3) + e->rgb10a2 |= (int)(f_u * 0x3) << 30; + else + e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10); + e->rgba4 |= (int)(f_u * 0xf) << (c * 4); + if (c == 0) + e->z24 = f_u * 0xffffff; + } + } #ifdef DEBUG - memset(&e->__pad0, 0, sizeof(e->__pad0)); - memset(&e->__pad1, 0, sizeof(e->__pad1)); + memset(&e->__pad0, 0, sizeof(e->__pad0)); + memset(&e->__pad1, 0, sizeof(e->__pad1)); #endif - } + } } static void -emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) - assert_dt +emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring) assert_dt { - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct bcolor_entry *entries; - unsigned off; - void *ptr; + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct bcolor_entry *entries; + unsigned off; + void *ptr; - STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE); + STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE); - u_upload_alloc(fd6_ctx->border_color_uploader, - 0, FD6_BORDER_COLOR_UPLOAD_SIZE, - FD6_BORDER_COLOR_UPLOAD_SIZE, &off, - &fd6_ctx->border_color_buf, - &ptr); + u_upload_alloc(fd6_ctx->border_color_uploader, 0, + FD6_BORDER_COLOR_UPLOAD_SIZE, FD6_BORDER_COLOR_UPLOAD_SIZE, + &off, &fd6_ctx->border_color_buf, &ptr); - entries = ptr; + entries = ptr; - setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]); - setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT], - &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]); + setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]); + setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT], + &entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]); - OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2); - OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0); + OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR, 2); + OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0); - u_upload_unmap(fd6_ctx->border_color_uploader); + u_upload_unmap(fd6_ctx->border_color_uploader); } static void -fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx) - assert_dt +fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx) assert_dt { - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - struct pipe_surface *psurf = pfb->cbufs[0]; - struct fd_resource *rsc = fd_resource(psurf->texture); - - uint32_t texconst0 = fd6_tex_const_0(psurf->texture, psurf->u.tex.level, - psurf->format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W); - - /* always TILE6_2 mode in GMEM.. 
which also means no swap: */ - texconst0 &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); - texconst0 |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); - - OUT_RING(state, texconst0); - OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) | - A6XX_TEX_CONST_1_HEIGHT(pfb->height)); - OUT_RINGP(state, A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D), - &ctx->batch->fb_read_patches); - OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size)); - - OUT_RING(state, A6XX_TEX_CONST_4_BASE_LO(ctx->screen->gmem_base)); - OUT_RING(state, A6XX_TEX_CONST_5_BASE_HI(ctx->screen->gmem_base >> 32) | - A6XX_TEX_CONST_5_DEPTH(1)); - OUT_RING(state, 0); /* texconst6 */ - OUT_RING(state, 0); /* texconst7 */ - OUT_RING(state, 0); /* texconst8 */ - OUT_RING(state, 0); /* texconst9 */ - OUT_RING(state, 0); /* texconst10 */ - OUT_RING(state, 0); /* texconst11 */ - OUT_RING(state, 0); - OUT_RING(state, 0); - OUT_RING(state, 0); - OUT_RING(state, 0); + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + struct pipe_surface *psurf = pfb->cbufs[0]; + struct fd_resource *rsc = fd_resource(psurf->texture); + + uint32_t texconst0 = fd6_tex_const_0( + psurf->texture, psurf->u.tex.level, psurf->format, PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W); + + /* always TILE6_2 mode in GMEM.. which also means no swap: */ + texconst0 &= + ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK); + texconst0 |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2); + + OUT_RING(state, texconst0); + OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) | + A6XX_TEX_CONST_1_HEIGHT(pfb->height)); + OUT_RINGP(state, A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D), + &ctx->batch->fb_read_patches); + OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size)); + + OUT_RING(state, A6XX_TEX_CONST_4_BASE_LO(ctx->screen->gmem_base)); + OUT_RING(state, A6XX_TEX_CONST_5_BASE_HI(ctx->screen->gmem_base >> 32) | + A6XX_TEX_CONST_5_DEPTH(1)); + OUT_RING(state, 0); /* texconst6 */ + OUT_RING(state, 0); /* texconst7 */ + OUT_RING(state, 0); /* texconst8 */ + OUT_RING(state, 0); /* texconst9 */ + OUT_RING(state, 0); /* texconst10 */ + OUT_RING(state, 0); /* texconst11 */ + OUT_RING(state, 0); + OUT_RING(state, 0); + OUT_RING(state, 0); + OUT_RING(state, 0); } bool fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum pipe_shader_type type, struct fd_texture_stateobj *tex, - unsigned bcolor_offset, - /* can be NULL if no image/SSBO/fb state to merge in: */ - const struct ir3_shader_variant *v) + enum pipe_shader_type type, struct fd_texture_stateobj *tex, + unsigned bcolor_offset, + /* can be NULL if no image/SSBO/fb state to merge in: */ + const struct ir3_shader_variant *v) { - bool needs_border = false; - unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg; - enum a6xx_state_block sb; - - switch (type) { - case PIPE_SHADER_VERTEX: - sb = SB6_VS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP; - tex_const_reg = REG_A6XX_SP_VS_TEX_CONST; - tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT; - break; - case PIPE_SHADER_TESS_CTRL: - sb = SB6_HS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP; - tex_const_reg = REG_A6XX_SP_HS_TEX_CONST; - tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT; - break; - case PIPE_SHADER_TESS_EVAL: - sb = SB6_DS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP; - tex_const_reg = REG_A6XX_SP_DS_TEX_CONST; - tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT; - break; - case PIPE_SHADER_GEOMETRY: - sb = 
SB6_GS_TEX; - opcode = CP_LOAD_STATE6_GEOM; - tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP; - tex_const_reg = REG_A6XX_SP_GS_TEX_CONST; - tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT; - break; - case PIPE_SHADER_FRAGMENT: - sb = SB6_FS_TEX; - opcode = CP_LOAD_STATE6_FRAG; - tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP; - tex_const_reg = REG_A6XX_SP_FS_TEX_CONST; - tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT; - break; - case PIPE_SHADER_COMPUTE: - sb = SB6_CS_TEX; - opcode = CP_LOAD_STATE6_FRAG; - tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP; - tex_const_reg = REG_A6XX_SP_CS_TEX_CONST; - tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT; - break; - default: - unreachable("bad state block"); - } - - if (tex->num_samplers > 0) { - struct fd_ringbuffer *state = - fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4 * 4); - for (unsigned i = 0; i < tex->num_samplers; i++) { - static const struct fd6_sampler_stateobj dummy_sampler = {}; - const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ? - fd6_sampler_stateobj(tex->samplers[i]) : &dummy_sampler; - OUT_RING(state, sampler->texsamp0); - OUT_RING(state, sampler->texsamp1); - OUT_RING(state, sampler->texsamp2 | - A6XX_TEX_SAMP_2_BCOLOR(i + bcolor_offset)); - OUT_RING(state, sampler->texsamp3); - needs_border |= sampler->needs_border; - } - - /* output sampler state: */ - OUT_PKT7(ring, opcode, 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers)); - OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ - - OUT_PKT4(ring, tex_samp_reg, 2); - OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ - - fd_ringbuffer_del(state); - } - - unsigned num_merged_textures = tex->num_textures; - unsigned num_textures = tex->num_textures; - if (v) { - num_merged_textures += v->image_mapping.num_tex; - - if (v->fb_read) - num_merged_textures++; - - /* There could be more bound textures than what the shader uses. - * Which isn't known at shader compile time. 
So in the case we - * are merging tex state, only emit the textures that the shader - * uses (since the image/SSBO related tex state comes immediately - * after) - */ - num_textures = v->image_mapping.tex_base; - } - - if (num_merged_textures > 0) { - struct fd_ringbuffer *state = - fd_ringbuffer_new_object(ctx->pipe, num_merged_textures * 16 * 4); - for (unsigned i = 0; i < num_textures; i++) { - const struct fd6_pipe_sampler_view *view; - - if (tex->textures[i]) { - view = fd6_pipe_sampler_view(tex->textures[i]); - if (unlikely(view->rsc_seqno != fd_resource(view->base.texture)->seqno)) { - fd6_sampler_view_update(ctx, - fd6_pipe_sampler_view(tex->textures[i])); - } - } else { - static const struct fd6_pipe_sampler_view dummy_view = {}; - view = &dummy_view; - } - - OUT_RING(state, view->texconst0); - OUT_RING(state, view->texconst1); - OUT_RING(state, view->texconst2); - OUT_RING(state, view->texconst3); - - if (view->ptr1) { - OUT_RELOC(state, view->ptr1->bo, view->offset1, - (uint64_t)view->texconst5 << 32, 0); - } else { - OUT_RING(state, 0x00000000); - OUT_RING(state, view->texconst5); - } - - OUT_RING(state, view->texconst6); - - if (view->ptr2) { - OUT_RELOC(state, view->ptr2->bo, view->offset2, 0, 0); - } else { - OUT_RING(state, 0); - OUT_RING(state, 0); - } - - OUT_RING(state, view->texconst9); - OUT_RING(state, view->texconst10); - OUT_RING(state, view->texconst11); - OUT_RING(state, 0); - OUT_RING(state, 0); - OUT_RING(state, 0); - OUT_RING(state, 0); - } - - if (v) { - const struct ir3_ibo_mapping *mapping = &v->image_mapping; - struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type]; - struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type]; - - for (unsigned i = 0; i < mapping->num_tex; i++) { - unsigned idx = mapping->tex_to_image[i]; - if (idx & IBO_SSBO) { - fd6_emit_ssbo_tex(state, &buf->sb[idx & ~IBO_SSBO]); - } else { - fd6_emit_image_tex(state, &img->si[idx]); - } - } - - if (v->fb_read) { - fd6_emit_fb_tex(state, ctx); - } - } - - /* emit texture state: */ - OUT_PKT7(ring, opcode, 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures)); - OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ - - OUT_PKT4(ring, tex_const_reg, 2); - OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ - - fd_ringbuffer_del(state); - } - - OUT_PKT4(ring, tex_count_reg, 1); - OUT_RING(ring, num_merged_textures); - - return needs_border; + bool needs_border = false; + unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg; + enum a6xx_state_block sb; + + switch (type) { + case PIPE_SHADER_VERTEX: + sb = SB6_VS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP; + tex_const_reg = REG_A6XX_SP_VS_TEX_CONST; + tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT; + break; + case PIPE_SHADER_TESS_CTRL: + sb = SB6_HS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_HS_TEX_SAMP; + tex_const_reg = REG_A6XX_SP_HS_TEX_CONST; + tex_count_reg = REG_A6XX_SP_HS_TEX_COUNT; + break; + case PIPE_SHADER_TESS_EVAL: + sb = SB6_DS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_DS_TEX_SAMP; + tex_const_reg = REG_A6XX_SP_DS_TEX_CONST; + tex_count_reg = REG_A6XX_SP_DS_TEX_COUNT; + break; + case PIPE_SHADER_GEOMETRY: + sb = SB6_GS_TEX; + opcode = CP_LOAD_STATE6_GEOM; + tex_samp_reg = REG_A6XX_SP_GS_TEX_SAMP; + tex_const_reg = REG_A6XX_SP_GS_TEX_CONST; + tex_count_reg = REG_A6XX_SP_GS_TEX_COUNT; + break; 
+ case PIPE_SHADER_FRAGMENT: + sb = SB6_FS_TEX; + opcode = CP_LOAD_STATE6_FRAG; + tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP; + tex_const_reg = REG_A6XX_SP_FS_TEX_CONST; + tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT; + break; + case PIPE_SHADER_COMPUTE: + sb = SB6_CS_TEX; + opcode = CP_LOAD_STATE6_FRAG; + tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP; + tex_const_reg = REG_A6XX_SP_CS_TEX_CONST; + tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT; + break; + default: + unreachable("bad state block"); + } + + if (tex->num_samplers > 0) { + struct fd_ringbuffer *state = + fd_ringbuffer_new_object(ctx->pipe, tex->num_samplers * 4 * 4); + for (unsigned i = 0; i < tex->num_samplers; i++) { + static const struct fd6_sampler_stateobj dummy_sampler = {}; + const struct fd6_sampler_stateobj *sampler = + tex->samplers[i] ? fd6_sampler_stateobj(tex->samplers[i]) + : &dummy_sampler; + OUT_RING(state, sampler->texsamp0); + OUT_RING(state, sampler->texsamp1); + OUT_RING(state, sampler->texsamp2 | + A6XX_TEX_SAMP_2_BCOLOR(i + bcolor_offset)); + OUT_RING(state, sampler->texsamp3); + needs_border |= sampler->needs_border; + } + + /* output sampler state: */ + OUT_PKT7(ring, opcode, 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers)); + OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ + + OUT_PKT4(ring, tex_samp_reg, 2); + OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ + + fd_ringbuffer_del(state); + } + + unsigned num_merged_textures = tex->num_textures; + unsigned num_textures = tex->num_textures; + if (v) { + num_merged_textures += v->image_mapping.num_tex; + + if (v->fb_read) + num_merged_textures++; + + /* There could be more bound textures than what the shader uses. + * Which isn't known at shader compile time. 
So in the case we + * are merging tex state, only emit the textures that the shader + * uses (since the image/SSBO related tex state comes immediately + * after) + */ + num_textures = v->image_mapping.tex_base; + } + + if (num_merged_textures > 0) { + struct fd_ringbuffer *state = + fd_ringbuffer_new_object(ctx->pipe, num_merged_textures * 16 * 4); + for (unsigned i = 0; i < num_textures; i++) { + const struct fd6_pipe_sampler_view *view; + + if (tex->textures[i]) { + view = fd6_pipe_sampler_view(tex->textures[i]); + if (unlikely(view->rsc_seqno != + fd_resource(view->base.texture)->seqno)) { + fd6_sampler_view_update(ctx, + fd6_pipe_sampler_view(tex->textures[i])); + } + } else { + static const struct fd6_pipe_sampler_view dummy_view = {}; + view = &dummy_view; + } + + OUT_RING(state, view->texconst0); + OUT_RING(state, view->texconst1); + OUT_RING(state, view->texconst2); + OUT_RING(state, view->texconst3); + + if (view->ptr1) { + OUT_RELOC(state, view->ptr1->bo, view->offset1, + (uint64_t)view->texconst5 << 32, 0); + } else { + OUT_RING(state, 0x00000000); + OUT_RING(state, view->texconst5); + } + + OUT_RING(state, view->texconst6); + + if (view->ptr2) { + OUT_RELOC(state, view->ptr2->bo, view->offset2, 0, 0); + } else { + OUT_RING(state, 0); + OUT_RING(state, 0); + } + + OUT_RING(state, view->texconst9); + OUT_RING(state, view->texconst10); + OUT_RING(state, view->texconst11); + OUT_RING(state, 0); + OUT_RING(state, 0); + OUT_RING(state, 0); + OUT_RING(state, 0); + } + + if (v) { + const struct ir3_ibo_mapping *mapping = &v->image_mapping; + struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type]; + struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type]; + + for (unsigned i = 0; i < mapping->num_tex; i++) { + unsigned idx = mapping->tex_to_image[i]; + if (idx & IBO_SSBO) { + fd6_emit_ssbo_tex(state, &buf->sb[idx & ~IBO_SSBO]); + } else { + fd6_emit_image_tex(state, &img->si[idx]); + } + } + + if (v->fb_read) { + fd6_emit_fb_tex(state, ctx); + } + } + + /* emit texture state: */ + OUT_PKT7(ring, opcode, 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures)); + OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ + + OUT_PKT4(ring, tex_const_reg, 2); + OUT_RB(ring, state); /* SRC_ADDR_LO/HI */ + + fd_ringbuffer_del(state); + } + + OUT_PKT4(ring, tex_count_reg, 1); + OUT_RING(ring, num_merged_textures); + + return needs_border; } /* Emits combined texture state, which also includes any Image/SSBO @@ -488,136 +489,133 @@ fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, */ static bool fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit, - enum pipe_shader_type type, const struct ir3_shader_variant *v) - assert_dt + enum pipe_shader_type type, + const struct ir3_shader_variant *v) assert_dt { - struct fd_context *ctx = emit->ctx; - bool needs_border = false; - - static const struct { - enum fd6_state_id state_id; - unsigned enable_mask; - } s[PIPE_SHADER_TYPES] = { - [PIPE_SHADER_VERTEX] = { FD6_GROUP_VS_TEX, ENABLE_ALL }, - [PIPE_SHADER_TESS_CTRL] = { FD6_GROUP_HS_TEX, ENABLE_ALL }, - [PIPE_SHADER_TESS_EVAL] = { FD6_GROUP_DS_TEX, ENABLE_ALL }, - [PIPE_SHADER_GEOMETRY] = { FD6_GROUP_GS_TEX, ENABLE_ALL }, - [PIPE_SHADER_FRAGMENT] = { FD6_GROUP_FS_TEX, ENABLE_DRAW }, - }; - - debug_assert(s[type].state_id); - - if (!v->image_mapping.num_tex && !v->fb_read) { - /* in the fast-path, 
when we don't have to mix in any image/SSBO - * related texture state, we can just lookup the stateobj and - * re-emit that: - * - * Also, framebuffer-read is a slow-path because an extra - * texture needs to be inserted. - * - * TODO we can probably simmplify things if we also treated - * border_color as a slow-path.. this way the tex state key - * wouldn't depend on bcolor_offset.. but fb_read might rather - * be *somehow* a fast-path if we eventually used it for PLS. - * I suppose there would be no harm in just *always* inserting - * an fb_read texture? - */ - if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) && - ctx->tex[type].num_textures > 0) { - struct fd6_texture_state *tex = fd6_texture_state(ctx, - type, &ctx->tex[type]); - - needs_border |= tex->needs_border; - - fd6_emit_add_group(emit, tex->stateobj, s[type].state_id, - s[type].enable_mask); - - fd6_texture_state_reference(&tex, NULL); - } - } else { - /* In the slow-path, create a one-shot texture state object - * if either TEX|PROG|SSBO|IMAGE state is dirty: - */ - if ((ctx->dirty_shader[type] & - (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | - FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) || - v->fb_read) { - struct fd_texture_stateobj *tex = &ctx->tex[type]; - struct fd_ringbuffer *stateobj = - fd_submit_new_ringbuffer(ctx->batch->submit, - 0x1000, FD_RINGBUFFER_STREAMING); - unsigned bcolor_offset = - fd6_border_color_offset(ctx, type, tex); - - needs_border |= fd6_emit_textures(ctx, stateobj, type, tex, - bcolor_offset, v); - - fd6_emit_take_group(emit, stateobj, s[type].state_id, - s[type].enable_mask); - } - } - - return needs_border; + struct fd_context *ctx = emit->ctx; + bool needs_border = false; + + static const struct { + enum fd6_state_id state_id; + unsigned enable_mask; + } s[PIPE_SHADER_TYPES] = { + [PIPE_SHADER_VERTEX] = {FD6_GROUP_VS_TEX, ENABLE_ALL}, + [PIPE_SHADER_TESS_CTRL] = {FD6_GROUP_HS_TEX, ENABLE_ALL}, + [PIPE_SHADER_TESS_EVAL] = {FD6_GROUP_DS_TEX, ENABLE_ALL}, + [PIPE_SHADER_GEOMETRY] = {FD6_GROUP_GS_TEX, ENABLE_ALL}, + [PIPE_SHADER_FRAGMENT] = {FD6_GROUP_FS_TEX, ENABLE_DRAW}, + }; + + debug_assert(s[type].state_id); + + if (!v->image_mapping.num_tex && !v->fb_read) { + /* in the fast-path, when we don't have to mix in any image/SSBO + * related texture state, we can just lookup the stateobj and + * re-emit that: + * + * Also, framebuffer-read is a slow-path because an extra + * texture needs to be inserted. + * + * TODO we can probably simmplify things if we also treated + * border_color as a slow-path.. this way the tex state key + * wouldn't depend on bcolor_offset.. but fb_read might rather + * be *somehow* a fast-path if we eventually used it for PLS. + * I suppose there would be no harm in just *always* inserting + * an fb_read texture? 
+ */ + if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) && + ctx->tex[type].num_textures > 0) { + struct fd6_texture_state *tex = + fd6_texture_state(ctx, type, &ctx->tex[type]); + + needs_border |= tex->needs_border; + + fd6_emit_add_group(emit, tex->stateobj, s[type].state_id, + s[type].enable_mask); + + fd6_texture_state_reference(&tex, NULL); + } + } else { + /* In the slow-path, create a one-shot texture state object + * if either TEX|PROG|SSBO|IMAGE state is dirty: + */ + if ((ctx->dirty_shader[type] & + (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE | + FD_DIRTY_SHADER_SSBO)) || + v->fb_read) { + struct fd_texture_stateobj *tex = &ctx->tex[type]; + struct fd_ringbuffer *stateobj = fd_submit_new_ringbuffer( + ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); + unsigned bcolor_offset = fd6_border_color_offset(ctx, type, tex); + + needs_border |= + fd6_emit_textures(ctx, stateobj, type, tex, bcolor_offset, v); + + fd6_emit_take_group(emit, stateobj, s[type].state_id, + s[type].enable_mask); + } + } + + return needs_border; } static struct fd_ringbuffer * -build_vbo_state(struct fd6_emit *emit) - assert_dt +build_vbo_state(struct fd6_emit *emit) assert_dt { - const struct fd_vertex_state *vtx = emit->vtx; - - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit, - 4 * (1 + vtx->vertexbuf.count * 4), FD_RINGBUFFER_STREAMING); - - OUT_PKT4(ring, REG_A6XX_VFD_FETCH(0), 4 * vtx->vertexbuf.count); - for (int32_t j = 0; j < vtx->vertexbuf.count; j++) { - const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j]; - struct fd_resource *rsc = fd_resource(vb->buffer.resource); - if (rsc == NULL) { - OUT_RING(ring, 0); - OUT_RING(ring, 0); - OUT_RING(ring, 0); - OUT_RING(ring, 0); - } else { - uint32_t off = vb->buffer_offset; - uint32_t size = fd_bo_size(rsc->bo) - off; - - OUT_RELOC(ring, rsc->bo, off, 0, 0); - OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */ - OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */ - } - } - - return ring; + const struct fd_vertex_state *vtx = emit->vtx; + + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + emit->ctx->batch->submit, 4 * (1 + vtx->vertexbuf.count * 4), + FD_RINGBUFFER_STREAMING); + + OUT_PKT4(ring, REG_A6XX_VFD_FETCH(0), 4 * vtx->vertexbuf.count); + for (int32_t j = 0; j < vtx->vertexbuf.count; j++) { + const struct pipe_vertex_buffer *vb = &vtx->vertexbuf.vb[j]; + struct fd_resource *rsc = fd_resource(vb->buffer.resource); + if (rsc == NULL) { + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + OUT_RING(ring, 0); + } else { + uint32_t off = vb->buffer_offset; + uint32_t size = fd_bo_size(rsc->bo) - off; + + OUT_RELOC(ring, rsc->bo, off, 0, 0); + OUT_RING(ring, size); /* VFD_FETCH[j].SIZE */ + OUT_RING(ring, vb->stride); /* VFD_FETCH[j].STRIDE */ + } + } + + return ring; } static enum a6xx_ztest_mode -compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) - assert_dt +compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) assert_dt { - struct fd_context *ctx = emit->ctx; - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa); - const struct ir3_shader_variant *fs = emit->fs; - - if (fs->shader->nir->info.fs.early_fragment_tests) - return A6XX_EARLY_Z; - - if (fs->no_earlyz || fs->writes_pos || !zsa->base.depth_enabled || - fs->writes_stencilref) { - return A6XX_LATE_Z; - } else if ((fs->has_kill || zsa->alpha_test) && - (zsa->writes_zs || !pfb->zsbuf)) { - /* Slightly odd, but seems like the hw wants 
us to select - * LATE_Z mode if there is no depth buffer + discard. Either - * that, or when occlusion query is enabled. See: - * - * dEQP-GLES31.functional.fbo.no_attachments.* - */ - return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z; - } else { - return A6XX_EARLY_Z; - } + struct fd_context *ctx = emit->ctx; + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa); + const struct ir3_shader_variant *fs = emit->fs; + + if (fs->shader->nir->info.fs.early_fragment_tests) + return A6XX_EARLY_Z; + + if (fs->no_earlyz || fs->writes_pos || !zsa->base.depth_enabled || + fs->writes_stencilref) { + return A6XX_LATE_Z; + } else if ((fs->has_kill || zsa->alpha_test) && + (zsa->writes_zs || !pfb->zsbuf)) { + /* Slightly odd, but seems like the hw wants us to select + * LATE_Z mode if there is no depth buffer + discard. Either + * that, or when occlusion query is enabled. See: + * + * dEQP-GLES31.functional.fbo.no_attachments.* + */ + return lrz_valid ? A6XX_EARLY_LRZ_LATE_Z : A6XX_LATE_Z; + } else { + return A6XX_EARLY_Z; + } } /** @@ -626,782 +624,749 @@ compute_ztest_mode(struct fd6_emit *emit, bool lrz_valid) * to invalidate lrz. */ static struct fd6_lrz_state -compute_lrz_state(struct fd6_emit *emit, bool binning_pass) - assert_dt +compute_lrz_state(struct fd6_emit *emit, bool binning_pass) assert_dt { - struct fd_context *ctx = emit->ctx; - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - const struct ir3_shader_variant *fs = emit->fs; - struct fd6_lrz_state lrz; - - if (!pfb->zsbuf) { - memset(&lrz, 0, sizeof(lrz)); - if (!binning_pass) { - lrz.z_mode = compute_ztest_mode(emit, false); - } - return lrz; - } - - struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend); - struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa); - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - - lrz = zsa->lrz; - - /* normalize lrz state: */ - if (blend->reads_dest || fs->writes_pos || fs->no_earlyz || fs->has_kill) { - lrz.write = false; - if (binning_pass) - lrz.enable = false; - } - - /* if we change depthfunc direction, bail out on using LRZ. The - * LRZ buffer encodes a min/max depth value per block, but if - * we switch from GT/GE <-> LT/LE, those values cannot be - * interpreted properly. - */ - if (zsa->base.depth_enabled && - (rsc->lrz_direction != FD_LRZ_UNKNOWN) && - (rsc->lrz_direction != lrz.direction)) { - rsc->lrz_valid = false; - } - - if (zsa->invalidate_lrz || !rsc->lrz_valid) { - rsc->lrz_valid = false; - memset(&lrz, 0, sizeof(lrz)); - } - - if (fs->no_earlyz || fs->writes_pos) { - lrz.enable = false; - lrz.write = false; - lrz.test = false; - } - - if (!binning_pass) { - lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid); - } - - /* Once we start writing to the real depth buffer, we lock in the - * direction for LRZ.. if we have to skip a LRZ write for any - * reason, it is still safe to have LRZ until there is a direction - * reversal. Prior to the reversal, since we disabled LRZ writes - * in the "unsafe" cases, this just means that the LRZ test may - * not early-discard some things that end up not passing a later - * test (ie. be overly concervative). But once you have a reversal - * of direction, it is possible to increase/decrease the z value - * to the point where the overly-conservative test is incorrect. 
- */ - if (zsa->base.depth_writemask) { - rsc->lrz_direction = lrz.direction; - } - - return lrz; + struct fd_context *ctx = emit->ctx; + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + const struct ir3_shader_variant *fs = emit->fs; + struct fd6_lrz_state lrz; + + if (!pfb->zsbuf) { + memset(&lrz, 0, sizeof(lrz)); + if (!binning_pass) { + lrz.z_mode = compute_ztest_mode(emit, false); + } + return lrz; + } + + struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend); + struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa); + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + + lrz = zsa->lrz; + + /* normalize lrz state: */ + if (blend->reads_dest || fs->writes_pos || fs->no_earlyz || fs->has_kill) { + lrz.write = false; + if (binning_pass) + lrz.enable = false; + } + + /* if we change depthfunc direction, bail out on using LRZ. The + * LRZ buffer encodes a min/max depth value per block, but if + * we switch from GT/GE <-> LT/LE, those values cannot be + * interpreted properly. + */ + if (zsa->base.depth_enabled && (rsc->lrz_direction != FD_LRZ_UNKNOWN) && + (rsc->lrz_direction != lrz.direction)) { + rsc->lrz_valid = false; + } + + if (zsa->invalidate_lrz || !rsc->lrz_valid) { + rsc->lrz_valid = false; + memset(&lrz, 0, sizeof(lrz)); + } + + if (fs->no_earlyz || fs->writes_pos) { + lrz.enable = false; + lrz.write = false; + lrz.test = false; + } + + if (!binning_pass) { + lrz.z_mode = compute_ztest_mode(emit, rsc->lrz_valid); + } + + /* Once we start writing to the real depth buffer, we lock in the + * direction for LRZ.. if we have to skip a LRZ write for any + * reason, it is still safe to have LRZ until there is a direction + * reversal. Prior to the reversal, since we disabled LRZ writes + * in the "unsafe" cases, this just means that the LRZ test may + * not early-discard some things that end up not passing a later + * test (ie. be overly concervative). But once you have a reversal + * of direction, it is possible to increase/decrease the z value + * to the point where the overly-conservative test is incorrect. 
+ */ + if (zsa->base.depth_writemask) { + rsc->lrz_direction = lrz.direction; + } + + return lrz; } static struct fd_ringbuffer * -build_lrz(struct fd6_emit *emit, bool binning_pass) - assert_dt +build_lrz(struct fd6_emit *emit, bool binning_pass) assert_dt { - struct fd_context *ctx = emit->ctx; - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd6_lrz_state lrz = - compute_lrz_state(emit, binning_pass); - - /* If the LRZ state has not changed, we can skip the emit: */ - if (!ctx->last.dirty && - !memcmp(&fd6_ctx->last.lrz[binning_pass], &lrz, sizeof(lrz))) - return NULL; - - fd6_ctx->last.lrz[binning_pass] = lrz; - - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(ctx->batch->submit, - 8*4, FD_RINGBUFFER_STREAMING); - - OUT_REG(ring, A6XX_GRAS_LRZ_CNTL( - .enable = lrz.enable, - .lrz_write = lrz.write, - .greater = lrz.direction == FD_LRZ_GREATER, - .z_test_enable = lrz.test, - )); - OUT_REG(ring, A6XX_RB_LRZ_CNTL( - .enable = lrz.enable, - )); - - OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL( - .z_mode = lrz.z_mode, - )); - - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL( - .z_mode = lrz.z_mode, - )); - - return ring; + struct fd_context *ctx = emit->ctx; + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd6_lrz_state lrz = compute_lrz_state(emit, binning_pass); + + /* If the LRZ state has not changed, we can skip the emit: */ + if (!ctx->last.dirty && + !memcmp(&fd6_ctx->last.lrz[binning_pass], &lrz, sizeof(lrz))) + return NULL; + + fd6_ctx->last.lrz[binning_pass] = lrz; + + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + ctx->batch->submit, 8 * 4, FD_RINGBUFFER_STREAMING); + + OUT_REG(ring, + A6XX_GRAS_LRZ_CNTL(.enable = lrz.enable, .lrz_write = lrz.write, + .greater = lrz.direction == FD_LRZ_GREATER, + .z_test_enable = lrz.test, )); + OUT_REG(ring, A6XX_RB_LRZ_CNTL(.enable = lrz.enable, )); + + OUT_REG(ring, A6XX_RB_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, )); + + OUT_REG(ring, A6XX_GRAS_SU_DEPTH_PLANE_CNTL(.z_mode = lrz.z_mode, )); + + return ring; } static struct fd_ringbuffer * -build_scissor(struct fd6_emit *emit) - assert_dt +build_scissor(struct fd6_emit *emit) assert_dt { - struct fd_context *ctx = emit->ctx; - struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); - - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit, 3*4, - FD_RINGBUFFER_STREAMING); - - OUT_REG(ring, - A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, - .x = scissor->minx, - .y = scissor->miny - ), - A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, - .x = MAX2(scissor->maxx, 1) - 1, - .y = MAX2(scissor->maxy, 1) - 1 - ) - ); - - ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, scissor->minx); - ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, scissor->miny); - ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); - ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); - - return ring; + struct fd_context *ctx = emit->ctx; + struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx); + + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + emit->ctx->batch->submit, 3 * 4, FD_RINGBUFFER_STREAMING); + + OUT_REG( + ring, + A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = scissor->minx, .y = scissor->miny), + A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1, + .y = MAX2(scissor->maxy, 1) - 1)); + + ctx->batch->max_scissor.minx = + MIN2(ctx->batch->max_scissor.minx, scissor->minx); + ctx->batch->max_scissor.miny = + MIN2(ctx->batch->max_scissor.miny, scissor->miny); + 
ctx->batch->max_scissor.maxx = + MAX2(ctx->batch->max_scissor.maxx, scissor->maxx); + ctx->batch->max_scissor.maxy = + MAX2(ctx->batch->max_scissor.maxy, scissor->maxy); + + return ring; } /* Combination of FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER_DISCARD | * FD_DIRTY_PROG | FD_DIRTY_DUAL_BLEND */ static struct fd_ringbuffer * -build_prog_fb_rast(struct fd6_emit *emit) - assert_dt +build_prog_fb_rast(struct fd6_emit *emit) assert_dt { - struct fd_context *ctx = emit->ctx; - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - const struct fd6_program_state *prog = fd6_emit_get_prog(emit); - const struct ir3_shader_variant *fs = emit->fs; + struct fd_context *ctx = emit->ctx; + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + const struct fd6_program_state *prog = fd6_emit_get_prog(emit); + const struct ir3_shader_variant *fs = emit->fs; - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(ctx->batch->submit, - 9 * 4, FD_RINGBUFFER_STREAMING); + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING); - unsigned nr = pfb->nr_cbufs; + unsigned nr = pfb->nr_cbufs; - if (ctx->rasterizer->rasterizer_discard) - nr = 0; + if (ctx->rasterizer->rasterizer_discard) + nr = 0; - struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend); + struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend); - if (blend->use_dual_src_blend) - nr++; + if (blend->use_dual_src_blend) + nr++; - OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); - OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | - COND(fs->writes_smask && pfb->samples > 1, - A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) | - COND(fs->writes_stencilref, - A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) | - COND(blend->use_dual_src_blend, - A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); - OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr)); + OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2); + OUT_RING(ring, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) | + COND(fs->writes_smask && pfb->samples > 1, + A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) | + COND(fs->writes_stencilref, + A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) | + COND(blend->use_dual_src_blend, + A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); + OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr)); - OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1); - OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr)); + OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1); + OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr)); - unsigned mrt_components = 0; - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - mrt_components |= 0xf << (i * 4); - } + unsigned mrt_components = 0; + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + mrt_components |= 0xf << (i * 4); + } - /* dual source blending has an extra fs output in the 2nd slot */ - if (blend->use_dual_src_blend) - mrt_components |= 0xf << 4; + /* dual source blending has an extra fs output in the 2nd slot */ + if (blend->use_dual_src_blend) + mrt_components |= 0xf << 4; - mrt_components &= prog->mrt_components; + mrt_components &= prog->mrt_components; - OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components)); - OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components)); + OUT_REG(ring, A6XX_SP_FS_RENDER_COMPONENTS(.dword = mrt_components)); + OUT_REG(ring, A6XX_RB_RENDER_COMPONENTS(.dword = mrt_components)); - return ring; + return ring; } static 
struct fd_ringbuffer * -build_blend_color(struct fd6_emit *emit) - assert_dt +build_blend_color(struct fd6_emit *emit) assert_dt { - struct fd_context *ctx = emit->ctx; - struct pipe_blend_color *bcolor = &ctx->blend_color; - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(ctx->batch->submit, - 5*4, FD_RINGBUFFER_STREAMING); - - OUT_REG(ring, - A6XX_RB_BLEND_RED_F32(bcolor->color[0]), - A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]), - A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]), - A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]) - ); - - return ring; + struct fd_context *ctx = emit->ctx; + struct pipe_blend_color *bcolor = &ctx->blend_color; + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING); + + OUT_REG(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]), + A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]), + A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]), + A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); + + return ring; } static struct fd_ringbuffer * -build_ibo(struct fd6_emit *emit) - assert_dt +build_ibo(struct fd6_emit *emit) assert_dt { - struct fd_context *ctx = emit->ctx; - - if (emit->hs) { - debug_assert(ir3_shader_nibo(emit->hs) == 0); - debug_assert(ir3_shader_nibo(emit->ds) == 0); - } - if (emit->gs) { - debug_assert(ir3_shader_nibo(emit->gs) == 0); - } - - struct fd_ringbuffer *ibo_state = - fd6_build_ibo_state(ctx, emit->fs, PIPE_SHADER_FRAGMENT); - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING); - - OUT_PKT7(ring, CP_LOAD_STATE6, 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) | - CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(emit->fs))); - OUT_RB(ring, ibo_state); - - OUT_PKT4(ring, REG_A6XX_SP_IBO, 2); - OUT_RB(ring, ibo_state); - - /* TODO if we used CP_SET_DRAW_STATE for compute shaders, we could - * de-duplicate this from program->config_stateobj - */ - OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); - OUT_RING(ring, ir3_shader_nibo(emit->fs)); - - fd_ringbuffer_del(ibo_state); - - return ring; + struct fd_context *ctx = emit->ctx; + + if (emit->hs) { + debug_assert(ir3_shader_nibo(emit->hs) == 0); + debug_assert(ir3_shader_nibo(emit->ds) == 0); + } + if (emit->gs) { + debug_assert(ir3_shader_nibo(emit->gs) == 0); + } + + struct fd_ringbuffer *ibo_state = + fd6_build_ibo_state(ctx, emit->fs, PIPE_SHADER_FRAGMENT); + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + ctx->batch->submit, 0x100, FD_RINGBUFFER_STREAMING); + + OUT_PKT7(ring, CP_LOAD_STATE6, 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) | + CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(emit->fs))); + OUT_RB(ring, ibo_state); + + OUT_PKT4(ring, REG_A6XX_SP_IBO, 2); + OUT_RB(ring, ibo_state); + + /* TODO if we used CP_SET_DRAW_STATE for compute shaders, we could + * de-duplicate this from program->config_stateobj + */ + OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); + OUT_RING(ring, ir3_shader_nibo(emit->fs)); + + fd_ringbuffer_del(ibo_state); + + return ring; } static void -fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) - assert_dt +fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt { - struct fd_context *ctx = emit->ctx; - const struct fd6_program_state *prog = fd6_emit_get_prog(emit); - struct ir3_stream_output_info 
*info = prog->stream_output; - struct fd_streamout_stateobj *so = &ctx->streamout; - - emit->streamout_mask = 0; - - if (!info) - return; - - for (unsigned i = 0; i < so->num_targets; i++) { - struct fd_stream_output_target *target = fd_stream_output_target(so->targets[i]); - - if (!target) - continue; - - target->stride = info->stride[i]; - - OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3); - /* VPC_SO[i].BUFFER_BASE_LO: */ - OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0); - OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset); - - struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo; - - if (so->reset & (1 << i)) { - assert(so->offsets[i] == 0); - - OUT_PKT7(ring, CP_MEM_WRITE, 3); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - OUT_RING(ring, target->base.buffer_offset); - - OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1); - OUT_RING(ring, target->base.buffer_offset); - } else { - OUT_PKT7(ring, CP_MEM_TO_REG, 3); - OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) | - CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 | - CP_MEM_TO_REG_0_CNT(0)); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - } - - // After a draw HW would write the new offset to offset_bo - OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2); - OUT_RELOC(ring, offset_bo, 0, 0, 0); - - so->reset &= ~(1 << i); - - emit->streamout_mask |= (1 << i); - } - - if (emit->streamout_mask) { - fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO, ENABLE_ALL); - } else { - /* If we transition from a draw with streamout to one without, turn - * off streamout. - */ - if (ctx->last.streamout_mask != 0) { - struct fd_ringbuffer *obj = fd_submit_new_ringbuffer(emit->ctx->batch->submit, - 5 * 4, FD_RINGBUFFER_STREAMING); - - OUT_PKT7(obj, CP_CONTEXT_REG_BUNCH, 4); - OUT_RING(obj, REG_A6XX_VPC_SO_CNTL); - OUT_RING(obj, 0); - OUT_RING(obj, REG_A6XX_VPC_SO_STREAM_CNTL); - OUT_RING(obj, 0); - - fd6_emit_take_group(emit, obj, FD6_GROUP_SO, ENABLE_ALL); - } - } - - ctx->last.streamout_mask = emit->streamout_mask; + struct fd_context *ctx = emit->ctx; + const struct fd6_program_state *prog = fd6_emit_get_prog(emit); + struct ir3_stream_output_info *info = prog->stream_output; + struct fd_streamout_stateobj *so = &ctx->streamout; + + emit->streamout_mask = 0; + + if (!info) + return; + + for (unsigned i = 0; i < so->num_targets; i++) { + struct fd_stream_output_target *target = + fd_stream_output_target(so->targets[i]); + + if (!target) + continue; + + target->stride = info->stride[i]; + + OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE(i), 3); + /* VPC_SO[i].BUFFER_BASE_LO: */ + OUT_RELOC(ring, fd_resource(target->base.buffer)->bo, 0, 0, 0); + OUT_RING(ring, target->base.buffer_size + target->base.buffer_offset); + + struct fd_bo *offset_bo = fd_resource(target->offset_buf)->bo; + + if (so->reset & (1 << i)) { + assert(so->offsets[i] == 0); + + OUT_PKT7(ring, CP_MEM_WRITE, 3); + OUT_RELOC(ring, offset_bo, 0, 0, 0); + OUT_RING(ring, target->base.buffer_offset); + + OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 1); + OUT_RING(ring, target->base.buffer_offset); + } else { + OUT_PKT7(ring, CP_MEM_TO_REG, 3); + OUT_RING(ring, CP_MEM_TO_REG_0_REG(REG_A6XX_VPC_SO_BUFFER_OFFSET(i)) | + CP_MEM_TO_REG_0_SHIFT_BY_2 | CP_MEM_TO_REG_0_UNK31 | + CP_MEM_TO_REG_0_CNT(0)); + OUT_RELOC(ring, offset_bo, 0, 0, 0); + } + + // After a draw HW would write the new offset to offset_bo + OUT_PKT4(ring, REG_A6XX_VPC_SO_FLUSH_BASE(i), 2); + OUT_RELOC(ring, offset_bo, 0, 0, 0); + + so->reset &= ~(1 << i); + + 
emit->streamout_mask |= (1 << i); + } + + if (emit->streamout_mask) { + fd6_emit_add_group(emit, prog->streamout_stateobj, FD6_GROUP_SO, + ENABLE_ALL); + } else { + /* If we transition from a draw with streamout to one without, turn + * off streamout. + */ + if (ctx->last.streamout_mask != 0) { + struct fd_ringbuffer *obj = fd_submit_new_ringbuffer( + emit->ctx->batch->submit, 5 * 4, FD_RINGBUFFER_STREAMING); + + OUT_PKT7(obj, CP_CONTEXT_REG_BUNCH, 4); + OUT_RING(obj, REG_A6XX_VPC_SO_CNTL); + OUT_RING(obj, 0); + OUT_RING(obj, REG_A6XX_VPC_SO_STREAM_CNTL); + OUT_RING(obj, 0); + + fd6_emit_take_group(emit, obj, FD6_GROUP_SO, ENABLE_ALL); + } + } + + ctx->last.streamout_mask = emit->streamout_mask; } /** * Stuff that less frequently changes and isn't (yet) moved into stategroups */ static void -fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) - assert_dt +fd6_emit_non_ring(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt { - struct fd_context *ctx = emit->ctx; - const enum fd_dirty_3d_state dirty = emit->dirty; - - if (dirty & FD_DIRTY_STENCIL_REF) { - struct pipe_stencil_ref *sr = &ctx->stencil_ref; - - OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1); - OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) | - A6XX_RB_STENCILREF_BFREF(sr->ref_value[1])); - } - - if (dirty & FD_DIRTY_VIEWPORT) { - struct pipe_scissor_state *scissor = &ctx->viewport_scissor; - - OUT_REG(ring, - A6XX_GRAS_CL_VPORT_XOFFSET(0, ctx->viewport.translate[0]), - A6XX_GRAS_CL_VPORT_XSCALE(0, ctx->viewport.scale[0]), - A6XX_GRAS_CL_VPORT_YOFFSET(0, ctx->viewport.translate[1]), - A6XX_GRAS_CL_VPORT_YSCALE(0, ctx->viewport.scale[1]), - A6XX_GRAS_CL_VPORT_ZOFFSET(0, ctx->viewport.translate[2]), - A6XX_GRAS_CL_VPORT_ZSCALE(0, ctx->viewport.scale[2]) - ); - - OUT_REG(ring, - A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, - .x = scissor->minx, - .y = scissor->miny - ), - A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, - .x = MAX2(scissor->maxx, 1) - 1, - .y = MAX2(scissor->maxy, 1) - 1 - ) - ); - - unsigned guardband_x = - fd_calc_guardband(ctx->viewport.translate[0], ctx->viewport.scale[0], - false); - unsigned guardband_y = - fd_calc_guardband(ctx->viewport.translate[1], ctx->viewport.scale[1], - false); - - OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ( - .horz = guardband_x, - .vert = guardband_y - ) - ); - } - - /* The clamp ranges are only used when the rasterizer wants depth - * clamping. 
- */ - if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER)) && - fd_depth_clamp_enabled(ctx)) { - float zmin, zmax; - util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz, - &zmin, &zmax); - - OUT_REG(ring, - A6XX_GRAS_CL_Z_CLAMP_MIN(0, zmin), - A6XX_GRAS_CL_Z_CLAMP_MAX(0, zmax)); - - OUT_REG(ring, - A6XX_RB_Z_CLAMP_MIN(zmin), - A6XX_RB_Z_CLAMP_MAX(zmax)); - } + struct fd_context *ctx = emit->ctx; + const enum fd_dirty_3d_state dirty = emit->dirty; + + if (dirty & FD_DIRTY_STENCIL_REF) { + struct pipe_stencil_ref *sr = &ctx->stencil_ref; + + OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1); + OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) | + A6XX_RB_STENCILREF_BFREF(sr->ref_value[1])); + } + + if (dirty & FD_DIRTY_VIEWPORT) { + struct pipe_scissor_state *scissor = &ctx->viewport_scissor; + + OUT_REG(ring, A6XX_GRAS_CL_VPORT_XOFFSET(0, ctx->viewport.translate[0]), + A6XX_GRAS_CL_VPORT_XSCALE(0, ctx->viewport.scale[0]), + A6XX_GRAS_CL_VPORT_YOFFSET(0, ctx->viewport.translate[1]), + A6XX_GRAS_CL_VPORT_YSCALE(0, ctx->viewport.scale[1]), + A6XX_GRAS_CL_VPORT_ZOFFSET(0, ctx->viewport.translate[2]), + A6XX_GRAS_CL_VPORT_ZSCALE(0, ctx->viewport.scale[2])); + + OUT_REG( + ring, + A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = scissor->minx, + .y = scissor->miny), + A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = MAX2(scissor->maxx, 1) - 1, + .y = MAX2(scissor->maxy, 1) - 1)); + + unsigned guardband_x = fd_calc_guardband(ctx->viewport.translate[0], + ctx->viewport.scale[0], false); + unsigned guardband_y = fd_calc_guardband(ctx->viewport.translate[1], + ctx->viewport.scale[1], false); + + OUT_REG(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ(.horz = guardband_x, + .vert = guardband_y)); + } + + /* The clamp ranges are only used when the rasterizer wants depth + * clamping. 
+ */ + if ((dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER)) && + fd_depth_clamp_enabled(ctx)) { + float zmin, zmax; + util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz, + &zmin, &zmax); + + OUT_REG(ring, A6XX_GRAS_CL_Z_CLAMP_MIN(0, zmin), + A6XX_GRAS_CL_Z_CLAMP_MAX(0, zmax)); + + OUT_REG(ring, A6XX_RB_Z_CLAMP_MIN(zmin), A6XX_RB_Z_CLAMP_MAX(zmax)); + } } void fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) { - struct fd_context *ctx = emit->ctx; - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - const struct fd6_program_state *prog = fd6_emit_get_prog(emit); - const struct ir3_shader_variant *vs = emit->vs; - const struct ir3_shader_variant *hs = emit->hs; - const struct ir3_shader_variant *ds = emit->ds; - const struct ir3_shader_variant *gs = emit->gs; - const struct ir3_shader_variant *fs = emit->fs; - bool needs_border = false; - - emit_marker6(ring, 5); - - /* NOTE: we track fb_read differently than _BLEND_ENABLED since we - * might decide to do sysmem in some cases when blend is enabled: - */ - if (fs->fb_read) - ctx->batch->gmem_reason |= FD_GMEM_FB_READ; - - u_foreach_bit (b, emit->dirty_groups) { - enum fd6_state_id group = b; - struct fd_ringbuffer *state = NULL; - uint32_t enable_mask = ENABLE_ALL; - - switch (group) { - case FD6_GROUP_VTXSTATE: - state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj; - fd_ringbuffer_ref(state); - break; - case FD6_GROUP_VBO: - state = build_vbo_state(emit); - break; - case FD6_GROUP_ZSA: - state = fd6_zsa_state(ctx, - util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])), - fd_depth_clamp_enabled(ctx)); - fd_ringbuffer_ref(state); - break; - case FD6_GROUP_LRZ: - state = build_lrz(emit, false); - if (!state) - continue; - enable_mask = ENABLE_DRAW; - break; - case FD6_GROUP_LRZ_BINNING: - state = build_lrz(emit, true); - if (!state) - continue; - enable_mask = CP_SET_DRAW_STATE__0_BINNING; - break; - case FD6_GROUP_SCISSOR: - state = build_scissor(emit); - break; - case FD6_GROUP_PROG: - fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG, ENABLE_ALL); - fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW); - fd6_emit_add_group(emit, prog->binning_stateobj, - FD6_GROUP_PROG_BINNING, CP_SET_DRAW_STATE__0_BINNING); - - /* emit remaining streaming program state, ie. what depends on - * other emit state, so cannot be pre-baked. 
- */ - fd6_emit_take_group(emit, fd6_program_interp_state(emit), - FD6_GROUP_PROG_INTERP, ENABLE_DRAW); - continue; - case FD6_GROUP_RASTERIZER: - state = fd6_rasterizer_state(ctx, emit->primitive_restart); - fd_ringbuffer_ref(state); - break; - case FD6_GROUP_PROG_FB_RAST: - state = build_prog_fb_rast(emit); - break; - case FD6_GROUP_BLEND: - state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask)->stateobj; - fd_ringbuffer_ref(state); - break; - case FD6_GROUP_BLEND_COLOR: - state = build_blend_color(emit); - break; - case FD6_GROUP_IBO: - state = build_ibo(emit); - fd6_emit_ibo_consts(emit, fs, PIPE_SHADER_FRAGMENT, ring); - break; - case FD6_GROUP_CONST: - state = fd6_build_user_consts(emit); - break; - case FD6_GROUP_VS_DRIVER_PARAMS: - state = fd6_build_vs_driver_params(emit); - break; - case FD6_GROUP_PRIMITIVE_PARAMS: - state = fd6_build_tess_consts(emit); - break; - case FD6_GROUP_VS_TEX: - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs); - continue; - case FD6_GROUP_HS_TEX: - if (hs) { - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_TESS_CTRL, hs); - } - continue; - case FD6_GROUP_DS_TEX: - if (ds) { - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_TESS_EVAL, ds); - } - continue; - case FD6_GROUP_GS_TEX: - if (gs) { - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs); - } - continue; - case FD6_GROUP_FS_TEX: - needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs); - continue; - case FD6_GROUP_SO: - fd6_emit_streamout(ring, emit); - continue; - case FD6_GROUP_NON_GROUP: - fd6_emit_non_ring(ring, emit); - continue; - default: - unreachable("bad state group"); - } - - fd6_emit_take_group(emit, state, group, enable_mask); - } - - if (needs_border) - emit_border_color(ctx, ring); - - if (emit->num_groups > 0) { - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups); - for (unsigned i = 0; i < emit->num_groups; i++) { - struct fd6_state_group *g = &emit->groups[i]; - unsigned n = g->stateobj ? 
- fd_ringbuffer_size(g->stateobj) / 4 : 0; - - debug_assert((g->enable_mask & ~ENABLE_ALL) == 0); - - if (n == 0) { - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE | - g->enable_mask | - CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } else { - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | - g->enable_mask | - CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); - OUT_RB(ring, g->stateobj); - } - - if (g->stateobj) - fd_ringbuffer_del(g->stateobj); - } - emit->num_groups = 0; - } + struct fd_context *ctx = emit->ctx; + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + const struct fd6_program_state *prog = fd6_emit_get_prog(emit); + const struct ir3_shader_variant *vs = emit->vs; + const struct ir3_shader_variant *hs = emit->hs; + const struct ir3_shader_variant *ds = emit->ds; + const struct ir3_shader_variant *gs = emit->gs; + const struct ir3_shader_variant *fs = emit->fs; + bool needs_border = false; + + emit_marker6(ring, 5); + + /* NOTE: we track fb_read differently than _BLEND_ENABLED since we + * might decide to do sysmem in some cases when blend is enabled: + */ + if (fs->fb_read) + ctx->batch->gmem_reason |= FD_GMEM_FB_READ; + + u_foreach_bit (b, emit->dirty_groups) { + enum fd6_state_id group = b; + struct fd_ringbuffer *state = NULL; + uint32_t enable_mask = ENABLE_ALL; + + switch (group) { + case FD6_GROUP_VTXSTATE: + state = fd6_vertex_stateobj(ctx->vtx.vtx)->stateobj; + fd_ringbuffer_ref(state); + break; + case FD6_GROUP_VBO: + state = build_vbo_state(emit); + break; + case FD6_GROUP_ZSA: + state = fd6_zsa_state( + ctx, + util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])), + fd_depth_clamp_enabled(ctx)); + fd_ringbuffer_ref(state); + break; + case FD6_GROUP_LRZ: + state = build_lrz(emit, false); + if (!state) + continue; + enable_mask = ENABLE_DRAW; + break; + case FD6_GROUP_LRZ_BINNING: + state = build_lrz(emit, true); + if (!state) + continue; + enable_mask = CP_SET_DRAW_STATE__0_BINNING; + break; + case FD6_GROUP_SCISSOR: + state = build_scissor(emit); + break; + case FD6_GROUP_PROG: + fd6_emit_add_group(emit, prog->config_stateobj, FD6_GROUP_PROG_CONFIG, + ENABLE_ALL); + fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, ENABLE_DRAW); + fd6_emit_add_group(emit, prog->binning_stateobj, + FD6_GROUP_PROG_BINNING, + CP_SET_DRAW_STATE__0_BINNING); + + /* emit remaining streaming program state, ie. what depends on + * other emit state, so cannot be pre-baked. 
+ */ + fd6_emit_take_group(emit, fd6_program_interp_state(emit), + FD6_GROUP_PROG_INTERP, ENABLE_DRAW); + continue; + case FD6_GROUP_RASTERIZER: + state = fd6_rasterizer_state(ctx, emit->primitive_restart); + fd_ringbuffer_ref(state); + break; + case FD6_GROUP_PROG_FB_RAST: + state = build_prog_fb_rast(emit); + break; + case FD6_GROUP_BLEND: + state = fd6_blend_variant(ctx->blend, pfb->samples, ctx->sample_mask) + ->stateobj; + fd_ringbuffer_ref(state); + break; + case FD6_GROUP_BLEND_COLOR: + state = build_blend_color(emit); + break; + case FD6_GROUP_IBO: + state = build_ibo(emit); + fd6_emit_ibo_consts(emit, fs, PIPE_SHADER_FRAGMENT, ring); + break; + case FD6_GROUP_CONST: + state = fd6_build_user_consts(emit); + break; + case FD6_GROUP_VS_DRIVER_PARAMS: + state = fd6_build_vs_driver_params(emit); + break; + case FD6_GROUP_PRIMITIVE_PARAMS: + state = fd6_build_tess_consts(emit); + break; + case FD6_GROUP_VS_TEX: + needs_border |= + fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vs); + continue; + case FD6_GROUP_HS_TEX: + if (hs) { + needs_border |= fd6_emit_combined_textures( + ring, emit, PIPE_SHADER_TESS_CTRL, hs); + } + continue; + case FD6_GROUP_DS_TEX: + if (ds) { + needs_border |= fd6_emit_combined_textures( + ring, emit, PIPE_SHADER_TESS_EVAL, ds); + } + continue; + case FD6_GROUP_GS_TEX: + if (gs) { + needs_border |= + fd6_emit_combined_textures(ring, emit, PIPE_SHADER_GEOMETRY, gs); + } + continue; + case FD6_GROUP_FS_TEX: + needs_border |= + fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fs); + continue; + case FD6_GROUP_SO: + fd6_emit_streamout(ring, emit); + continue; + case FD6_GROUP_NON_GROUP: + fd6_emit_non_ring(ring, emit); + continue; + default: + unreachable("bad state group"); + } + + fd6_emit_take_group(emit, state, group, enable_mask); + } + + if (needs_border) + emit_border_color(ctx, ring); + + if (emit->num_groups > 0) { + OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups); + for (unsigned i = 0; i < emit->num_groups; i++) { + struct fd6_state_group *g = &emit->groups[i]; + unsigned n = g->stateobj ? 
fd_ringbuffer_size(g->stateobj) / 4 : 0; + + debug_assert((g->enable_mask & ~ENABLE_ALL) == 0); + + if (n == 0) { + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE | g->enable_mask | + CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } else { + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) | g->enable_mask | + CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id)); + OUT_RB(ring, g->stateobj); + } + + if (g->stateobj) + fd_ringbuffer_del(g->stateobj); + } + emit->num_groups = 0; + } } void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *cp) + struct ir3_shader_variant *cp) { - enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE]; + enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE]; - if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | - FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) { - struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE]; - unsigned bcolor_offset = fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex); + if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG | + FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) { + struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE]; + unsigned bcolor_offset = + fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex); - bool needs_border = fd6_emit_textures(ctx, ring, PIPE_SHADER_COMPUTE, tex, - bcolor_offset, cp); + bool needs_border = fd6_emit_textures(ctx, ring, PIPE_SHADER_COMPUTE, tex, + bcolor_offset, cp); - if (needs_border) - emit_border_color(ctx, ring); + if (needs_border) + emit_border_color(ctx, ring); - OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1); - OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1); + OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1); - OUT_RING(ring, 0); - } + OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1); + OUT_RING(ring, 0); + } - if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) { - struct fd_ringbuffer *state = - fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE); + if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) { + struct fd_ringbuffer *state = + fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE); - OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) | - CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp))); - OUT_RB(ring, state); + OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) | + CP_LOAD_STATE6_0_NUM_UNIT(ir3_shader_nibo(cp))); + OUT_RB(ring, state); - OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2); - OUT_RB(ring, state); + OUT_PKT4(ring, REG_A6XX_SP_CS_IBO, 2); + OUT_RB(ring, state); - OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1); - OUT_RING(ring, ir3_shader_nibo(cp)); + OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1); + OUT_RING(ring, 
ir3_shader_nibo(cp)); - fd_ringbuffer_del(state); - } + fd_ringbuffer_del(state); + } } - /* emit setup at begin of new cmdstream buffer (don't rely on previous * state, there could have been a context switch between ioctls): */ void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring) { - //struct fd_context *ctx = batch->ctx; - - if (!batch->nondraw) { - trace_start_state_restore(&batch->trace); - } - - fd6_cache_inv(batch, ring); - - OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD( - .vs_state = true, - .hs_state = true, - .ds_state = true, - .gs_state = true, - .fs_state = true, - .cs_state = true, - .gfx_ibo = true, - .cs_ibo = true, - .gfx_shared_const = true, - .cs_shared_const = true, - .gfx_bindless = 0x1f, - .cs_bindless = 0x1f - )); - - OUT_WFI5(ring); - - WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0); - WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF); - WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0); - WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f); - WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44); - WRITE(REG_A6XX_TPL1_UNKNOWN_B600, 0x100000); - WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); - WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0); - - WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0); - WRITE(REG_A6XX_GRAS_UNKNOWN_8600, 0x880); - WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000); - WRITE(REG_A6XX_SP_UNKNOWN_AE03, 0x1430); - WRITE(REG_A6XX_SP_IBO_COUNT, 0); - WRITE(REG_A6XX_SP_UNKNOWN_B182, 0); - WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0); - WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000); - WRITE(REG_A6XX_UCHE_CLIENT_PF, 4); - WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1); - WRITE(REG_A6XX_SP_MODE_CONTROL, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); - WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX); - WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010); - WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f); - - WRITE(REG_A6XX_GRAS_UNKNOWN_8101, 0); - WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0); - WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2); - - WRITE(REG_A6XX_RB_UNKNOWN_8818, 0); - WRITE(REG_A6XX_RB_UNKNOWN_8819, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881A, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881B, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881C, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881D, 0); - WRITE(REG_A6XX_RB_UNKNOWN_881E, 0); - WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0); - - WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, - A6XX_VPC_POINT_COORD_INVERT(0).value); - WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0); - - WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value); - - WRITE(REG_A6XX_PC_RASTER_CNTL, 0); - - WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0); - - WRITE(REG_A6XX_SP_UNKNOWN_B183, 0); - - WRITE(REG_A6XX_GRAS_UNKNOWN_8099, 0); - WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0); - WRITE(REG_A6XX_GRAS_UNKNOWN_80A0, 2); - WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0); - WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0); - WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0); - WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0); - WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0); - WRITE(REG_A6XX_SP_TP_SAMPLE_CONFIG, 0); - /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_UNKNOWN_B309 - * but this seems to kill texture gather offsets. 
- */ - WRITE(REG_A6XX_SP_TP_UNKNOWN_B309, 0xa2); - WRITE(REG_A6XX_RB_SAMPLE_CONFIG, 0); - WRITE(REG_A6XX_GRAS_SAMPLE_CONFIG, 0); - WRITE(REG_A6XX_RB_Z_BOUNDS_MIN, 0); - WRITE(REG_A6XX_RB_Z_BOUNDS_MAX, 0); - WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc); - - emit_marker6(ring, 7); - - OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1); - OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */ - - WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0); - - OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1); - OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */ - - /* Clear any potential pending state groups to be safe: */ - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - - OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1); - OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */ - - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1); - OUT_RING(ring, 0x00000000); - - if (!batch->nondraw) { - trace_end_state_restore(&batch->trace); - } + // struct fd_context *ctx = batch->ctx; + + if (!batch->nondraw) { + trace_start_state_restore(&batch->trace); + } + + fd6_cache_inv(batch, ring); + + OUT_REG(ring, + A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true, + .ds_state = true, .gs_state = true, + .fs_state = true, .cs_state = true, + .gfx_ibo = true, .cs_ibo = true, + .gfx_shared_const = true, + .cs_shared_const = true, + .gfx_bindless = 0x1f, .cs_bindless = 0x1f)); + + OUT_WFI5(ring); + + WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x0); + WRITE(REG_A6XX_SP_FLOAT_CNTL, A6XX_SP_FLOAT_CNTL_F16_NO_INF); + WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0); + WRITE(REG_A6XX_SP_PERFCTR_ENABLE, 0x3f); + WRITE(REG_A6XX_TPL1_UNKNOWN_B605, 0x44); + WRITE(REG_A6XX_TPL1_UNKNOWN_B600, 0x100000); + WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80); + WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0); + + WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0); + WRITE(REG_A6XX_GRAS_UNKNOWN_8600, 0x880); + WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000); + WRITE(REG_A6XX_SP_UNKNOWN_AE03, 0x1430); + WRITE(REG_A6XX_SP_IBO_COUNT, 0); + WRITE(REG_A6XX_SP_UNKNOWN_B182, 0); + WRITE(REG_A6XX_HLSQ_SHARED_CONSTS, 0); + WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000); + WRITE(REG_A6XX_UCHE_CLIENT_PF, 4); + WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1); + WRITE(REG_A6XX_SP_MODE_CONTROL, + A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); + WRITE(REG_A6XX_VFD_ADD_OFFSET, A6XX_VFD_ADD_OFFSET_VERTEX); + WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010); + WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f); + + WRITE(REG_A6XX_GRAS_UNKNOWN_8101, 0); + WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0); + WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2); + + WRITE(REG_A6XX_RB_UNKNOWN_8818, 0); + WRITE(REG_A6XX_RB_UNKNOWN_8819, 0); + WRITE(REG_A6XX_RB_UNKNOWN_881A, 0); + WRITE(REG_A6XX_RB_UNKNOWN_881B, 0); + WRITE(REG_A6XX_RB_UNKNOWN_881C, 0); + WRITE(REG_A6XX_RB_UNKNOWN_881D, 0); + WRITE(REG_A6XX_RB_UNKNOWN_881E, 0); + WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0); + + WRITE(REG_A6XX_VPC_POINT_COORD_INVERT, A6XX_VPC_POINT_COORD_INVERT(0).value); + WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0); + + WRITE(REG_A6XX_VPC_SO_DISABLE, A6XX_VPC_SO_DISABLE(true).value); + + WRITE(REG_A6XX_PC_RASTER_CNTL, 0); + + WRITE(REG_A6XX_PC_MULTIVIEW_CNTL, 0); + + WRITE(REG_A6XX_SP_UNKNOWN_B183, 0); + + WRITE(REG_A6XX_GRAS_UNKNOWN_8099, 0); + WRITE(REG_A6XX_GRAS_VS_LAYER_CNTL, 0); + WRITE(REG_A6XX_GRAS_UNKNOWN_80A0, 2); + WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0); + 
WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0); + WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0); + WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0); + WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0); + WRITE(REG_A6XX_SP_TP_SAMPLE_CONFIG, 0); + /* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_UNKNOWN_B309 + * but this seems to kill texture gather offsets. + */ + WRITE(REG_A6XX_SP_TP_UNKNOWN_B309, 0xa2); + WRITE(REG_A6XX_RB_SAMPLE_CONFIG, 0); + WRITE(REG_A6XX_GRAS_SAMPLE_CONFIG, 0); + WRITE(REG_A6XX_RB_Z_BOUNDS_MIN, 0); + WRITE(REG_A6XX_RB_Z_BOUNDS_MAX, 0); + WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc); + + emit_marker6(ring, 7); + + OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1); + OUT_RING(ring, 0x00000000); /* VFD_MODE_CNTL */ + + WRITE(REG_A6XX_VFD_MULTIVIEW_CNTL, 0); + + OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1); + OUT_RING(ring, 0x0000001f); /* PC_MODE_CNTL */ + + /* Clear any potential pending state groups to be safe: */ + OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + + OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_CNTL, 1); + OUT_RING(ring, 0x00000000); /* VPC_SO_STREAM_CNTL */ + + OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1); + OUT_RING(ring, 0x00000000); + + if (!batch->nondraw) { + trace_end_state_restore(&batch->trace); + } } static void fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst, - unsigned dst_off, struct pipe_resource *src, unsigned src_off, - unsigned sizedwords) + unsigned dst_off, struct pipe_resource *src, unsigned src_off, + unsigned sizedwords) { - struct fd_bo *src_bo = fd_resource(src)->bo; - struct fd_bo *dst_bo = fd_resource(dst)->bo; - unsigned i; - - for (i = 0; i < sizedwords; i++) { - OUT_PKT7(ring, CP_MEM_TO_MEM, 5); - OUT_RING(ring, 0x00000000); - OUT_RELOC(ring, dst_bo, dst_off, 0, 0); - OUT_RELOC(ring, src_bo, src_off, 0, 0); - - dst_off += 4; - src_off += 4; - } + struct fd_bo *src_bo = fd_resource(src)->bo; + struct fd_bo *dst_bo = fd_resource(dst)->bo; + unsigned i; + + for (i = 0; i < sizedwords; i++) { + OUT_PKT7(ring, CP_MEM_TO_MEM, 5); + OUT_RING(ring, 0x00000000); + OUT_RELOC(ring, dst_bo, dst_off, 0, 0); + OUT_RELOC(ring, src_bo, src_off, 0, 0); + + dst_off += 4; + src_off += 4; + } } /* this is *almost* the same as fd6_cache_flush().. which I guess @@ -1410,49 +1375,47 @@ fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst, * figuring out which events trigger what state to flush.. 
*/ static void -fd6_framebuffer_barrier(struct fd_context *ctx) - assert_dt +fd6_framebuffer_barrier(struct fd_context *ctx) assert_dt { - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd_batch *batch = ctx->batch; - struct fd_ringbuffer *ring = batch->draw; - unsigned seqno; + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd_batch *batch = ctx->batch; + struct fd_ringbuffer *ring = batch->draw; + unsigned seqno; - seqno = fd6_event_write(batch, ring, RB_DONE_TS, true); + seqno = fd6_event_write(batch, ring, RB_DONE_TS, true); - OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); - OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | - CP_WAIT_REG_MEM_0_POLL_MEMORY); - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); - OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); - OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); + OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); + OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); + OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); + OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); + OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); - seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); - fd6_event_write(batch, ring, 0x31, false); + fd6_event_write(batch, ring, 0x31, false); - OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); - OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); + OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); + OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); + OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); + OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); } void fd6_emit_init_screen(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - screen->emit_ib = fd6_emit_ib; - screen->mem_to_mem = fd6_mem_to_mem; + struct fd_screen *screen = fd_screen(pscreen); + screen->emit_ib = fd6_emit_ib; + screen->mem_to_mem = fd6_mem_to_mem; } void -fd6_emit_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_emit_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - ctx->framebuffer_barrier = fd6_framebuffer_barrier; + struct fd_context *ctx = fd_context(pctx); + ctx->framebuffer_barrier = fd6_framebuffer_barrier; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h index 21d9d89..4775c95 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_emit.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_emit.h @@ -30,10 +30,10 @@ #include "pipe/p_context.h" -#include "freedreno_context.h" #include "fd6_context.h" #include "fd6_format.h" #include "fd6_program.h" +#include "freedreno_context.h" #include "ir3_gallium.h" struct fd_ringbuffer; @@ -43,243 +43,247 @@ struct fd_ringbuffer; * need to be emit'd. 
*/ enum fd6_state_id { - FD6_GROUP_PROG_CONFIG, - FD6_GROUP_PROG, - FD6_GROUP_PROG_BINNING, - FD6_GROUP_PROG_INTERP, - FD6_GROUP_PROG_FB_RAST, - FD6_GROUP_LRZ, - FD6_GROUP_LRZ_BINNING, - FD6_GROUP_VTXSTATE, - FD6_GROUP_VBO, - FD6_GROUP_CONST, - FD6_GROUP_VS_DRIVER_PARAMS, - FD6_GROUP_PRIMITIVE_PARAMS, - FD6_GROUP_VS_TEX, - FD6_GROUP_HS_TEX, - FD6_GROUP_DS_TEX, - FD6_GROUP_GS_TEX, - FD6_GROUP_FS_TEX, - FD6_GROUP_RASTERIZER, - FD6_GROUP_ZSA, - FD6_GROUP_BLEND, - FD6_GROUP_SCISSOR, - FD6_GROUP_BLEND_COLOR, - FD6_GROUP_SO, - FD6_GROUP_IBO, - FD6_GROUP_NON_GROUP, /* placeholder group for state emit in IB2, keep last */ + FD6_GROUP_PROG_CONFIG, + FD6_GROUP_PROG, + FD6_GROUP_PROG_BINNING, + FD6_GROUP_PROG_INTERP, + FD6_GROUP_PROG_FB_RAST, + FD6_GROUP_LRZ, + FD6_GROUP_LRZ_BINNING, + FD6_GROUP_VTXSTATE, + FD6_GROUP_VBO, + FD6_GROUP_CONST, + FD6_GROUP_VS_DRIVER_PARAMS, + FD6_GROUP_PRIMITIVE_PARAMS, + FD6_GROUP_VS_TEX, + FD6_GROUP_HS_TEX, + FD6_GROUP_DS_TEX, + FD6_GROUP_GS_TEX, + FD6_GROUP_FS_TEX, + FD6_GROUP_RASTERIZER, + FD6_GROUP_ZSA, + FD6_GROUP_BLEND, + FD6_GROUP_SCISSOR, + FD6_GROUP_BLEND_COLOR, + FD6_GROUP_SO, + FD6_GROUP_IBO, + FD6_GROUP_NON_GROUP, /* placeholder group for state emit in IB2, keep last */ }; -#define ENABLE_ALL (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) +#define ENABLE_ALL \ + (CP_SET_DRAW_STATE__0_BINNING | CP_SET_DRAW_STATE__0_GMEM | \ + CP_SET_DRAW_STATE__0_SYSMEM) #define ENABLE_DRAW (CP_SET_DRAW_STATE__0_GMEM | CP_SET_DRAW_STATE__0_SYSMEM) struct fd6_state_group { - struct fd_ringbuffer *stateobj; - enum fd6_state_id group_id; - /* enable_mask controls which states the stateobj is evaluated in, - * b0 is binning pass b1 and/or b2 is draw pass - */ - uint32_t enable_mask; + struct fd_ringbuffer *stateobj; + enum fd6_state_id group_id; + /* enable_mask controls which states the stateobj is evaluated in, + * b0 is binning pass b1 and/or b2 is draw pass + */ + uint32_t enable_mask; }; /* grouped together emit-state for prog/vertex/state emit: */ struct fd6_emit { - struct fd_context *ctx; - const struct fd_vertex_state *vtx; - const struct pipe_draw_info *info; - const struct pipe_draw_indirect_info *indirect; - const struct pipe_draw_start_count *draw; - struct ir3_cache_key key; - enum fd_dirty_3d_state dirty; - uint32_t dirty_groups; - - uint32_t sprite_coord_enable; /* bitmask */ - bool sprite_coord_mode; - bool rasterflat; - bool primitive_restart; - - /* cached to avoid repeated lookups: */ - const struct fd6_program_state *prog; - - struct ir3_shader_variant *bs; - struct ir3_shader_variant *vs; - struct ir3_shader_variant *hs; - struct ir3_shader_variant *ds; - struct ir3_shader_variant *gs; - struct ir3_shader_variant *fs; - - unsigned streamout_mask; - - struct fd6_state_group groups[32]; - unsigned num_groups; + struct fd_context *ctx; + const struct fd_vertex_state *vtx; + const struct pipe_draw_info *info; + const struct pipe_draw_indirect_info *indirect; + const struct pipe_draw_start_count *draw; + struct ir3_cache_key key; + enum fd_dirty_3d_state dirty; + uint32_t dirty_groups; + + uint32_t sprite_coord_enable; /* bitmask */ + bool sprite_coord_mode; + bool rasterflat; + bool primitive_restart; + + /* cached to avoid repeated lookups: */ + const struct fd6_program_state *prog; + + struct ir3_shader_variant *bs; + struct ir3_shader_variant *vs; + struct ir3_shader_variant *hs; + struct ir3_shader_variant *ds; + struct ir3_shader_variant *gs; + struct ir3_shader_variant *fs; + + unsigned streamout_mask; + + struct 
fd6_state_group groups[32]; + unsigned num_groups; }; static inline const struct fd6_program_state * fd6_emit_get_prog(struct fd6_emit *emit) { - if (!emit->prog) { - struct ir3_program_state *s = - ir3_cache_lookup(emit->ctx->shader_cache, &emit->key, &emit->ctx->debug); - emit->prog = fd6_program_state(s); - } - return emit->prog; + if (!emit->prog) { + struct ir3_program_state *s = ir3_cache_lookup( + emit->ctx->shader_cache, &emit->key, &emit->ctx->debug); + emit->prog = fd6_program_state(s); + } + return emit->prog; } static inline void fd6_emit_take_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj, - enum fd6_state_id group_id, unsigned enable_mask) + enum fd6_state_id group_id, unsigned enable_mask) { - debug_assert(emit->num_groups < ARRAY_SIZE(emit->groups)); - struct fd6_state_group *g = &emit->groups[emit->num_groups++]; - g->stateobj = stateobj; - g->group_id = group_id; - g->enable_mask = enable_mask; + debug_assert(emit->num_groups < ARRAY_SIZE(emit->groups)); + struct fd6_state_group *g = &emit->groups[emit->num_groups++]; + g->stateobj = stateobj; + g->group_id = group_id; + g->enable_mask = enable_mask; } static inline void fd6_emit_add_group(struct fd6_emit *emit, struct fd_ringbuffer *stateobj, - enum fd6_state_id group_id, unsigned enable_mask) + enum fd6_state_id group_id, unsigned enable_mask) { - fd6_emit_take_group(emit, fd_ringbuffer_ref(stateobj), group_id, enable_mask); + fd6_emit_take_group(emit, fd_ringbuffer_ref(stateobj), group_id, + enable_mask); } static inline unsigned fd6_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum vgt_event_type evt, bool timestamp) + enum vgt_event_type evt, bool timestamp) { - unsigned seqno = 0; + unsigned seqno = 0; - fd_reset_wfi(batch); + fd_reset_wfi(batch); - OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 4 : 1); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt)); - if (timestamp) { - struct fd6_context *fd6_ctx = fd6_context(batch->ctx); - seqno = ++fd6_ctx->seqno; - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); /* ADDR_LO/HI */ - OUT_RING(ring, seqno); - } + OUT_PKT7(ring, CP_EVENT_WRITE, timestamp ? 
4 : 1); + OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(evt)); + if (timestamp) { + struct fd6_context *fd6_ctx = fd6_context(batch->ctx); + seqno = ++fd6_ctx->seqno; + OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); /* ADDR_LO/HI */ + OUT_RING(ring, seqno); + } - return seqno; + return seqno; } static inline void fd6_cache_inv(struct fd_batch *batch, struct fd_ringbuffer *ring) { - fd6_event_write(batch, ring, CACHE_INVALIDATE, false); + fd6_event_write(batch, ring, CACHE_INVALIDATE, false); } static inline void fd6_cache_flush(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd6_context *fd6_ctx = fd6_context(batch->ctx); - unsigned seqno; + struct fd6_context *fd6_ctx = fd6_context(batch->ctx); + unsigned seqno; - seqno = fd6_event_write(batch, ring, RB_DONE_TS, true); + seqno = fd6_event_write(batch, ring, RB_DONE_TS, true); - OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); - OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | - CP_WAIT_REG_MEM_0_POLL_MEMORY); - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); - OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); - OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); + OUT_PKT7(ring, CP_WAIT_REG_MEM, 6); + OUT_RING(ring, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) | + CP_WAIT_REG_MEM_0_POLL_MEMORY); + OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); + OUT_RING(ring, CP_WAIT_REG_MEM_3_REF(seqno)); + OUT_RING(ring, CP_WAIT_REG_MEM_4_MASK(~0)); + OUT_RING(ring, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16)); - seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); + seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true); - OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); - OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); - OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); - OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); + OUT_PKT7(ring, CP_WAIT_MEM_GTE, 4); + OUT_RING(ring, CP_WAIT_MEM_GTE_0_RESERVED(0)); + OUT_RELOC(ring, control_ptr(fd6_ctx, seqno)); + OUT_RING(ring, CP_WAIT_MEM_GTE_3_REF(seqno)); } static inline void fd6_emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring) { - emit_marker6(ring, 7); - fd6_event_write(batch, ring, BLIT, false); - emit_marker6(ring, 7); + emit_marker6(ring, 7); + fd6_event_write(batch, ring, BLIT, false); + emit_marker6(ring, 7); } static inline void fd6_emit_lrz_flush(struct fd_ringbuffer *ring) { - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, LRZ_FLUSH); + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, LRZ_FLUSH); } static inline bool fd6_geom_stage(gl_shader_stage type) { - switch (type) { - case MESA_SHADER_VERTEX: - case MESA_SHADER_TESS_CTRL: - case MESA_SHADER_TESS_EVAL: - case MESA_SHADER_GEOMETRY: - return true; - case MESA_SHADER_FRAGMENT: - case MESA_SHADER_COMPUTE: - case MESA_SHADER_KERNEL: - return false; - default: - unreachable("bad shader type"); - } + switch (type) { + case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_CTRL: + case MESA_SHADER_TESS_EVAL: + case MESA_SHADER_GEOMETRY: + return true; + case MESA_SHADER_FRAGMENT: + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + return false; + default: + unreachable("bad shader type"); + } } static inline uint32_t fd6_stage2opcode(gl_shader_stage type) { - return fd6_geom_stage(type) ? CP_LOAD_STATE6_GEOM : CP_LOAD_STATE6_FRAG; + return fd6_geom_stage(type) ? 
CP_LOAD_STATE6_GEOM : CP_LOAD_STATE6_FRAG; } static inline enum a6xx_state_block fd6_stage2shadersb(gl_shader_stage type) { - switch (type) { - case MESA_SHADER_VERTEX: - return SB6_VS_SHADER; - case MESA_SHADER_TESS_CTRL: - return SB6_HS_SHADER; - case MESA_SHADER_TESS_EVAL: - return SB6_DS_SHADER; - case MESA_SHADER_GEOMETRY: - return SB6_GS_SHADER; - case MESA_SHADER_FRAGMENT: - return SB6_FS_SHADER; - case MESA_SHADER_COMPUTE: - case MESA_SHADER_KERNEL: - return SB6_CS_SHADER; - default: - unreachable("bad shader type"); - return ~0; - } + switch (type) { + case MESA_SHADER_VERTEX: + return SB6_VS_SHADER; + case MESA_SHADER_TESS_CTRL: + return SB6_HS_SHADER; + case MESA_SHADER_TESS_EVAL: + return SB6_DS_SHADER; + case MESA_SHADER_GEOMETRY: + return SB6_GS_SHADER; + case MESA_SHADER_FRAGMENT: + return SB6_FS_SHADER; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + return SB6_CS_SHADER; + default: + unreachable("bad shader type"); + return ~0; + } } static inline enum a6xx_tess_spacing fd6_gl2spacing(enum gl_tess_spacing spacing) { - switch (spacing) { - case TESS_SPACING_EQUAL: - return TESS_EQUAL; - case TESS_SPACING_FRACTIONAL_ODD: - return TESS_FRACTIONAL_ODD; - case TESS_SPACING_FRACTIONAL_EVEN: - return TESS_FRACTIONAL_EVEN; - case TESS_SPACING_UNSPECIFIED: - default: - unreachable("spacing must be specified"); - } + switch (spacing) { + case TESS_SPACING_EQUAL: + return TESS_EQUAL; + case TESS_SPACING_FRACTIONAL_ODD: + return TESS_FRACTIONAL_ODD; + case TESS_SPACING_FRACTIONAL_EVEN: + return TESS_FRACTIONAL_EVEN; + case TESS_SPACING_UNSPECIFIED: + default: + unreachable("spacing must be specified"); + } } bool fd6_emit_textures(struct fd_context *ctx, struct fd_ringbuffer *ring, - enum pipe_shader_type type, struct fd_texture_stateobj *tex, - unsigned bcolor_offset, - const struct ir3_shader_variant *v) assert_dt; + enum pipe_shader_type type, + struct fd_texture_stateobj *tex, unsigned bcolor_offset, + const struct ir3_shader_variant *v) assert_dt; -void fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit) assert_dt; +void fd6_emit_state(struct fd_ringbuffer *ring, + struct fd6_emit *emit) assert_dt; void fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring, - struct ir3_shader_variant *cp) assert_dt; + struct ir3_shader_variant *cp) assert_dt; void fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring); @@ -289,15 +293,15 @@ void fd6_emit_init(struct pipe_context *pctx); static inline void fd6_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) { - emit_marker6(ring, 6); - __OUT_IB5(ring, target); - emit_marker6(ring, 6); + emit_marker6(ring, 6); + __OUT_IB5(ring, target); + emit_marker6(ring, 6); } -#define WRITE(reg, val) do { \ - OUT_PKT4(ring, reg, 1); \ - OUT_RING(ring, val); \ - } while (0) - +#define WRITE(reg, val) \ + do { \ + OUT_PKT4(ring, reg, 1); \ + OUT_RING(ring, val); \ + } while (0) #endif /* FD6_EMIT_H */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_format.c b/src/gallium/drivers/freedreno/a6xx/fd6_format.c index 69d635c..aaa521c 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_format.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_format.c @@ -31,27 +31,24 @@ #include "fd6_format.h" #include "freedreno_resource.h" - /* Specifies the table of all the formats and their features. Also supplies * the helpers that look up various data in those tables. 
*/ struct fd6_format { - enum a6xx_format vtx; - enum a6xx_format tex; - enum a6xx_format rb; - enum a3xx_color_swap swap; - boolean present; + enum a6xx_format vtx; + enum a6xx_format tex; + enum a6xx_format rb; + enum a3xx_color_swap swap; + boolean present; }; -#define FMT(pipe, vtxfmt, texfmt, rbfmt, swapfmt) \ - [PIPE_FORMAT_ ## pipe] = { \ - .present = 1, \ - .vtx = FMT6_ ## vtxfmt, \ - .tex = FMT6_ ## texfmt, \ - .rb = FMT6_ ## rbfmt, \ - .swap = swapfmt \ - } +#define FMT(pipe, vtxfmt, texfmt, rbfmt, swapfmt) \ + [PIPE_FORMAT_##pipe] = {.present = 1, \ + .vtx = FMT6_##vtxfmt, \ + .tex = FMT6_##texfmt, \ + .rb = FMT6_##rbfmt, \ + .swap = swapfmt} /* vertex + texture + color */ #define VTC(pipe, fmt, swapfmt) FMT(pipe, fmt, fmt, fmt, swapfmt) @@ -342,125 +339,122 @@ static struct fd6_format formats[PIPE_FORMAT_COUNT] = { enum a6xx_format fd6_pipe2vtx(enum pipe_format format) { - if (!formats[format].present) - return FMT6_NONE; - return formats[format].vtx; + if (!formats[format].present) + return FMT6_NONE; + return formats[format].vtx; } /* convert pipe format to texture sampler format: */ enum a6xx_format fd6_pipe2tex(enum pipe_format format) { - if (!formats[format].present) - return FMT6_NONE; - return formats[format].tex; + if (!formats[format].present) + return FMT6_NONE; + return formats[format].tex; } /* convert pipe format to MRT / copydest format used for render-target: */ enum a6xx_format fd6_pipe2color(enum pipe_format format) { - if (!formats[format].present) - return FMT6_NONE; - return formats[format].rb; + if (!formats[format].present) + return FMT6_NONE; + return formats[format].rb; } enum a3xx_color_swap fd6_pipe2swap(enum pipe_format format) { - if (!formats[format].present) - return WZYX; - return formats[format].swap; + if (!formats[format].present) + return WZYX; + return formats[format].swap; } enum a6xx_depth_format fd6_pipe2depth(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return DEPTH6_16; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return DEPTH6_24_8; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return DEPTH6_32; - default: - return ~0; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return DEPTH6_16; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return DEPTH6_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTH6_32; + default: + return ~0; + } } enum a6xx_tex_swiz fd6_pipe2swiz(unsigned swiz) { - switch (swiz) { - default: - case PIPE_SWIZZLE_X: return A6XX_TEX_X; - case PIPE_SWIZZLE_Y: return A6XX_TEX_Y; - case PIPE_SWIZZLE_Z: return A6XX_TEX_Z; - case PIPE_SWIZZLE_W: return A6XX_TEX_W; - case PIPE_SWIZZLE_0: return A6XX_TEX_ZERO; - case PIPE_SWIZZLE_1: return A6XX_TEX_ONE; - } + switch (swiz) { + default: + case PIPE_SWIZZLE_X: + return A6XX_TEX_X; + case PIPE_SWIZZLE_Y: + return A6XX_TEX_Y; + case PIPE_SWIZZLE_Z: + return A6XX_TEX_Z; + case PIPE_SWIZZLE_W: + return A6XX_TEX_W; + case PIPE_SWIZZLE_0: + return A6XX_TEX_ZERO; + case PIPE_SWIZZLE_1: + return A6XX_TEX_ONE; + } } void -fd6_tex_swiz(enum pipe_format format, unsigned char *swiz, - unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a) +fd6_tex_swiz(enum pipe_format format, unsigned char *swiz, unsigned swizzle_r, + unsigned swizzle_g, unsigned swizzle_b, unsigned 
swizzle_a) { - const struct util_format_description *desc = - util_format_description(format); - const unsigned char uswiz[4] = { - swizzle_r, swizzle_g, swizzle_b, swizzle_a - }; - - /* Gallium expects stencil sampler to return (s,s,s,s), so massage - * the swizzle to do so. - */ - if (format == PIPE_FORMAT_X24S8_UINT) { - const unsigned char stencil_swiz[4] = { - PIPE_SWIZZLE_W, PIPE_SWIZZLE_W, PIPE_SWIZZLE_W, PIPE_SWIZZLE_W - }; - util_format_compose_swizzles(stencil_swiz, uswiz, swiz); - } else if (fd6_pipe2swap(format) != WZYX) { - /* Formats with a non-pass-through swap are permutations of RGBA - * formats. We program the permutation using the swap and don't - * need to compose the format swizzle with the user swizzle. - */ - memcpy(swiz, uswiz, sizeof(uswiz)); - } else { - /* Otherwise, it's an unswapped RGBA format or a format like L8 where - * we need the XXX1 swizzle from the gallium format description. - */ - util_format_compose_swizzles(desc->swizzle, uswiz, swiz); - } + const struct util_format_description *desc = util_format_description(format); + const unsigned char uswiz[4] = {swizzle_r, swizzle_g, swizzle_b, swizzle_a}; + + /* Gallium expects stencil sampler to return (s,s,s,s), so massage + * the swizzle to do so. + */ + if (format == PIPE_FORMAT_X24S8_UINT) { + const unsigned char stencil_swiz[4] = {PIPE_SWIZZLE_W, PIPE_SWIZZLE_W, + PIPE_SWIZZLE_W, PIPE_SWIZZLE_W}; + util_format_compose_swizzles(stencil_swiz, uswiz, swiz); + } else if (fd6_pipe2swap(format) != WZYX) { + /* Formats with a non-pass-through swap are permutations of RGBA + * formats. We program the permutation using the swap and don't + * need to compose the format swizzle with the user swizzle. + */ + memcpy(swiz, uswiz, sizeof(uswiz)); + } else { + /* Otherwise, it's an unswapped RGBA format or a format like L8 where + * we need the XXX1 swizzle from the gallium format description. 
+ */ + util_format_compose_swizzles(desc->swizzle, uswiz, swiz); + } } /* Compute the TEX_CONST_0 value for texture state, including SWIZ/SWAP/etc: */ uint32_t -fd6_tex_const_0(struct pipe_resource *prsc, - unsigned level, enum pipe_format format, - unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a) +fd6_tex_const_0(struct pipe_resource *prsc, unsigned level, + enum pipe_format format, unsigned swizzle_r, unsigned swizzle_g, + unsigned swizzle_b, unsigned swizzle_a) { - struct fd_resource *rsc = fd_resource(prsc); - unsigned char swiz[4]; - - fd6_tex_swiz(format, swiz, - swizzle_r, swizzle_g, - swizzle_b, swizzle_a); - - return - A6XX_TEX_CONST_0_FMT(fd6_pipe2tex(format)) | - A6XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) | - A6XX_TEX_CONST_0_SWAP(fd6_resource_swap(rsc, format)) | - A6XX_TEX_CONST_0_TILE_MODE(fd_resource_tile_mode(prsc, level)) | - COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | - A6XX_TEX_CONST_0_SWIZ_X(fd6_pipe2swiz(swiz[0])) | - A6XX_TEX_CONST_0_SWIZ_Y(fd6_pipe2swiz(swiz[1])) | - A6XX_TEX_CONST_0_SWIZ_Z(fd6_pipe2swiz(swiz[2])) | - A6XX_TEX_CONST_0_SWIZ_W(fd6_pipe2swiz(swiz[3])); + struct fd_resource *rsc = fd_resource(prsc); + unsigned char swiz[4]; + + fd6_tex_swiz(format, swiz, swizzle_r, swizzle_g, swizzle_b, swizzle_a); + + return A6XX_TEX_CONST_0_FMT(fd6_pipe2tex(format)) | + A6XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) | + A6XX_TEX_CONST_0_SWAP(fd6_resource_swap(rsc, format)) | + A6XX_TEX_CONST_0_TILE_MODE(fd_resource_tile_mode(prsc, level)) | + COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) | + A6XX_TEX_CONST_0_SWIZ_X(fd6_pipe2swiz(swiz[0])) | + A6XX_TEX_CONST_0_SWIZ_Y(fd6_pipe2swiz(swiz[1])) | + A6XX_TEX_CONST_0_SWIZ_Z(fd6_pipe2swiz(swiz[2])) | + A6XX_TEX_CONST_0_SWIZ_W(fd6_pipe2swiz(swiz[3])); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_format.h b/src/gallium/drivers/freedreno/a6xx/fd6_format.h index 9163d41..b9e2f55 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_format.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_format.h @@ -41,18 +41,18 @@ enum a6xx_depth_format fd6_pipe2depth(enum pipe_format format); enum a6xx_tex_swiz fd6_pipe2swiz(unsigned swiz); void fd6_tex_swiz(enum pipe_format format, unsigned char *swiz, - unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a); + unsigned swizzle_r, unsigned swizzle_g, unsigned swizzle_b, + unsigned swizzle_a); -uint32_t fd6_tex_const_0(struct pipe_resource *prsc, - unsigned level, enum pipe_format format, - unsigned swizzle_r, unsigned swizzle_g, - unsigned swizzle_b, unsigned swizzle_a); +uint32_t fd6_tex_const_0(struct pipe_resource *prsc, unsigned level, + enum pipe_format format, unsigned swizzle_r, + unsigned swizzle_g, unsigned swizzle_b, + unsigned swizzle_a); static inline uint32_t fd6_resource_swap(struct fd_resource *rsc, enum pipe_format format) { - return rsc->layout.tile_mode ? WZYX : fd6_pipe2swap(format); + return rsc->layout.tile_mode ? 
WZYX : fd6_pipe2swap(format); } #endif /* FD6_UTIL_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c index 43be406..40d9caf 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_gmem.c @@ -28,26 +28,26 @@ #include #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_draw.h" -#include "freedreno_state.h" #include "freedreno_resource.h" +#include "freedreno_state.h" #include "freedreno_tracepoints.h" #include "fd6_blitter.h" -#include "fd6_gmem.h" #include "fd6_context.h" #include "fd6_draw.h" #include "fd6_emit.h" -#include "fd6_program.h" #include "fd6_format.h" +#include "fd6_gmem.h" +#include "fd6_pack.h" +#include "fd6_program.h" #include "fd6_resource.h" #include "fd6_zsa.h" -#include "fd6_pack.h" /** * Emits the flags registers, suitable for RB_MRT_FLAG_BUFFER, @@ -55,315 +55,321 @@ */ void fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc, - int level, int layer) + int level, int layer) { - if (fd_resource_ubwc_enabled(rsc, level)) { - OUT_RELOC(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0, 0); - OUT_RING(ring, - A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH(fdl_ubwc_pitch(&rsc->layout, level)) | - A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH(rsc->layout.ubwc_layer_size >> 2)); - } else { - OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ - OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ - OUT_RING(ring, 0x00000000); - } + if (fd_resource_ubwc_enabled(rsc, level)) { + OUT_RELOC(ring, rsc->bo, fd_resource_ubwc_offset(rsc, level, layer), 0, + 0); + OUT_RING(ring, A6XX_RB_MRT_FLAG_BUFFER_PITCH_PITCH( + fdl_ubwc_pitch(&rsc->layout, level)) | + A6XX_RB_MRT_FLAG_BUFFER_PITCH_ARRAY_PITCH( + rsc->layout.ubwc_layer_size >> 2)); + } else { + OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_LO */ + OUT_RING(ring, 0x00000000); /* RB_MRT_FLAG_BUFFER[i].ADDR_HI */ + OUT_RING(ring, 0x00000000); + } } static void emit_mrt(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb, - const struct fd_gmem_stateobj *gmem) + const struct fd_gmem_stateobj *gmem) { - unsigned srgb_cntl = 0; - unsigned i; - - unsigned max_layer_index = 0; - - for (i = 0; i < pfb->nr_cbufs; i++) { - enum a6xx_format format = 0; - enum a3xx_color_swap swap = WZYX; - bool sint = false, uint = false; - struct fd_resource *rsc = NULL; - struct fdl_slice *slice = NULL; - uint32_t stride = 0; - uint32_t array_stride = 0; - uint32_t offset; - uint32_t tile_mode; - - if (!pfb->cbufs[i]) - continue; - - struct pipe_surface *psurf = pfb->cbufs[i]; - enum pipe_format pformat = psurf->format; - rsc = fd_resource(psurf->texture); - if (!rsc->bo) - continue; - - uint32_t base = gmem ? 
gmem->cbuf_base[i] : 0; - slice = fd_resource_slice(rsc, psurf->u.tex.level); - format = fd6_pipe2color(pformat); - sint = util_format_is_pure_sint(pformat); - uint = util_format_is_pure_uint(pformat); - - if (util_format_is_srgb(pformat)) - srgb_cntl |= (1 << i); - - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - - stride = fd_resource_pitch(rsc, psurf->u.tex.level); - array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level); - swap = fd6_resource_swap(rsc, pformat); - - tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); - max_layer_index = psurf->u.tex.last_layer - psurf->u.tex.first_layer; - - debug_assert((offset + slice->size0) <= fd_bo_size(rsc->bo)); - - OUT_REG(ring, - A6XX_RB_MRT_BUF_INFO(i, - .color_format = format, - .color_tile_mode = tile_mode, - .color_swap = swap), - A6XX_RB_MRT_PITCH(i, .a6xx_rb_mrt_pitch = stride), - A6XX_RB_MRT_ARRAY_PITCH(i, .a6xx_rb_mrt_array_pitch = array_stride), - A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset), - A6XX_RB_MRT_BASE_GMEM(i, .unknown = base)); - - OUT_REG(ring, - A6XX_SP_FS_MRT_REG(i, .color_format = format, - .color_sint = sint, .color_uint = uint)); - - OUT_PKT4(ring, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 3); - fd6_emit_flag_reference(ring, rsc, - psurf->u.tex.level, psurf->u.tex.first_layer); - } - - OUT_REG(ring, A6XX_RB_SRGB_CNTL(.dword = srgb_cntl)); - OUT_REG(ring, A6XX_SP_SRGB_CNTL(.dword = srgb_cntl)); - - OUT_REG(ring, A6XX_GRAS_MAX_LAYER_INDEX(max_layer_index)); + unsigned srgb_cntl = 0; + unsigned i; + + unsigned max_layer_index = 0; + + for (i = 0; i < pfb->nr_cbufs; i++) { + enum a6xx_format format = 0; + enum a3xx_color_swap swap = WZYX; + bool sint = false, uint = false; + struct fd_resource *rsc = NULL; + struct fdl_slice *slice = NULL; + uint32_t stride = 0; + uint32_t array_stride = 0; + uint32_t offset; + uint32_t tile_mode; + + if (!pfb->cbufs[i]) + continue; + + struct pipe_surface *psurf = pfb->cbufs[i]; + enum pipe_format pformat = psurf->format; + rsc = fd_resource(psurf->texture); + if (!rsc->bo) + continue; + + uint32_t base = gmem ? 
gmem->cbuf_base[i] : 0; + slice = fd_resource_slice(rsc, psurf->u.tex.level); + format = fd6_pipe2color(pformat); + sint = util_format_is_pure_sint(pformat); + uint = util_format_is_pure_uint(pformat); + + if (util_format_is_srgb(pformat)) + srgb_cntl |= (1 << i); + + offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + + stride = fd_resource_pitch(rsc, psurf->u.tex.level); + array_stride = fd_resource_layer_stride(rsc, psurf->u.tex.level); + swap = fd6_resource_swap(rsc, pformat); + + tile_mode = fd_resource_tile_mode(psurf->texture, psurf->u.tex.level); + max_layer_index = psurf->u.tex.last_layer - psurf->u.tex.first_layer; + + debug_assert((offset + slice->size0) <= fd_bo_size(rsc->bo)); + + OUT_REG( + ring, + A6XX_RB_MRT_BUF_INFO(i, .color_format = format, + .color_tile_mode = tile_mode, .color_swap = swap), + A6XX_RB_MRT_PITCH(i, .a6xx_rb_mrt_pitch = stride), + A6XX_RB_MRT_ARRAY_PITCH(i, .a6xx_rb_mrt_array_pitch = array_stride), + A6XX_RB_MRT_BASE(i, .bo = rsc->bo, .bo_offset = offset), + A6XX_RB_MRT_BASE_GMEM(i, .unknown = base)); + + OUT_REG(ring, A6XX_SP_FS_MRT_REG(i, .color_format = format, + .color_sint = sint, .color_uint = uint)); + + OUT_PKT4(ring, REG_A6XX_RB_MRT_FLAG_BUFFER(i), 3); + fd6_emit_flag_reference(ring, rsc, psurf->u.tex.level, + psurf->u.tex.first_layer); + } + + OUT_REG(ring, A6XX_RB_SRGB_CNTL(.dword = srgb_cntl)); + OUT_REG(ring, A6XX_SP_SRGB_CNTL(.dword = srgb_cntl)); + + OUT_REG(ring, A6XX_GRAS_MAX_LAYER_INDEX(max_layer_index)); } static void emit_zs(struct fd_ringbuffer *ring, struct pipe_surface *zsbuf, - const struct fd_gmem_stateobj *gmem) + const struct fd_gmem_stateobj *gmem) { - if (zsbuf) { - struct fd_resource *rsc = fd_resource(zsbuf->texture); - enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format); - uint32_t stride = fd_resource_pitch(rsc, 0); - uint32_t array_stride = fd_resource_layer_stride(rsc, 0); - uint32_t base = gmem ? gmem->zsbuf_base[0] : 0; - uint32_t offset = fd_resource_offset(rsc, zsbuf->u.tex.level, - zsbuf->u.tex.first_layer); - - OUT_REG(ring, - A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt), - A6XX_RB_DEPTH_BUFFER_PITCH(.a6xx_rb_depth_buffer_pitch = stride), - A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(.a6xx_rb_depth_buffer_array_pitch = array_stride), - A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset), - A6XX_RB_DEPTH_BUFFER_BASE_GMEM(.dword = base)); - - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); - - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3); - fd6_emit_flag_reference(ring, rsc, - zsbuf->u.tex.level, zsbuf->u.tex.first_layer); - - if (rsc->lrz) { - OUT_REG(ring, - A6XX_GRAS_LRZ_BUFFER_BASE(.bo = rsc->lrz), - A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = rsc->lrz_pitch), - // XXX a6xx seems to use a different buffer here.. not sure what for.. - A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE()); - } else { - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE, 5); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ - OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); - } - - /* NOTE: blob emits GRAS_LRZ_CNTL plus GRAZ_LRZ_BUFFER_BASE - * plus this CP_EVENT_WRITE at the end in it's own IB.. - */ - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(UNK_25)); - - if (rsc->stencil) { - stride = fd_resource_pitch(rsc->stencil, 0); - array_stride = fd_resource_layer_stride(rsc->stencil, 0); - uint32_t base = gmem ? 
gmem->zsbuf_base[1] : 0; - - OUT_REG(ring, - A6XX_RB_STENCIL_INFO(.separate_stencil = true), - A6XX_RB_STENCIL_BUFFER_PITCH(.a6xx_rb_stencil_buffer_pitch = stride), - A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH(.a6xx_rb_stencil_buffer_array_pitch = array_stride), - A6XX_RB_STENCIL_BUFFER_BASE(.bo = rsc->stencil->bo), - A6XX_RB_STENCIL_BUFFER_BASE_GMEM(.dword = base)); - } else { - OUT_REG(ring, A6XX_RB_STENCIL_INFO(0)); - } - } else { - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); - OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_GMEM */ - - OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); - - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE, 5); - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ - OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ - OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ - OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */ - - OUT_REG(ring, A6XX_RB_STENCIL_INFO(0)); - } + if (zsbuf) { + struct fd_resource *rsc = fd_resource(zsbuf->texture); + enum a6xx_depth_format fmt = fd6_pipe2depth(zsbuf->format); + uint32_t stride = fd_resource_pitch(rsc, 0); + uint32_t array_stride = fd_resource_layer_stride(rsc, 0); + uint32_t base = gmem ? gmem->zsbuf_base[0] : 0; + uint32_t offset = + fd_resource_offset(rsc, zsbuf->u.tex.level, zsbuf->u.tex.first_layer); + + OUT_REG( + ring, A6XX_RB_DEPTH_BUFFER_INFO(.depth_format = fmt), + A6XX_RB_DEPTH_BUFFER_PITCH(.a6xx_rb_depth_buffer_pitch = stride), + A6XX_RB_DEPTH_BUFFER_ARRAY_PITCH(.a6xx_rb_depth_buffer_array_pitch = + array_stride), + A6XX_RB_DEPTH_BUFFER_BASE(.bo = rsc->bo, .bo_offset = offset), + A6XX_RB_DEPTH_BUFFER_BASE_GMEM(.dword = base)); + + OUT_REG(ring, A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = fmt)); + + OUT_PKT4(ring, REG_A6XX_RB_DEPTH_FLAG_BUFFER_BASE, 3); + fd6_emit_flag_reference(ring, rsc, zsbuf->u.tex.level, + zsbuf->u.tex.first_layer); + + if (rsc->lrz) { + OUT_REG(ring, A6XX_GRAS_LRZ_BUFFER_BASE(.bo = rsc->lrz), + A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = rsc->lrz_pitch), + // XXX a6xx seems to use a different buffer here.. not sure + // what for.. + A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE()); + } else { + OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE, 5); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ + OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); + } + + /* NOTE: blob emits GRAS_LRZ_CNTL plus GRAZ_LRZ_BUFFER_BASE + * plus this CP_EVENT_WRITE at the end in it's own IB.. + */ + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(UNK_25)); + + if (rsc->stencil) { + stride = fd_resource_pitch(rsc->stencil, 0); + array_stride = fd_resource_layer_stride(rsc->stencil, 0); + uint32_t base = gmem ? 
gmem->zsbuf_base[1] : 0; + + OUT_REG(ring, A6XX_RB_STENCIL_INFO(.separate_stencil = true), + A6XX_RB_STENCIL_BUFFER_PITCH(.a6xx_rb_stencil_buffer_pitch = + stride), + A6XX_RB_STENCIL_BUFFER_ARRAY_PITCH( + .a6xx_rb_stencil_buffer_array_pitch = array_stride), + A6XX_RB_STENCIL_BUFFER_BASE(.bo = rsc->stencil->bo), + A6XX_RB_STENCIL_BUFFER_BASE_GMEM(.dword = base)); + } else { + OUT_REG(ring, A6XX_RB_STENCIL_INFO(0)); + } + } else { + OUT_PKT4(ring, REG_A6XX_RB_DEPTH_BUFFER_INFO, 6); + OUT_RING(ring, A6XX_RB_DEPTH_BUFFER_INFO_DEPTH_FORMAT(DEPTH6_NONE)); + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_PITCH */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_ARRAY_PITCH */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_HI */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_BUFFER_BASE_GMEM */ + + OUT_REG(ring, + A6XX_GRAS_SU_DEPTH_BUFFER_INFO(.depth_format = DEPTH6_NONE)); + + OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_BUFFER_BASE, 5); + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* RB_DEPTH_FLAG_BUFFER_BASE_HI */ + OUT_RING(ring, 0x00000000); /* GRAS_LRZ_BUFFER_PITCH */ + OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_LO */ + OUT_RING(ring, 0x00000000); /* GRAS_LRZ_FAST_CLEAR_BUFFER_BASE_HI */ + + OUT_REG(ring, A6XX_RB_STENCIL_INFO(0)); + } } static bool use_hw_binning(struct fd_batch *batch) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; - if ((gmem->maxpw * gmem->maxph) > 32) - return false; + if ((gmem->maxpw * gmem->maxph) > 32) + return false; - return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) >= 2) && - (batch->num_draws > 0); + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) >= 2) && + (batch->num_draws > 0); } static void patch_fb_read(struct fd_batch *batch) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - - for (unsigned i = 0; i < fd_patch_num_elements(&batch->fb_read_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&batch->fb_read_patches, i); - *patch->cs = patch->val | A6XX_TEX_CONST_2_PITCH(gmem->bin_w * gmem->cbuf_cpp[0]); - } - util_dynarray_clear(&batch->fb_read_patches); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + + for (unsigned i = 0; i < fd_patch_num_elements(&batch->fb_read_patches); + i++) { + struct fd_cs_patch *patch = fd_patch_element(&batch->fb_read_patches, i); + *patch->cs = + patch->val | A6XX_TEX_CONST_2_PITCH(gmem->bin_w * gmem->cbuf_cpp[0]); + } + util_dynarray_clear(&batch->fb_read_patches); } static void -update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb, bool binning) +update_render_cntl(struct fd_batch *batch, struct pipe_framebuffer_state *pfb, + bool binning) { - struct fd_ringbuffer *ring = batch->gmem; - uint32_t cntl = 0; - bool depth_ubwc_enable = false; - uint32_t mrts_ubwc_enable = 0; - int i; - - if (pfb->zsbuf) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - depth_ubwc_enable = fd_resource_ubwc_enabled(rsc, pfb->zsbuf->u.tex.level); - } - - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - - struct pipe_surface *psurf = pfb->cbufs[i]; - struct fd_resource *rsc = fd_resource(psurf->texture); - if (!rsc->bo) - continue; - - if (fd_resource_ubwc_enabled(rsc, psurf->u.tex.level)) - mrts_ubwc_enable |= 1 << i; - } - - cntl |= A6XX_RB_RENDER_CNTL_UNK4; - if (binning) - cntl |= A6XX_RB_RENDER_CNTL_BINNING; - - OUT_PKT7(ring, 
CP_REG_WRITE, 3); - OUT_RING(ring, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL)); - OUT_RING(ring, REG_A6XX_RB_RENDER_CNTL); - OUT_RING(ring, cntl | - COND(depth_ubwc_enable, A6XX_RB_RENDER_CNTL_FLAG_DEPTH) | - A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable)); + struct fd_ringbuffer *ring = batch->gmem; + uint32_t cntl = 0; + bool depth_ubwc_enable = false; + uint32_t mrts_ubwc_enable = 0; + int i; + + if (pfb->zsbuf) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + depth_ubwc_enable = + fd_resource_ubwc_enabled(rsc, pfb->zsbuf->u.tex.level); + } + + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + + struct pipe_surface *psurf = pfb->cbufs[i]; + struct fd_resource *rsc = fd_resource(psurf->texture); + if (!rsc->bo) + continue; + + if (fd_resource_ubwc_enabled(rsc, psurf->u.tex.level)) + mrts_ubwc_enable |= 1 << i; + } + + cntl |= A6XX_RB_RENDER_CNTL_UNK4; + if (binning) + cntl |= A6XX_RB_RENDER_CNTL_BINNING; + + OUT_PKT7(ring, CP_REG_WRITE, 3); + OUT_RING(ring, CP_REG_WRITE_0_TRACKER(TRACK_RENDER_CNTL)); + OUT_RING(ring, REG_A6XX_RB_RENDER_CNTL); + OUT_RING(ring, cntl | + COND(depth_ubwc_enable, A6XX_RB_RENDER_CNTL_FLAG_DEPTH) | + A6XX_RB_RENDER_CNTL_FLAG_MRTS(mrts_ubwc_enable)); } /* extra size to store VSC_DRAW_STRM_SIZE: */ -#define VSC_DRAW_STRM_SIZE(pitch) ((pitch) * 32 + 0x100) -#define VSC_PRIM_STRM_SIZE(pitch) ((pitch) * 32) +#define VSC_DRAW_STRM_SIZE(pitch) ((pitch)*32 + 0x100) +#define VSC_PRIM_STRM_SIZE(pitch) ((pitch)*32) static void update_vsc_pipe(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; - struct fd6_context *fd6_ctx = fd6_context(ctx); - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_ringbuffer *ring = batch->gmem; - int i; - - if (batch->draw_strm_bits/8 > fd6_ctx->vsc_draw_strm_pitch) { - if (fd6_ctx->vsc_draw_strm) - fd_bo_del(fd6_ctx->vsc_draw_strm); - fd6_ctx->vsc_draw_strm = NULL; - /* Note: probably only need to align to 0x40, but aligning stronger - * reduces the odds that we will have to realloc again on the next - * frame: - */ - fd6_ctx->vsc_draw_strm_pitch = align(batch->draw_strm_bits/8, 0x4000); - mesa_logd("pre-resize VSC_DRAW_STRM_PITCH to: 0x%x", - fd6_ctx->vsc_draw_strm_pitch); - } - - if (batch->prim_strm_bits/8 > fd6_ctx->vsc_prim_strm_pitch) { - if (fd6_ctx->vsc_prim_strm) - fd_bo_del(fd6_ctx->vsc_prim_strm); - fd6_ctx->vsc_prim_strm = NULL; - fd6_ctx->vsc_prim_strm_pitch = align(batch->prim_strm_bits/8, 0x4000); - mesa_logd("pre-resize VSC_PRIM_STRM_PITCH to: 0x%x", - fd6_ctx->vsc_prim_strm_pitch); - } - - if (!fd6_ctx->vsc_draw_strm) { - fd6_ctx->vsc_draw_strm = fd_bo_new(ctx->screen->dev, - VSC_DRAW_STRM_SIZE(fd6_ctx->vsc_draw_strm_pitch), - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_draw_strm"); - } - - if (!fd6_ctx->vsc_prim_strm) { - fd6_ctx->vsc_prim_strm = fd_bo_new(ctx->screen->dev, - VSC_PRIM_STRM_SIZE(fd6_ctx->vsc_prim_strm_pitch), - DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_prim_strm"); - } - - OUT_REG(ring, - A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h), - A6XX_VSC_DRAW_STRM_SIZE_ADDRESS( - .bo = fd6_ctx->vsc_draw_strm, - .bo_offset = 32 * fd6_ctx->vsc_draw_strm_pitch)); - - OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, - .ny = gmem->nbins_y)); - - OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); - for (i = 0; i < 32; i++) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | - A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | - A6XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | - 
A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); - } - - OUT_REG(ring, - A6XX_VSC_PRIM_STRM_ADDRESS(.bo = fd6_ctx->vsc_prim_strm), - A6XX_VSC_PRIM_STRM_PITCH(.dword = fd6_ctx->vsc_prim_strm_pitch), - A6XX_VSC_PRIM_STRM_LIMIT(.dword = fd6_ctx->vsc_prim_strm_pitch - 64)); - - OUT_REG(ring, - A6XX_VSC_DRAW_STRM_ADDRESS(.bo = fd6_ctx->vsc_draw_strm), - A6XX_VSC_DRAW_STRM_PITCH(.dword = fd6_ctx->vsc_draw_strm_pitch), - A6XX_VSC_DRAW_STRM_LIMIT(.dword = fd6_ctx->vsc_draw_strm_pitch - 64)); + struct fd_context *ctx = batch->ctx; + struct fd6_context *fd6_ctx = fd6_context(ctx); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_ringbuffer *ring = batch->gmem; + int i; + + if (batch->draw_strm_bits / 8 > fd6_ctx->vsc_draw_strm_pitch) { + if (fd6_ctx->vsc_draw_strm) + fd_bo_del(fd6_ctx->vsc_draw_strm); + fd6_ctx->vsc_draw_strm = NULL; + /* Note: probably only need to align to 0x40, but aligning stronger + * reduces the odds that we will have to realloc again on the next + * frame: + */ + fd6_ctx->vsc_draw_strm_pitch = align(batch->draw_strm_bits / 8, 0x4000); + mesa_logd("pre-resize VSC_DRAW_STRM_PITCH to: 0x%x", + fd6_ctx->vsc_draw_strm_pitch); + } + + if (batch->prim_strm_bits / 8 > fd6_ctx->vsc_prim_strm_pitch) { + if (fd6_ctx->vsc_prim_strm) + fd_bo_del(fd6_ctx->vsc_prim_strm); + fd6_ctx->vsc_prim_strm = NULL; + fd6_ctx->vsc_prim_strm_pitch = align(batch->prim_strm_bits / 8, 0x4000); + mesa_logd("pre-resize VSC_PRIM_STRM_PITCH to: 0x%x", + fd6_ctx->vsc_prim_strm_pitch); + } + + if (!fd6_ctx->vsc_draw_strm) { + fd6_ctx->vsc_draw_strm = fd_bo_new( + ctx->screen->dev, VSC_DRAW_STRM_SIZE(fd6_ctx->vsc_draw_strm_pitch), + DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_draw_strm"); + } + + if (!fd6_ctx->vsc_prim_strm) { + fd6_ctx->vsc_prim_strm = fd_bo_new( + ctx->screen->dev, VSC_PRIM_STRM_SIZE(fd6_ctx->vsc_prim_strm_pitch), + DRM_FREEDRENO_GEM_TYPE_KMEM, "vsc_prim_strm"); + } + + OUT_REG( + ring, A6XX_VSC_BIN_SIZE(.width = gmem->bin_w, .height = gmem->bin_h), + A6XX_VSC_DRAW_STRM_SIZE_ADDRESS(.bo = fd6_ctx->vsc_draw_strm, + .bo_offset = + 32 * fd6_ctx->vsc_draw_strm_pitch)); + + OUT_REG(ring, A6XX_VSC_BIN_COUNT(.nx = gmem->nbins_x, .ny = gmem->nbins_y)); + + OUT_PKT4(ring, REG_A6XX_VSC_PIPE_CONFIG_REG(0), 32); + for (i = 0; i < 32; i++) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + OUT_RING(ring, A6XX_VSC_PIPE_CONFIG_REG_X(pipe->x) | + A6XX_VSC_PIPE_CONFIG_REG_Y(pipe->y) | + A6XX_VSC_PIPE_CONFIG_REG_W(pipe->w) | + A6XX_VSC_PIPE_CONFIG_REG_H(pipe->h)); + } + + OUT_REG( + ring, A6XX_VSC_PRIM_STRM_ADDRESS(.bo = fd6_ctx->vsc_prim_strm), + A6XX_VSC_PRIM_STRM_PITCH(.dword = fd6_ctx->vsc_prim_strm_pitch), + A6XX_VSC_PRIM_STRM_LIMIT(.dword = fd6_ctx->vsc_prim_strm_pitch - 64)); + + OUT_REG( + ring, A6XX_VSC_DRAW_STRM_ADDRESS(.bo = fd6_ctx->vsc_draw_strm), + A6XX_VSC_DRAW_STRM_PITCH(.dword = fd6_ctx->vsc_draw_strm_pitch), + A6XX_VSC_DRAW_STRM_LIMIT(.dword = fd6_ctx->vsc_draw_strm_pitch - 64)); } /* @@ -378,141 +384,147 @@ update_vsc_pipe(struct fd_batch *batch) static void emit_vsc_overflow_test(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd6_context *fd6_ctx = fd6_context(batch->ctx); - - debug_assert((fd6_ctx->vsc_draw_strm_pitch & 0x3) == 0); - debug_assert((fd6_ctx->vsc_prim_strm_pitch & 0x3) == 0); - - /* Check for overflow, write vsc_scratch if detected: */ - for (int i = 0; i < gmem->num_vsc_pipes; i++) { - OUT_PKT7(ring, CP_COND_WRITE5, 8); - OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | - 
CP_COND_WRITE5_0_WRITE_MEMORY); - OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i))); - OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); - OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_draw_strm_pitch - 64)); - OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); - OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */ - OUT_RING(ring, CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_draw_strm_pitch)); - - OUT_PKT7(ring, CP_COND_WRITE5, 8); - OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | - CP_COND_WRITE5_0_WRITE_MEMORY); - OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO(REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i))); - OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); - OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_prim_strm_pitch - 64)); - OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); - OUT_RELOC(ring, control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */ - OUT_RING(ring, CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_prim_strm_pitch)); - } - - OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd6_context *fd6_ctx = fd6_context(batch->ctx); + + debug_assert((fd6_ctx->vsc_draw_strm_pitch & 0x3) == 0); + debug_assert((fd6_ctx->vsc_prim_strm_pitch & 0x3) == 0); + + /* Check for overflow, write vsc_scratch if detected: */ + for (int i = 0; i < gmem->num_vsc_pipes; i++) { + OUT_PKT7(ring, CP_COND_WRITE5, 8); + OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | + CP_COND_WRITE5_0_WRITE_MEMORY); + OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO( + REG_A6XX_VSC_DRAW_STRM_SIZE_REG(i))); + OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); + OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_draw_strm_pitch - 64)); + OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); + OUT_RELOC(ring, + control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */ + OUT_RING(ring, + CP_COND_WRITE5_7_WRITE_DATA(1 + fd6_ctx->vsc_draw_strm_pitch)); + + OUT_PKT7(ring, CP_COND_WRITE5, 8); + OUT_RING(ring, CP_COND_WRITE5_0_FUNCTION(WRITE_GE) | + CP_COND_WRITE5_0_WRITE_MEMORY); + OUT_RING(ring, CP_COND_WRITE5_1_POLL_ADDR_LO( + REG_A6XX_VSC_PRIM_STRM_SIZE_REG(i))); + OUT_RING(ring, CP_COND_WRITE5_2_POLL_ADDR_HI(0)); + OUT_RING(ring, CP_COND_WRITE5_3_REF(fd6_ctx->vsc_prim_strm_pitch - 64)); + OUT_RING(ring, CP_COND_WRITE5_4_MASK(~0)); + OUT_RELOC(ring, + control_ptr(fd6_ctx, vsc_overflow)); /* WRITE_ADDR_LO/HI */ + OUT_RING(ring, + CP_COND_WRITE5_7_WRITE_DATA(3 + fd6_ctx->vsc_prim_strm_pitch)); + } + + OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); } static void check_vsc_overflow(struct fd_context *ctx) { - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd6_control *control = fd_bo_map(fd6_ctx->control_mem); - uint32_t vsc_overflow = control->vsc_overflow; - - if (!vsc_overflow) - return; - - /* clear overflow flag: */ - control->vsc_overflow = 0; - - unsigned buffer = vsc_overflow & 0x3; - unsigned size = vsc_overflow & ~0x3; - - if (buffer == 0x1) { - /* VSC_DRAW_STRM overflow: */ - - if (size < fd6_ctx->vsc_draw_strm_pitch) { - /* we've already increased the size, this overflow is - * from a batch submitted before resize, but executed - * after - */ - return; - } - - fd_bo_del(fd6_ctx->vsc_draw_strm); - fd6_ctx->vsc_draw_strm = NULL; - fd6_ctx->vsc_draw_strm_pitch *= 2; - - mesa_logd("resized VSC_DRAW_STRM_PITCH to: 0x%x", - fd6_ctx->vsc_draw_strm_pitch); - - } else if (buffer == 0x3) { - /* VSC_PRIM_STRM overflow: */ - - if (size < fd6_ctx->vsc_prim_strm_pitch) { - /* we've already increased 
the size */ - return; - } - - fd_bo_del(fd6_ctx->vsc_prim_strm); - fd6_ctx->vsc_prim_strm = NULL; - fd6_ctx->vsc_prim_strm_pitch *= 2; - - mesa_logd("resized VSC_PRIM_STRM_PITCH to: 0x%x", - fd6_ctx->vsc_prim_strm_pitch); - - } else { - /* NOTE: it's possible, for example, for overflow to corrupt the - * control page. I mostly just see this hit if I set initial VSC - * buffer size extremely small. Things still seem to recover, - * but maybe we should pre-emptively realloc vsc_data/vsc_data2 - * and hope for different memory placement? - */ - mesa_loge("invalid vsc_overflow value: 0x%08x", vsc_overflow); - } + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd6_control *control = fd_bo_map(fd6_ctx->control_mem); + uint32_t vsc_overflow = control->vsc_overflow; + + if (!vsc_overflow) + return; + + /* clear overflow flag: */ + control->vsc_overflow = 0; + + unsigned buffer = vsc_overflow & 0x3; + unsigned size = vsc_overflow & ~0x3; + + if (buffer == 0x1) { + /* VSC_DRAW_STRM overflow: */ + + if (size < fd6_ctx->vsc_draw_strm_pitch) { + /* we've already increased the size, this overflow is + * from a batch submitted before resize, but executed + * after + */ + return; + } + + fd_bo_del(fd6_ctx->vsc_draw_strm); + fd6_ctx->vsc_draw_strm = NULL; + fd6_ctx->vsc_draw_strm_pitch *= 2; + + mesa_logd("resized VSC_DRAW_STRM_PITCH to: 0x%x", + fd6_ctx->vsc_draw_strm_pitch); + + } else if (buffer == 0x3) { + /* VSC_PRIM_STRM overflow: */ + + if (size < fd6_ctx->vsc_prim_strm_pitch) { + /* we've already increased the size */ + return; + } + + fd_bo_del(fd6_ctx->vsc_prim_strm); + fd6_ctx->vsc_prim_strm = NULL; + fd6_ctx->vsc_prim_strm_pitch *= 2; + + mesa_logd("resized VSC_PRIM_STRM_PITCH to: 0x%x", + fd6_ctx->vsc_prim_strm_pitch); + + } else { + /* NOTE: it's possible, for example, for overflow to corrupt the + * control page. I mostly just see this hit if I set initial VSC + * buffer size extremely small. Things still seem to recover, + * but maybe we should pre-emptively realloc vsc_data/vsc_data2 + * and hope for different memory placement? 
+ */ + mesa_loge("invalid vsc_overflow value: 0x%08x", vsc_overflow); + } } static void emit_common_init(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_autotune *at = &batch->ctx->autotune; - struct fd_batch_result *result = batch->autotune_result; + struct fd_ringbuffer *ring = batch->gmem; + struct fd_autotune *at = &batch->ctx->autotune; + struct fd_batch_result *result = batch->autotune_result; - if (!result) - return; + if (!result) + return; - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); - OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start)); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); + OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_start)); - fd6_event_write(batch, ring, ZPASS_DONE, false); + fd6_event_write(batch, ring, ZPASS_DONE, false); } static void emit_common_fini(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_autotune *at = &batch->ctx->autotune; - struct fd_batch_result *result = batch->autotune_result; + struct fd_ringbuffer *ring = batch->gmem; + struct fd_autotune *at = &batch->ctx->autotune; + struct fd_batch_result *result = batch->autotune_result; - if (!result) - return; + if (!result) + return; - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); - OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end)); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); + OUT_RELOC(ring, results_ptr(at, result[result->idx].samples_end)); - fd6_event_write(batch, ring, ZPASS_DONE, false); + fd6_event_write(batch, ring, ZPASS_DONE, false); - // TODO is there a better event to use.. a single ZPASS_DONE_TS would be nice - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); - OUT_RELOC(ring, results_ptr(at, fence)); - OUT_RING(ring, result->fence); + // TODO is there a better event to use.. 
a single ZPASS_DONE_TS would be nice + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS)); + OUT_RELOC(ring, results_ptr(at, fence)); + OUT_RING(ring, result->fence); } /* @@ -521,169 +533,169 @@ emit_common_fini(struct fd_batch *batch) */ static void emit_conditional_ib(struct fd_batch *batch, const struct fd_tile *tile, - struct fd_ringbuffer *target) + struct fd_ringbuffer *target) { - struct fd_ringbuffer *ring = batch->gmem; + struct fd_ringbuffer *ring = batch->gmem; - if (target->cur == target->start) - return; + if (target->cur == target->start) + return; - emit_marker6(ring, 6); + emit_marker6(ring, 6); - unsigned count = fd_ringbuffer_cmd_count(target); + unsigned count = fd_ringbuffer_cmd_count(target); - BEGIN_RING(ring, 5 + 4 * count); /* ensure conditional doesn't get split */ + BEGIN_RING(ring, 5 + 4 * count); /* ensure conditional doesn't get split */ - OUT_PKT7(ring, CP_REG_TEST, 1); - OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(tile->p)) | - A6XX_CP_REG_TEST_0_BIT(tile->n) | - A6XX_CP_REG_TEST_0_WAIT_FOR_ME); + OUT_PKT7(ring, CP_REG_TEST, 1); + OUT_RING(ring, A6XX_CP_REG_TEST_0_REG(REG_A6XX_VSC_STATE_REG(tile->p)) | + A6XX_CP_REG_TEST_0_BIT(tile->n) | + A6XX_CP_REG_TEST_0_WAIT_FOR_ME); - OUT_PKT7(ring, CP_COND_REG_EXEC, 2); - OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); - OUT_RING(ring, CP_COND_REG_EXEC_1_DWORDS(4 * count)); + OUT_PKT7(ring, CP_COND_REG_EXEC, 2); + OUT_RING(ring, CP_COND_REG_EXEC_0_MODE(PRED_TEST)); + OUT_RING(ring, CP_COND_REG_EXEC_1_DWORDS(4 * count)); - for (unsigned i = 0; i < count; i++) { - uint32_t dwords; - OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); - dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; - assert(dwords > 0); - OUT_RING(ring, dwords); - } + for (unsigned i = 0; i < count; i++) { + uint32_t dwords; + OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); + dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; + assert(dwords > 0); + OUT_RING(ring, dwords); + } - emit_marker6(ring, 6); + emit_marker6(ring, 6); } static void -set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2) +set_scissor(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1, uint32_t x2, + uint32_t y2) { - OUT_REG(ring, - A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1), - A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); + OUT_REG(ring, A6XX_GRAS_SC_WINDOW_SCISSOR_TL(.x = x1, .y = y1), + A6XX_GRAS_SC_WINDOW_SCISSOR_BR(.x = x2, .y = y2)); - OUT_REG(ring, - A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1), - A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2)); + OUT_REG(ring, A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = x1, .y = y1), + A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = x2, .y = y2)); } static void set_bin_size(struct fd_ringbuffer *ring, uint32_t w, uint32_t h, uint32_t flag) { - OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(.binw = w, .binh = h, .dword = flag)); - OUT_REG(ring, A6XX_RB_BIN_CONTROL(.binw = w, .binh = h, .dword = flag)); - /* no flag for RB_BIN_CONTROL2... */ - OUT_REG(ring, A6XX_RB_BIN_CONTROL2(.binw = w, .binh = h)); + OUT_REG(ring, A6XX_GRAS_BIN_CONTROL(.binw = w, .binh = h, .dword = flag)); + OUT_REG(ring, A6XX_RB_BIN_CONTROL(.binw = w, .binh = h, .dword = flag)); + /* no flag for RB_BIN_CONTROL2... 
*/ + OUT_REG(ring, A6XX_RB_BIN_CONTROL2(.binw = w, .binh = h)); } static void -emit_binning_pass(struct fd_batch *batch) - assert_dt +emit_binning_pass(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_screen *screen = batch->ctx->screen; + struct fd_ringbuffer *ring = batch->gmem; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd_screen *screen = batch->ctx->screen; - debug_assert(!batch->tessellation); + debug_assert(!batch->tessellation); - set_scissor(ring, 0, 0, gmem->width - 1, gmem->height - 1); + set_scissor(ring, 0, 0, gmem->width - 1, gmem->height - 1); - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING)); - emit_marker6(ring, 7); + emit_marker6(ring, 7); + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BINNING)); + emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x1); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x1); + OUT_PKT7(ring, CP_SET_MODE, 1); + OUT_RING(ring, 0x1); - OUT_WFI5(ring); + OUT_WFI5(ring); - OUT_REG(ring, A6XX_VFD_MODE_CNTL(.binning_pass = true)); + OUT_REG(ring, A6XX_VFD_MODE_CNTL(.binning_pass = true)); - update_vsc_pipe(batch); + update_vsc_pipe(batch); - OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1); - OUT_RING(ring, screen->info.a6xx.magic.PC_UNKNOWN_9805); + OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1); + OUT_RING(ring, screen->info.a6xx.magic.PC_UNKNOWN_9805); - OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1); - OUT_RING(ring, screen->info.a6xx.magic.SP_UNKNOWN_A0F8); + OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1); + OUT_RING(ring, screen->info.a6xx.magic.SP_UNKNOWN_A0F8); - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, UNK_2C); + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, UNK_2C); - OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(0) | - A6XX_RB_WINDOW_OFFSET_Y(0)); + OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(0) | A6XX_RB_WINDOW_OFFSET_Y(0)); - OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_SP_TP_WINDOW_OFFSET_X(0) | - A6XX_SP_TP_WINDOW_OFFSET_Y(0)); + OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1); + OUT_RING(ring, + A6XX_SP_TP_WINDOW_OFFSET_X(0) | A6XX_SP_TP_WINDOW_OFFSET_Y(0)); - /* emit IB to binning drawcmds: */ - trace_start_binning_ib(&batch->trace); - fd6_emit_ib(ring, batch->draw); - trace_end_binning_ib(&batch->trace); + /* emit IB to binning drawcmds: */ + trace_start_binning_ib(&batch->trace); + fd6_emit_ib(ring, batch->draw); + trace_end_binning_ib(&batch->trace); - fd_reset_wfi(batch); + fd_reset_wfi(batch); - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - OUT_PKT7(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, UNK_2D); + OUT_PKT7(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, UNK_2D); - fd6_cache_inv(batch, 
ring); - fd6_cache_flush(batch, ring); - fd_wfi(batch, ring); + fd6_cache_inv(batch, ring); + fd6_cache_flush(batch, ring); + fd_wfi(batch, ring); - OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); + OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); - trace_start_vsc_overflow_test(&batch->trace); - emit_vsc_overflow_test(batch); - trace_end_vsc_overflow_test(&batch->trace); + trace_start_vsc_overflow_test(&batch->trace); + emit_vsc_overflow_test(batch); + trace_end_vsc_overflow_test(&batch->trace); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x0); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SET_MODE, 1); + OUT_RING(ring, 0x0); - OUT_WFI5(ring); + OUT_WFI5(ring); - OUT_REG(ring, - A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_gmem, - .gmem = true, - .unk2 = screen->info.a6xx.ccu_cntl_gmem_unk2)); + OUT_REG(ring, + A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_gmem, + .gmem = true, + .unk2 = screen->info.a6xx.ccu_cntl_gmem_unk2)); } static void emit_msaa(struct fd_ringbuffer *ring, unsigned nr) { - enum a3xx_msaa_samples samples = fd_msaa_samples(nr); - - OUT_PKT4(ring, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE)); - - OUT_PKT4(ring, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE)); - - OUT_PKT4(ring, REG_A6XX_RB_RAS_MSAA_CNTL, 2); - OUT_RING(ring, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); - OUT_RING(ring, A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | - COND(samples == MSAA_ONE, A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE)); - - OUT_PKT4(ring, REG_A6XX_RB_MSAA_CNTL, 1); - OUT_RING(ring, A6XX_RB_MSAA_CNTL_SAMPLES(samples)); + enum a3xx_msaa_samples samples = fd_msaa_samples(nr); + + OUT_PKT4(ring, REG_A6XX_SP_TP_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A6XX_SP_TP_RAS_MSAA_CNTL_SAMPLES(samples)); + OUT_RING(ring, A6XX_SP_TP_DEST_MSAA_CNTL_SAMPLES(samples) | + COND(samples == MSAA_ONE, + A6XX_SP_TP_DEST_MSAA_CNTL_MSAA_DISABLE)); + + OUT_PKT4(ring, REG_A6XX_GRAS_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A6XX_GRAS_RAS_MSAA_CNTL_SAMPLES(samples)); + OUT_RING(ring, A6XX_GRAS_DEST_MSAA_CNTL_SAMPLES(samples) | + COND(samples == MSAA_ONE, + A6XX_GRAS_DEST_MSAA_CNTL_MSAA_DISABLE)); + + OUT_PKT4(ring, REG_A6XX_RB_RAS_MSAA_CNTL, 2); + OUT_RING(ring, A6XX_RB_RAS_MSAA_CNTL_SAMPLES(samples)); + OUT_RING(ring, + A6XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) | + COND(samples == MSAA_ONE, A6XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE)); + + OUT_PKT4(ring, REG_A6XX_RB_MSAA_CNTL, 1); + OUT_RING(ring, A6XX_RB_MSAA_CNTL_SAMPLES(samples)); } static void prepare_tile_setup_ib(struct fd_batch *batch); @@ -691,415 +703,406 @@ static void prepare_tile_fini_ib(struct fd_batch *batch); /* before first tile */ static void -fd6_emit_tile_init(struct fd_batch *batch) - assert_dt +fd6_emit_tile_init(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd_screen *screen = batch->ctx->screen; + struct fd_ringbuffer *ring = batch->gmem; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + const struct fd_gmem_stateobj *gmem = 
batch->gmem_state; + struct fd_screen *screen = batch->ctx->screen; - fd6_emit_restore(batch, ring); + fd6_emit_restore(batch, ring); - fd6_emit_lrz_flush(ring); + fd6_emit_lrz_flush(ring); - if (batch->prologue) { - trace_start_prologue(&batch->trace); - fd6_emit_ib(ring, batch->prologue); - trace_end_prologue(&batch->trace); - } + if (batch->prologue) { + trace_start_prologue(&batch->trace); + fd6_emit_ib(ring, batch->prologue); + trace_end_prologue(&batch->trace); + } - fd6_cache_inv(batch, ring); + fd6_cache_inv(batch, ring); - prepare_tile_setup_ib(batch); - prepare_tile_fini_ib(batch); + prepare_tile_setup_ib(batch); + prepare_tile_fini_ib(batch); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - /* blob controls "local" in IB2, but I think that is not required */ - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); - OUT_RING(ring, 0x1); + /* blob controls "local" in IB2, but I think that is not required */ + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); + OUT_RING(ring, 0x1); - fd_wfi(batch, ring); - OUT_REG(ring, - A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_gmem, - .gmem = true, - .unk2 = screen->info.a6xx.ccu_cntl_gmem_unk2)); + fd_wfi(batch, ring); + OUT_REG(ring, + A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_gmem, + .gmem = true, + .unk2 = screen->info.a6xx.ccu_cntl_gmem_unk2)); - emit_zs(ring, pfb->zsbuf, batch->gmem_state); - emit_mrt(ring, pfb, batch->gmem_state); - emit_msaa(ring, pfb->samples); - patch_fb_read(batch); + emit_zs(ring, pfb->zsbuf, batch->gmem_state); + emit_mrt(ring, pfb, batch->gmem_state); + emit_msaa(ring, pfb->samples); + patch_fb_read(batch); - if (use_hw_binning(batch)) { - /* enable stream-out during binning pass: */ - OUT_REG(ring, A6XX_VPC_SO_DISABLE(false)); + if (use_hw_binning(batch)) { + /* enable stream-out during binning pass: */ + OUT_REG(ring, A6XX_VPC_SO_DISABLE(false)); - set_bin_size(ring, gmem->bin_w, gmem->bin_h, - A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000); - update_render_cntl(batch, pfb, true); - emit_binning_pass(batch); + set_bin_size(ring, gmem->bin_w, gmem->bin_h, + A6XX_RB_BIN_CONTROL_BINNING_PASS | 0x6000000); + update_render_cntl(batch, pfb, true); + emit_binning_pass(batch); - /* and disable stream-out for draw pass: */ - OUT_REG(ring, A6XX_VPC_SO_DISABLE(true)); + /* and disable stream-out for draw pass: */ + OUT_REG(ring, A6XX_VPC_SO_DISABLE(true)); - /* - * NOTE: even if we detect VSC overflow and disable use of - * visibility stream in draw pass, it is still safe to execute - * the reset of these cmds: - */ + /* + * NOTE: even if we detect VSC overflow and disable use of + * visibility stream in draw pass, it is still safe to execute + * the reset of these cmds: + */ -// NOTE a618 not setting .USE_VIZ .. from a quick check on a630, it -// does not appear that this bit changes much (ie. it isn't actually -// .USE_VIZ like previous gens) - set_bin_size(ring, gmem->bin_w, gmem->bin_h, - A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000); + // NOTE a618 not setting .USE_VIZ .. from a quick check on a630, it + // does not appear that this bit changes much (ie. 
it isn't actually + // .USE_VIZ like previous gens) + set_bin_size(ring, gmem->bin_w, gmem->bin_h, + A6XX_RB_BIN_CONTROL_USE_VIZ | 0x6000000); - OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1); - OUT_RING(ring, 0x0); + OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1); + OUT_RING(ring, 0x0); - OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1); - OUT_RING(ring, screen->info.a6xx.magic.PC_UNKNOWN_9805); + OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9805, 1); + OUT_RING(ring, screen->info.a6xx.magic.PC_UNKNOWN_9805); - OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1); - OUT_RING(ring, screen->info.a6xx.magic.SP_UNKNOWN_A0F8); + OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A0F8, 1); + OUT_RING(ring, screen->info.a6xx.magic.SP_UNKNOWN_A0F8); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x1); - } else { - /* no binning pass, so enable stream-out for draw pass:: */ - OUT_REG(ring, A6XX_VPC_SO_DISABLE(false)); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x1); + } else { + /* no binning pass, so enable stream-out for draw pass:: */ + OUT_REG(ring, A6XX_VPC_SO_DISABLE(false)); - set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000); - } + set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000); + } - update_render_cntl(batch, pfb, false); + update_render_cntl(batch, pfb, false); - emit_common_init(batch); + emit_common_init(batch); } static void set_window_offset(struct fd_ringbuffer *ring, uint32_t x1, uint32_t y1) { - OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(x1) | - A6XX_RB_WINDOW_OFFSET_Y(y1)); + OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET, 1); + OUT_RING(ring, A6XX_RB_WINDOW_OFFSET_X(x1) | A6XX_RB_WINDOW_OFFSET_Y(y1)); - OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET2, 1); - OUT_RING(ring, A6XX_RB_WINDOW_OFFSET2_X(x1) | - A6XX_RB_WINDOW_OFFSET2_Y(y1)); + OUT_PKT4(ring, REG_A6XX_RB_WINDOW_OFFSET2, 1); + OUT_RING(ring, A6XX_RB_WINDOW_OFFSET2_X(x1) | A6XX_RB_WINDOW_OFFSET2_Y(y1)); - OUT_PKT4(ring, REG_A6XX_SP_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_SP_WINDOW_OFFSET_X(x1) | - A6XX_SP_WINDOW_OFFSET_Y(y1)); + OUT_PKT4(ring, REG_A6XX_SP_WINDOW_OFFSET, 1); + OUT_RING(ring, A6XX_SP_WINDOW_OFFSET_X(x1) | A6XX_SP_WINDOW_OFFSET_Y(y1)); - OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1); - OUT_RING(ring, A6XX_SP_TP_WINDOW_OFFSET_X(x1) | - A6XX_SP_TP_WINDOW_OFFSET_Y(y1)); + OUT_PKT4(ring, REG_A6XX_SP_TP_WINDOW_OFFSET, 1); + OUT_RING(ring, + A6XX_SP_TP_WINDOW_OFFSET_X(x1) | A6XX_SP_TP_WINDOW_OFFSET_Y(y1)); } /* before mem2gmem */ static void fd6_emit_tile_prep(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_context *ctx = batch->ctx; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd_ringbuffer *ring = batch->gmem; + struct fd_context *ctx = batch->ctx; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd_ringbuffer *ring = batch->gmem; - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM)); - emit_marker6(ring, 7); + emit_marker6(ring, 7); + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_GMEM)); + emit_marker6(ring, 7); - uint32_t x1 = tile->xoff; - uint32_t y1 = tile->yoff; - uint32_t x2 = tile->xoff + tile->bin_w - 1; - uint32_t y2 = tile->yoff + tile->bin_h - 1; + uint32_t x1 = tile->xoff; + uint32_t y1 = tile->yoff; + uint32_t x2 = tile->xoff + tile->bin_w - 1; + uint32_t y2 = tile->yoff + tile->bin_h - 1; - 
set_scissor(ring, x1, y1, x2, y2); + set_scissor(ring, x1, y1, x2, y2); - if (use_hw_binning(batch)) { - const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; + if (use_hw_binning(batch)) { + const struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[tile->p]; - OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); + OUT_PKT7(ring, CP_WAIT_FOR_ME, 0); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SET_MODE, 1); + OUT_RING(ring, 0x0); - OUT_PKT7(ring, CP_SET_BIN_DATA5, 7); - OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) | - CP_SET_BIN_DATA5_0_VSC_N(tile->n)); - OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */ - (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0); - OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */ - (tile->p * 4) + (32 * fd6_ctx->vsc_draw_strm_pitch), 0, 0); - OUT_RELOC(ring, fd6_ctx->vsc_prim_strm, - (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0); + OUT_PKT7(ring, CP_SET_BIN_DATA5, 7); + OUT_RING(ring, CP_SET_BIN_DATA5_0_VSC_SIZE(pipe->w * pipe->h) | + CP_SET_BIN_DATA5_0_VSC_N(tile->n)); + OUT_RELOC(ring, fd6_ctx->vsc_draw_strm, /* per-pipe draw-stream address */ + (tile->p * fd6_ctx->vsc_draw_strm_pitch), 0, 0); + OUT_RELOC(ring, + fd6_ctx->vsc_draw_strm, /* VSC_DRAW_STRM_ADDRESS + (p * 4) */ + (tile->p * 4) + (32 * fd6_ctx->vsc_draw_strm_pitch), 0, 0); + OUT_RELOC(ring, fd6_ctx->vsc_prim_strm, + (tile->p * fd6_ctx->vsc_prim_strm_pitch), 0, 0); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x0); - set_window_offset(ring, x1, y1); + set_window_offset(ring, x1, y1); - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000); + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + set_bin_size(ring, gmem->bin_w, gmem->bin_h, 0x6000000); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x0); - } else { - set_window_offset(ring, x1, y1); + OUT_PKT7(ring, CP_SET_MODE, 1); + OUT_RING(ring, 0x0); + } else { + set_window_offset(ring, x1, y1); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x1); - OUT_PKT7(ring, CP_SET_MODE, 1); - OUT_RING(ring, 0x0); - } + OUT_PKT7(ring, CP_SET_MODE, 1); + OUT_RING(ring, 0x0); + } } static void set_blit_scissor(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct pipe_scissor_state blit_scissor = batch->max_scissor; - - blit_scissor.minx = ROUND_DOWN_TO(blit_scissor.minx, 16); - blit_scissor.miny = ROUND_DOWN_TO(blit_scissor.miny, 4); - blit_scissor.maxx = ALIGN(blit_scissor.maxx, 16); - blit_scissor.maxy = ALIGN(blit_scissor.maxy, 4); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); - OUT_RING(ring, - A6XX_RB_BLIT_SCISSOR_TL_X(blit_scissor.minx) | - A6XX_RB_BLIT_SCISSOR_TL_Y(blit_scissor.miny)); - OUT_RING(ring, - A6XX_RB_BLIT_SCISSOR_BR_X(blit_scissor.maxx - 1) | - A6XX_RB_BLIT_SCISSOR_BR_Y(blit_scissor.maxy - 1)); + struct pipe_scissor_state blit_scissor = batch->max_scissor; + + blit_scissor.minx = ROUND_DOWN_TO(blit_scissor.minx, 16); + blit_scissor.miny = ROUND_DOWN_TO(blit_scissor.miny, 4); + blit_scissor.maxx = ALIGN(blit_scissor.maxx, 16); + blit_scissor.maxy = ALIGN(blit_scissor.maxy, 4); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_SCISSOR_TL, 2); + OUT_RING(ring, A6XX_RB_BLIT_SCISSOR_TL_X(blit_scissor.minx) | + A6XX_RB_BLIT_SCISSOR_TL_Y(blit_scissor.miny)); + OUT_RING(ring, A6XX_RB_BLIT_SCISSOR_BR_X(blit_scissor.maxx 
- 1) | + A6XX_RB_BLIT_SCISSOR_BR_Y(blit_scissor.maxy - 1)); } static void -emit_blit(struct fd_batch *batch, - struct fd_ringbuffer *ring, - uint32_t base, - struct pipe_surface *psurf, - bool stencil) +emit_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, uint32_t base, + struct pipe_surface *psurf, bool stencil) { - struct fd_resource *rsc = fd_resource(psurf->texture); - enum pipe_format pfmt = psurf->format; - uint32_t offset; - bool ubwc_enabled; - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - /* separate stencil case: */ - if (stencil) { - rsc = rsc->stencil; - pfmt = rsc->b.b.format; - } - - offset = fd_resource_offset(rsc, psurf->u.tex.level, - psurf->u.tex.first_layer); - ubwc_enabled = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level); - - debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); - - enum a6xx_format format = fd6_pipe2color(pfmt); - uint32_t stride = fd_resource_pitch(rsc, psurf->u.tex.level); - uint32_t size = fd_resource_slice(rsc, psurf->u.tex.level)->size0; - enum a3xx_color_swap swap = fd6_resource_swap(rsc, pfmt); - enum a3xx_msaa_samples samples = - fd_msaa_samples(rsc->b.b.nr_samples); - uint32_t tile_mode = fd_resource_tile_mode(&rsc->b.b, psurf->u.tex.level); - - OUT_REG(ring, - A6XX_RB_BLIT_DST_INFO(.tile_mode = tile_mode, .samples = samples, - .color_format = format, .color_swap = swap, .flags = ubwc_enabled), - A6XX_RB_BLIT_DST(.bo = rsc->bo, .bo_offset = offset), - A6XX_RB_BLIT_DST_PITCH(.a6xx_rb_blit_dst_pitch = stride), - A6XX_RB_BLIT_DST_ARRAY_PITCH(.a6xx_rb_blit_dst_array_pitch = size)); - - OUT_REG(ring, A6XX_RB_BLIT_BASE_GMEM(.dword = base)); - - if (ubwc_enabled) { - OUT_PKT4(ring, REG_A6XX_RB_BLIT_FLAG_DST, 3); - fd6_emit_flag_reference(ring, rsc, - psurf->u.tex.level, psurf->u.tex.first_layer); - } - - fd6_emit_blit(batch, ring); + struct fd_resource *rsc = fd_resource(psurf->texture); + enum pipe_format pfmt = psurf->format; + uint32_t offset; + bool ubwc_enabled; + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + /* separate stencil case: */ + if (stencil) { + rsc = rsc->stencil; + pfmt = rsc->b.b.format; + } + + offset = + fd_resource_offset(rsc, psurf->u.tex.level, psurf->u.tex.first_layer); + ubwc_enabled = fd_resource_ubwc_enabled(rsc, psurf->u.tex.level); + + debug_assert(psurf->u.tex.first_layer == psurf->u.tex.last_layer); + + enum a6xx_format format = fd6_pipe2color(pfmt); + uint32_t stride = fd_resource_pitch(rsc, psurf->u.tex.level); + uint32_t size = fd_resource_slice(rsc, psurf->u.tex.level)->size0; + enum a3xx_color_swap swap = fd6_resource_swap(rsc, pfmt); + enum a3xx_msaa_samples samples = fd_msaa_samples(rsc->b.b.nr_samples); + uint32_t tile_mode = fd_resource_tile_mode(&rsc->b.b, psurf->u.tex.level); + + OUT_REG(ring, + A6XX_RB_BLIT_DST_INFO(.tile_mode = tile_mode, .samples = samples, + .color_format = format, .color_swap = swap, + .flags = ubwc_enabled), + A6XX_RB_BLIT_DST(.bo = rsc->bo, .bo_offset = offset), + A6XX_RB_BLIT_DST_PITCH(.a6xx_rb_blit_dst_pitch = stride), + A6XX_RB_BLIT_DST_ARRAY_PITCH(.a6xx_rb_blit_dst_array_pitch = size)); + + OUT_REG(ring, A6XX_RB_BLIT_BASE_GMEM(.dword = base)); + + if (ubwc_enabled) { + OUT_PKT4(ring, REG_A6XX_RB_BLIT_FLAG_DST, 3); + fd6_emit_flag_reference(ring, rsc, psurf->u.tex.level, + psurf->u.tex.first_layer); + } + + fd6_emit_blit(batch, ring); } static void -emit_restore_blit(struct fd_batch *batch, - struct fd_ringbuffer *ring, - uint32_t base, - struct pipe_surface *psurf, - unsigned buffer) 
+emit_restore_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, + uint32_t base, struct pipe_surface *psurf, unsigned buffer) { - bool stencil = (buffer == FD_BUFFER_STENCIL); + bool stencil = (buffer == FD_BUFFER_STENCIL); - OUT_REG(ring, A6XX_RB_BLIT_INFO( - .gmem = true, .unk0 = true, - .depth = (buffer == FD_BUFFER_DEPTH), - .sample_0 = util_format_is_pure_integer(psurf->format))); + OUT_REG(ring, A6XX_RB_BLIT_INFO(.gmem = true, .unk0 = true, + .depth = (buffer == FD_BUFFER_DEPTH), + .sample_0 = util_format_is_pure_integer( + psurf->format))); - emit_blit(batch, ring, base, psurf, stencil); + emit_blit(batch, ring, base, psurf, stencil); } static void emit_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples); - - uint32_t buffers = batch->fast_cleared; - - if (buffers & PIPE_CLEAR_COLOR) { - - for (int i = 0; i < pfb->nr_cbufs; i++) { - union pipe_color_union *color = &batch->clear_color[i]; - union util_color uc = {0}; - - if (!pfb->cbufs[i]) - continue; - - if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) - continue; - - enum pipe_format pfmt = pfb->cbufs[i]->format; - - // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP?? - union pipe_color_union swapped; - switch (fd6_pipe2swap(pfmt)) { - case WZYX: - swapped.ui[0] = color->ui[0]; - swapped.ui[1] = color->ui[1]; - swapped.ui[2] = color->ui[2]; - swapped.ui[3] = color->ui[3]; - break; - case WXYZ: - swapped.ui[2] = color->ui[0]; - swapped.ui[1] = color->ui[1]; - swapped.ui[0] = color->ui[2]; - swapped.ui[3] = color->ui[3]; - break; - case ZYXW: - swapped.ui[3] = color->ui[0]; - swapped.ui[0] = color->ui[1]; - swapped.ui[1] = color->ui[2]; - swapped.ui[2] = color->ui[3]; - break; - case XYZW: - swapped.ui[3] = color->ui[0]; - swapped.ui[2] = color->ui[1]; - swapped.ui[1] = color->ui[2]; - swapped.ui[0] = color->ui[3]; - break; - } - - util_pack_color_union(pfmt, &uc, &swapped); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1); - OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_pipe2color(pfmt))); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); - OUT_RING(ring, A6XX_RB_BLIT_INFO_GMEM | - A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf)); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - OUT_RING(ring, gmem->cbuf_base[i]); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); - OUT_RING(ring, uc.ui[0]); - OUT_RING(ring, uc.ui[1]); - OUT_RING(ring, uc.ui[2]); - OUT_RING(ring, uc.ui[3]); - - fd6_emit_blit(batch, ring); - } - } - - const bool has_depth = pfb->zsbuf; - const bool has_separate_stencil = - has_depth && fd_resource(pfb->zsbuf->texture)->stencil; - - /* First clear depth or combined depth/stencil. 
*/ - if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) || - (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) { - enum pipe_format pfmt = pfb->zsbuf->format; - uint32_t clear_value; - uint32_t mask = 0; - - if (has_separate_stencil) { - pfmt = util_format_get_depth_only(pfb->zsbuf->format); - clear_value = util_pack_z(pfmt, batch->clear_depth); - } else { - pfmt = pfb->zsbuf->format; - clear_value = util_pack_z_stencil(pfmt, batch->clear_depth, - batch->clear_stencil); - } - - if (buffers & PIPE_CLEAR_DEPTH) - mask |= 0x1; - - if (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) - mask |= 0x2; - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1); - OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_pipe2color(pfmt))); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); - OUT_RING(ring, A6XX_RB_BLIT_INFO_GMEM | - // XXX UNK0 for separate stencil ?? - A6XX_RB_BLIT_INFO_DEPTH | - A6XX_RB_BLIT_INFO_CLEAR_MASK(mask)); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - OUT_RING(ring, gmem->zsbuf_base[0]); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1); - OUT_RING(ring, clear_value); - - fd6_emit_blit(batch, ring); - } - - /* Then clear the separate stencil buffer in case of 32 bit depth - * formats with separate stencil. */ - if (has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) { - OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1); - OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | - A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | - A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(FMT6_8_UINT)); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); - OUT_RING(ring, A6XX_RB_BLIT_INFO_GMEM | - //A6XX_RB_BLIT_INFO_UNK0 | - A6XX_RB_BLIT_INFO_DEPTH | - A6XX_RB_BLIT_INFO_CLEAR_MASK(0x1)); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); - OUT_RING(ring, gmem->zsbuf_base[1]); - - OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1); - OUT_RING(ring, batch->clear_stencil & 0xff); - - fd6_emit_blit(batch, ring); - } + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples); + + uint32_t buffers = batch->fast_cleared; + + if (buffers & PIPE_CLEAR_COLOR) { + + for (int i = 0; i < pfb->nr_cbufs; i++) { + union pipe_color_union *color = &batch->clear_color[i]; + union util_color uc = {0}; + + if (!pfb->cbufs[i]) + continue; + + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; + + enum pipe_format pfmt = pfb->cbufs[i]->format; + + // XXX I think RB_CLEAR_COLOR_DWn wants to take into account SWAP?? 
+ union pipe_color_union swapped; + switch (fd6_pipe2swap(pfmt)) { + case WZYX: + swapped.ui[0] = color->ui[0]; + swapped.ui[1] = color->ui[1]; + swapped.ui[2] = color->ui[2]; + swapped.ui[3] = color->ui[3]; + break; + case WXYZ: + swapped.ui[2] = color->ui[0]; + swapped.ui[1] = color->ui[1]; + swapped.ui[0] = color->ui[2]; + swapped.ui[3] = color->ui[3]; + break; + case ZYXW: + swapped.ui[3] = color->ui[0]; + swapped.ui[0] = color->ui[1]; + swapped.ui[1] = color->ui[2]; + swapped.ui[2] = color->ui[3]; + break; + case XYZW: + swapped.ui[3] = color->ui[0]; + swapped.ui[2] = color->ui[1]; + swapped.ui[1] = color->ui[2]; + swapped.ui[0] = color->ui[3]; + break; + } + + util_pack_color_union(pfmt, &uc, &swapped); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1); + OUT_RING(ring, + A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_pipe2color(pfmt))); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); + OUT_RING(ring, + A6XX_RB_BLIT_INFO_GMEM | A6XX_RB_BLIT_INFO_CLEAR_MASK(0xf)); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + OUT_RING(ring, gmem->cbuf_base[i]); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4); + OUT_RING(ring, uc.ui[0]); + OUT_RING(ring, uc.ui[1]); + OUT_RING(ring, uc.ui[2]); + OUT_RING(ring, uc.ui[3]); + + fd6_emit_blit(batch, ring); + } + } + + const bool has_depth = pfb->zsbuf; + const bool has_separate_stencil = + has_depth && fd_resource(pfb->zsbuf->texture)->stencil; + + /* First clear depth or combined depth/stencil. */ + if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) || + (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) { + enum pipe_format pfmt = pfb->zsbuf->format; + uint32_t clear_value; + uint32_t mask = 0; + + if (has_separate_stencil) { + pfmt = util_format_get_depth_only(pfb->zsbuf->format); + clear_value = util_pack_z(pfmt, batch->clear_depth); + } else { + pfmt = pfb->zsbuf->format; + clear_value = + util_pack_z_stencil(pfmt, batch->clear_depth, batch->clear_stencil); + } + + if (buffers & PIPE_CLEAR_DEPTH) + mask |= 0x1; + + if (!has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) + mask |= 0x2; + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1); + OUT_RING(ring, + A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(fd6_pipe2color(pfmt))); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); + OUT_RING(ring, A6XX_RB_BLIT_INFO_GMEM | + // XXX UNK0 for separate stencil ?? + A6XX_RB_BLIT_INFO_DEPTH | + A6XX_RB_BLIT_INFO_CLEAR_MASK(mask)); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + OUT_RING(ring, gmem->zsbuf_base[0]); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1); + OUT_RING(ring, clear_value); + + fd6_emit_blit(batch, ring); + } + + /* Then clear the separate stencil buffer in case of 32 bit depth + * formats with separate stencil. 
*/ + if (has_separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) { + OUT_PKT4(ring, REG_A6XX_RB_BLIT_DST_INFO, 1); + OUT_RING(ring, A6XX_RB_BLIT_DST_INFO_TILE_MODE(TILE6_LINEAR) | + A6XX_RB_BLIT_DST_INFO_SAMPLES(samples) | + A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(FMT6_8_UINT)); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); + OUT_RING(ring, A6XX_RB_BLIT_INFO_GMEM | + // A6XX_RB_BLIT_INFO_UNK0 | + A6XX_RB_BLIT_INFO_DEPTH | + A6XX_RB_BLIT_INFO_CLEAR_MASK(0x1)); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_BASE_GMEM, 1); + OUT_RING(ring, gmem->zsbuf_base[1]); + + OUT_PKT4(ring, REG_A6XX_RB_UNKNOWN_88D0, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 1); + OUT_RING(ring, batch->clear_stencil & 0xff); + + fd6_emit_blit(batch, ring); + } } /* @@ -1108,48 +1111,48 @@ emit_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) static void emit_restore_blits(struct fd_batch *batch, struct fd_ringbuffer *ring) { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - if (batch->restore & FD_BUFFER_COLOR) { - unsigned i; - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i))) - continue; - emit_restore_blit(batch, ring, gmem->cbuf_base[i], pfb->cbufs[i], - FD_BUFFER_COLOR); - } - } - - if (batch->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - - if (!rsc->stencil || (batch->restore & FD_BUFFER_DEPTH)) { - emit_restore_blit(batch, ring, gmem->zsbuf_base[0], pfb->zsbuf, - FD_BUFFER_DEPTH); - } - if (rsc->stencil && (batch->restore & FD_BUFFER_STENCIL)) { - emit_restore_blit(batch, ring, gmem->zsbuf_base[1], pfb->zsbuf, - FD_BUFFER_STENCIL); - } - } + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + if (batch->restore & FD_BUFFER_COLOR) { + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(batch->restore & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_restore_blit(batch, ring, gmem->cbuf_base[i], pfb->cbufs[i], + FD_BUFFER_COLOR); + } + } + + if (batch->restore & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + + if (!rsc->stencil || (batch->restore & FD_BUFFER_DEPTH)) { + emit_restore_blit(batch, ring, gmem->zsbuf_base[0], pfb->zsbuf, + FD_BUFFER_DEPTH); + } + if (rsc->stencil && (batch->restore & FD_BUFFER_STENCIL)) { + emit_restore_blit(batch, ring, gmem->zsbuf_base[1], pfb->zsbuf, + FD_BUFFER_STENCIL); + } + } } static void prepare_tile_setup_ib(struct fd_batch *batch) { - if (!(batch->restore || batch->fast_cleared)) - return; + if (!(batch->restore || batch->fast_cleared)) + return; - batch->tile_setup = fd_submit_new_ringbuffer(batch->submit, 0x1000, - FD_RINGBUFFER_STREAMING); + batch->tile_setup = + fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); - set_blit_scissor(batch, batch->tile_setup); + set_blit_scissor(batch, batch->tile_setup); - emit_restore_blits(batch, batch->tile_setup); - emit_clears(batch, batch->tile_setup); + emit_restore_blits(batch, batch->tile_setup); + emit_clears(batch, batch->tile_setup); } /* @@ -1164,104 +1167,104 @@ fd6_emit_tile_mem2gmem(struct fd_batch *batch, const struct fd_tile *tile) static void fd6_emit_tile_renderprep(struct fd_batch *batch, const struct fd_tile *tile) { - if (!batch->tile_setup) - return; - - 
trace_start_clear_restore(&batch->trace, batch->fast_cleared); - if (batch->fast_cleared || !use_hw_binning(batch)) { - fd6_emit_ib(batch->gmem, batch->tile_setup); - } else { - emit_conditional_ib(batch, tile, batch->tile_setup); - } - trace_end_clear_restore(&batch->trace); + if (!batch->tile_setup) + return; + + trace_start_clear_restore(&batch->trace, batch->fast_cleared); + if (batch->fast_cleared || !use_hw_binning(batch)) { + fd6_emit_ib(batch->gmem, batch->tile_setup); + } else { + emit_conditional_ib(batch, tile, batch->tile_setup); + } + trace_end_clear_restore(&batch->trace); } static bool blit_can_resolve(enum pipe_format format) { - const struct util_format_description *desc = util_format_description(format); - - /* blit event can only do resolve for simple cases: - * averaging samples as unsigned integers or choosing only one sample - */ - if (util_format_is_snorm(format) || util_format_is_srgb(format)) - return false; - - /* can't do formats with larger channel sizes - * note: this includes all float formats - * note2: single channel integer formats seem OK - */ - if (desc->channel[0].size > 10) - return false; - - switch (format) { - /* for unknown reasons blit event can't msaa resolve these formats when tiled - * likely related to these formats having different layout from other cpp=2 formats - */ - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R8G8_UINT: - case PIPE_FORMAT_R8G8_SINT: - /* TODO: this one should be able to work? */ - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return false; - default: - break; - } - - return true; + const struct util_format_description *desc = util_format_description(format); + + /* blit event can only do resolve for simple cases: + * averaging samples as unsigned integers or choosing only one sample + */ + if (util_format_is_snorm(format) || util_format_is_srgb(format)) + return false; + + /* can't do formats with larger channel sizes + * note: this includes all float formats + * note2: single channel integer formats seem OK + */ + if (desc->channel[0].size > 10) + return false; + + switch (format) { + /* for unknown reasons blit event can't msaa resolve these formats when tiled + * likely related to these formats having different layout from other cpp=2 + * formats + */ + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8_UINT: + case PIPE_FORMAT_R8G8_SINT: + /* TODO: this one should be able to work? 
*/ + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return false; + default: + break; + } + + return true; } static bool needs_resolve(struct pipe_surface *psurf) { - return psurf->nr_samples && (psurf->nr_samples != psurf->texture->nr_samples); + return psurf->nr_samples && + (psurf->nr_samples != psurf->texture->nr_samples); } static void -emit_resolve_blit(struct fd_batch *batch, - struct fd_ringbuffer *ring, - uint32_t base, - struct pipe_surface *psurf, - unsigned buffer) - assert_dt +emit_resolve_blit(struct fd_batch *batch, struct fd_ringbuffer *ring, + uint32_t base, struct pipe_surface *psurf, + unsigned buffer) assert_dt { - uint32_t info = 0; - bool stencil = false; - - if (!fd_resource(psurf->texture)->valid) - return; - - /* if we need to resolve, but cannot with BLIT event, we instead need - * to generate per-tile CP_BLIT (r2d) commands: - * - * The separate-stencil is a special case, we might need to use CP_BLIT - * for depth, but we can still resolve stencil with a BLIT event - */ - if (needs_resolve(psurf) && !blit_can_resolve(psurf->format) && - (buffer != FD_BUFFER_STENCIL)) { - fd6_resolve_tile(batch, ring, base, psurf); - return; - } - - switch (buffer) { - case FD_BUFFER_COLOR: - break; - case FD_BUFFER_STENCIL: - info |= A6XX_RB_BLIT_INFO_UNK0; - stencil = true; - break; - case FD_BUFFER_DEPTH: - info |= A6XX_RB_BLIT_INFO_DEPTH; - break; - } - - if (util_format_is_pure_integer(psurf->format) || util_format_is_depth_or_stencil(psurf->format)) - info |= A6XX_RB_BLIT_INFO_SAMPLE_0; - - OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); - OUT_RING(ring, info); - - emit_blit(batch, ring, base, psurf, stencil); + uint32_t info = 0; + bool stencil = false; + + if (!fd_resource(psurf->texture)->valid) + return; + + /* if we need to resolve, but cannot with BLIT event, we instead need + * to generate per-tile CP_BLIT (r2d) commands: + * + * The separate-stencil is a special case, we might need to use CP_BLIT + * for depth, but we can still resolve stencil with a BLIT event + */ + if (needs_resolve(psurf) && !blit_can_resolve(psurf->format) && + (buffer != FD_BUFFER_STENCIL)) { + fd6_resolve_tile(batch, ring, base, psurf); + return; + } + + switch (buffer) { + case FD_BUFFER_COLOR: + break; + case FD_BUFFER_STENCIL: + info |= A6XX_RB_BLIT_INFO_UNK0; + stencil = true; + break; + case FD_BUFFER_DEPTH: + info |= A6XX_RB_BLIT_INFO_DEPTH; + break; + } + + if (util_format_is_pure_integer(psurf->format) || + util_format_is_depth_or_stencil(psurf->format)) + info |= A6XX_RB_BLIT_INFO_SAMPLE_0; + + OUT_PKT4(ring, REG_A6XX_RB_BLIT_INFO, 1); + OUT_RING(ring, info); + + emit_blit(batch, ring, base, psurf, stencil); } /* @@ -1269,300 +1272,294 @@ emit_resolve_blit(struct fd_batch *batch, */ static void -prepare_tile_fini_ib(struct fd_batch *batch) - assert_dt +prepare_tile_fini_ib(struct fd_batch *batch) assert_dt { - const struct fd_gmem_stateobj *gmem = batch->gmem_state; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_ringbuffer *ring; - - batch->tile_fini = fd_submit_new_ringbuffer(batch->submit, 0x1000, - FD_RINGBUFFER_STREAMING); - ring = batch->tile_fini; - - set_blit_scissor(batch, ring); - - if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - - if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) { - emit_resolve_blit(batch, ring, - gmem->zsbuf_base[0], pfb->zsbuf, - FD_BUFFER_DEPTH); - } - if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) { - emit_resolve_blit(batch, ring, - 
gmem->zsbuf_base[1], pfb->zsbuf, - FD_BUFFER_STENCIL); - } - } - - if (batch->resolve & FD_BUFFER_COLOR) { - unsigned i; - for (i = 0; i < pfb->nr_cbufs; i++) { - if (!pfb->cbufs[i]) - continue; - if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) - continue; - emit_resolve_blit(batch, ring, gmem->cbuf_base[i], pfb->cbufs[i], - FD_BUFFER_COLOR); - } - } + const struct fd_gmem_stateobj *gmem = batch->gmem_state; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_ringbuffer *ring; + + batch->tile_fini = + fd_submit_new_ringbuffer(batch->submit, 0x1000, FD_RINGBUFFER_STREAMING); + ring = batch->tile_fini; + + set_blit_scissor(batch, ring); + + if (batch->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL)) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + + if (!rsc->stencil || (batch->resolve & FD_BUFFER_DEPTH)) { + emit_resolve_blit(batch, ring, gmem->zsbuf_base[0], pfb->zsbuf, + FD_BUFFER_DEPTH); + } + if (rsc->stencil && (batch->resolve & FD_BUFFER_STENCIL)) { + emit_resolve_blit(batch, ring, gmem->zsbuf_base[1], pfb->zsbuf, + FD_BUFFER_STENCIL); + } + } + + if (batch->resolve & FD_BUFFER_COLOR) { + unsigned i; + for (i = 0; i < pfb->nr_cbufs; i++) { + if (!pfb->cbufs[i]) + continue; + if (!(batch->resolve & (PIPE_CLEAR_COLOR0 << i))) + continue; + emit_resolve_blit(batch, ring, gmem->cbuf_base[i], pfb->cbufs[i], + FD_BUFFER_COLOR); + } + } } static void fd6_emit_tile(struct fd_batch *batch, const struct fd_tile *tile) { - if (!use_hw_binning(batch)) { - fd6_emit_ib(batch->gmem, batch->draw); - } else { - emit_conditional_ib(batch, tile, batch->draw); - } - - if (batch->epilogue) - fd6_emit_ib(batch->gmem, batch->epilogue); + if (!use_hw_binning(batch)) { + fd6_emit_ib(batch->gmem, batch->draw); + } else { + emit_conditional_ib(batch, tile, batch->draw); + } + + if (batch->epilogue) + fd6_emit_ib(batch->gmem, batch->epilogue); } static void fd6_emit_tile_gmem2mem(struct fd_batch *batch, const struct fd_tile *tile) { - struct fd_ringbuffer *ring = batch->gmem; - - if (use_hw_binning(batch)) { - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); - } - - OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); - OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | - CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | - CP_SET_DRAW_STATE__0_GROUP_ID(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); - OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); - - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); - OUT_RING(ring, 0x0); - - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE)); - emit_marker6(ring, 7); - - trace_start_resolve(&batch->trace); - if (batch->fast_cleared || !use_hw_binning(batch)) { - fd6_emit_ib(batch->gmem, batch->tile_fini); - } else { - emit_conditional_ib(batch, tile, batch->tile_fini); - } - trace_end_resolve(&batch->trace); + struct fd_ringbuffer *ring = batch->gmem; + + if (use_hw_binning(batch)) { + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_ENDVIS)); + } + + OUT_PKT7(ring, CP_SET_DRAW_STATE, 3); + OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) | + CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS | + CP_SET_DRAW_STATE__0_GROUP_ID(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0)); + OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0)); + + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); + OUT_RING(ring, 0x0); + + emit_marker6(ring, 7); + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_RESOLVE)); + 
emit_marker6(ring, 7); + + trace_start_resolve(&batch->trace); + if (batch->fast_cleared || !use_hw_binning(batch)) { + fd6_emit_ib(batch->gmem, batch->tile_fini); + } else { + emit_conditional_ib(batch, tile, batch->tile_fini); + } + trace_end_resolve(&batch->trace); } static void fd6_emit_tile_fini(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; + struct fd_ringbuffer *ring = batch->gmem; - emit_common_fini(batch); + emit_common_fini(batch); - OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE); + OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1); + OUT_RING(ring, A6XX_GRAS_LRZ_CNTL_ENABLE); - fd6_emit_lrz_flush(ring); + fd6_emit_lrz_flush(ring); - fd6_event_write(batch, ring, PC_CCU_RESOLVE_TS, true); + fd6_event_write(batch, ring, PC_CCU_RESOLVE_TS, true); - if (use_hw_binning(batch)) { - check_vsc_overflow(batch->ctx); - } + if (use_hw_binning(batch)) { + check_vsc_overflow(batch->ctx); + } } static void -emit_sysmem_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) - assert_dt +emit_sysmem_clears(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt { - struct fd_context *ctx = batch->ctx; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_context *ctx = batch->ctx; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - uint32_t buffers = batch->fast_cleared; + uint32_t buffers = batch->fast_cleared; - if (!buffers) - return; + if (!buffers) + return; - trace_start_clear_restore(&batch->trace, buffers); + trace_start_clear_restore(&batch->trace, buffers); - if (buffers & PIPE_CLEAR_COLOR) { - for (int i = 0; i < pfb->nr_cbufs; i++) { - union pipe_color_union color = batch->clear_color[i]; + if (buffers & PIPE_CLEAR_COLOR) { + for (int i = 0; i < pfb->nr_cbufs; i++) { + union pipe_color_union color = batch->clear_color[i]; - if (!pfb->cbufs[i]) - continue; + if (!pfb->cbufs[i]) + continue; - if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) - continue; + if (!(buffers & (PIPE_CLEAR_COLOR0 << i))) + continue; - fd6_clear_surface(ctx, ring, - pfb->cbufs[i], pfb->width, pfb->height, &color); - } - } - if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - union pipe_color_union value = {}; + fd6_clear_surface(ctx, ring, pfb->cbufs[i], pfb->width, pfb->height, + &color); + } + } + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + union pipe_color_union value = {}; - const bool has_depth = pfb->zsbuf; - struct pipe_resource *separate_stencil = - has_depth && fd_resource(pfb->zsbuf->texture)->stencil ? - &fd_resource(pfb->zsbuf->texture)->stencil->b.b : NULL; + const bool has_depth = pfb->zsbuf; + struct pipe_resource *separate_stencil = + has_depth && fd_resource(pfb->zsbuf->texture)->stencil + ? 
&fd_resource(pfb->zsbuf->texture)->stencil->b.b + : NULL; - if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) || - (!separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) { - value.f[0] = batch->clear_depth; - value.ui[1] = batch->clear_stencil; - fd6_clear_surface(ctx, ring, - pfb->zsbuf, pfb->width, pfb->height, &value); - } + if ((has_depth && (buffers & PIPE_CLEAR_DEPTH)) || + (!separate_stencil && (buffers & PIPE_CLEAR_STENCIL))) { + value.f[0] = batch->clear_depth; + value.ui[1] = batch->clear_stencil; + fd6_clear_surface(ctx, ring, pfb->zsbuf, pfb->width, pfb->height, + &value); + } - if (separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) { - value.ui[0] = batch->clear_stencil; + if (separate_stencil && (buffers & PIPE_CLEAR_STENCIL)) { + value.ui[0] = batch->clear_stencil; - struct pipe_surface stencil_surf = *pfb->zsbuf; - stencil_surf.format = PIPE_FORMAT_S8_UINT; - stencil_surf.texture = separate_stencil; + struct pipe_surface stencil_surf = *pfb->zsbuf; + stencil_surf.format = PIPE_FORMAT_S8_UINT; + stencil_surf.texture = separate_stencil; - fd6_clear_surface(ctx, ring, - &stencil_surf, pfb->width, pfb->height, &value); - } - } + fd6_clear_surface(ctx, ring, &stencil_surf, pfb->width, pfb->height, + &value); + } + } - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - trace_end_clear_restore(&batch->trace); + trace_end_clear_restore(&batch->trace); } static void setup_tess_buffers(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - batch->tessfactor_bo = fd_bo_new(ctx->screen->dev, - batch->tessfactor_size, - DRM_FREEDRENO_GEM_TYPE_KMEM, "tessfactor"); + batch->tessfactor_bo = fd_bo_new(ctx->screen->dev, batch->tessfactor_size, + DRM_FREEDRENO_GEM_TYPE_KMEM, "tessfactor"); - batch->tessparam_bo = fd_bo_new(ctx->screen->dev, - batch->tessparam_size, - DRM_FREEDRENO_GEM_TYPE_KMEM, "tessparam"); + batch->tessparam_bo = fd_bo_new(ctx->screen->dev, batch->tessparam_size, + DRM_FREEDRENO_GEM_TYPE_KMEM, "tessparam"); - OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2); - OUT_RELOC(ring, batch->tessfactor_bo, 0, 0, 0); + OUT_PKT4(ring, REG_A6XX_PC_TESSFACTOR_ADDR, 2); + OUT_RELOC(ring, batch->tessfactor_bo, 0, 0, 0); - batch->tess_addrs_constobj->cur = batch->tess_addrs_constobj->start; - OUT_RELOC(batch->tess_addrs_constobj, batch->tessparam_bo, 0, 0, 0); - OUT_RELOC(batch->tess_addrs_constobj, batch->tessfactor_bo, 0, 0, 0); + batch->tess_addrs_constobj->cur = batch->tess_addrs_constobj->start; + OUT_RELOC(batch->tess_addrs_constobj, batch->tessparam_bo, 0, 0, 0); + OUT_RELOC(batch->tess_addrs_constobj, batch->tessfactor_bo, 0, 0, 0); } static void -fd6_emit_sysmem_prep(struct fd_batch *batch) - assert_dt +fd6_emit_sysmem_prep(struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->gmem; - struct fd_screen *screen = batch->ctx->screen; + struct fd_ringbuffer *ring = batch->gmem; + struct fd_screen *screen = batch->ctx->screen; - fd6_emit_restore(batch, ring); - fd6_emit_lrz_flush(ring); + fd6_emit_restore(batch, ring); + fd6_emit_lrz_flush(ring); - if (batch->prologue) { - if (!batch->nondraw) { - trace_start_prologue(&batch->trace); - } - fd6_emit_ib(ring, batch->prologue); - if (!batch->nondraw) { - trace_end_prologue(&batch->trace); - } - } + if (batch->prologue) { + if (!batch->nondraw) { + trace_start_prologue(&batch->trace); + } + fd6_emit_ib(ring, batch->prologue); + if (!batch->nondraw) { + 
trace_end_prologue(&batch->trace); + } + } - /* remaining setup below here does not apply to blit/compute: */ - if (batch->nondraw) - return; + /* remaining setup below here does not apply to blit/compute: */ + if (batch->nondraw) + return; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - if (pfb->width > 0 && pfb->height > 0) - set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1); - else - set_scissor(ring, 0, 0, 0, 0); + if (pfb->width > 0 && pfb->height > 0) + set_scissor(ring, 0, 0, pfb->width - 1, pfb->height - 1); + else + set_scissor(ring, 0, 0, 0, 0); - set_window_offset(ring, 0, 0); + set_window_offset(ring, 0, 0); - set_bin_size(ring, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */ + set_bin_size(ring, 0, 0, 0xc00000); /* 0xc00000 = BYPASS? */ - emit_sysmem_clears(batch, ring); + emit_sysmem_clears(batch, ring); - emit_marker6(ring, 7); - OUT_PKT7(ring, CP_SET_MARKER, 1); - OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); - emit_marker6(ring, 7); + emit_marker6(ring, 7); + OUT_PKT7(ring, CP_SET_MARKER, 1); + OUT_RING(ring, A6XX_CP_SET_MARKER_0_MODE(RM6_BYPASS)); + emit_marker6(ring, 7); - if (batch->tessellation) - setup_tess_buffers(batch, ring); + if (batch->tessellation) + setup_tess_buffers(batch, ring); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - /* blob controls "local" in IB2, but I think that is not required */ - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); - OUT_RING(ring, 0x1); + /* blob controls "local" in IB2, but I think that is not required */ + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_LOCAL, 1); + OUT_RING(ring, 0x1); - fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); - fd6_cache_inv(batch, ring); + fd6_event_write(batch, ring, PC_CCU_INVALIDATE_COLOR, false); + fd6_cache_inv(batch, ring); - fd_wfi(batch, ring); - OUT_REG(ring, A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_bypass)); + fd_wfi(batch, ring); + OUT_REG(ring, + A6XX_RB_CCU_CNTL(.offset = screen->info.a6xx.ccu_offset_bypass)); - /* enable stream-out, with sysmem there is only one pass: */ - OUT_REG(ring, A6XX_VPC_SO_DISABLE(false)); + /* enable stream-out, with sysmem there is only one pass: */ + OUT_REG(ring, A6XX_VPC_SO_DISABLE(false)); - OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); - OUT_RING(ring, 0x1); + OUT_PKT7(ring, CP_SET_VISIBILITY_OVERRIDE, 1); + OUT_RING(ring, 0x1); - emit_zs(ring, pfb->zsbuf, NULL); - emit_mrt(ring, pfb, NULL); - emit_msaa(ring, pfb->samples); + emit_zs(ring, pfb->zsbuf, NULL); + emit_mrt(ring, pfb, NULL); + emit_msaa(ring, pfb->samples); - update_render_cntl(batch, pfb, false); + update_render_cntl(batch, pfb, false); - emit_common_init(batch); + emit_common_init(batch); } static void fd6_emit_sysmem_fini(struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->gmem; + struct fd_ringbuffer *ring = batch->gmem; - emit_common_fini(batch); + emit_common_fini(batch); - if (batch->epilogue) - fd6_emit_ib(batch->gmem, batch->epilogue); + if (batch->epilogue) + fd6_emit_ib(batch->gmem, batch->epilogue); - OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); - OUT_RING(ring, 0x0); + OUT_PKT7(ring, CP_SKIP_IB2_ENABLE_GLOBAL, 1); + OUT_RING(ring, 0x0); - fd6_emit_lrz_flush(ring); + fd6_emit_lrz_flush(ring); - fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); - fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); + fd6_event_write(batch, ring, PC_CCU_FLUSH_COLOR_TS, true); + 
fd6_event_write(batch, ring, PC_CCU_FLUSH_DEPTH_TS, true); } void -fd6_gmem_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_gmem_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - - ctx->emit_tile_init = fd6_emit_tile_init; - ctx->emit_tile_prep = fd6_emit_tile_prep; - ctx->emit_tile_mem2gmem = fd6_emit_tile_mem2gmem; - ctx->emit_tile_renderprep = fd6_emit_tile_renderprep; - ctx->emit_tile = fd6_emit_tile; - ctx->emit_tile_gmem2mem = fd6_emit_tile_gmem2mem; - ctx->emit_tile_fini = fd6_emit_tile_fini; - ctx->emit_sysmem_prep = fd6_emit_sysmem_prep; - ctx->emit_sysmem_fini = fd6_emit_sysmem_fini; + struct fd_context *ctx = fd_context(pctx); + + ctx->emit_tile_init = fd6_emit_tile_init; + ctx->emit_tile_prep = fd6_emit_tile_prep; + ctx->emit_tile_mem2gmem = fd6_emit_tile_mem2gmem; + ctx->emit_tile_renderprep = fd6_emit_tile_renderprep; + ctx->emit_tile = fd6_emit_tile; + ctx->emit_tile_gmem2mem = fd6_emit_tile_gmem2mem; + ctx->emit_tile_fini = fd6_emit_tile_fini; + ctx->emit_sysmem_prep = fd6_emit_sysmem_prep; + ctx->emit_sysmem_fini = fd6_emit_sysmem_fini; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_image.c b/src/gallium/drivers/freedreno/a6xx/fd6_image.c index 683664e0..cc8ee0a 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_image.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_image.c @@ -30,318 +30,329 @@ #include "freedreno_resource.h" #include "freedreno_state.h" -#include "fd6_image.h" #include "fd6_format.h" +#include "fd6_image.h" #include "fd6_resource.h" #include "fd6_texture.h" struct fd6_image { - struct pipe_resource *prsc; - enum pipe_format pfmt; - enum a6xx_tex_type type; - bool srgb; - uint32_t cpp; - uint32_t level; - uint32_t width; - uint32_t height; - uint32_t depth; - uint32_t pitch; - uint32_t array_pitch; - struct fd_bo *bo; - uint32_t ubwc_offset; - uint32_t offset; - bool buffer; + struct pipe_resource *prsc; + enum pipe_format pfmt; + enum a6xx_tex_type type; + bool srgb; + uint32_t cpp; + uint32_t level; + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t pitch; + uint32_t array_pitch; + struct fd_bo *bo; + uint32_t ubwc_offset; + uint32_t offset; + bool buffer; }; -static void translate_image(struct fd6_image *img, const struct pipe_image_view *pimg) +static void +translate_image(struct fd6_image *img, const struct pipe_image_view *pimg) { - enum pipe_format format = pimg->format; - struct pipe_resource *prsc = pimg->resource; - struct fd_resource *rsc = fd_resource(prsc); - - if (!prsc) { - memset(img, 0, sizeof(*img)); - return; - } - - img->prsc = prsc; - img->pfmt = format; - img->type = fd6_tex_type(prsc->target); - img->srgb = util_format_is_srgb(format); - img->cpp = rsc->layout.cpp; - img->bo = rsc->bo; - - /* Treat cube textures as 2d-array: */ - if (img->type == A6XX_TEX_CUBE) - img->type = A6XX_TEX_2D; - - if (prsc->target == PIPE_BUFFER) { - img->buffer = true; - img->ubwc_offset = 0; /* not valid for buffers */ - img->offset = pimg->u.buf.offset; - img->pitch = 0; - img->array_pitch = 0; - - /* size is encoded with low 15b in WIDTH and high bits in - * HEIGHT, in units of elements: - */ - unsigned sz = pimg->u.buf.size / util_format_get_blocksize(format); - img->width = sz & MASK(15); - img->height = sz >> 15; - img->depth = 0; - img->level = 0; - } else { - img->buffer = false; - - unsigned lvl = pimg->u.tex.level; - unsigned layers = pimg->u.tex.last_layer - pimg->u.tex.first_layer + 1; - - img->ubwc_offset = fd_resource_ubwc_offset(rsc, 
lvl, pimg->u.tex.first_layer); - img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer); - img->pitch = fd_resource_pitch(rsc, lvl); - - switch (prsc->target) { - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_2D: - img->array_pitch = rsc->layout.layer_size; - img->depth = 1; - break; - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - img->array_pitch = rsc->layout.layer_size; - // TODO the CUBE/CUBE_ARRAY might need to be layers/6 for tex state, - // but empirically for ibo state it shouldn't be divided. - img->depth = layers; - break; - case PIPE_TEXTURE_3D: - img->array_pitch = fd_resource_slice(rsc, lvl)->size0; - img->depth = u_minify(prsc->depth0, lvl); - break; - default: - break; - } - - img->level = lvl; - img->width = u_minify(prsc->width0, lvl); - img->height = u_minify(prsc->height0, lvl); - } + enum pipe_format format = pimg->format; + struct pipe_resource *prsc = pimg->resource; + struct fd_resource *rsc = fd_resource(prsc); + + if (!prsc) { + memset(img, 0, sizeof(*img)); + return; + } + + img->prsc = prsc; + img->pfmt = format; + img->type = fd6_tex_type(prsc->target); + img->srgb = util_format_is_srgb(format); + img->cpp = rsc->layout.cpp; + img->bo = rsc->bo; + + /* Treat cube textures as 2d-array: */ + if (img->type == A6XX_TEX_CUBE) + img->type = A6XX_TEX_2D; + + if (prsc->target == PIPE_BUFFER) { + img->buffer = true; + img->ubwc_offset = 0; /* not valid for buffers */ + img->offset = pimg->u.buf.offset; + img->pitch = 0; + img->array_pitch = 0; + + /* size is encoded with low 15b in WIDTH and high bits in + * HEIGHT, in units of elements: + */ + unsigned sz = pimg->u.buf.size / util_format_get_blocksize(format); + img->width = sz & MASK(15); + img->height = sz >> 15; + img->depth = 0; + img->level = 0; + } else { + img->buffer = false; + + unsigned lvl = pimg->u.tex.level; + unsigned layers = pimg->u.tex.last_layer - pimg->u.tex.first_layer + 1; + + img->ubwc_offset = + fd_resource_ubwc_offset(rsc, lvl, pimg->u.tex.first_layer); + img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer); + img->pitch = fd_resource_pitch(rsc, lvl); + + switch (prsc->target) { + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_2D: + img->array_pitch = rsc->layout.layer_size; + img->depth = 1; + break; + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + img->array_pitch = rsc->layout.layer_size; + // TODO the CUBE/CUBE_ARRAY might need to be layers/6 for tex state, + // but empirically for ibo state it shouldn't be divided. 
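(Illustrative aside, not part of the patch: the translate_image()/translate_buf() hunks above describe encoding a buffer's size in elements across the low 15 bits of WIDTH and the remaining high bits of HEIGHT. A minimal standalone sketch of that split, assuming MASK(n) expands to a low-bit mask as in the driver's helper macros:)

#include <stdint.h>

#define MASK(n) ((1u << (n)) - 1)   /* assumed equivalent of the driver macro */

/* Split an element count into the WIDTH (low 15 bits) and HEIGHT
 * (remaining high bits) fields, mirroring the comment above. */
static inline void
encode_buffer_size(uint32_t num_elements, uint32_t *width, uint32_t *height)
{
   *width = num_elements & MASK(15);
   *height = num_elements >> 15;
}

/* Recombine the two fields back into the element count. */
static inline uint32_t
decode_buffer_size(uint32_t width, uint32_t height)
{
   return (height << 15) | width;
}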
+ img->depth = layers; + break; + case PIPE_TEXTURE_3D: + img->array_pitch = fd_resource_slice(rsc, lvl)->size0; + img->depth = u_minify(prsc->depth0, lvl); + break; + default: + break; + } + + img->level = lvl; + img->width = u_minify(prsc->width0, lvl); + img->height = u_minify(prsc->height0, lvl); + } } -static void translate_buf(struct fd6_image *img, const struct pipe_shader_buffer *pimg) +static void +translate_buf(struct fd6_image *img, const struct pipe_shader_buffer *pimg) { - enum pipe_format format = PIPE_FORMAT_R32_UINT; - struct pipe_resource *prsc = pimg->buffer; - struct fd_resource *rsc = fd_resource(prsc); - - if (!prsc) { - memset(img, 0, sizeof(*img)); - return; - } - - img->prsc = prsc; - img->pfmt = format; - img->type = fd6_tex_type(prsc->target); - img->srgb = util_format_is_srgb(format); - img->cpp = rsc->layout.cpp; - img->bo = rsc->bo; - img->buffer = true; - - img->ubwc_offset = 0; /* not valid for buffers */ - img->offset = pimg->buffer_offset; - img->pitch = 0; - img->array_pitch = 0; - img->level = 0; - - /* size is encoded with low 15b in WIDTH and high bits in HEIGHT, - * in units of elements: - */ - unsigned sz = pimg->buffer_size / 4; - img->width = sz & MASK(15); - img->height = sz >> 15; - img->depth = 0; + enum pipe_format format = PIPE_FORMAT_R32_UINT; + struct pipe_resource *prsc = pimg->buffer; + struct fd_resource *rsc = fd_resource(prsc); + + if (!prsc) { + memset(img, 0, sizeof(*img)); + return; + } + + img->prsc = prsc; + img->pfmt = format; + img->type = fd6_tex_type(prsc->target); + img->srgb = util_format_is_srgb(format); + img->cpp = rsc->layout.cpp; + img->bo = rsc->bo; + img->buffer = true; + + img->ubwc_offset = 0; /* not valid for buffers */ + img->offset = pimg->buffer_offset; + img->pitch = 0; + img->array_pitch = 0; + img->level = 0; + + /* size is encoded with low 15b in WIDTH and high bits in HEIGHT, + * in units of elements: + */ + unsigned sz = pimg->buffer_size / 4; + img->width = sz & MASK(15); + img->height = sz >> 15; + img->depth = 0; } -static void emit_image_tex(struct fd_ringbuffer *ring, struct fd6_image *img) +static void +emit_image_tex(struct fd_ringbuffer *ring, struct fd6_image *img) { - struct fd_resource *rsc = fd_resource(img->prsc); - bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, img->level); - - OUT_RING(ring, fd6_tex_const_0(img->prsc, img->level, img->pfmt, - PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, - PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); - OUT_RING(ring, A6XX_TEX_CONST_1_WIDTH(img->width) | - A6XX_TEX_CONST_1_HEIGHT(img->height)); - OUT_RING(ring, - COND(img->buffer, A6XX_TEX_CONST_2_UNK4 | A6XX_TEX_CONST_2_UNK31) | - A6XX_TEX_CONST_2_TYPE(img->type) | - A6XX_TEX_CONST_2_PITCH(img->pitch)); - OUT_RING(ring, A6XX_TEX_CONST_3_ARRAY_PITCH(img->array_pitch) | - COND(ubwc_enabled, A6XX_TEX_CONST_3_FLAG) | - COND(rsc->layout.tile_all, A6XX_TEX_CONST_3_TILE_ALL)); - if (img->bo) { - OUT_RELOC(ring, img->bo, img->offset, - (uint64_t)A6XX_TEX_CONST_5_DEPTH(img->depth) << 32, 0); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, A6XX_TEX_CONST_5_DEPTH(img->depth)); - } - - OUT_RING(ring, 0x00000000); /* texconst6 */ - - if (ubwc_enabled) { - uint32_t block_width, block_height; - fdl6_get_ubwc_blockwidth(&rsc->layout, &block_width, &block_height); - - OUT_RELOC(ring, rsc->bo, img->ubwc_offset, 0, 0); - OUT_RING(ring, A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(rsc->layout.ubwc_layer_size >> 2)); - OUT_RING(ring, - A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(fdl_ubwc_pitch(&rsc->layout, img->level)) | - 
A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil(DIV_ROUND_UP(img->width, block_width))) | - A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil(DIV_ROUND_UP(img->height, block_height)))); - } else { - OUT_RING(ring, 0x00000000); /* texconst7 */ - OUT_RING(ring, 0x00000000); /* texconst8 */ - OUT_RING(ring, 0x00000000); /* texconst9 */ - OUT_RING(ring, 0x00000000); /* texconst10 */ - } - - OUT_RING(ring, 0x00000000); /* texconst11 */ - OUT_RING(ring, 0x00000000); /* texconst12 */ - OUT_RING(ring, 0x00000000); /* texconst13 */ - OUT_RING(ring, 0x00000000); /* texconst14 */ - OUT_RING(ring, 0x00000000); /* texconst15 */ + struct fd_resource *rsc = fd_resource(img->prsc); + bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, img->level); + + OUT_RING(ring, + fd6_tex_const_0(img->prsc, img->level, img->pfmt, PIPE_SWIZZLE_X, + PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W)); + OUT_RING(ring, A6XX_TEX_CONST_1_WIDTH(img->width) | + A6XX_TEX_CONST_1_HEIGHT(img->height)); + OUT_RING(ring, + COND(img->buffer, A6XX_TEX_CONST_2_UNK4 | A6XX_TEX_CONST_2_UNK31) | + A6XX_TEX_CONST_2_TYPE(img->type) | + A6XX_TEX_CONST_2_PITCH(img->pitch)); + OUT_RING(ring, A6XX_TEX_CONST_3_ARRAY_PITCH(img->array_pitch) | + COND(ubwc_enabled, A6XX_TEX_CONST_3_FLAG) | + COND(rsc->layout.tile_all, A6XX_TEX_CONST_3_TILE_ALL)); + if (img->bo) { + OUT_RELOC(ring, img->bo, img->offset, + (uint64_t)A6XX_TEX_CONST_5_DEPTH(img->depth) << 32, 0); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, A6XX_TEX_CONST_5_DEPTH(img->depth)); + } + + OUT_RING(ring, 0x00000000); /* texconst6 */ + + if (ubwc_enabled) { + uint32_t block_width, block_height; + fdl6_get_ubwc_blockwidth(&rsc->layout, &block_width, &block_height); + + OUT_RELOC(ring, rsc->bo, img->ubwc_offset, 0, 0); + OUT_RING(ring, A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH( + rsc->layout.ubwc_layer_size >> 2)); + OUT_RING(ring, A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH( + fdl_ubwc_pitch(&rsc->layout, img->level)) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil( + DIV_ROUND_UP(img->width, block_width))) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil( + DIV_ROUND_UP(img->height, block_height)))); + } else { + OUT_RING(ring, 0x00000000); /* texconst7 */ + OUT_RING(ring, 0x00000000); /* texconst8 */ + OUT_RING(ring, 0x00000000); /* texconst9 */ + OUT_RING(ring, 0x00000000); /* texconst10 */ + } + + OUT_RING(ring, 0x00000000); /* texconst11 */ + OUT_RING(ring, 0x00000000); /* texconst12 */ + OUT_RING(ring, 0x00000000); /* texconst13 */ + OUT_RING(ring, 0x00000000); /* texconst14 */ + OUT_RING(ring, 0x00000000); /* texconst15 */ } void -fd6_emit_image_tex(struct fd_ringbuffer *ring, const struct pipe_image_view *pimg) +fd6_emit_image_tex(struct fd_ringbuffer *ring, + const struct pipe_image_view *pimg) { - struct fd6_image img; - translate_image(&img, pimg); - emit_image_tex(ring, &img); + struct fd6_image img; + translate_image(&img, pimg); + emit_image_tex(ring, &img); } void -fd6_emit_ssbo_tex(struct fd_ringbuffer *ring, const struct pipe_shader_buffer *pbuf) +fd6_emit_ssbo_tex(struct fd_ringbuffer *ring, + const struct pipe_shader_buffer *pbuf) { - struct fd6_image img; - translate_buf(&img, pbuf); - emit_image_tex(ring, &img); + struct fd6_image img; + translate_buf(&img, pbuf); + emit_image_tex(ring, &img); } -static void emit_image_ssbo(struct fd_ringbuffer *ring, struct fd6_image *img) +static void +emit_image_ssbo(struct fd_ringbuffer *ring, struct fd6_image *img) { - /* If the SSBO isn't present (becasue gallium doesn't pack atomic - * counters), 
zero-fill the slot. - */ - if (!img->prsc) { - for (int i = 0; i < 16; i++) - OUT_RING(ring, 0); - return; - } - - struct fd_resource *rsc = fd_resource(img->prsc); - enum a6xx_tile_mode tile_mode = fd_resource_tile_mode(img->prsc, img->level); - bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, img->level); - - OUT_RING(ring, A6XX_IBO_0_FMT(fd6_pipe2tex(img->pfmt)) | - A6XX_IBO_0_TILE_MODE(tile_mode)); - OUT_RING(ring, A6XX_IBO_1_WIDTH(img->width) | - A6XX_IBO_1_HEIGHT(img->height)); - OUT_RING(ring, A6XX_IBO_2_PITCH(img->pitch) | - COND(img->buffer, A6XX_IBO_2_UNK4 | A6XX_IBO_2_UNK31) | - A6XX_IBO_2_TYPE(img->type)); - OUT_RING(ring, A6XX_IBO_3_ARRAY_PITCH(img->array_pitch) | - COND(ubwc_enabled, A6XX_IBO_3_FLAG | A6XX_IBO_3_UNK27)); - if (img->bo) { - OUT_RELOC(ring, img->bo, img->offset, - (uint64_t)A6XX_IBO_5_DEPTH(img->depth) << 32, 0); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, A6XX_IBO_5_DEPTH(img->depth)); - } - OUT_RING(ring, 0x00000000); - - if (ubwc_enabled) { - OUT_RELOC(ring, rsc->bo, img->ubwc_offset, 0, 0); - OUT_RING(ring, A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH(rsc->layout.ubwc_layer_size >> 2)); - OUT_RING(ring, A6XX_IBO_10_FLAG_BUFFER_PITCH(fdl_ubwc_pitch(&rsc->layout, img->level))); - } else { - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - } - - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, 0x00000000); + /* If the SSBO isn't present (becasue gallium doesn't pack atomic + * counters), zero-fill the slot. + */ + if (!img->prsc) { + for (int i = 0; i < 16; i++) + OUT_RING(ring, 0); + return; + } + + struct fd_resource *rsc = fd_resource(img->prsc); + enum a6xx_tile_mode tile_mode = fd_resource_tile_mode(img->prsc, img->level); + bool ubwc_enabled = fd_resource_ubwc_enabled(rsc, img->level); + + OUT_RING(ring, A6XX_IBO_0_FMT(fd6_pipe2tex(img->pfmt)) | + A6XX_IBO_0_TILE_MODE(tile_mode)); + OUT_RING(ring, + A6XX_IBO_1_WIDTH(img->width) | A6XX_IBO_1_HEIGHT(img->height)); + OUT_RING(ring, A6XX_IBO_2_PITCH(img->pitch) | + COND(img->buffer, A6XX_IBO_2_UNK4 | A6XX_IBO_2_UNK31) | + A6XX_IBO_2_TYPE(img->type)); + OUT_RING(ring, A6XX_IBO_3_ARRAY_PITCH(img->array_pitch) | + COND(ubwc_enabled, A6XX_IBO_3_FLAG | A6XX_IBO_3_UNK27)); + if (img->bo) { + OUT_RELOC(ring, img->bo, img->offset, + (uint64_t)A6XX_IBO_5_DEPTH(img->depth) << 32, 0); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, A6XX_IBO_5_DEPTH(img->depth)); + } + OUT_RING(ring, 0x00000000); + + if (ubwc_enabled) { + OUT_RELOC(ring, rsc->bo, img->ubwc_offset, 0, 0); + OUT_RING(ring, A6XX_IBO_9_FLAG_BUFFER_ARRAY_PITCH( + rsc->layout.ubwc_layer_size >> 2)); + OUT_RING(ring, A6XX_IBO_10_FLAG_BUFFER_PITCH( + fdl_ubwc_pitch(&rsc->layout, img->level))); + } else { + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + } + + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, 0x00000000); } /* Build combined image/SSBO "IBO" state, returns ownership of state reference */ struct fd_ringbuffer * fd6_build_ibo_state(struct fd_context *ctx, const struct ir3_shader_variant *v, - enum pipe_shader_type shader) + enum pipe_shader_type shader) { - struct fd_shaderbuf_stateobj *bufso = &ctx->shaderbuf[shader]; - struct fd_shaderimg_stateobj *imgso = &ctx->shaderimg[shader]; - - struct fd_ringbuffer *state = - 
fd_submit_new_ringbuffer(ctx->batch->submit, - (v->shader->nir->info.num_ssbos + - v->shader->nir->info.num_images) * 16 * 4, - FD_RINGBUFFER_STREAMING); - - assert(shader == PIPE_SHADER_COMPUTE || shader == PIPE_SHADER_FRAGMENT); - - for (unsigned i = 0; i < v->shader->nir->info.num_ssbos; i++) { - struct fd6_image img; - translate_buf(&img, &bufso->sb[i]); - emit_image_ssbo(state, &img); - } - - for (unsigned i = 0; i < v->shader->nir->info.num_images; i++) { - struct fd6_image img; - translate_image(&img, &imgso->si[i]); - emit_image_ssbo(state, &img); - } - - return state; + struct fd_shaderbuf_stateobj *bufso = &ctx->shaderbuf[shader]; + struct fd_shaderimg_stateobj *imgso = &ctx->shaderimg[shader]; + + struct fd_ringbuffer *state = fd_submit_new_ringbuffer( + ctx->batch->submit, + (v->shader->nir->info.num_ssbos + v->shader->nir->info.num_images) * 16 * + 4, + FD_RINGBUFFER_STREAMING); + + assert(shader == PIPE_SHADER_COMPUTE || shader == PIPE_SHADER_FRAGMENT); + + for (unsigned i = 0; i < v->shader->nir->info.num_ssbos; i++) { + struct fd6_image img; + translate_buf(&img, &bufso->sb[i]); + emit_image_ssbo(state, &img); + } + + for (unsigned i = 0; i < v->shader->nir->info.num_images; i++) { + struct fd6_image img; + translate_image(&img, &imgso->si[i]); + emit_image_ssbo(state, &img); + } + + return state; } -static void fd6_set_shader_images(struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start, unsigned count, - unsigned unbind_num_trailing_slots, - const struct pipe_image_view *images) - in_dt +static void +fd6_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader, + unsigned start, unsigned count, + unsigned unbind_num_trailing_slots, + const struct pipe_image_view *images) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader]; + struct fd_context *ctx = fd_context(pctx); + struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader]; - fd_set_shader_images(pctx, shader, start, count, - unbind_num_trailing_slots, images); + fd_set_shader_images(pctx, shader, start, count, unbind_num_trailing_slots, + images); - if (!images) - return; + if (!images) + return; - for (unsigned i = 0; i < count; i++) { - unsigned n = i + start; - struct pipe_image_view *buf = &so->si[n]; + for (unsigned i = 0; i < count; i++) { + unsigned n = i + start; + struct pipe_image_view *buf = &so->si[n]; - if (!buf->resource) - continue; + if (!buf->resource) + continue; - fd6_validate_format(ctx, fd_resource(buf->resource), buf->format); - } + fd6_validate_format(ctx, fd_resource(buf->resource), buf->format); + } } void fd6_image_init(struct pipe_context *pctx) { - pctx->set_shader_images = fd6_set_shader_images; + pctx->set_shader_images = fd6_set_shader_images; } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_image.h b/src/gallium/drivers/freedreno/a6xx/fd6_image.h index e42255f..84838b9 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_image.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_image.h @@ -30,12 +30,15 @@ #include "freedreno_context.h" -void fd6_emit_image_tex(struct fd_ringbuffer *ring, const struct pipe_image_view *pimg) assert_dt; -void fd6_emit_ssbo_tex(struct fd_ringbuffer *ring, const struct pipe_shader_buffer *pbuf) assert_dt; +void fd6_emit_image_tex(struct fd_ringbuffer *ring, + const struct pipe_image_view *pimg) assert_dt; +void fd6_emit_ssbo_tex(struct fd_ringbuffer *ring, + const struct pipe_shader_buffer *pbuf) assert_dt; struct ir3_shader_variant; -struct fd_ringbuffer * 
fd6_build_ibo_state(struct fd_context *ctx, - const struct ir3_shader_variant *v, enum pipe_shader_type shader) assert_dt; +struct fd_ringbuffer * +fd6_build_ibo_state(struct fd_context *ctx, const struct ir3_shader_variant *v, + enum pipe_shader_type shader) assert_dt; void fd6_image_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_pack.h b/src/gallium/drivers/freedreno/a6xx/fd6_pack.h index 847e7d4..75a811d 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_pack.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_pack.h @@ -27,13 +27,13 @@ #include "a6xx.xml.h" struct fd_reg_pair { - uint32_t reg; - uint64_t value; - struct fd_bo *bo; - bool is_address; - bool bo_write; - uint32_t bo_offset; - uint32_t bo_shift; + uint32_t reg; + uint64_t value; + struct fd_bo *bo; + bool is_address; + bool bo_write; + uint32_t bo_offset; + uint32_t bo_shift; }; #define __bo_type struct fd_bo * @@ -41,138 +41,134 @@ struct fd_reg_pair { #include "a6xx-pack.xml.h" #include "adreno-pm4-pack.xml.h" -#define __assert_eq(a, b) \ - do { \ - if ((a) != (b)) { \ - fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \ - assert((a) == (b)); \ - } \ - } while (0) +#define __assert_eq(a, b) \ + do { \ + if ((a) != (b)) { \ + fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, \ + b); \ + assert((a) == (b)); \ + } \ + } while (0) -#define __ONE_REG(i, ...) \ - do { \ - const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ - /* NOTE: allow regs[0].reg==0, this happens in OUT_PKT() */ \ - if (i < ARRAY_SIZE(regs) && (i == 0 || regs[i].reg > 0)) { \ - __assert_eq(regs[0].reg + i, regs[i].reg); \ - if (regs[i].bo) { \ - ring->cur = p; \ - p += 2; \ - OUT_RELOC(ring, regs[i].bo, regs[i].bo_offset, \ - regs[i].value, regs[i].bo_shift); \ - } else { \ - *p++ = regs[i].value; \ - if (regs[i].is_address) \ - *p++ = regs[i].value >> 32; \ - } \ - } \ - } while (0) +#define __ONE_REG(i, ...) \ + do { \ + const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ + /* NOTE: allow regs[0].reg==0, this happens in OUT_PKT() */ \ + if (i < ARRAY_SIZE(regs) && (i == 0 || regs[i].reg > 0)) { \ + __assert_eq(regs[0].reg + i, regs[i].reg); \ + if (regs[i].bo) { \ + ring->cur = p; \ + p += 2; \ + OUT_RELOC(ring, regs[i].bo, regs[i].bo_offset, regs[i].value, \ + regs[i].bo_shift); \ + } else { \ + *p++ = regs[i].value; \ + if (regs[i].is_address) \ + *p++ = regs[i].value >> 32; \ + } \ + } \ + } while (0) -#define OUT_REG(ring, ...) \ - do { \ - const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ - unsigned count = ARRAY_SIZE(regs); \ - \ - STATIC_ASSERT(count > 0); \ - STATIC_ASSERT(count <= 16); \ - \ - BEGIN_RING(ring, count + 1); \ - uint32_t *p = ring->cur; \ - *p++ = CP_TYPE4_PKT | count | \ - (_odd_parity_bit(count) << 7) | \ - ((regs[0].reg & 0x3ffff) << 8) | \ - ((_odd_parity_bit(regs[0].reg) << 27)); \ - \ - __ONE_REG( 0, __VA_ARGS__); \ - __ONE_REG( 1, __VA_ARGS__); \ - __ONE_REG( 2, __VA_ARGS__); \ - __ONE_REG( 3, __VA_ARGS__); \ - __ONE_REG( 4, __VA_ARGS__); \ - __ONE_REG( 5, __VA_ARGS__); \ - __ONE_REG( 6, __VA_ARGS__); \ - __ONE_REG( 7, __VA_ARGS__); \ - __ONE_REG( 8, __VA_ARGS__); \ - __ONE_REG( 9, __VA_ARGS__); \ - __ONE_REG(10, __VA_ARGS__); \ - __ONE_REG(11, __VA_ARGS__); \ - __ONE_REG(12, __VA_ARGS__); \ - __ONE_REG(13, __VA_ARGS__); \ - __ONE_REG(14, __VA_ARGS__); \ - __ONE_REG(15, __VA_ARGS__); \ - ring->cur = p; \ - } while (0) +#define OUT_REG(ring, ...) 
\ + do { \ + const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ + unsigned count = ARRAY_SIZE(regs); \ + \ + STATIC_ASSERT(count > 0); \ + STATIC_ASSERT(count <= 16); \ + \ + BEGIN_RING(ring, count + 1); \ + uint32_t *p = ring->cur; \ + *p++ = CP_TYPE4_PKT | count | (_odd_parity_bit(count) << 7) | \ + ((regs[0].reg & 0x3ffff) << 8) | \ + ((_odd_parity_bit(regs[0].reg) << 27)); \ + \ + __ONE_REG(0, __VA_ARGS__); \ + __ONE_REG(1, __VA_ARGS__); \ + __ONE_REG(2, __VA_ARGS__); \ + __ONE_REG(3, __VA_ARGS__); \ + __ONE_REG(4, __VA_ARGS__); \ + __ONE_REG(5, __VA_ARGS__); \ + __ONE_REG(6, __VA_ARGS__); \ + __ONE_REG(7, __VA_ARGS__); \ + __ONE_REG(8, __VA_ARGS__); \ + __ONE_REG(9, __VA_ARGS__); \ + __ONE_REG(10, __VA_ARGS__); \ + __ONE_REG(11, __VA_ARGS__); \ + __ONE_REG(12, __VA_ARGS__); \ + __ONE_REG(13, __VA_ARGS__); \ + __ONE_REG(14, __VA_ARGS__); \ + __ONE_REG(15, __VA_ARGS__); \ + ring->cur = p; \ + } while (0) -#define OUT_PKT(ring, opcode, ...) \ - do { \ - const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ - unsigned count = ARRAY_SIZE(regs); \ - \ - STATIC_ASSERT(count <= 16); \ - \ - BEGIN_RING(ring, count + 1); \ - uint32_t *p = ring->cur; \ - *p++ = CP_TYPE7_PKT | count | \ - (_odd_parity_bit(count) << 15) | \ - ((opcode & 0x7f) << 16) | \ - ((_odd_parity_bit(opcode) << 23)); \ - \ - __ONE_REG( 0, __VA_ARGS__); \ - __ONE_REG( 1, __VA_ARGS__); \ - __ONE_REG( 2, __VA_ARGS__); \ - __ONE_REG( 3, __VA_ARGS__); \ - __ONE_REG( 4, __VA_ARGS__); \ - __ONE_REG( 5, __VA_ARGS__); \ - __ONE_REG( 6, __VA_ARGS__); \ - __ONE_REG( 7, __VA_ARGS__); \ - __ONE_REG( 8, __VA_ARGS__); \ - __ONE_REG( 9, __VA_ARGS__); \ - __ONE_REG(10, __VA_ARGS__); \ - __ONE_REG(11, __VA_ARGS__); \ - __ONE_REG(12, __VA_ARGS__); \ - __ONE_REG(13, __VA_ARGS__); \ - __ONE_REG(14, __VA_ARGS__); \ - __ONE_REG(15, __VA_ARGS__); \ - ring->cur = p; \ - } while (0) +#define OUT_PKT(ring, opcode, ...) \ + do { \ + const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ + unsigned count = ARRAY_SIZE(regs); \ + \ + STATIC_ASSERT(count <= 16); \ + \ + BEGIN_RING(ring, count + 1); \ + uint32_t *p = ring->cur; \ + *p++ = CP_TYPE7_PKT | count | (_odd_parity_bit(count) << 15) | \ + ((opcode & 0x7f) << 16) | ((_odd_parity_bit(opcode) << 23)); \ + \ + __ONE_REG(0, __VA_ARGS__); \ + __ONE_REG(1, __VA_ARGS__); \ + __ONE_REG(2, __VA_ARGS__); \ + __ONE_REG(3, __VA_ARGS__); \ + __ONE_REG(4, __VA_ARGS__); \ + __ONE_REG(5, __VA_ARGS__); \ + __ONE_REG(6, __VA_ARGS__); \ + __ONE_REG(7, __VA_ARGS__); \ + __ONE_REG(8, __VA_ARGS__); \ + __ONE_REG(9, __VA_ARGS__); \ + __ONE_REG(10, __VA_ARGS__); \ + __ONE_REG(11, __VA_ARGS__); \ + __ONE_REG(12, __VA_ARGS__); \ + __ONE_REG(13, __VA_ARGS__); \ + __ONE_REG(14, __VA_ARGS__); \ + __ONE_REG(15, __VA_ARGS__); \ + ring->cur = p; \ + } while (0) /* similar to OUT_PKT() but appends specified # of dwords * copied for buf to the end of the packet (ie. for use- * cases like CP_LOAD_STATE) */ -#define OUT_PKTBUF(ring, opcode, dwords, sizedwords, ...) 
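(Illustrative aside, not part of the patch: the CP_TYPE4/CP_TYPE7 packet headers built by the OUT_REG()/OUT_PKT() macros above fold an odd-parity bit over the count and register/opcode fields. A hedged sketch of what a helper like _odd_parity_bit() computes, i.e. the bit that makes the total number of set bits odd; the real helper is defined elsewhere in the driver:)

#include <stdint.h>

/* Returns 1 when 'val' has an even number of set bits, so that OR-ing the
 * result into the header gives the packet odd overall parity.  Sketch only. */
static inline uint32_t
odd_parity_bit(uint32_t val)
{
   return (__builtin_popcount(val) & 1) ^ 1;
}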
\ - do { \ - const struct fd_reg_pair regs[] = { __VA_ARGS__ }; \ - unsigned count = ARRAY_SIZE(regs); \ - \ - STATIC_ASSERT(count <= 16); \ - count += sizedwords; \ - \ - BEGIN_RING(ring, count + 1); \ - uint32_t *p = ring->cur; \ - *p++ = CP_TYPE7_PKT | count | \ - (_odd_parity_bit(count) << 15) | \ - ((opcode & 0x7f) << 16) | \ - ((_odd_parity_bit(opcode) << 23)); \ - \ - __ONE_REG( 0, __VA_ARGS__); \ - __ONE_REG( 1, __VA_ARGS__); \ - __ONE_REG( 2, __VA_ARGS__); \ - __ONE_REG( 3, __VA_ARGS__); \ - __ONE_REG( 4, __VA_ARGS__); \ - __ONE_REG( 5, __VA_ARGS__); \ - __ONE_REG( 6, __VA_ARGS__); \ - __ONE_REG( 7, __VA_ARGS__); \ - __ONE_REG( 8, __VA_ARGS__); \ - __ONE_REG( 9, __VA_ARGS__); \ - __ONE_REG(10, __VA_ARGS__); \ - __ONE_REG(11, __VA_ARGS__); \ - __ONE_REG(12, __VA_ARGS__); \ - __ONE_REG(13, __VA_ARGS__); \ - __ONE_REG(14, __VA_ARGS__); \ - __ONE_REG(15, __VA_ARGS__); \ - memcpy(p, dwords, 4 * sizedwords); \ - p += sizedwords; \ - ring->cur = p; \ - } while (0) +#define OUT_PKTBUF(ring, opcode, dwords, sizedwords, ...) \ + do { \ + const struct fd_reg_pair regs[] = {__VA_ARGS__}; \ + unsigned count = ARRAY_SIZE(regs); \ + \ + STATIC_ASSERT(count <= 16); \ + count += sizedwords; \ + \ + BEGIN_RING(ring, count + 1); \ + uint32_t *p = ring->cur; \ + *p++ = CP_TYPE7_PKT | count | (_odd_parity_bit(count) << 15) | \ + ((opcode & 0x7f) << 16) | ((_odd_parity_bit(opcode) << 23)); \ + \ + __ONE_REG(0, __VA_ARGS__); \ + __ONE_REG(1, __VA_ARGS__); \ + __ONE_REG(2, __VA_ARGS__); \ + __ONE_REG(3, __VA_ARGS__); \ + __ONE_REG(4, __VA_ARGS__); \ + __ONE_REG(5, __VA_ARGS__); \ + __ONE_REG(6, __VA_ARGS__); \ + __ONE_REG(7, __VA_ARGS__); \ + __ONE_REG(8, __VA_ARGS__); \ + __ONE_REG(9, __VA_ARGS__); \ + __ONE_REG(10, __VA_ARGS__); \ + __ONE_REG(11, __VA_ARGS__); \ + __ONE_REG(12, __VA_ARGS__); \ + __ONE_REG(13, __VA_ARGS__); \ + __ONE_REG(14, __VA_ARGS__); \ + __ONE_REG(15, __VA_ARGS__); \ + memcpy(p, dwords, 4 * sizedwords); \ + p += sizedwords; \ + ring->cur = p; \ + } while (0) #endif /* FD6_PACK_H */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.c b/src/gallium/drivers/freedreno/a6xx/fd6_program.c index f581e2f..d1b4e31 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.c @@ -26,933 +26,962 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/format/u_format.h" #include "util/bitset.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "freedreno_program.h" -#include "fd6_program.h" #include "fd6_const.h" #include "fd6_emit.h" -#include "fd6_texture.h" #include "fd6_format.h" #include "fd6_pack.h" +#include "fd6_program.h" +#include "fd6_texture.h" void fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, - const struct ir3_shader_variant *so) + const struct ir3_shader_variant *so) { - enum a6xx_state_block sb = fd6_stage2shadersb(so->type); - - uint32_t first_exec_offset = 0; - uint32_t instrlen = 0; - - switch (so->type) { - case MESA_SHADER_VERTEX: - first_exec_offset = REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET; - instrlen = REG_A6XX_SP_VS_INSTRLEN; - break; - case MESA_SHADER_TESS_CTRL: - first_exec_offset = REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET; - instrlen = REG_A6XX_SP_HS_INSTRLEN; - break; - case MESA_SHADER_TESS_EVAL: - first_exec_offset = REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET; - instrlen = REG_A6XX_SP_DS_INSTRLEN; - break; - case MESA_SHADER_GEOMETRY: 
- first_exec_offset = REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET; - instrlen = REG_A6XX_SP_GS_INSTRLEN; - break; - case MESA_SHADER_FRAGMENT: - first_exec_offset = REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET; - instrlen = REG_A6XX_SP_FS_INSTRLEN; - break; - case MESA_SHADER_COMPUTE: - case MESA_SHADER_KERNEL: - first_exec_offset = REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET; - instrlen = REG_A6XX_SP_CS_INSTRLEN; - break; - case MESA_SHADER_TASK: - case MESA_SHADER_MESH: - case MESA_SHADER_RAYGEN: - case MESA_SHADER_ANY_HIT: - case MESA_SHADER_CLOSEST_HIT: - case MESA_SHADER_MISS: - case MESA_SHADER_INTERSECTION: - case MESA_SHADER_CALLABLE: - unreachable("Unsupported shader stage"); - case MESA_SHADER_NONE: - unreachable(""); - } + enum a6xx_state_block sb = fd6_stage2shadersb(so->type); + + uint32_t first_exec_offset = 0; + uint32_t instrlen = 0; + + switch (so->type) { + case MESA_SHADER_VERTEX: + first_exec_offset = REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET; + instrlen = REG_A6XX_SP_VS_INSTRLEN; + break; + case MESA_SHADER_TESS_CTRL: + first_exec_offset = REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET; + instrlen = REG_A6XX_SP_HS_INSTRLEN; + break; + case MESA_SHADER_TESS_EVAL: + first_exec_offset = REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET; + instrlen = REG_A6XX_SP_DS_INSTRLEN; + break; + case MESA_SHADER_GEOMETRY: + first_exec_offset = REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET; + instrlen = REG_A6XX_SP_GS_INSTRLEN; + break; + case MESA_SHADER_FRAGMENT: + first_exec_offset = REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET; + instrlen = REG_A6XX_SP_FS_INSTRLEN; + break; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + first_exec_offset = REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET; + instrlen = REG_A6XX_SP_CS_INSTRLEN; + break; + case MESA_SHADER_TASK: + case MESA_SHADER_MESH: + case MESA_SHADER_RAYGEN: + case MESA_SHADER_ANY_HIT: + case MESA_SHADER_CLOSEST_HIT: + case MESA_SHADER_MISS: + case MESA_SHADER_INTERSECTION: + case MESA_SHADER_CALLABLE: + unreachable("Unsupported shader stage"); + case MESA_SHADER_NONE: + unreachable(""); + } #ifdef DEBUG - /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */ - const char *name = so->shader->nir->info.name; - if (name) - fd_emit_string5(ring, name, strlen(name)); + /* Name should generally match what you get with MESA_SHADER_CAPTURE_PATH: */ + const char *name = so->shader->nir->info.name; + if (name) + fd_emit_string5(ring, name, strlen(name)); #endif - uint32_t fibers_per_sp = ctx->screen->info.fibers_per_sp; - uint32_t num_sp_cores = ctx->screen->info.num_sp_cores; - - uint32_t per_fiber_size = ALIGN(so->pvtmem_size, 512); - if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) { - if (ctx->pvtmem[so->pvtmem_per_wave].bo) - fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo); - ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size; - uint32_t total_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12) - * num_sp_cores; - ctx->pvtmem[so->pvtmem_per_wave].bo = - fd_bo_new(ctx->screen->dev, total_size, - DRM_FREEDRENO_GEM_TYPE_KMEM, "pvtmem_%s_%d", - so->pvtmem_per_wave ? 
"per_wave" : "per_fiber", - per_fiber_size); - } else { - per_fiber_size = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size; - } - - uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12); - - OUT_PKT4(ring, instrlen, 1); - OUT_RING(ring, so->instrlen); - - OUT_PKT4(ring, first_exec_offset, 7); - OUT_RING(ring, 0); /* SP_xS_OBJ_FIRST_EXEC_OFFSET */ - OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */ - OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size)); - if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */ - OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0); - } else { - OUT_RING(ring, 0); - OUT_RING(ring, 0); - } - OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) | - COND(so->pvtmem_per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); - - OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); - OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | - CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | - CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | - CP_LOAD_STATE6_0_STATE_BLOCK(sb) | - CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen)); - OUT_RELOC(ring, so->bo, 0, 0, 0); + uint32_t fibers_per_sp = ctx->screen->info.fibers_per_sp; + uint32_t num_sp_cores = ctx->screen->info.num_sp_cores; + + uint32_t per_fiber_size = ALIGN(so->pvtmem_size, 512); + if (per_fiber_size > ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size) { + if (ctx->pvtmem[so->pvtmem_per_wave].bo) + fd_bo_del(ctx->pvtmem[so->pvtmem_per_wave].bo); + ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size = per_fiber_size; + uint32_t total_size = + ALIGN(per_fiber_size * fibers_per_sp, 1 << 12) * num_sp_cores; + ctx->pvtmem[so->pvtmem_per_wave].bo = fd_bo_new( + ctx->screen->dev, total_size, DRM_FREEDRENO_GEM_TYPE_KMEM, + "pvtmem_%s_%d", so->pvtmem_per_wave ? "per_wave" : "per_fiber", + per_fiber_size); + } else { + per_fiber_size = ctx->pvtmem[so->pvtmem_per_wave].per_fiber_size; + } + + uint32_t per_sp_size = ALIGN(per_fiber_size * fibers_per_sp, 1 << 12); + + OUT_PKT4(ring, instrlen, 1); + OUT_RING(ring, so->instrlen); + + OUT_PKT4(ring, first_exec_offset, 7); + OUT_RING(ring, 0); /* SP_xS_OBJ_FIRST_EXEC_OFFSET */ + OUT_RELOC(ring, so->bo, 0, 0, 0); /* SP_xS_OBJ_START_LO */ + OUT_RING(ring, A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(per_fiber_size)); + if (so->pvtmem_size > 0) { /* SP_xS_PVT_MEM_ADDR */ + OUT_RELOC(ring, ctx->pvtmem[so->pvtmem_per_wave].bo, 0, 0, 0); + } else { + OUT_RING(ring, 0); + OUT_RING(ring, 0); + } + OUT_RING(ring, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(per_sp_size) | + COND(so->pvtmem_per_wave, + A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT)); + + OUT_PKT7(ring, fd6_stage2opcode(so->type), 3); + OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) | + CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) | + CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) | + CP_LOAD_STATE6_0_STATE_BLOCK(sb) | + CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen)); + OUT_RELOC(ring, so->bo, 0, 0, 0); } - static void -setup_stream_out(struct fd6_program_state *state, const struct ir3_shader_variant *v, - struct ir3_shader_linkage *l) +setup_stream_out(struct fd6_program_state *state, + const struct ir3_shader_variant *v, + struct ir3_shader_linkage *l) { - const struct ir3_stream_output_info *strmout = &v->shader->stream_output; - - uint32_t ncomp[PIPE_MAX_SO_BUFFERS]; - uint32_t prog[256/2]; - uint32_t prog_count; - - memset(ncomp, 0, sizeof(ncomp)); - memset(prog, 0, sizeof(prog)); - - prog_count = align(l->max_loc, 2) / 2; - - debug_assert(prog_count < ARRAY_SIZE(prog)); - - for (unsigned i = 0; i < strmout->num_outputs; i++) { - const 
struct ir3_stream_output *out = &strmout->output[i]; - unsigned k = out->register_index; - unsigned idx; - - ncomp[out->output_buffer] += out->num_components; - - /* linkage map sorted by order frag shader wants things, so - * a bit less ideal here.. - */ - for (idx = 0; idx < l->cnt; idx++) - if (l->var[idx].regid == v->outputs[k].regid) - break; - - debug_assert(idx < l->cnt); - - for (unsigned j = 0; j < out->num_components; j++) { - unsigned c = j + out->start_component; - unsigned loc = l->var[idx].loc + c; - unsigned off = j + out->dst_offset; /* in dwords */ - - if (loc & 1) { - prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN | - A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | - A6XX_VPC_SO_PROG_B_OFF(off * 4); - } else { - prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN | - A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | - A6XX_VPC_SO_PROG_A_OFF(off * 4); - } - } - } - - struct fd_ringbuffer *ring = state->streamout_stateobj; - - OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * prog_count)); - OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); - OUT_RING(ring, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(0x1) | - COND(ncomp[0] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1)) | - COND(ncomp[1] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1)) | - COND(ncomp[2] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1)) | - COND(ncomp[3] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1))); - OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(0)); - OUT_RING(ring, ncomp[0]); - OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(1)); - OUT_RING(ring, ncomp[1]); - OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(2)); - OUT_RING(ring, ncomp[2]); - OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(3)); - OUT_RING(ring, ncomp[3]); - OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); - OUT_RING(ring, A6XX_VPC_SO_CNTL_RESET); - for (unsigned i = 0; i < prog_count; i++) { - OUT_RING(ring, REG_A6XX_VPC_SO_PROG); - OUT_RING(ring, prog[i]); - } + const struct ir3_stream_output_info *strmout = &v->shader->stream_output; + + uint32_t ncomp[PIPE_MAX_SO_BUFFERS]; + uint32_t prog[256 / 2]; + uint32_t prog_count; + + memset(ncomp, 0, sizeof(ncomp)); + memset(prog, 0, sizeof(prog)); + + prog_count = align(l->max_loc, 2) / 2; + + debug_assert(prog_count < ARRAY_SIZE(prog)); + + for (unsigned i = 0; i < strmout->num_outputs; i++) { + const struct ir3_stream_output *out = &strmout->output[i]; + unsigned k = out->register_index; + unsigned idx; + + ncomp[out->output_buffer] += out->num_components; + + /* linkage map sorted by order frag shader wants things, so + * a bit less ideal here.. 
+ */ + for (idx = 0; idx < l->cnt; idx++) + if (l->var[idx].regid == v->outputs[k].regid) + break; + + debug_assert(idx < l->cnt); + + for (unsigned j = 0; j < out->num_components; j++) { + unsigned c = j + out->start_component; + unsigned loc = l->var[idx].loc + c; + unsigned off = j + out->dst_offset; /* in dwords */ + + if (loc & 1) { + prog[loc / 2] |= A6XX_VPC_SO_PROG_B_EN | + A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) | + A6XX_VPC_SO_PROG_B_OFF(off * 4); + } else { + prog[loc / 2] |= A6XX_VPC_SO_PROG_A_EN | + A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) | + A6XX_VPC_SO_PROG_A_OFF(off * 4); + } + } + } + + struct fd_ringbuffer *ring = state->streamout_stateobj; + + OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * prog_count)); + OUT_RING(ring, REG_A6XX_VPC_SO_STREAM_CNTL); + OUT_RING(ring, + A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(0x1) | + COND(ncomp[0] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1)) | + COND(ncomp[1] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1)) | + COND(ncomp[2] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1)) | + COND(ncomp[3] > 0, A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1))); + OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(0)); + OUT_RING(ring, ncomp[0]); + OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(1)); + OUT_RING(ring, ncomp[1]); + OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(2)); + OUT_RING(ring, ncomp[2]); + OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(3)); + OUT_RING(ring, ncomp[3]); + OUT_RING(ring, REG_A6XX_VPC_SO_CNTL); + OUT_RING(ring, A6XX_VPC_SO_CNTL_RESET); + for (unsigned i = 0; i < prog_count; i++) { + OUT_RING(ring, REG_A6XX_VPC_SO_PROG); + OUT_RING(ring, prog[i]); + } } static void -setup_config_stateobj(struct fd_ringbuffer *ring, struct fd6_program_state *state) +setup_config_stateobj(struct fd_ringbuffer *ring, + struct fd6_program_state *state) { - OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD( - .vs_state = true, - .hs_state = true, - .ds_state = true, - .gs_state = true, - .fs_state = true, - .cs_state = true, - .gfx_ibo = true, - .cs_ibo = true, - )); - - debug_assert(state->vs->constlen >= state->bs->constlen); - - OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); - OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(state->vs->constlen) | - A6XX_HLSQ_VS_CNTL_ENABLED); - OUT_RING(ring, COND(state->hs, - A6XX_HLSQ_HS_CNTL_ENABLED | - A6XX_HLSQ_HS_CNTL_CONSTLEN(state->hs->constlen))); - OUT_RING(ring, COND(state->ds, - A6XX_HLSQ_DS_CNTL_ENABLED | - A6XX_HLSQ_DS_CNTL_CONSTLEN(state->ds->constlen))); - OUT_RING(ring, COND(state->gs, - A6XX_HLSQ_GS_CNTL_ENABLED | - A6XX_HLSQ_GS_CNTL_CONSTLEN(state->gs->constlen))); - OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); - OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(state->fs->constlen) | - A6XX_HLSQ_FS_CNTL_ENABLED); - - OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); - OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) | - A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) | - A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) | - A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp)); - - OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); - OUT_RING(ring, COND(state->hs, - A6XX_SP_HS_CONFIG_ENABLED | - A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) | - A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) | - A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp))); - - OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); - OUT_RING(ring, COND(state->ds, - A6XX_SP_DS_CONFIG_ENABLED | - A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) | - A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) | - A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp))); - - OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); - OUT_RING(ring, COND(state->gs, - 
A6XX_SP_GS_CONFIG_ENABLED | - A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) | - A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) | - A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp))); - - OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); - OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | - A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) | - A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | - A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); - - OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); - OUT_RING(ring, ir3_shader_nibo(state->fs)); + OUT_REG(ring, A6XX_HLSQ_INVALIDATE_CMD(.vs_state = true, .hs_state = true, + .ds_state = true, .gs_state = true, + .fs_state = true, .cs_state = true, + .gfx_ibo = true, .cs_ibo = true, )); + + debug_assert(state->vs->constlen >= state->bs->constlen); + + OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4); + OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(state->vs->constlen) | + A6XX_HLSQ_VS_CNTL_ENABLED); + OUT_RING(ring, COND(state->hs, + A6XX_HLSQ_HS_CNTL_ENABLED | + A6XX_HLSQ_HS_CNTL_CONSTLEN(state->hs->constlen))); + OUT_RING(ring, COND(state->ds, + A6XX_HLSQ_DS_CNTL_ENABLED | + A6XX_HLSQ_DS_CNTL_CONSTLEN(state->ds->constlen))); + OUT_RING(ring, COND(state->gs, + A6XX_HLSQ_GS_CNTL_ENABLED | + A6XX_HLSQ_GS_CNTL_CONSTLEN(state->gs->constlen))); + OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1); + OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(state->fs->constlen) | + A6XX_HLSQ_FS_CNTL_ENABLED); + + OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 1); + OUT_RING(ring, COND(state->vs, A6XX_SP_VS_CONFIG_ENABLED) | + A6XX_SP_VS_CONFIG_NIBO(ir3_shader_nibo(state->vs)) | + A6XX_SP_VS_CONFIG_NTEX(state->vs->num_samp) | + A6XX_SP_VS_CONFIG_NSAMP(state->vs->num_samp)); + + OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 1); + OUT_RING(ring, COND(state->hs, + A6XX_SP_HS_CONFIG_ENABLED | + A6XX_SP_HS_CONFIG_NIBO(ir3_shader_nibo(state->hs)) | + A6XX_SP_HS_CONFIG_NTEX(state->hs->num_samp) | + A6XX_SP_HS_CONFIG_NSAMP(state->hs->num_samp))); + + OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 1); + OUT_RING(ring, COND(state->ds, + A6XX_SP_DS_CONFIG_ENABLED | + A6XX_SP_DS_CONFIG_NIBO(ir3_shader_nibo(state->ds)) | + A6XX_SP_DS_CONFIG_NTEX(state->ds->num_samp) | + A6XX_SP_DS_CONFIG_NSAMP(state->ds->num_samp))); + + OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 1); + OUT_RING(ring, COND(state->gs, + A6XX_SP_GS_CONFIG_ENABLED | + A6XX_SP_GS_CONFIG_NIBO(ir3_shader_nibo(state->gs)) | + A6XX_SP_GS_CONFIG_NTEX(state->gs->num_samp) | + A6XX_SP_GS_CONFIG_NSAMP(state->gs->num_samp))); + + OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 1); + OUT_RING(ring, COND(state->fs, A6XX_SP_FS_CONFIG_ENABLED) | + A6XX_SP_FS_CONFIG_NIBO(ir3_shader_nibo(state->fs)) | + A6XX_SP_FS_CONFIG_NTEX(state->fs->num_samp) | + A6XX_SP_FS_CONFIG_NSAMP(state->fs->num_samp)); + + OUT_PKT4(ring, REG_A6XX_SP_IBO_COUNT, 1); + OUT_RING(ring, ir3_shader_nibo(state->fs)); } static inline uint32_t next_regid(uint32_t reg, uint32_t increment) { - if (VALIDREG(reg)) - return reg + increment; - else - return regid(63,0); + if (VALIDREG(reg)) + return reg + increment; + else + return regid(63, 0); } static void setup_stateobj(struct fd_ringbuffer *ring, struct fd_context *ctx, - struct fd6_program_state *state, const struct ir3_shader_key *key, - bool binning_pass) - assert_dt + struct fd6_program_state *state, + const struct ir3_shader_key *key, bool binning_pass) assert_dt { - uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; - uint32_t clip0_regid, clip1_regid; - uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; - uint32_t smask_in_regid, smask_regid; - uint32_t 
stencilref_regid; - uint32_t vertex_regid, instance_regid, layer_regid, primitive_regid; - uint32_t hs_invocation_regid; - uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_patch_regid, ds_patch_regid; - uint32_t ij_regid[IJ_COUNT]; - uint32_t gs_header_regid; - enum a6xx_threadsize fssz; - uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0; - uint8_t clip0_loc, clip1_loc; - int i, j; - - static const struct ir3_shader_variant dummy_fs = {0}; - const struct ir3_shader_variant *vs = binning_pass ? state->bs : state->vs; - const struct ir3_shader_variant *hs = state->hs; - const struct ir3_shader_variant *ds = state->ds; - const struct ir3_shader_variant *gs = state->gs; - const struct ir3_shader_variant *fs = binning_pass ? &dummy_fs : state->fs; - - /* binning VS is wrong when GS is present, so use nonbinning VS - * TODO: compile both binning VS/GS variants correctly - */ - if (binning_pass && state->gs) - vs = state->vs; - - bool sample_shading = fs->per_samp | key->sample_shading; - - fssz = fs->info.double_threadsize ? THREAD128 : THREAD64; - - pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1); - layer_regid = ir3_find_output_regid(vs, VARYING_SLOT_LAYER); - vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); - instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); - - if (hs) { - tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD); - tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); - hs_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); - ds_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID); - hs_invocation_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3); - - pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1); - } else { - tess_coord_x_regid = regid(63, 0); - tess_coord_y_regid = regid(63, 0); - hs_patch_regid = regid(63, 0); - ds_patch_regid = regid(63, 0); - hs_invocation_regid = regid(63, 0); - } - - if (gs) { - gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3); - primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); - pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS); - psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ); - clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0); - clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1); - layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER); - } else { - gs_header_regid = regid(63, 0); - primitive_regid = regid(63, 0); - } - - if (fs->color0_mrt) { - color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = - color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = - ir3_find_output_regid(fs, FRAG_RESULT_COLOR); - } else { - color_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0); - color_regid[1] = ir3_find_output_regid(fs, FRAG_RESULT_DATA1); - color_regid[2] = ir3_find_output_regid(fs, FRAG_RESULT_DATA2); - color_regid[3] = ir3_find_output_regid(fs, FRAG_RESULT_DATA3); - color_regid[4] = ir3_find_output_regid(fs, FRAG_RESULT_DATA4); - color_regid[5] = ir3_find_output_regid(fs, FRAG_RESULT_DATA5); - color_regid[6] = 
ir3_find_output_regid(fs, FRAG_RESULT_DATA6); - color_regid[7] = ir3_find_output_regid(fs, FRAG_RESULT_DATA7); - } - - samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); - smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); - face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); - coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); - zwcoord_regid = next_regid(coord_regid, 2); - posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); - smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); - stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); - for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) - ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); - - /* If we have pre-dispatch texture fetches, then ij_pix should not - * be DCE'd, even if not actually used in the shader itself: - */ - if (fs->num_sampler_prefetch > 0) { - assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); - /* also, it seems like ij_pix is *required* to be r0.x */ - assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); - } - - /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we - * end up masking the single sample!! - */ - if (!key->msaa) - smask_regid = regid(63, 0); - - /* we could probably divide this up into things that need to be - * emitted if frag-prog is dirty vs if vert-prog is dirty.. - */ - - OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); - OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | - A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | - 0x7000); // XXX - for (int i = 0; i < fs->num_sampler_prefetch; i++) { - const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; - OUT_RING(ring, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | - A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | - A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | - A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | - A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | - COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | - A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); - } - - OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1); - OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); - - bool fs_has_dual_src_color = !binning_pass && - fs->shader->nir->info.fs.color_is_dual_source; - - OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); - OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | - A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | - A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | - COND(fs_has_dual_src_color, - A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); - - OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1); - OUT_RING(ring, - A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | - A6XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vs->info.max_half_reg + 1) | - COND(vs->mergedregs, A6XX_SP_VS_CTRL_REG0_MERGEDREGS) | - A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack)); - - fd6_emit_shader(ctx, ring, vs); - fd6_emit_immediates(ctx->screen, vs, ring); - - struct ir3_shader_linkage l = {0}; - const struct ir3_shader_variant *last_shader = fd6_last_shader(state); - - bool do_streamout = (last_shader->shader->stream_output.num_outputs > 0); - uint8_t clip_mask = last_shader->clip_mask, cull_mask = last_shader->cull_mask; - uint8_t clip_cull_mask = clip_mask | cull_mask; - - /* If we have streamout, link against the real FS, rather than the - * dummy FS 
used for binning pass state, to ensure the OUTLOC's - * match. Depending on whether we end up doing sysmem or gmem, - * the actual streamout could happen with either the binning pass - * or draw pass program, but the same streamout stateobj is used - * in either case: - */ - ir3_link_shaders(&l, last_shader, do_streamout ? state->fs : fs, true); - - bool primid_passthru = l.primid_loc != 0xff; - clip0_loc = l.clip0_loc; - clip1_loc = l.clip1_loc; - - OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); - OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ - OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ - OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ - OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ - - /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ - ir3_link_stream_out(&l, last_shader); - - if (VALIDREG(layer_regid)) { - layer_loc = l.max_loc; - ir3_link_add(&l, layer_regid, 0x1, l.max_loc); - } - - if (VALIDREG(pos_regid)) { - pos_loc = l.max_loc; - ir3_link_add(&l, pos_regid, 0xf, l.max_loc); - } - - if (VALIDREG(psize_regid)) { - psize_loc = l.max_loc; - ir3_link_add(&l, psize_regid, 0x1, l.max_loc); - } - - /* Handle the case where clip/cull distances aren't read by the FS. Make - * sure to avoid adding an output with an empty writemask if the user - * disables all the clip distances in the API so that the slot is unused. - */ - if (clip0_loc == 0xff && VALIDREG(clip0_regid) && (clip_cull_mask & 0xf) != 0) { - clip0_loc = l.max_loc; - ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc); - } - - if (clip1_loc == 0xff && VALIDREG(clip1_regid) && (clip_cull_mask >> 4) != 0) { - clip1_loc = l.max_loc; - ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc); - } - - /* If we have stream-out, we use the full shader for binning - * pass, rather than the optimized binning pass one, so that we - * have all the varying outputs available for xfb. 
So streamout - * state should always be derived from the non-binning pass - * program: - */ - if (do_streamout && !binning_pass) { - setup_stream_out(state, last_shader, &l); - } - - debug_assert(l.cnt <= 32); - if (gs) - OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); - else if (ds) - OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); - else - OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); - - for (j = 0; j < l.cnt; ) { - uint32_t reg = 0; - - reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); - reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); - j++; - - reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); - reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); - j++; - - OUT_RING(ring, reg); - } - - if (gs) - OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); - else if (ds) - OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); - else - OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); - - for (j = 0; j < l.cnt; ) { - uint32_t reg = 0; - - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); - reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); - - OUT_RING(ring, reg); - } - - if (hs) { - assert(vs->mergedregs == hs->mergedregs); - OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1); - OUT_RING(ring, - A6XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(hs->info.max_reg + 1) | - A6XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT(hs->info.max_half_reg + 1) | - A6XX_SP_HS_CTRL_REG0_BRANCHSTACK(hs->branchstack)); - - fd6_emit_shader(ctx, ring, hs); - fd6_emit_immediates(ctx->screen, hs, ring); - fd6_emit_link_map(ctx->screen, vs, hs, ring); - - OUT_PKT4(ring, REG_A6XX_SP_DS_CTRL_REG0, 1); - OUT_RING(ring, - A6XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(ds->info.max_reg + 1) | - A6XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT(ds->info.max_half_reg + 1) | - COND(ds->mergedregs, A6XX_SP_DS_CTRL_REG0_MERGEDREGS) | - A6XX_SP_DS_CTRL_REG0_BRANCHSTACK(ds->branchstack)); - - fd6_emit_shader(ctx, ring, ds); - fd6_emit_immediates(ctx->screen, ds, ring); - fd6_emit_link_map(ctx->screen, hs, ds, ring); - - shader_info *hs_info = &hs->shader->nir->info; - OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); - OUT_RING(ring, hs_info->tess.tcs_vertices_out); - - /* Total attribute slots in HS incoming patch. */ - OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); - OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4); - - const uint32_t wavesize = 64; - const uint32_t max_wave_input_size = 64; - const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; - - /* note: if HS is really just the VS extended, then this - * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) - * however that doesn't match the blob, and fails some dEQP tests. 
- */ - uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; - uint32_t max_prims_per_wave = - max_wave_input_size * wavesize / (vs->output_size * patch_control_points); - prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); - - uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave; - uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); - - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, wave_input_size); - - shader_info *ds_info = &ds->shader->nir->info; - OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); - uint32_t output; - if (ds_info->tess.point_mode) - output = TESS_POINTS; - else if (ds_info->tess.primitive_mode == GL_ISOLINES) - output = TESS_LINES; - else if (ds_info->tess.ccw) - output = TESS_CCW_TRIS; - else - output = TESS_CW_TRIS; - - OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING(fd6_gl2spacing(ds_info->tess.spacing)) | - A6XX_PC_TESS_CNTL_OUTPUT(output)); - - OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1); - OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); - - OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1); - OUT_RING(ring, 0x0000ffff); - - OUT_PKT4(ring, REG_A6XX_GRAS_DS_LAYER_CNTL, 1); - OUT_RING(ring, 0x0); - - OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) | - A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask)); - - OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); - OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_VS_PACK_PSIZELOC(255) | - A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); - - OUT_PKT4(ring, REG_A6XX_VPC_DS_PACK, 1); - OUT_RING(ring, A6XX_VPC_DS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_DS_PACK_PSIZELOC(psize_loc) | - A6XX_VPC_DS_PACK_STRIDE_IN_VPC(l.max_loc)); - - OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_OUT(l.cnt)); - - OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1); - OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) | - A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); - - } else { - OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); - OUT_RING(ring, 0); - } - - OUT_PKT4(ring, REG_A6XX_SP_VS_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(l.cnt)); - - bool enable_varyings = fs->total_in > 0; - - OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); - OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | - COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) | - A6XX_VPC_CNTL_0_PRIMIDLOC(l.primid_loc) | - A6XX_VPC_CNTL_0_VIEWIDLOC(0xff)); - - OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1); - OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | - CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | - A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); - - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5); - OUT_RING(ring, 0x7); /* XXX */ - OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | - A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | - A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); - OUT_RING(ring, - A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | - A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) | - 
A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID])); - OUT_RING(ring, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | - A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | - A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | - A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); - OUT_RING(ring, 0xfc); /* XXX */ - - OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL_0, 1); - OUT_RING(ring, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(fssz) | - COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); - - OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1); - OUT_RING(ring, A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) | - 0x1000000 | - A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) | - A6XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fs->info.max_half_reg + 1) | - COND(fs->mergedregs, A6XX_SP_FS_CTRL_REG0_MERGEDREGS) | - A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(fs->branchstack) | - COND(fs->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE)); - - OUT_PKT4(ring, REG_A6XX_VPC_VS_LAYER_CNTL, 1); - OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | - A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(0xff)); - - bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; - bool need_size_persamp = false; - if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) { - if (sample_shading) - need_size_persamp = true; - else - need_size = true; - } - if (VALIDREG(ij_regid[IJ_LINEAR_PIXEL])) - need_size = true; - - /* XXX: enable bits for linear centroid and linear sample bary */ - - OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); - OUT_RING(ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | - COND(need_size, A6XX_GRAS_CNTL_SIZE) | - COND(need_size_persamp, A6XX_GRAS_CNTL_SIZE_PERSAMP) | - COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); - - OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2); - OUT_RING(ring, - CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | - CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | - CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | - COND(need_size, A6XX_RB_RENDER_CONTROL0_SIZE) | - COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | - COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) | - COND(fs->fragcoord_compmask != 0, - A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); - - OUT_RING(ring, - CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | - CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | - CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) | - COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); - - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1); - OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); - - OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8101, 1); - OUT_RING(ring, COND(sample_shading, 0x6)); // XXX - - OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1); - OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); - - OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); - for (i = 0; i < 8; i++) { - OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | - COND(color_regid[i] & HALF_REG_ID, A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); - if (VALIDREG(color_regid[i])) { - state->mrt_components |= 0xf << (i * 4); - } - } - - /* dual source blending has an extra fs 
output in the 2nd slot */ - if (fs_has_dual_src_color) { - state->mrt_components |= 0xf << 4; - } - - OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); - OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_VS_PACK_PSIZELOC(psize_loc) | - A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); - - if (gs) { - assert(gs->mergedregs == (ds ? ds->mergedregs : vs->mergedregs)); - OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1); - OUT_RING(ring, - A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) | - A6XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT(gs->info.max_half_reg + 1) | - A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(gs->branchstack)); - - fd6_emit_shader(ctx, ring, gs); - fd6_emit_immediates(ctx->screen, gs, ring); - if (ds) - fd6_emit_link_map(ctx->screen, ds, gs, ring); - else - fd6_emit_link_map(ctx->screen, vs, gs, ring); - - OUT_PKT4(ring, REG_A6XX_VPC_GS_PACK, 1); - OUT_RING(ring, A6XX_VPC_GS_PACK_POSITIONLOC(pos_loc) | - A6XX_VPC_GS_PACK_PSIZELOC(psize_loc) | - A6XX_VPC_GS_PACK_STRIDE_IN_VPC(l.max_loc)); - - OUT_PKT4(ring, REG_A6XX_VPC_GS_LAYER_CNTL, 1); - OUT_RING(ring, A6XX_VPC_GS_LAYER_CNTL_LAYERLOC(layer_loc) | 0xff00); - - OUT_PKT4(ring, REG_A6XX_GRAS_GS_LAYER_CNTL, 1); - OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER)); - - uint32_t flags_regid = ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3); - - OUT_PKT4(ring, REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1); - OUT_RING(ring, A6XX_SP_GS_PRIMITIVE_CNTL_OUT(l.cnt) | - A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); - - OUT_PKT4(ring, REG_A6XX_PC_GS_OUT_CNTL, 1); - OUT_RING(ring, A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | - CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) | - CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) | - CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) | - A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); - - uint32_t output; - switch (gs->shader->nir->info.gs.output_primitive) { - case GL_POINTS: - output = TESS_POINTS; - break; - case GL_LINE_STRIP: - output = TESS_LINES; - break; - case GL_TRIANGLE_STRIP: - output = TESS_CW_TRIS; - break; - default: - unreachable(""); - } - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); - OUT_RING(ring, - A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(gs->shader->nir->info.gs.vertices_out - 1) | - A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | - A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(gs->shader->nir->info.gs.invocations - 1)); - - OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) | - A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask)); - - OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9100, 1); - OUT_RING(ring, 0xff); - - OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1); - OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); - - const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs; - - /* Size of per-primitive alloction in ldlw memory in vec4s. 
*/ - uint32_t vec4_size = - gs->shader->nir->info.gs.vertices_in * - DIV_ROUND_UP(prev->output_size, 4); - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); - - OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); - OUT_RING(ring, prev->output_size); - } else { - OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); - OUT_RING(ring, 0); - OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); - OUT_RING(ring, 0); - - OUT_PKT4(ring, REG_A6XX_GRAS_VS_LAYER_CNTL, 1); - OUT_RING(ring, CONDREG(layer_regid, A6XX_GRAS_VS_LAYER_CNTL_WRITES_LAYER)); - } - - OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1); - OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | - A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | - A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); - - OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1); - OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | - A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); - - OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1); - OUT_RING(ring, 0); - - if (fs->instrlen) - fd6_emit_shader(ctx, ring, fs); - - OUT_REG(ring, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); - - uint32_t non_sysval_input_count = 0; - for (uint32_t i = 0; i < vs->inputs_count; i++) - if (!vs->inputs[i].sysval) - non_sysval_input_count++; - - OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1); - OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(non_sysval_input_count) | - A6XX_VFD_CONTROL_0_DECODE_CNT(non_sysval_input_count)); - - OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), non_sysval_input_count); - for (uint32_t i = 0; i < non_sysval_input_count; i++) { - assert(vs->inputs[i].compmask); - OUT_RING(ring, A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | - A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); - } - - OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); - OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | - A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | - A6XX_VFD_CONTROL_1_REGID4PRIMID(primitive_regid) | - 0xfc000000); - OUT_RING(ring, A6XX_VFD_CONTROL_2_REGID_HSPATCHID(hs_patch_regid) | - A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); - OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSPATCHID(ds_patch_regid) | - A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | - A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | - 0xfc); - OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ - OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) | - 0xfc00); /* VFD_CONTROL_5 */ - OUT_RING(ring, - COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */ - - if (!binning_pass) - fd6_emit_immediates(ctx->screen, fs, ring); + uint32_t pos_regid, psize_regid, color_regid[8], posz_regid; + uint32_t clip0_regid, clip1_regid; + uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid; + uint32_t smask_in_regid, smask_regid; + uint32_t stencilref_regid; + uint32_t vertex_regid, instance_regid, layer_regid, primitive_regid; + uint32_t hs_invocation_regid; + uint32_t tess_coord_x_regid, tess_coord_y_regid, hs_patch_regid, + ds_patch_regid; + uint32_t ij_regid[IJ_COUNT]; + uint32_t gs_header_regid; + enum a6xx_threadsize fssz; + uint8_t psize_loc = ~0, pos_loc = ~0, layer_loc = ~0; + uint8_t clip0_loc, clip1_loc; + int i, j; + + static const struct ir3_shader_variant dummy_fs = {0}; + const struct ir3_shader_variant *vs = binning_pass ? 
state->bs : state->vs; + const struct ir3_shader_variant *hs = state->hs; + const struct ir3_shader_variant *ds = state->ds; + const struct ir3_shader_variant *gs = state->gs; + const struct ir3_shader_variant *fs = binning_pass ? &dummy_fs : state->fs; + + /* binning VS is wrong when GS is present, so use nonbinning VS + * TODO: compile both binning VS/GS variants correctly + */ + if (binning_pass && state->gs) + vs = state->vs; + + bool sample_shading = fs->per_samp | key->sample_shading; + + fssz = fs->info.double_threadsize ? THREAD128 : THREAD64; + + pos_regid = ir3_find_output_regid(vs, VARYING_SLOT_POS); + psize_regid = ir3_find_output_regid(vs, VARYING_SLOT_PSIZ); + clip0_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST0); + clip1_regid = ir3_find_output_regid(vs, VARYING_SLOT_CLIP_DIST1); + layer_regid = ir3_find_output_regid(vs, VARYING_SLOT_LAYER); + vertex_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID); + instance_regid = ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID); + + if (hs) { + tess_coord_x_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD); + tess_coord_y_regid = next_regid(tess_coord_x_regid, 1); + hs_patch_regid = ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID); + ds_patch_regid = ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID); + hs_invocation_regid = + ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3); + + pos_regid = ir3_find_output_regid(ds, VARYING_SLOT_POS); + psize_regid = ir3_find_output_regid(ds, VARYING_SLOT_PSIZ); + clip0_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST0); + clip1_regid = ir3_find_output_regid(ds, VARYING_SLOT_CLIP_DIST1); + } else { + tess_coord_x_regid = regid(63, 0); + tess_coord_y_regid = regid(63, 0); + hs_patch_regid = regid(63, 0); + ds_patch_regid = regid(63, 0); + hs_invocation_regid = regid(63, 0); + } + + if (gs) { + gs_header_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3); + primitive_regid = ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID); + pos_regid = ir3_find_output_regid(gs, VARYING_SLOT_POS); + psize_regid = ir3_find_output_regid(gs, VARYING_SLOT_PSIZ); + clip0_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST0); + clip1_regid = ir3_find_output_regid(gs, VARYING_SLOT_CLIP_DIST1); + layer_regid = ir3_find_output_regid(gs, VARYING_SLOT_LAYER); + } else { + gs_header_regid = regid(63, 0); + primitive_regid = regid(63, 0); + } + + if (fs->color0_mrt) { + color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] = + color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] = + ir3_find_output_regid(fs, FRAG_RESULT_COLOR); + } else { + color_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0); + color_regid[1] = ir3_find_output_regid(fs, FRAG_RESULT_DATA1); + color_regid[2] = ir3_find_output_regid(fs, FRAG_RESULT_DATA2); + color_regid[3] = ir3_find_output_regid(fs, FRAG_RESULT_DATA3); + color_regid[4] = ir3_find_output_regid(fs, FRAG_RESULT_DATA4); + color_regid[5] = ir3_find_output_regid(fs, FRAG_RESULT_DATA5); + color_regid[6] = ir3_find_output_regid(fs, FRAG_RESULT_DATA6); + color_regid[7] = ir3_find_output_regid(fs, FRAG_RESULT_DATA7); + } + + samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID); + smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN); + face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE); + coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD); + zwcoord_regid = next_regid(coord_regid, 2); + posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH); + 
smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK); + stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL); + for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++) + ij_regid[i] = + ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i); + + /* If we have pre-dispatch texture fetches, then ij_pix should not + * be DCE'd, even if not actually used in the shader itself: + */ + if (fs->num_sampler_prefetch > 0) { + assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL])); + /* also, it seems like ij_pix is *required* to be r0.x */ + assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0)); + } + + /* we can't write gl_SampleMask for !msaa.. if b0 is zero then we + * end up masking the single sample!! + */ + if (!key->msaa) + smask_regid = regid(63, 0); + + /* we could probably divide this up into things that need to be + * emitted if frag-prog is dirty vs if vert-prog is dirty.. + */ + + OUT_PKT4(ring, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch); + OUT_RING(ring, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) | + A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) | + 0x7000); // XXX + for (int i = 0; i < fs->num_sampler_prefetch; i++) { + const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i]; + OUT_RING(ring, + A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) | + A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) | + A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) | + A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) | + A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) | + COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) | + A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd)); + } + + OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_SP_MODE_CONTROL, 1); + OUT_RING(ring, A6XX_SP_MODE_CONTROL_CONSTANT_DEMOTION_ENABLE | 4); + + bool fs_has_dual_src_color = + !binning_pass && fs->shader->nir->info.fs.color_is_dual_source; + + OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1); + OUT_RING(ring, + A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) | + A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) | + A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) | + COND(fs_has_dual_src_color, + A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE)); + + OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1); + OUT_RING(ring, A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vs->info.max_reg + 1) | + A6XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT( + vs->info.max_half_reg + 1) | + COND(vs->mergedregs, A6XX_SP_VS_CTRL_REG0_MERGEDREGS) | + A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(vs->branchstack)); + + fd6_emit_shader(ctx, ring, vs); + fd6_emit_immediates(ctx->screen, vs, ring); + + struct ir3_shader_linkage l = {0}; + const struct ir3_shader_variant *last_shader = fd6_last_shader(state); + + bool do_streamout = (last_shader->shader->stream_output.num_outputs > 0); + uint8_t clip_mask = last_shader->clip_mask, + cull_mask = last_shader->cull_mask; + uint8_t clip_cull_mask = clip_mask | cull_mask; + + /* If we have streamout, link against the real FS, rather than the + * dummy FS used for binning pass state, to ensure the OUTLOC's + * match. Depending on whether we end up doing sysmem or gmem, + * the actual streamout could happen with either the binning pass + * or draw pass program, but the same streamout stateobj is used + * in either case: + */ + ir3_link_shaders(&l, last_shader, do_streamout ? 
state->fs : fs, true); + + bool primid_passthru = l.primid_loc != 0xff; + clip0_loc = l.clip0_loc; + clip1_loc = l.clip1_loc; + + OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4); + OUT_RING(ring, ~l.varmask[0]); /* VPC_VAR[0].DISABLE */ + OUT_RING(ring, ~l.varmask[1]); /* VPC_VAR[1].DISABLE */ + OUT_RING(ring, ~l.varmask[2]); /* VPC_VAR[2].DISABLE */ + OUT_RING(ring, ~l.varmask[3]); /* VPC_VAR[3].DISABLE */ + + /* Add stream out outputs after computing the VPC_VAR_DISABLE bitmask. */ + ir3_link_stream_out(&l, last_shader); + + if (VALIDREG(layer_regid)) { + layer_loc = l.max_loc; + ir3_link_add(&l, layer_regid, 0x1, l.max_loc); + } + + if (VALIDREG(pos_regid)) { + pos_loc = l.max_loc; + ir3_link_add(&l, pos_regid, 0xf, l.max_loc); + } + + if (VALIDREG(psize_regid)) { + psize_loc = l.max_loc; + ir3_link_add(&l, psize_regid, 0x1, l.max_loc); + } + + /* Handle the case where clip/cull distances aren't read by the FS. Make + * sure to avoid adding an output with an empty writemask if the user + * disables all the clip distances in the API so that the slot is unused. + */ + if (clip0_loc == 0xff && VALIDREG(clip0_regid) && + (clip_cull_mask & 0xf) != 0) { + clip0_loc = l.max_loc; + ir3_link_add(&l, clip0_regid, clip_cull_mask & 0xf, l.max_loc); + } + + if (clip1_loc == 0xff && VALIDREG(clip1_regid) && + (clip_cull_mask >> 4) != 0) { + clip1_loc = l.max_loc; + ir3_link_add(&l, clip1_regid, clip_cull_mask >> 4, l.max_loc); + } + + /* If we have stream-out, we use the full shader for binning + * pass, rather than the optimized binning pass one, so that we + * have all the varying outputs available for xfb. So streamout + * state should always be derived from the non-binning pass + * program: + */ + if (do_streamout && !binning_pass) { + setup_stream_out(state, last_shader, &l); + } + + debug_assert(l.cnt <= 32); + if (gs) + OUT_PKT4(ring, REG_A6XX_SP_GS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); + else if (ds) + OUT_PKT4(ring, REG_A6XX_SP_DS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); + else + OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(0), DIV_ROUND_UP(l.cnt, 2)); + + for (j = 0; j < l.cnt;) { + uint32_t reg = 0; + + reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid); + reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask); + j++; + + reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid); + reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask); + j++; + + OUT_RING(ring, reg); + } + + if (gs) + OUT_PKT4(ring, REG_A6XX_SP_GS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); + else if (ds) + OUT_PKT4(ring, REG_A6XX_SP_DS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); + else + OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(0), DIV_ROUND_UP(l.cnt, 4)); + + for (j = 0; j < l.cnt;) { + uint32_t reg = 0; + + reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc); + reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc); + reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc); + reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc); + + OUT_RING(ring, reg); + } + + if (hs) { + assert(vs->mergedregs == hs->mergedregs); + OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1); + OUT_RING( + ring, + A6XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(hs->info.max_reg + 1) | + A6XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT(hs->info.max_half_reg + 1) | + A6XX_SP_HS_CTRL_REG0_BRANCHSTACK(hs->branchstack)); + + fd6_emit_shader(ctx, ring, hs); + fd6_emit_immediates(ctx->screen, hs, ring); + fd6_emit_link_map(ctx->screen, vs, hs, ring); + + OUT_PKT4(ring, REG_A6XX_SP_DS_CTRL_REG0, 1); + OUT_RING( + ring, + A6XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(ds->info.max_reg + 1) | + 
A6XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT(ds->info.max_half_reg + 1) | + COND(ds->mergedregs, A6XX_SP_DS_CTRL_REG0_MERGEDREGS) | + A6XX_SP_DS_CTRL_REG0_BRANCHSTACK(ds->branchstack)); + + fd6_emit_shader(ctx, ring, ds); + fd6_emit_immediates(ctx->screen, ds, ring); + fd6_emit_link_map(ctx->screen, hs, ds, ring); + + shader_info *hs_info = &hs->shader->nir->info; + OUT_PKT4(ring, REG_A6XX_PC_TESS_NUM_VERTEX, 1); + OUT_RING(ring, hs_info->tess.tcs_vertices_out); + + /* Total attribute slots in HS incoming patch. */ + OUT_PKT4(ring, REG_A6XX_PC_HS_INPUT_SIZE, 1); + OUT_RING(ring, hs_info->tess.tcs_vertices_out * vs->output_size / 4); + + const uint32_t wavesize = 64; + const uint32_t max_wave_input_size = 64; + const uint32_t patch_control_points = hs_info->tess.tcs_vertices_out; + + /* note: if HS is really just the VS extended, then this + * should be by MAX2(patch_control_points, hs_info->tess.tcs_vertices_out) + * however that doesn't match the blob, and fails some dEQP tests. + */ + uint32_t prims_per_wave = wavesize / hs_info->tess.tcs_vertices_out; + uint32_t max_prims_per_wave = max_wave_input_size * wavesize / + (vs->output_size * patch_control_points); + prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave); + + uint32_t total_size = + vs->output_size * patch_control_points * prims_per_wave; + uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize); + + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, wave_input_size); + + shader_info *ds_info = &ds->shader->nir->info; + OUT_PKT4(ring, REG_A6XX_PC_TESS_CNTL, 1); + uint32_t output; + if (ds_info->tess.point_mode) + output = TESS_POINTS; + else if (ds_info->tess.primitive_mode == GL_ISOLINES) + output = TESS_LINES; + else if (ds_info->tess.ccw) + output = TESS_CCW_TRIS; + else + output = TESS_CW_TRIS; + + OUT_RING(ring, A6XX_PC_TESS_CNTL_SPACING( + fd6_gl2spacing(ds_info->tess.spacing)) | + A6XX_PC_TESS_CNTL_OUTPUT(output)); + + OUT_PKT4(ring, REG_A6XX_VPC_DS_CLIP_CNTL, 1); + OUT_RING(ring, A6XX_VPC_DS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | + A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A6XX_VPC_DS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + + OUT_PKT4(ring, REG_A6XX_VPC_DS_LAYER_CNTL, 1); + OUT_RING(ring, 0x0000ffff); + + OUT_PKT4(ring, REG_A6XX_GRAS_DS_LAYER_CNTL, 1); + OUT_RING(ring, 0x0); + + OUT_PKT4(ring, REG_A6XX_GRAS_DS_CL_CNTL, 1); + OUT_RING(ring, A6XX_GRAS_DS_CL_CNTL_CLIP_MASK(clip_mask) | + A6XX_GRAS_DS_CL_CNTL_CULL_MASK(cull_mask)); + + OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); + OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | + A6XX_VPC_VS_PACK_PSIZELOC(255) | + A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); + + OUT_PKT4(ring, REG_A6XX_VPC_DS_PACK, 1); + OUT_RING(ring, A6XX_VPC_DS_PACK_POSITIONLOC(pos_loc) | + A6XX_VPC_DS_PACK_PSIZELOC(psize_loc) | + A6XX_VPC_DS_PACK_STRIDE_IN_VPC(l.max_loc)); + + OUT_PKT4(ring, REG_A6XX_SP_DS_PRIMITIVE_CNTL, 1); + OUT_RING(ring, A6XX_SP_DS_PRIMITIVE_CNTL_OUT(l.cnt)); + + OUT_PKT4(ring, REG_A6XX_PC_DS_OUT_CNTL, 1); + OUT_RING(ring, A6XX_PC_DS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | + CONDREG(psize_regid, A6XX_PC_DS_OUT_CNTL_PSIZE) | + A6XX_PC_DS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + + } else { + OUT_PKT4(ring, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1); + OUT_RING(ring, 0); + } + + OUT_PKT4(ring, REG_A6XX_SP_VS_PRIMITIVE_CNTL, 1); + OUT_RING(ring, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(l.cnt)); + + bool enable_varyings = fs->total_in > 0; + + OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1); + OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs->total_in) | + COND(enable_varyings, 
A6XX_VPC_CNTL_0_VARYING) | + A6XX_VPC_CNTL_0_PRIMIDLOC(l.primid_loc) | + A6XX_VPC_CNTL_0_VIEWIDLOC(0xff)); + + OUT_PKT4(ring, REG_A6XX_PC_VS_OUT_CNTL, 1); + OUT_RING(ring, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | + CONDREG(psize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) | + CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) | + A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_3, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5); + OUT_RING(ring, 0x7); /* XXX */ + OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) | + A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) | + A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) | + A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_regid[IJ_PERSP_SIZE])); + OUT_RING( + ring, + A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) | + A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) | + A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID( + ij_regid[IJ_PERSP_CENTROID]) | + A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID( + ij_regid[IJ_LINEAR_CENTROID])); + OUT_RING( + ring, + A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) | + A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) | + A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) | + A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE])); + OUT_RING(ring, 0xfc); /* XXX */ + + OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL_0, 1); + OUT_RING(ring, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(fssz) | + COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS)); + + OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1); + OUT_RING( + ring, + A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) | 0x1000000 | + A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fs->info.max_reg + 1) | + A6XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fs->info.max_half_reg + 1) | + COND(fs->mergedregs, A6XX_SP_FS_CTRL_REG0_MERGEDREGS) | + A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(fs->branchstack) | + COND(fs->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE)); + + OUT_PKT4(ring, REG_A6XX_VPC_VS_LAYER_CNTL, 1); + OUT_RING(ring, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) | + A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(0xff)); + + bool need_size = fs->frag_face || fs->fragcoord_compmask != 0; + bool need_size_persamp = false; + if (VALIDREG(ij_regid[IJ_PERSP_SIZE])) { + if (sample_shading) + need_size_persamp = true; + else + need_size = true; + } + if (VALIDREG(ij_regid[IJ_LINEAR_PIXEL])) + need_size = true; + + /* XXX: enable bits for linear centroid and linear sample bary */ + + OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1); + OUT_RING( + ring, + CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], + A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) | + CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) | + COND(need_size, A6XX_GRAS_CNTL_SIZE) | + COND(need_size_persamp, A6XX_GRAS_CNTL_SIZE_PERSAMP) | + COND(fs->fragcoord_compmask != 0, + A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask))); + + OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2); + OUT_RING( + ring, + CONDREG(ij_regid[IJ_PERSP_PIXEL], + A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) | + CONDREG(ij_regid[IJ_PERSP_CENTROID], + A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) | + CONDREG(ij_regid[IJ_PERSP_SAMPLE], + A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) | + COND(need_size, A6XX_RB_RENDER_CONTROL0_SIZE) | + COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) | + COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) | + COND(fs->fragcoord_compmask != 0, + 
A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask))); + + OUT_RING(ring, + CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) | + CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) | + CONDREG(ij_regid[IJ_PERSP_SIZE], A6XX_RB_RENDER_CONTROL1_SIZE) | + COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS)); + + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1); + OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE)); + + OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8101, 1); + OUT_RING(ring, COND(sample_shading, 0x6)); // XXX + + OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1); + OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE)); + + OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8); + for (i = 0; i < 8; i++) { + OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) | + COND(color_regid[i] & HALF_REG_ID, + A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)); + if (VALIDREG(color_regid[i])) { + state->mrt_components |= 0xf << (i * 4); + } + } + + /* dual source blending has an extra fs output in the 2nd slot */ + if (fs_has_dual_src_color) { + state->mrt_components |= 0xf << 4; + } + + OUT_PKT4(ring, REG_A6XX_VPC_VS_PACK, 1); + OUT_RING(ring, A6XX_VPC_VS_PACK_POSITIONLOC(pos_loc) | + A6XX_VPC_VS_PACK_PSIZELOC(psize_loc) | + A6XX_VPC_VS_PACK_STRIDE_IN_VPC(l.max_loc)); + + if (gs) { + assert(gs->mergedregs == (ds ? ds->mergedregs : vs->mergedregs)); + OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1); + OUT_RING( + ring, + A6XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(gs->info.max_reg + 1) | + A6XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT(gs->info.max_half_reg + 1) | + A6XX_SP_GS_CTRL_REG0_BRANCHSTACK(gs->branchstack)); + + fd6_emit_shader(ctx, ring, gs); + fd6_emit_immediates(ctx->screen, gs, ring); + if (ds) + fd6_emit_link_map(ctx->screen, ds, gs, ring); + else + fd6_emit_link_map(ctx->screen, vs, gs, ring); + + OUT_PKT4(ring, REG_A6XX_VPC_GS_PACK, 1); + OUT_RING(ring, A6XX_VPC_GS_PACK_POSITIONLOC(pos_loc) | + A6XX_VPC_GS_PACK_PSIZELOC(psize_loc) | + A6XX_VPC_GS_PACK_STRIDE_IN_VPC(l.max_loc)); + + OUT_PKT4(ring, REG_A6XX_VPC_GS_LAYER_CNTL, 1); + OUT_RING(ring, A6XX_VPC_GS_LAYER_CNTL_LAYERLOC(layer_loc) | 0xff00); + + OUT_PKT4(ring, REG_A6XX_GRAS_GS_LAYER_CNTL, 1); + OUT_RING(ring, + CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER)); + + uint32_t flags_regid = + ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3); + + OUT_PKT4(ring, REG_A6XX_SP_GS_PRIMITIVE_CNTL, 1); + OUT_RING(ring, A6XX_SP_GS_PRIMITIVE_CNTL_OUT(l.cnt) | + A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid)); + + OUT_PKT4(ring, REG_A6XX_PC_GS_OUT_CNTL, 1); + OUT_RING(ring, + A6XX_PC_GS_OUT_CNTL_STRIDE_IN_VPC(l.max_loc) | + CONDREG(psize_regid, A6XX_PC_GS_OUT_CNTL_PSIZE) | + CONDREG(layer_regid, A6XX_PC_GS_OUT_CNTL_LAYER) | + CONDREG(primitive_regid, A6XX_PC_GS_OUT_CNTL_PRIMITIVE_ID) | + A6XX_PC_GS_OUT_CNTL_CLIP_MASK(clip_cull_mask)); + + uint32_t output; + switch (gs->shader->nir->info.gs.output_primitive) { + case GL_POINTS: + output = TESS_POINTS; + break; + case GL_LINE_STRIP: + output = TESS_LINES; + break; + case GL_TRIANGLE_STRIP: + output = TESS_CW_TRIS; + break; + default: + unreachable(""); + } + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1); + OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT( + gs->shader->nir->info.gs.vertices_out - 1) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) | + A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS( + gs->shader->nir->info.gs.invocations - 1)); + + OUT_PKT4(ring, REG_A6XX_GRAS_GS_CL_CNTL, 1); + OUT_RING(ring, 
A6XX_GRAS_GS_CL_CNTL_CLIP_MASK(clip_mask) | + A6XX_GRAS_GS_CL_CNTL_CULL_MASK(cull_mask)); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9100, 1); + OUT_RING(ring, 0xff); + + OUT_PKT4(ring, REG_A6XX_VPC_GS_CLIP_CNTL, 1); + OUT_RING(ring, A6XX_VPC_GS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | + A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A6XX_VPC_GS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + + const struct ir3_shader_variant *prev = state->ds ? state->ds : state->vs; + + /* Size of per-primitive alloction in ldlw memory in vec4s. */ + uint32_t vec4_size = gs->shader->nir->info.gs.vertices_in * + DIV_ROUND_UP(prev->output_size, 4); + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size)); + + OUT_PKT4(ring, REG_A6XX_PC_MULTIVIEW_CNTL, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); + OUT_RING(ring, prev->output_size); + } else { + OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1); + OUT_RING(ring, 0); + OUT_PKT4(ring, REG_A6XX_SP_GS_PRIM_SIZE, 1); + OUT_RING(ring, 0); + + OUT_PKT4(ring, REG_A6XX_GRAS_VS_LAYER_CNTL, 1); + OUT_RING(ring, + CONDREG(layer_regid, A6XX_GRAS_VS_LAYER_CNTL_WRITES_LAYER)); + } + + OUT_PKT4(ring, REG_A6XX_VPC_VS_CLIP_CNTL, 1); + OUT_RING(ring, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) | + A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) | + A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc)); + + OUT_PKT4(ring, REG_A6XX_GRAS_VS_CL_CNTL, 1); + OUT_RING(ring, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(clip_mask) | + A6XX_GRAS_VS_CL_CNTL_CULL_MASK(cull_mask)); + + OUT_PKT4(ring, REG_A6XX_VPC_UNKNOWN_9107, 1); + OUT_RING(ring, 0); + + if (fs->instrlen) + fd6_emit_shader(ctx, ring, fs); + + OUT_REG(ring, A6XX_PC_PRIMID_PASSTHRU(primid_passthru)); + + uint32_t non_sysval_input_count = 0; + for (uint32_t i = 0; i < vs->inputs_count; i++) + if (!vs->inputs[i].sysval) + non_sysval_input_count++; + + OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1); + OUT_RING(ring, A6XX_VFD_CONTROL_0_FETCH_CNT(non_sysval_input_count) | + A6XX_VFD_CONTROL_0_DECODE_CNT(non_sysval_input_count)); + + OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(0), non_sysval_input_count); + for (uint32_t i = 0; i < non_sysval_input_count; i++) { + assert(vs->inputs[i].compmask); + OUT_RING(ring, + A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vs->inputs[i].compmask) | + A6XX_VFD_DEST_CNTL_INSTR_REGID(vs->inputs[i].regid)); + } + + OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6); + OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) | + A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) | + A6XX_VFD_CONTROL_1_REGID4PRIMID(primitive_regid) | + 0xfc000000); + OUT_RING(ring, + A6XX_VFD_CONTROL_2_REGID_HSPATCHID(hs_patch_regid) | + A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid)); + OUT_RING(ring, A6XX_VFD_CONTROL_3_REGID_DSPATCHID(ds_patch_regid) | + A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) | + A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) | 0xfc); + OUT_RING(ring, 0x000000fc); /* VFD_CONTROL_4 */ + OUT_RING(ring, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gs_header_regid) | + 0xfc00); /* VFD_CONTROL_5 */ + OUT_RING(ring, COND(primid_passthru, + A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */ + + if (!binning_pass) + fd6_emit_immediates(ctx->screen, fs, ring); } -static void emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs, - bool rasterflat, bool sprite_coord_mode, uint32_t sprite_coord_enable); +static void emit_interp_state(struct fd_ringbuffer *ring, + struct ir3_shader_variant *fs, bool rasterflat, + bool 
sprite_coord_mode, + uint32_t sprite_coord_enable); static struct fd_ringbuffer * create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); - emit_interp_state(ring, state->fs, false, false, 0); + emit_interp_state(ring, state->fs, false, false, 0); - return ring; + return ring; } /* build the program streaming state which is not part of the pre- @@ -962,171 +991,167 @@ create_interp_stateobj(struct fd_context *ctx, struct fd6_program_state *state) struct fd_ringbuffer * fd6_program_interp_state(struct fd6_emit *emit) { - const struct fd6_program_state *state = fd6_emit_get_prog(emit); + const struct fd6_program_state *state = fd6_emit_get_prog(emit); - if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) { - /* fastpath: */ - return fd_ringbuffer_ref(state->interp_stateobj); - } else { - struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( - emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING); + if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) { + /* fastpath: */ + return fd_ringbuffer_ref(state->interp_stateobj); + } else { + struct fd_ringbuffer *ring = fd_submit_new_ringbuffer( + emit->ctx->batch->submit, 18 * 4, FD_RINGBUFFER_STREAMING); - emit_interp_state(ring, state->fs, emit->rasterflat, - emit->sprite_coord_mode, emit->sprite_coord_enable); + emit_interp_state(ring, state->fs, emit->rasterflat, + emit->sprite_coord_mode, emit->sprite_coord_enable); - return ring; - } + return ring; + } } static void emit_interp_state(struct fd_ringbuffer *ring, struct ir3_shader_variant *fs, - bool rasterflat, bool sprite_coord_mode, uint32_t sprite_coord_enable) + bool rasterflat, bool sprite_coord_mode, + uint32_t sprite_coord_enable) { - uint32_t vinterp[8], vpsrepl[8]; - - memset(vinterp, 0, sizeof(vinterp)); - memset(vpsrepl, 0, sizeof(vpsrepl)); - - for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count; ) { - - /* NOTE: varyings are packed, so if compmask is 0xb - * then first, third, and fourth component occupy - * three consecutive varying slots: - */ - unsigned compmask = fs->inputs[j].compmask; - - uint32_t inloc = fs->inputs[j].inloc; - - if (fs->inputs[j].flat || - (fs->inputs[j].rasterflat && rasterflat)) { - uint32_t loc = inloc; - - for (int i = 0; i < 4; i++) { - if (compmask & (1 << i)) { - vinterp[loc / 16] |= 1 << ((loc % 16) * 2); - loc++; - } - } - } - - bool coord_mode = sprite_coord_mode; - if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) { - /* mask is two 2-bit fields, where: - * '01' -> S - * '10' -> T - * '11' -> 1 - T (flip mode) - */ - unsigned mask = coord_mode ? 
0b1101 : 0b1001; - uint32_t loc = inloc; - if (compmask & 0x1) { - vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x2) { - vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x4) { - /* .z <- 0.0f */ - vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); - loc++; - } - if (compmask & 0x8) { - /* .w <- 1.0f */ - vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); - loc++; - } - } - } - - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); - for (int i = 0; i < 8; i++) - OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ - - OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); - for (int i = 0; i < 8; i++) - OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ + uint32_t vinterp[8], vpsrepl[8]; + + memset(vinterp, 0, sizeof(vinterp)); + memset(vpsrepl, 0, sizeof(vpsrepl)); + + for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count;) { + + /* NOTE: varyings are packed, so if compmask is 0xb + * then first, third, and fourth component occupy + * three consecutive varying slots: + */ + unsigned compmask = fs->inputs[j].compmask; + + uint32_t inloc = fs->inputs[j].inloc; + + if (fs->inputs[j].flat || (fs->inputs[j].rasterflat && rasterflat)) { + uint32_t loc = inloc; + + for (int i = 0; i < 4; i++) { + if (compmask & (1 << i)) { + vinterp[loc / 16] |= 1 << ((loc % 16) * 2); + loc++; + } + } + } + + bool coord_mode = sprite_coord_mode; + if (ir3_point_sprite(fs, j, sprite_coord_enable, &coord_mode)) { + /* mask is two 2-bit fields, where: + * '01' -> S + * '10' -> T + * '11' -> 1 - T (flip mode) + */ + unsigned mask = coord_mode ? 0b1101 : 0b1001; + uint32_t loc = inloc; + if (compmask & 0x1) { + vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x2) { + vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x4) { + /* .z <- 0.0f */ + vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2); + loc++; + } + if (compmask & 0x8) { + /* .w <- 1.0f */ + vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2); + loc++; + } + } + } + + OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8); + for (int i = 0; i < 8; i++) + OUT_RING(ring, vinterp[i]); /* VPC_VARYING_INTERP[i].MODE */ + + OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8); + for (int i = 0; i < 8; i++) + OUT_RING(ring, vpsrepl[i]); /* VPC_VARYING_PS_REPL[i] */ } static struct ir3_program_state * fd6_program_create(void *data, struct ir3_shader_variant *bs, - struct ir3_shader_variant *vs, - struct ir3_shader_variant *hs, - struct ir3_shader_variant *ds, - struct ir3_shader_variant *gs, - struct ir3_shader_variant *fs, - const struct ir3_shader_key *key) - in_dt + struct ir3_shader_variant *vs, struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, struct ir3_shader_variant *gs, + struct ir3_shader_variant *fs, + const struct ir3_shader_key *key) in_dt { - struct fd_context *ctx = fd_context(data); - struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state); - - tc_assert_driver_thread(ctx->tc); - - /* if we have streamout, use full VS in binning pass, as the - * binning pass VS will have outputs on other than position/psize - * stripped out: - */ - state->bs = vs->shader->stream_output.num_outputs ? 
vs : bs; - state->vs = vs; - state->hs = hs; - state->ds = ds; - state->gs = gs; - state->fs = fs; - state->config_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - state->streamout_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - + struct fd_context *ctx = fd_context(data); + struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state); + + tc_assert_driver_thread(ctx->tc); + + /* if we have streamout, use full VS in binning pass, as the + * binning pass VS will have outputs on other than position/psize + * stripped out: + */ + state->bs = vs->shader->stream_output.num_outputs ? vs : bs; + state->vs = vs; + state->hs = hs; + state->ds = ds; + state->gs = gs; + state->fs = fs; + state->config_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); + state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); + state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); + state->streamout_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); #ifdef DEBUG - if (!ds) { - for (unsigned i = 0; i < bs->inputs_count; i++) { - if (vs->inputs[i].sysval) - continue; - debug_assert(bs->inputs[i].regid == vs->inputs[i].regid); - } - } + if (!ds) { + for (unsigned i = 0; i < bs->inputs_count; i++) { + if (vs->inputs[i].sysval) + continue; + debug_assert(bs->inputs[i].regid == vs->inputs[i].regid); + } + } #endif - setup_config_stateobj(state->config_stateobj, state); - setup_stateobj(state->binning_stateobj, ctx, state, key, true); - setup_stateobj(state->stateobj, ctx, state, key, false); - state->interp_stateobj = create_interp_stateobj(ctx, state); + setup_config_stateobj(state->config_stateobj, state); + setup_stateobj(state->binning_stateobj, ctx, state, key, true); + setup_stateobj(state->stateobj, ctx, state, key, false); + state->interp_stateobj = create_interp_stateobj(ctx, state); - struct ir3_stream_output_info *stream_output = - &fd6_last_shader(state)->shader->stream_output; - if (stream_output->num_outputs > 0) - state->stream_output = stream_output; + struct ir3_stream_output_info *stream_output = + &fd6_last_shader(state)->shader->stream_output; + if (stream_output->num_outputs > 0) + state->stream_output = stream_output; - return &state->base; + return &state->base; } static void fd6_program_destroy(void *data, struct ir3_program_state *state) { - struct fd6_program_state *so = fd6_program_state(state); - fd_ringbuffer_del(so->stateobj); - fd_ringbuffer_del(so->binning_stateobj); - fd_ringbuffer_del(so->config_stateobj); - fd_ringbuffer_del(so->interp_stateobj); - fd_ringbuffer_del(so->streamout_stateobj); - free(so); + struct fd6_program_state *so = fd6_program_state(state); + fd_ringbuffer_del(so->stateobj); + fd_ringbuffer_del(so->binning_stateobj); + fd_ringbuffer_del(so->config_stateobj); + fd_ringbuffer_del(so->interp_stateobj); + fd_ringbuffer_del(so->streamout_stateobj); + free(so); } static const struct ir3_cache_funcs cache_funcs = { - .create_state = fd6_program_create, - .destroy_state = fd6_program_destroy, + .create_state = fd6_program_create, + .destroy_state = fd6_program_destroy, }; void fd6_prog_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); + ctx->shader_cache = ir3_cache_create(&cache_funcs, ctx); - ir3_prog_init(pctx); + ir3_prog_init(pctx); - 
fd_prog_init(pctx); + fd_prog_init(pctx); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_program.h b/src/gallium/drivers/freedreno/a6xx/fd6_program.h index 317a77b..0bfd9ea 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_program.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_program.h @@ -37,50 +37,50 @@ struct fd6_emit; struct fd6_program_state { - struct ir3_program_state base; - struct ir3_shader_variant *bs; /* binning pass vs */ - struct ir3_shader_variant *vs; - struct ir3_shader_variant *hs; - struct ir3_shader_variant *ds; - struct ir3_shader_variant *gs; - struct ir3_shader_variant *fs; - struct fd_ringbuffer *config_stateobj; - struct fd_ringbuffer *interp_stateobj; - struct fd_ringbuffer *binning_stateobj; - struct fd_ringbuffer *streamout_stateobj; - struct fd_ringbuffer *stateobj; + struct ir3_program_state base; + struct ir3_shader_variant *bs; /* binning pass vs */ + struct ir3_shader_variant *vs; + struct ir3_shader_variant *hs; + struct ir3_shader_variant *ds; + struct ir3_shader_variant *gs; + struct ir3_shader_variant *fs; + struct fd_ringbuffer *config_stateobj; + struct fd_ringbuffer *interp_stateobj; + struct fd_ringbuffer *binning_stateobj; + struct fd_ringbuffer *streamout_stateobj; + struct fd_ringbuffer *stateobj; - struct ir3_stream_output_info *stream_output; + struct ir3_stream_output_info *stream_output; - /** - * Output components from frag shader. It is possible to have - * a fragment shader that only writes a subset of the bound - * render targets. - */ - uint32_t mrt_components; + /** + * Output components from frag shader. It is possible to have + * a fragment shader that only writes a subset of the bound + * render targets. + */ + uint32_t mrt_components; }; static inline struct fd6_program_state * fd6_program_state(struct ir3_program_state *state) { - return (struct fd6_program_state *)state; + return (struct fd6_program_state *)state; } static inline const struct ir3_shader_variant * fd6_last_shader(const struct fd6_program_state *state) { - if (state->gs) - return state->gs; - else if (state->ds) - return state->ds; - else - return state->vs; + if (state->gs) + return state->gs; + else if (state->ds) + return state->ds; + else + return state->vs; } void fd6_emit_shader(struct fd_context *ctx, struct fd_ringbuffer *ring, - const struct ir3_shader_variant *so) assert_dt; + const struct ir3_shader_variant *so) assert_dt; -struct fd_ringbuffer * fd6_program_interp_state(struct fd6_emit *emit) assert_dt; +struct fd_ringbuffer *fd6_program_interp_state(struct fd6_emit *emit) assert_dt; void fd6_prog_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_query.c b/src/gallium/drivers/freedreno/a6xx/fd6_query.c index d7060df..cb02ffe 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_query.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_query.c @@ -36,21 +36,20 @@ #include "fd6_query.h" struct PACKED fd6_query_sample { - uint64_t start; - uint64_t result; - uint64_t stop; + uint64_t start; + uint64_t result; + uint64_t stop; }; /* offset of a single field of an array of fd6_query_sample: */ -#define query_sample_idx(aq, idx, field) \ - fd_resource((aq)->prsc)->bo, \ - (idx * sizeof(struct fd6_query_sample)) + \ - offsetof(struct fd6_query_sample, field), \ - 0, 0 +#define query_sample_idx(aq, idx, field) \ + fd_resource((aq)->prsc)->bo, \ + (idx * sizeof(struct fd6_query_sample)) + \ + offsetof(struct fd6_query_sample, field), \ + 0, 0 /* offset of a single field of fd6_query_sample: */ -#define query_sample(aq, field) 
\ - query_sample_idx(aq, 0, field) +#define query_sample(aq, field) query_sample_idx(aq, 0, field) /* * Occlusion Query: @@ -62,96 +61,94 @@ struct PACKED fd6_query_sample { static void occlusion_resume(struct fd_acc_query *aq, struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); - OUT_RELOC(ring, query_sample(aq, start)); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); + OUT_RELOC(ring, query_sample(aq, start)); - fd6_event_write(batch, ring, ZPASS_DONE, false); + fd6_event_write(batch, ring, ZPASS_DONE, false); - fd6_context(batch->ctx)->samples_passed_queries++; + fd6_context(batch->ctx)->samples_passed_queries++; } static void -occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +occlusion_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - OUT_PKT7(ring, CP_MEM_WRITE, 4); - OUT_RELOC(ring, query_sample(aq, stop)); - OUT_RING(ring, 0xffffffff); - OUT_RING(ring, 0xffffffff); + OUT_PKT7(ring, CP_MEM_WRITE, 4); + OUT_RELOC(ring, query_sample(aq, stop)); + OUT_RING(ring, 0xffffffff); + OUT_RING(ring, 0xffffffff); - OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); + OUT_PKT7(ring, CP_WAIT_MEM_WRITES, 0); - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); - OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_CONTROL, 1); + OUT_RING(ring, A6XX_RB_SAMPLE_COUNT_CONTROL_COPY); - OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); - OUT_RELOC(ring, query_sample(aq, stop)); + OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_COUNT_ADDR, 2); + OUT_RELOC(ring, query_sample(aq, stop)); - fd6_event_write(batch, ring, ZPASS_DONE, false); + fd6_event_write(batch, ring, ZPASS_DONE, false); - /* To avoid stalling in the draw buffer, emit code the code to compute the - * counter delta in the epilogue ring. - */ - struct fd_ringbuffer *epilogue = fd_batch_get_epilogue(batch); - fd_wfi(batch, epilogue); + /* To avoid stalling in the draw buffer, emit code the code to compute the + * counter delta in the epilogue ring. 
+ */ + struct fd_ringbuffer *epilogue = fd_batch_get_epilogue(batch); + fd_wfi(batch, epilogue); - /* result += stop - start: */ - OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9); - OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */ - OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */ - OUT_RELOC(epilogue, query_sample(aq, stop)); /* srcB */ - OUT_RELOC(epilogue, query_sample(aq, start)); /* srcC */ + /* result += stop - start: */ + OUT_PKT7(epilogue, CP_MEM_TO_MEM, 9); + OUT_RING(epilogue, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + OUT_RELOC(epilogue, query_sample(aq, result)); /* dst */ + OUT_RELOC(epilogue, query_sample(aq, result)); /* srcA */ + OUT_RELOC(epilogue, query_sample(aq, stop)); /* srcB */ + OUT_RELOC(epilogue, query_sample(aq, start)); /* srcC */ - fd6_context(batch->ctx)->samples_passed_queries--; + fd6_context(batch->ctx)->samples_passed_queries--; } static void occlusion_counter_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd6_query_sample *sp = buf; - result->u64 = sp->result; + struct fd6_query_sample *sp = buf; + result->u64 = sp->result; } static void occlusion_predicate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd6_query_sample *sp = buf; - result->b = !!sp->result; + struct fd6_query_sample *sp = buf; + result->b = !!sp->result; } static const struct fd_acc_sample_provider occlusion_counter = { - .query_type = PIPE_QUERY_OCCLUSION_COUNTER, - .size = sizeof(struct fd6_query_sample), - .resume = occlusion_resume, - .pause = occlusion_pause, - .result = occlusion_counter_result, + .query_type = PIPE_QUERY_OCCLUSION_COUNTER, + .size = sizeof(struct fd6_query_sample), + .resume = occlusion_resume, + .pause = occlusion_pause, + .result = occlusion_counter_result, }; static const struct fd_acc_sample_provider occlusion_predicate = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, - .size = sizeof(struct fd6_query_sample), - .resume = occlusion_resume, - .pause = occlusion_pause, - .result = occlusion_predicate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE, + .size = sizeof(struct fd6_query_sample), + .resume = occlusion_resume, + .pause = occlusion_pause, + .result = occlusion_predicate_result, }; static const struct fd_acc_sample_provider occlusion_predicate_conservative = { - .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, - .size = sizeof(struct fd6_query_sample), - .resume = occlusion_resume, - .pause = occlusion_pause, - .result = occlusion_predicate_result, + .query_type = PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE, + .size = sizeof(struct fd6_query_sample), + .resume = occlusion_resume, + .pause = occlusion_pause, + .result = occlusion_predicate_result, }; /* @@ -161,92 +158,90 @@ static const struct fd_acc_sample_provider occlusion_predicate_conservative = { static void timestamp_resume(struct fd_acc_query *aq, struct fd_batch *batch) { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | - CP_EVENT_WRITE_0_TIMESTAMP); - OUT_RELOC(ring, query_sample(aq, start)); - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, + CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP); + OUT_RELOC(ring, query_sample(aq, start)); + OUT_RING(ring, 
0x00000000); - fd_reset_wfi(batch); + fd_reset_wfi(batch); } static void -time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +time_elapsed_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; - - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | - CP_EVENT_WRITE_0_TIMESTAMP); - OUT_RELOC(ring, query_sample(aq, stop)); - OUT_RING(ring, 0x00000000); - - fd_reset_wfi(batch); - fd_wfi(batch, ring); - - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample(aq, result)); /* dst */ - OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ - OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ - OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ + struct fd_ringbuffer *ring = batch->draw; + + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, + CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP); + OUT_RELOC(ring, query_sample(aq, stop)); + OUT_RING(ring, 0x00000000); + + fd_reset_wfi(batch); + fd_wfi(batch, ring); + + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + OUT_RELOC(ring, query_sample(aq, result)); /* dst */ + OUT_RELOC(ring, query_sample(aq, result)); /* srcA */ + OUT_RELOC(ring, query_sample(aq, stop)); /* srcB */ + OUT_RELOC(ring, query_sample(aq, start)); /* srcC */ } static void timestamp_pause(struct fd_acc_query *aq, struct fd_batch *batch) { - /* We captured a timestamp in timestamp_resume(), nothing to do here. */ + /* We captured a timestamp in timestamp_resume(), nothing to do here. */ } /* timestamp logging for u_trace: */ static void record_timestamp(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset) { - OUT_PKT7(ring, CP_EVENT_WRITE, 4); - OUT_RING(ring, CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | - CP_EVENT_WRITE_0_TIMESTAMP); - OUT_RELOC(ring, bo, offset, 0, 0); - OUT_RING(ring, 0x00000000); + OUT_PKT7(ring, CP_EVENT_WRITE, 4); + OUT_RING(ring, + CP_EVENT_WRITE_0_EVENT(RB_DONE_TS) | CP_EVENT_WRITE_0_TIMESTAMP); + OUT_RELOC(ring, bo, offset, 0, 0); + OUT_RING(ring, 0x00000000); } static uint64_t ticks_to_ns(uint64_t ts) { - /* This is based on the 19.2MHz always-on rbbm timer. - * - * TODO we should probably query this value from kernel.. - */ - return ts * (1000000000 / 19200000); + /* This is based on the 19.2MHz always-on rbbm timer. + * + * TODO we should probably query this value from kernel.. 
+ */ + return ts * (1000000000 / 19200000); } static void time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd6_query_sample *sp = buf; - result->u64 = ticks_to_ns(sp->result); + struct fd6_query_sample *sp = buf; + result->u64 = ticks_to_ns(sp->result); } static void timestamp_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd6_query_sample *sp = buf; - result->u64 = ticks_to_ns(sp->start); + struct fd6_query_sample *sp = buf; + result->u64 = ticks_to_ns(sp->start); } static const struct fd_acc_sample_provider time_elapsed = { - .query_type = PIPE_QUERY_TIME_ELAPSED, - .always = true, - .size = sizeof(struct fd6_query_sample), - .resume = timestamp_resume, - .pause = time_elapsed_pause, - .result = time_elapsed_accumulate_result, + .query_type = PIPE_QUERY_TIME_ELAPSED, + .always = true, + .size = sizeof(struct fd6_query_sample), + .resume = timestamp_resume, + .pause = time_elapsed_pause, + .result = time_elapsed_accumulate_result, }; /* NOTE: timestamp query isn't going to give terribly sensible results @@ -257,27 +252,28 @@ static const struct fd_acc_sample_provider time_elapsed = { */ static const struct fd_acc_sample_provider timestamp = { - .query_type = PIPE_QUERY_TIMESTAMP, - .always = true, - .size = sizeof(struct fd6_query_sample), - .resume = timestamp_resume, - .pause = timestamp_pause, - .result = timestamp_accumulate_result, + .query_type = PIPE_QUERY_TIMESTAMP, + .always = true, + .size = sizeof(struct fd6_query_sample), + .resume = timestamp_resume, + .pause = timestamp_pause, + .result = timestamp_accumulate_result, }; struct PACKED fd6_primitives_sample { - struct { - uint64_t emitted, generated; - } start[4], stop[4], result; + struct { + uint64_t emitted, generated; + } start[4], stop[4], result; - uint64_t prim_start[16], prim_stop[16], prim_emitted; + uint64_t prim_start[16], prim_stop[16], prim_emitted; }; - -#define primitives_relocw(ring, aq, field) \ - OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, offsetof(struct fd6_primitives_sample, field), 0, 0); -#define primitives_reloc(ring, aq, field) \ - OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, offsetof(struct fd6_primitives_sample, field), 0, 0); +#define primitives_relocw(ring, aq, field) \ + OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \ + offsetof(struct fd6_primitives_sample, field), 0, 0); +#define primitives_reloc(ring, aq, field) \ + OUT_RELOC(ring, fd_resource((aq)->prsc)->bo, \ + offsetof(struct fd6_primitives_sample, field), 0, 0); #ifdef DEBUG_COUNTERS static const unsigned counter_count = 10; @@ -286,37 +282,37 @@ static const unsigned counter_base = REG_A6XX_RBBM_PRIMCTR_0_LO; static void log_counters(struct fd6_primitives_sample *ps) { - const char *labels[] = { - "vs_vertices_in", - "vs_primitives_out", - "hs_vertices_in", - "hs_patches_out", - "ds_vertices_in", - "ds_primitives_out", - "gs_primitives_in", - "gs_primitives_out", - "ras_primitives_in", - "x", - }; - - printf(" counter\t\tstart\t\t\tstop\t\t\tdiff\n"); - for (int i = 0; i < ARRAY_SIZE(labels); i++) { - int register_idx = i + (counter_base - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2; - printf(" RBBM_PRIMCTR_%d\t0x%016"PRIx64"\t0x%016"PRIx64"\t%"PRIi64"\t%s\n", - register_idx, - ps->prim_start[i], ps->prim_stop[i], ps->prim_stop[i] - ps->prim_start[i], - labels[register_idx]); - } - - printf(" so counts\n"); - for (int i = 0; i < ARRAY_SIZE(ps->start); i++) { - 
printf(" CHANNEL %d emitted\t0x%016"PRIx64"\t0x%016"PRIx64"\t%"PRIi64"\n", - i, ps->start[i].generated, ps->stop[i].generated, ps->stop[i].generated - ps->start[i].generated); - printf(" CHANNEL %d generated\t0x%016"PRIx64"\t0x%016"PRIx64"\t%"PRIi64"\n", - i, ps->start[i].emitted, ps->stop[i].emitted, ps->stop[i].emitted - ps->start[i].emitted); - } - - printf("generated %"PRIu64", emitted %"PRIu64"\n", ps->result.generated, ps->result.emitted); + const char *labels[] = { + "vs_vertices_in", "vs_primitives_out", + "hs_vertices_in", "hs_patches_out", + "ds_vertices_in", "ds_primitives_out", + "gs_primitives_in", "gs_primitives_out", + "ras_primitives_in", "x", + }; + + printf(" counter\t\tstart\t\t\tstop\t\t\tdiff\n"); + for (int i = 0; i < ARRAY_SIZE(labels); i++) { + int register_idx = i + (counter_base - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2; + printf(" RBBM_PRIMCTR_%d\t0x%016" PRIx64 "\t0x%016" PRIx64 "\t%" PRIi64 + "\t%s\n", + register_idx, ps->prim_start[i], ps->prim_stop[i], + ps->prim_stop[i] - ps->prim_start[i], labels[register_idx]); + } + + printf(" so counts\n"); + for (int i = 0; i < ARRAY_SIZE(ps->start); i++) { + printf(" CHANNEL %d emitted\t0x%016" PRIx64 "\t0x%016" PRIx64 + "\t%" PRIi64 "\n", + i, ps->start[i].generated, ps->stop[i].generated, + ps->stop[i].generated - ps->start[i].generated); + printf(" CHANNEL %d generated\t0x%016" PRIx64 "\t0x%016" PRIx64 + "\t%" PRIi64 "\n", + i, ps->start[i].emitted, ps->stop[i].emitted, + ps->stop[i].emitted - ps->start[i].emitted); + } + + printf("generated %" PRIu64 ", emitted %" PRIu64 "\n", ps->result.generated, + ps->result.emitted); } #else @@ -332,122 +328,120 @@ log_counters(struct fd6_primitives_sample *ps) #endif static void -primitives_generated_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +primitives_generated_resume(struct fd_acc_query *aq, + struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - fd_wfi(batch, ring); + fd_wfi(batch, ring); - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(counter_count * 2) | - CP_REG_TO_MEM_0_REG(counter_base)); - primitives_relocw(ring, aq, prim_start); + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | CP_REG_TO_MEM_0_CNT(counter_count * 2) | + CP_REG_TO_MEM_0_REG(counter_base)); + primitives_relocw(ring, aq, prim_start); - fd6_event_write(batch, ring, START_PRIMITIVE_CTRS, false); + fd6_event_write(batch, ring, START_PRIMITIVE_CTRS, false); } static void -primitives_generated_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +primitives_generated_pause(struct fd_acc_query *aq, + struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; - - fd_wfi(batch, ring); - - /* snapshot the end values: */ - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_CNT(counter_count * 2) | - CP_REG_TO_MEM_0_REG(counter_base)); - primitives_relocw(ring, aq, prim_stop); - - fd6_event_write(batch, ring, STOP_PRIMITIVE_CTRS, false); - - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C | 0x40000000); - primitives_relocw(ring, aq, result.generated); - primitives_reloc(ring, aq, prim_emitted); - primitives_reloc(ring, aq, prim_stop[(REG_A6XX_RBBM_PRIMCTR_8_LO - counter_base) / 2]) - primitives_reloc(ring, aq, prim_start[(REG_A6XX_RBBM_PRIMCTR_8_LO - counter_base) / 2]); + struct fd_ringbuffer 
*ring = batch->draw; + + fd_wfi(batch, ring); + + /* snapshot the end values: */ + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | CP_REG_TO_MEM_0_CNT(counter_count * 2) | + CP_REG_TO_MEM_0_REG(counter_base)); + primitives_relocw(ring, aq, prim_stop); + + fd6_event_write(batch, ring, STOP_PRIMITIVE_CTRS, false); + + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x40000000); + primitives_relocw(ring, aq, result.generated); + primitives_reloc(ring, aq, prim_emitted); + primitives_reloc(ring, aq, + prim_stop[(REG_A6XX_RBBM_PRIMCTR_8_LO - counter_base) / 2]) + primitives_reloc( + ring, aq, prim_start[(REG_A6XX_RBBM_PRIMCTR_8_LO - counter_base) / 2]); } static void primitives_generated_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd6_primitives_sample *ps = buf; + struct fd6_primitives_sample *ps = buf; - log_counters(ps); + log_counters(ps); - result->u64 = ps->result.generated; + result->u64 = ps->result.generated; } static const struct fd_acc_sample_provider primitives_generated = { - .query_type = PIPE_QUERY_PRIMITIVES_GENERATED, - .size = sizeof(struct fd6_primitives_sample), - .resume = primitives_generated_resume, - .pause = primitives_generated_pause, - .result = primitives_generated_result, + .query_type = PIPE_QUERY_PRIMITIVES_GENERATED, + .size = sizeof(struct fd6_primitives_sample), + .resume = primitives_generated_resume, + .pause = primitives_generated_pause, + .result = primitives_generated_result, }; static void -primitives_emitted_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +primitives_emitted_resume(struct fd_acc_query *aq, + struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - fd_wfi(batch, ring); - OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2); - primitives_relocw(ring, aq, start[0]); + fd_wfi(batch, ring); + OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2); + primitives_relocw(ring, aq, start[0]); - fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false); + fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false); } static void -primitives_emitted_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +primitives_emitted_pause(struct fd_acc_query *aq, + struct fd_batch *batch) assert_dt { - struct fd_ringbuffer *ring = batch->draw; + struct fd_ringbuffer *ring = batch->draw; - fd_wfi(batch, ring); + fd_wfi(batch, ring); - OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2); - primitives_relocw(ring, aq, stop[0]); - fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false); + OUT_PKT4(ring, REG_A6XX_VPC_SO_STREAM_COUNTS, 2); + primitives_relocw(ring, aq, stop[0]); + fd6_event_write(batch, ring, WRITE_PRIMITIVE_COUNTS, false); - fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); + fd6_event_write(batch, batch->draw, CACHE_FLUSH_TS, true); - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C | 0x80000000); - primitives_relocw(ring, aq, result.emitted); - primitives_reloc(ring, aq, result.emitted); - primitives_reloc(ring, aq, stop[aq->base.index].emitted); - primitives_reloc(ring, aq, start[aq->base.index].emitted); + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C | 0x80000000); + primitives_relocw(ring, 
aq, result.emitted); + primitives_reloc(ring, aq, result.emitted); + primitives_reloc(ring, aq, stop[aq->base.index].emitted); + primitives_reloc(ring, aq, start[aq->base.index].emitted); } static void primitives_emitted_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd6_primitives_sample *ps = buf; + struct fd6_primitives_sample *ps = buf; - log_counters(ps); + log_counters(ps); - result->u64 = ps->result.emitted; + result->u64 = ps->result.emitted; } static const struct fd_acc_sample_provider primitives_emitted = { - .query_type = PIPE_QUERY_PRIMITIVES_EMITTED, - .size = sizeof(struct fd6_primitives_sample), - .resume = primitives_emitted_resume, - .pause = primitives_emitted_pause, - .result = primitives_emitted_result, + .query_type = PIPE_QUERY_PRIMITIVES_EMITTED, + .size = sizeof(struct fd6_primitives_sample), + .resume = primitives_emitted_resume, + .pause = primitives_emitted_pause, + .result = primitives_emitted_result, }; /* @@ -460,214 +454,210 @@ static const struct fd_acc_sample_provider primitives_emitted = { */ struct fd_batch_query_entry { - uint8_t gid; /* group-id */ - uint8_t cid; /* countable-id within the group */ + uint8_t gid; /* group-id */ + uint8_t cid; /* countable-id within the group */ }; struct fd_batch_query_data { - struct fd_screen *screen; - unsigned num_query_entries; - struct fd_batch_query_entry query_entries[]; + struct fd_screen *screen; + unsigned num_query_entries; + struct fd_batch_query_entry query_entries[]; }; static void -perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_batch_query_data *data = aq->query_data; - struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; + struct fd_batch_query_data *data = aq->query_data; + struct fd_screen *screen = data->screen; + struct fd_ringbuffer *ring = batch->draw; - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); - fd_wfi(batch, ring); + fd_wfi(batch, ring); - /* configure performance counters for the requested queries: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; + /* configure performance counters for the requested queries: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; - debug_assert(counter_idx < g->num_counters); + debug_assert(counter_idx < g->num_counters); - OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1); - OUT_RING(ring, g->countables[entry->cid].selector); - } + OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1); + OUT_RING(ring, g->countables[entry->cid].selector); + } - memset(counters_per_group, 0, sizeof(counters_per_group)); + memset(counters_per_group, 0, sizeof(counters_per_group)); - /* and snapshot the start values */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - 
const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + /* and snapshot the start values */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; + const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); - OUT_RELOC(ring, query_sample_idx(aq, i, start)); - } + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); + OUT_RELOC(ring, query_sample_idx(aq, i, start)); + } } static void -perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - struct fd_batch_query_data *data = aq->query_data; - struct fd_screen *screen = data->screen; - struct fd_ringbuffer *ring = batch->draw; - - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - - fd_wfi(batch, ring); - - /* TODO do we need to bother to turn anything off? */ - - /* snapshot the end values: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - struct fd_batch_query_entry *entry = &data->query_entries[i]; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; - unsigned counter_idx = counters_per_group[entry->gid]++; - const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; - - OUT_PKT7(ring, CP_REG_TO_MEM, 3); - OUT_RING(ring, CP_REG_TO_MEM_0_64B | - CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); - } - - /* and compute the result: */ - for (unsigned i = 0; i < data->num_query_entries; i++) { - /* result += stop - start: */ - OUT_PKT7(ring, CP_MEM_TO_MEM, 9); - OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | - CP_MEM_TO_MEM_0_NEG_C); - OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */ - OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */ - OUT_RELOC(ring, query_sample_idx(aq, i, stop)); /* srcB */ - OUT_RELOC(ring, query_sample_idx(aq, i, start)); /* srcC */ - } + struct fd_batch_query_data *data = aq->query_data; + struct fd_screen *screen = data->screen; + struct fd_ringbuffer *ring = batch->draw; + + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); + + fd_wfi(batch, ring); + + /* TODO do we need to bother to turn anything off? 
*/ + + /* snapshot the end values: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + struct fd_batch_query_entry *entry = &data->query_entries[i]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid]; + unsigned counter_idx = counters_per_group[entry->gid]++; + const struct fd_perfcntr_counter *counter = &g->counters[counter_idx]; + + OUT_PKT7(ring, CP_REG_TO_MEM, 3); + OUT_RING(ring, CP_REG_TO_MEM_0_64B | + CP_REG_TO_MEM_0_REG(counter->counter_reg_lo)); + OUT_RELOC(ring, query_sample_idx(aq, i, stop)); + } + + /* and compute the result: */ + for (unsigned i = 0; i < data->num_query_entries; i++) { + /* result += stop - start: */ + OUT_PKT7(ring, CP_MEM_TO_MEM, 9); + OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C); + OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* dst */ + OUT_RELOC(ring, query_sample_idx(aq, i, result)); /* srcA */ + OUT_RELOC(ring, query_sample_idx(aq, i, stop)); /* srcB */ + OUT_RELOC(ring, query_sample_idx(aq, i, start)); /* srcC */ + } } static void perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result) + union pipe_query_result *result) { - struct fd_batch_query_data *data = aq->query_data; - struct fd6_query_sample *sp = buf; + struct fd_batch_query_data *data = aq->query_data; + struct fd6_query_sample *sp = buf; - for (unsigned i = 0; i < data->num_query_entries; i++) { - result->batch[i].u64 = sp[i].result; - } + for (unsigned i = 0; i < data->num_query_entries; i++) { + result->batch[i].u64 = sp[i].result; + } } static const struct fd_acc_sample_provider perfcntr = { - .query_type = FD_QUERY_FIRST_PERFCNTR, - .always = true, - .resume = perfcntr_resume, - .pause = perfcntr_pause, - .result = perfcntr_accumulate_result, + .query_type = FD_QUERY_FIRST_PERFCNTR, + .always = true, + .resume = perfcntr_resume, + .pause = perfcntr_pause, + .result = perfcntr_accumulate_result, }; static struct pipe_query * -fd6_create_batch_query(struct pipe_context *pctx, - unsigned num_queries, unsigned *query_types) +fd6_create_batch_query(struct pipe_context *pctx, unsigned num_queries, + unsigned *query_types) { - struct fd_context *ctx = fd_context(pctx); - struct fd_screen *screen = ctx->screen; - struct fd_query *q; - struct fd_acc_query *aq; - struct fd_batch_query_data *data; - - data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data, - num_queries * sizeof(data->query_entries[0])); - - data->screen = screen; - data->num_query_entries = num_queries; - - /* validate the requested query_types and ensure we don't try - * to request more query_types of a given group than we have - * counters: - */ - unsigned counters_per_group[screen->num_perfcntr_groups]; - memset(counters_per_group, 0, sizeof(counters_per_group)); - - for (unsigned i = 0; i < num_queries; i++) { - unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; - - /* verify valid query_type, ie. is it actually a perfcntr? */ - if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) || - (idx >= screen->num_perfcntr_queries)) { - mesa_loge("invalid batch query query_type: %u", query_types[i]); - goto error; - } - - struct fd_batch_query_entry *entry = &data->query_entries[i]; - struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx]; - - entry->gid = pq->group_id; - - /* the perfcntr_queries[] table flattens all the countables - * for each group in series, ie: - * - * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ... 
- * - * So to find the countable index just step back through the - * table to find the first entry with the same group-id. - */ - while (pq > screen->perfcntr_queries) { - pq--; - if (pq->group_id == entry->gid) - entry->cid++; - } - - if (counters_per_group[entry->gid] >= - screen->perfcntr_groups[entry->gid].num_counters) { - mesa_loge("too many counters for group %u", entry->gid); - goto error; - } - - counters_per_group[entry->gid]++; - } - - q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); - aq = fd_acc_query(q); - - /* sample buffer size is based on # of queries: */ - aq->size = num_queries * sizeof(struct fd6_query_sample); - aq->query_data = data; - - return (struct pipe_query *)q; + struct fd_context *ctx = fd_context(pctx); + struct fd_screen *screen = ctx->screen; + struct fd_query *q; + struct fd_acc_query *aq; + struct fd_batch_query_data *data; + + data = CALLOC_VARIANT_LENGTH_STRUCT( + fd_batch_query_data, num_queries * sizeof(data->query_entries[0])); + + data->screen = screen; + data->num_query_entries = num_queries; + + /* validate the requested query_types and ensure we don't try + * to request more query_types of a given group than we have + * counters: + */ + unsigned counters_per_group[screen->num_perfcntr_groups]; + memset(counters_per_group, 0, sizeof(counters_per_group)); + + for (unsigned i = 0; i < num_queries; i++) { + unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR; + + /* verify valid query_type, ie. is it actually a perfcntr? */ + if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) || + (idx >= screen->num_perfcntr_queries)) { + mesa_loge("invalid batch query query_type: %u", query_types[i]); + goto error; + } + + struct fd_batch_query_entry *entry = &data->query_entries[i]; + struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx]; + + entry->gid = pq->group_id; + + /* the perfcntr_queries[] table flattens all the countables + * for each group in series, ie: + * + * (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ... + * + * So to find the countable index just step back through the + * table to find the first entry with the same group-id. 
+ */ + while (pq > screen->perfcntr_queries) { + pq--; + if (pq->group_id == entry->gid) + entry->cid++; + } + + if (counters_per_group[entry->gid] >= + screen->perfcntr_groups[entry->gid].num_counters) { + mesa_loge("too many counters for group %u", entry->gid); + goto error; + } + + counters_per_group[entry->gid]++; + } + + q = fd_acc_create_query2(ctx, 0, 0, &perfcntr); + aq = fd_acc_query(q); + + /* sample buffer size is based on # of queries: */ + aq->size = num_queries * sizeof(struct fd6_query_sample); + aq->query_data = data; + + return (struct pipe_query *)q; error: - free(data); - return NULL; + free(data); + return NULL; } void -fd6_query_context_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_query_context_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->create_query = fd_acc_create_query; - ctx->query_update_batch = fd_acc_query_update_batch; + ctx->create_query = fd_acc_create_query; + ctx->query_update_batch = fd_acc_query_update_batch; - ctx->record_timestamp = record_timestamp; - ctx->ts_to_ns = ticks_to_ns; + ctx->record_timestamp = record_timestamp; + ctx->ts_to_ns = ticks_to_ns; - pctx->create_batch_query = fd6_create_batch_query; + pctx->create_batch_query = fd6_create_batch_query; - fd_acc_query_register_provider(pctx, &occlusion_counter); - fd_acc_query_register_provider(pctx, &occlusion_predicate); - fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative); + fd_acc_query_register_provider(pctx, &occlusion_counter); + fd_acc_query_register_provider(pctx, &occlusion_predicate); + fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative); - fd_acc_query_register_provider(pctx, &time_elapsed); - fd_acc_query_register_provider(pctx, ×tamp); + fd_acc_query_register_provider(pctx, &time_elapsed); + fd_acc_query_register_provider(pctx, ×tamp); - fd_acc_query_register_provider(pctx, &primitives_generated); - fd_acc_query_register_provider(pctx, &primitives_emitted); + fd_acc_query_register_provider(pctx, &primitives_generated); + fd_acc_query_register_provider(pctx, &primitives_emitted); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c index 3532477..b08cbae 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.c @@ -25,120 +25,101 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd6_rasterizer.h" #include "fd6_context.h" #include "fd6_format.h" #include "fd6_pack.h" +#include "fd6_rasterizer.h" struct fd_ringbuffer * __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, - const struct pipe_rasterizer_state *cso, bool primitive_restart) + const struct pipe_rasterizer_state *cso, + bool primitive_restart) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); - float psize_min, psize_max; - - if (cso->point_size_per_vertex) { - psize_min = util_get_min_point_size(cso); - psize_max = 4092; - } else { - /* Force the point size to be as if the vertex output was disabled. 
*/ - psize_min = cso->point_size; - psize_max = cso->point_size; - } - - OUT_REG(ring, - A6XX_GRAS_CL_CNTL( - .znear_clip_disable = !cso->depth_clip_near, - .zfar_clip_disable = !cso->depth_clip_far, - .unk5 = !cso->depth_clip_near || !cso->depth_clip_far, - .vp_clip_code_ignore = 1, - .zero_gb_scale_z = cso->clip_halfz - )); - - OUT_REG(ring, - A6XX_GRAS_SU_CNTL( - .linehalfwidth = cso->line_width / 2.0, - .poly_offset = cso->offset_tri, - .msaa_enable = cso->multisample, - .cull_front = cso->cull_face & PIPE_FACE_FRONT, - .cull_back = cso->cull_face & PIPE_FACE_BACK, - .front_cw = !cso->front_ccw, - )); - - OUT_REG(ring, - A6XX_GRAS_SU_POINT_MINMAX( - .min = psize_min, - .max = psize_max, - ), - A6XX_GRAS_SU_POINT_SIZE( - cso->point_size - )); - - OUT_REG(ring, - A6XX_GRAS_SU_POLY_OFFSET_SCALE( - cso->offset_scale - ), - A6XX_GRAS_SU_POLY_OFFSET_OFFSET( - cso->offset_units - ), - A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP( - cso->offset_clamp - )); - - OUT_REG(ring, - A6XX_PC_PRIMITIVE_CNTL_0( - .provoking_vtx_last = !cso->flatshade_first, - .primitive_restart = primitive_restart, - )); - - enum a6xx_polygon_mode mode = POLYMODE6_TRIANGLES; - switch (cso->fill_front) { - case PIPE_POLYGON_MODE_POINT: - mode = POLYMODE6_POINTS; - break; - case PIPE_POLYGON_MODE_LINE: - mode = POLYMODE6_LINES; - break; - default: - assert(cso->fill_front == PIPE_POLYGON_MODE_FILL); - break; - } - - OUT_REG(ring, A6XX_VPC_POLYGON_MODE(mode)); - OUT_REG(ring, A6XX_PC_POLYGON_MODE(mode)); - - return ring; + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 18 * 4); + float psize_min, psize_max; + + if (cso->point_size_per_vertex) { + psize_min = util_get_min_point_size(cso); + psize_max = 4092; + } else { + /* Force the point size to be as if the vertex output was disabled. 
*/ + psize_min = cso->point_size; + psize_max = cso->point_size; + } + + OUT_REG(ring, A6XX_GRAS_CL_CNTL(.znear_clip_disable = !cso->depth_clip_near, + .zfar_clip_disable = !cso->depth_clip_far, + .unk5 = !cso->depth_clip_near || + !cso->depth_clip_far, + .vp_clip_code_ignore = 1, + .zero_gb_scale_z = cso->clip_halfz)); + + OUT_REG(ring, + A6XX_GRAS_SU_CNTL(.linehalfwidth = cso->line_width / 2.0, + .poly_offset = cso->offset_tri, + .msaa_enable = cso->multisample, + .cull_front = cso->cull_face & PIPE_FACE_FRONT, + .cull_back = cso->cull_face & PIPE_FACE_BACK, + .front_cw = !cso->front_ccw, )); + + OUT_REG(ring, + A6XX_GRAS_SU_POINT_MINMAX(.min = psize_min, .max = psize_max, ), + A6XX_GRAS_SU_POINT_SIZE(cso->point_size)); + + OUT_REG(ring, A6XX_GRAS_SU_POLY_OFFSET_SCALE(cso->offset_scale), + A6XX_GRAS_SU_POLY_OFFSET_OFFSET(cso->offset_units), + A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(cso->offset_clamp)); + + OUT_REG(ring, + A6XX_PC_PRIMITIVE_CNTL_0(.provoking_vtx_last = !cso->flatshade_first, + .primitive_restart = primitive_restart, )); + + enum a6xx_polygon_mode mode = POLYMODE6_TRIANGLES; + switch (cso->fill_front) { + case PIPE_POLYGON_MODE_POINT: + mode = POLYMODE6_POINTS; + break; + case PIPE_POLYGON_MODE_LINE: + mode = POLYMODE6_LINES; + break; + default: + assert(cso->fill_front == PIPE_POLYGON_MODE_FILL); + break; + } + + OUT_REG(ring, A6XX_VPC_POLYGON_MODE(mode)); + OUT_REG(ring, A6XX_PC_POLYGON_MODE(mode)); + + return ring; } void * fd6_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso) + const struct pipe_rasterizer_state *cso) { - struct fd6_rasterizer_stateobj *so; + struct fd6_rasterizer_stateobj *so; - so = CALLOC_STRUCT(fd6_rasterizer_stateobj); - if (!so) - return NULL; + so = CALLOC_STRUCT(fd6_rasterizer_stateobj); + if (!so) + return NULL; - so->base = *cso; + so->base = *cso; - return so; + return so; } void fd6_rasterizer_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd6_rasterizer_stateobj *so = hwcso; + struct fd6_rasterizer_stateobj *so = hwcso; - for (unsigned i = 0; i < ARRAY_SIZE(so->stateobjs); i++) - if (so->stateobjs[i]) - fd_ringbuffer_del(so->stateobjs[i]); + for (unsigned i = 0; i < ARRAY_SIZE(so->stateobjs); i++) + if (so->stateobjs[i]) + fd_ringbuffer_del(so->stateobjs[i]); - FREE(hwcso); + FREE(hwcso); } - diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h index cfaa5c1..12fb228 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_rasterizer.h @@ -28,43 +28,45 @@ #ifndef FD6_RASTERIZER_H_ #define FD6_RASTERIZER_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_context.h" struct fd6_rasterizer_stateobj { - struct pipe_rasterizer_state base; + struct pipe_rasterizer_state base; - struct fd_ringbuffer *stateobjs[2]; + struct fd_ringbuffer *stateobjs[2]; }; static inline struct fd6_rasterizer_stateobj * fd6_rasterizer_stateobj(struct pipe_rasterizer_state *rast) { - return (struct fd6_rasterizer_stateobj *)rast; + return (struct fd6_rasterizer_stateobj *)rast; } -void * fd6_rasterizer_state_create(struct pipe_context *pctx, - const struct pipe_rasterizer_state *cso); +void *fd6_rasterizer_state_create(struct pipe_context *pctx, + const struct pipe_rasterizer_state *cso); void fd6_rasterizer_state_delete(struct pipe_context *, void *hwcso); -struct fd_ringbuffer * __fd6_setup_rasterizer_stateobj(struct fd_context *ctx, - 
const struct pipe_rasterizer_state *cso, bool primitive_restart); +struct fd_ringbuffer * +__fd6_setup_rasterizer_stateobj(struct fd_context *ctx, + const struct pipe_rasterizer_state *cso, + bool primitive_restart); static inline struct fd_ringbuffer * -fd6_rasterizer_state(struct fd_context *ctx, bool primitive_restart) - assert_dt +fd6_rasterizer_state(struct fd_context *ctx, bool primitive_restart) assert_dt { - struct fd6_rasterizer_stateobj *rasterizer = fd6_rasterizer_stateobj(ctx->rasterizer); - unsigned variant = primitive_restart; + struct fd6_rasterizer_stateobj *rasterizer = + fd6_rasterizer_stateobj(ctx->rasterizer); + unsigned variant = primitive_restart; - if (unlikely(!rasterizer->stateobjs[variant])) { - rasterizer->stateobjs[variant] = - __fd6_setup_rasterizer_stateobj(ctx, ctx->rasterizer, primitive_restart); - } + if (unlikely(!rasterizer->stateobjs[variant])) { + rasterizer->stateobjs[variant] = __fd6_setup_rasterizer_stateobj( + ctx, ctx->rasterizer, primitive_restart); + } - return rasterizer->stateobjs[variant]; + return rasterizer->stateobjs[variant]; } #endif /* FD6_RASTERIZER_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_resource.c b/src/gallium/drivers/freedreno/a6xx/fd6_resource.c index a0cf5ec..460fdc8 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_resource.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_resource.c @@ -27,8 +27,8 @@ #include "drm-uapi/drm_fourcc.h" -#include "fd6_resource.h" #include "fd6_format.h" +#include "fd6_resource.h" #include "a6xx.xml.h" @@ -39,68 +39,69 @@ static bool ok_ubwc_format(struct pipe_screen *pscreen, enum pipe_format pfmt) { - switch (pfmt) { - case PIPE_FORMAT_X24S8_UINT: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - /* We can't sample stencil with UBWC on a630, and we may need to be able - * to sample stencil at some point. We can't just use - * fd_resource_uncompress() at the point of stencil sampling because - * that itself uses stencil sampling in the fd_blitter_blit path. - */ - return fd_screen(pscreen)->info.a6xx.has_z24uint_s8uint; - - case PIPE_FORMAT_R8_G8B8_420_UNORM: - return true; - - default: - break; - } - - switch (fd6_pipe2color(pfmt)) { - case FMT6_10_10_10_2_UINT: - case FMT6_10_10_10_2_UNORM_DEST: - case FMT6_11_11_10_FLOAT: - case FMT6_16_FLOAT: - case FMT6_16_16_16_16_FLOAT: - case FMT6_16_16_16_16_SINT: - case FMT6_16_16_16_16_UINT: - case FMT6_16_16_FLOAT: - case FMT6_16_16_SINT: - case FMT6_16_16_UINT: - case FMT6_16_SINT: - case FMT6_16_UINT: - case FMT6_32_32_32_32_SINT: - case FMT6_32_32_32_32_UINT: - case FMT6_32_32_SINT: - case FMT6_32_32_UINT: - case FMT6_5_6_5_UNORM: - case FMT6_8_8_8_8_SINT: - case FMT6_8_8_8_8_UINT: - case FMT6_8_8_8_8_UNORM: - case FMT6_8_8_8_X8_UNORM: - case FMT6_8_8_SINT: - case FMT6_8_8_UINT: - case FMT6_8_8_UNORM: - case FMT6_8_UNORM: - case FMT6_Z24_UNORM_S8_UINT: - case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: - return true; - default: - return false; - } + switch (pfmt) { + case PIPE_FORMAT_X24S8_UINT: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + /* We can't sample stencil with UBWC on a630, and we may need to be able + * to sample stencil at some point. We can't just use + * fd_resource_uncompress() at the point of stencil sampling because + * that itself uses stencil sampling in the fd_blitter_blit path. 
+ */ + return fd_screen(pscreen)->info.a6xx.has_z24uint_s8uint; + + case PIPE_FORMAT_R8_G8B8_420_UNORM: + return true; + + default: + break; + } + + switch (fd6_pipe2color(pfmt)) { + case FMT6_10_10_10_2_UINT: + case FMT6_10_10_10_2_UNORM_DEST: + case FMT6_11_11_10_FLOAT: + case FMT6_16_FLOAT: + case FMT6_16_16_16_16_FLOAT: + case FMT6_16_16_16_16_SINT: + case FMT6_16_16_16_16_UINT: + case FMT6_16_16_FLOAT: + case FMT6_16_16_SINT: + case FMT6_16_16_UINT: + case FMT6_16_SINT: + case FMT6_16_UINT: + case FMT6_32_32_32_32_SINT: + case FMT6_32_32_32_32_UINT: + case FMT6_32_32_SINT: + case FMT6_32_32_UINT: + case FMT6_5_6_5_UNORM: + case FMT6_8_8_8_8_SINT: + case FMT6_8_8_8_8_UINT: + case FMT6_8_8_8_8_UNORM: + case FMT6_8_8_8_X8_UNORM: + case FMT6_8_8_SINT: + case FMT6_8_8_UINT: + case FMT6_8_8_UNORM: + case FMT6_8_UNORM: + case FMT6_Z24_UNORM_S8_UINT: + case FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8: + return true; + default: + return false; + } } static bool can_do_ubwc(struct pipe_resource *prsc) { - /* limit things to simple single level 2d for now: */ - if ((prsc->depth0 != 1) || (prsc->array_size != 1) || (prsc->last_level != 0)) - return false; - if (prsc->target != PIPE_TEXTURE_2D) - return false; - if (!ok_ubwc_format(prsc->screen, prsc->format)) - return false; - return true; + /* limit things to simple single level 2d for now: */ + if ((prsc->depth0 != 1) || (prsc->array_size != 1) || + (prsc->last_level != 0)) + return false; + if (prsc->target != PIPE_TEXTURE_2D) + return false; + if (!ok_ubwc_format(prsc->screen, prsc->format)) + return false; + return true; } /** @@ -110,132 +111,133 @@ can_do_ubwc(struct pipe_resource *prsc) */ void fd6_validate_format(struct fd_context *ctx, struct fd_resource *rsc, - enum pipe_format format) + enum pipe_format format) { - tc_assert_driver_thread(ctx->tc); + tc_assert_driver_thread(ctx->tc); - if (!rsc->layout.ubwc) - return; + if (!rsc->layout.ubwc) + return; - if (ok_ubwc_format(rsc->b.b.screen, format)) - return; + if (ok_ubwc_format(rsc->b.b.screen, format)) + return; - perf_debug_ctx(ctx, "%"PRSC_FMT": demoted to uncompressed due to use as %s", - PRSC_ARGS(&rsc->b.b), util_format_short_name(format)); + perf_debug_ctx(ctx, + "%" PRSC_FMT ": demoted to uncompressed due to use as %s", + PRSC_ARGS(&rsc->b.b), util_format_short_name(format)); - fd_resource_uncompress(ctx, rsc); + fd_resource_uncompress(ctx, rsc); } static void setup_lrz(struct fd_resource *rsc) { - struct fd_screen *screen = fd_screen(rsc->b.b.screen); - const uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ - unsigned width0 = rsc->b.b.width0; - unsigned height0 = rsc->b.b.height0; - - /* LRZ buffer is super-sampled: */ - switch (rsc->b.b.nr_samples) { - case 4: - width0 *= 2; - FALLTHROUGH; - case 2: - height0 *= 2; - } - - unsigned lrz_pitch = align(DIV_ROUND_UP(width0, 8), 32); - unsigned lrz_height = align(DIV_ROUND_UP(height0, 8), 16); - - unsigned size = lrz_pitch * lrz_height * 2; - - rsc->lrz_height = lrz_height; - rsc->lrz_width = lrz_pitch; - rsc->lrz_pitch = lrz_pitch; - rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); + struct fd_screen *screen = fd_screen(rsc->b.b.screen); + const uint32_t flags = + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM; /* TODO */ + unsigned width0 = rsc->b.b.width0; + unsigned height0 = rsc->b.b.height0; + + /* LRZ buffer is super-sampled: */ + switch (rsc->b.b.nr_samples) { + case 4: + width0 *= 2; + FALLTHROUGH; + case 2: + height0 *= 2; + } + + unsigned lrz_pitch = 
align(DIV_ROUND_UP(width0, 8), 32); + unsigned lrz_height = align(DIV_ROUND_UP(height0, 8), 16); + + unsigned size = lrz_pitch * lrz_height * 2; + + rsc->lrz_height = lrz_height; + rsc->lrz_width = lrz_pitch; + rsc->lrz_pitch = lrz_pitch; + rsc->lrz = fd_bo_new(screen->dev, size, flags, "lrz"); } static uint32_t fd6_setup_slices(struct fd_resource *rsc) { - struct pipe_resource *prsc = &rsc->b.b; + struct pipe_resource *prsc = &rsc->b.b; - if (!FD_DBG(NOLRZ) && has_depth(rsc->b.b.format)) - setup_lrz(rsc); + if (!FD_DBG(NOLRZ) && has_depth(rsc->b.b.format)) + setup_lrz(rsc); - if (rsc->layout.ubwc && !ok_ubwc_format(rsc->b.b.screen, rsc->b.b.format)) - rsc->layout.ubwc = false; + if (rsc->layout.ubwc && !ok_ubwc_format(rsc->b.b.screen, rsc->b.b.format)) + rsc->layout.ubwc = false; - fdl6_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), - prsc->width0, prsc->height0, prsc->depth0, - prsc->last_level + 1, prsc->array_size, - prsc->target == PIPE_TEXTURE_3D, - NULL); + fdl6_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), + prsc->width0, prsc->height0, prsc->depth0, prsc->last_level + 1, + prsc->array_size, prsc->target == PIPE_TEXTURE_3D, NULL); - return rsc->layout.size; + return rsc->layout.size; } static int fill_ubwc_buffer_sizes(struct fd_resource *rsc) { - struct pipe_resource *prsc = &rsc->b.b; - struct fdl_explicit_layout explicit = { - .offset = rsc->layout.slices[0].offset, - .pitch = rsc->layout.pitch0, - }; + struct pipe_resource *prsc = &rsc->b.b; + struct fdl_explicit_layout explicit = { + .offset = rsc->layout.slices[0].offset, + .pitch = rsc->layout.pitch0, + }; - if (!can_do_ubwc(prsc)) - return -1; + if (!can_do_ubwc(prsc)) + return -1; - rsc->layout.ubwc = true; - rsc->layout.tile_mode = TILE6_3; + rsc->layout.ubwc = true; + rsc->layout.tile_mode = TILE6_3; - if (!fdl6_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), - prsc->width0, prsc->height0, prsc->depth0, - prsc->last_level + 1, prsc->array_size, false, &explicit)) - return -1; + if (!fdl6_layout(&rsc->layout, prsc->format, fd_resource_nr_samples(prsc), + prsc->width0, prsc->height0, prsc->depth0, + prsc->last_level + 1, prsc->array_size, false, &explicit)) + return -1; - if (rsc->layout.size > fd_bo_size(rsc->bo)) - return -1; + if (rsc->layout.size > fd_bo_size(rsc->bo)) + return -1; - return 0; + return 0; } static int fd6_layout_resource_for_modifier(struct fd_resource *rsc, uint64_t modifier) { - switch (modifier) { - case DRM_FORMAT_MOD_QCOM_COMPRESSED: - return fill_ubwc_buffer_sizes(rsc); - case DRM_FORMAT_MOD_LINEAR: - if (can_do_ubwc(&rsc->b.b)) { - perf_debug("%"PRSC_FMT": not UBWC: imported with DRM_FORMAT_MOD_LINEAR!", - PRSC_ARGS(&rsc->b.b)); - } - return 0; - case DRM_FORMAT_MOD_INVALID: - if (can_do_ubwc(&rsc->b.b)) { - perf_debug("%"PRSC_FMT": not UBWC: imported with DRM_FORMAT_MOD_INVALID!", - PRSC_ARGS(&rsc->b.b)); - } - return 0; - default: - return -1; - } + switch (modifier) { + case DRM_FORMAT_MOD_QCOM_COMPRESSED: + return fill_ubwc_buffer_sizes(rsc); + case DRM_FORMAT_MOD_LINEAR: + if (can_do_ubwc(&rsc->b.b)) { + perf_debug("%" PRSC_FMT + ": not UBWC: imported with DRM_FORMAT_MOD_LINEAR!", + PRSC_ARGS(&rsc->b.b)); + } + return 0; + case DRM_FORMAT_MOD_INVALID: + if (can_do_ubwc(&rsc->b.b)) { + perf_debug("%" PRSC_FMT + ": not UBWC: imported with DRM_FORMAT_MOD_INVALID!", + PRSC_ARGS(&rsc->b.b)); + } + return 0; + default: + return -1; + } } static const uint64_t supported_modifiers[] = { - DRM_FORMAT_MOD_LINEAR, - DRM_FORMAT_MOD_QCOM_COMPRESSED, 
+ DRM_FORMAT_MOD_LINEAR, + DRM_FORMAT_MOD_QCOM_COMPRESSED, }; void fd6_resource_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - screen->setup_slices = fd6_setup_slices; - screen->layout_resource_for_modifier = fd6_layout_resource_for_modifier; - screen->supported_modifiers = supported_modifiers; - screen->num_supported_modifiers = ARRAY_SIZE(supported_modifiers); + screen->setup_slices = fd6_setup_slices; + screen->layout_resource_for_modifier = fd6_layout_resource_for_modifier; + screen->supported_modifiers = supported_modifiers; + screen->num_supported_modifiers = ARRAY_SIZE(supported_modifiers); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_resource.h b/src/gallium/drivers/freedreno/a6xx/fd6_resource.h index 2563a21..e367f53 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_resource.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_resource.h @@ -31,9 +31,9 @@ #include "freedreno_resource.h" void fd6_validate_format(struct fd_context *ctx, struct fd_resource *rsc, - enum pipe_format format) assert_dt; -void fd6_emit_flag_reference(struct fd_ringbuffer *ring, struct fd_resource *rsc, - int level, int layer); + enum pipe_format format) assert_dt; +void fd6_emit_flag_reference(struct fd_ringbuffer *ring, + struct fd_resource *rsc, int level, int layer); void fd6_resource_screen_init(struct pipe_screen *pscreen); #endif /* FD6_RESOURCE_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_screen.c b/src/gallium/drivers/freedreno/a6xx/fd6_screen.c index 511f5e6..1a7f373 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_screen.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_screen.c @@ -29,124 +29,118 @@ #include "pipe/p_screen.h" #include "util/format/u_format.h" -#include "fd6_screen.h" #include "fd6_blitter.h" #include "fd6_context.h" #include "fd6_emit.h" #include "fd6_format.h" #include "fd6_resource.h" +#include "fd6_screen.h" #include "ir3/ir3_compiler.h" static bool valid_sample_count(unsigned sample_count) { - switch (sample_count) { - case 0: - case 1: - case 2: - case 4: -// TODO seems 8x works, but increases lrz width or height.. but the -// blob I have doesn't seem to expose any egl configs w/ 8x, so -// just hide it for now and revisit later. -// case 8: - return true; - default: - return false; - } + switch (sample_count) { + case 0: + case 1: + case 2: + case 4: + // TODO seems 8x works, but increases lrz width or height.. but the + // blob I have doesn't seem to expose any egl configs w/ 8x, so + // just hide it for now and revisit later. 
+ // case 8: + return true; + default: + return false; + } } static bool fd6_screen_is_format_supported(struct pipe_screen *pscreen, - enum pipe_format format, - enum pipe_texture_target target, - unsigned sample_count, - unsigned storage_sample_count, - unsigned usage) + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned storage_sample_count, unsigned usage) { - unsigned retval = 0; - - if ((target >= PIPE_MAX_TEXTURE_TYPES) || - !valid_sample_count(sample_count)) { - DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", - util_format_name(format), target, sample_count, usage); - return false; - } - - if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) - return false; - - if ((usage & PIPE_BIND_VERTEX_BUFFER) && - (fd6_pipe2vtx(format) != FMT6_NONE)) { - retval |= PIPE_BIND_VERTEX_BUFFER; - } - - if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && - (fd6_pipe2tex(format) != FMT6_NONE) && - (target == PIPE_BUFFER || - util_format_get_blocksize(format) != 12)) { - retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); - } - - if ((usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_COMPUTE_RESOURCE)) && - (fd6_pipe2color(format) != FMT6_NONE) && - (fd6_pipe2tex(format) != FMT6_NONE)) { - retval |= usage & (PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT | - PIPE_BIND_SHARED | - PIPE_BIND_COMPUTE_RESOURCE); - } - - /* For ARB_framebuffer_no_attachments: */ - if ((usage & PIPE_BIND_RENDER_TARGET) && (format == PIPE_FORMAT_NONE)) { - retval |= usage & PIPE_BIND_RENDER_TARGET; - } - - if ((usage & PIPE_BIND_DEPTH_STENCIL) && - (fd6_pipe2depth(format) != (enum a6xx_depth_format)~0) && - (fd6_pipe2tex(format) != FMT6_NONE)) { - retval |= PIPE_BIND_DEPTH_STENCIL; - } - - if ((usage & PIPE_BIND_INDEX_BUFFER) && - (fd_pipe2index(format) != (enum pc_di_index_size)~0)) { - retval |= PIPE_BIND_INDEX_BUFFER; - } - - if (retval != usage) { - DBG("not supported: format=%s, target=%d, sample_count=%d, " - "usage=%x, retval=%x", util_format_name(format), - target, sample_count, usage, retval); - } - - return retval == usage; + unsigned retval = 0; + + if ((target >= PIPE_MAX_TEXTURE_TYPES) || + !valid_sample_count(sample_count)) { + DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x", + util_format_name(format), target, sample_count, usage); + return false; + } + + if (MAX2(1, sample_count) != MAX2(1, storage_sample_count)) + return false; + + if ((usage & PIPE_BIND_VERTEX_BUFFER) && + (fd6_pipe2vtx(format) != FMT6_NONE)) { + retval |= PIPE_BIND_VERTEX_BUFFER; + } + + if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) && + (fd6_pipe2tex(format) != FMT6_NONE) && + (target == PIPE_BUFFER || util_format_get_blocksize(format) != 12)) { + retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE); + } + + if ((usage & + (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | PIPE_BIND_COMPUTE_RESOURCE)) && + (fd6_pipe2color(format) != FMT6_NONE) && + (fd6_pipe2tex(format) != FMT6_NONE)) { + retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | PIPE_BIND_SHARED | + PIPE_BIND_COMPUTE_RESOURCE); + } + + /* For ARB_framebuffer_no_attachments: */ + if ((usage & PIPE_BIND_RENDER_TARGET) && (format == PIPE_FORMAT_NONE)) { + retval |= usage & PIPE_BIND_RENDER_TARGET; + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && 
+ (fd6_pipe2depth(format) != (enum a6xx_depth_format) ~0) && + (fd6_pipe2tex(format) != FMT6_NONE)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if ((usage & PIPE_BIND_INDEX_BUFFER) && + (fd_pipe2index(format) != (enum pc_di_index_size) ~0)) { + retval |= PIPE_BIND_INDEX_BUFFER; + } + + if (retval != usage) { + DBG("not supported: format=%s, target=%d, sample_count=%d, " + "usage=%x, retval=%x", + util_format_name(format), target, sample_count, usage, retval); + } + + return retval == usage; } void fd6_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - screen->max_rts = A6XX_MAX_RENDER_TARGETS; + screen->max_rts = A6XX_MAX_RENDER_TARGETS; - /* Currently only FB_READ forces GMEM path, mostly because we'd have to - * deal with cmdstream patching otherwise.. - */ - screen->gmem_reason_mask = FD_GMEM_CLEARS_DEPTH_STENCIL | - FD_GMEM_DEPTH_ENABLED | FD_GMEM_STENCIL_ENABLED | - FD_GMEM_BLEND_ENABLED | FD_GMEM_LOGICOP_ENABLED; + /* Currently only FB_READ forces GMEM path, mostly because we'd have to + * deal with cmdstream patching otherwise.. + */ + screen->gmem_reason_mask = FD_GMEM_CLEARS_DEPTH_STENCIL | + FD_GMEM_DEPTH_ENABLED | FD_GMEM_STENCIL_ENABLED | + FD_GMEM_BLEND_ENABLED | FD_GMEM_LOGICOP_ENABLED; - pscreen->context_create = fd6_context_create; - pscreen->is_format_supported = fd6_screen_is_format_supported; + pscreen->context_create = fd6_context_create; + pscreen->is_format_supported = fd6_screen_is_format_supported; - screen->tile_mode = fd6_tile_mode; + screen->tile_mode = fd6_tile_mode; - fd6_resource_screen_init(pscreen); - fd6_emit_init_screen(pscreen); - ir3_screen_init(pscreen); + fd6_resource_screen_init(pscreen); + fd6_emit_init_screen(pscreen); + ir3_screen_init(pscreen); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_texture.c b/src/gallium/drivers/freedreno/a6xx/fd6_texture.c index b922766..2dc7e34 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_texture.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_texture.c @@ -26,317 +26,313 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" #include "util/hash_table.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "fd6_texture.h" -#include "fd6_resource.h" -#include "fd6_format.h" #include "fd6_emit.h" +#include "fd6_format.h" +#include "fd6_resource.h" +#include "fd6_texture.h" static void remove_tex_entry(struct fd6_context *fd6_ctx, struct hash_entry *entry) { - struct fd6_texture_state *tex = entry->data; - _mesa_hash_table_remove(fd6_ctx->tex_cache, entry); - fd6_texture_state_reference(&tex, NULL); + struct fd6_texture_state *tex = entry->data; + _mesa_hash_table_remove(fd6_ctx->tex_cache, entry); + fd6_texture_state_reference(&tex, NULL); } static enum a6xx_tex_clamp tex_clamp(unsigned wrap, bool *needs_border) { - switch (wrap) { - case PIPE_TEX_WRAP_REPEAT: - return A6XX_TEX_REPEAT; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return A6XX_TEX_CLAMP_TO_EDGE; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - *needs_border = true; - return A6XX_TEX_CLAMP_TO_BORDER; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - /* only works for PoT.. need to emulate otherwise! 
*/ - return A6XX_TEX_MIRROR_CLAMP; - case PIPE_TEX_WRAP_MIRROR_REPEAT: - return A6XX_TEX_MIRROR_REPEAT; - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - /* these two we could perhaps emulate, but we currently - * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP - */ - default: - DBG("invalid wrap: %u", wrap); - return 0; - } + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return A6XX_TEX_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return A6XX_TEX_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + *needs_border = true; + return A6XX_TEX_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + /* only works for PoT.. need to emulate otherwise! */ + return A6XX_TEX_MIRROR_CLAMP; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return A6XX_TEX_MIRROR_REPEAT; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + /* these two we could perhaps emulate, but we currently + * just don't advertise PIPE_CAP_TEXTURE_MIRROR_CLAMP + */ + default: + DBG("invalid wrap: %u", wrap); + return 0; + } } static enum a6xx_tex_filter tex_filter(unsigned filter, bool aniso) { - switch (filter) { - case PIPE_TEX_FILTER_NEAREST: - return A6XX_TEX_NEAREST; - case PIPE_TEX_FILTER_LINEAR: - return aniso ? A6XX_TEX_ANISO : A6XX_TEX_LINEAR; - default: - DBG("invalid filter: %u", filter); - return 0; - } + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: + return A6XX_TEX_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + return aniso ? A6XX_TEX_ANISO : A6XX_TEX_LINEAR; + default: + DBG("invalid filter: %u", filter); + return 0; + } } static void * fd6_sampler_state_create(struct pipe_context *pctx, - const struct pipe_sampler_state *cso) + const struct pipe_sampler_state *cso) { - struct fd6_sampler_stateobj *so = CALLOC_STRUCT(fd6_sampler_stateobj); - unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); - bool miplinear = false; - - if (!so) - return NULL; - - so->base = *cso; - so->seqno = ++fd6_context(fd_context(pctx))->tex_seqno; - - if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - miplinear = true; - - so->needs_border = false; - so->texsamp0 = - COND(miplinear, A6XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | - A6XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | - A6XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | - A6XX_TEX_SAMP_0_ANISO(aniso) | - A6XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | - A6XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | - A6XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); - - so->texsamp1 = - COND(cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE, - A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) | - COND(!cso->seamless_cube_map, A6XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | - COND(!cso->normalized_coords, A6XX_TEX_SAMP_1_UNNORM_COORDS); - - so->texsamp0 |= A6XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); - so->texsamp1 |= - A6XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | - A6XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); - - if (cso->compare_mode) - so->texsamp1 |= A6XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ - - return so; + struct fd6_sampler_stateobj *so = CALLOC_STRUCT(fd6_sampler_stateobj); + unsigned aniso = util_last_bit(MIN2(cso->max_anisotropy >> 1, 8)); + bool miplinear = false; + + if (!so) + return NULL; + + so->base = *cso; + so->seqno = ++fd6_context(fd_context(pctx))->tex_seqno; + + if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) + miplinear = true; + + so->needs_border = false; + so->texsamp0 = + COND(miplinear, 
A6XX_TEX_SAMP_0_MIPFILTER_LINEAR_NEAR) | + A6XX_TEX_SAMP_0_XY_MAG(tex_filter(cso->mag_img_filter, aniso)) | + A6XX_TEX_SAMP_0_XY_MIN(tex_filter(cso->min_img_filter, aniso)) | + A6XX_TEX_SAMP_0_ANISO(aniso) | + A6XX_TEX_SAMP_0_WRAP_S(tex_clamp(cso->wrap_s, &so->needs_border)) | + A6XX_TEX_SAMP_0_WRAP_T(tex_clamp(cso->wrap_t, &so->needs_border)) | + A6XX_TEX_SAMP_0_WRAP_R(tex_clamp(cso->wrap_r, &so->needs_border)); + + so->texsamp1 = + COND(cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE, + A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR) | + COND(!cso->seamless_cube_map, A6XX_TEX_SAMP_1_CUBEMAPSEAMLESSFILTOFF) | + COND(!cso->normalized_coords, A6XX_TEX_SAMP_1_UNNORM_COORDS); + + so->texsamp0 |= A6XX_TEX_SAMP_0_LOD_BIAS(cso->lod_bias); + so->texsamp1 |= A6XX_TEX_SAMP_1_MIN_LOD(cso->min_lod) | + A6XX_TEX_SAMP_1_MAX_LOD(cso->max_lod); + + if (cso->compare_mode) + so->texsamp1 |= + A6XX_TEX_SAMP_1_COMPARE_FUNC(cso->compare_func); /* maps 1:1 */ + + return so; } static void fd6_sampler_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd_context *ctx = fd_context(pctx); - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd6_sampler_stateobj *samp = hwcso; + struct fd_context *ctx = fd_context(pctx); + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd6_sampler_stateobj *samp = hwcso; - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - hash_table_foreach(fd6_ctx->tex_cache, entry) { - struct fd6_texture_state *state = entry->data; + hash_table_foreach(fd6_ctx->tex_cache, entry) + { + struct fd6_texture_state *state = entry->data; - for (unsigned i = 0; i < ARRAY_SIZE(state->key.samp); i++) { - if (samp->seqno == state->key.samp[i].seqno) { - remove_tex_entry(fd6_ctx, entry); - break; - } - } - } + for (unsigned i = 0; i < ARRAY_SIZE(state->key.samp); i++) { + if (samp->seqno == state->key.samp[i].seqno) { + remove_tex_entry(fd6_ctx, entry); + break; + } + } + } - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); - free(hwcso); + free(hwcso); } static struct pipe_sampler_view * fd6_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc, - const struct pipe_sampler_view *cso) + const struct pipe_sampler_view *cso) { - struct fd6_pipe_sampler_view *so = CALLOC_STRUCT(fd6_pipe_sampler_view); + struct fd6_pipe_sampler_view *so = CALLOC_STRUCT(fd6_pipe_sampler_view); - if (!so) - return NULL; + if (!so) + return NULL; - so->base = *cso; - pipe_reference(NULL, &prsc->reference); - so->base.texture = prsc; - so->base.reference.count = 1; - so->base.context = pctx; - so->needs_validate = true; + so->base = *cso; + pipe_reference(NULL, &prsc->reference); + so->base.texture = prsc; + so->base.reference.count = 1; + so->base.context = pctx; + so->needs_validate = true; - return &so->base; + return &so->base; } static void fd6_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, - unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) - in_dt + unsigned start, unsigned nr, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) in_dt { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, views); + fd_set_sampler_views(pctx, shader, start, nr, unbind_num_trailing_slots, + views); - if (!views) - return; + if (!views) + return; - for (unsigned i = 0; i < nr; i++) { - struct fd6_pipe_sampler_view *so = fd6_pipe_sampler_view(views[i]); + for (unsigned i = 
0; i < nr; i++) { + struct fd6_pipe_sampler_view *so = fd6_pipe_sampler_view(views[i]); - if (!(so && so->needs_validate)) - continue; + if (!(so && so->needs_validate)) + continue; - struct fd_resource *rsc = fd_resource(so->base.texture); + struct fd_resource *rsc = fd_resource(so->base.texture); - fd6_validate_format(ctx, rsc, so->base.format); - fd6_sampler_view_update(ctx, so); + fd6_validate_format(ctx, rsc, so->base.format); + fd6_sampler_view_update(ctx, so); - so->needs_validate = false; - } + so->needs_validate = false; + } } void -fd6_sampler_view_update(struct fd_context *ctx, struct fd6_pipe_sampler_view *so) +fd6_sampler_view_update(struct fd_context *ctx, + struct fd6_pipe_sampler_view *so) { - const struct pipe_sampler_view *cso = &so->base; - struct pipe_resource *prsc = cso->texture; - struct fd_resource *rsc = fd_resource(prsc); - enum pipe_format format = cso->format; - bool ubwc_enabled = false; - unsigned lvl, layers = 0; - - fd6_validate_format(ctx, rsc, cso->format); - - if (format == PIPE_FORMAT_X32_S8X24_UINT) { - rsc = rsc->stencil; - format = rsc->b.b.format; - } - - so->seqno = ++fd6_context(ctx)->tex_seqno; - so->ptr1 = rsc; - so->rsc_seqno = rsc->seqno; - - if (cso->target == PIPE_BUFFER) { - unsigned elements = cso->u.buf.size / util_format_get_blocksize(format); - - lvl = 0; - so->texconst1 = - A6XX_TEX_CONST_1_WIDTH(elements & MASK(15)) | - A6XX_TEX_CONST_1_HEIGHT(elements >> 15); - so->texconst2 = - A6XX_TEX_CONST_2_UNK4 | - A6XX_TEX_CONST_2_UNK31; - so->offset1 = cso->u.buf.offset; - } else { - unsigned miplevels; - - lvl = fd_sampler_first_level(cso); - miplevels = fd_sampler_last_level(cso) - lvl; - layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; - - so->texconst0 |= A6XX_TEX_CONST_0_MIPLVLS(miplevels); - so->texconst1 = - A6XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | - A6XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); - so->texconst2 = - A6XX_TEX_CONST_2_PITCHALIGN(rsc->layout.pitchalign - 6) | - A6XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); - - ubwc_enabled = fd_resource_ubwc_enabled(rsc, lvl); - - if (rsc->b.b.format == PIPE_FORMAT_R8_G8B8_420_UNORM) { - struct fd_resource *next = fd_resource(rsc->b.b.next); - - /* In case of biplanar R8_G8B8, the UBWC metadata address in - * dwords 7 and 8, is instead the pointer to the second plane. - */ - so->ptr2 = next; - so->texconst6 = - A6XX_TEX_CONST_6_PLANE_PITCH(fd_resource_pitch(next, lvl)); - - if (ubwc_enabled) { - /* Further, if using UBWC with R8_G8B8, we only point to the - * UBWC header and the color data is expected to follow immediately. 
- */ - so->offset1 = - fd_resource_ubwc_offset(rsc, lvl, cso->u.tex.first_layer); - so->offset2 = - fd_resource_ubwc_offset(next, lvl, cso->u.tex.first_layer); - } else { - so->offset1 = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); - so->offset2 = fd_resource_offset(next, lvl, cso->u.tex.first_layer); - } - } else { - so->offset1 = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); - if (ubwc_enabled) { - so->ptr2 = rsc; - so->offset2 = fd_resource_ubwc_offset(rsc, lvl, cso->u.tex.first_layer); - } - } - } - - so->texconst0 |= fd6_tex_const_0(prsc, lvl, cso->format, - cso->swizzle_r, cso->swizzle_g, - cso->swizzle_b, cso->swizzle_a); - - so->texconst2 |= A6XX_TEX_CONST_2_TYPE(fd6_tex_type(cso->target)); - - switch (cso->target) { - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_2D: - so->texconst3 = - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); - so->texconst5 = - A6XX_TEX_CONST_5_DEPTH(1); - break; - case PIPE_TEXTURE_1D_ARRAY: - case PIPE_TEXTURE_2D_ARRAY: - so->texconst3 = - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); - so->texconst5 = - A6XX_TEX_CONST_5_DEPTH(layers); - break; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - so->texconst3 = - A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); - so->texconst5 = - A6XX_TEX_CONST_5_DEPTH(layers / 6); - break; - case PIPE_TEXTURE_3D: - so->texconst3 = - A6XX_TEX_CONST_3_MIN_LAYERSZ( - fd_resource_slice(rsc, prsc->last_level)->size0) | - A6XX_TEX_CONST_3_ARRAY_PITCH(fd_resource_slice(rsc, lvl)->size0); - so->texconst5 = - A6XX_TEX_CONST_5_DEPTH(u_minify(prsc->depth0, lvl)); - break; - default: - break; - } - - if (rsc->layout.tile_all) - so->texconst3 |= A6XX_TEX_CONST_3_TILE_ALL; - - if (ubwc_enabled) { - uint32_t block_width, block_height; - fdl6_get_ubwc_blockwidth(&rsc->layout, &block_width, &block_height); - - so->texconst3 |= A6XX_TEX_CONST_3_FLAG; - so->texconst9 |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH(rsc->layout.ubwc_layer_size >> 2); - so->texconst10 |= - A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH(fdl_ubwc_pitch(&rsc->layout, lvl)) | - A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil(DIV_ROUND_UP(u_minify(prsc->width0, lvl), block_width))) | - A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil(DIV_ROUND_UP(u_minify(prsc->height0, lvl), block_height))); - } + const struct pipe_sampler_view *cso = &so->base; + struct pipe_resource *prsc = cso->texture; + struct fd_resource *rsc = fd_resource(prsc); + enum pipe_format format = cso->format; + bool ubwc_enabled = false; + unsigned lvl, layers = 0; + + fd6_validate_format(ctx, rsc, cso->format); + + if (format == PIPE_FORMAT_X32_S8X24_UINT) { + rsc = rsc->stencil; + format = rsc->b.b.format; + } + + so->seqno = ++fd6_context(ctx)->tex_seqno; + so->ptr1 = rsc; + so->rsc_seqno = rsc->seqno; + + if (cso->target == PIPE_BUFFER) { + unsigned elements = cso->u.buf.size / util_format_get_blocksize(format); + + lvl = 0; + so->texconst1 = A6XX_TEX_CONST_1_WIDTH(elements & MASK(15)) | + A6XX_TEX_CONST_1_HEIGHT(elements >> 15); + so->texconst2 = A6XX_TEX_CONST_2_UNK4 | A6XX_TEX_CONST_2_UNK31; + so->offset1 = cso->u.buf.offset; + } else { + unsigned miplevels; + + lvl = fd_sampler_first_level(cso); + miplevels = fd_sampler_last_level(cso) - lvl; + layers = cso->u.tex.last_layer - cso->u.tex.first_layer + 1; + + so->texconst0 |= A6XX_TEX_CONST_0_MIPLVLS(miplevels); + so->texconst1 = A6XX_TEX_CONST_1_WIDTH(u_minify(prsc->width0, lvl)) | + A6XX_TEX_CONST_1_HEIGHT(u_minify(prsc->height0, lvl)); + so->texconst2 = 
A6XX_TEX_CONST_2_PITCHALIGN(rsc->layout.pitchalign - 6) | + A6XX_TEX_CONST_2_PITCH(fd_resource_pitch(rsc, lvl)); + + ubwc_enabled = fd_resource_ubwc_enabled(rsc, lvl); + + if (rsc->b.b.format == PIPE_FORMAT_R8_G8B8_420_UNORM) { + struct fd_resource *next = fd_resource(rsc->b.b.next); + + /* In case of biplanar R8_G8B8, the UBWC metadata address in + * dwords 7 and 8, is instead the pointer to the second plane. + */ + so->ptr2 = next; + so->texconst6 = + A6XX_TEX_CONST_6_PLANE_PITCH(fd_resource_pitch(next, lvl)); + + if (ubwc_enabled) { + /* Further, if using UBWC with R8_G8B8, we only point to the + * UBWC header and the color data is expected to follow immediately. + */ + so->offset1 = + fd_resource_ubwc_offset(rsc, lvl, cso->u.tex.first_layer); + so->offset2 = + fd_resource_ubwc_offset(next, lvl, cso->u.tex.first_layer); + } else { + so->offset1 = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); + so->offset2 = fd_resource_offset(next, lvl, cso->u.tex.first_layer); + } + } else { + so->offset1 = fd_resource_offset(rsc, lvl, cso->u.tex.first_layer); + if (ubwc_enabled) { + so->ptr2 = rsc; + so->offset2 = + fd_resource_ubwc_offset(rsc, lvl, cso->u.tex.first_layer); + } + } + } + + so->texconst0 |= + fd6_tex_const_0(prsc, lvl, cso->format, cso->swizzle_r, cso->swizzle_g, + cso->swizzle_b, cso->swizzle_a); + + so->texconst2 |= A6XX_TEX_CONST_2_TYPE(fd6_tex_type(cso->target)); + + switch (cso->target) { + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_2D: + so->texconst3 = A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); + so->texconst5 = A6XX_TEX_CONST_5_DEPTH(1); + break; + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D_ARRAY: + so->texconst3 = A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); + so->texconst5 = A6XX_TEX_CONST_5_DEPTH(layers); + break; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + so->texconst3 = A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layout.layer_size); + so->texconst5 = A6XX_TEX_CONST_5_DEPTH(layers / 6); + break; + case PIPE_TEXTURE_3D: + so->texconst3 = + A6XX_TEX_CONST_3_MIN_LAYERSZ( + fd_resource_slice(rsc, prsc->last_level)->size0) | + A6XX_TEX_CONST_3_ARRAY_PITCH(fd_resource_slice(rsc, lvl)->size0); + so->texconst5 = A6XX_TEX_CONST_5_DEPTH(u_minify(prsc->depth0, lvl)); + break; + default: + break; + } + + if (rsc->layout.tile_all) + so->texconst3 |= A6XX_TEX_CONST_3_TILE_ALL; + + if (ubwc_enabled) { + uint32_t block_width, block_height; + fdl6_get_ubwc_blockwidth(&rsc->layout, &block_width, &block_height); + + so->texconst3 |= A6XX_TEX_CONST_3_FLAG; + so->texconst9 |= A6XX_TEX_CONST_9_FLAG_BUFFER_ARRAY_PITCH( + rsc->layout.ubwc_layer_size >> 2); + so->texconst10 |= + A6XX_TEX_CONST_10_FLAG_BUFFER_PITCH( + fdl_ubwc_pitch(&rsc->layout, lvl)) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGW(util_logbase2_ceil( + DIV_ROUND_UP(u_minify(prsc->width0, lvl), block_width))) | + A6XX_TEX_CONST_10_FLAG_BUFFER_LOGH(util_logbase2_ceil( + DIV_ROUND_UP(u_minify(prsc->height0, lvl), block_height))); + } } /* NOTE this can be called in either driver thread or frontend thread @@ -344,190 +340,190 @@ fd6_sampler_view_update(struct fd_context *ctx, struct fd6_pipe_sampler_view *so */ static void fd6_sampler_view_destroy(struct pipe_context *pctx, - struct pipe_sampler_view *_view) + struct pipe_sampler_view *_view) { - struct fd_context *ctx = fd_context(pctx); - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd6_pipe_sampler_view *view = fd6_pipe_sampler_view(_view); + struct fd_context *ctx = fd_context(pctx); + struct fd6_context 
*fd6_ctx = fd6_context(ctx); + struct fd6_pipe_sampler_view *view = fd6_pipe_sampler_view(_view); - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - hash_table_foreach(fd6_ctx->tex_cache, entry) { - struct fd6_texture_state *state = entry->data; + hash_table_foreach(fd6_ctx->tex_cache, entry) + { + struct fd6_texture_state *state = entry->data; - for (unsigned i = 0; i < ARRAY_SIZE(state->key.view); i++) { - if (view->seqno == state->key.view[i].seqno) { - remove_tex_entry(fd6_ctx, entry); - break; - } - } - } + for (unsigned i = 0; i < ARRAY_SIZE(state->key.view); i++) { + if (view->seqno == state->key.view[i].seqno) { + remove_tex_entry(fd6_ctx, entry); + break; + } + } + } - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); - pipe_resource_reference(&view->base.texture, NULL); + pipe_resource_reference(&view->base.texture, NULL); - free(view); + free(view); } - static uint32_t key_hash(const void *_key) { - const struct fd6_texture_key *key = _key; - return XXH32(key, sizeof(*key), 0); + const struct fd6_texture_key *key = _key; + return XXH32(key, sizeof(*key), 0); } static bool key_equals(const void *_a, const void *_b) { - const struct fd6_texture_key *a = _a; - const struct fd6_texture_key *b = _b; - return memcmp(a, b, sizeof(struct fd6_texture_key)) == 0; + const struct fd6_texture_key *a = _a; + const struct fd6_texture_key *b = _b; + return memcmp(a, b, sizeof(struct fd6_texture_key)) == 0; } struct fd6_texture_state * fd6_texture_state(struct fd_context *ctx, enum pipe_shader_type type, - struct fd_texture_stateobj *tex) + struct fd_texture_stateobj *tex) { - struct fd6_context *fd6_ctx = fd6_context(ctx); - struct fd6_texture_state *state = NULL; - struct fd6_texture_key key; - bool needs_border = false; + struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd6_texture_state *state = NULL; + struct fd6_texture_key key; + bool needs_border = false; - memset(&key, 0, sizeof(key)); + memset(&key, 0, sizeof(key)); - for (unsigned i = 0; i < tex->num_textures; i++) { - if (!tex->textures[i]) - continue; + for (unsigned i = 0; i < tex->num_textures; i++) { + if (!tex->textures[i]) + continue; - struct fd6_pipe_sampler_view *view = - fd6_pipe_sampler_view(tex->textures[i]); + struct fd6_pipe_sampler_view *view = + fd6_pipe_sampler_view(tex->textures[i]); - /* NOTE that if the backing rsc was uncompressed between the - * time that the CSO was originally created and now, the rsc - * seqno would have changed, so we don't have to worry about - * getting a bogus cache hit. - */ - key.view[i].rsc_seqno = fd_resource(view->base.texture)->seqno; - key.view[i].seqno = view->seqno; - } + /* NOTE that if the backing rsc was uncompressed between the + * time that the CSO was originally created and now, the rsc + * seqno would have changed, so we don't have to worry about + * getting a bogus cache hit. 
+ */ + key.view[i].rsc_seqno = fd_resource(view->base.texture)->seqno; + key.view[i].seqno = view->seqno; + } - for (unsigned i = 0; i < tex->num_samplers; i++) { - if (!tex->samplers[i]) - continue; + for (unsigned i = 0; i < tex->num_samplers; i++) { + if (!tex->samplers[i]) + continue; - struct fd6_sampler_stateobj *sampler = - fd6_sampler_stateobj(tex->samplers[i]); + struct fd6_sampler_stateobj *sampler = + fd6_sampler_stateobj(tex->samplers[i]); - key.samp[i].seqno = sampler->seqno; + key.samp[i].seqno = sampler->seqno; - needs_border |= sampler->needs_border; - } + needs_border |= sampler->needs_border; + } - key.type = type; - key.bcolor_offset = fd6_border_color_offset(ctx, type, tex); + key.type = type; + key.bcolor_offset = fd6_border_color_offset(ctx, type, tex); - uint32_t hash = key_hash(&key); - fd_screen_lock(ctx->screen); - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(fd6_ctx->tex_cache, hash, &key); + uint32_t hash = key_hash(&key); + fd_screen_lock(ctx->screen); + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(fd6_ctx->tex_cache, hash, &key); - if (entry) { - fd6_texture_state_reference(&state, entry->data); - goto out_unlock; - } + if (entry) { + fd6_texture_state_reference(&state, entry->data); + goto out_unlock; + } - state = CALLOC_STRUCT(fd6_texture_state); + state = CALLOC_STRUCT(fd6_texture_state); - /* NOTE: one ref for tex_cache, and second ref for returned state: */ - pipe_reference_init(&state->reference, 2); - state->key = key; - state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); - state->needs_border = needs_border; + /* NOTE: one ref for tex_cache, and second ref for returned state: */ + pipe_reference_init(&state->reference, 2); + state->key = key; + state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000); + state->needs_border = needs_border; - fd6_emit_textures(ctx, state->stateobj, type, tex, key.bcolor_offset, NULL); + fd6_emit_textures(ctx, state->stateobj, type, tex, key.bcolor_offset, NULL); - /* NOTE: uses copy of key in state obj, because pointer passed by caller - * is probably on the stack - */ - _mesa_hash_table_insert_pre_hashed(fd6_ctx->tex_cache, hash, - &state->key, state); + /* NOTE: uses copy of key in state obj, because pointer passed by caller + * is probably on the stack + */ + _mesa_hash_table_insert_pre_hashed(fd6_ctx->tex_cache, hash, &state->key, + state); out_unlock: - fd_screen_unlock(ctx->screen); - return state; + fd_screen_unlock(ctx->screen); + return state; } void -__fd6_texture_state_describe(char* buf, const struct fd6_texture_state *tex) +__fd6_texture_state_describe(char *buf, const struct fd6_texture_state *tex) { - sprintf(buf, "fd6_texture_state<%p>", tex); + sprintf(buf, "fd6_texture_state<%p>", tex); } void __fd6_texture_state_destroy(struct fd6_texture_state *state) { - fd_ringbuffer_del(state->stateobj); - free(state); + fd_ringbuffer_del(state->stateobj); + free(state); } static void -fd6_rebind_resource(struct fd_context *ctx, struct fd_resource *rsc) - assert_dt +fd6_rebind_resource(struct fd_context *ctx, struct fd_resource *rsc) assert_dt { - fd_screen_assert_locked(ctx->screen); + fd_screen_assert_locked(ctx->screen); - if (!(rsc->dirty & FD_DIRTY_TEX)) - return; + if (!(rsc->dirty & FD_DIRTY_TEX)) + return; - struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd6_context *fd6_ctx = fd6_context(ctx); - hash_table_foreach (fd6_ctx->tex_cache, entry) { - struct fd6_texture_state *state = entry->data; + hash_table_foreach(fd6_ctx->tex_cache, entry) + { 
+ struct fd6_texture_state *state = entry->data; - for (unsigned i = 0; i < ARRAY_SIZE(state->key.view); i++) { - if (rsc->seqno == state->key.view[i].rsc_seqno) { - remove_tex_entry(fd6_ctx, entry); - break; - } - } - } + for (unsigned i = 0; i < ARRAY_SIZE(state->key.view); i++) { + if (rsc->seqno == state->key.view[i].rsc_seqno) { + remove_tex_entry(fd6_ctx, entry); + break; + } + } + } } void -fd6_texture_init(struct pipe_context *pctx) - disable_thread_safety_analysis +fd6_texture_init(struct pipe_context *pctx) disable_thread_safety_analysis { - struct fd_context *ctx = fd_context(pctx); - struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd_context *ctx = fd_context(pctx); + struct fd6_context *fd6_ctx = fd6_context(ctx); - pctx->create_sampler_state = fd6_sampler_state_create; - pctx->delete_sampler_state = fd6_sampler_state_delete; - pctx->bind_sampler_states = fd_sampler_states_bind; + pctx->create_sampler_state = fd6_sampler_state_create; + pctx->delete_sampler_state = fd6_sampler_state_delete; + pctx->bind_sampler_states = fd_sampler_states_bind; - pctx->create_sampler_view = fd6_sampler_view_create; - pctx->sampler_view_destroy = fd6_sampler_view_destroy; - pctx->set_sampler_views = fd6_set_sampler_views; + pctx->create_sampler_view = fd6_sampler_view_create; + pctx->sampler_view_destroy = fd6_sampler_view_destroy; + pctx->set_sampler_views = fd6_set_sampler_views; - ctx->rebind_resource = fd6_rebind_resource; + ctx->rebind_resource = fd6_rebind_resource; - fd6_ctx->tex_cache = _mesa_hash_table_create(NULL, key_hash, key_equals); + fd6_ctx->tex_cache = _mesa_hash_table_create(NULL, key_hash, key_equals); } void fd6_texture_fini(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); - struct fd6_context *fd6_ctx = fd6_context(ctx); + struct fd_context *ctx = fd_context(pctx); + struct fd6_context *fd6_ctx = fd6_context(ctx); - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - hash_table_foreach(fd6_ctx->tex_cache, entry) { - remove_tex_entry(fd6_ctx, entry); - } + hash_table_foreach(fd6_ctx->tex_cache, entry) + { + remove_tex_entry(fd6_ctx, entry); + } - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); - ralloc_free(fd6_ctx->tex_cache); + ralloc_free(fd6_ctx->tex_cache); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_texture.h b/src/gallium/drivers/freedreno/a6xx/fd6_texture.h index 0ea68f8..572adbc 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_texture.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_texture.h @@ -30,48 +30,49 @@ #include "pipe/p_context.h" -#include "freedreno_texture.h" #include "freedreno_resource.h" +#include "freedreno_texture.h" #include "fd6_context.h" #include "fd6_format.h" struct fd6_sampler_stateobj { - struct pipe_sampler_state base; - uint32_t texsamp0, texsamp1, texsamp2, texsamp3; - bool needs_border; - uint16_t seqno; + struct pipe_sampler_state base; + uint32_t texsamp0, texsamp1, texsamp2, texsamp3; + bool needs_border; + uint16_t seqno; }; static inline struct fd6_sampler_stateobj * fd6_sampler_stateobj(struct pipe_sampler_state *samp) { - return (struct fd6_sampler_stateobj *)samp; + return (struct fd6_sampler_stateobj *)samp; } struct fd6_pipe_sampler_view { - struct pipe_sampler_view base; - uint32_t texconst0, texconst1, texconst2, texconst3, texconst5; - uint32_t texconst6, texconst7, texconst8, texconst9, texconst10, texconst11; - uint32_t offset1, offset2; - struct fd_resource *ptr1, *ptr2; - uint16_t seqno; - - /* For detecting when a resource has transitioned from UBWC 
compressed - * to uncompressed, which means the sampler state needs to be updated - */ - uint16_t rsc_seqno; - - bool needs_validate; + struct pipe_sampler_view base; + uint32_t texconst0, texconst1, texconst2, texconst3, texconst5; + uint32_t texconst6, texconst7, texconst8, texconst9, texconst10, texconst11; + uint32_t offset1, offset2; + struct fd_resource *ptr1, *ptr2; + uint16_t seqno; + + /* For detecting when a resource has transitioned from UBWC compressed + * to uncompressed, which means the sampler state needs to be updated + */ + uint16_t rsc_seqno; + + bool needs_validate; }; static inline struct fd6_pipe_sampler_view * fd6_pipe_sampler_view(struct pipe_sampler_view *pview) { - return (struct fd6_pipe_sampler_view *)pview; + return (struct fd6_pipe_sampler_view *)pview; } -void fd6_sampler_view_update(struct fd_context *ctx, struct fd6_pipe_sampler_view *so) assert_dt; +void fd6_sampler_view_update(struct fd_context *ctx, + struct fd6_pipe_sampler_view *so) assert_dt; void fd6_texture_init(struct pipe_context *pctx); void fd6_texture_fini(struct pipe_context *pctx); @@ -79,54 +80,53 @@ void fd6_texture_fini(struct pipe_context *pctx); static inline enum a6xx_tex_type fd6_tex_type(unsigned target) { - switch (target) { - default: - assert(0); - case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - case PIPE_TEXTURE_1D_ARRAY: - return A6XX_TEX_1D; - case PIPE_TEXTURE_RECT: - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_2D_ARRAY: - return A6XX_TEX_2D; - case PIPE_TEXTURE_3D: - return A6XX_TEX_3D; - case PIPE_TEXTURE_CUBE: - case PIPE_TEXTURE_CUBE_ARRAY: - return A6XX_TEX_CUBE; - } + switch (target) { + default: + assert(0); + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return A6XX_TEX_1D; + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + return A6XX_TEX_2D; + case PIPE_TEXTURE_3D: + return A6XX_TEX_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return A6XX_TEX_CUBE; + } } static inline unsigned fd6_border_color_offset(struct fd_context *ctx, enum pipe_shader_type type, - struct fd_texture_stateobj *tex) - assert_dt + struct fd_texture_stateobj *tex) assert_dt { - /* Currently we put the FS border-color state after VS. Possibly - * we could swap the order. - * - * This will need update for HS/DS/GS - */ - if (type != PIPE_SHADER_FRAGMENT) - return 0; + /* Currently we put the FS border-color state after VS. Possibly + * we could swap the order. 
+ * + * This will need update for HS/DS/GS + */ + if (type != PIPE_SHADER_FRAGMENT) + return 0; - unsigned needs_border = false; + unsigned needs_border = false; - for (unsigned i = 0; i < tex->num_samplers; i++) { - if (!tex->samplers[i]) - continue; + for (unsigned i = 0; i < tex->num_samplers; i++) { + if (!tex->samplers[i]) + continue; - struct fd6_sampler_stateobj *sampler = - fd6_sampler_stateobj(tex->samplers[i]); + struct fd6_sampler_stateobj *sampler = + fd6_sampler_stateobj(tex->samplers[i]); - needs_border |= sampler->needs_border; - } + needs_border |= sampler->needs_border; + } - if (!needs_border) - return 0; + if (!needs_border) + return 0; - return ctx->tex[PIPE_SHADER_VERTEX].num_samplers; + return ctx->tex[PIPE_SHADER_VERTEX].num_samplers; } /* @@ -139,46 +139,50 @@ fd6_border_color_offset(struct fd_context *ctx, enum pipe_shader_type type, */ struct fd6_texture_key { - struct { - /* We need to track the seqno of the rsc as well as of the - * sampler view, because resource shadowing/etc can result - * that the underlying bo changes (which means the previous - * state was no longer valid. - */ - uint16_t rsc_seqno; - uint16_t seqno; - } view[16]; - struct { - uint16_t seqno; - } samp[16]; - uint8_t type; - uint8_t bcolor_offset; + struct { + /* We need to track the seqno of the rsc as well as of the + * sampler view, because resource shadowing/etc can result + * that the underlying bo changes (which means the previous + * state was no longer valid. + */ + uint16_t rsc_seqno; + uint16_t seqno; + } view[16]; + struct { + uint16_t seqno; + } samp[16]; + uint8_t type; + uint8_t bcolor_offset; }; struct fd6_texture_state { - struct pipe_reference reference; - struct fd6_texture_key key; - struct fd_ringbuffer *stateobj; - bool needs_border; + struct pipe_reference reference; + struct fd6_texture_key key; + struct fd_ringbuffer *stateobj; + bool needs_border; }; -struct fd6_texture_state * fd6_texture_state(struct fd_context *ctx, - enum pipe_shader_type type, struct fd_texture_stateobj *tex) assert_dt; +struct fd6_texture_state * +fd6_texture_state(struct fd_context *ctx, enum pipe_shader_type type, + struct fd_texture_stateobj *tex) assert_dt; /* not called directly: */ -void __fd6_texture_state_describe(char* buf, const struct fd6_texture_state *tex); +void __fd6_texture_state_describe(char *buf, + const struct fd6_texture_state *tex); void __fd6_texture_state_destroy(struct fd6_texture_state *tex); static inline void -fd6_texture_state_reference(struct fd6_texture_state **ptr, struct fd6_texture_state *tex) +fd6_texture_state_reference(struct fd6_texture_state **ptr, + struct fd6_texture_state *tex) { - struct fd6_texture_state *old_tex = *ptr; + struct fd6_texture_state *old_tex = *ptr; - if (pipe_reference_described(&(*ptr)->reference, &tex->reference, - (debug_reference_descriptor)__fd6_texture_state_describe)) - __fd6_texture_state_destroy(old_tex); + if (pipe_reference_described( + &(*ptr)->reference, &tex->reference, + (debug_reference_descriptor)__fd6_texture_state_describe)) + __fd6_texture_state_destroy(old_tex); - *ptr = tex; + *ptr = tex; } #endif /* FD6_TEXTURE_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_vsc.c b/src/gallium/drivers/freedreno/a6xx/fd6_vsc.c index 95cbb42..1398914 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_vsc.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_vsc.c @@ -21,7 +21,6 @@ * SOFTWARE. 
*/ - #include "pipe/p_state.h" #include "util/u_prim.h" @@ -36,8 +35,8 @@ */ enum { - byte = 8, - dword = 4 * byte, + byte = 8, + dword = 4 * byte, } bits_per; /** @@ -47,9 +46,9 @@ enum { static unsigned number_size_bits(unsigned nr) { - unsigned n = util_last_bit(nr); - assert(n); /* encoding 0 is not possible */ - return n + (n - 1); + unsigned n = util_last_bit(nr); + assert(n); /* encoding 0 is not possible */ + return n + (n - 1); } /** @@ -59,45 +58,48 @@ number_size_bits(unsigned nr) static unsigned bitfield_size_bits(unsigned n) { - return n + 1; /* worst case is always 1 + nr of bits */ + return n + 1; /* worst case is always 1 + nr of bits */ } static unsigned prim_count(const struct pipe_draw_info *info, const struct pipe_draw_start_count *draw) { - /* PIPE_PRIM_MAX used internally for RECTLIST blits on 3d pipe: */ - unsigned vtx_per_prim = (info->mode == PIPE_PRIM_MAX) ? 2 : - u_vertices_per_prim(info->mode); - return MAX2(1, (draw->count * info->instance_count) / vtx_per_prim); + /* PIPE_PRIM_MAX used internally for RECTLIST blits on 3d pipe: */ + unsigned vtx_per_prim = + (info->mode == PIPE_PRIM_MAX) ? 2 : u_vertices_per_prim(info->mode); + return MAX2(1, (draw->count * info->instance_count) / vtx_per_prim); } /** * The primitive stream uses a run-length encoding, where each packet contains a - * bitfield of bins covered and then the number of primitives which have the same - * bitfield. Each packet consists of the following, in order: + * bitfield of bins covered and then the number of primitives which have the + * same bitfield. Each packet consists of the following, in order: * * - The (compressed) bitfield of bins covered * - The number of primitives with this bitset * - Checksum * - * The worst case would be that each primitive has a different bitmask. In practice, - * assuming ever other primitive has a different bitmask still gets us conservatively - * large primitive stream sizes. (Ie. 10x what is needed, vs. 20x) + * The worst case would be that each primitive has a different bitmask. In + * practice, assuming ever other primitive has a different bitmask still gets us + * conservatively large primitive stream sizes. (Ie. 10x what is needed, vs. 
+ * 20x) * * https://github.com/freedreno/freedreno/wiki/Visibility-Stream-Format#primitive-streams */ static unsigned primitive_stream_size_bits(const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draw, unsigned num_bins) + const struct pipe_draw_start_count *draw, + unsigned num_bins) { - unsigned num_prims = prim_count(info, draw); - unsigned nbits = - (bitfield_size_bits(num_bins) /* bitfield of bins covered */ - + number_size_bits(1) /* number of primitives with this bitset */ - + 1 /* checksum */ - ) * DIV_ROUND_UP(num_prims, 2); - return align(nbits, dword); + unsigned num_prims = prim_count(info, draw); + unsigned nbits = + (bitfield_size_bits(num_bins) /* bitfield of bins covered */ + + number_size_bits(1) /* number of primitives with this bitset */ + + 1 /* checksum */ + ) * + DIV_ROUND_UP(num_prims, 2); + return align(nbits, dword); } /** @@ -113,39 +115,40 @@ primitive_stream_size_bits(const struct pipe_draw_info *info, */ static unsigned draw_stream_size_bits(const struct pipe_draw_info *info, unsigned num_bins, - unsigned prim_strm_bits) + unsigned prim_strm_bits) { - unsigned ndwords = prim_strm_bits / dword; - return (bitfield_size_bits(num_bins) /* bitfield of bins */ - + 1 /* last-instance-bit */ - + number_size_bits(ndwords) /* size of corresponding prim strm */ - + 1 /* checksum */ - ) * MAX2(1, info->instance_count); + unsigned ndwords = prim_strm_bits / dword; + return (bitfield_size_bits(num_bins) /* bitfield of bins */ + + 1 /* last-instance-bit */ + + number_size_bits(ndwords) /* size of corresponding prim strm */ + + 1 /* checksum */ + ) * + MAX2(1, info->instance_count); } void fd6_vsc_update_sizes(struct fd_batch *batch, const struct pipe_draw_info *info, const struct pipe_draw_start_count *draw) { - if (!batch->num_bins_per_pipe) { - batch->num_bins_per_pipe = fd_gmem_estimate_bins_per_pipe(batch); - - /* This is a convenient spot to add the size of the final draw- - * stream packet: - * - * If there are N bins, the final packet, after all the draws are - * done, consists of a 1 followed by N + 17 0's, plus a final 1. - * This uses the otherwise-unused pattern of a non-empty bitfield - * (initial 1) that is nontheless empty (has all 0's) - */ - unsigned final_pkt_sz = 1 + batch->num_bins_per_pipe + 17 + 1; - batch->prim_strm_bits = align(final_pkt_sz, dword); - } - - unsigned prim_strm_bits = - primitive_stream_size_bits(info, draw, batch->num_bins_per_pipe); - unsigned draw_strm_bits = - draw_stream_size_bits(info, batch->num_bins_per_pipe, prim_strm_bits); + if (!batch->num_bins_per_pipe) { + batch->num_bins_per_pipe = fd_gmem_estimate_bins_per_pipe(batch); + + /* This is a convenient spot to add the size of the final draw- + * stream packet: + * + * If there are N bins, the final packet, after all the draws are + * done, consists of a 1 followed by N + 17 0's, plus a final 1. 
+ * This uses the otherwise-unused pattern of a non-empty bitfield + * (initial 1) that is nontheless empty (has all 0's) + */ + unsigned final_pkt_sz = 1 + batch->num_bins_per_pipe + 17 + 1; + batch->prim_strm_bits = align(final_pkt_sz, dword); + } + + unsigned prim_strm_bits = + primitive_stream_size_bits(info, draw, batch->num_bins_per_pipe); + unsigned draw_strm_bits = + draw_stream_size_bits(info, batch->num_bins_per_pipe, prim_strm_bits); #if 0 printf("vsc: prim_strm_bits=%d, draw_strm_bits=%d, nb=%u, ic=%u, c=%u, pc=%u (%s)\n", @@ -156,7 +159,6 @@ fd6_vsc_update_sizes(struct fd_batch *batch, const struct pipe_draw_info *info, u_prim_name(info->mode)); #endif - batch->prim_strm_bits += prim_strm_bits; - batch->draw_strm_bits += draw_strm_bits; + batch->prim_strm_bits += prim_strm_bits; + batch->draw_strm_bits += draw_strm_bits; } - diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_vsc.h b/src/gallium/drivers/freedreno/a6xx/fd6_vsc.h index 3e1548e..b1aa099 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_vsc.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_vsc.h @@ -24,7 +24,8 @@ #ifndef FD6_VSC_H_ #define FD6_VSC_H_ -void fd6_vsc_update_sizes(struct fd_batch *batch, const struct pipe_draw_info *info, +void fd6_vsc_update_sizes(struct fd_batch *batch, + const struct pipe_draw_info *info, const struct pipe_draw_start_count *draw); #endif /* FD6_VSC_H_ */ diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c index 4bd39e2..9c63811 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c +++ b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.c @@ -25,14 +25,13 @@ * Rob Clark */ - #include "pipe/p_state.h" -#include "util/u_string.h" #include "util/u_memory.h" +#include "util/u_string.h" -#include "fd6_zsa.h" #include "fd6_context.h" #include "fd6_format.h" +#include "fd6_zsa.h" /* update lza state based on stencil-test func: * @@ -54,186 +53,187 @@ */ static void update_lrz_stencil(struct fd6_zsa_stateobj *so, enum pipe_compare_func func, - bool stencil_write) + bool stencil_write) { - switch (func) { - case PIPE_FUNC_ALWAYS: - /* nothing to do for LRZ, but for stencil test when stencil- - * write is enabled, we need to disable lrz-test, since - * conceptually stencil test and write happens before depth- - * test: - */ - if (stencil_write) { - so->lrz.enable = false; - so->lrz.test = false; - } - break; - case PIPE_FUNC_NEVER: - /* fragment never passes, disable lrz_write for this draw: */ - so->lrz.write = false; - break; - default: - /* whether the fragment passes or not depends on result - * of stencil test, which we cannot know when doing binning - * pass: - */ - so->lrz.write = false; - /* similarly to the PIPE_FUNC_ALWAY case, if there are side- - * effects from stencil test we need to disable lrz-test. 
- */ - if (stencil_write) { - so->lrz.enable = false; - so->lrz.test = false; - } - break; - } + switch (func) { + case PIPE_FUNC_ALWAYS: + /* nothing to do for LRZ, but for stencil test when stencil- + * write is enabled, we need to disable lrz-test, since + * conceptually stencil test and write happens before depth- + * test: + */ + if (stencil_write) { + so->lrz.enable = false; + so->lrz.test = false; + } + break; + case PIPE_FUNC_NEVER: + /* fragment never passes, disable lrz_write for this draw: */ + so->lrz.write = false; + break; + default: + /* whether the fragment passes or not depends on result + * of stencil test, which we cannot know when doing binning + * pass: + */ + so->lrz.write = false; + /* similarly to the PIPE_FUNC_ALWAY case, if there are side- + * effects from stencil test we need to disable lrz-test. + */ + if (stencil_write) { + so->lrz.enable = false; + so->lrz.test = false; + } + break; + } } void * fd6_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso) + const struct pipe_depth_stencil_alpha_state *cso) { - struct fd_context *ctx = fd_context(pctx); - struct fd6_zsa_stateobj *so; - - so = CALLOC_STRUCT(fd6_zsa_stateobj); - if (!so) - return NULL; - - so->base = *cso; - - so->writes_zs = util_writes_depth_stencil(cso); - - so->rb_depth_cntl |= - A6XX_RB_DEPTH_CNTL_ZFUNC(cso->depth_func); /* maps 1:1 */ - - if (cso->depth_enabled) { - so->rb_depth_cntl |= - A6XX_RB_DEPTH_CNTL_Z_ENABLE | - A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; - - so->lrz.test = true; - - if (cso->depth_writemask) { - so->lrz.write = true; - } - - switch (cso->depth_func) { - case PIPE_FUNC_LESS: - case PIPE_FUNC_LEQUAL: - so->lrz.enable = true; - so->lrz.direction = FD_LRZ_LESS; - break; - - case PIPE_FUNC_GREATER: - case PIPE_FUNC_GEQUAL: - so->lrz.enable = true; - so->lrz.direction = FD_LRZ_GREATER; - break; - - case PIPE_FUNC_NEVER: - so->lrz.enable = true; - so->lrz.write = false; - so->lrz.direction = FD_LRZ_LESS; - break; - - /* TODO revisit these: */ - case PIPE_FUNC_EQUAL: - case PIPE_FUNC_NOTEQUAL: - case PIPE_FUNC_ALWAYS: - so->lrz.write = false; - so->invalidate_lrz = true; - break; - } - } - - if (cso->depth_writemask) - so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; - - if (cso->stencil[0].enabled) { - const struct pipe_stencil_state *s = &cso->stencil[0]; - - /* stencil test happens before depth test, so without performing - * stencil test we don't really know what the updates to the - * depth buffer will be. 
- */ - update_lrz_stencil(so, s->func, !!s->writemask); - - so->rb_stencil_control |= - A6XX_RB_STENCIL_CONTROL_STENCIL_READ | - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | - A6XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ - A6XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | - A6XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | - A6XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); - - so->rb_stencilmask = A6XX_RB_STENCILMASK_MASK(s->valuemask); - so->rb_stencilwrmask = A6XX_RB_STENCILWRMASK_WRMASK(s->writemask); - - if (cso->stencil[1].enabled) { - const struct pipe_stencil_state *bs = &cso->stencil[1]; - - update_lrz_stencil(so, bs->func, !!bs->writemask); - - so->rb_stencil_control |= - A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | - A6XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ - A6XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | - A6XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | - A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); - - so->rb_stencilmask |= A6XX_RB_STENCILMASK_BFMASK(bs->valuemask); - so->rb_stencilwrmask |= A6XX_RB_STENCILWRMASK_BFWRMASK(bs->writemask); - } - } - - if (cso->alpha_enabled) { - /* Alpha test is functionally a conditional discard, so we can't - * write LRZ before seeing if we end up discarding or not - */ - if (cso->alpha_func != PIPE_FUNC_ALWAYS) { - so->lrz.write = false; - so->alpha_test = true; - } - - uint32_t ref = cso->alpha_ref_value * 255.0; - so->rb_alpha_control = - A6XX_RB_ALPHA_CONTROL_ALPHA_TEST | - A6XX_RB_ALPHA_CONTROL_ALPHA_REF(ref) | - A6XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); - } - - for (int i = 0; i < 4; i++) { - struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 9 * 4); - - OUT_PKT4(ring, REG_A6XX_RB_ALPHA_CONTROL, 1); - OUT_RING(ring, (i & FD6_ZSA_NO_ALPHA) ? 
- so->rb_alpha_control & ~A6XX_RB_ALPHA_CONTROL_ALPHA_TEST : - so->rb_alpha_control); - - OUT_PKT4(ring, REG_A6XX_RB_STENCIL_CONTROL, 1); - OUT_RING(ring, so->rb_stencil_control); - - OUT_PKT4(ring, REG_A6XX_RB_DEPTH_CNTL, 1); - OUT_RING(ring, so->rb_depth_cntl | - COND(i & FD6_ZSA_DEPTH_CLAMP, A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE)); - - OUT_PKT4(ring, REG_A6XX_RB_STENCILMASK, 2); - OUT_RING(ring, so->rb_stencilmask); - OUT_RING(ring, so->rb_stencilwrmask); - - so->stateobj[i] = ring; - } - - return so; + struct fd_context *ctx = fd_context(pctx); + struct fd6_zsa_stateobj *so; + + so = CALLOC_STRUCT(fd6_zsa_stateobj); + if (!so) + return NULL; + + so->base = *cso; + + so->writes_zs = util_writes_depth_stencil(cso); + + so->rb_depth_cntl |= + A6XX_RB_DEPTH_CNTL_ZFUNC(cso->depth_func); /* maps 1:1 */ + + if (cso->depth_enabled) { + so->rb_depth_cntl |= + A6XX_RB_DEPTH_CNTL_Z_ENABLE | A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE; + + so->lrz.test = true; + + if (cso->depth_writemask) { + so->lrz.write = true; + } + + switch (cso->depth_func) { + case PIPE_FUNC_LESS: + case PIPE_FUNC_LEQUAL: + so->lrz.enable = true; + so->lrz.direction = FD_LRZ_LESS; + break; + + case PIPE_FUNC_GREATER: + case PIPE_FUNC_GEQUAL: + so->lrz.enable = true; + so->lrz.direction = FD_LRZ_GREATER; + break; + + case PIPE_FUNC_NEVER: + so->lrz.enable = true; + so->lrz.write = false; + so->lrz.direction = FD_LRZ_LESS; + break; + + /* TODO revisit these: */ + case PIPE_FUNC_EQUAL: + case PIPE_FUNC_NOTEQUAL: + case PIPE_FUNC_ALWAYS: + so->lrz.write = false; + so->invalidate_lrz = true; + break; + } + } + + if (cso->depth_writemask) + so->rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE; + + if (cso->stencil[0].enabled) { + const struct pipe_stencil_state *s = &cso->stencil[0]; + + /* stencil test happens before depth test, so without performing + * stencil test we don't really know what the updates to the + * depth buffer will be. 
+ */ + update_lrz_stencil(so, s->func, !!s->writemask); + + so->rb_stencil_control |= + A6XX_RB_STENCIL_CONTROL_STENCIL_READ | + A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE | + A6XX_RB_STENCIL_CONTROL_FUNC(s->func) | /* maps 1:1 */ + A6XX_RB_STENCIL_CONTROL_FAIL(fd_stencil_op(s->fail_op)) | + A6XX_RB_STENCIL_CONTROL_ZPASS(fd_stencil_op(s->zpass_op)) | + A6XX_RB_STENCIL_CONTROL_ZFAIL(fd_stencil_op(s->zfail_op)); + + so->rb_stencilmask = A6XX_RB_STENCILMASK_MASK(s->valuemask); + so->rb_stencilwrmask = A6XX_RB_STENCILWRMASK_WRMASK(s->writemask); + + if (cso->stencil[1].enabled) { + const struct pipe_stencil_state *bs = &cso->stencil[1]; + + update_lrz_stencil(so, bs->func, !!bs->writemask); + + so->rb_stencil_control |= + A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF | + A6XX_RB_STENCIL_CONTROL_FUNC_BF(bs->func) | /* maps 1:1 */ + A6XX_RB_STENCIL_CONTROL_FAIL_BF(fd_stencil_op(bs->fail_op)) | + A6XX_RB_STENCIL_CONTROL_ZPASS_BF(fd_stencil_op(bs->zpass_op)) | + A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(fd_stencil_op(bs->zfail_op)); + + so->rb_stencilmask |= A6XX_RB_STENCILMASK_BFMASK(bs->valuemask); + so->rb_stencilwrmask |= A6XX_RB_STENCILWRMASK_BFWRMASK(bs->writemask); + } + } + + if (cso->alpha_enabled) { + /* Alpha test is functionally a conditional discard, so we can't + * write LRZ before seeing if we end up discarding or not + */ + if (cso->alpha_func != PIPE_FUNC_ALWAYS) { + so->lrz.write = false; + so->alpha_test = true; + } + + uint32_t ref = cso->alpha_ref_value * 255.0; + so->rb_alpha_control = + A6XX_RB_ALPHA_CONTROL_ALPHA_TEST | + A6XX_RB_ALPHA_CONTROL_ALPHA_REF(ref) | + A6XX_RB_ALPHA_CONTROL_ALPHA_TEST_FUNC(cso->alpha_func); + } + + for (int i = 0; i < 4; i++) { + struct fd_ringbuffer *ring = fd_ringbuffer_new_object(ctx->pipe, 9 * 4); + + OUT_PKT4(ring, REG_A6XX_RB_ALPHA_CONTROL, 1); + OUT_RING(ring, + (i & FD6_ZSA_NO_ALPHA) + ? 
so->rb_alpha_control & ~A6XX_RB_ALPHA_CONTROL_ALPHA_TEST + : so->rb_alpha_control); + + OUT_PKT4(ring, REG_A6XX_RB_STENCIL_CONTROL, 1); + OUT_RING(ring, so->rb_stencil_control); + + OUT_PKT4(ring, REG_A6XX_RB_DEPTH_CNTL, 1); + OUT_RING(ring, + so->rb_depth_cntl | COND(i & FD6_ZSA_DEPTH_CLAMP, + A6XX_RB_DEPTH_CNTL_Z_CLAMP_ENABLE)); + + OUT_PKT4(ring, REG_A6XX_RB_STENCILMASK, 2); + OUT_RING(ring, so->rb_stencilmask); + OUT_RING(ring, so->rb_stencilwrmask); + + so->stateobj[i] = ring; + } + + return so; } void fd6_zsa_state_delete(struct pipe_context *pctx, void *hwcso) { - struct fd6_zsa_stateobj *so = hwcso; + struct fd6_zsa_stateobj *so = hwcso; - for (int i = 0; i < ARRAY_SIZE(so->stateobj); i++) - fd_ringbuffer_del(so->stateobj[i]); - FREE(hwcso); + for (int i = 0; i < ARRAY_SIZE(so->stateobj); i++) + fd_ringbuffer_del(so->stateobj[i]); + FREE(hwcso); } diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h index de23972..15be91e 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h +++ b/src/gallium/drivers/freedreno/a6xx/fd6_zsa.h @@ -28,9 +28,8 @@ #ifndef FD6_ZSA_H_ #define FD6_ZSA_H_ - -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_util.h" @@ -40,45 +39,42 @@ #define FD6_ZSA_DEPTH_CLAMP (1 << 1) struct fd6_zsa_stateobj { - struct pipe_depth_stencil_alpha_state base; + struct pipe_depth_stencil_alpha_state base; - uint32_t rb_alpha_control; - uint32_t rb_depth_cntl; - uint32_t rb_stencil_control; - uint32_t rb_stencilmask; - uint32_t rb_stencilwrmask; + uint32_t rb_alpha_control; + uint32_t rb_depth_cntl; + uint32_t rb_stencil_control; + uint32_t rb_stencilmask; + uint32_t rb_stencilwrmask; - struct fd6_lrz_state lrz; - bool writes_zs; /* writes depth and/or stencil */ - bool invalidate_lrz; - bool alpha_test; + struct fd6_lrz_state lrz; + bool writes_zs; /* writes depth and/or stencil */ + bool invalidate_lrz; + bool alpha_test; - struct fd_ringbuffer *stateobj[4]; + struct fd_ringbuffer *stateobj[4]; }; static inline struct fd6_zsa_stateobj * fd6_zsa_stateobj(struct pipe_depth_stencil_alpha_state *zsa) { - return (struct fd6_zsa_stateobj *)zsa; + return (struct fd6_zsa_stateobj *)zsa; } static inline struct fd_ringbuffer * -fd6_zsa_state(struct fd_context *ctx, bool no_alpha, bool depth_clamp) - assert_dt +fd6_zsa_state(struct fd_context *ctx, bool no_alpha, bool depth_clamp) assert_dt { - int variant = 0; - if (no_alpha) - variant |= FD6_ZSA_NO_ALPHA; - if (depth_clamp) - variant |= FD6_ZSA_DEPTH_CLAMP; - return fd6_zsa_stateobj(ctx->zsa)->stateobj[variant]; + int variant = 0; + if (no_alpha) + variant |= FD6_ZSA_NO_ALPHA; + if (depth_clamp) + variant |= FD6_ZSA_DEPTH_CLAMP; + return fd6_zsa_stateobj(ctx->zsa)->stateobj[variant]; } -void * fd6_zsa_state_create(struct pipe_context *pctx, - const struct pipe_depth_stencil_alpha_state *cso); +void *fd6_zsa_state_create(struct pipe_context *pctx, + const struct pipe_depth_stencil_alpha_state *cso); -void fd6_zsa_state_delete(struct pipe_context *pctx, - void *hwcso); +void fd6_zsa_state_delete(struct pipe_context *pctx, void *hwcso); #endif /* FD6_ZSA_H_ */ - diff --git a/src/gallium/drivers/freedreno/freedreno_autotune.c b/src/gallium/drivers/freedreno/freedreno_autotune.c index 1aff434..f2a8384 100644 --- a/src/gallium/drivers/freedreno/freedreno_autotune.c +++ b/src/gallium/drivers/freedreno/freedreno_autotune.c @@ -25,143 +25,143 @@ #include "freedreno_batch.h" #include "freedreno_util.h" - /** * Tracks, for a given batch 
key (which maps to a FBO/framebuffer state), * * ralloc parent is fd_autotune::ht */ struct fd_batch_history { - struct fd_batch_key *key; + struct fd_batch_key *key; - /* Entry in fd_autotune::lru: */ - struct list_head node; + /* Entry in fd_autotune::lru: */ + struct list_head node; - unsigned num_results; + unsigned num_results; - /** - * List of recent fd_batch_result's - */ - struct list_head results; + /** + * List of recent fd_batch_result's + */ + struct list_head results; #define MAX_RESULTS 5 }; - static struct fd_batch_history * get_history(struct fd_autotune *at, struct fd_batch *batch) { - struct fd_batch_history *history; + struct fd_batch_history *history; - if (!batch->key) - return NULL; + if (!batch->key) + return NULL; - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key); + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key); - if (entry) { - history = entry->data; - goto found; - } + if (entry) { + history = entry->data; + goto found; + } - history = rzalloc_size(at->ht, sizeof(*history)); + history = rzalloc_size(at->ht, sizeof(*history)); - history->key = fd_batch_key_clone(history, batch->key); - list_inithead(&history->node); - list_inithead(&history->results); + history->key = fd_batch_key_clone(history, batch->key); + list_inithead(&history->node); + list_inithead(&history->results); - /* Note: We cap # of cached GMEM states at 20.. so assuming double- - * buffering, 40 should be a good place to cap cached autotune state - */ - if (at->ht->entries >= 40) { - struct fd_batch_history *last = - list_last_entry(&at->lru, struct fd_batch_history, node); - _mesa_hash_table_remove_key(at->ht, last->key); - list_del(&last->node); - ralloc_free(last); - } + /* Note: We cap # of cached GMEM states at 20.. 
so assuming double- + * buffering, 40 should be a good place to cap cached autotune state + */ + if (at->ht->entries >= 40) { + struct fd_batch_history *last = + list_last_entry(&at->lru, struct fd_batch_history, node); + _mesa_hash_table_remove_key(at->ht, last->key); + list_del(&last->node); + ralloc_free(last); + } - _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key, history); + _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key, + history); found: - /* Move to the head of the LRU: */ - list_delinit(&history->node); - list_add(&history->node, &at->lru); + /* Move to the head of the LRU: */ + list_delinit(&history->node); + list_add(&history->node, &at->lru); - return history; + return history; } static void result_destructor(void *r) { - struct fd_batch_result *result = r; + struct fd_batch_result *result = r; - /* Just in case we manage to somehow still be on the pending_results list: */ - list_del(&result->node); + /* Just in case we manage to somehow still be on the pending_results list: */ + list_del(&result->node); } static struct fd_batch_result * get_result(struct fd_autotune *at, struct fd_batch_history *history) { - struct fd_batch_result *result = rzalloc_size(history, sizeof(*result)); + struct fd_batch_result *result = rzalloc_size(history, sizeof(*result)); - result->fence = ++at->fence_counter; /* pre-increment so zero isn't valid fence */ - result->idx = at->idx_counter++; + result->fence = + ++at->fence_counter; /* pre-increment so zero isn't valid fence */ + result->idx = at->idx_counter++; - if (at->idx_counter >= ARRAY_SIZE(at->results->result)) - at->idx_counter = 0; + if (at->idx_counter >= ARRAY_SIZE(at->results->result)) + at->idx_counter = 0; - result->history = history; - list_addtail(&result->node, &at->pending_results); + result->history = history; + list_addtail(&result->node, &at->pending_results); - ralloc_set_destructor(result, result_destructor); + ralloc_set_destructor(result, result_destructor); - return result; + return result; } static void process_results(struct fd_autotune *at) { - uint32_t current_fence = at->results->fence; - - list_for_each_entry_safe (struct fd_batch_result, result, &at->pending_results, node) { - if (result->fence > current_fence) - break; - - struct fd_batch_history *history = result->history; - - result->samples_passed = at->results->result[result->idx].samples_end - - at->results->result[result->idx].samples_start; - - list_delinit(&result->node); - list_add(&result->node, &history->results); - - if (history->num_results < MAX_RESULTS) { - history->num_results++; - } else { - /* Once above a limit, start popping old results off the - * tail of the list: - */ - struct fd_batch_result *old_result = - list_last_entry(&history->results, struct fd_batch_result, node); - list_delinit(&old_result->node); - ralloc_free(old_result); - } - } + uint32_t current_fence = at->results->fence; + + list_for_each_entry_safe (struct fd_batch_result, result, + &at->pending_results, node) { + if (result->fence > current_fence) + break; + + struct fd_batch_history *history = result->history; + + result->samples_passed = at->results->result[result->idx].samples_end - + at->results->result[result->idx].samples_start; + + list_delinit(&result->node); + list_add(&result->node, &history->results); + + if (history->num_results < MAX_RESULTS) { + history->num_results++; + } else { + /* Once above a limit, start popping old results off the + * tail of the list: + */ + struct fd_batch_result *old_result = + 
list_last_entry(&history->results, struct fd_batch_result, node); + list_delinit(&old_result->node); + ralloc_free(old_result); + } + } } static bool fallback_use_bypass(struct fd_batch *batch) { - struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; - /* Fallback logic if we have no historical data about the rendertarget: */ - if (batch->cleared || batch->gmem_reason || - ((batch->num_draws > 5) && !batch->blit) || - (pfb->samples > 1)) { - return false; - } + /* Fallback logic if we have no historical data about the rendertarget: */ + if (batch->cleared || batch->gmem_reason || + ((batch->num_draws > 5) && !batch->blit) || (pfb->samples > 1)) { + return false; + } - return true; + return true; } /** @@ -171,91 +171,95 @@ fallback_use_bypass(struct fd_batch *batch) bool fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch) { - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - process_results(at); - - /* Only enable on gen's that opt-in (and actually have sample-passed - * collection wired up: - */ - if (!batch->ctx->screen->gmem_reason_mask) - return fallback_use_bypass(batch); - - if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask) - return fallback_use_bypass(batch); - - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - /* If ms-rtt is involved, force GMEM, as we don't currently - * implement a temporary render target that we can MSAA resolve - * from - */ - if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples) - return fallback_use_bypass(batch); - } - - struct fd_batch_history *history = get_history(at, batch); - if (!history) - return fallback_use_bypass(batch); - - batch->autotune_result = get_result(at, history); - batch->autotune_result->cost = batch->cost; - - bool use_bypass = fallback_use_bypass(batch); - - if (use_bypass) - return true; - - if (history->num_results > 0) { - uint32_t total_samples = 0; - - // TODO we should account for clears somehow - // TODO should we try to notice if there is a drastic change from - // frame to frame? - list_for_each_entry (struct fd_batch_result, result, &history->results, node) { - total_samples += result->samples_passed; - } - - float avg_samples = (float)total_samples / (float)history->num_results; - - /* Low sample count could mean there was only a clear.. or there was - * a clear plus draws that touch no or few samples - */ - if (avg_samples < 500.0) - return true; - - /* Cost-per-sample is an estimate for the average number of reads+ - * writes for a given passed sample. 
- */ - float sample_cost = batch->cost; - sample_cost /= batch->num_draws; - - float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws; - DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, total_draw_cost=%f\n", - batch->hash, batch->num_draws, total_samples, avg_samples, sample_cost, total_draw_cost); - - if (total_draw_cost < 3000.0) - return true; - } - - return use_bypass; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + process_results(at); + + /* Only enable on gen's that opt-in (and actually have sample-passed + * collection wired up: + */ + if (!batch->ctx->screen->gmem_reason_mask) + return fallback_use_bypass(batch); + + if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask) + return fallback_use_bypass(batch); + + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + /* If ms-rtt is involved, force GMEM, as we don't currently + * implement a temporary render target that we can MSAA resolve + * from + */ + if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples) + return fallback_use_bypass(batch); + } + + struct fd_batch_history *history = get_history(at, batch); + if (!history) + return fallback_use_bypass(batch); + + batch->autotune_result = get_result(at, history); + batch->autotune_result->cost = batch->cost; + + bool use_bypass = fallback_use_bypass(batch); + + if (use_bypass) + return true; + + if (history->num_results > 0) { + uint32_t total_samples = 0; + + // TODO we should account for clears somehow + // TODO should we try to notice if there is a drastic change from + // frame to frame? + list_for_each_entry (struct fd_batch_result, result, &history->results, + node) { + total_samples += result->samples_passed; + } + + float avg_samples = (float)total_samples / (float)history->num_results; + + /* Low sample count could mean there was only a clear.. or there was + * a clear plus draws that touch no or few samples + */ + if (avg_samples < 500.0) + return true; + + /* Cost-per-sample is an estimate for the average number of reads+ + * writes for a given passed sample. 
+ */ + float sample_cost = batch->cost; + sample_cost /= batch->num_draws; + + float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws; + DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, " + "total_draw_cost=%f\n", + batch->hash, batch->num_draws, total_samples, avg_samples, + sample_cost, total_draw_cost); + + if (total_draw_cost < 3000.0) + return true; + } + + return use_bypass; } void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev) { - at->ht = _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals); - list_inithead(&at->lru); + at->ht = + _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals); + list_inithead(&at->lru); - at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results), - DRM_FREEDRENO_GEM_TYPE_KMEM, "autotune"); - at->results = fd_bo_map(at->results_mem); + at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results), + DRM_FREEDRENO_GEM_TYPE_KMEM, "autotune"); + at->results = fd_bo_map(at->results_mem); - list_inithead(&at->pending_results); + list_inithead(&at->pending_results); } void fd_autotune_fini(struct fd_autotune *at) { - _mesa_hash_table_destroy(at->ht, NULL); - fd_bo_del(at->results_mem); + _mesa_hash_table_destroy(at->ht, NULL); + fd_bo_del(at->results_mem); } diff --git a/src/gallium/drivers/freedreno/freedreno_autotune.h b/src/gallium/drivers/freedreno/freedreno_autotune.h index 373af58..f46d44e 100644 --- a/src/gallium/drivers/freedreno/freedreno_autotune.h +++ b/src/gallium/drivers/freedreno/freedreno_autotune.h @@ -71,31 +71,31 @@ struct fd_autotune_results; */ struct fd_autotune { - /** - * Cache to map batch->key (also used for batch-cache) to historical - * information about rendering to that particular render target. - */ - struct hash_table *ht; - - /** - * List of recently used historical results (to age out old results) - */ - struct list_head lru; - - /** - * GPU buffer used to communicate back results to the CPU - */ - struct fd_bo *results_mem; - struct fd_autotune_results *results; - - /** - * List of per-batch results that we are waiting for the GPU to finish - * with before reading back the results. - */ - struct list_head pending_results; - - uint32_t fence_counter; - uint32_t idx_counter; + /** + * Cache to map batch->key (also used for batch-cache) to historical + * information about rendering to that particular render target. + */ + struct hash_table *ht; + + /** + * List of recently used historical results (to age out old results) + */ + struct list_head lru; + + /** + * GPU buffer used to communicate back results to the CPU + */ + struct fd_bo *results_mem; + struct fd_autotune_results *results; + + /** + * List of per-batch results that we are waiting for the GPU to finish + * with before reading back the results. + */ + struct list_head pending_results; + + uint32_t fence_counter; + uint32_t idx_counter; }; /** @@ -107,34 +107,34 @@ struct fd_autotune { */ struct fd_autotune_results { - /** - * The GPU writes back a "fence" seqno value from the cmdstream after - * it finishes writing it's result slot, so that the CPU knows when - * results are valid - */ - uint32_t fence; - - uint32_t __pad0; - uint64_t __pad1; - - /** - * From the cmdstream, the captured samples-passed values are recorded - * at the start and end of the batch. - * - * Note that we do the math on the CPU to avoid a WFI. But pre-emption - * may force us to revisit that. 
- */ - struct { - uint64_t samples_start; - uint64_t __pad0; - uint64_t samples_end; - uint64_t __pad1; - } result[127]; + /** + * The GPU writes back a "fence" seqno value from the cmdstream after + * it finishes writing it's result slot, so that the CPU knows when + * results are valid + */ + uint32_t fence; + + uint32_t __pad0; + uint64_t __pad1; + + /** + * From the cmdstream, the captured samples-passed values are recorded + * at the start and end of the batch. + * + * Note that we do the math on the CPU to avoid a WFI. But pre-emption + * may force us to revisit that. + */ + struct { + uint64_t samples_start; + uint64_t __pad0; + uint64_t samples_end; + uint64_t __pad1; + } result[127]; }; -#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) -#define results_ptr(at, member) \ - (at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0 +#define offset(base, ptr) ((uint8_t *)(ptr) - (uint8_t *)(base)) +#define results_ptr(at, member) \ + (at)->results_mem, offset((at)->results, &(at)->results->member), 0, 0 struct fd_batch_history; @@ -147,31 +147,32 @@ struct fd_batch_history; */ struct fd_batch_result { - /** - * The index/slot in fd_autotune_results::result[] to write start/end - * counter to - */ - unsigned idx; - - /** - * Fence value to write back to fd_autotune_results::fence after both - * start/end values written - */ - uint32_t fence; - - /* - * Below here, only used internally within autotune - */ - struct fd_batch_history *history; - struct list_head node; - uint32_t cost; - uint64_t samples_passed; + /** + * The index/slot in fd_autotune_results::result[] to write start/end + * counter to + */ + unsigned idx; + + /** + * Fence value to write back to fd_autotune_results::fence after both + * start/end values written + */ + uint32_t fence; + + /* + * Below here, only used internally within autotune + */ + struct fd_batch_history *history; + struct list_head node; + uint32_t cost; + uint64_t samples_passed; }; void fd_autotune_init(struct fd_autotune *at, struct fd_device *dev); void fd_autotune_fini(struct fd_autotune *at); struct fd_batch; -bool fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch) assert_dt; +bool fd_autotune_use_bypass(struct fd_autotune *at, + struct fd_batch *batch) assert_dt; #endif /* FREEDRENO_AUTOTUNE_H */ diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c index 7c06554..63c290b 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.c +++ b/src/gallium/drivers/freedreno/freedreno_batch.c @@ -24,365 +24,363 @@ * Rob Clark */ +#include "util/hash_table.h" #include "util/list.h" #include "util/set.h" -#include "util/hash_table.h" #include "util/u_string.h" #include "freedreno_batch.h" #include "freedreno_context.h" #include "freedreno_fence.h" -#include "freedreno_resource.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" static struct fd_ringbuffer * alloc_ring(struct fd_batch *batch, unsigned sz, enum fd_ringbuffer_flags flags) { - struct fd_context *ctx = batch->ctx; - - /* if kernel is too old to support unlimited # of cmd buffers, we - * have no option but to allocate large worst-case sizes so that - * we don't need to grow the ringbuffer. Performance is likely to - * suffer, but there is no good alternative. - * - * Otherwise if supported, allocate a growable ring with initial - * size of zero. 
- */ - if ((fd_device_version(ctx->screen->dev) >= FD_VERSION_UNLIMITED_CMDS) && - !FD_DBG(NOGROW)) { - flags |= FD_RINGBUFFER_GROWABLE; - sz = 0; - } - - return fd_submit_new_ringbuffer(batch->submit, sz, flags); + struct fd_context *ctx = batch->ctx; + + /* if kernel is too old to support unlimited # of cmd buffers, we + * have no option but to allocate large worst-case sizes so that + * we don't need to grow the ringbuffer. Performance is likely to + * suffer, but there is no good alternative. + * + * Otherwise if supported, allocate a growable ring with initial + * size of zero. + */ + if ((fd_device_version(ctx->screen->dev) >= FD_VERSION_UNLIMITED_CMDS) && + !FD_DBG(NOGROW)) { + flags |= FD_RINGBUFFER_GROWABLE; + sz = 0; + } + + return fd_submit_new_ringbuffer(batch->submit, sz, flags); } static void batch_init(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; - - batch->submit = fd_submit_new(ctx->pipe); - if (batch->nondraw) { - batch->gmem = alloc_ring(batch, 0x1000, FD_RINGBUFFER_PRIMARY); - batch->draw = alloc_ring(batch, 0x100000, 0); - } else { - batch->gmem = alloc_ring(batch, 0x100000, FD_RINGBUFFER_PRIMARY); - batch->draw = alloc_ring(batch, 0x100000, 0); - - /* a6xx+ re-uses draw rb for both draw and binning pass: */ - if (ctx->screen->gpu_id < 600) { - batch->binning = alloc_ring(batch, 0x100000, 0); - } - } - - batch->in_fence_fd = -1; - batch->fence = fd_fence_create(batch); - - batch->cleared = 0; - batch->fast_cleared = 0; - batch->invalidated = 0; - batch->restore = batch->resolve = 0; - batch->needs_flush = false; - batch->flushed = false; - batch->gmem_reason = 0; - batch->num_draws = 0; - batch->num_vertices = 0; - batch->num_bins_per_pipe = 0; - batch->prim_strm_bits = 0; - batch->draw_strm_bits = 0; - - fd_reset_wfi(batch); - - util_dynarray_init(&batch->draw_patches, NULL); - util_dynarray_init(&batch->fb_read_patches, NULL); - - if (is_a2xx(ctx->screen)) { - util_dynarray_init(&batch->shader_patches, NULL); - util_dynarray_init(&batch->gmem_patches, NULL); - } - - if (is_a3xx(ctx->screen)) - util_dynarray_init(&batch->rbrc_patches, NULL); - - assert(batch->resources->entries == 0); - - util_dynarray_init(&batch->samples, NULL); - - u_trace_init(&batch->trace, &ctx->trace_context); - batch->last_timestamp_cmd = NULL; + struct fd_context *ctx = batch->ctx; + + batch->submit = fd_submit_new(ctx->pipe); + if (batch->nondraw) { + batch->gmem = alloc_ring(batch, 0x1000, FD_RINGBUFFER_PRIMARY); + batch->draw = alloc_ring(batch, 0x100000, 0); + } else { + batch->gmem = alloc_ring(batch, 0x100000, FD_RINGBUFFER_PRIMARY); + batch->draw = alloc_ring(batch, 0x100000, 0); + + /* a6xx+ re-uses draw rb for both draw and binning pass: */ + if (ctx->screen->gpu_id < 600) { + batch->binning = alloc_ring(batch, 0x100000, 0); + } + } + + batch->in_fence_fd = -1; + batch->fence = fd_fence_create(batch); + + batch->cleared = 0; + batch->fast_cleared = 0; + batch->invalidated = 0; + batch->restore = batch->resolve = 0; + batch->needs_flush = false; + batch->flushed = false; + batch->gmem_reason = 0; + batch->num_draws = 0; + batch->num_vertices = 0; + batch->num_bins_per_pipe = 0; + batch->prim_strm_bits = 0; + batch->draw_strm_bits = 0; + + fd_reset_wfi(batch); + + util_dynarray_init(&batch->draw_patches, NULL); + util_dynarray_init(&batch->fb_read_patches, NULL); + + if (is_a2xx(ctx->screen)) { + util_dynarray_init(&batch->shader_patches, NULL); + util_dynarray_init(&batch->gmem_patches, NULL); + } + + if (is_a3xx(ctx->screen)) + 
util_dynarray_init(&batch->rbrc_patches, NULL); + + assert(batch->resources->entries == 0); + + util_dynarray_init(&batch->samples, NULL); + + u_trace_init(&batch->trace, &ctx->trace_context); + batch->last_timestamp_cmd = NULL; } struct fd_batch * fd_batch_create(struct fd_context *ctx, bool nondraw) { - struct fd_batch *batch = CALLOC_STRUCT(fd_batch); + struct fd_batch *batch = CALLOC_STRUCT(fd_batch); - if (!batch) - return NULL; + if (!batch) + return NULL; - DBG("%p", batch); + DBG("%p", batch); - pipe_reference_init(&batch->reference, 1); - batch->ctx = ctx; - batch->nondraw = nondraw; + pipe_reference_init(&batch->reference, 1); + batch->ctx = ctx; + batch->nondraw = nondraw; - simple_mtx_init(&batch->submit_lock, mtx_plain); + simple_mtx_init(&batch->submit_lock, mtx_plain); - batch->resources = _mesa_set_create(NULL, _mesa_hash_pointer, - _mesa_key_pointer_equal); + batch->resources = + _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); - batch_init(batch); + batch_init(batch); - fd_screen_assert_locked(ctx->screen); - if (BATCH_DEBUG) { - _mesa_set_add(ctx->screen->live_batches, batch); - } + fd_screen_assert_locked(ctx->screen); + if (BATCH_DEBUG) { + _mesa_set_add(ctx->screen->live_batches, batch); + } - return batch; + return batch; } static void cleanup_submit(struct fd_batch *batch) { - if (!batch->submit) - return; - - fd_ringbuffer_del(batch->draw); - fd_ringbuffer_del(batch->gmem); - - if (batch->binning) { - fd_ringbuffer_del(batch->binning); - batch->binning = NULL; - } - - if (batch->prologue) { - fd_ringbuffer_del(batch->prologue); - batch->prologue = NULL; - } - - if (batch->epilogue) { - fd_ringbuffer_del(batch->epilogue); - batch->epilogue = NULL; - } - - if (batch->tile_setup) { - fd_ringbuffer_del(batch->tile_setup); - batch->tile_setup = NULL; - } - - if (batch->tile_fini) { - fd_ringbuffer_del(batch->tile_fini); - batch->tile_fini = NULL; - } - - if (batch->tessellation) { - fd_bo_del(batch->tessfactor_bo); - fd_bo_del(batch->tessparam_bo); - fd_ringbuffer_del(batch->tess_addrs_constobj); - } - - fd_submit_del(batch->submit); - batch->submit = NULL; + if (!batch->submit) + return; + + fd_ringbuffer_del(batch->draw); + fd_ringbuffer_del(batch->gmem); + + if (batch->binning) { + fd_ringbuffer_del(batch->binning); + batch->binning = NULL; + } + + if (batch->prologue) { + fd_ringbuffer_del(batch->prologue); + batch->prologue = NULL; + } + + if (batch->epilogue) { + fd_ringbuffer_del(batch->epilogue); + batch->epilogue = NULL; + } + + if (batch->tile_setup) { + fd_ringbuffer_del(batch->tile_setup); + batch->tile_setup = NULL; + } + + if (batch->tile_fini) { + fd_ringbuffer_del(batch->tile_fini); + batch->tile_fini = NULL; + } + + if (batch->tessellation) { + fd_bo_del(batch->tessfactor_bo); + fd_bo_del(batch->tessparam_bo); + fd_ringbuffer_del(batch->tess_addrs_constobj); + } + + fd_submit_del(batch->submit); + batch->submit = NULL; } static void batch_fini(struct fd_batch *batch) { - DBG("%p", batch); + DBG("%p", batch); - pipe_resource_reference(&batch->query_buf, NULL); + pipe_resource_reference(&batch->query_buf, NULL); - if (batch->in_fence_fd != -1) - close(batch->in_fence_fd); + if (batch->in_fence_fd != -1) + close(batch->in_fence_fd); - /* in case batch wasn't flushed but fence was created: */ - fd_fence_populate(batch->fence, 0, -1); + /* in case batch wasn't flushed but fence was created: */ + fd_fence_populate(batch->fence, 0, -1); - fd_fence_ref(&batch->fence, NULL); + fd_fence_ref(&batch->fence, NULL); - cleanup_submit(batch); + 
cleanup_submit(batch); - util_dynarray_fini(&batch->draw_patches); - util_dynarray_fini(&batch->fb_read_patches); + util_dynarray_fini(&batch->draw_patches); + util_dynarray_fini(&batch->fb_read_patches); - if (is_a2xx(batch->ctx->screen)) { - util_dynarray_fini(&batch->shader_patches); - util_dynarray_fini(&batch->gmem_patches); - } + if (is_a2xx(batch->ctx->screen)) { + util_dynarray_fini(&batch->shader_patches); + util_dynarray_fini(&batch->gmem_patches); + } - if (is_a3xx(batch->ctx->screen)) - util_dynarray_fini(&batch->rbrc_patches); + if (is_a3xx(batch->ctx->screen)) + util_dynarray_fini(&batch->rbrc_patches); - while (batch->samples.size > 0) { - struct fd_hw_sample *samp = - util_dynarray_pop(&batch->samples, struct fd_hw_sample *); - fd_hw_sample_reference(batch->ctx, &samp, NULL); - } - util_dynarray_fini(&batch->samples); + while (batch->samples.size > 0) { + struct fd_hw_sample *samp = + util_dynarray_pop(&batch->samples, struct fd_hw_sample *); + fd_hw_sample_reference(batch->ctx, &samp, NULL); + } + util_dynarray_fini(&batch->samples); - u_trace_fini(&batch->trace); + u_trace_fini(&batch->trace); } static void -batch_flush_dependencies(struct fd_batch *batch) - assert_dt +batch_flush_dependencies(struct fd_batch *batch) assert_dt { - struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; - struct fd_batch *dep; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; - foreach_batch (dep, cache, batch->dependents_mask) { - fd_batch_flush(dep); - fd_batch_reference(&dep, NULL); - } + foreach_batch(dep, cache, batch->dependents_mask) + { + fd_batch_flush(dep); + fd_batch_reference(&dep, NULL); + } - batch->dependents_mask = 0; + batch->dependents_mask = 0; } static void batch_reset_dependencies(struct fd_batch *batch) { - struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; - struct fd_batch *dep; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; - foreach_batch (dep, cache, batch->dependents_mask) { - fd_batch_reference(&dep, NULL); - } + foreach_batch(dep, cache, batch->dependents_mask) + { + fd_batch_reference(&dep, NULL); + } - batch->dependents_mask = 0; + batch->dependents_mask = 0; } static void batch_reset_resources_locked(struct fd_batch *batch) { - fd_screen_assert_locked(batch->ctx->screen); - - set_foreach(batch->resources, entry) { - struct fd_resource *rsc = (struct fd_resource *)entry->key; - _mesa_set_remove(batch->resources, entry); - debug_assert(rsc->track->batch_mask & (1 << batch->idx)); - rsc->track->batch_mask &= ~(1 << batch->idx); - if (rsc->track->write_batch == batch) - fd_batch_reference_locked(&rsc->track->write_batch, NULL); - } + fd_screen_assert_locked(batch->ctx->screen); + + set_foreach (batch->resources, entry) { + struct fd_resource *rsc = (struct fd_resource *)entry->key; + _mesa_set_remove(batch->resources, entry); + debug_assert(rsc->track->batch_mask & (1 << batch->idx)); + rsc->track->batch_mask &= ~(1 << batch->idx); + if (rsc->track->write_batch == batch) + fd_batch_reference_locked(&rsc->track->write_batch, NULL); + } } static void -batch_reset_resources(struct fd_batch *batch) - assert_dt +batch_reset_resources(struct fd_batch *batch) assert_dt { - fd_screen_lock(batch->ctx->screen); - batch_reset_resources_locked(batch); - fd_screen_unlock(batch->ctx->screen); + fd_screen_lock(batch->ctx->screen); + batch_reset_resources_locked(batch); + fd_screen_unlock(batch->ctx->screen); } static void -batch_reset(struct fd_batch *batch) - assert_dt 
+batch_reset(struct fd_batch *batch) assert_dt { - DBG("%p", batch); + DBG("%p", batch); - batch_reset_dependencies(batch); - batch_reset_resources(batch); + batch_reset_dependencies(batch); + batch_reset_resources(batch); - batch_fini(batch); - batch_init(batch); + batch_fini(batch); + batch_init(batch); } void fd_batch_reset(struct fd_batch *batch) { - if (batch->needs_flush) - batch_reset(batch); + if (batch->needs_flush) + batch_reset(batch); } void __fd_batch_destroy(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - DBG("%p", batch); + DBG("%p", batch); - fd_screen_assert_locked(batch->ctx->screen); + fd_screen_assert_locked(batch->ctx->screen); - if (BATCH_DEBUG) { - _mesa_set_remove_key(ctx->screen->live_batches, batch); - } + if (BATCH_DEBUG) { + _mesa_set_remove_key(ctx->screen->live_batches, batch); + } - fd_bc_invalidate_batch(batch, true); + fd_bc_invalidate_batch(batch, true); - batch_reset_resources_locked(batch); - debug_assert(batch->resources->entries == 0); - _mesa_set_destroy(batch->resources, NULL); + batch_reset_resources_locked(batch); + debug_assert(batch->resources->entries == 0); + _mesa_set_destroy(batch->resources, NULL); - fd_screen_unlock(ctx->screen); - batch_reset_dependencies(batch); - debug_assert(batch->dependents_mask == 0); + fd_screen_unlock(ctx->screen); + batch_reset_dependencies(batch); + debug_assert(batch->dependents_mask == 0); - util_copy_framebuffer_state(&batch->framebuffer, NULL); - batch_fini(batch); + util_copy_framebuffer_state(&batch->framebuffer, NULL); + batch_fini(batch); - simple_mtx_destroy(&batch->submit_lock); + simple_mtx_destroy(&batch->submit_lock); - free(batch); - fd_screen_lock(ctx->screen); + free(batch); + fd_screen_lock(ctx->screen); } void -__fd_batch_describe(char* buf, const struct fd_batch *batch) +__fd_batch_describe(char *buf, const struct fd_batch *batch) { - sprintf(buf, "fd_batch<%u>", batch->seqno); + sprintf(buf, "fd_batch<%u>", batch->seqno); } /* Get per-batch prologue */ struct fd_ringbuffer * fd_batch_get_prologue(struct fd_batch *batch) { - if (!batch->prologue) - batch->prologue = alloc_ring(batch, 0x1000, 0); - return batch->prologue; + if (!batch->prologue) + batch->prologue = alloc_ring(batch, 0x1000, 0); + return batch->prologue; } /* Only called from fd_batch_flush() */ static void -batch_flush(struct fd_batch *batch) - assert_dt +batch_flush(struct fd_batch *batch) assert_dt { - DBG("%p: needs_flush=%d", batch, batch->needs_flush); + DBG("%p: needs_flush=%d", batch, batch->needs_flush); - if (!fd_batch_lock_submit(batch)) - return; + if (!fd_batch_lock_submit(batch)) + return; - batch->needs_flush = false; + batch->needs_flush = false; - /* close out the draw cmds by making sure any active queries are - * paused: - */ - fd_batch_finish_queries(batch); + /* close out the draw cmds by making sure any active queries are + * paused: + */ + fd_batch_finish_queries(batch); - batch_flush_dependencies(batch); + batch_flush_dependencies(batch); - batch->flushed = true; - if (batch == batch->ctx->batch) - fd_batch_reference(&batch->ctx->batch, NULL); + batch->flushed = true; + if (batch == batch->ctx->batch) + fd_batch_reference(&batch->ctx->batch, NULL); - fd_fence_ref(&batch->ctx->last_fence, batch->fence); + fd_fence_ref(&batch->ctx->last_fence, batch->fence); - fd_gmem_render_tiles(batch); - batch_reset_resources(batch); + fd_gmem_render_tiles(batch); + batch_reset_resources(batch); - debug_assert(batch->reference.count > 0); + 
debug_assert(batch->reference.count > 0); - fd_screen_lock(batch->ctx->screen); - /* NOTE: remove=false removes the patch from the hashtable, so future - * lookups won't cache-hit a flushed batch, but leaves the weak reference - * to the batch to avoid having multiple batches with same batch->idx, as - * that causes all sorts of hilarity. - */ - fd_bc_invalidate_batch(batch, false); - fd_screen_unlock(batch->ctx->screen); - cleanup_submit(batch); - fd_batch_unlock_submit(batch); + fd_screen_lock(batch->ctx->screen); + /* NOTE: remove=false removes the patch from the hashtable, so future + * lookups won't cache-hit a flushed batch, but leaves the weak reference + * to the batch to avoid having multiple batches with same batch->idx, as + * that causes all sorts of hilarity. + */ + fd_bc_invalidate_batch(batch, false); + fd_screen_unlock(batch->ctx->screen); + cleanup_submit(batch); + fd_batch_unlock_submit(batch); } /* NOTE: could drop the last ref to batch @@ -390,170 +388,171 @@ batch_flush(struct fd_batch *batch) void fd_batch_flush(struct fd_batch *batch) { - struct fd_batch *tmp = NULL; - - /* NOTE: we need to hold an extra ref across the body of flush, - * since the last ref to this batch could be dropped when cleaning - * up used_resources - */ - fd_batch_reference(&tmp, batch); - batch_flush(tmp); - fd_batch_reference(&tmp, NULL); + struct fd_batch *tmp = NULL; + + /* NOTE: we need to hold an extra ref across the body of flush, + * since the last ref to this batch could be dropped when cleaning + * up used_resources + */ + fd_batch_reference(&tmp, batch); + batch_flush(tmp); + fd_batch_reference(&tmp, NULL); } /* find a batches dependents mask, including recursive dependencies: */ static uint32_t recursive_dependents_mask(struct fd_batch *batch) { - struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; - struct fd_batch *dep; - uint32_t dependents_mask = batch->dependents_mask; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + uint32_t dependents_mask = batch->dependents_mask; - foreach_batch(dep, cache, batch->dependents_mask) - dependents_mask |= recursive_dependents_mask(dep); + foreach_batch(dep, cache, batch->dependents_mask) dependents_mask |= + recursive_dependents_mask(dep); - return dependents_mask; + return dependents_mask; } void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) { - fd_screen_assert_locked(batch->ctx->screen); + fd_screen_assert_locked(batch->ctx->screen); - if (batch->dependents_mask & (1 << dep->idx)) - return; + if (batch->dependents_mask & (1 << dep->idx)) + return; - /* a loop should not be possible */ - debug_assert(!((1 << batch->idx) & recursive_dependents_mask(dep))); + /* a loop should not be possible */ + debug_assert(!((1 << batch->idx) & recursive_dependents_mask(dep))); - struct fd_batch *other = NULL; - fd_batch_reference_locked(&other, dep); - batch->dependents_mask |= (1 << dep->idx); - DBG("%p: added dependency on %p", batch, dep); + struct fd_batch *other = NULL; + fd_batch_reference_locked(&other, dep); + batch->dependents_mask |= (1 << dep->idx); + DBG("%p: added dependency on %p", batch, dep); } static void -flush_write_batch(struct fd_resource *rsc) - assert_dt +flush_write_batch(struct fd_resource *rsc) assert_dt { - struct fd_batch *b = NULL; - fd_batch_reference_locked(&b, rsc->track->write_batch); + struct fd_batch *b = NULL; + fd_batch_reference_locked(&b, rsc->track->write_batch); - fd_screen_unlock(b->ctx->screen); - fd_batch_flush(b); - 
fd_screen_lock(b->ctx->screen); + fd_screen_unlock(b->ctx->screen); + fd_batch_flush(b); + fd_screen_lock(b->ctx->screen); - fd_batch_reference_locked(&b, NULL); + fd_batch_reference_locked(&b, NULL); } static void fd_batch_add_resource(struct fd_batch *batch, struct fd_resource *rsc) { - if (likely(fd_batch_references_resource(batch, rsc))) { - debug_assert(_mesa_set_search(batch->resources, rsc)); - return; - } + if (likely(fd_batch_references_resource(batch, rsc))) { + debug_assert(_mesa_set_search(batch->resources, rsc)); + return; + } - debug_assert(!_mesa_set_search(batch->resources, rsc)); + debug_assert(!_mesa_set_search(batch->resources, rsc)); - _mesa_set_add(batch->resources, rsc); - rsc->track->batch_mask |= (1 << batch->idx); + _mesa_set_add(batch->resources, rsc); + rsc->track->batch_mask |= (1 << batch->idx); } void fd_batch_resource_write(struct fd_batch *batch, struct fd_resource *rsc) { - fd_screen_assert_locked(batch->ctx->screen); - - DBG("%p: write %p", batch, rsc); - - /* Must do this before the early out, so we unset a previous resource - * invalidate (which may have left the write_batch state in place). - */ - rsc->valid = true; - - if (rsc->track->write_batch == batch) - return; - - fd_batch_write_prep(batch, rsc); - - if (rsc->stencil) - fd_batch_resource_write(batch, rsc->stencil); - - /* note, invalidate write batch, to avoid further writes to rsc - * resulting in a write-after-read hazard. - */ - /* if we are pending read or write by any other batch: */ - if (unlikely(rsc->track->batch_mask & ~(1 << batch->idx))) { - struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; - struct fd_batch *dep; - - if (rsc->track->write_batch) - flush_write_batch(rsc); - - foreach_batch (dep, cache, rsc->track->batch_mask) { - struct fd_batch *b = NULL; - if (dep == batch) - continue; - /* note that batch_add_dep could flush and unref dep, so - * we need to hold a reference to keep it live for the - * fd_bc_invalidate_batch() - */ - fd_batch_reference(&b, dep); - fd_batch_add_dep(batch, b); - fd_bc_invalidate_batch(b, false); - fd_batch_reference_locked(&b, NULL); - } - } - fd_batch_reference_locked(&rsc->track->write_batch, batch); - - fd_batch_add_resource(batch, rsc); + fd_screen_assert_locked(batch->ctx->screen); + + DBG("%p: write %p", batch, rsc); + + /* Must do this before the early out, so we unset a previous resource + * invalidate (which may have left the write_batch state in place). + */ + rsc->valid = true; + + if (rsc->track->write_batch == batch) + return; + + fd_batch_write_prep(batch, rsc); + + if (rsc->stencil) + fd_batch_resource_write(batch, rsc->stencil); + + /* note, invalidate write batch, to avoid further writes to rsc + * resulting in a write-after-read hazard. 
+ */ + /* if we are pending read or write by any other batch: */ + if (unlikely(rsc->track->batch_mask & ~(1 << batch->idx))) { + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch *dep; + + if (rsc->track->write_batch) + flush_write_batch(rsc); + + foreach_batch(dep, cache, rsc->track->batch_mask) + { + struct fd_batch *b = NULL; + if (dep == batch) + continue; + /* note that batch_add_dep could flush and unref dep, so + * we need to hold a reference to keep it live for the + * fd_bc_invalidate_batch() + */ + fd_batch_reference(&b, dep); + fd_batch_add_dep(batch, b); + fd_bc_invalidate_batch(b, false); + fd_batch_reference_locked(&b, NULL); + } + } + fd_batch_reference_locked(&rsc->track->write_batch, batch); + + fd_batch_add_resource(batch, rsc); } void fd_batch_resource_read_slowpath(struct fd_batch *batch, struct fd_resource *rsc) { - fd_screen_assert_locked(batch->ctx->screen); + fd_screen_assert_locked(batch->ctx->screen); - if (rsc->stencil) - fd_batch_resource_read(batch, rsc->stencil); + if (rsc->stencil) + fd_batch_resource_read(batch, rsc->stencil); - DBG("%p: read %p", batch, rsc); + DBG("%p: read %p", batch, rsc); - /* If reading a resource pending a write, go ahead and flush the - * writer. This avoids situations where we end up having to - * flush the current batch in _resource_used() - */ - if (unlikely(rsc->track->write_batch && rsc->track->write_batch != batch)) - flush_write_batch(rsc); + /* If reading a resource pending a write, go ahead and flush the + * writer. This avoids situations where we end up having to + * flush the current batch in _resource_used() + */ + if (unlikely(rsc->track->write_batch && rsc->track->write_batch != batch)) + flush_write_batch(rsc); - fd_batch_add_resource(batch, rsc); + fd_batch_add_resource(batch, rsc); } void fd_batch_check_size(struct fd_batch *batch) { - debug_assert(!batch->flushed); - - if (FD_DBG(FLUSH)) { - fd_batch_flush(batch); - return; - } - - /* Place a reasonable upper bound on prim/draw stream buffer size: */ - const unsigned limit_bits = 8 * 8 * 1024 * 1024; - if ((batch->prim_strm_bits > limit_bits) || (batch->draw_strm_bits > limit_bits)) { - fd_batch_flush(batch); - return; - } - - if (fd_device_version(batch->ctx->screen->dev) >= FD_VERSION_UNLIMITED_CMDS) - return; - - struct fd_ringbuffer *ring = batch->draw; - if ((ring->cur - ring->start) > (ring->size/4 - 0x1000)) - fd_batch_flush(batch); + debug_assert(!batch->flushed); + + if (FD_DBG(FLUSH)) { + fd_batch_flush(batch); + return; + } + + /* Place a reasonable upper bound on prim/draw stream buffer size: */ + const unsigned limit_bits = 8 * 8 * 1024 * 1024; + if ((batch->prim_strm_bits > limit_bits) || + (batch->draw_strm_bits > limit_bits)) { + fd_batch_flush(batch); + return; + } + + if (fd_device_version(batch->ctx->screen->dev) >= FD_VERSION_UNLIMITED_CMDS) + return; + + struct fd_ringbuffer *ring = batch->draw; + if ((ring->cur - ring->start) > (ring->size / 4 - 0x1000)) + fd_batch_flush(batch); } /* emit a WAIT_FOR_IDLE only if needed, ie. 
if there has not already @@ -562,11 +561,11 @@ fd_batch_check_size(struct fd_batch *batch) void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) { - if (batch->needs_wfi) { - if (batch->ctx->screen->gpu_id >= 500) - OUT_WFI5(ring); - else - OUT_WFI(ring); - batch->needs_wfi = false; - } + if (batch->needs_wfi) { + if (batch->ctx->screen->gpu_id >= 500) + OUT_WFI5(ring); + else + OUT_WFI(ring); + batch->needs_wfi = false; + } } diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h index 785f0c3..ded497e 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch.h +++ b/src/gallium/drivers/freedreno/freedreno_batch.h @@ -27,19 +27,19 @@ #ifndef FREEDRENO_BATCH_H_ #define FREEDRENO_BATCH_H_ +#include "util/list.h" +#include "util/simple_mtx.h" #include "util/u_inlines.h" #include "util/u_queue.h" #include "util/u_trace.h" -#include "util/list.h" -#include "util/simple_mtx.h" #include "freedreno_context.h" #include "freedreno_util.h" #ifdef DEBUG -# define BATCH_DEBUG FD_DBG(MSGS) +#define BATCH_DEBUG FD_DBG(MSGS) #else -# define BATCH_DEBUG 0 +#define BATCH_DEBUG 0 #endif struct fd_resource; @@ -51,244 +51,247 @@ struct fd_batch_result; * fd_resource-s, etc. */ struct fd_batch { - struct pipe_reference reference; - unsigned seqno; - unsigned idx; /* index into cache->batches[] */ - - struct u_trace trace; - - /* To detect cases where we can skip cmdstream to record timestamp: */ - uint32_t *last_timestamp_cmd; - - int in_fence_fd; - bool needs_out_fence_fd; - struct pipe_fence_handle *fence; - - struct fd_context *ctx; - - /* emit_lock serializes cmdstream emission and flush. Acquire before - * screen->lock. - */ - simple_mtx_t submit_lock; - - /* do we need to mem2gmem before rendering. We don't, if for example, - * there was a glClear() that invalidated the entire previous buffer - * contents. Keep track of which buffer(s) are cleared, or needs - * restore. Masks of PIPE_CLEAR_* - * - * The 'cleared' bits will be set for buffers which are *entirely* - * cleared, and 'partial_cleared' bits will be set if you must - * check cleared_scissor. - * - * The 'invalidated' bits are set for cleared buffers, and buffers - * where the contents are undefined, ie. what we don't need to restore - * to gmem. - */ - enum { - /* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */ - FD_BUFFER_COLOR = PIPE_CLEAR_COLOR, - FD_BUFFER_DEPTH = PIPE_CLEAR_DEPTH, - FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL, - FD_BUFFER_ALL = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL, - } invalidated, cleared, fast_cleared, restore, resolve; - - /* is this a non-draw batch (ie compute/blit which has no pfb state)? */ - bool nondraw : 1; - bool needs_flush : 1; - bool flushed : 1; - bool blit : 1; - bool back_blit : 1; /* only blit so far is resource shadowing back-blit */ - bool tessellation : 1; /* tessellation used in batch */ - - /* Keep track if WAIT_FOR_IDLE is needed for registers we need - * to update via RMW: - */ - bool needs_wfi : 1; - - /* To decide whether to render to system memory, keep track of the - * number of draws, and whether any of them require multisample, - * depth_test (or depth write), stencil_test, blending, and - * color_logic_Op (since those functions are disabled when by- - * passing GMEM. 
- */ - enum fd_gmem_reason gmem_reason; - - /* At submit time, once we've decided that this batch will use GMEM - * rendering, the appropriate gmem state is looked up: - */ - const struct fd_gmem_stateobj *gmem_state; - - /* A calculated "draw cost" value for the batch, which tries to - * estimate the bandwidth-per-sample of all the draws according - * to: - * - * foreach_draw (...) { - * cost += num_mrt; - * if (blend_enabled) - * cost += num_mrt; - * if (depth_test_enabled) - * cost++; - * if (depth_write_enabled) - * cost++; - * } - * - * The idea is that each sample-passed minimally does one write - * per MRT. If blend is enabled, the hw will additionally do - * a framebuffer read per sample-passed (for each MRT with blend - * enabled). If depth-test is enabled, the hw will additionally - * a depth buffer read. If depth-write is enable, the hw will - * additionally do a depth buffer write. - * - * This does ignore depth buffer traffic for samples which do not - * pass do to depth-test fail, and some other details. But it is - * just intended to be a rough estimate that is easy to calculate. - */ - unsigned cost; - - /* Tells the gen specific backend where to write stats used for - * the autotune module. - * - * Pointer only valid during gmem emit code. - */ - struct fd_batch_result *autotune_result; - - unsigned num_draws; /* number of draws in current batch */ - unsigned num_vertices; /* number of vertices in current batch */ - - /* Currently only used on a6xx, to calculate vsc prim/draw stream - * sizes: - */ - unsigned num_bins_per_pipe; - unsigned prim_strm_bits; - unsigned draw_strm_bits; - - /* Track the maximal bounds of the scissor of all the draws within a - * batch. Used at the tile rendering step (fd_gmem_render_tiles(), - * mem2gmem/gmem2mem) to avoid needlessly moving data in/out of gmem. - */ - struct pipe_scissor_state max_scissor; - - /* Keep track of DRAW initiators that need to be patched up depending - * on whether we using binning or not: - */ - struct util_dynarray draw_patches; - - /* texture state that needs patching for fb_read: */ - struct util_dynarray fb_read_patches; - - /* Keep track of writes to RB_RENDER_CONTROL which need to be patched - * once we know whether or not to use GMEM, and GMEM tile pitch. - * - * (only for a3xx.. but having gen specific subclasses of fd_batch - * seemed overkill for now) - */ - struct util_dynarray rbrc_patches; - - /* Keep track of GMEM related values that need to be patched up once we - * know the gmem layout: - */ - struct util_dynarray gmem_patches; - - /* Keep track of pointer to start of MEM exports for a20x binning shaders - * - * this is so the end of the shader can be cut off at the right point - * depending on the GMEM configuration - */ - struct util_dynarray shader_patches; - - struct pipe_framebuffer_state framebuffer; - - struct fd_submit *submit; - - /** draw pass cmdstream: */ - struct fd_ringbuffer *draw; - /** binning pass cmdstream: */ - struct fd_ringbuffer *binning; - /** tiling/gmem (IB0) cmdstream: */ - struct fd_ringbuffer *gmem; - - /** preemble cmdstream (executed once before first tile): */ - struct fd_ringbuffer *prologue; - - /** epilogue cmdstream (executed after each tile): */ - struct fd_ringbuffer *epilogue; - - struct fd_ringbuffer *tile_setup; - struct fd_ringbuffer *tile_fini; - - union pipe_color_union clear_color[MAX_RENDER_TARGETS]; - double clear_depth; - unsigned clear_stencil; - - /** - * hw query related state: - */ - /*@{*/ - /* next sample offset.. 
incremented for each sample in the batch/ - * submit, reset to zero on next submit. - */ - uint32_t next_sample_offset; - - /* cached samples (in case multiple queries need to reference - * the same sample snapshot) - */ - struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS]; - - /* which sample providers were used in the current batch: */ - uint32_t query_providers_used; - - /* which sample providers are currently enabled in the batch: */ - uint32_t query_providers_active; - - /* list of samples in current batch: */ - struct util_dynarray samples; - - /* current query result bo and tile stride: */ - struct pipe_resource *query_buf; - uint32_t query_tile_stride; - /*@}*/ - - - /* Set of resources used by currently-unsubmitted batch (read or - * write).. does not hold a reference to the resource. - */ - struct set *resources; - - /** key in batch-cache (if not null): */ - const struct fd_batch_key *key; - uint32_t hash; - - /** set of dependent batches.. holds refs to dependent batches: */ - uint32_t dependents_mask; - - /* Buffer for tessellation engine input - */ - struct fd_bo *tessfactor_bo; - uint32_t tessfactor_size; - - /* Buffer for passing parameters between TCS and TES - */ - struct fd_bo *tessparam_bo; - uint32_t tessparam_size; - - struct fd_ringbuffer *tess_addrs_constobj; + struct pipe_reference reference; + unsigned seqno; + unsigned idx; /* index into cache->batches[] */ + + struct u_trace trace; + + /* To detect cases where we can skip cmdstream to record timestamp: */ + uint32_t *last_timestamp_cmd; + + int in_fence_fd; + bool needs_out_fence_fd; + struct pipe_fence_handle *fence; + + struct fd_context *ctx; + + /* emit_lock serializes cmdstream emission and flush. Acquire before + * screen->lock. + */ + simple_mtx_t submit_lock; + + /* do we need to mem2gmem before rendering. We don't, if for example, + * there was a glClear() that invalidated the entire previous buffer + * contents. Keep track of which buffer(s) are cleared, or needs + * restore. Masks of PIPE_CLEAR_* + * + * The 'cleared' bits will be set for buffers which are *entirely* + * cleared, and 'partial_cleared' bits will be set if you must + * check cleared_scissor. + * + * The 'invalidated' bits are set for cleared buffers, and buffers + * where the contents are undefined, ie. what we don't need to restore + * to gmem. + */ + enum { + /* align bitmask values w/ PIPE_CLEAR_*.. since that is convenient.. */ + FD_BUFFER_COLOR = PIPE_CLEAR_COLOR, + FD_BUFFER_DEPTH = PIPE_CLEAR_DEPTH, + FD_BUFFER_STENCIL = PIPE_CLEAR_STENCIL, + FD_BUFFER_ALL = FD_BUFFER_COLOR | FD_BUFFER_DEPTH | FD_BUFFER_STENCIL, + } invalidated, + cleared, fast_cleared, restore, resolve; + + /* is this a non-draw batch (ie compute/blit which has no pfb state)? */ + bool nondraw : 1; + bool needs_flush : 1; + bool flushed : 1; + bool blit : 1; + bool back_blit : 1; /* only blit so far is resource shadowing back-blit */ + bool tessellation : 1; /* tessellation used in batch */ + + /* Keep track if WAIT_FOR_IDLE is needed for registers we need + * to update via RMW: + */ + bool needs_wfi : 1; + + /* To decide whether to render to system memory, keep track of the + * number of draws, and whether any of them require multisample, + * depth_test (or depth write), stencil_test, blending, and + * color_logic_Op (since those functions are disabled when by- + * passing GMEM. 
+ */ + enum fd_gmem_reason gmem_reason; + + /* At submit time, once we've decided that this batch will use GMEM + * rendering, the appropriate gmem state is looked up: + */ + const struct fd_gmem_stateobj *gmem_state; + + /* A calculated "draw cost" value for the batch, which tries to + * estimate the bandwidth-per-sample of all the draws according + * to: + * + * foreach_draw (...) { + * cost += num_mrt; + * if (blend_enabled) + * cost += num_mrt; + * if (depth_test_enabled) + * cost++; + * if (depth_write_enabled) + * cost++; + * } + * + * The idea is that each sample-passed minimally does one write + * per MRT. If blend is enabled, the hw will additionally do + * a framebuffer read per sample-passed (for each MRT with blend + * enabled). If depth-test is enabled, the hw will additionally + * a depth buffer read. If depth-write is enable, the hw will + * additionally do a depth buffer write. + * + * This does ignore depth buffer traffic for samples which do not + * pass do to depth-test fail, and some other details. But it is + * just intended to be a rough estimate that is easy to calculate. + */ + unsigned cost; + + /* Tells the gen specific backend where to write stats used for + * the autotune module. + * + * Pointer only valid during gmem emit code. + */ + struct fd_batch_result *autotune_result; + + unsigned num_draws; /* number of draws in current batch */ + unsigned num_vertices; /* number of vertices in current batch */ + + /* Currently only used on a6xx, to calculate vsc prim/draw stream + * sizes: + */ + unsigned num_bins_per_pipe; + unsigned prim_strm_bits; + unsigned draw_strm_bits; + + /* Track the maximal bounds of the scissor of all the draws within a + * batch. Used at the tile rendering step (fd_gmem_render_tiles(), + * mem2gmem/gmem2mem) to avoid needlessly moving data in/out of gmem. + */ + struct pipe_scissor_state max_scissor; + + /* Keep track of DRAW initiators that need to be patched up depending + * on whether we using binning or not: + */ + struct util_dynarray draw_patches; + + /* texture state that needs patching for fb_read: */ + struct util_dynarray fb_read_patches; + + /* Keep track of writes to RB_RENDER_CONTROL which need to be patched + * once we know whether or not to use GMEM, and GMEM tile pitch. + * + * (only for a3xx.. but having gen specific subclasses of fd_batch + * seemed overkill for now) + */ + struct util_dynarray rbrc_patches; + + /* Keep track of GMEM related values that need to be patched up once we + * know the gmem layout: + */ + struct util_dynarray gmem_patches; + + /* Keep track of pointer to start of MEM exports for a20x binning shaders + * + * this is so the end of the shader can be cut off at the right point + * depending on the GMEM configuration + */ + struct util_dynarray shader_patches; + + struct pipe_framebuffer_state framebuffer; + + struct fd_submit *submit; + + /** draw pass cmdstream: */ + struct fd_ringbuffer *draw; + /** binning pass cmdstream: */ + struct fd_ringbuffer *binning; + /** tiling/gmem (IB0) cmdstream: */ + struct fd_ringbuffer *gmem; + + /** preemble cmdstream (executed once before first tile): */ + struct fd_ringbuffer *prologue; + + /** epilogue cmdstream (executed after each tile): */ + struct fd_ringbuffer *epilogue; + + struct fd_ringbuffer *tile_setup; + struct fd_ringbuffer *tile_fini; + + union pipe_color_union clear_color[MAX_RENDER_TARGETS]; + double clear_depth; + unsigned clear_stencil; + + /** + * hw query related state: + */ + /*@{*/ + /* next sample offset.. 
incremented for each sample in the batch/ + * submit, reset to zero on next submit. + */ + uint32_t next_sample_offset; + + /* cached samples (in case multiple queries need to reference + * the same sample snapshot) + */ + struct fd_hw_sample *sample_cache[MAX_HW_SAMPLE_PROVIDERS]; + + /* which sample providers were used in the current batch: */ + uint32_t query_providers_used; + + /* which sample providers are currently enabled in the batch: */ + uint32_t query_providers_active; + + /* list of samples in current batch: */ + struct util_dynarray samples; + + /* current query result bo and tile stride: */ + struct pipe_resource *query_buf; + uint32_t query_tile_stride; + /*@}*/ + + /* Set of resources used by currently-unsubmitted batch (read or + * write).. does not hold a reference to the resource. + */ + struct set *resources; + + /** key in batch-cache (if not null): */ + const struct fd_batch_key *key; + uint32_t hash; + + /** set of dependent batches.. holds refs to dependent batches: */ + uint32_t dependents_mask; + + /* Buffer for tessellation engine input + */ + struct fd_bo *tessfactor_bo; + uint32_t tessfactor_size; + + /* Buffer for passing parameters between TCS and TES + */ + struct fd_bo *tessparam_bo; + uint32_t tessparam_size; + + struct fd_ringbuffer *tess_addrs_constobj; }; -struct fd_batch * fd_batch_create(struct fd_context *ctx, bool nondraw); +struct fd_batch *fd_batch_create(struct fd_context *ctx, bool nondraw); void fd_batch_reset(struct fd_batch *batch) assert_dt; void fd_batch_flush(struct fd_batch *batch) assert_dt; void fd_batch_add_dep(struct fd_batch *batch, struct fd_batch *dep) assert_dt; -void fd_batch_resource_write(struct fd_batch *batch, struct fd_resource *rsc) assert_dt; -void fd_batch_resource_read_slowpath(struct fd_batch *batch, struct fd_resource *rsc) assert_dt; +void fd_batch_resource_write(struct fd_batch *batch, + struct fd_resource *rsc) assert_dt; +void fd_batch_resource_read_slowpath(struct fd_batch *batch, + struct fd_resource *rsc) assert_dt; void fd_batch_check_size(struct fd_batch *batch) assert_dt; uint32_t fd_batch_key_hash(const void *_key); bool fd_batch_key_equals(const void *_a, const void *_b); -struct fd_batch_key * fd_batch_key_clone(void *mem_ctx, const struct fd_batch_key *key); +struct fd_batch_key *fd_batch_key_clone(void *mem_ctx, + const struct fd_batch_key *key); /* not called directly: */ -void __fd_batch_describe(char* buf, const struct fd_batch *batch) assert_dt; +void __fd_batch_describe(char *buf, const struct fd_batch *batch) assert_dt; void __fd_batch_destroy(struct fd_batch *batch); /* @@ -309,38 +312,39 @@ void __fd_batch_destroy(struct fd_batch *batch); static inline void fd_batch_reference_locked(struct fd_batch **ptr, struct fd_batch *batch) { - struct fd_batch *old_batch = *ptr; + struct fd_batch *old_batch = *ptr; - /* only need lock if a reference is dropped: */ - if (old_batch) - fd_screen_assert_locked(old_batch->ctx->screen); + /* only need lock if a reference is dropped: */ + if (old_batch) + fd_screen_assert_locked(old_batch->ctx->screen); - if (pipe_reference_described(&(*ptr)->reference, &batch->reference, - (debug_reference_descriptor)__fd_batch_describe)) - __fd_batch_destroy(old_batch); + if (pipe_reference_described( + &(*ptr)->reference, &batch->reference, + (debug_reference_descriptor)__fd_batch_describe)) + __fd_batch_destroy(old_batch); - *ptr = batch; + *ptr = batch; } static inline void fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch) { - struct fd_batch *old_batch = *ptr; 
- struct fd_context *ctx = old_batch ? old_batch->ctx : NULL; + struct fd_batch *old_batch = *ptr; + struct fd_context *ctx = old_batch ? old_batch->ctx : NULL; - if (ctx) - fd_screen_lock(ctx->screen); + if (ctx) + fd_screen_lock(ctx->screen); - fd_batch_reference_locked(ptr, batch); + fd_batch_reference_locked(ptr, batch); - if (ctx) - fd_screen_unlock(ctx->screen); + if (ctx) + fd_screen_unlock(ctx->screen); } static inline void fd_batch_unlock_submit(struct fd_batch *batch) { - simple_mtx_unlock(&batch->submit_lock); + simple_mtx_unlock(&batch->submit_lock); } /** @@ -350,39 +354,39 @@ fd_batch_unlock_submit(struct fd_batch *batch) static inline bool MUST_CHECK fd_batch_lock_submit(struct fd_batch *batch) { - simple_mtx_lock(&batch->submit_lock); - bool ret = !batch->flushed; - if (!ret) - fd_batch_unlock_submit(batch); - return ret; + simple_mtx_lock(&batch->submit_lock); + bool ret = !batch->flushed; + if (!ret) + fd_batch_unlock_submit(batch); + return ret; } /* Since we reorder batches and can pause/resume queries (notably for disabling * queries dueing some meta operations), we update the current query state for * the batch before each draw. */ -static inline void fd_batch_update_queries(struct fd_batch *batch) - assert_dt +static inline void +fd_batch_update_queries(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - if (ctx->query_update_batch) - ctx->query_update_batch(batch, false); + if (ctx->query_update_batch) + ctx->query_update_batch(batch, false); } -static inline void fd_batch_finish_queries(struct fd_batch *batch) - assert_dt +static inline void +fd_batch_finish_queries(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - if (ctx->query_update_batch) - ctx->query_update_batch(batch, true); + if (ctx->query_update_batch) + ctx->query_update_batch(batch, true); } static inline void fd_reset_wfi(struct fd_batch *batch) { - batch->needs_wfi = true; + batch->needs_wfi = true; } void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt; @@ -391,23 +395,23 @@ void fd_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt; */ static inline void fd_event_write(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum vgt_event_type evt) + enum vgt_event_type evt) { - OUT_PKT3(ring, CP_EVENT_WRITE, 1); - OUT_RING(ring, evt); - fd_reset_wfi(batch); + OUT_PKT3(ring, CP_EVENT_WRITE, 1); + OUT_RING(ring, evt); + fd_reset_wfi(batch); } /* Get per-tile epilogue */ static inline struct fd_ringbuffer * fd_batch_get_epilogue(struct fd_batch *batch) { - if (batch->epilogue == NULL) - batch->epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000, 0); + if (batch->epilogue == NULL) + batch->epilogue = fd_submit_new_ringbuffer(batch->submit, 0x1000, 0); - return batch->epilogue; + return batch->epilogue; } -struct fd_ringbuffer * fd_batch_get_prologue(struct fd_batch *batch); +struct fd_ringbuffer *fd_batch_get_prologue(struct fd_batch *batch); #endif /* FREEDRENO_BATCH_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.c b/src/gallium/drivers/freedreno/freedreno_batch_cache.c index 5c4b7d9..6e84495 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch_cache.c +++ b/src/gallium/drivers/freedreno/freedreno_batch_cache.c @@ -25,8 +25,8 @@ */ #include "util/hash_table.h" -#include "util/set.h" #include "util/list.h" +#include "util/set.h" #include "util/u_string.h" #define XXH_INLINE_ALL #include "util/xxhash.h" @@ 
-77,118 +77,121 @@ */ struct fd_batch_key { - uint32_t width; - uint32_t height; - uint16_t layers; - uint16_t samples; - uint16_t num_surfs; - uint16_t ctx_seqno; - struct { - struct pipe_resource *texture; - union pipe_surface_desc u; - uint8_t pos, samples; - uint16_t format; - } surf[0]; + uint32_t width; + uint32_t height; + uint16_t layers; + uint16_t samples; + uint16_t num_surfs; + uint16_t ctx_seqno; + struct { + struct pipe_resource *texture; + union pipe_surface_desc u; + uint8_t pos, samples; + uint16_t format; + } surf[0]; }; static struct fd_batch_key * key_alloc(unsigned num_surfs) { - struct fd_batch_key *key = - CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_key, sizeof(key->surf[0]) * num_surfs); - return key; + struct fd_batch_key *key = CALLOC_VARIANT_LENGTH_STRUCT( + fd_batch_key, sizeof(key->surf[0]) * num_surfs); + return key; } uint32_t fd_batch_key_hash(const void *_key) { - const struct fd_batch_key *key = _key; - uint32_t hash = 0; - hash = XXH32(key, offsetof(struct fd_batch_key, surf[0]), hash); - hash = XXH32(key->surf, sizeof(key->surf[0]) * key->num_surfs , hash); - return hash; + const struct fd_batch_key *key = _key; + uint32_t hash = 0; + hash = XXH32(key, offsetof(struct fd_batch_key, surf[0]), hash); + hash = XXH32(key->surf, sizeof(key->surf[0]) * key->num_surfs, hash); + return hash; } bool fd_batch_key_equals(const void *_a, const void *_b) { - const struct fd_batch_key *a = _a; - const struct fd_batch_key *b = _b; - return (memcmp(a, b, offsetof(struct fd_batch_key, surf[0])) == 0) && - (memcmp(a->surf, b->surf, sizeof(a->surf[0]) * a->num_surfs) == 0); + const struct fd_batch_key *a = _a; + const struct fd_batch_key *b = _b; + return (memcmp(a, b, offsetof(struct fd_batch_key, surf[0])) == 0) && + (memcmp(a->surf, b->surf, sizeof(a->surf[0]) * a->num_surfs) == 0); } struct fd_batch_key * fd_batch_key_clone(void *mem_ctx, const struct fd_batch_key *key) { - unsigned sz = sizeof(struct fd_batch_key) + (sizeof(key->surf[0]) * key->num_surfs); - struct fd_batch_key *new_key = rzalloc_size(mem_ctx, sz); - memcpy(new_key, key, sz); - return new_key; + unsigned sz = + sizeof(struct fd_batch_key) + (sizeof(key->surf[0]) * key->num_surfs); + struct fd_batch_key *new_key = rzalloc_size(mem_ctx, sz); + memcpy(new_key, key, sz); + return new_key; } void fd_bc_init(struct fd_batch_cache *cache) { - cache->ht = _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals); + cache->ht = + _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals); } void fd_bc_fini(struct fd_batch_cache *cache) { - _mesa_hash_table_destroy(cache->ht, NULL); + _mesa_hash_table_destroy(cache->ht, NULL); } static void -bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx, bool deferred) - assert_dt +bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx, + bool deferred) assert_dt { - /* fd_batch_flush() (and fd_batch_add_dep() which calls it indirectly) - * can cause batches to be unref'd and freed under our feet, so grab - * a reference to all the batches we need up-front. 
- */ - struct fd_batch *batches[ARRAY_SIZE(cache->batches)] = {0}; - struct fd_batch *batch; - unsigned n = 0; - - fd_screen_lock(ctx->screen); - - foreach_batch(batch, cache, cache->batch_mask) { - if (batch->ctx == ctx) { - fd_batch_reference_locked(&batches[n++], batch); - } - } - - if (deferred) { - struct fd_batch *current_batch = fd_context_batch(ctx); - - for (unsigned i = 0; i < n; i++) { - if (batches[i] && (batches[i]->ctx == ctx) && - (batches[i] != current_batch)) { - fd_batch_add_dep(current_batch, batches[i]); - } - } - - fd_batch_reference_locked(¤t_batch, NULL); - - fd_screen_unlock(ctx->screen); - } else { - fd_screen_unlock(ctx->screen); - - for (unsigned i = 0; i < n; i++) { - fd_batch_flush(batches[i]); - } - } - - for (unsigned i = 0; i < n; i++) { - fd_batch_reference(&batches[i], NULL); - } + /* fd_batch_flush() (and fd_batch_add_dep() which calls it indirectly) + * can cause batches to be unref'd and freed under our feet, so grab + * a reference to all the batches we need up-front. + */ + struct fd_batch *batches[ARRAY_SIZE(cache->batches)] = {0}; + struct fd_batch *batch; + unsigned n = 0; + + fd_screen_lock(ctx->screen); + + foreach_batch(batch, cache, cache->batch_mask) + { + if (batch->ctx == ctx) { + fd_batch_reference_locked(&batches[n++], batch); + } + } + + if (deferred) { + struct fd_batch *current_batch = fd_context_batch(ctx); + + for (unsigned i = 0; i < n; i++) { + if (batches[i] && (batches[i]->ctx == ctx) && + (batches[i] != current_batch)) { + fd_batch_add_dep(current_batch, batches[i]); + } + } + + fd_batch_reference_locked(¤t_batch, NULL); + + fd_screen_unlock(ctx->screen); + } else { + fd_screen_unlock(ctx->screen); + + for (unsigned i = 0; i < n; i++) { + fd_batch_flush(batches[i]); + } + } + + for (unsigned i = 0; i < n; i++) { + fd_batch_reference(&batches[i], NULL); + } } void fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx) { - bc_flush(cache, ctx, false); + bc_flush(cache, ctx, false); } /* deferred flush doesn't actually flush, but it marks every other @@ -199,62 +202,61 @@ fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx) void fd_bc_flush_deferred(struct fd_batch_cache *cache, struct fd_context *ctx) { - bc_flush(cache, ctx, true); + bc_flush(cache, ctx, true); } static bool batch_in_cache(struct fd_batch_cache *cache, struct fd_batch *batch) { - struct fd_batch *b; + struct fd_batch *b; - foreach_batch (b, cache, cache->batch_mask) - if (b == batch) - return true; + foreach_batch(b, cache, cache->batch_mask) if (b == batch) return true; - return false; + return false; } void fd_bc_dump(struct fd_screen *screen, const char *fmt, ...) { - struct fd_batch_cache *cache = &screen->batch_cache; + struct fd_batch_cache *cache = &screen->batch_cache; - if (!BATCH_DEBUG) - return; + if (!BATCH_DEBUG) + return; - fd_screen_lock(screen); + fd_screen_lock(screen); - va_list ap; - va_start(ap, fmt); - vprintf(fmt, ap); - va_end(ap); + va_list ap; + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); - set_foreach (screen->live_batches, entry) { - struct fd_batch *batch = (struct fd_batch *)entry->key; - printf(" %p<%u>%s%s\n", batch, batch->seqno, - batch->needs_flush ? ", NEEDS FLUSH" : "", - batch_in_cache(cache, batch) ? "" : ", ORPHAN"); - } + set_foreach (screen->live_batches, entry) { + struct fd_batch *batch = (struct fd_batch *)entry->key; + printf(" %p<%u>%s%s\n", batch, batch->seqno, + batch->needs_flush ? ", NEEDS FLUSH" : "", + batch_in_cache(cache, batch) ? 
"" : ", ORPHAN"); + } - printf("----\n"); + printf("----\n"); - fd_screen_unlock(screen); + fd_screen_unlock(screen); } void fd_bc_invalidate_context(struct fd_context *ctx) { - struct fd_batch_cache *cache = &ctx->screen->batch_cache; - struct fd_batch *batch; + struct fd_batch_cache *cache = &ctx->screen->batch_cache; + struct fd_batch *batch; - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - foreach_batch(batch, cache, cache->batch_mask) { - if (batch->ctx == ctx) - fd_bc_invalidate_batch(batch, true); - } + foreach_batch(batch, cache, cache->batch_mask) + { + if (batch->ctx == ctx) + fd_bc_invalidate_batch(batch, true); + } - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); } /** @@ -270,72 +272,73 @@ fd_bc_invalidate_context(struct fd_context *ctx) void fd_bc_invalidate_batch(struct fd_batch *batch, bool remove) { - if (!batch) - return; + if (!batch) + return; - struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; - struct fd_batch_key *key = (struct fd_batch_key *)batch->key; + struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache; + struct fd_batch_key *key = (struct fd_batch_key *)batch->key; - fd_screen_assert_locked(batch->ctx->screen); + fd_screen_assert_locked(batch->ctx->screen); - if (remove) { - cache->batches[batch->idx] = NULL; - cache->batch_mask &= ~(1 << batch->idx); - } + if (remove) { + cache->batches[batch->idx] = NULL; + cache->batch_mask &= ~(1 << batch->idx); + } - if (!key) - return; + if (!key) + return; - DBG("%p: key=%p", batch, batch->key); - for (unsigned idx = 0; idx < key->num_surfs; idx++) { - struct fd_resource *rsc = fd_resource(key->surf[idx].texture); - rsc->track->bc_batch_mask &= ~(1 << batch->idx); - } + DBG("%p: key=%p", batch, batch->key); + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + struct fd_resource *rsc = fd_resource(key->surf[idx].texture); + rsc->track->bc_batch_mask &= ~(1 << batch->idx); + } - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(cache->ht, batch->hash, key); - _mesa_hash_table_remove(cache->ht, entry); + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, batch->hash, key); + _mesa_hash_table_remove(cache->ht, entry); - batch->key = NULL; - free(key); + batch->key = NULL; + free(key); } void fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy) { - struct fd_screen *screen = fd_screen(rsc->b.b.screen); - struct fd_batch *batch; + struct fd_screen *screen = fd_screen(rsc->b.b.screen); + struct fd_batch *batch; - fd_screen_lock(screen); + fd_screen_lock(screen); - if (destroy) { - foreach_batch (batch, &screen->batch_cache, rsc->track->batch_mask) { - struct set_entry *entry = _mesa_set_search(batch->resources, rsc); - _mesa_set_remove(batch->resources, entry); - } - rsc->track->batch_mask = 0; + if (destroy) { + foreach_batch(batch, &screen->batch_cache, rsc->track->batch_mask) + { + struct set_entry *entry = _mesa_set_search(batch->resources, rsc); + _mesa_set_remove(batch->resources, entry); + } + rsc->track->batch_mask = 0; - fd_batch_reference_locked(&rsc->track->write_batch, NULL); - } + fd_batch_reference_locked(&rsc->track->write_batch, NULL); + } - foreach_batch (batch, &screen->batch_cache, rsc->track->bc_batch_mask) - fd_bc_invalidate_batch(batch, false); + foreach_batch(batch, &screen->batch_cache, rsc->track->bc_batch_mask) + fd_bc_invalidate_batch(batch, false); - rsc->track->bc_batch_mask = 0; + rsc->track->bc_batch_mask = 0; - fd_screen_unlock(screen); + fd_screen_unlock(screen); } static struct 
fd_batch * -alloc_batch_locked(struct fd_batch_cache *cache, struct fd_context *ctx, bool nondraw) - assert_dt +alloc_batch_locked(struct fd_batch_cache *cache, struct fd_context *ctx, + bool nondraw) assert_dt { - struct fd_batch *batch; - uint32_t idx; + struct fd_batch *batch; + uint32_t idx; - fd_screen_assert_locked(ctx->screen); + fd_screen_assert_locked(ctx->screen); - while ((idx = ffs(~cache->batch_mask)) == 0) { + while ((idx = ffs(~cache->batch_mask)) == 0) { #if 0 for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { batch = cache->batches[i]; @@ -347,168 +350,170 @@ alloc_batch_locked(struct fd_batch_cache *cache, struct fd_context *ctx, bool no debug_printf("\n"); } #endif - /* TODO: is LRU the better policy? Or perhaps the batch that - * depends on the fewest other batches? - */ - struct fd_batch *flush_batch = NULL; - for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { - if (!flush_batch || (cache->batches[i]->seqno < flush_batch->seqno)) - fd_batch_reference_locked(&flush_batch, cache->batches[i]); - } - - /* we can drop lock temporarily here, since we hold a ref, - * flush_batch won't disappear under us. - */ - fd_screen_unlock(ctx->screen); - DBG("%p: too many batches! flush forced!", flush_batch); - fd_batch_flush(flush_batch); - fd_screen_lock(ctx->screen); - - /* While the resources get cleaned up automatically, the flush_batch - * doesn't get removed from the dependencies of other batches, so - * it won't be unref'd and will remain in the table. - * - * TODO maybe keep a bitmask of batches that depend on me, to make - * this easier: - */ - for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { - struct fd_batch *other = cache->batches[i]; - if (!other) - continue; - if (other->dependents_mask & (1 << flush_batch->idx)) { - other->dependents_mask &= ~(1 << flush_batch->idx); - struct fd_batch *ref = flush_batch; - fd_batch_reference_locked(&ref, NULL); - } - } - - fd_batch_reference_locked(&flush_batch, NULL); - } - - idx--; /* bit zero returns 1 for ffs() */ - - batch = fd_batch_create(ctx, nondraw); - if (!batch) - return NULL; - - batch->seqno = cache->cnt++; - batch->idx = idx; - cache->batch_mask |= (1 << idx); - - debug_assert(cache->batches[idx] == NULL); - cache->batches[idx] = batch; - - return batch; + /* TODO: is LRU the better policy? Or perhaps the batch that + * depends on the fewest other batches? + */ + struct fd_batch *flush_batch = NULL; + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + if (!flush_batch || (cache->batches[i]->seqno < flush_batch->seqno)) + fd_batch_reference_locked(&flush_batch, cache->batches[i]); + } + + /* we can drop lock temporarily here, since we hold a ref, + * flush_batch won't disappear under us. + */ + fd_screen_unlock(ctx->screen); + DBG("%p: too many batches! flush forced!", flush_batch); + fd_batch_flush(flush_batch); + fd_screen_lock(ctx->screen); + + /* While the resources get cleaned up automatically, the flush_batch + * doesn't get removed from the dependencies of other batches, so + * it won't be unref'd and will remain in the table. 
+ * + * TODO maybe keep a bitmask of batches that depend on me, to make + * this easier: + */ + for (unsigned i = 0; i < ARRAY_SIZE(cache->batches); i++) { + struct fd_batch *other = cache->batches[i]; + if (!other) + continue; + if (other->dependents_mask & (1 << flush_batch->idx)) { + other->dependents_mask &= ~(1 << flush_batch->idx); + struct fd_batch *ref = flush_batch; + fd_batch_reference_locked(&ref, NULL); + } + } + + fd_batch_reference_locked(&flush_batch, NULL); + } + + idx--; /* bit zero returns 1 for ffs() */ + + batch = fd_batch_create(ctx, nondraw); + if (!batch) + return NULL; + + batch->seqno = cache->cnt++; + batch->idx = idx; + cache->batch_mask |= (1 << idx); + + debug_assert(cache->batches[idx] == NULL); + cache->batches[idx] = batch; + + return batch; } struct fd_batch * -fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx, bool nondraw) +fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx, + bool nondraw) { - struct fd_batch *batch; + struct fd_batch *batch; - /* For normal draw batches, pctx->set_framebuffer_state() handles - * this, but for nondraw batches, this is a nice central location - * to handle them all. - */ - if (nondraw) - fd_context_switch_from(ctx); + /* For normal draw batches, pctx->set_framebuffer_state() handles + * this, but for nondraw batches, this is a nice central location + * to handle them all. + */ + if (nondraw) + fd_context_switch_from(ctx); - fd_screen_lock(ctx->screen); - batch = alloc_batch_locked(cache, ctx, nondraw); - fd_screen_unlock(ctx->screen); + fd_screen_lock(ctx->screen); + batch = alloc_batch_locked(cache, ctx, nondraw); + fd_screen_unlock(ctx->screen); - if (batch && nondraw) - fd_context_switch_to(ctx, batch); + if (batch && nondraw) + fd_context_switch_to(ctx, batch); - return batch; + return batch; } static struct fd_batch * batch_from_key(struct fd_batch_cache *cache, struct fd_batch_key *key, - struct fd_context *ctx) - assert_dt + struct fd_context *ctx) assert_dt { - struct fd_batch *batch = NULL; - uint32_t hash = fd_batch_key_hash(key); - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); - - if (entry) { - free(key); - fd_batch_reference(&batch, (struct fd_batch *)entry->data); - return batch; - } - - batch = alloc_batch_locked(cache, ctx, false); + struct fd_batch *batch = NULL; + uint32_t hash = fd_batch_key_hash(key); + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); + + if (entry) { + free(key); + fd_batch_reference(&batch, (struct fd_batch *)entry->data); + return batch; + } + + batch = alloc_batch_locked(cache, ctx, false); #ifdef DEBUG - DBG("%p: hash=0x%08x, %ux%u, %u layers, %u samples", batch, hash, - key->width, key->height, key->layers, key->samples); - for (unsigned idx = 0; idx < key->num_surfs; idx++) { - DBG("%p: surf[%u]: %p (%s) (%u,%u / %u,%u,%u)", batch, key->surf[idx].pos, - key->surf[idx].texture, util_format_name(key->surf[idx].format), - key->surf[idx].u.buf.first_element, key->surf[idx].u.buf.last_element, - key->surf[idx].u.tex.first_layer, key->surf[idx].u.tex.last_layer, - key->surf[idx].u.tex.level); - } + DBG("%p: hash=0x%08x, %ux%u, %u layers, %u samples", batch, hash, key->width, + key->height, key->layers, key->samples); + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + DBG("%p: surf[%u]: %p (%s) (%u,%u / %u,%u,%u)", batch, + key->surf[idx].pos, key->surf[idx].texture, + util_format_name(key->surf[idx].format), + key->surf[idx].u.buf.first_element, 
key->surf[idx].u.buf.last_element, + key->surf[idx].u.tex.first_layer, key->surf[idx].u.tex.last_layer, + key->surf[idx].u.tex.level); + } #endif - if (!batch) - return NULL; - - /* reset max_scissor, which will be adjusted on draws - * according to the actual scissor. - */ - batch->max_scissor.minx = ~0; - batch->max_scissor.miny = ~0; - batch->max_scissor.maxx = 0; - batch->max_scissor.maxy = 0; - - _mesa_hash_table_insert_pre_hashed(cache->ht, hash, key, batch); - batch->key = key; - batch->hash = hash; - - for (unsigned idx = 0; idx < key->num_surfs; idx++) { - struct fd_resource *rsc = fd_resource(key->surf[idx].texture); - rsc->track->bc_batch_mask = (1 << batch->idx); - } - - return batch; + if (!batch) + return NULL; + + /* reset max_scissor, which will be adjusted on draws + * according to the actual scissor. + */ + batch->max_scissor.minx = ~0; + batch->max_scissor.miny = ~0; + batch->max_scissor.maxx = 0; + batch->max_scissor.maxy = 0; + + _mesa_hash_table_insert_pre_hashed(cache->ht, hash, key, batch); + batch->key = key; + batch->hash = hash; + + for (unsigned idx = 0; idx < key->num_surfs; idx++) { + struct fd_resource *rsc = fd_resource(key->surf[idx].texture); + rsc->track->bc_batch_mask = (1 << batch->idx); + } + + return batch; } static void -key_surf(struct fd_batch_key *key, unsigned idx, unsigned pos, struct pipe_surface *psurf) +key_surf(struct fd_batch_key *key, unsigned idx, unsigned pos, + struct pipe_surface *psurf) { - key->surf[idx].texture = psurf->texture; - key->surf[idx].u = psurf->u; - key->surf[idx].pos = pos; - key->surf[idx].samples = MAX2(1, psurf->nr_samples); - key->surf[idx].format = psurf->format; + key->surf[idx].texture = psurf->texture; + key->surf[idx].u = psurf->u; + key->surf[idx].pos = pos; + key->surf[idx].samples = MAX2(1, psurf->nr_samples); + key->surf[idx].format = psurf->format; } struct fd_batch * fd_batch_from_fb(struct fd_batch_cache *cache, struct fd_context *ctx, - const struct pipe_framebuffer_state *pfb) + const struct pipe_framebuffer_state *pfb) { - unsigned idx = 0, n = pfb->nr_cbufs + (pfb->zsbuf ? 1 : 0); - struct fd_batch_key *key = key_alloc(n); + unsigned idx = 0, n = pfb->nr_cbufs + (pfb->zsbuf ? 
1 : 0); + struct fd_batch_key *key = key_alloc(n); - key->width = pfb->width; - key->height = pfb->height; - key->layers = pfb->layers; - key->samples = util_framebuffer_get_num_samples(pfb); - key->ctx_seqno = ctx->seqno; + key->width = pfb->width; + key->height = pfb->height; + key->layers = pfb->layers; + key->samples = util_framebuffer_get_num_samples(pfb); + key->ctx_seqno = ctx->seqno; - if (pfb->zsbuf) - key_surf(key, idx++, 0, pfb->zsbuf); + if (pfb->zsbuf) + key_surf(key, idx++, 0, pfb->zsbuf); - for (unsigned i = 0; i < pfb->nr_cbufs; i++) - if (pfb->cbufs[i]) - key_surf(key, idx++, i + 1, pfb->cbufs[i]); + for (unsigned i = 0; i < pfb->nr_cbufs; i++) + if (pfb->cbufs[i]) + key_surf(key, idx++, i + 1, pfb->cbufs[i]); - key->num_surfs = idx; + key->num_surfs = idx; - fd_screen_lock(ctx->screen); - struct fd_batch *batch = batch_from_key(cache, key, ctx); - fd_screen_unlock(ctx->screen); + fd_screen_lock(ctx->screen); + struct fd_batch *batch = batch_from_key(cache, key, ctx); + fd_screen_unlock(ctx->screen); - return batch; + return batch; } diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.h b/src/gallium/drivers/freedreno/freedreno_batch_cache.h index 4fddf8f..0c55567 100644 --- a/src/gallium/drivers/freedreno/freedreno_batch_cache.h +++ b/src/gallium/drivers/freedreno/freedreno_batch_cache.h @@ -39,19 +39,19 @@ struct fd_screen; struct hash_table; struct fd_batch_cache { - struct hash_table *ht; - unsigned cnt; + struct hash_table *ht; + unsigned cnt; - /* set of active batches.. there is an upper limit on the number of - * in-flight batches, for two reasons: - * 1) to avoid big spikes in number of batches in edge cases, such as - * game startup (ie, lots of texture uploads, but no usages yet of - * the textures), etc. - * 2) so we can use a simple bitmask in fd_resource to track which - * batches have reference to the resource - */ - struct fd_batch *batches[32]; - uint32_t batch_mask; + /* set of active batches.. there is an upper limit on the number of + * in-flight batches, for two reasons: + * 1) to avoid big spikes in number of batches in edge cases, such as + * game startup (ie, lots of texture uploads, but no usages yet of + * the textures), etc. + * 2) so we can use a simple bitmask in fd_resource to track which + * batches have reference to the resource + */ + struct fd_batch *batches[32]; + uint32_t batch_mask; }; /* note: if batches get unref'd in the body of the loop, they are removed @@ -59,22 +59,29 @@ struct fd_batch_cache { * the loop into _m, we need the &= at the end of the loop to make sure * we don't have stale bits in _m */ -#define foreach_batch(batch, cache, mask) \ - for (uint32_t _m = (mask); _m && ((batch) = (cache)->batches[u_bit_scan(&_m)]); _m &= (mask)) +#define foreach_batch(batch, cache, mask) \ + for (uint32_t _m = (mask); \ + _m && ((batch) = (cache)->batches[u_bit_scan(&_m)]); _m &= (mask)) void fd_bc_init(struct fd_batch_cache *cache); void fd_bc_fini(struct fd_batch_cache *cache); -void fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx) assert_dt; -void fd_bc_flush_deferred(struct fd_batch_cache *cache, struct fd_context *ctx) assert_dt; -void fd_bc_dump(struct fd_screen *screen, const char *fmt, ...) _util_printf_format(2, 3); +void fd_bc_flush(struct fd_batch_cache *cache, + struct fd_context *ctx) assert_dt; +void fd_bc_flush_deferred(struct fd_batch_cache *cache, + struct fd_context *ctx) assert_dt; +void fd_bc_dump(struct fd_screen *screen, const char *fmt, ...) 
+ _util_printf_format(2, 3); void fd_bc_invalidate_context(struct fd_context *ctx); void fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy); void fd_bc_invalidate_resource(struct fd_resource *rsc, bool destroy); -struct fd_batch * fd_bc_alloc_batch(struct fd_batch_cache *cache, struct fd_context *ctx, bool nondraw) assert_dt; +struct fd_batch *fd_bc_alloc_batch(struct fd_batch_cache *cache, + struct fd_context *ctx, + bool nondraw) assert_dt; -struct fd_batch * fd_batch_from_fb(struct fd_batch_cache *cache, - struct fd_context *ctx, const struct pipe_framebuffer_state *pfb) assert_dt; +struct fd_batch * +fd_batch_from_fb(struct fd_batch_cache *cache, struct fd_context *ctx, + const struct pipe_framebuffer_state *pfb) assert_dt; #endif /* FREEDRENO_BATCH_CACHE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_blitter.c b/src/gallium/drivers/freedreno/freedreno_blitter.c index c8863a1..c188d17 100644 --- a/src/gallium/drivers/freedreno/freedreno_blitter.c +++ b/src/gallium/drivers/freedreno/freedreno_blitter.c @@ -29,8 +29,8 @@ #include "freedreno_blitter.h" #include "freedreno_context.h" -#include "freedreno_resource.h" #include "freedreno_fence.h" +#include "freedreno_resource.h" /* generic blit using u_blitter.. slightly modified version of util_blitter_blit * which also handles PIPE_BUFFER: @@ -38,230 +38,226 @@ static void default_dst_texture(struct pipe_surface *dst_templ, struct pipe_resource *dst, - unsigned dstlevel, unsigned dstz) + unsigned dstlevel, unsigned dstz) { - memset(dst_templ, 0, sizeof(*dst_templ)); - dst_templ->u.tex.level = dstlevel; - dst_templ->u.tex.first_layer = dstz; - dst_templ->u.tex.last_layer = dstz; + memset(dst_templ, 0, sizeof(*dst_templ)); + dst_templ->u.tex.level = dstlevel; + dst_templ->u.tex.first_layer = dstz; + dst_templ->u.tex.last_layer = dstz; } static void default_src_texture(struct pipe_sampler_view *src_templ, - struct pipe_resource *src, unsigned srclevel) + struct pipe_resource *src, unsigned srclevel) { - bool cube_as_2darray = src->screen->get_param(src->screen, - PIPE_CAP_SAMPLER_VIEW_TARGET); - - memset(src_templ, 0, sizeof(*src_templ)); - - if (cube_as_2darray && (src->target == PIPE_TEXTURE_CUBE || - src->target == PIPE_TEXTURE_CUBE_ARRAY)) - src_templ->target = PIPE_TEXTURE_2D_ARRAY; - else - src_templ->target = src->target; - - if (src->target == PIPE_BUFFER) { - src_templ->target = PIPE_TEXTURE_1D; - } - src_templ->u.tex.first_level = srclevel; - src_templ->u.tex.last_level = srclevel; - src_templ->u.tex.first_layer = 0; - src_templ->u.tex.last_layer = - src->target == PIPE_TEXTURE_3D ? u_minify(src->depth0, srclevel) - 1 - : (unsigned)(src->array_size - 1); - src_templ->swizzle_r = PIPE_SWIZZLE_X; - src_templ->swizzle_g = PIPE_SWIZZLE_Y; - src_templ->swizzle_b = PIPE_SWIZZLE_Z; - src_templ->swizzle_a = PIPE_SWIZZLE_W; + bool cube_as_2darray = + src->screen->get_param(src->screen, PIPE_CAP_SAMPLER_VIEW_TARGET); + + memset(src_templ, 0, sizeof(*src_templ)); + + if (cube_as_2darray && (src->target == PIPE_TEXTURE_CUBE || + src->target == PIPE_TEXTURE_CUBE_ARRAY)) + src_templ->target = PIPE_TEXTURE_2D_ARRAY; + else + src_templ->target = src->target; + + if (src->target == PIPE_BUFFER) { + src_templ->target = PIPE_TEXTURE_1D; + } + src_templ->u.tex.first_level = srclevel; + src_templ->u.tex.last_level = srclevel; + src_templ->u.tex.first_layer = 0; + src_templ->u.tex.last_layer = src->target == PIPE_TEXTURE_3D + ? 
u_minify(src->depth0, srclevel) - 1 + : (unsigned)(src->array_size - 1); + src_templ->swizzle_r = PIPE_SWIZZLE_X; + src_templ->swizzle_g = PIPE_SWIZZLE_Y; + src_templ->swizzle_b = PIPE_SWIZZLE_Z; + src_templ->swizzle_a = PIPE_SWIZZLE_W; } static void -fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, bool discard) - assert_dt +fd_blitter_pipe_begin(struct fd_context *ctx, bool render_cond, + bool discard) assert_dt { - fd_fence_ref(&ctx->last_fence, NULL); - - util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb); - util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx); - util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vs); - util_blitter_save_tessctrl_shader(ctx->blitter, ctx->prog.hs); - util_blitter_save_tesseval_shader(ctx->blitter, ctx->prog.ds); - util_blitter_save_geometry_shader(ctx->blitter, ctx->prog.gs); - util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets, - ctx->streamout.targets); - util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer); - util_blitter_save_viewport(ctx->blitter, &ctx->viewport); - util_blitter_save_scissor(ctx->blitter, &ctx->scissor); - util_blitter_save_fragment_shader(ctx->blitter, ctx->prog.fs); - util_blitter_save_blend(ctx->blitter, ctx->blend); - util_blitter_save_depth_stencil_alpha(ctx->blitter, ctx->zsa); - util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); - util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask); - util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); - util_blitter_save_fragment_sampler_states(ctx->blitter, - ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers, - (void **)ctx->tex[PIPE_SHADER_FRAGMENT].samplers); - util_blitter_save_fragment_sampler_views(ctx->blitter, - ctx->tex[PIPE_SHADER_FRAGMENT].num_textures, - ctx->tex[PIPE_SHADER_FRAGMENT].textures); - if (!render_cond) - util_blitter_save_render_condition(ctx->blitter, - ctx->cond_query, ctx->cond_cond, ctx->cond_mode); - - if (ctx->batch) - fd_batch_update_queries(ctx->batch); - - ctx->in_discard_blit = discard; + fd_fence_ref(&ctx->last_fence, NULL); + + util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vtx.vertexbuf.vb); + util_blitter_save_vertex_elements(ctx->blitter, ctx->vtx.vtx); + util_blitter_save_vertex_shader(ctx->blitter, ctx->prog.vs); + util_blitter_save_tessctrl_shader(ctx->blitter, ctx->prog.hs); + util_blitter_save_tesseval_shader(ctx->blitter, ctx->prog.ds); + util_blitter_save_geometry_shader(ctx->blitter, ctx->prog.gs); + util_blitter_save_so_targets(ctx->blitter, ctx->streamout.num_targets, + ctx->streamout.targets); + util_blitter_save_rasterizer(ctx->blitter, ctx->rasterizer); + util_blitter_save_viewport(ctx->blitter, &ctx->viewport); + util_blitter_save_scissor(ctx->blitter, &ctx->scissor); + util_blitter_save_fragment_shader(ctx->blitter, ctx->prog.fs); + util_blitter_save_blend(ctx->blitter, ctx->blend); + util_blitter_save_depth_stencil_alpha(ctx->blitter, ctx->zsa); + util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); + util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask); + util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); + util_blitter_save_fragment_sampler_states( + ctx->blitter, ctx->tex[PIPE_SHADER_FRAGMENT].num_samplers, + (void **)ctx->tex[PIPE_SHADER_FRAGMENT].samplers); + util_blitter_save_fragment_sampler_views( + ctx->blitter, ctx->tex[PIPE_SHADER_FRAGMENT].num_textures, + ctx->tex[PIPE_SHADER_FRAGMENT].textures); + if (!render_cond) + util_blitter_save_render_condition(ctx->blitter, 
ctx->cond_query, + ctx->cond_cond, ctx->cond_mode); + + if (ctx->batch) + fd_batch_update_queries(ctx->batch); + + ctx->in_discard_blit = discard; } static void -fd_blitter_pipe_end(struct fd_context *ctx) - assert_dt +fd_blitter_pipe_end(struct fd_context *ctx) assert_dt { - ctx->in_discard_blit = false; + ctx->in_discard_blit = false; } bool fd_blitter_blit(struct fd_context *ctx, const struct pipe_blit_info *info) { - struct pipe_resource *dst = info->dst.resource; - struct pipe_resource *src = info->src.resource; - struct pipe_context *pipe = &ctx->base; - struct pipe_surface *dst_view, dst_templ; - struct pipe_sampler_view src_templ, *src_view; - bool discard = false; - - if (!info->scissor_enable && !info->alpha_blend) { - discard = util_texrange_covers_whole_level(info->dst.resource, - info->dst.level, info->dst.box.x, info->dst.box.y, - info->dst.box.z, info->dst.box.width, - info->dst.box.height, info->dst.box.depth); - } - - fd_blitter_pipe_begin(ctx, info->render_condition_enable, discard); - - /* Initialize the surface. */ - default_dst_texture(&dst_templ, dst, info->dst.level, - info->dst.box.z); - dst_templ.format = info->dst.format; - dst_view = pipe->create_surface(pipe, dst, &dst_templ); - - /* Initialize the sampler view. */ - default_src_texture(&src_templ, src, info->src.level); - src_templ.format = info->src.format; - src_view = pipe->create_sampler_view(pipe, src, &src_templ); - - /* Copy. */ - util_blitter_blit_generic(ctx->blitter, dst_view, &info->dst.box, - src_view, &info->src.box, src->width0, src->height0, - info->mask, info->filter, - info->scissor_enable ? &info->scissor : NULL, - info->alpha_blend); - - pipe_surface_reference(&dst_view, NULL); - pipe_sampler_view_reference(&src_view, NULL); - - fd_blitter_pipe_end(ctx); - - /* The fallback blitter must never fail: */ - return true; + struct pipe_resource *dst = info->dst.resource; + struct pipe_resource *src = info->src.resource; + struct pipe_context *pipe = &ctx->base; + struct pipe_surface *dst_view, dst_templ; + struct pipe_sampler_view src_templ, *src_view; + bool discard = false; + + if (!info->scissor_enable && !info->alpha_blend) { + discard = util_texrange_covers_whole_level( + info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y, + info->dst.box.z, info->dst.box.width, info->dst.box.height, + info->dst.box.depth); + } + + fd_blitter_pipe_begin(ctx, info->render_condition_enable, discard); + + /* Initialize the surface. */ + default_dst_texture(&dst_templ, dst, info->dst.level, info->dst.box.z); + dst_templ.format = info->dst.format; + dst_view = pipe->create_surface(pipe, dst, &dst_templ); + + /* Initialize the sampler view. */ + default_src_texture(&src_templ, src, info->src.level); + src_templ.format = info->src.format; + src_view = pipe->create_sampler_view(pipe, src, &src_templ); + + /* Copy. */ + util_blitter_blit_generic( + ctx->blitter, dst_view, &info->dst.box, src_view, &info->src.box, + src->width0, src->height0, info->mask, info->filter, + info->scissor_enable ? 
&info->scissor : NULL, info->alpha_blend); + + pipe_surface_reference(&dst_view, NULL); + pipe_sampler_view_reference(&src_view, NULL); + + fd_blitter_pipe_end(ctx); + + /* The fallback blitter must never fail: */ + return true; } /* Generic clear implementation (partially) using u_blitter: */ void fd_blitter_clear(struct pipe_context *pctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) + const union pipe_color_union *color, double depth, + unsigned stencil) { - struct fd_context *ctx = fd_context(pctx); - struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; - struct blitter_context *blitter = ctx->blitter; - - /* Note: don't use discard=true, if there was something to - * discard, that would have been already handled in fd_clear(). - */ - fd_blitter_pipe_begin(ctx, false, false); - - util_blitter_save_fragment_constant_buffer_slot(ctx->blitter, - ctx->constbuf[PIPE_SHADER_FRAGMENT].cb); - - util_blitter_common_clear_setup(blitter, pfb->width, pfb->height, - buffers, NULL, NULL); - - struct pipe_stencil_ref sr = { - .ref_value = { stencil & 0xff } - }; - pctx->set_stencil_ref(pctx, sr); - - struct pipe_constant_buffer cb = { - .buffer_size = 16, - .user_buffer = &color->ui, - }; - pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 0, false, &cb); - - unsigned rs_idx = pfb->samples > 1 ? 1 : 0; - if (!ctx->clear_rs_state[rs_idx]) { - const struct pipe_rasterizer_state tmpl = { - .cull_face = PIPE_FACE_NONE, - .half_pixel_center = 1, - .bottom_edge_rule = 1, - .flatshade = 1, - .depth_clip_near = 1, - .depth_clip_far = 1, - .multisample = pfb->samples > 1, - }; - ctx->clear_rs_state[rs_idx] = pctx->create_rasterizer_state(pctx, &tmpl); - } - pctx->bind_rasterizer_state(pctx, ctx->clear_rs_state[rs_idx]); - - struct pipe_viewport_state vp = { - .scale = { 0.5f * pfb->width, -0.5f * pfb->height, depth }, - .translate = { 0.5f * pfb->width, 0.5f * pfb->height, 0.0f }, - }; - pctx->set_viewport_states(pctx, 0, 1, &vp); - - pctx->bind_vertex_elements_state(pctx, ctx->solid_vbuf_state.vtx); - pctx->set_vertex_buffers(pctx, blitter->vb_slot, 1, 0, false, - &ctx->solid_vbuf_state.vertexbuf.vb[0]); - pctx->set_stream_output_targets(pctx, 0, NULL, NULL); - - if (pfb->layers > 1) - pctx->bind_vs_state(pctx, ctx->solid_layered_prog.vs); - else - pctx->bind_vs_state(pctx, ctx->solid_prog.vs); - - pctx->bind_fs_state(pctx, ctx->solid_prog.fs); - - /* Clear geom/tess shaders, lest the draw emit code think we are - * trying to use use them: - */ - pctx->bind_gs_state(pctx, NULL); - pctx->bind_tcs_state(pctx, NULL); - pctx->bind_tes_state(pctx, NULL); - - struct pipe_draw_info info = { - .mode = PIPE_PRIM_MAX, /* maps to DI_PT_RECTLIST */ - .index_bounds_valid = true, - .max_index = 1, - .instance_count = MAX2(1, pfb->layers), - }; - struct pipe_draw_start_count draw = { - .count = 2, - }; - pctx->draw_vbo(pctx, &info, NULL, &draw, 1); - - /* We expect that this should not have triggered a change in pfb: */ - assert(util_framebuffer_state_equal(pfb, &ctx->framebuffer)); - - util_blitter_restore_constant_buffer_state(blitter); - util_blitter_restore_vertex_states(blitter); - util_blitter_restore_fragment_states(blitter); - util_blitter_restore_textures(blitter); - util_blitter_restore_fb_state(blitter); - util_blitter_restore_render_cond(blitter); - util_blitter_unset_running_flag(blitter); - - fd_blitter_pipe_end(ctx); + struct fd_context *ctx = fd_context(pctx); + struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer; + struct blitter_context 
*blitter = ctx->blitter; + + /* Note: don't use discard=true, if there was something to + * discard, that would have been already handled in fd_clear(). + */ + fd_blitter_pipe_begin(ctx, false, false); + + util_blitter_save_fragment_constant_buffer_slot( + ctx->blitter, ctx->constbuf[PIPE_SHADER_FRAGMENT].cb); + + util_blitter_common_clear_setup(blitter, pfb->width, pfb->height, buffers, + NULL, NULL); + + struct pipe_stencil_ref sr = {.ref_value = {stencil & 0xff}}; + pctx->set_stencil_ref(pctx, sr); + + struct pipe_constant_buffer cb = { + .buffer_size = 16, + .user_buffer = &color->ui, + }; + pctx->set_constant_buffer(pctx, PIPE_SHADER_FRAGMENT, 0, false, &cb); + + unsigned rs_idx = pfb->samples > 1 ? 1 : 0; + if (!ctx->clear_rs_state[rs_idx]) { + const struct pipe_rasterizer_state tmpl = { + .cull_face = PIPE_FACE_NONE, + .half_pixel_center = 1, + .bottom_edge_rule = 1, + .flatshade = 1, + .depth_clip_near = 1, + .depth_clip_far = 1, + .multisample = pfb->samples > 1, + }; + ctx->clear_rs_state[rs_idx] = pctx->create_rasterizer_state(pctx, &tmpl); + } + pctx->bind_rasterizer_state(pctx, ctx->clear_rs_state[rs_idx]); + + struct pipe_viewport_state vp = { + .scale = {0.5f * pfb->width, -0.5f * pfb->height, depth}, + .translate = {0.5f * pfb->width, 0.5f * pfb->height, 0.0f}, + }; + pctx->set_viewport_states(pctx, 0, 1, &vp); + + pctx->bind_vertex_elements_state(pctx, ctx->solid_vbuf_state.vtx); + pctx->set_vertex_buffers(pctx, blitter->vb_slot, 1, 0, false, + &ctx->solid_vbuf_state.vertexbuf.vb[0]); + pctx->set_stream_output_targets(pctx, 0, NULL, NULL); + + if (pfb->layers > 1) + pctx->bind_vs_state(pctx, ctx->solid_layered_prog.vs); + else + pctx->bind_vs_state(pctx, ctx->solid_prog.vs); + + pctx->bind_fs_state(pctx, ctx->solid_prog.fs); + + /* Clear geom/tess shaders, lest the draw emit code think we are + * trying to use use them: + */ + pctx->bind_gs_state(pctx, NULL); + pctx->bind_tcs_state(pctx, NULL); + pctx->bind_tes_state(pctx, NULL); + + struct pipe_draw_info info = { + .mode = PIPE_PRIM_MAX, /* maps to DI_PT_RECTLIST */ + .index_bounds_valid = true, + .max_index = 1, + .instance_count = MAX2(1, pfb->layers), + }; + struct pipe_draw_start_count draw = { + .count = 2, + }; + pctx->draw_vbo(pctx, &info, NULL, &draw, 1); + + /* We expect that this should not have triggered a change in pfb: */ + assert(util_framebuffer_state_equal(pfb, &ctx->framebuffer)); + + util_blitter_restore_constant_buffer_state(blitter); + util_blitter_restore_vertex_states(blitter); + util_blitter_restore_fragment_states(blitter); + util_blitter_restore_textures(blitter); + util_blitter_restore_fb_state(blitter); + util_blitter_restore_render_cond(blitter); + util_blitter_unset_running_flag(blitter); + + fd_blitter_pipe_end(ctx); } /** @@ -271,58 +267,54 @@ fd_blitter_clear(struct pipe_context *pctx, unsigned buffers, bool fd_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) { - struct fd_context *ctx = fd_context(pctx); - struct pipe_blit_info info = *blit_info; + struct fd_context *ctx = fd_context(pctx); + struct pipe_blit_info info = *blit_info; - if (info.render_condition_enable && !fd_render_condition_check(pctx)) - return true; + if (info.render_condition_enable && !fd_render_condition_check(pctx)) + return true; - if (ctx->blit && ctx->blit(ctx, &info)) - return true; + if (ctx->blit && ctx->blit(ctx, &info)) + return true; - if (info.mask & PIPE_MASK_S) { - DBG("cannot blit stencil, skipping"); - info.mask &= ~PIPE_MASK_S; - } + if (info.mask & PIPE_MASK_S) { + DBG("cannot 
blit stencil, skipping"); + info.mask &= ~PIPE_MASK_S; + } - if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { - DBG("blit unsupported %s -> %s", - util_format_short_name(info.src.resource->format), - util_format_short_name(info.dst.resource->format)); - return false; - } + if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { + DBG("blit unsupported %s -> %s", + util_format_short_name(info.src.resource->format), + util_format_short_name(info.dst.resource->format)); + return false; + } - return fd_blitter_blit(ctx, &info); + return fd_blitter_blit(ctx, &info); } /** * _copy_region using pipe (3d engine) */ static bool -fd_blitter_pipe_copy_region(struct fd_context *ctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) - assert_dt +fd_blitter_pipe_copy_region(struct fd_context *ctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, + unsigned dstz, struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) assert_dt { - /* not until we allow rendertargets to be buffers */ - if (dst->target == PIPE_BUFFER || src->target == PIPE_BUFFER) - return false; + /* not until we allow rendertargets to be buffers */ + if (dst->target == PIPE_BUFFER || src->target == PIPE_BUFFER) + return false; - if (!util_blitter_is_copy_supported(ctx->blitter, dst, src)) - return false; + if (!util_blitter_is_copy_supported(ctx->blitter, dst, src)) + return false; - /* TODO we could discard if dst box covers dst level fully.. */ - fd_blitter_pipe_begin(ctx, false, false); - util_blitter_copy_texture(ctx->blitter, - dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); - fd_blitter_pipe_end(ctx); + /* TODO we could discard if dst box covers dst level fully.. */ + fd_blitter_pipe_begin(ctx, false, false); + util_blitter_copy_texture(ctx->blitter, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box); + fd_blitter_pipe_end(ctx); - return true; + return true; } /** @@ -330,56 +322,51 @@ fd_blitter_pipe_copy_region(struct fd_context *ctx, * The resource must be of the same format. 
*/ void -fd_resource_copy_region(struct pipe_context *pctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) +fd_resource_copy_region(struct pipe_context *pctx, struct pipe_resource *dst, + unsigned dst_level, unsigned dstx, unsigned dsty, + unsigned dstz, struct pipe_resource *src, + unsigned src_level, const struct pipe_box *src_box) { - struct fd_context *ctx = fd_context(pctx); - - if (ctx->blit) { - struct pipe_blit_info info; - - memset(&info, 0, sizeof info); - info.dst.resource = dst; - info.dst.level = dst_level; - info.dst.box.x = dstx; - info.dst.box.y = dsty; - info.dst.box.z = dstz; - info.dst.box.width = src_box->width; - info.dst.box.height = src_box->height; - assert(info.dst.box.width >= 0); - assert(info.dst.box.height >= 0); - info.dst.box.depth = 1; - info.dst.format = dst->format; - info.src.resource = src; - info.src.level = src_level; - info.src.box = *src_box; - info.src.format = src->format; - info.mask = util_format_get_mask(src->format); - info.filter = PIPE_TEX_FILTER_NEAREST; - info.scissor_enable = 0; - - if (ctx->blit(ctx, &info)) - return; - } - - /* TODO if we have 2d core, or other DMA engine that could be used - * for simple copies and reasonably easily synchronized with the 3d - * core, this is where we'd plug it in.. - */ - - /* try blit on 3d pipe: */ - if (fd_blitter_pipe_copy_region(ctx, - dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box)) - return; - - /* else fallback to pure sw: */ - util_resource_copy_region(pctx, - dst, dst_level, dstx, dsty, dstz, - src, src_level, src_box); + struct fd_context *ctx = fd_context(pctx); + + if (ctx->blit) { + struct pipe_blit_info info; + + memset(&info, 0, sizeof info); + info.dst.resource = dst; + info.dst.level = dst_level; + info.dst.box.x = dstx; + info.dst.box.y = dsty; + info.dst.box.z = dstz; + info.dst.box.width = src_box->width; + info.dst.box.height = src_box->height; + assert(info.dst.box.width >= 0); + assert(info.dst.box.height >= 0); + info.dst.box.depth = 1; + info.dst.format = dst->format; + info.src.resource = src; + info.src.level = src_level; + info.src.box = *src_box; + info.src.format = src->format; + info.mask = util_format_get_mask(src->format); + info.filter = PIPE_TEX_FILTER_NEAREST; + info.scissor_enable = 0; + + if (ctx->blit(ctx, &info)) + return; + } + + /* TODO if we have 2d core, or other DMA engine that could be used + * for simple copies and reasonably easily synchronized with the 3d + * core, this is where we'd plug it in.. 
+ */ + + /* try blit on 3d pipe: */ + if (fd_blitter_pipe_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, + src_level, src_box)) + return; + + /* else fallback to pure sw: */ + util_resource_copy_region(pctx, dst, dst_level, dstx, dsty, dstz, src, + src_level, src_box); } diff --git a/src/gallium/drivers/freedreno/freedreno_blitter.h b/src/gallium/drivers/freedreno/freedreno_blitter.h index 03d75c3..07ca736 100644 --- a/src/gallium/drivers/freedreno/freedreno_blitter.h +++ b/src/gallium/drivers/freedreno/freedreno_blitter.h @@ -31,20 +31,20 @@ #include "freedreno_context.h" -bool fd_blitter_blit(struct fd_context *ctx, const struct pipe_blit_info *info) assert_dt; +bool fd_blitter_blit(struct fd_context *ctx, + const struct pipe_blit_info *info) assert_dt; -void -fd_blitter_clear(struct pipe_context *pctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) assert_dt; +void fd_blitter_clear(struct pipe_context *pctx, unsigned buffers, + const union pipe_color_union *color, double depth, + unsigned stencil) assert_dt; void fd_resource_copy_region(struct pipe_context *pctx, - struct pipe_resource *dst, - unsigned dst_level, - unsigned dstx, unsigned dsty, unsigned dstz, - struct pipe_resource *src, - unsigned src_level, - const struct pipe_box *src_box) assert_dt; - -bool fd_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) assert_dt; + struct pipe_resource *dst, unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, unsigned src_level, + const struct pipe_box *src_box) assert_dt; + +bool fd_blit(struct pipe_context *pctx, + const struct pipe_blit_info *blit_info) assert_dt; #endif /* FREEDRENO_BLIT_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index 73f1f7f..fd06147 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -25,194 +25,191 @@ */ #include "freedreno_context.h" +#include "ir3/ir3_cache.h" +#include "util/u_upload_mgr.h" #include "freedreno_blitter.h" #include "freedreno_draw.h" #include "freedreno_fence.h" -#include "freedreno_program.h" -#include "freedreno_resource.h" -#include "freedreno_texture.h" -#include "freedreno_state.h" #include "freedreno_gmem.h" +#include "freedreno_program.h" #include "freedreno_query.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" +#include "freedreno_state.h" +#include "freedreno_texture.h" #include "freedreno_util.h" -#include "ir3/ir3_cache.h" -#include "util/u_upload_mgr.h" static void fd_context_flush(struct pipe_context *pctx, struct pipe_fence_handle **fencep, - unsigned flags) - in_dt + unsigned flags) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct pipe_fence_handle *fence = NULL; - struct fd_batch *batch = NULL; - - /* We want to lookup current batch if it exists, but not create a new - * one if not (unless we need a fence) - */ - fd_batch_reference(&batch, ctx->batch); - - DBG("%p: flush: flags=%x", batch, flags); - - if (fencep && !batch) { - batch = fd_context_batch(ctx); - } else if (!batch) { - fd_bc_dump(ctx->screen, "%p: NULL batch, remaining:\n", ctx); - return; - } - - /* With TC_FLUSH_ASYNC, the fence will have been pre-created from - * the front-end thread. But not yet associated with a batch, - * because we cannot safely access ctx->batch outside of the driver - * thread. 
So instead, replace the existing batch->fence with the - * one created earlier - */ - if ((flags & TC_FLUSH_ASYNC) && fencep) { - /* We don't currently expect async+flush in the fence-fd - * case.. for that to work properly we'd need TC to tell - * us in the create_fence callback that it needs an fd. - */ - assert(!(flags & PIPE_FLUSH_FENCE_FD)); - - fd_fence_set_batch(*fencep, batch); - fd_fence_ref(&batch->fence, *fencep); - - /* We (a) cannot substitute the provided fence with last_fence, - * and (b) need fd_fence_populate() to be eventually called on - * the fence that was pre-created in frontend-thread: - */ - fd_fence_ref(&ctx->last_fence, NULL); - - /* async flush is not compatible with deferred flush, since - * nothing triggers the batch flush which fence_flush() would - * be waiting for - */ - flags &= ~PIPE_FLUSH_DEFERRED; - } - - /* In some sequence of events, we can end up with a last_fence that is - * not an "fd" fence, which results in eglDupNativeFenceFDANDROID() - * errors. - */ - if ((flags & PIPE_FLUSH_FENCE_FD) && ctx->last_fence && - !fd_fence_is_fd(ctx->last_fence)) - fd_fence_ref(&ctx->last_fence, NULL); - - /* if no rendering since last flush, ie. app just decided it needed - * a fence, re-use the last one: - */ - if (ctx->last_fence) { - fd_fence_ref(&fence, ctx->last_fence); - fd_bc_dump(ctx->screen, "%p: reuse last_fence, remaining:\n", ctx); - goto out; - } - - /* Take a ref to the batch's fence (batch can be unref'd when flushed: */ - fd_fence_ref(&fence, batch->fence); - - if (flags & PIPE_FLUSH_FENCE_FD) - batch->needs_out_fence_fd = true; - - fd_bc_dump(ctx->screen, "%p: flushing %p<%u>, flags=0x%x, pending:\n", - ctx, batch, batch->seqno, flags); - - if (!ctx->screen->reorder) { - fd_batch_flush(batch); - } else if (flags & PIPE_FLUSH_DEFERRED) { - fd_bc_flush_deferred(&ctx->screen->batch_cache, ctx); - } else { - fd_bc_flush(&ctx->screen->batch_cache, ctx); - } - - fd_bc_dump(ctx->screen, "%p: remaining:\n", ctx); + struct fd_context *ctx = fd_context(pctx); + struct pipe_fence_handle *fence = NULL; + struct fd_batch *batch = NULL; + + /* We want to lookup current batch if it exists, but not create a new + * one if not (unless we need a fence) + */ + fd_batch_reference(&batch, ctx->batch); + + DBG("%p: flush: flags=%x", batch, flags); + + if (fencep && !batch) { + batch = fd_context_batch(ctx); + } else if (!batch) { + fd_bc_dump(ctx->screen, "%p: NULL batch, remaining:\n", ctx); + return; + } + + /* With TC_FLUSH_ASYNC, the fence will have been pre-created from + * the front-end thread. But not yet associated with a batch, + * because we cannot safely access ctx->batch outside of the driver + * thread. So instead, replace the existing batch->fence with the + * one created earlier + */ + if ((flags & TC_FLUSH_ASYNC) && fencep) { + /* We don't currently expect async+flush in the fence-fd + * case.. for that to work properly we'd need TC to tell + * us in the create_fence callback that it needs an fd. 
+ */ + assert(!(flags & PIPE_FLUSH_FENCE_FD)); + + fd_fence_set_batch(*fencep, batch); + fd_fence_ref(&batch->fence, *fencep); + + /* We (a) cannot substitute the provided fence with last_fence, + * and (b) need fd_fence_populate() to be eventually called on + * the fence that was pre-created in frontend-thread: + */ + fd_fence_ref(&ctx->last_fence, NULL); + + /* async flush is not compatible with deferred flush, since + * nothing triggers the batch flush which fence_flush() would + * be waiting for + */ + flags &= ~PIPE_FLUSH_DEFERRED; + } + + /* In some sequence of events, we can end up with a last_fence that is + * not an "fd" fence, which results in eglDupNativeFenceFDANDROID() + * errors. + */ + if ((flags & PIPE_FLUSH_FENCE_FD) && ctx->last_fence && + !fd_fence_is_fd(ctx->last_fence)) + fd_fence_ref(&ctx->last_fence, NULL); + + /* if no rendering since last flush, ie. app just decided it needed + * a fence, re-use the last one: + */ + if (ctx->last_fence) { + fd_fence_ref(&fence, ctx->last_fence); + fd_bc_dump(ctx->screen, "%p: reuse last_fence, remaining:\n", ctx); + goto out; + } + + /* Take a ref to the batch's fence (batch can be unref'd when flushed: */ + fd_fence_ref(&fence, batch->fence); + + if (flags & PIPE_FLUSH_FENCE_FD) + batch->needs_out_fence_fd = true; + + fd_bc_dump(ctx->screen, "%p: flushing %p<%u>, flags=0x%x, pending:\n", ctx, + batch, batch->seqno, flags); + + if (!ctx->screen->reorder) { + fd_batch_flush(batch); + } else if (flags & PIPE_FLUSH_DEFERRED) { + fd_bc_flush_deferred(&ctx->screen->batch_cache, ctx); + } else { + fd_bc_flush(&ctx->screen->batch_cache, ctx); + } + + fd_bc_dump(ctx->screen, "%p: remaining:\n", ctx); out: - if (fencep) - fd_fence_ref(fencep, fence); + if (fencep) + fd_fence_ref(fencep, fence); - fd_fence_ref(&ctx->last_fence, fence); + fd_fence_ref(&ctx->last_fence, fence); - fd_fence_ref(&fence, NULL); + fd_fence_ref(&fence, NULL); - fd_batch_reference(&batch, NULL); + fd_batch_reference(&batch, NULL); - u_trace_context_process(&ctx->trace_context, - !!(flags & PIPE_FLUSH_END_OF_FRAME)); + u_trace_context_process(&ctx->trace_context, + !!(flags & PIPE_FLUSH_END_OF_FRAME)); } static void -fd_texture_barrier(struct pipe_context *pctx, unsigned flags) - in_dt +fd_texture_barrier(struct pipe_context *pctx, unsigned flags) in_dt { - if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER) { - struct fd_context *ctx = fd_context(pctx); - - if (ctx->framebuffer_barrier) { - ctx->framebuffer_barrier(ctx); - return; - } - } - - /* On devices that could sample from GMEM we could possibly do better. - * Or if we knew that we were doing GMEM bypass we could just emit a - * cache flush, perhaps? But we don't know if future draws would cause - * us to use GMEM, and a flush in bypass isn't the end of the world. - */ - fd_context_flush(pctx, NULL, 0); + if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER) { + struct fd_context *ctx = fd_context(pctx); + + if (ctx->framebuffer_barrier) { + ctx->framebuffer_barrier(ctx); + return; + } + } + + /* On devices that could sample from GMEM we could possibly do better. + * Or if we knew that we were doing GMEM bypass we could just emit a + * cache flush, perhaps? But we don't know if future draws would cause + * us to use GMEM, and a flush in bypass isn't the end of the world. 
+ */ + fd_context_flush(pctx, NULL, 0); } static void fd_memory_barrier(struct pipe_context *pctx, unsigned flags) { - if (!(flags & ~PIPE_BARRIER_UPDATE)) - return; + if (!(flags & ~PIPE_BARRIER_UPDATE)) + return; - fd_context_flush(pctx, NULL, 0); - /* TODO do we need to check for persistently mapped buffers and fd_bo_cpu_prep()?? */ + fd_context_flush(pctx, NULL, 0); + /* TODO do we need to check for persistently mapped buffers and + * fd_bo_cpu_prep()?? */ } static void emit_string_tail(struct fd_ringbuffer *ring, const char *string, int len) { - const uint32_t *buf = (const void *)string; - - while (len >= 4) { - OUT_RING(ring, *buf); - buf++; - len -= 4; - } - - /* copy remainder bytes without reading past end of input string: */ - if (len > 0) { - uint32_t w = 0; - memcpy(&w, buf, len); - OUT_RING(ring, w); - } + const uint32_t *buf = (const void *)string; + + while (len >= 4) { + OUT_RING(ring, *buf); + buf++; + len -= 4; + } + + /* copy remainder bytes without reading past end of input string: */ + if (len > 0) { + uint32_t w = 0; + memcpy(&w, buf, len); + OUT_RING(ring, w); + } } /* for prior to a5xx: */ void -fd_emit_string(struct fd_ringbuffer *ring, - const char *string, int len) +fd_emit_string(struct fd_ringbuffer *ring, const char *string, int len) { - /* max packet size is 0x3fff+1 dwords: */ - len = MIN2(len, 0x4000 * 4); + /* max packet size is 0x3fff+1 dwords: */ + len = MIN2(len, 0x4000 * 4); - OUT_PKT3(ring, CP_NOP, align(len, 4) / 4); - emit_string_tail(ring, string, len); + OUT_PKT3(ring, CP_NOP, align(len, 4) / 4); + emit_string_tail(ring, string, len); } /* for a5xx+ */ void -fd_emit_string5(struct fd_ringbuffer *ring, - const char *string, int len) +fd_emit_string5(struct fd_ringbuffer *ring, const char *string, int len) { - /* max packet size is 0x3fff dwords: */ - len = MIN2(len, 0x3fff * 4); + /* max packet size is 0x3fff dwords: */ + len = MIN2(len, 0x3fff * 4); - OUT_PKT7(ring, CP_NOP, align(len, 4) / 4); - emit_string_tail(ring, string, len); + OUT_PKT7(ring, CP_NOP, align(len, 4) / 4); + emit_string_tail(ring, string, len); } /** @@ -220,26 +217,26 @@ fd_emit_string5(struct fd_ringbuffer *ring, * decoded by cffdump. 
*/ static void -fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len) - in_dt +fd_emit_string_marker(struct pipe_context *pctx, const char *string, + int len) in_dt { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - if (!ctx->batch) - return; + if (!ctx->batch) + return; - struct fd_batch *batch = fd_context_batch_locked(ctx); + struct fd_batch *batch = fd_context_batch_locked(ctx); - ctx->batch->needs_flush = true; + ctx->batch->needs_flush = true; - if (ctx->screen->gpu_id >= 500) { - fd_emit_string5(batch->draw, string, len); - } else { - fd_emit_string(batch->draw, string, len); - } + if (ctx->screen->gpu_id >= 500) { + fd_emit_string5(batch->draw, string, len); + } else { + fd_emit_string(batch->draw, string, len); + } - fd_batch_unlock_submit(batch); - fd_batch_reference(&batch, NULL); + fd_batch_unlock_submit(batch); + fd_batch_reference(&batch, NULL); } /** @@ -252,8 +249,8 @@ fd_emit_string_marker(struct pipe_context *pctx, const char *string, int len) void fd_context_switch_from(struct fd_context *ctx) { - if (ctx->batch && (ctx->batch->in_fence_fd != -1)) - fd_batch_flush(ctx->batch); + if (ctx->batch && (ctx->batch->in_fence_fd != -1)) + fd_batch_flush(ctx->batch); } /** @@ -264,11 +261,11 @@ fd_context_switch_from(struct fd_context *ctx) void fd_context_switch_to(struct fd_context *ctx, struct fd_batch *batch) { - if (ctx->in_fence_fd != -1) { - sync_accumulate("freedreno", &batch->in_fence_fd, ctx->in_fence_fd); - close(ctx->in_fence_fd); - ctx->in_fence_fd = -1; - } + if (ctx->in_fence_fd != -1) { + sync_accumulate("freedreno", &batch->in_fence_fd, ctx->in_fence_fd); + close(ctx->in_fence_fd); + ctx->in_fence_fd = -1; + } } /** @@ -277,21 +274,22 @@ fd_context_switch_to(struct fd_context *ctx, struct fd_batch *batch) struct fd_batch * fd_context_batch(struct fd_context *ctx) { - struct fd_batch *batch = NULL; + struct fd_batch *batch = NULL; - tc_assert_driver_thread(ctx->tc); + tc_assert_driver_thread(ctx->tc); - fd_batch_reference(&batch, ctx->batch); + fd_batch_reference(&batch, ctx->batch); - if (unlikely(!batch)) { - batch = fd_batch_from_fb(&ctx->screen->batch_cache, ctx, &ctx->framebuffer); - util_copy_framebuffer_state(&batch->framebuffer, &ctx->framebuffer); - fd_batch_reference(&ctx->batch, batch); - fd_context_all_dirty(ctx); - } - fd_context_switch_to(ctx, batch); + if (unlikely(!batch)) { + batch = + fd_batch_from_fb(&ctx->screen->batch_cache, ctx, &ctx->framebuffer); + util_copy_framebuffer_state(&batch->framebuffer, &ctx->framebuffer); + fd_batch_reference(&ctx->batch, batch); + fd_context_all_dirty(ctx); + } + fd_context_switch_to(ctx, batch); - return batch; + return batch; } /** @@ -303,178 +301,180 @@ fd_context_batch(struct fd_context *ctx) struct fd_batch * fd_context_batch_locked(struct fd_context *ctx) { - struct fd_batch *batch = NULL; + struct fd_batch *batch = NULL; - while (!batch) { - batch = fd_context_batch(ctx); - if (!fd_batch_lock_submit(batch)) { - fd_batch_reference(&batch, NULL); - } - } + while (!batch) { + batch = fd_context_batch(ctx); + if (!fd_batch_lock_submit(batch)) { + fd_batch_reference(&batch, NULL); + } + } - return batch; + return batch; } void fd_context_destroy(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); - unsigned i; + struct fd_context *ctx = fd_context(pctx); + unsigned i; - DBG(""); + DBG(""); - fd_screen_lock(ctx->screen); - list_del(&ctx->node); - fd_screen_unlock(ctx->screen); + fd_screen_lock(ctx->screen); + 
list_del(&ctx->node); + fd_screen_unlock(ctx->screen); - fd_fence_ref(&ctx->last_fence, NULL); + fd_fence_ref(&ctx->last_fence, NULL); - if (ctx->in_fence_fd != -1) - close(ctx->in_fence_fd); + if (ctx->in_fence_fd != -1) + close(ctx->in_fence_fd); - for (i = 0; i < ARRAY_SIZE(ctx->pvtmem); i++) { - if (ctx->pvtmem[i].bo) - fd_bo_del(ctx->pvtmem[i].bo); - } + for (i = 0; i < ARRAY_SIZE(ctx->pvtmem); i++) { + if (ctx->pvtmem[i].bo) + fd_bo_del(ctx->pvtmem[i].bo); + } - util_copy_framebuffer_state(&ctx->framebuffer, NULL); - fd_batch_reference(&ctx->batch, NULL); /* unref current batch */ - fd_bc_invalidate_context(ctx); + util_copy_framebuffer_state(&ctx->framebuffer, NULL); + fd_batch_reference(&ctx->batch, NULL); /* unref current batch */ + fd_bc_invalidate_context(ctx); - fd_prog_fini(pctx); + fd_prog_fini(pctx); - if (ctx->blitter) - util_blitter_destroy(ctx->blitter); + if (ctx->blitter) + util_blitter_destroy(ctx->blitter); - if (pctx->stream_uploader) - u_upload_destroy(pctx->stream_uploader); + if (pctx->stream_uploader) + u_upload_destroy(pctx->stream_uploader); - for (i = 0; i < ARRAY_SIZE(ctx->clear_rs_state); i++) - if (ctx->clear_rs_state[i]) - pctx->delete_rasterizer_state(pctx, ctx->clear_rs_state[i]); + for (i = 0; i < ARRAY_SIZE(ctx->clear_rs_state); i++) + if (ctx->clear_rs_state[i]) + pctx->delete_rasterizer_state(pctx, ctx->clear_rs_state[i]); - if (ctx->primconvert) - util_primconvert_destroy(ctx->primconvert); + if (ctx->primconvert) + util_primconvert_destroy(ctx->primconvert); - slab_destroy_child(&ctx->transfer_pool); - slab_destroy_child(&ctx->transfer_pool_unsync); + slab_destroy_child(&ctx->transfer_pool); + slab_destroy_child(&ctx->transfer_pool_unsync); - for (i = 0; i < ARRAY_SIZE(ctx->vsc_pipe_bo); i++) { - if (!ctx->vsc_pipe_bo[i]) - break; - fd_bo_del(ctx->vsc_pipe_bo[i]); - } + for (i = 0; i < ARRAY_SIZE(ctx->vsc_pipe_bo); i++) { + if (!ctx->vsc_pipe_bo[i]) + break; + fd_bo_del(ctx->vsc_pipe_bo[i]); + } - fd_device_del(ctx->dev); - fd_pipe_del(ctx->pipe); + fd_device_del(ctx->dev); + fd_pipe_del(ctx->pipe); - simple_mtx_destroy(&ctx->gmem_lock); + simple_mtx_destroy(&ctx->gmem_lock); - u_trace_context_fini(&ctx->trace_context); + u_trace_context_fini(&ctx->trace_context); - fd_autotune_fini(&ctx->autotune); + fd_autotune_fini(&ctx->autotune); - ir3_cache_destroy(ctx->shader_cache); + ir3_cache_destroy(ctx->shader_cache); - if (FD_DBG(BSTAT) || FD_DBG(MSGS)) { - mesa_logi("batch_total=%u, batch_sysmem=%u, batch_gmem=%u, batch_nondraw=%u, batch_restore=%u\n", - (uint32_t)ctx->stats.batch_total, (uint32_t)ctx->stats.batch_sysmem, - (uint32_t)ctx->stats.batch_gmem, (uint32_t)ctx->stats.batch_nondraw, - (uint32_t)ctx->stats.batch_restore); - } + if (FD_DBG(BSTAT) || FD_DBG(MSGS)) { + mesa_logi( + "batch_total=%u, batch_sysmem=%u, batch_gmem=%u, batch_nondraw=%u, " + "batch_restore=%u\n", + (uint32_t)ctx->stats.batch_total, (uint32_t)ctx->stats.batch_sysmem, + (uint32_t)ctx->stats.batch_gmem, (uint32_t)ctx->stats.batch_nondraw, + (uint32_t)ctx->stats.batch_restore); + } } static void fd_set_debug_callback(struct pipe_context *pctx, - const struct pipe_debug_callback *cb) + const struct pipe_debug_callback *cb) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - if (cb) - ctx->debug = *cb; - else - memset(&ctx->debug, 0, sizeof(ctx->debug)); + if (cb) + ctx->debug = *cb; + else + memset(&ctx->debug, 0, sizeof(ctx->debug)); } static uint32_t fd_get_reset_count(struct fd_context *ctx, bool per_context) { - uint64_t val; - 
enum fd_param_id param = - per_context ? FD_CTX_FAULTS : FD_GLOBAL_FAULTS; - int ret = fd_pipe_get_param(ctx->pipe, param, &val); - debug_assert(!ret); - return val; + uint64_t val; + enum fd_param_id param = per_context ? FD_CTX_FAULTS : FD_GLOBAL_FAULTS; + int ret = fd_pipe_get_param(ctx->pipe, param, &val); + debug_assert(!ret); + return val; } static enum pipe_reset_status fd_get_device_reset_status(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); - int context_faults = fd_get_reset_count(ctx, true); - int global_faults = fd_get_reset_count(ctx, false); - enum pipe_reset_status status; - - /* Not called in driver thread, but threaded_context syncs - * before calling this: - */ - fd_context_access_begin(ctx); - - if (context_faults != ctx->context_reset_count) { - status = PIPE_GUILTY_CONTEXT_RESET; - } else if (global_faults != ctx->global_reset_count) { - status = PIPE_INNOCENT_CONTEXT_RESET; - } else { - status = PIPE_NO_RESET; - } - - ctx->context_reset_count = context_faults; - ctx->global_reset_count = global_faults; - - fd_context_access_end(ctx); - - return status; + struct fd_context *ctx = fd_context(pctx); + int context_faults = fd_get_reset_count(ctx, true); + int global_faults = fd_get_reset_count(ctx, false); + enum pipe_reset_status status; + + /* Not called in driver thread, but threaded_context syncs + * before calling this: + */ + fd_context_access_begin(ctx); + + if (context_faults != ctx->context_reset_count) { + status = PIPE_GUILTY_CONTEXT_RESET; + } else if (global_faults != ctx->global_reset_count) { + status = PIPE_INNOCENT_CONTEXT_RESET; + } else { + status = PIPE_NO_RESET; + } + + ctx->context_reset_count = context_faults; + ctx->global_reset_count = global_faults; + + fd_context_access_end(ctx); + + return status; } static void fd_trace_record_ts(struct u_trace *ut, struct pipe_resource *timestamps, - unsigned idx) + unsigned idx) { - struct fd_batch *batch = container_of(ut, struct fd_batch, trace); - struct fd_ringbuffer *ring = batch->nondraw ? batch->draw : batch->gmem; - - if (ring->cur == batch->last_timestamp_cmd) { - uint64_t *ts = fd_bo_map(fd_resource(timestamps)->bo); - ts[idx] = U_TRACE_NO_TIMESTAMP; - return; - } - - unsigned ts_offset = idx * sizeof(uint64_t); - batch->ctx->record_timestamp(ring, fd_resource(timestamps)->bo, ts_offset); - batch->last_timestamp_cmd = ring->cur; + struct fd_batch *batch = container_of(ut, struct fd_batch, trace); + struct fd_ringbuffer *ring = batch->nondraw ? 
batch->draw : batch->gmem; + + if (ring->cur == batch->last_timestamp_cmd) { + uint64_t *ts = fd_bo_map(fd_resource(timestamps)->bo); + ts[idx] = U_TRACE_NO_TIMESTAMP; + return; + } + + unsigned ts_offset = idx * sizeof(uint64_t); + batch->ctx->record_timestamp(ring, fd_resource(timestamps)->bo, ts_offset); + batch->last_timestamp_cmd = ring->cur; } static uint64_t fd_trace_read_ts(struct u_trace_context *utctx, - struct pipe_resource *timestamps, unsigned idx) + struct pipe_resource *timestamps, unsigned idx) { - struct fd_context *ctx = container_of(utctx, struct fd_context, trace_context); - struct fd_bo *ts_bo = fd_resource(timestamps)->bo; + struct fd_context *ctx = + container_of(utctx, struct fd_context, trace_context); + struct fd_bo *ts_bo = fd_resource(timestamps)->bo; - /* Only need to stall on results for the first entry: */ - if (idx == 0) { - int ret = fd_bo_cpu_prep(ts_bo, ctx->pipe, DRM_FREEDRENO_PREP_READ); - if (ret) - return U_TRACE_NO_TIMESTAMP; - } + /* Only need to stall on results for the first entry: */ + if (idx == 0) { + int ret = fd_bo_cpu_prep(ts_bo, ctx->pipe, DRM_FREEDRENO_PREP_READ); + if (ret) + return U_TRACE_NO_TIMESTAMP; + } - uint64_t *ts = fd_bo_map(ts_bo); + uint64_t *ts = fd_bo_map(ts_bo); - /* Don't translate the no-timestamp marker: */ - if (ts[idx] == U_TRACE_NO_TIMESTAMP) - return U_TRACE_NO_TIMESTAMP; + /* Don't translate the no-timestamp marker: */ + if (ts[idx] == U_TRACE_NO_TIMESTAMP) + return U_TRACE_NO_TIMESTAMP; - return ctx->ts_to_ns(ts[idx]); + return ctx->ts_to_ns(ts[idx]); } /* TODO we could combine a few of these small buffers (solid_vbuf, @@ -485,201 +485,204 @@ fd_trace_read_ts(struct u_trace_context *utctx, static struct pipe_resource * create_solid_vertexbuf(struct pipe_context *pctx) { - static const float init_shader_const[] = { - -1.000000, +1.000000, +1.000000, - +1.000000, -1.000000, +1.000000, - }; - struct pipe_resource *prsc = pipe_buffer_create(pctx->screen, - PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, sizeof(init_shader_const)); - pipe_buffer_write(pctx, prsc, 0, - sizeof(init_shader_const), init_shader_const); - return prsc; + static const float init_shader_const[] = { + -1.000000, +1.000000, +1.000000, +1.000000, -1.000000, +1.000000, + }; + struct pipe_resource *prsc = + pipe_buffer_create(pctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, + sizeof(init_shader_const)); + pipe_buffer_write(pctx, prsc, 0, sizeof(init_shader_const), + init_shader_const); + return prsc; } static struct pipe_resource * create_blit_texcoord_vertexbuf(struct pipe_context *pctx) { - struct pipe_resource *prsc = pipe_buffer_create(pctx->screen, - PIPE_BIND_CUSTOM, PIPE_USAGE_DYNAMIC, 16); - return prsc; + struct pipe_resource *prsc = pipe_buffer_create( + pctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_DYNAMIC, 16); + return prsc; } void fd_context_setup_common_vbos(struct fd_context *ctx) { - struct pipe_context *pctx = &ctx->base; - - ctx->solid_vbuf = create_solid_vertexbuf(pctx); - ctx->blit_texcoord_vbuf = create_blit_texcoord_vertexbuf(pctx); - - /* setup solid_vbuf_state: */ - ctx->solid_vbuf_state.vtx = pctx->create_vertex_elements_state( - pctx, 1, (struct pipe_vertex_element[]){{ - .vertex_buffer_index = 0, - .src_offset = 0, - .src_format = PIPE_FORMAT_R32G32B32_FLOAT, - }}); - ctx->solid_vbuf_state.vertexbuf.count = 1; - ctx->solid_vbuf_state.vertexbuf.vb[0].stride = 12; - ctx->solid_vbuf_state.vertexbuf.vb[0].buffer.resource = ctx->solid_vbuf; - - /* setup blit_vbuf_state: */ - ctx->blit_vbuf_state.vtx = 
pctx->create_vertex_elements_state( - pctx, 2, (struct pipe_vertex_element[]){{ - .vertex_buffer_index = 0, - .src_offset = 0, - .src_format = PIPE_FORMAT_R32G32_FLOAT, - }, { - .vertex_buffer_index = 1, - .src_offset = 0, - .src_format = PIPE_FORMAT_R32G32B32_FLOAT, - }}); - ctx->blit_vbuf_state.vertexbuf.count = 2; - ctx->blit_vbuf_state.vertexbuf.vb[0].stride = 8; - ctx->blit_vbuf_state.vertexbuf.vb[0].buffer.resource = ctx->blit_texcoord_vbuf; - ctx->blit_vbuf_state.vertexbuf.vb[1].stride = 12; - ctx->blit_vbuf_state.vertexbuf.vb[1].buffer.resource = ctx->solid_vbuf; + struct pipe_context *pctx = &ctx->base; + + ctx->solid_vbuf = create_solid_vertexbuf(pctx); + ctx->blit_texcoord_vbuf = create_blit_texcoord_vertexbuf(pctx); + + /* setup solid_vbuf_state: */ + ctx->solid_vbuf_state.vtx = pctx->create_vertex_elements_state( + pctx, 1, + (struct pipe_vertex_element[]){{ + .vertex_buffer_index = 0, + .src_offset = 0, + .src_format = PIPE_FORMAT_R32G32B32_FLOAT, + }}); + ctx->solid_vbuf_state.vertexbuf.count = 1; + ctx->solid_vbuf_state.vertexbuf.vb[0].stride = 12; + ctx->solid_vbuf_state.vertexbuf.vb[0].buffer.resource = ctx->solid_vbuf; + + /* setup blit_vbuf_state: */ + ctx->blit_vbuf_state.vtx = pctx->create_vertex_elements_state( + pctx, 2, + (struct pipe_vertex_element[]){ + { + .vertex_buffer_index = 0, + .src_offset = 0, + .src_format = PIPE_FORMAT_R32G32_FLOAT, + }, + { + .vertex_buffer_index = 1, + .src_offset = 0, + .src_format = PIPE_FORMAT_R32G32B32_FLOAT, + }}); + ctx->blit_vbuf_state.vertexbuf.count = 2; + ctx->blit_vbuf_state.vertexbuf.vb[0].stride = 8; + ctx->blit_vbuf_state.vertexbuf.vb[0].buffer.resource = + ctx->blit_texcoord_vbuf; + ctx->blit_vbuf_state.vertexbuf.vb[1].stride = 12; + ctx->blit_vbuf_state.vertexbuf.vb[1].buffer.resource = ctx->solid_vbuf; } void fd_context_cleanup_common_vbos(struct fd_context *ctx) { - struct pipe_context *pctx = &ctx->base; + struct pipe_context *pctx = &ctx->base; - pctx->delete_vertex_elements_state(pctx, ctx->solid_vbuf_state.vtx); - pctx->delete_vertex_elements_state(pctx, ctx->blit_vbuf_state.vtx); + pctx->delete_vertex_elements_state(pctx, ctx->solid_vbuf_state.vtx); + pctx->delete_vertex_elements_state(pctx, ctx->blit_vbuf_state.vtx); - pipe_resource_reference(&ctx->solid_vbuf, NULL); - pipe_resource_reference(&ctx->blit_texcoord_vbuf, NULL); + pipe_resource_reference(&ctx->solid_vbuf, NULL); + pipe_resource_reference(&ctx->blit_texcoord_vbuf, NULL); } struct pipe_context * fd_context_init(struct fd_context *ctx, struct pipe_screen *pscreen, - const uint8_t *primtypes, void *priv, unsigned flags) - disable_thread_safety_analysis + const uint8_t *primtypes, void *priv, + unsigned flags) disable_thread_safety_analysis { - struct fd_screen *screen = fd_screen(pscreen); - struct pipe_context *pctx; - unsigned prio = 1; - int i; - - /* lower numerical value == higher priority: */ - if (FD_DBG(HIPRIO)) - prio = 0; - else if (flags & PIPE_CONTEXT_HIGH_PRIORITY) - prio = 0; - else if (flags & PIPE_CONTEXT_LOW_PRIORITY) - prio = 2; - - /* Some of the stats will get printed out at context destroy, so - * make sure they are collected: - */ - if (FD_DBG(BSTAT) || FD_DBG(MSGS)) - ctx->stats_users++; - - ctx->screen = screen; - ctx->pipe = fd_pipe_new2(screen->dev, FD_PIPE_3D, prio); - - ctx->in_fence_fd = -1; - - if (fd_device_version(screen->dev) >= FD_VERSION_ROBUSTNESS) { - ctx->context_reset_count = fd_get_reset_count(ctx, true); - ctx->global_reset_count = fd_get_reset_count(ctx, false); - } - - ctx->primtypes = primtypes; - 
ctx->primtype_mask = 0; - for (i = 0; i <= PIPE_PRIM_MAX; i++) - if (primtypes[i]) - ctx->primtype_mask |= (1 << i); - - simple_mtx_init(&ctx->gmem_lock, mtx_plain); - - /* need some sane default in case gallium frontends don't - * set some state: - */ - ctx->sample_mask = 0xffff; - ctx->active_queries = true; - - pctx = &ctx->base; - pctx->screen = pscreen; - pctx->priv = priv; - pctx->flush = fd_context_flush; - pctx->emit_string_marker = fd_emit_string_marker; - pctx->set_debug_callback = fd_set_debug_callback; - pctx->get_device_reset_status = fd_get_device_reset_status; - pctx->create_fence_fd = fd_create_fence_fd; - pctx->fence_server_sync = fd_fence_server_sync; - pctx->fence_server_signal = fd_fence_server_signal; - pctx->texture_barrier = fd_texture_barrier; - pctx->memory_barrier = fd_memory_barrier; - - pctx->stream_uploader = u_upload_create_default(pctx); - if (!pctx->stream_uploader) - goto fail; - pctx->const_uploader = pctx->stream_uploader; - - slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); - slab_create_child(&ctx->transfer_pool_unsync, &screen->transfer_pool); - - fd_draw_init(pctx); - fd_resource_context_init(pctx); - fd_query_context_init(pctx); - fd_texture_init(pctx); - fd_state_init(pctx); - - ctx->blitter = util_blitter_create(pctx); - if (!ctx->blitter) - goto fail; - - ctx->primconvert = util_primconvert_create(pctx, ctx->primtype_mask); - if (!ctx->primconvert) - goto fail; - - list_inithead(&ctx->hw_active_queries); - list_inithead(&ctx->acc_active_queries); - - fd_screen_lock(ctx->screen); - ctx->seqno = ++screen->ctx_seqno; - list_add(&ctx->node, &ctx->screen->context_list); - fd_screen_unlock(ctx->screen); - - ctx->current_scissor = &ctx->disabled_scissor; - - u_trace_context_init(&ctx->trace_context, pctx, - fd_trace_record_ts, fd_trace_read_ts); - - fd_autotune_init(&ctx->autotune, screen->dev); - - return pctx; + struct fd_screen *screen = fd_screen(pscreen); + struct pipe_context *pctx; + unsigned prio = 1; + int i; + + /* lower numerical value == higher priority: */ + if (FD_DBG(HIPRIO)) + prio = 0; + else if (flags & PIPE_CONTEXT_HIGH_PRIORITY) + prio = 0; + else if (flags & PIPE_CONTEXT_LOW_PRIORITY) + prio = 2; + + /* Some of the stats will get printed out at context destroy, so + * make sure they are collected: + */ + if (FD_DBG(BSTAT) || FD_DBG(MSGS)) + ctx->stats_users++; + + ctx->screen = screen; + ctx->pipe = fd_pipe_new2(screen->dev, FD_PIPE_3D, prio); + + ctx->in_fence_fd = -1; + + if (fd_device_version(screen->dev) >= FD_VERSION_ROBUSTNESS) { + ctx->context_reset_count = fd_get_reset_count(ctx, true); + ctx->global_reset_count = fd_get_reset_count(ctx, false); + } + + ctx->primtypes = primtypes; + ctx->primtype_mask = 0; + for (i = 0; i <= PIPE_PRIM_MAX; i++) + if (primtypes[i]) + ctx->primtype_mask |= (1 << i); + + simple_mtx_init(&ctx->gmem_lock, mtx_plain); + + /* need some sane default in case gallium frontends don't + * set some state: + */ + ctx->sample_mask = 0xffff; + ctx->active_queries = true; + + pctx = &ctx->base; + pctx->screen = pscreen; + pctx->priv = priv; + pctx->flush = fd_context_flush; + pctx->emit_string_marker = fd_emit_string_marker; + pctx->set_debug_callback = fd_set_debug_callback; + pctx->get_device_reset_status = fd_get_device_reset_status; + pctx->create_fence_fd = fd_create_fence_fd; + pctx->fence_server_sync = fd_fence_server_sync; + pctx->fence_server_signal = fd_fence_server_signal; + pctx->texture_barrier = fd_texture_barrier; + pctx->memory_barrier = fd_memory_barrier; + + 
pctx->stream_uploader = u_upload_create_default(pctx); + if (!pctx->stream_uploader) + goto fail; + pctx->const_uploader = pctx->stream_uploader; + + slab_create_child(&ctx->transfer_pool, &screen->transfer_pool); + slab_create_child(&ctx->transfer_pool_unsync, &screen->transfer_pool); + + fd_draw_init(pctx); + fd_resource_context_init(pctx); + fd_query_context_init(pctx); + fd_texture_init(pctx); + fd_state_init(pctx); + + ctx->blitter = util_blitter_create(pctx); + if (!ctx->blitter) + goto fail; + + ctx->primconvert = util_primconvert_create(pctx, ctx->primtype_mask); + if (!ctx->primconvert) + goto fail; + + list_inithead(&ctx->hw_active_queries); + list_inithead(&ctx->acc_active_queries); + + fd_screen_lock(ctx->screen); + ctx->seqno = ++screen->ctx_seqno; + list_add(&ctx->node, &ctx->screen->context_list); + fd_screen_unlock(ctx->screen); + + ctx->current_scissor = &ctx->disabled_scissor; + + u_trace_context_init(&ctx->trace_context, pctx, fd_trace_record_ts, + fd_trace_read_ts); + + fd_autotune_init(&ctx->autotune, screen->dev); + + return pctx; fail: - pctx->destroy(pctx); - return NULL; + pctx->destroy(pctx); + return NULL; } struct pipe_context * fd_context_init_tc(struct pipe_context *pctx, unsigned flags) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) - return pctx; + if (!(flags & PIPE_CONTEXT_PREFER_THREADED)) + return pctx; - /* Clover (compute-only) is unsupported. */ - if (flags & PIPE_CONTEXT_COMPUTE_ONLY) - return pctx; + /* Clover (compute-only) is unsupported. */ + if (flags & PIPE_CONTEXT_COMPUTE_ONLY) + return pctx; - struct pipe_context *tc = threaded_context_create(pctx, - &ctx->screen->transfer_pool, - fd_replace_buffer_storage, - fd_fence_create_unflushed, - &ctx->tc); + struct pipe_context *tc = threaded_context_create( + pctx, &ctx->screen->transfer_pool, fd_replace_buffer_storage, + fd_fence_create_unflushed, &ctx->tc); - uint64_t total_ram; - if (tc && tc != pctx && os_get_total_physical_memory(&total_ram)) { - ((struct threaded_context *) tc)->bytes_mapped_limit = total_ram / 16; - } + uint64_t total_ram; + if (tc && tc != pctx && os_get_total_physical_memory(&total_ram)) { + ((struct threaded_context *)tc)->bytes_mapped_limit = total_ram / 16; + } - return tc; + return tc; } diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 87ac910..d85861b 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -27,19 +27,19 @@ #ifndef FREEDRENO_CONTEXT_H_ #define FREEDRENO_CONTEXT_H_ -#include "pipe/p_context.h" #include "indices/u_primconvert.h" -#include "util/u_blitter.h" +#include "pipe/p_context.h" #include "util/libsync.h" #include "util/list.h" #include "util/slab.h" +#include "util/u_blitter.h" #include "util/u_string.h" #include "util/u_threaded_context.h" #include "util/u_trace.h" #include "freedreno_autotune.h" -#include "freedreno_screen.h" #include "freedreno_gmem.h" +#include "freedreno_screen.h" #include "freedreno_util.h" #define BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * BORDERCOLOR_SIZE) @@ -48,86 +48,87 @@ struct fd_vertex_stateobj; struct fd_batch; struct fd_texture_stateobj { - struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS]; - unsigned num_textures; - unsigned valid_textures; - struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; - unsigned num_samplers; - unsigned valid_samplers; - /* number of samples per 
sampler, 2 bits per sampler: */ - uint32_t samples; + struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS]; + unsigned num_textures; + unsigned valid_textures; + struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS]; + unsigned num_samplers; + unsigned valid_samplers; + /* number of samples per sampler, 2 bits per sampler: */ + uint32_t samples; }; struct fd_program_stateobj { - void *vs, *hs, *ds, *gs, *fs; + void *vs, *hs, *ds, *gs, *fs; }; struct fd_constbuf_stateobj { - struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS]; - uint32_t enabled_mask; + struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS]; + uint32_t enabled_mask; }; struct fd_shaderbuf_stateobj { - struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS]; - uint32_t enabled_mask; - uint32_t writable_mask; + struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS]; + uint32_t enabled_mask; + uint32_t writable_mask; }; struct fd_shaderimg_stateobj { - struct pipe_image_view si[PIPE_MAX_SHADER_IMAGES]; - uint32_t enabled_mask; + struct pipe_image_view si[PIPE_MAX_SHADER_IMAGES]; + uint32_t enabled_mask; }; struct fd_vertexbuf_stateobj { - struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS]; - unsigned count; - uint32_t enabled_mask; + struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS]; + unsigned count; + uint32_t enabled_mask; }; struct fd_vertex_stateobj { - struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS]; - unsigned num_elements; + struct pipe_vertex_element pipe[PIPE_MAX_ATTRIBS]; + unsigned num_elements; }; struct fd_stream_output_target { - struct pipe_stream_output_target base; - struct pipe_resource *offset_buf; - /* stride of the last stream out recorded to this target, for glDrawTransformFeedback(). */ - uint32_t stride; + struct pipe_stream_output_target base; + struct pipe_resource *offset_buf; + /* stride of the last stream out recorded to this target, for + * glDrawTransformFeedback(). */ + uint32_t stride; }; struct fd_streamout_stateobj { - struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; - /* Bitmask of stream that should be reset. */ - unsigned reset; - - unsigned num_targets; - /* Track offset from vtxcnt for streamout data. This counter - * is just incremented by # of vertices on each draw until - * reset or new streamout buffer bound. - * - * When we eventually have GS, the CPU won't actually know the - * number of vertices per draw, so I think we'll have to do - * something more clever. - */ - unsigned offsets[PIPE_MAX_SO_BUFFERS]; - - /* Pre-a6xx, the maximum number of vertices that could be recorded to this - * set of targets with the current vertex shader. a6xx and newer, hardware - * queries are used. - */ - unsigned max_tf_vtx; - - /* Pre-a6xx, the number of verts written to the buffers since the last - * Begin. Used for overflow checking for SW queries. - */ - unsigned verts_written; + struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS]; + /* Bitmask of stream that should be reset. */ + unsigned reset; + + unsigned num_targets; + /* Track offset from vtxcnt for streamout data. This counter + * is just incremented by # of vertices on each draw until + * reset or new streamout buffer bound. + * + * When we eventually have GS, the CPU won't actually know the + * number of vertices per draw, so I think we'll have to do + * something more clever. + */ + unsigned offsets[PIPE_MAX_SO_BUFFERS]; + + /* Pre-a6xx, the maximum number of vertices that could be recorded to this + * set of targets with the current vertex shader. a6xx and newer, hardware + * queries are used. 
+ */ + unsigned max_tf_vtx; + + /* Pre-a6xx, the number of verts written to the buffers since the last + * Begin. Used for overflow checking for SW queries. + */ + unsigned verts_written; }; #define MAX_GLOBAL_BUFFERS 16 struct fd_global_bindings_stateobj { - struct pipe_resource *buf[MAX_GLOBAL_BUFFERS]; - uint32_t enabled_mask; + struct pipe_resource *buf[MAX_GLOBAL_BUFFERS]; + uint32_t enabled_mask; }; /* group together the vertex and vertexbuf state.. for ease of passing @@ -135,54 +136,54 @@ struct fd_global_bindings_stateobj { * need their own vertex state: */ struct fd_vertex_state { - struct fd_vertex_stateobj *vtx; - struct fd_vertexbuf_stateobj vertexbuf; + struct fd_vertex_stateobj *vtx; + struct fd_vertexbuf_stateobj vertexbuf; }; /* global 3d pipeline dirty state: */ enum fd_dirty_3d_state { - FD_DIRTY_BLEND = BIT(0), - FD_DIRTY_RASTERIZER = BIT(1), - FD_DIRTY_ZSA = BIT(2), - FD_DIRTY_BLEND_COLOR = BIT(3), - FD_DIRTY_STENCIL_REF = BIT(4), - FD_DIRTY_SAMPLE_MASK = BIT(5), - FD_DIRTY_FRAMEBUFFER = BIT(6), - FD_DIRTY_STIPPLE = BIT(7), - FD_DIRTY_VIEWPORT = BIT(8), - FD_DIRTY_VTXSTATE = BIT(9), - FD_DIRTY_VTXBUF = BIT(10), - FD_DIRTY_MIN_SAMPLES = BIT(11), - FD_DIRTY_SCISSOR = BIT(12), - FD_DIRTY_STREAMOUT = BIT(13), - FD_DIRTY_UCP = BIT(14), - FD_DIRTY_PROG = BIT(15), - FD_DIRTY_CONST = BIT(16), - FD_DIRTY_TEX = BIT(17), - FD_DIRTY_IMAGE = BIT(18), - FD_DIRTY_SSBO = BIT(19), - - /* only used by a2xx.. possibly can be removed.. */ - FD_DIRTY_TEXSTATE = BIT(20), - - /* fine grained state changes, for cases where state is not orthogonal - * from hw perspective: - */ - FD_DIRTY_RASTERIZER_DISCARD = BIT(24), - FD_DIRTY_BLEND_DUAL = BIT(25), + FD_DIRTY_BLEND = BIT(0), + FD_DIRTY_RASTERIZER = BIT(1), + FD_DIRTY_ZSA = BIT(2), + FD_DIRTY_BLEND_COLOR = BIT(3), + FD_DIRTY_STENCIL_REF = BIT(4), + FD_DIRTY_SAMPLE_MASK = BIT(5), + FD_DIRTY_FRAMEBUFFER = BIT(6), + FD_DIRTY_STIPPLE = BIT(7), + FD_DIRTY_VIEWPORT = BIT(8), + FD_DIRTY_VTXSTATE = BIT(9), + FD_DIRTY_VTXBUF = BIT(10), + FD_DIRTY_MIN_SAMPLES = BIT(11), + FD_DIRTY_SCISSOR = BIT(12), + FD_DIRTY_STREAMOUT = BIT(13), + FD_DIRTY_UCP = BIT(14), + FD_DIRTY_PROG = BIT(15), + FD_DIRTY_CONST = BIT(16), + FD_DIRTY_TEX = BIT(17), + FD_DIRTY_IMAGE = BIT(18), + FD_DIRTY_SSBO = BIT(19), + + /* only used by a2xx.. possibly can be removed.. */ + FD_DIRTY_TEXSTATE = BIT(20), + + /* fine grained state changes, for cases where state is not orthogonal + * from hw perspective: + */ + FD_DIRTY_RASTERIZER_DISCARD = BIT(24), + FD_DIRTY_BLEND_DUAL = BIT(25), #define NUM_DIRTY_BITS 26 - /* additional flag for state requires updated resource tracking: */ - FD_DIRTY_RESOURCE = BIT(31), + /* additional flag for state requires updated resource tracking: */ + FD_DIRTY_RESOURCE = BIT(31), }; /* per shader-stage dirty state: */ enum fd_dirty_shader_state { - FD_DIRTY_SHADER_PROG = BIT(0), - FD_DIRTY_SHADER_CONST = BIT(1), - FD_DIRTY_SHADER_TEX = BIT(2), - FD_DIRTY_SHADER_SSBO = BIT(3), - FD_DIRTY_SHADER_IMAGE = BIT(4), + FD_DIRTY_SHADER_PROG = BIT(0), + FD_DIRTY_SHADER_CONST = BIT(1), + FD_DIRTY_SHADER_TEX = BIT(2), + FD_DIRTY_SHADER_SSBO = BIT(3), + FD_DIRTY_SHADER_IMAGE = BIT(4), #define NUM_DIRTY_SHADER_BITS 5 }; @@ -193,350 +194,362 @@ struct fd_hw_sample; struct ir3_shader_key; struct fd_context { - struct pipe_context base; - - struct threaded_context *tc; - - struct list_head node; /* node in screen->context_list */ - - /* We currently need to serialize emitting GMEM batches, because of - * VSC state access in the context. 
- * - * In practice this lock should not be contended, since pipe_context - * use should be single threaded. But it is needed to protect the - * case, with batch reordering where a ctxB batch triggers flushing - * a ctxA batch - */ - simple_mtx_t gmem_lock; - - struct fd_device *dev; - struct fd_screen *screen; - struct fd_pipe *pipe; - - struct blitter_context *blitter dt; - void *clear_rs_state[2] dt; - struct primconvert_context *primconvert dt; - - /* slab for pipe_transfer allocations: */ - struct slab_child_pool transfer_pool dt; - struct slab_child_pool transfer_pool_unsync; /* for threaded_context */ - - struct fd_autotune autotune dt; - - /** - * query related state: - */ - /*@{*/ - /* slabs for fd_hw_sample and fd_hw_sample_period allocations: */ - struct slab_mempool sample_pool dt; - struct slab_mempool sample_period_pool dt; - - /* sample-providers for hw queries: */ - const struct fd_hw_sample_provider *hw_sample_providers[MAX_HW_SAMPLE_PROVIDERS]; - - /* list of active queries: */ - struct list_head hw_active_queries dt; - - /* sample-providers for accumulating hw queries: */ - const struct fd_acc_sample_provider *acc_sample_providers[MAX_HW_SAMPLE_PROVIDERS]; - - /* list of active accumulating queries: */ - struct list_head acc_active_queries dt; - /*@}*/ - - /* Whether we need to recheck the active_queries list next - * fd_batch_update_queries(). - */ - bool update_active_queries dt; - - /* Current state of pctx->set_active_query_state() (i.e. "should drawing - * be counted against non-perfcounter queries") - */ - bool active_queries dt; - - /* table with PIPE_PRIM_MAX entries mapping PIPE_PRIM_x to - * DI_PT_x value to use for draw initiator. There are some - * slight differences between generation: - */ - const uint8_t *primtypes; - uint32_t primtype_mask; - - /* shaders used by clear, and gmem->mem blits: */ - struct fd_program_stateobj solid_prog; // TODO move to screen? - struct fd_program_stateobj solid_layered_prog; - - /* shaders used by mem->gmem blits: */ - struct fd_program_stateobj blit_prog[MAX_RENDER_TARGETS]; // TODO move to screen? - struct fd_program_stateobj blit_z, blit_zs; - - /* Stats/counters: - */ - struct { - uint64_t prims_emitted; - uint64_t prims_generated; - uint64_t draw_calls; - uint64_t batch_total, batch_sysmem, batch_gmem, batch_nondraw, batch_restore; - uint64_t staging_uploads, shadow_uploads; - uint64_t vs_regs, hs_regs, ds_regs, gs_regs, fs_regs; - } stats dt; - - /* Counter for number of users who need sw counters (so we can - * skip collecting them when not needed) - */ - unsigned stats_users; - - /* Current batch.. the rule here is that you can deref ctx->batch - * in codepaths from pipe_context entrypoints. But not in code- - * paths from fd_batch_flush() (basically, the stuff that gets - * called from GMEM code), since in those code-paths the batch - * you care about is not necessarily the same as ctx->batch. - */ - struct fd_batch *batch dt; - - /* NULL if there has been rendering since last flush. Otherwise - * keeps a reference to the last fence so we can re-use it rather - * than having to flush no-op batch. - */ - struct pipe_fence_handle *last_fence dt; - - /* Fence fd we are told to wait on via ->fence_server_sync() (or -1 - * if none). The in-fence is transferred over to the batch on the - * next draw/blit/grid. - * - * The reason for this extra complexity is that apps will typically - * do eglWaitSyncKHR()/etc at the beginning of the frame, before the - * first draw. 
But mesa/st doesn't flush down framebuffer state - * change until we hit a draw, so at ->fence_server_sync() time, we - * don't yet have the correct batch. If we created a batch at that - * point, it would be the wrong one, and we'd have to flush it pre- - * maturely, causing us to stall early in the frame where we could - * be building up cmdstream. - */ - int in_fence_fd dt; - - /* track last known reset status globally and per-context to - * determine if more resets occurred since then. If global reset - * count increases, it means some other context crashed. If - * per-context reset count increases, it means we crashed the - * gpu. - */ - uint32_t context_reset_count dt; - uint32_t global_reset_count dt; - - /* Context sequence #, used for batch-cache key: */ - uint16_t seqno; - - /* Cost per draw, used in conjunction with samples-passed history to - * estimate whether GMEM or bypass is the better option. - */ - uint8_t draw_cost; - - /* Are we in process of shadowing a resource? Used to detect recursion - * in transfer_map, and skip unneeded synchronization. - */ - bool in_shadow : 1 dt; - - /* Ie. in blit situation where we no longer care about previous framebuffer - * contents. Main point is to eliminate blits from fd_try_shadow_resource(). - * For example, in case of texture upload + gen-mipmaps. - */ - bool in_discard_blit : 1 dt; - - /* points to either scissor or disabled_scissor depending on rast state: */ - struct pipe_scissor_state *current_scissor dt; - - struct pipe_scissor_state scissor dt; - - /* we don't have a disable/enable bit for scissor, so instead we keep - * a disabled-scissor state which matches the entire bound framebuffer - * and use that when scissor is not enabled. - */ - struct pipe_scissor_state disabled_scissor dt; - - /* Per vsc pipe bo's (a2xx-a5xx): */ - struct fd_bo *vsc_pipe_bo[32] dt; - - /* Maps generic gallium oriented fd_dirty_3d_state bits to generation - * specific bitmask of state "groups". 
- */ - uint32_t gen_dirty_map[NUM_DIRTY_BITS]; - uint32_t gen_dirty_shader_map[PIPE_SHADER_TYPES][NUM_DIRTY_SHADER_BITS]; - - /* Bitmask of all possible gen_dirty bits: */ - uint32_t gen_all_dirty; - - /* Generation specific bitmask of dirty state groups: */ - uint32_t gen_dirty; - - /* which state objects need to be re-emit'd: */ - enum fd_dirty_3d_state dirty dt; - - /* per shader-stage dirty status: */ - enum fd_dirty_shader_state dirty_shader[PIPE_SHADER_TYPES] dt; - - void *compute dt; - struct pipe_blend_state *blend dt; - struct pipe_rasterizer_state *rasterizer dt; - struct pipe_depth_stencil_alpha_state *zsa dt; - - struct fd_texture_stateobj tex[PIPE_SHADER_TYPES] dt; - - struct fd_program_stateobj prog dt; - uint32_t bound_shader_stages dt; - - struct fd_vertex_state vtx dt; - - struct pipe_blend_color blend_color dt; - struct pipe_stencil_ref stencil_ref dt; - unsigned sample_mask dt; - unsigned min_samples dt; - /* local context fb state, for when ctx->batch is null: */ - struct pipe_framebuffer_state framebuffer dt; - struct pipe_poly_stipple stipple dt; - struct pipe_viewport_state viewport dt; - struct pipe_scissor_state viewport_scissor dt; - struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES] dt; - struct fd_shaderbuf_stateobj shaderbuf[PIPE_SHADER_TYPES] dt; - struct fd_shaderimg_stateobj shaderimg[PIPE_SHADER_TYPES] dt; - struct fd_streamout_stateobj streamout dt; - struct fd_global_bindings_stateobj global_bindings dt; - struct pipe_clip_state ucp dt; - - struct pipe_query *cond_query dt; - bool cond_cond dt; /* inverted rendering condition */ - uint cond_mode dt; - - /* Private memory is a memory space where each fiber gets its own piece of - * memory, in addition to registers. It is backed by a buffer which needs - * to be large enough to hold the contents of every possible wavefront in - * every core of the GPU. Because it allocates space via the internal - * wavefront ID which is shared between all currently executing shaders, - * the same buffer can be reused by all shaders, as long as all shaders - * sharing the same buffer use the exact same configuration. There are two - * inputs to the configuration, the amount of per-fiber space and whether - * to use the newer per-wave or older per-fiber layout. We only ever - * increase the size, and shaders with a smaller size requirement simply - * use the larger existing buffer, so that we only need to keep track of - * one buffer and its size, but we still need to keep track of per-fiber - * and per-wave buffers separately so that we never use the same buffer - * for different layouts. pvtmem[0] is for per-fiber, and pvtmem[1] is for - * per-wave. 
- */ - struct { - struct fd_bo *bo; - uint32_t per_fiber_size; - } pvtmem[2] dt; - - /* maps per-shader-stage state plus variant key to hw - * program stateobj: - */ - struct ir3_cache *shader_cache; - - struct pipe_debug_callback debug; - - struct u_trace_context trace_context dt; - - /* Called on rebind_resource() for any per-gen cleanup required: */ - void (*rebind_resource)(struct fd_context *ctx, struct fd_resource *rsc) dt; - - /* GMEM/tile handling fxns: */ - void (*emit_tile_init)(struct fd_batch *batch) dt; - void (*emit_tile_prep)(struct fd_batch *batch, const struct fd_tile *tile) dt; - void (*emit_tile_mem2gmem)(struct fd_batch *batch, const struct fd_tile *tile) dt; - void (*emit_tile_renderprep)(struct fd_batch *batch, const struct fd_tile *tile) dt; - void (*emit_tile)(struct fd_batch *batch, const struct fd_tile *tile) dt; - void (*emit_tile_gmem2mem)(struct fd_batch *batch, const struct fd_tile *tile) dt; - void (*emit_tile_fini)(struct fd_batch *batch) dt; /* optional */ - - /* optional, for GMEM bypass: */ - void (*emit_sysmem_prep)(struct fd_batch *batch) dt; - void (*emit_sysmem_fini)(struct fd_batch *batch) dt; - - /* draw: */ - bool (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draw, - unsigned index_offset) dt; - bool (*clear)(struct fd_context *ctx, unsigned buffers, - const union pipe_color_union *color, double depth, unsigned stencil) dt; - - /* compute: */ - void (*launch_grid)(struct fd_context *ctx, const struct pipe_grid_info *info) dt; - - /* query: */ - struct fd_query * (*create_query)(struct fd_context *ctx, unsigned query_type, unsigned index); - void (*query_prepare)(struct fd_batch *batch, uint32_t num_tiles) dt; - void (*query_prepare_tile)(struct fd_batch *batch, uint32_t n, - struct fd_ringbuffer *ring) dt; - void (*query_update_batch)(struct fd_batch *batch, bool disable_all) dt; - - /* blitter: */ - bool (*blit)(struct fd_context *ctx, const struct pipe_blit_info *info) dt; - void (*clear_ubwc)(struct fd_batch *batch, struct fd_resource *rsc) dt; - - /* handling for barriers: */ - void (*framebuffer_barrier)(struct fd_context *ctx) dt; - - /* logger: */ - void (*record_timestamp)(struct fd_ringbuffer *ring, struct fd_bo *bo, unsigned offset); - uint64_t (*ts_to_ns)(uint64_t ts); - - /* - * Common pre-cooked VBO state (used for a3xx and later): - */ - - /* for clear/gmem->mem vertices, and mem->gmem */ - struct pipe_resource *solid_vbuf; - - /* for mem->gmem tex coords: */ - struct pipe_resource *blit_texcoord_vbuf; - - /* vertex state for solid_vbuf: - * - solid_vbuf / 12 / R32G32B32_FLOAT - */ - struct fd_vertex_state solid_vbuf_state; - - /* vertex state for blit_prog: - * - blit_texcoord_vbuf / 8 / R32G32_FLOAT - * - solid_vbuf / 12 / R32G32B32_FLOAT - */ - struct fd_vertex_state blit_vbuf_state; - - /* - * Info about state of previous draw, for state that comes from - * pipe_draw_info (ie. not part of a CSO). This allows us to - * skip some register emit when the state doesn't change from - * draw-to-draw - */ - struct { - bool dirty; /* last draw state unknown */ - bool primitive_restart; - uint32_t index_start; - uint32_t instance_start; - uint32_t restart_index; - uint32_t streamout_mask; - - /* some state changes require a different shader variant. Keep - * track of this so we know when we need to re-emit shader state - * due to variant change. 
See ir3_fixup_shader_state() - * - * (used for a3xx+, NULL otherwise) - */ - struct ir3_shader_key *key; - - } last dt; + struct pipe_context base; + + struct threaded_context *tc; + + struct list_head node; /* node in screen->context_list */ + + /* We currently need to serialize emitting GMEM batches, because of + * VSC state access in the context. + * + * In practice this lock should not be contended, since pipe_context + * use should be single threaded. But it is needed to protect the + * case, with batch reordering where a ctxB batch triggers flushing + * a ctxA batch + */ + simple_mtx_t gmem_lock; + + struct fd_device *dev; + struct fd_screen *screen; + struct fd_pipe *pipe; + + struct blitter_context *blitter dt; + void *clear_rs_state[2] dt; + struct primconvert_context *primconvert dt; + + /* slab for pipe_transfer allocations: */ + struct slab_child_pool transfer_pool dt; + struct slab_child_pool transfer_pool_unsync; /* for threaded_context */ + + struct fd_autotune autotune dt; + + /** + * query related state: + */ + /*@{*/ + /* slabs for fd_hw_sample and fd_hw_sample_period allocations: */ + struct slab_mempool sample_pool dt; + struct slab_mempool sample_period_pool dt; + + /* sample-providers for hw queries: */ + const struct fd_hw_sample_provider + *hw_sample_providers[MAX_HW_SAMPLE_PROVIDERS]; + + /* list of active queries: */ + struct list_head hw_active_queries dt; + + /* sample-providers for accumulating hw queries: */ + const struct fd_acc_sample_provider + *acc_sample_providers[MAX_HW_SAMPLE_PROVIDERS]; + + /* list of active accumulating queries: */ + struct list_head acc_active_queries dt; + /*@}*/ + + /* Whether we need to recheck the active_queries list next + * fd_batch_update_queries(). + */ + bool update_active_queries dt; + + /* Current state of pctx->set_active_query_state() (i.e. "should drawing + * be counted against non-perfcounter queries") + */ + bool active_queries dt; + + /* table with PIPE_PRIM_MAX entries mapping PIPE_PRIM_x to + * DI_PT_x value to use for draw initiator. There are some + * slight differences between generation: + */ + const uint8_t *primtypes; + uint32_t primtype_mask; + + /* shaders used by clear, and gmem->mem blits: */ + struct fd_program_stateobj solid_prog; // TODO move to screen? + struct fd_program_stateobj solid_layered_prog; + + /* shaders used by mem->gmem blits: */ + struct fd_program_stateobj + blit_prog[MAX_RENDER_TARGETS]; // TODO move to screen? + struct fd_program_stateobj blit_z, blit_zs; + + /* Stats/counters: + */ + struct { + uint64_t prims_emitted; + uint64_t prims_generated; + uint64_t draw_calls; + uint64_t batch_total, batch_sysmem, batch_gmem, batch_nondraw, + batch_restore; + uint64_t staging_uploads, shadow_uploads; + uint64_t vs_regs, hs_regs, ds_regs, gs_regs, fs_regs; + } stats dt; + + /* Counter for number of users who need sw counters (so we can + * skip collecting them when not needed) + */ + unsigned stats_users; + + /* Current batch.. the rule here is that you can deref ctx->batch + * in codepaths from pipe_context entrypoints. But not in code- + * paths from fd_batch_flush() (basically, the stuff that gets + * called from GMEM code), since in those code-paths the batch + * you care about is not necessarily the same as ctx->batch. + */ + struct fd_batch *batch dt; + + /* NULL if there has been rendering since last flush. Otherwise + * keeps a reference to the last fence so we can re-use it rather + * than having to flush no-op batch. 
+ */ + struct pipe_fence_handle *last_fence dt; + + /* Fence fd we are told to wait on via ->fence_server_sync() (or -1 + * if none). The in-fence is transferred over to the batch on the + * next draw/blit/grid. + * + * The reason for this extra complexity is that apps will typically + * do eglWaitSyncKHR()/etc at the beginning of the frame, before the + * first draw. But mesa/st doesn't flush down framebuffer state + * change until we hit a draw, so at ->fence_server_sync() time, we + * don't yet have the correct batch. If we created a batch at that + * point, it would be the wrong one, and we'd have to flush it pre- + * maturely, causing us to stall early in the frame where we could + * be building up cmdstream. + */ + int in_fence_fd dt; + + /* track last known reset status globally and per-context to + * determine if more resets occurred since then. If global reset + * count increases, it means some other context crashed. If + * per-context reset count increases, it means we crashed the + * gpu. + */ + uint32_t context_reset_count dt; + uint32_t global_reset_count dt; + + /* Context sequence #, used for batch-cache key: */ + uint16_t seqno; + + /* Cost per draw, used in conjunction with samples-passed history to + * estimate whether GMEM or bypass is the better option. + */ + uint8_t draw_cost; + + /* Are we in process of shadowing a resource? Used to detect recursion + * in transfer_map, and skip unneeded synchronization. + */ + bool in_shadow : 1 dt; + + /* Ie. in blit situation where we no longer care about previous framebuffer + * contents. Main point is to eliminate blits from fd_try_shadow_resource(). + * For example, in case of texture upload + gen-mipmaps. + */ + bool in_discard_blit : 1 dt; + + /* points to either scissor or disabled_scissor depending on rast state: */ + struct pipe_scissor_state *current_scissor dt; + + struct pipe_scissor_state scissor dt; + + /* we don't have a disable/enable bit for scissor, so instead we keep + * a disabled-scissor state which matches the entire bound framebuffer + * and use that when scissor is not enabled. + */ + struct pipe_scissor_state disabled_scissor dt; + + /* Per vsc pipe bo's (a2xx-a5xx): */ + struct fd_bo *vsc_pipe_bo[32] dt; + + /* Maps generic gallium oriented fd_dirty_3d_state bits to generation + * specific bitmask of state "groups". 
+ */ + uint32_t gen_dirty_map[NUM_DIRTY_BITS]; + uint32_t gen_dirty_shader_map[PIPE_SHADER_TYPES][NUM_DIRTY_SHADER_BITS]; + + /* Bitmask of all possible gen_dirty bits: */ + uint32_t gen_all_dirty; + + /* Generation specific bitmask of dirty state groups: */ + uint32_t gen_dirty; + + /* which state objects need to be re-emit'd: */ + enum fd_dirty_3d_state dirty dt; + + /* per shader-stage dirty status: */ + enum fd_dirty_shader_state dirty_shader[PIPE_SHADER_TYPES] dt; + + void *compute dt; + struct pipe_blend_state *blend dt; + struct pipe_rasterizer_state *rasterizer dt; + struct pipe_depth_stencil_alpha_state *zsa dt; + + struct fd_texture_stateobj tex[PIPE_SHADER_TYPES] dt; + + struct fd_program_stateobj prog dt; + uint32_t bound_shader_stages dt; + + struct fd_vertex_state vtx dt; + + struct pipe_blend_color blend_color dt; + struct pipe_stencil_ref stencil_ref dt; + unsigned sample_mask dt; + unsigned min_samples dt; + /* local context fb state, for when ctx->batch is null: */ + struct pipe_framebuffer_state framebuffer dt; + struct pipe_poly_stipple stipple dt; + struct pipe_viewport_state viewport dt; + struct pipe_scissor_state viewport_scissor dt; + struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES] dt; + struct fd_shaderbuf_stateobj shaderbuf[PIPE_SHADER_TYPES] dt; + struct fd_shaderimg_stateobj shaderimg[PIPE_SHADER_TYPES] dt; + struct fd_streamout_stateobj streamout dt; + struct fd_global_bindings_stateobj global_bindings dt; + struct pipe_clip_state ucp dt; + + struct pipe_query *cond_query dt; + bool cond_cond dt; /* inverted rendering condition */ + uint cond_mode dt; + + /* Private memory is a memory space where each fiber gets its own piece of + * memory, in addition to registers. It is backed by a buffer which needs + * to be large enough to hold the contents of every possible wavefront in + * every core of the GPU. Because it allocates space via the internal + * wavefront ID which is shared between all currently executing shaders, + * the same buffer can be reused by all shaders, as long as all shaders + * sharing the same buffer use the exact same configuration. There are two + * inputs to the configuration, the amount of per-fiber space and whether + * to use the newer per-wave or older per-fiber layout. We only ever + * increase the size, and shaders with a smaller size requirement simply + * use the larger existing buffer, so that we only need to keep track of + * one buffer and its size, but we still need to keep track of per-fiber + * and per-wave buffers separately so that we never use the same buffer + * for different layouts. pvtmem[0] is for per-fiber, and pvtmem[1] is for + * per-wave. 
+ */ + struct { + struct fd_bo *bo; + uint32_t per_fiber_size; + } pvtmem[2] dt; + + /* maps per-shader-stage state plus variant key to hw + * program stateobj: + */ + struct ir3_cache *shader_cache; + + struct pipe_debug_callback debug; + + struct u_trace_context trace_context dt; + + /* Called on rebind_resource() for any per-gen cleanup required: */ + void (*rebind_resource)(struct fd_context *ctx, struct fd_resource *rsc) dt; + + /* GMEM/tile handling fxns: */ + void (*emit_tile_init)(struct fd_batch *batch) dt; + void (*emit_tile_prep)(struct fd_batch *batch, + const struct fd_tile *tile) dt; + void (*emit_tile_mem2gmem)(struct fd_batch *batch, + const struct fd_tile *tile) dt; + void (*emit_tile_renderprep)(struct fd_batch *batch, + const struct fd_tile *tile) dt; + void (*emit_tile)(struct fd_batch *batch, const struct fd_tile *tile) dt; + void (*emit_tile_gmem2mem)(struct fd_batch *batch, + const struct fd_tile *tile) dt; + void (*emit_tile_fini)(struct fd_batch *batch) dt; /* optional */ + + /* optional, for GMEM bypass: */ + void (*emit_sysmem_prep)(struct fd_batch *batch) dt; + void (*emit_sysmem_fini)(struct fd_batch *batch) dt; + + /* draw: */ + bool (*draw_vbo)(struct fd_context *ctx, const struct pipe_draw_info *info, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count *draw, + unsigned index_offset) dt; + bool (*clear)(struct fd_context *ctx, unsigned buffers, + const union pipe_color_union *color, double depth, + unsigned stencil) dt; + + /* compute: */ + void (*launch_grid)(struct fd_context *ctx, + const struct pipe_grid_info *info) dt; + + /* query: */ + struct fd_query *(*create_query)(struct fd_context *ctx, unsigned query_type, + unsigned index); + void (*query_prepare)(struct fd_batch *batch, uint32_t num_tiles) dt; + void (*query_prepare_tile)(struct fd_batch *batch, uint32_t n, + struct fd_ringbuffer *ring) dt; + void (*query_update_batch)(struct fd_batch *batch, bool disable_all) dt; + + /* blitter: */ + bool (*blit)(struct fd_context *ctx, const struct pipe_blit_info *info) dt; + void (*clear_ubwc)(struct fd_batch *batch, struct fd_resource *rsc) dt; + + /* handling for barriers: */ + void (*framebuffer_barrier)(struct fd_context *ctx) dt; + + /* logger: */ + void (*record_timestamp)(struct fd_ringbuffer *ring, struct fd_bo *bo, + unsigned offset); + uint64_t (*ts_to_ns)(uint64_t ts); + + /* + * Common pre-cooked VBO state (used for a3xx and later): + */ + + /* for clear/gmem->mem vertices, and mem->gmem */ + struct pipe_resource *solid_vbuf; + + /* for mem->gmem tex coords: */ + struct pipe_resource *blit_texcoord_vbuf; + + /* vertex state for solid_vbuf: + * - solid_vbuf / 12 / R32G32B32_FLOAT + */ + struct fd_vertex_state solid_vbuf_state; + + /* vertex state for blit_prog: + * - blit_texcoord_vbuf / 8 / R32G32_FLOAT + * - solid_vbuf / 12 / R32G32B32_FLOAT + */ + struct fd_vertex_state blit_vbuf_state; + + /* + * Info about state of previous draw, for state that comes from + * pipe_draw_info (ie. not part of a CSO). This allows us to + * skip some register emit when the state doesn't change from + * draw-to-draw + */ + struct { + bool dirty; /* last draw state unknown */ + bool primitive_restart; + uint32_t index_start; + uint32_t instance_start; + uint32_t restart_index; + uint32_t streamout_mask; + + /* some state changes require a different shader variant. Keep + * track of this so we know when we need to re-emit shader state + * due to variant change. 
See ir3_fixup_shader_state() + * + * (used for a3xx+, NULL otherwise) + */ + struct ir3_shader_key *key; + + } last dt; }; static inline struct fd_context * fd_context(struct pipe_context *pctx) { - return (struct fd_context *)pctx; + return (struct fd_context *)pctx; } static inline struct fd_stream_output_target * fd_stream_output_target(struct pipe_stream_output_target *target) { - return (struct fd_stream_output_target *)target; + return (struct fd_stream_output_target *)target; } /** @@ -559,90 +572,83 @@ fd_stream_output_target(struct pipe_stream_output_target *target) static inline bool fd_context_dirty_resource(enum fd_dirty_3d_state dirty) { - return dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_ZSA | - FD_DIRTY_BLEND | FD_DIRTY_SSBO | FD_DIRTY_IMAGE | - FD_DIRTY_VTXBUF | FD_DIRTY_TEX | FD_DIRTY_STREAMOUT); + return dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_ZSA | FD_DIRTY_BLEND | + FD_DIRTY_SSBO | FD_DIRTY_IMAGE | FD_DIRTY_VTXBUF | + FD_DIRTY_TEX | FD_DIRTY_STREAMOUT); } /* Mark specified non-shader-stage related state as dirty: */ static inline void -fd_context_dirty(struct fd_context *ctx, enum fd_dirty_3d_state dirty) - assert_dt +fd_context_dirty(struct fd_context *ctx, enum fd_dirty_3d_state dirty) assert_dt { - assert(util_is_power_of_two_nonzero(dirty)); - STATIC_ASSERT(ffs(dirty) <= ARRAY_SIZE(ctx->gen_dirty_map)); + assert(util_is_power_of_two_nonzero(dirty)); + STATIC_ASSERT(ffs(dirty) <= ARRAY_SIZE(ctx->gen_dirty_map)); - ctx->gen_dirty |= ctx->gen_dirty_map[ffs(dirty) - 1]; + ctx->gen_dirty |= ctx->gen_dirty_map[ffs(dirty) - 1]; - if (fd_context_dirty_resource(dirty)) - dirty |= FD_DIRTY_RESOURCE; + if (fd_context_dirty_resource(dirty)) + dirty |= FD_DIRTY_RESOURCE; - ctx->dirty |= dirty; + ctx->dirty |= dirty; } static inline void fd_context_dirty_shader(struct fd_context *ctx, enum pipe_shader_type shader, - enum fd_dirty_shader_state dirty) - assert_dt + enum fd_dirty_shader_state dirty) assert_dt { - const enum fd_dirty_3d_state map[] = { - FD_DIRTY_PROG, - FD_DIRTY_CONST, - FD_DIRTY_TEX, - FD_DIRTY_SSBO, - FD_DIRTY_IMAGE, - }; - - /* Need to update the table above if these shift: */ - STATIC_ASSERT(FD_DIRTY_SHADER_PROG == BIT(0)); - STATIC_ASSERT(FD_DIRTY_SHADER_CONST == BIT(1)); - STATIC_ASSERT(FD_DIRTY_SHADER_TEX == BIT(2)); - STATIC_ASSERT(FD_DIRTY_SHADER_SSBO == BIT(3)); - STATIC_ASSERT(FD_DIRTY_SHADER_IMAGE == BIT(4)); - - assert(util_is_power_of_two_nonzero(dirty)); - assert(ffs(dirty) <= ARRAY_SIZE(map)); - - ctx->gen_dirty |= ctx->gen_dirty_shader_map[shader][ffs(dirty) - 1]; - - ctx->dirty_shader[shader] |= dirty; - fd_context_dirty(ctx, map[ffs(dirty) - 1]); + const enum fd_dirty_3d_state map[] = { + FD_DIRTY_PROG, FD_DIRTY_CONST, FD_DIRTY_TEX, + FD_DIRTY_SSBO, FD_DIRTY_IMAGE, + }; + + /* Need to update the table above if these shift: */ + STATIC_ASSERT(FD_DIRTY_SHADER_PROG == BIT(0)); + STATIC_ASSERT(FD_DIRTY_SHADER_CONST == BIT(1)); + STATIC_ASSERT(FD_DIRTY_SHADER_TEX == BIT(2)); + STATIC_ASSERT(FD_DIRTY_SHADER_SSBO == BIT(3)); + STATIC_ASSERT(FD_DIRTY_SHADER_IMAGE == BIT(4)); + + assert(util_is_power_of_two_nonzero(dirty)); + assert(ffs(dirty) <= ARRAY_SIZE(map)); + + ctx->gen_dirty |= ctx->gen_dirty_shader_map[shader][ffs(dirty) - 1]; + + ctx->dirty_shader[shader] |= dirty; + fd_context_dirty(ctx, map[ffs(dirty) - 1]); } /* mark all state dirty: */ static inline void -fd_context_all_dirty(struct fd_context *ctx) - assert_dt +fd_context_all_dirty(struct fd_context *ctx) assert_dt { - ctx->last.dirty = true; - ctx->dirty = ~0; + ctx->last.dirty = true; + 
ctx->dirty = ~0; - /* NOTE: don't use ~0 for gen_dirty, because the gen specific - * emit code will loop over all the bits: - */ - ctx->gen_dirty = ctx->gen_all_dirty; + /* NOTE: don't use ~0 for gen_dirty, because the gen specific + * emit code will loop over all the bits: + */ + ctx->gen_dirty = ctx->gen_all_dirty; - for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) - ctx->dirty_shader[i] = ~0; + for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) + ctx->dirty_shader[i] = ~0; } static inline void -fd_context_all_clean(struct fd_context *ctx) - assert_dt +fd_context_all_clean(struct fd_context *ctx) assert_dt { - ctx->last.dirty = false; - ctx->dirty = 0; - ctx->gen_dirty = 0; - for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { - /* don't mark compute state as clean, since it is not emitted - * during normal draw call. The places that call _all_dirty(), - * it is safe to mark compute state dirty as well, but the - * inverse is not true. - */ - if (i == PIPE_SHADER_COMPUTE) - continue; - ctx->dirty_shader[i] = 0; - } + ctx->last.dirty = false; + ctx->dirty = 0; + ctx->gen_dirty = 0; + for (unsigned i = 0; i < PIPE_SHADER_TYPES; i++) { + /* don't mark compute state as clean, since it is not emitted + * during normal draw call. The places that call _all_dirty(), + * it is safe to mark compute state dirty as well, but the + * inverse is not true. + */ + if (i == PIPE_SHADER_COMPUTE) + continue; + ctx->dirty_shader[i] = 0; + } } /** @@ -651,12 +657,12 @@ fd_context_all_clean(struct fd_context *ctx) */ static inline void fd_context_add_map(struct fd_context *ctx, enum fd_dirty_3d_state dirty, - uint32_t gen_dirty) + uint32_t gen_dirty) { - u_foreach_bit (b, dirty) { - ctx->gen_dirty_map[b] |= gen_dirty; - } - ctx->gen_all_dirty |= gen_dirty; + u_foreach_bit (b, dirty) { + ctx->gen_dirty_map[b] |= gen_dirty; + } + ctx->gen_all_dirty |= gen_dirty; } /** @@ -665,41 +671,43 @@ fd_context_add_map(struct fd_context *ctx, enum fd_dirty_3d_state dirty, */ static inline void fd_context_add_shader_map(struct fd_context *ctx, enum pipe_shader_type shader, - enum fd_dirty_shader_state dirty, uint32_t gen_dirty) + enum fd_dirty_shader_state dirty, uint32_t gen_dirty) { - u_foreach_bit (b, dirty) { - ctx->gen_dirty_shader_map[shader][b] |= gen_dirty; - } - ctx->gen_all_dirty |= gen_dirty; + u_foreach_bit (b, dirty) { + ctx->gen_dirty_shader_map[shader][b] |= gen_dirty; + } + ctx->gen_all_dirty |= gen_dirty; } static inline struct pipe_scissor_state * -fd_context_get_scissor(struct fd_context *ctx) - assert_dt +fd_context_get_scissor(struct fd_context *ctx) assert_dt { - return ctx->current_scissor; + return ctx->current_scissor; } static inline bool fd_supported_prim(struct fd_context *ctx, unsigned prim) { - return (1 << prim) & ctx->primtype_mask; + return (1 << prim) & ctx->primtype_mask; } void fd_context_switch_from(struct fd_context *ctx) assert_dt; -void fd_context_switch_to(struct fd_context *ctx, struct fd_batch *batch) assert_dt; -struct fd_batch * fd_context_batch(struct fd_context *ctx) assert_dt; -struct fd_batch * fd_context_batch_locked(struct fd_context *ctx) assert_dt; +void fd_context_switch_to(struct fd_context *ctx, + struct fd_batch *batch) assert_dt; +struct fd_batch *fd_context_batch(struct fd_context *ctx) assert_dt; +struct fd_batch *fd_context_batch_locked(struct fd_context *ctx) assert_dt; void fd_context_setup_common_vbos(struct fd_context *ctx); void fd_context_cleanup_common_vbos(struct fd_context *ctx); void fd_emit_string(struct fd_ringbuffer *ring, const char *string, int len); 
void fd_emit_string5(struct fd_ringbuffer *ring, const char *string, int len); -struct pipe_context * fd_context_init(struct fd_context *ctx, - struct pipe_screen *pscreen, const uint8_t *primtypes, - void *priv, unsigned flags); -struct pipe_context * fd_context_init_tc(struct pipe_context *pctx, unsigned flags); +struct pipe_context *fd_context_init(struct fd_context *ctx, + struct pipe_screen *pscreen, + const uint8_t *primtypes, void *priv, + unsigned flags); +struct pipe_context *fd_context_init_tc(struct pipe_context *pctx, + unsigned flags); void fd_context_destroy(struct pipe_context *pctx) assert_dt; diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index 399bd1a..9342f3e 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -25,578 +25,575 @@ */ #include "pipe/p_state.h" +#include "util/format/u_format.h" #include "util/u_draw.h" -#include "util/u_string.h" +#include "util/u_helpers.h" #include "util/u_memory.h" #include "util/u_prim.h" -#include "util/format/u_format.h" -#include "util/u_helpers.h" +#include "util/u_string.h" #include "freedreno_blitter.h" -#include "freedreno_draw.h" #include "freedreno_context.h" +#include "freedreno_draw.h" #include "freedreno_fence.h" -#include "freedreno_state.h" -#include "freedreno_resource.h" #include "freedreno_query_acc.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" +#include "freedreno_state.h" #include "freedreno_util.h" static void -resource_read(struct fd_batch *batch, struct pipe_resource *prsc) - assert_dt +resource_read(struct fd_batch *batch, struct pipe_resource *prsc) assert_dt { - if (!prsc) - return; - fd_batch_resource_read(batch, fd_resource(prsc)); + if (!prsc) + return; + fd_batch_resource_read(batch, fd_resource(prsc)); } static void -resource_written(struct fd_batch *batch, struct pipe_resource *prsc) - assert_dt +resource_written(struct fd_batch *batch, struct pipe_resource *prsc) assert_dt { - if (!prsc) - return; - fd_batch_resource_write(batch, fd_resource(prsc)); + if (!prsc) + return; + fd_batch_resource_write(batch, fd_resource(prsc)); } static void -batch_draw_tracking_for_dirty_bits(struct fd_batch *batch) - assert_dt +batch_draw_tracking_for_dirty_bits(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - unsigned buffers = 0, restore_buffers = 0; - - if (ctx->dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_ZSA)) { - if (fd_depth_enabled(ctx)) { - if (fd_resource(pfb->zsbuf->texture)->valid) { - restore_buffers |= FD_BUFFER_DEPTH; - } else { - batch->invalidated |= FD_BUFFER_DEPTH; - } - batch->gmem_reason |= FD_GMEM_DEPTH_ENABLED; - if (fd_depth_write_enabled(ctx)) { - buffers |= FD_BUFFER_DEPTH; - resource_written(batch, pfb->zsbuf->texture); - } else { - resource_read(batch, pfb->zsbuf->texture); - } - } - - if (fd_stencil_enabled(ctx)) { - if (fd_resource(pfb->zsbuf->texture)->valid) { - restore_buffers |= FD_BUFFER_STENCIL; - } else { - batch->invalidated |= FD_BUFFER_STENCIL; - } - batch->gmem_reason |= FD_GMEM_STENCIL_ENABLED; - buffers |= FD_BUFFER_STENCIL; - resource_written(batch, pfb->zsbuf->texture); - } - } - - if (ctx->dirty & FD_DIRTY_FRAMEBUFFER) { - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - struct pipe_resource *surf; - - if (!pfb->cbufs[i]) - continue; - - surf = pfb->cbufs[i]->texture; - - if (fd_resource(surf)->valid) { - restore_buffers |= PIPE_CLEAR_COLOR0 << i; - } 
else { - batch->invalidated |= PIPE_CLEAR_COLOR0 << i; - } - - buffers |= PIPE_CLEAR_COLOR0 << i; - - if (ctx->dirty & FD_DIRTY_FRAMEBUFFER) - resource_written(batch, pfb->cbufs[i]->texture); - } - } - - if (ctx->dirty & FD_DIRTY_BLEND) { - if (ctx->blend->logicop_enable) - batch->gmem_reason |= FD_GMEM_LOGICOP_ENABLED; - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - if (ctx->blend->rt[i].blend_enable) - batch->gmem_reason |= FD_GMEM_BLEND_ENABLED; - } - } - - /* Mark SSBOs */ - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_SSBO) { - const struct fd_shaderbuf_stateobj *so = &ctx->shaderbuf[PIPE_SHADER_FRAGMENT]; - - u_foreach_bit (i, so->enabled_mask & so->writable_mask) - resource_written(batch, so->sb[i].buffer); - - u_foreach_bit (i, so->enabled_mask & ~so->writable_mask) - resource_read(batch, so->sb[i].buffer); - } - - if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_IMAGE) { - u_foreach_bit (i, ctx->shaderimg[PIPE_SHADER_FRAGMENT].enabled_mask) { - struct pipe_image_view *img = - &ctx->shaderimg[PIPE_SHADER_FRAGMENT].si[i]; - if (img->access & PIPE_IMAGE_ACCESS_WRITE) - resource_written(batch, img->resource); - else - resource_read(batch, img->resource); - } - } - - u_foreach_bit (s, ctx->bound_shader_stages) { - /* Mark constbuf as being read: */ - if (ctx->dirty_shader[s] & FD_DIRTY_SHADER_CONST) { - u_foreach_bit (i, ctx->constbuf[s].enabled_mask) - resource_read(batch, ctx->constbuf[s].cb[i].buffer); - } - - /* Mark textures as being read */ - if (ctx->dirty_shader[s] & FD_DIRTY_SHADER_TEX) { - u_foreach_bit (i, ctx->tex[s].valid_textures) - resource_read(batch, ctx->tex[s].textures[i]->texture); - } - } - - /* Mark VBOs as being read */ - if (ctx->dirty & FD_DIRTY_VTXBUF) { - u_foreach_bit (i, ctx->vtx.vertexbuf.enabled_mask) { - assert(!ctx->vtx.vertexbuf.vb[i].is_user_buffer); - resource_read(batch, ctx->vtx.vertexbuf.vb[i].buffer.resource); - } - } - - /* Mark streamout buffers as being written.. 
*/ - if (ctx->dirty & FD_DIRTY_STREAMOUT) { - for (unsigned i = 0; i < ctx->streamout.num_targets; i++) - if (ctx->streamout.targets[i]) - resource_written(batch, ctx->streamout.targets[i]->buffer); - } - - /* any buffers that haven't been cleared yet, we need to restore: */ - batch->restore |= restore_buffers & (FD_BUFFER_ALL & ~batch->invalidated); - /* and any buffers used, need to be resolved: */ - batch->resolve |= buffers; + struct fd_context *ctx = batch->ctx; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + unsigned buffers = 0, restore_buffers = 0; + + if (ctx->dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_ZSA)) { + if (fd_depth_enabled(ctx)) { + if (fd_resource(pfb->zsbuf->texture)->valid) { + restore_buffers |= FD_BUFFER_DEPTH; + } else { + batch->invalidated |= FD_BUFFER_DEPTH; + } + batch->gmem_reason |= FD_GMEM_DEPTH_ENABLED; + if (fd_depth_write_enabled(ctx)) { + buffers |= FD_BUFFER_DEPTH; + resource_written(batch, pfb->zsbuf->texture); + } else { + resource_read(batch, pfb->zsbuf->texture); + } + } + + if (fd_stencil_enabled(ctx)) { + if (fd_resource(pfb->zsbuf->texture)->valid) { + restore_buffers |= FD_BUFFER_STENCIL; + } else { + batch->invalidated |= FD_BUFFER_STENCIL; + } + batch->gmem_reason |= FD_GMEM_STENCIL_ENABLED; + buffers |= FD_BUFFER_STENCIL; + resource_written(batch, pfb->zsbuf->texture); + } + } + + if (ctx->dirty & FD_DIRTY_FRAMEBUFFER) { + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + struct pipe_resource *surf; + + if (!pfb->cbufs[i]) + continue; + + surf = pfb->cbufs[i]->texture; + + if (fd_resource(surf)->valid) { + restore_buffers |= PIPE_CLEAR_COLOR0 << i; + } else { + batch->invalidated |= PIPE_CLEAR_COLOR0 << i; + } + + buffers |= PIPE_CLEAR_COLOR0 << i; + + if (ctx->dirty & FD_DIRTY_FRAMEBUFFER) + resource_written(batch, pfb->cbufs[i]->texture); + } + } + + if (ctx->dirty & FD_DIRTY_BLEND) { + if (ctx->blend->logicop_enable) + batch->gmem_reason |= FD_GMEM_LOGICOP_ENABLED; + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (ctx->blend->rt[i].blend_enable) + batch->gmem_reason |= FD_GMEM_BLEND_ENABLED; + } + } + + /* Mark SSBOs */ + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_SSBO) { + const struct fd_shaderbuf_stateobj *so = + &ctx->shaderbuf[PIPE_SHADER_FRAGMENT]; + + u_foreach_bit (i, so->enabled_mask & so->writable_mask) + resource_written(batch, so->sb[i].buffer); + + u_foreach_bit (i, so->enabled_mask & ~so->writable_mask) + resource_read(batch, so->sb[i].buffer); + } + + if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_IMAGE) { + u_foreach_bit (i, ctx->shaderimg[PIPE_SHADER_FRAGMENT].enabled_mask) { + struct pipe_image_view *img = + &ctx->shaderimg[PIPE_SHADER_FRAGMENT].si[i]; + if (img->access & PIPE_IMAGE_ACCESS_WRITE) + resource_written(batch, img->resource); + else + resource_read(batch, img->resource); + } + } + + u_foreach_bit (s, ctx->bound_shader_stages) { + /* Mark constbuf as being read: */ + if (ctx->dirty_shader[s] & FD_DIRTY_SHADER_CONST) { + u_foreach_bit (i, ctx->constbuf[s].enabled_mask) + resource_read(batch, ctx->constbuf[s].cb[i].buffer); + } + + /* Mark textures as being read */ + if (ctx->dirty_shader[s] & FD_DIRTY_SHADER_TEX) { + u_foreach_bit (i, ctx->tex[s].valid_textures) + resource_read(batch, ctx->tex[s].textures[i]->texture); + } + } + + /* Mark VBOs as being read */ + if (ctx->dirty & FD_DIRTY_VTXBUF) { + u_foreach_bit (i, ctx->vtx.vertexbuf.enabled_mask) { + assert(!ctx->vtx.vertexbuf.vb[i].is_user_buffer); + resource_read(batch, 
ctx->vtx.vertexbuf.vb[i].buffer.resource); + } + } + + /* Mark streamout buffers as being written.. */ + if (ctx->dirty & FD_DIRTY_STREAMOUT) { + for (unsigned i = 0; i < ctx->streamout.num_targets; i++) + if (ctx->streamout.targets[i]) + resource_written(batch, ctx->streamout.targets[i]->buffer); + } + + /* any buffers that haven't been cleared yet, we need to restore: */ + batch->restore |= restore_buffers & (FD_BUFFER_ALL & ~batch->invalidated); + /* and any buffers used, need to be resolved: */ + batch->resolve |= buffers; } static void batch_draw_tracking(struct fd_batch *batch, const struct pipe_draw_info *info, - const struct pipe_draw_indirect_info *indirect) - assert_dt + const struct pipe_draw_indirect_info *indirect) assert_dt { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - /* NOTE: needs to be before resource_written(batch->query_buf), otherwise - * query_buf may not be created yet. - */ - fd_batch_update_queries(batch); + /* NOTE: needs to be before resource_written(batch->query_buf), otherwise + * query_buf may not be created yet. + */ + fd_batch_update_queries(batch); - /* - * Figure out the buffers/features we need: - */ + /* + * Figure out the buffers/features we need: + */ - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - if (ctx->dirty & FD_DIRTY_RESOURCE) - batch_draw_tracking_for_dirty_bits(batch); + if (ctx->dirty & FD_DIRTY_RESOURCE) + batch_draw_tracking_for_dirty_bits(batch); - /* Mark index buffer as being read */ - if (info->index_size) - resource_read(batch, info->index.resource); + /* Mark index buffer as being read */ + if (info->index_size) + resource_read(batch, info->index.resource); - /* Mark indirect draw buffer as being read */ - if (indirect) { - if (indirect->buffer) - resource_read(batch, indirect->buffer); - if (indirect->count_from_stream_output) - resource_read(batch, fd_stream_output_target(indirect->count_from_stream_output)->offset_buf); - } + /* Mark indirect draw buffer as being read */ + if (indirect) { + if (indirect->buffer) + resource_read(batch, indirect->buffer); + if (indirect->count_from_stream_output) + resource_read( + batch, fd_stream_output_target(indirect->count_from_stream_output) + ->offset_buf); + } - resource_written(batch, batch->query_buf); + resource_written(batch, batch->query_buf); - list_for_each_entry(struct fd_acc_query, aq, &ctx->acc_active_queries, node) - resource_written(batch, aq->prsc); + list_for_each_entry (struct fd_acc_query, aq, &ctx->acc_active_queries, node) + resource_written(batch, aq->prsc); - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); } static void update_draw_stats(struct fd_context *ctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draws, unsigned num_draws) - assert_dt + const struct pipe_draw_start_count *draws, + unsigned num_draws) assert_dt { - ctx->stats.draw_calls++; - - if (ctx->screen->gpu_id < 600) { - /* Counting prims in sw doesn't work for GS and tesselation. For older - * gens we don't have those stages and don't have the hw counters enabled, - * so keep the count accurate for non-patch geometry. - */ - unsigned prims = 0; - if ((info->mode != PIPE_PRIM_PATCHES) && - (info->mode != PIPE_PRIM_MAX)) { - for (unsigned i = 0; i < num_draws; i++) { - prims += u_reduced_prims_for_vertices(info->mode, draws[i].count); - } - } - - ctx->stats.prims_generated += prims; - - if (ctx->streamout.num_targets > 0) { - /* Clip the prims we're writing to the size of the SO buffers. 
*/ - enum pipe_prim_type tf_prim = u_decomposed_prim(info->mode); - unsigned verts_written = u_vertices_for_prims(tf_prim, prims); - unsigned remaining_vert_space = ctx->streamout.max_tf_vtx - ctx->streamout.verts_written; - if (verts_written > remaining_vert_space) { - verts_written = remaining_vert_space; - u_trim_pipe_prim(tf_prim, &remaining_vert_space); - } - ctx->streamout.verts_written += verts_written; - - ctx->stats.prims_emitted += u_reduced_prims_for_vertices(tf_prim, verts_written); - } - } + ctx->stats.draw_calls++; + + if (ctx->screen->gpu_id < 600) { + /* Counting prims in sw doesn't work for GS and tesselation. For older + * gens we don't have those stages and don't have the hw counters enabled, + * so keep the count accurate for non-patch geometry. + */ + unsigned prims = 0; + if ((info->mode != PIPE_PRIM_PATCHES) && (info->mode != PIPE_PRIM_MAX)) { + for (unsigned i = 0; i < num_draws; i++) { + prims += u_reduced_prims_for_vertices(info->mode, draws[i].count); + } + } + + ctx->stats.prims_generated += prims; + + if (ctx->streamout.num_targets > 0) { + /* Clip the prims we're writing to the size of the SO buffers. */ + enum pipe_prim_type tf_prim = u_decomposed_prim(info->mode); + unsigned verts_written = u_vertices_for_prims(tf_prim, prims); + unsigned remaining_vert_space = + ctx->streamout.max_tf_vtx - ctx->streamout.verts_written; + if (verts_written > remaining_vert_space) { + verts_written = remaining_vert_space; + u_trim_pipe_prim(tf_prim, &remaining_vert_space); + } + ctx->streamout.verts_written += verts_written; + + ctx->stats.prims_emitted += + u_reduced_prims_for_vertices(tf_prim, verts_written); + } + } } static void fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, - const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draws, - unsigned num_draws) - in_dt + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count *draws, unsigned num_draws) in_dt { - struct fd_context *ctx = fd_context(pctx); - - /* for debugging problems with indirect draw, it is convenient - * to be able to emulate it, to determine if game is feeding us - * bogus data: - */ - if (indirect && indirect->buffer && FD_DBG(NOINDR)) { - /* num_draws is only applicable for direct draws: */ - assert(num_draws == 1); - util_draw_indirect(pctx, info, indirect); - return; - } - - /* TODO: push down the region versions into the tiles */ - if (!fd_render_condition_check(pctx)) - return; - - /* emulate unsupported primitives: */ - if (!fd_supported_prim(ctx, info->mode)) { - if (ctx->streamout.num_targets > 0) - mesa_loge("stream-out with emulated prims"); - util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer); - util_primconvert_draw_vbo(ctx->primconvert, info, indirect, draws, num_draws); - return; - } - - /* Upload a user index buffer. 
*/ - struct pipe_resource *indexbuf = NULL; - unsigned index_offset = 0; - struct pipe_draw_info new_info; - if (info->index_size) { - if (info->has_user_indices) { - if (num_draws > 1) { - util_draw_multi(pctx, info, indirect, draws, num_draws); - return; - } - if (!util_upload_index_buffer(pctx, info, &draws[0], - &indexbuf, &index_offset, 4)) - return; - new_info = *info; - new_info.index.resource = indexbuf; - new_info.has_user_indices = false; - info = &new_info; - } else { - indexbuf = info->index.resource; - } - } - - if ((ctx->streamout.num_targets > 0) && (num_draws > 1)) { - util_draw_multi(pctx, info, indirect, draws, num_draws); - return; - } - - struct fd_batch *batch = fd_context_batch(ctx); - - if (ctx->in_discard_blit) { - fd_batch_reset(batch); - fd_context_all_dirty(ctx); - } - - batch_draw_tracking(batch, info, indirect); - - while (unlikely(!fd_batch_lock_submit(batch))) { - /* The current batch was flushed in batch_draw_tracking() - * so start anew. We know this won't happen a second time - * since we are dealing with a fresh batch: - */ - fd_batch_reference(&batch, NULL); - batch = fd_context_batch(ctx); - batch_draw_tracking(batch, info, indirect); - assert(ctx->batch == batch); - } - - batch->blit = ctx->in_discard_blit; - batch->back_blit = ctx->in_shadow; - batch->num_draws++; - - /* Clearing last_fence must come after the batch dependency tracking - * (resource_read()/resource_written()), as that can trigger a flush, - * re-populating last_fence - */ - fd_fence_ref(&ctx->last_fence, NULL); - - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - DBG("%p: %ux%u num_draws=%u (%s/%s)", batch, - pfb->width, pfb->height, batch->num_draws, - util_format_short_name(pipe_surface_format(pfb->cbufs[0])), - util_format_short_name(pipe_surface_format(pfb->zsbuf))); - - batch->cost += ctx->draw_cost; - - for (unsigned i = 0; i < num_draws; i++) { - if (ctx->draw_vbo(ctx, info, indirect, &draws[i], index_offset)) - batch->needs_flush = true; - - batch->num_vertices += draws[i].count * info->instance_count; - } - - if (unlikely(ctx->stats_users > 0)) - update_draw_stats(ctx, info, draws, num_draws); - - for (unsigned i = 0; i < ctx->streamout.num_targets; i++) { - assert(num_draws == 1); - ctx->streamout.offsets[i] += draws[0].count; - } - - if (FD_DBG(DDRAW)) - fd_context_all_dirty(ctx); - - fd_batch_unlock_submit(batch); - fd_batch_check_size(batch); - fd_batch_reference(&batch, NULL); - - if (info == &new_info) - pipe_resource_reference(&indexbuf, NULL); + struct fd_context *ctx = fd_context(pctx); + + /* for debugging problems with indirect draw, it is convenient + * to be able to emulate it, to determine if game is feeding us + * bogus data: + */ + if (indirect && indirect->buffer && FD_DBG(NOINDR)) { + /* num_draws is only applicable for direct draws: */ + assert(num_draws == 1); + util_draw_indirect(pctx, info, indirect); + return; + } + + /* TODO: push down the region versions into the tiles */ + if (!fd_render_condition_check(pctx)) + return; + + /* emulate unsupported primitives: */ + if (!fd_supported_prim(ctx, info->mode)) { + if (ctx->streamout.num_targets > 0) + mesa_loge("stream-out with emulated prims"); + util_primconvert_save_rasterizer_state(ctx->primconvert, ctx->rasterizer); + util_primconvert_draw_vbo(ctx->primconvert, info, indirect, draws, + num_draws); + return; + } + + /* Upload a user index buffer. 
*/ + struct pipe_resource *indexbuf = NULL; + unsigned index_offset = 0; + struct pipe_draw_info new_info; + if (info->index_size) { + if (info->has_user_indices) { + if (num_draws > 1) { + util_draw_multi(pctx, info, indirect, draws, num_draws); + return; + } + if (!util_upload_index_buffer(pctx, info, &draws[0], &indexbuf, + &index_offset, 4)) + return; + new_info = *info; + new_info.index.resource = indexbuf; + new_info.has_user_indices = false; + info = &new_info; + } else { + indexbuf = info->index.resource; + } + } + + if ((ctx->streamout.num_targets > 0) && (num_draws > 1)) { + util_draw_multi(pctx, info, indirect, draws, num_draws); + return; + } + + struct fd_batch *batch = fd_context_batch(ctx); + + if (ctx->in_discard_blit) { + fd_batch_reset(batch); + fd_context_all_dirty(ctx); + } + + batch_draw_tracking(batch, info, indirect); + + while (unlikely(!fd_batch_lock_submit(batch))) { + /* The current batch was flushed in batch_draw_tracking() + * so start anew. We know this won't happen a second time + * since we are dealing with a fresh batch: + */ + fd_batch_reference(&batch, NULL); + batch = fd_context_batch(ctx); + batch_draw_tracking(batch, info, indirect); + assert(ctx->batch == batch); + } + + batch->blit = ctx->in_discard_blit; + batch->back_blit = ctx->in_shadow; + batch->num_draws++; + + /* Clearing last_fence must come after the batch dependency tracking + * (resource_read()/resource_written()), as that can trigger a flush, + * re-populating last_fence + */ + fd_fence_ref(&ctx->last_fence, NULL); + + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + DBG("%p: %ux%u num_draws=%u (%s/%s)", batch, pfb->width, pfb->height, + batch->num_draws, + util_format_short_name(pipe_surface_format(pfb->cbufs[0])), + util_format_short_name(pipe_surface_format(pfb->zsbuf))); + + batch->cost += ctx->draw_cost; + + for (unsigned i = 0; i < num_draws; i++) { + if (ctx->draw_vbo(ctx, info, indirect, &draws[i], index_offset)) + batch->needs_flush = true; + + batch->num_vertices += draws[i].count * info->instance_count; + } + + if (unlikely(ctx->stats_users > 0)) + update_draw_stats(ctx, info, draws, num_draws); + + for (unsigned i = 0; i < ctx->streamout.num_targets; i++) { + assert(num_draws == 1); + ctx->streamout.offsets[i] += draws[0].count; + } + + if (FD_DBG(DDRAW)) + fd_context_all_dirty(ctx); + + fd_batch_unlock_submit(batch); + fd_batch_check_size(batch); + fd_batch_reference(&batch, NULL); + + if (info == &new_info) + pipe_resource_reference(&indexbuf, NULL); } static void -batch_clear_tracking(struct fd_batch *batch, unsigned buffers) - assert_dt +batch_clear_tracking(struct fd_batch *batch, unsigned buffers) assert_dt { - struct fd_context *ctx = batch->ctx; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - unsigned cleared_buffers; - - /* pctx->clear() is only for full-surface clears, so scissor is - * equivalent to having GL_SCISSOR_TEST disabled: - */ - batch->max_scissor.minx = 0; - batch->max_scissor.miny = 0; - batch->max_scissor.maxx = pfb->width; - batch->max_scissor.maxy = pfb->height; - - /* for bookkeeping about which buffers have been cleared (and thus - * can fully or partially skip mem2gmem) we need to ignore buffers - * that have already had a draw, in case apps do silly things like - * clear after draw (ie. 
if you only clear the color buffer, but - * something like alpha-test causes side effects from the draw in - * the depth buffer, etc) - */ - cleared_buffers = buffers & (FD_BUFFER_ALL & ~batch->restore); - batch->cleared |= buffers; - batch->invalidated |= cleared_buffers; - - batch->resolve |= buffers; - batch->needs_flush = true; - - fd_screen_lock(ctx->screen); - - if (buffers & PIPE_CLEAR_COLOR) - for (unsigned i = 0; i < pfb->nr_cbufs; i++) - if (buffers & (PIPE_CLEAR_COLOR0 << i)) - resource_written(batch, pfb->cbufs[i]->texture); - - if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { - resource_written(batch, pfb->zsbuf->texture); - batch->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL; - } - - resource_written(batch, batch->query_buf); - - list_for_each_entry(struct fd_acc_query, aq, &ctx->acc_active_queries, node) - resource_written(batch, aq->prsc); - - fd_screen_unlock(ctx->screen); + struct fd_context *ctx = batch->ctx; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + unsigned cleared_buffers; + + /* pctx->clear() is only for full-surface clears, so scissor is + * equivalent to having GL_SCISSOR_TEST disabled: + */ + batch->max_scissor.minx = 0; + batch->max_scissor.miny = 0; + batch->max_scissor.maxx = pfb->width; + batch->max_scissor.maxy = pfb->height; + + /* for bookkeeping about which buffers have been cleared (and thus + * can fully or partially skip mem2gmem) we need to ignore buffers + * that have already had a draw, in case apps do silly things like + * clear after draw (ie. if you only clear the color buffer, but + * something like alpha-test causes side effects from the draw in + * the depth buffer, etc) + */ + cleared_buffers = buffers & (FD_BUFFER_ALL & ~batch->restore); + batch->cleared |= buffers; + batch->invalidated |= cleared_buffers; + + batch->resolve |= buffers; + batch->needs_flush = true; + + fd_screen_lock(ctx->screen); + + if (buffers & PIPE_CLEAR_COLOR) + for (unsigned i = 0; i < pfb->nr_cbufs; i++) + if (buffers & (PIPE_CLEAR_COLOR0 << i)) + resource_written(batch, pfb->cbufs[i]->texture); + + if (buffers & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) { + resource_written(batch, pfb->zsbuf->texture); + batch->gmem_reason |= FD_GMEM_CLEARS_DEPTH_STENCIL; + } + + resource_written(batch, batch->query_buf); + + list_for_each_entry (struct fd_acc_query, aq, &ctx->acc_active_queries, node) + resource_written(batch, aq->prsc); + + fd_screen_unlock(ctx->screen); } static void fd_clear(struct pipe_context *pctx, unsigned buffers, - const struct pipe_scissor_state *scissor_state, - const union pipe_color_union *color, double depth, - unsigned stencil) - in_dt + const struct pipe_scissor_state *scissor_state, + const union pipe_color_union *color, double depth, + unsigned stencil) in_dt { - struct fd_context *ctx = fd_context(pctx); - - /* TODO: push down the region versions into the tiles */ - if (!fd_render_condition_check(pctx)) - return; - - struct fd_batch *batch = fd_context_batch(ctx); - - if (ctx->in_discard_blit) { - fd_batch_reset(batch); - fd_context_all_dirty(ctx); - } - - batch_clear_tracking(batch, buffers); - - while (unlikely(!fd_batch_lock_submit(batch))) { - /* The current batch was flushed in batch_clear_tracking() - * so start anew. 
We know this won't happen a second time - * since we are dealing with a fresh batch: - */ - fd_batch_reference(&batch, NULL); - batch = fd_context_batch(ctx); - batch_clear_tracking(batch, buffers); - assert(ctx->batch == batch); - } - - /* Clearing last_fence must come after the batch dependency tracking - * (resource_read()/resource_written()), as that can trigger a flush, - * re-populating last_fence - */ - fd_fence_ref(&ctx->last_fence, NULL); - - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - DBG("%p: %x %ux%u depth=%f, stencil=%u (%s/%s)", batch, buffers, - pfb->width, pfb->height, depth, stencil, - util_format_short_name(pipe_surface_format(pfb->cbufs[0])), - util_format_short_name(pipe_surface_format(pfb->zsbuf))); - - /* if per-gen backend doesn't implement ctx->clear() generic - * blitter clear: - */ - bool fallback = true; - - if (ctx->clear) { - fd_batch_update_queries(batch); - - if (ctx->clear(ctx, buffers, color, depth, stencil)) { - if (FD_DBG(DCLEAR)) - fd_context_all_dirty(ctx); - - fallback = false; - } - } - - fd_batch_unlock_submit(batch); - fd_batch_check_size(batch); - - if (fallback) { - fd_blitter_clear(pctx, buffers, color, depth, stencil); - } - - fd_batch_reference(&batch, NULL); + struct fd_context *ctx = fd_context(pctx); + + /* TODO: push down the region versions into the tiles */ + if (!fd_render_condition_check(pctx)) + return; + + struct fd_batch *batch = fd_context_batch(ctx); + + if (ctx->in_discard_blit) { + fd_batch_reset(batch); + fd_context_all_dirty(ctx); + } + + batch_clear_tracking(batch, buffers); + + while (unlikely(!fd_batch_lock_submit(batch))) { + /* The current batch was flushed in batch_clear_tracking() + * so start anew. We know this won't happen a second time + * since we are dealing with a fresh batch: + */ + fd_batch_reference(&batch, NULL); + batch = fd_context_batch(ctx); + batch_clear_tracking(batch, buffers); + assert(ctx->batch == batch); + } + + /* Clearing last_fence must come after the batch dependency tracking + * (resource_read()/resource_written()), as that can trigger a flush, + * re-populating last_fence + */ + fd_fence_ref(&ctx->last_fence, NULL); + + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + DBG("%p: %x %ux%u depth=%f, stencil=%u (%s/%s)", batch, buffers, pfb->width, + pfb->height, depth, stencil, + util_format_short_name(pipe_surface_format(pfb->cbufs[0])), + util_format_short_name(pipe_surface_format(pfb->zsbuf))); + + /* if per-gen backend doesn't implement ctx->clear() generic + * blitter clear: + */ + bool fallback = true; + + if (ctx->clear) { + fd_batch_update_queries(batch); + + if (ctx->clear(ctx, buffers, color, depth, stencil)) { + if (FD_DBG(DCLEAR)) + fd_context_all_dirty(ctx); + + fallback = false; + } + } + + fd_batch_unlock_submit(batch); + fd_batch_check_size(batch); + + if (fallback) { + fd_blitter_clear(pctx, buffers, color, depth, stencil); + } + + fd_batch_reference(&batch, NULL); } static void fd_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps, - const union pipe_color_union *color, - unsigned x, unsigned y, unsigned w, unsigned h, - bool render_condition_enabled) + const union pipe_color_union *color, unsigned x, + unsigned y, unsigned w, unsigned h, + bool render_condition_enabled) { - DBG("TODO: x=%u, y=%u, w=%u, h=%u", x, y, w, h); + DBG("TODO: x=%u, y=%u, w=%u, h=%u", x, y, w, h); } static void fd_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps, - unsigned buffers, double depth, unsigned stencil, - unsigned x, unsigned 
y, unsigned w, unsigned h, - bool render_condition_enabled) + unsigned buffers, double depth, unsigned stencil, + unsigned x, unsigned y, unsigned w, unsigned h, + bool render_condition_enabled) { - DBG("TODO: buffers=%u, depth=%f, stencil=%u, x=%u, y=%u, w=%u, h=%u", - buffers, depth, stencil, x, y, w, h); + DBG("TODO: buffers=%u, depth=%f, stencil=%u, x=%u, y=%u, w=%u, h=%u", + buffers, depth, stencil, x, y, w, h); } static void -fd_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) - in_dt +fd_launch_grid(struct pipe_context *pctx, + const struct pipe_grid_info *info) in_dt { - struct fd_context *ctx = fd_context(pctx); - const struct fd_shaderbuf_stateobj *so = &ctx->shaderbuf[PIPE_SHADER_COMPUTE]; - struct fd_batch *batch, *save_batch = NULL; + struct fd_context *ctx = fd_context(pctx); + const struct fd_shaderbuf_stateobj *so = + &ctx->shaderbuf[PIPE_SHADER_COMPUTE]; + struct fd_batch *batch, *save_batch = NULL; - batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); - fd_batch_reference(&save_batch, ctx->batch); - fd_batch_reference(&ctx->batch, batch); - fd_context_all_dirty(ctx); + batch = fd_bc_alloc_batch(&ctx->screen->batch_cache, ctx, true); + fd_batch_reference(&save_batch, ctx->batch); + fd_batch_reference(&ctx->batch, batch); + fd_context_all_dirty(ctx); - fd_screen_lock(ctx->screen); + fd_screen_lock(ctx->screen); - /* Mark SSBOs */ - u_foreach_bit (i, so->enabled_mask & so->writable_mask) - resource_written(batch, so->sb[i].buffer); + /* Mark SSBOs */ + u_foreach_bit (i, so->enabled_mask & so->writable_mask) + resource_written(batch, so->sb[i].buffer); - u_foreach_bit (i, so->enabled_mask & ~so->writable_mask) - resource_read(batch, so->sb[i].buffer); + u_foreach_bit (i, so->enabled_mask & ~so->writable_mask) + resource_read(batch, so->sb[i].buffer); - u_foreach_bit(i, ctx->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) { - struct pipe_image_view *img = - &ctx->shaderimg[PIPE_SHADER_COMPUTE].si[i]; - if (img->access & PIPE_IMAGE_ACCESS_WRITE) - resource_written(batch, img->resource); - else - resource_read(batch, img->resource); - } + u_foreach_bit (i, ctx->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) { + struct pipe_image_view *img = &ctx->shaderimg[PIPE_SHADER_COMPUTE].si[i]; + if (img->access & PIPE_IMAGE_ACCESS_WRITE) + resource_written(batch, img->resource); + else + resource_read(batch, img->resource); + } - /* UBO's are read */ - u_foreach_bit(i, ctx->constbuf[PIPE_SHADER_COMPUTE].enabled_mask) - resource_read(batch, ctx->constbuf[PIPE_SHADER_COMPUTE].cb[i].buffer); + /* UBO's are read */ + u_foreach_bit (i, ctx->constbuf[PIPE_SHADER_COMPUTE].enabled_mask) + resource_read(batch, ctx->constbuf[PIPE_SHADER_COMPUTE].cb[i].buffer); - /* Mark textures as being read */ - u_foreach_bit(i, ctx->tex[PIPE_SHADER_COMPUTE].valid_textures) - resource_read(batch, ctx->tex[PIPE_SHADER_COMPUTE].textures[i]->texture); + /* Mark textures as being read */ + u_foreach_bit (i, ctx->tex[PIPE_SHADER_COMPUTE].valid_textures) + resource_read(batch, ctx->tex[PIPE_SHADER_COMPUTE].textures[i]->texture); - /* For global buffers, we don't really know if read or written, so assume - * the worst: - */ - u_foreach_bit(i, ctx->global_bindings.enabled_mask) - resource_written(batch, ctx->global_bindings.buf[i]); + /* For global buffers, we don't really know if read or written, so assume + * the worst: + */ + u_foreach_bit (i, ctx->global_bindings.enabled_mask) + resource_written(batch, ctx->global_bindings.buf[i]); - if (info->indirect) - resource_read(batch, 
info->indirect); + if (info->indirect) + resource_read(batch, info->indirect); - fd_screen_unlock(ctx->screen); + fd_screen_unlock(ctx->screen); - batch->needs_flush = true; - ctx->launch_grid(ctx, info); + batch->needs_flush = true; + ctx->launch_grid(ctx, info); - fd_batch_flush(batch); + fd_batch_flush(batch); - fd_batch_reference(&ctx->batch, save_batch); - fd_context_all_dirty(ctx); - fd_batch_reference(&save_batch, NULL); - fd_batch_reference(&batch, NULL); + fd_batch_reference(&ctx->batch, save_batch); + fd_context_all_dirty(ctx); + fd_batch_reference(&save_batch, NULL); + fd_batch_reference(&batch, NULL); } void fd_draw_init(struct pipe_context *pctx) { - pctx->draw_vbo = fd_draw_vbo; - pctx->clear = fd_clear; - pctx->clear_render_target = fd_clear_render_target; - pctx->clear_depth_stencil = fd_clear_depth_stencil; - - if (has_compute(fd_screen(pctx->screen))) { - pctx->launch_grid = fd_launch_grid; - } + pctx->draw_vbo = fd_draw_vbo; + pctx->clear = fd_clear; + pctx->clear_render_target = fd_clear_render_target; + pctx->clear_depth_stencil = fd_clear_depth_stencil; + + if (has_compute(fd_screen(pctx->screen))) { + pctx->launch_grid = fd_launch_grid; + } } diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h index f45b195..02c1e49 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.h +++ b/src/gallium/drivers/freedreno/freedreno_draw.h @@ -27,8 +27,8 @@ #ifndef FREEDRENO_DRAW_H_ #define FREEDRENO_DRAW_H_ -#include "pipe/p_state.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "freedreno_context.h" #include "freedreno_resource.h" @@ -41,137 +41,134 @@ void fd_draw_init(struct pipe_context *pctx); static inline void fd_draw(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum pc_di_primtype primtype, - enum pc_di_vis_cull_mode vismode, - enum pc_di_src_sel src_sel, uint32_t count, - uint8_t instances, - enum pc_di_index_size idx_type, - uint32_t idx_size, uint32_t idx_offset, - struct pipe_resource *idx_buffer) + enum pc_di_primtype primtype, enum pc_di_vis_cull_mode vismode, + enum pc_di_src_sel src_sel, uint32_t count, uint8_t instances, + enum pc_di_index_size idx_type, uint32_t idx_size, uint32_t idx_offset, + struct pipe_resource *idx_buffer) { - /* for debug after a lock up, write a unique counter value - * to scratch7 for each draw, to make it easier to match up - * register dumps to cmdstream. The combination of IB - * (scratch6) and DRAW is enough to "triangulate" the - * particular draw that caused lockup. 
- */ - emit_marker(ring, 7); - - if (is_a3xx_p0(batch->ctx->screen)) { - /* dummy-draw workaround: */ - OUT_PKT3(ring, CP_DRAW_INDX, 3); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, - INDEX_SIZE_IGN, USE_VISIBILITY, 0)); - OUT_RING(ring, 0); /* NumIndices */ - - /* ugg, hard-code register offset to avoid pulling in the - * a3xx register headers into something #included from a2xx - */ - OUT_PKT0(ring, 0x2206, 1); /* A3XX_HLSQ_CONST_VSPRESV_RANGE_REG */ - OUT_RING(ring, 0); - } - - if (is_a20x(batch->ctx->screen)) { - /* a20x has a different draw command for drawing with binning data - * note: if we do patching we will have to insert a NOP - * - * binning data is is 1 byte/vertex (8x8x4 bin position of vertex) - * base ptr set by the CP_SET_DRAW_INIT_FLAGS command - * - * TODO: investigate the faceness_cull_select parameter to see how - * it is used with hw binning to use "faceness" bits - */ - uint32_t size = 2; - if (vismode) - size += 2; - if (idx_buffer) - size += 2; - - BEGIN_RING(ring, size+1); - if (vismode) - util_dynarray_append(&batch->draw_patches, uint32_t*, ring->cur); - - OUT_PKT3(ring, vismode ? CP_DRAW_INDX_BIN : CP_DRAW_INDX, size); - OUT_RING(ring, 0x00000000); - OUT_RING(ring, DRAW_A20X(primtype, DI_FACE_CULL_NONE, src_sel, - idx_type, vismode, vismode, count)); - if (vismode == USE_VISIBILITY) { - OUT_RING(ring, batch->num_vertices); - OUT_RING(ring, count); - } - } else { - OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 5 : 3); - OUT_RING(ring, 0x00000000); /* viz query info. */ - if (vismode == USE_VISIBILITY) { - /* leave vis mode blank for now, it will be patched up when - * we know if we are binning or not - */ - OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0, instances), - &batch->draw_patches); - } else { - OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode, instances)); - } - OUT_RING(ring, count); /* NumIndices */ - } - - if (idx_buffer) { - OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0); - OUT_RING (ring, idx_size); - } - - emit_marker(ring, 7); - - fd_reset_wfi(batch); + /* for debug after a lock up, write a unique counter value + * to scratch7 for each draw, to make it easier to match up + * register dumps to cmdstream. The combination of IB + * (scratch6) and DRAW is enough to "triangulate" the + * particular draw that caused lockup. + */ + emit_marker(ring, 7); + + if (is_a3xx_p0(batch->ctx->screen)) { + /* dummy-draw workaround: */ + OUT_PKT3(ring, CP_DRAW_INDX, 3); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW(1, DI_SRC_SEL_AUTO_INDEX, INDEX_SIZE_IGN, + USE_VISIBILITY, 0)); + OUT_RING(ring, 0); /* NumIndices */ + + /* ugg, hard-code register offset to avoid pulling in the + * a3xx register headers into something #included from a2xx + */ + OUT_PKT0(ring, 0x2206, 1); /* A3XX_HLSQ_CONST_VSPRESV_RANGE_REG */ + OUT_RING(ring, 0); + } + + if (is_a20x(batch->ctx->screen)) { + /* a20x has a different draw command for drawing with binning data + * note: if we do patching we will have to insert a NOP + * + * binning data is is 1 byte/vertex (8x8x4 bin position of vertex) + * base ptr set by the CP_SET_DRAW_INIT_FLAGS command + * + * TODO: investigate the faceness_cull_select parameter to see how + * it is used with hw binning to use "faceness" bits + */ + uint32_t size = 2; + if (vismode) + size += 2; + if (idx_buffer) + size += 2; + + BEGIN_RING(ring, size + 1); + if (vismode) + util_dynarray_append(&batch->draw_patches, uint32_t *, ring->cur); + + OUT_PKT3(ring, vismode ? 
CP_DRAW_INDX_BIN : CP_DRAW_INDX, size); + OUT_RING(ring, 0x00000000); + OUT_RING(ring, DRAW_A20X(primtype, DI_FACE_CULL_NONE, src_sel, idx_type, + vismode, vismode, count)); + if (vismode == USE_VISIBILITY) { + OUT_RING(ring, batch->num_vertices); + OUT_RING(ring, count); + } + } else { + OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 5 : 3); + OUT_RING(ring, 0x00000000); /* viz query info. */ + if (vismode == USE_VISIBILITY) { + /* leave vis mode blank for now, it will be patched up when + * we know if we are binning or not + */ + OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0, instances), + &batch->draw_patches); + } else { + OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode, instances)); + } + OUT_RING(ring, count); /* NumIndices */ + } + + if (idx_buffer) { + OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0); + OUT_RING(ring, idx_size); + } + + emit_marker(ring, 7); + + fd_reset_wfi(batch); } - static inline enum pc_di_index_size size2indextype(unsigned index_size) { - switch (index_size) { - case 1: return INDEX_SIZE_8_BIT; - case 2: return INDEX_SIZE_16_BIT; - case 4: return INDEX_SIZE_32_BIT; - } - DBG("unsupported index size: %d", index_size); - assert(0); - return INDEX_SIZE_IGN; + switch (index_size) { + case 1: + return INDEX_SIZE_8_BIT; + case 2: + return INDEX_SIZE_16_BIT; + case 4: + return INDEX_SIZE_32_BIT; + } + DBG("unsupported index size: %d", index_size); + assert(0); + return INDEX_SIZE_IGN; } /* this is same for a2xx/a3xx, so split into helper: */ static inline void fd_draw_emit(struct fd_batch *batch, struct fd_ringbuffer *ring, - enum pc_di_primtype primtype, - enum pc_di_vis_cull_mode vismode, - const struct pipe_draw_info *info, - const struct pipe_draw_start_count *draw, - unsigned index_offset) + enum pc_di_primtype primtype, enum pc_di_vis_cull_mode vismode, + const struct pipe_draw_info *info, + const struct pipe_draw_start_count *draw, unsigned index_offset) { - struct pipe_resource *idx_buffer = NULL; - enum pc_di_index_size idx_type = INDEX_SIZE_IGN; - enum pc_di_src_sel src_sel; - uint32_t idx_size, idx_offset; - - if (info->index_size) { - assert(!info->has_user_indices); - - idx_buffer = info->index.resource; - idx_type = size2indextype(info->index_size); - idx_size = info->index_size * draw->count; - idx_offset = index_offset + draw->start * info->index_size; - src_sel = DI_SRC_SEL_DMA; - } else { - idx_buffer = NULL; - idx_type = INDEX_SIZE_IGN; - idx_size = 0; - idx_offset = 0; - src_sel = DI_SRC_SEL_AUTO_INDEX; - } - - fd_draw(batch, ring, primtype, vismode, src_sel, - draw->count, info->instance_count - 1, - idx_type, idx_size, idx_offset, idx_buffer); + struct pipe_resource *idx_buffer = NULL; + enum pc_di_index_size idx_type = INDEX_SIZE_IGN; + enum pc_di_src_sel src_sel; + uint32_t idx_size, idx_offset; + + if (info->index_size) { + assert(!info->has_user_indices); + + idx_buffer = info->index.resource; + idx_type = size2indextype(info->index_size); + idx_size = info->index_size * draw->count; + idx_offset = index_offset + draw->start * info->index_size; + src_sel = DI_SRC_SEL_DMA; + } else { + idx_buffer = NULL; + idx_type = INDEX_SIZE_IGN; + idx_size = 0; + idx_offset = 0; + src_sel = DI_SRC_SEL_AUTO_INDEX; + } + + fd_draw(batch, ring, primtype, vismode, src_sel, draw->count, + info->instance_count - 1, idx_type, idx_size, idx_offset, + idx_buffer); } #endif /* FREEDRENO_DRAW_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c index 884dcf9..dbd58f6 100644 --- 
a/src/gallium/drivers/freedreno/freedreno_fence.c +++ b/src/gallium/drivers/freedreno/freedreno_fence.c @@ -27,261 +27,267 @@ #include "util/os_file.h" #include "util/u_inlines.h" -#include "freedreno_fence.h" #include "freedreno_batch.h" #include "freedreno_context.h" +#include "freedreno_fence.h" #include "freedreno_util.h" /* TODO: Use the interface drm/freedreno_drmif.h instead of calling directly */ #include struct pipe_fence_handle { - struct pipe_reference reference; - - /* fence holds a weak reference to the batch until the batch is flushed, - * at which point fd_fence_populate() is called and timestamp and possibly - * fence_fd become valid and the week reference is dropped. - * - * Note that with u_threaded_context async flushes, if a fence is requested - * by the frontend, the fence is initially created without a weak reference - * to the batch, which is filled in later when fd_context_flush() is called - * from the driver thread. In this case tc_token will be non-null, in - * which case threaded_context_flush() should be called in fd_fence_finish() - */ - struct fd_batch *batch; - - struct tc_unflushed_batch_token *tc_token; - bool needs_signal; - - /* For threaded_context async flushes, we must wait on the fence, signalled - * in fd_fence_populate(), to know that the rendering has been actually - * flushed from the driver thread. - * - * The ready fence is created signaled for non-async-flush fences, and only - * transitions once from unsignalled->signalled for async-flush fences - */ - struct util_queue_fence ready; - - /* Note that a fence can outlive the ctx, so we can only assume this is a - * valid ptr for unflushed fences. However we hold a reference to the - * fence->pipe so that is safe to use after flushing. - */ - struct fd_context *ctx; - struct fd_pipe *pipe; - struct fd_screen *screen; - int fence_fd; - uint32_t timestamp; - uint32_t syncobj; + struct pipe_reference reference; + + /* fence holds a weak reference to the batch until the batch is flushed, + * at which point fd_fence_populate() is called and timestamp and possibly + * fence_fd become valid and the week reference is dropped. + * + * Note that with u_threaded_context async flushes, if a fence is requested + * by the frontend, the fence is initially created without a weak reference + * to the batch, which is filled in later when fd_context_flush() is called + * from the driver thread. In this case tc_token will be non-null, in + * which case threaded_context_flush() should be called in fd_fence_finish() + */ + struct fd_batch *batch; + + struct tc_unflushed_batch_token *tc_token; + bool needs_signal; + + /* For threaded_context async flushes, we must wait on the fence, signalled + * in fd_fence_populate(), to know that the rendering has been actually + * flushed from the driver thread. + * + * The ready fence is created signaled for non-async-flush fences, and only + * transitions once from unsignalled->signalled for async-flush fences + */ + struct util_queue_fence ready; + + /* Note that a fence can outlive the ctx, so we can only assume this is a + * valid ptr for unflushed fences. However we hold a reference to the + * fence->pipe so that is safe to use after flushing. 
+ */ + struct fd_context *ctx; + struct fd_pipe *pipe; + struct fd_screen *screen; + int fence_fd; + uint32_t timestamp; + uint32_t syncobj; }; static bool -fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence, uint64_t timeout) - /* NOTE: in the !fence_is_signalled() case we may be called from non-driver - * thread, but we don't call fd_batch_flush() in that case - */ - in_dt +fence_flush(struct pipe_context *pctx, struct pipe_fence_handle *fence, + uint64_t timeout) + /* NOTE: in the !fence_is_signalled() case we may be called from non-driver + * thread, but we don't call fd_batch_flush() in that case + */ + in_dt { - if (!util_queue_fence_is_signalled(&fence->ready)) { - if (fence->tc_token) { - threaded_context_flush(pctx, fence->tc_token, - timeout == 0); - } - - if (!timeout) - return false; - - if (timeout == PIPE_TIMEOUT_INFINITE) { - util_queue_fence_wait(&fence->ready); - } else { - int64_t abs_timeout = os_time_get_absolute_timeout(timeout); - if (!util_queue_fence_wait_timeout(&fence->ready, abs_timeout)) { - return false; - } - } - - /* We've already waited for batch to be flushed and fd_fence_populate() - * called: - */ - assert(!fence->batch); - return true; - } - - if (fence->batch) - fd_batch_flush(fence->batch); - - debug_assert(!fence->batch); - - return true; + if (!util_queue_fence_is_signalled(&fence->ready)) { + if (fence->tc_token) { + threaded_context_flush(pctx, fence->tc_token, timeout == 0); + } + + if (!timeout) + return false; + + if (timeout == PIPE_TIMEOUT_INFINITE) { + util_queue_fence_wait(&fence->ready); + } else { + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); + if (!util_queue_fence_wait_timeout(&fence->ready, abs_timeout)) { + return false; + } + } + + /* We've already waited for batch to be flushed and fd_fence_populate() + * called: + */ + assert(!fence->batch); + return true; + } + + if (fence->batch) + fd_batch_flush(fence->batch); + + debug_assert(!fence->batch); + + return true; } -void fd_fence_populate(struct pipe_fence_handle *fence, - uint32_t timestamp, int fence_fd) +void +fd_fence_populate(struct pipe_fence_handle *fence, uint32_t timestamp, + int fence_fd) { - if (!fence->batch) - return; - fence->timestamp = timestamp; - fence->fence_fd = fence_fd; - fence->batch = NULL; - - if (fence->needs_signal) { - util_queue_fence_signal(&fence->ready); - fence->needs_signal = false; - } + if (!fence->batch) + return; + fence->timestamp = timestamp; + fence->fence_fd = fence_fd; + fence->batch = NULL; + + if (fence->needs_signal) { + util_queue_fence_signal(&fence->ready); + fence->needs_signal = false; + } } -static void fd_fence_destroy(struct pipe_fence_handle *fence) +static void +fd_fence_destroy(struct pipe_fence_handle *fence) { - tc_unflushed_batch_token_reference(&fence->tc_token, NULL); - if (fence->fence_fd != -1) - close(fence->fence_fd); - if (fence->syncobj) - drmSyncobjDestroy(fd_device_fd(fence->screen->dev), fence->syncobj); - fd_pipe_del(fence->pipe); - FREE(fence); + tc_unflushed_batch_token_reference(&fence->tc_token, NULL); + if (fence->fence_fd != -1) + close(fence->fence_fd); + if (fence->syncobj) + drmSyncobjDestroy(fd_device_fd(fence->screen->dev), fence->syncobj); + fd_pipe_del(fence->pipe); + FREE(fence); } -void fd_fence_ref(struct pipe_fence_handle **ptr, - struct pipe_fence_handle *pfence) +void +fd_fence_ref(struct pipe_fence_handle **ptr, struct pipe_fence_handle *pfence) { - if (pipe_reference(&(*ptr)->reference, &pfence->reference)) - fd_fence_destroy(*ptr); + if 
(pipe_reference(&(*ptr)->reference, &pfence->reference)) + fd_fence_destroy(*ptr); - *ptr = pfence; + *ptr = pfence; } -bool fd_fence_finish(struct pipe_screen *pscreen, - struct pipe_context *pctx, - struct pipe_fence_handle *fence, - uint64_t timeout) +bool +fd_fence_finish(struct pipe_screen *pscreen, struct pipe_context *pctx, + struct pipe_fence_handle *fence, uint64_t timeout) { - if (!fence_flush(pctx, fence, timeout)) - return false; + if (!fence_flush(pctx, fence, timeout)) + return false; - if (fence->fence_fd != -1) { - int ret = sync_wait(fence->fence_fd, timeout / 1000000); - return ret == 0; - } + if (fence->fence_fd != -1) { + int ret = sync_wait(fence->fence_fd, timeout / 1000000); + return ret == 0; + } - if (fd_pipe_wait_timeout(fence->pipe, fence->timestamp, timeout)) - return false; + if (fd_pipe_wait_timeout(fence->pipe, fence->timestamp, timeout)) + return false; - return true; + return true; } -static struct pipe_fence_handle * fence_create(struct fd_context *ctx, - struct fd_batch *batch, uint32_t timestamp, int fence_fd, int syncobj) +static struct pipe_fence_handle * +fence_create(struct fd_context *ctx, struct fd_batch *batch, uint32_t timestamp, + int fence_fd, int syncobj) { - struct pipe_fence_handle *fence; + struct pipe_fence_handle *fence; - fence = CALLOC_STRUCT(pipe_fence_handle); - if (!fence) - return NULL; + fence = CALLOC_STRUCT(pipe_fence_handle); + if (!fence) + return NULL; - pipe_reference_init(&fence->reference, 1); - util_queue_fence_init(&fence->ready); + pipe_reference_init(&fence->reference, 1); + util_queue_fence_init(&fence->ready); - fence->ctx = ctx; - fence->batch = batch; - fence->pipe = fd_pipe_ref(ctx->pipe); - fence->screen = ctx->screen; - fence->timestamp = timestamp; - fence->fence_fd = fence_fd; - fence->syncobj = syncobj; + fence->ctx = ctx; + fence->batch = batch; + fence->pipe = fd_pipe_ref(ctx->pipe); + fence->screen = ctx->screen; + fence->timestamp = timestamp; + fence->fence_fd = fence_fd; + fence->syncobj = syncobj; - return fence; + return fence; } -void fd_create_fence_fd(struct pipe_context *pctx, - struct pipe_fence_handle **pfence, int fd, - enum pipe_fd_type type) +void +fd_create_fence_fd(struct pipe_context *pctx, struct pipe_fence_handle **pfence, + int fd, enum pipe_fd_type type) { - struct fd_context *ctx = fd_context(pctx); - - switch (type) { - case PIPE_FD_TYPE_NATIVE_SYNC: - *pfence = fence_create(fd_context(pctx), NULL, 0, os_dupfd_cloexec(fd), 0); - break; - case PIPE_FD_TYPE_SYNCOBJ: { - int ret; - uint32_t syncobj; - - assert(ctx->screen->has_syncobj); - ret = drmSyncobjFDToHandle(fd_device_fd(ctx->screen->dev), fd, &syncobj); - if (!ret) - close(fd); - - *pfence = fence_create(fd_context(pctx), NULL, 0, -1, syncobj); - break; - } - default: - unreachable("Unhandled fence type"); - } + struct fd_context *ctx = fd_context(pctx); + + switch (type) { + case PIPE_FD_TYPE_NATIVE_SYNC: + *pfence = + fence_create(fd_context(pctx), NULL, 0, os_dupfd_cloexec(fd), 0); + break; + case PIPE_FD_TYPE_SYNCOBJ: { + int ret; + uint32_t syncobj; + + assert(ctx->screen->has_syncobj); + ret = drmSyncobjFDToHandle(fd_device_fd(ctx->screen->dev), fd, &syncobj); + if (!ret) + close(fd); + + *pfence = fence_create(fd_context(pctx), NULL, 0, -1, syncobj); + break; + } + default: + unreachable("Unhandled fence type"); + } } -void fd_fence_server_sync(struct pipe_context *pctx, - struct pipe_fence_handle *fence) +void +fd_fence_server_sync(struct pipe_context *pctx, struct pipe_fence_handle *fence) { - struct fd_context *ctx = 
fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - /* NOTE: we don't expect the combination of fence-fd + async-flush-fence, - * so timeout==0 is ok here: - */ - fence_flush(pctx, fence, 0); + /* NOTE: we don't expect the combination of fence-fd + async-flush-fence, + * so timeout==0 is ok here: + */ + fence_flush(pctx, fence, 0); - /* if not an external fence, then nothing more to do without preemption: */ - if (fence->fence_fd == -1) - return; + /* if not an external fence, then nothing more to do without preemption: */ + if (fence->fence_fd == -1) + return; - if (sync_accumulate("freedreno", &ctx->in_fence_fd, fence->fence_fd)) { - /* error */ - } + if (sync_accumulate("freedreno", &ctx->in_fence_fd, fence->fence_fd)) { + /* error */ + } } -void fd_fence_server_signal(struct pipe_context *pctx, - struct pipe_fence_handle *fence) +void +fd_fence_server_signal(struct pipe_context *pctx, + struct pipe_fence_handle *fence) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - if (fence->syncobj) { - drmSyncobjSignal(fd_device_fd(ctx->screen->dev), &fence->syncobj, 1); - } + if (fence->syncobj) { + drmSyncobjSignal(fd_device_fd(ctx->screen->dev), &fence->syncobj, 1); + } } -int fd_fence_get_fd(struct pipe_screen *pscreen, - struct pipe_fence_handle *fence) +int +fd_fence_get_fd(struct pipe_screen *pscreen, struct pipe_fence_handle *fence) { - /* NOTE: in the deferred fence case, the pctx we want is the threaded-ctx - * but if TC is not used, this will be null. Which is fine, we won't call - * threaded_context_flush() in that case - */ - fence_flush(&fence->ctx->tc->base, fence, PIPE_TIMEOUT_INFINITE); - return os_dupfd_cloexec(fence->fence_fd); + /* NOTE: in the deferred fence case, the pctx we want is the threaded-ctx + * but if TC is not used, this will be null. 
Which is fine, we won't call + * threaded_context_flush() in that case + */ + fence_flush(&fence->ctx->tc->base, fence, PIPE_TIMEOUT_INFINITE); + return os_dupfd_cloexec(fence->fence_fd); } -bool fd_fence_is_fd(struct pipe_fence_handle *fence) +bool +fd_fence_is_fd(struct pipe_fence_handle *fence) { - return fence->fence_fd != -1; + return fence->fence_fd != -1; } -struct pipe_fence_handle * fd_fence_create(struct fd_batch *batch) +struct pipe_fence_handle * +fd_fence_create(struct fd_batch *batch) { - return fence_create(batch->ctx, batch, 0, -1, 0); + return fence_create(batch->ctx, batch, 0, -1, 0); } void fd_fence_set_batch(struct pipe_fence_handle *fence, struct fd_batch *batch) { - assert(!fence->batch); - fence->batch = batch; + assert(!fence->batch); + fence->batch = batch; } struct pipe_fence_handle * fd_fence_create_unflushed(struct pipe_context *pctx, - struct tc_unflushed_batch_token *tc_token) + struct tc_unflushed_batch_token *tc_token) { - struct pipe_fence_handle *fence = - fence_create(fd_context(pctx), NULL, 0, -1, 0); - fence->needs_signal = true; - util_queue_fence_reset(&fence->ready); - tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); - return fence; + struct pipe_fence_handle *fence = + fence_create(fd_context(pctx), NULL, 0, -1, 0); + fence->needs_signal = true; + util_queue_fence_reset(&fence->ready); + tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); + return fence; } diff --git a/src/gallium/drivers/freedreno/freedreno_fence.h b/src/gallium/drivers/freedreno/freedreno_fence.h index a37678a..7539996 100644 --- a/src/gallium/drivers/freedreno/freedreno_fence.h +++ b/src/gallium/drivers/freedreno/freedreno_fence.h @@ -29,33 +29,32 @@ #include "pipe/p_context.h" -void fd_fence_populate(struct pipe_fence_handle *fence, - uint32_t timestamp, int fence_fd); +void fd_fence_populate(struct pipe_fence_handle *fence, uint32_t timestamp, + int fence_fd); void fd_fence_ref(struct pipe_fence_handle **ptr, - struct pipe_fence_handle *pfence); -bool fd_fence_finish(struct pipe_screen *pscreen, - struct pipe_context *ctx, - struct pipe_fence_handle *pfence, - uint64_t timeout); + struct pipe_fence_handle *pfence); +bool fd_fence_finish(struct pipe_screen *pscreen, struct pipe_context *ctx, + struct pipe_fence_handle *pfence, uint64_t timeout); void fd_create_fence_fd(struct pipe_context *pctx, - struct pipe_fence_handle **pfence, int fd, - enum pipe_fd_type type); + struct pipe_fence_handle **pfence, int fd, + enum pipe_fd_type type); void fd_fence_server_sync(struct pipe_context *pctx, - struct pipe_fence_handle *fence); + struct pipe_fence_handle *fence); void fd_fence_server_signal(struct pipe_context *ctx, - struct pipe_fence_handle *fence); + struct pipe_fence_handle *fence); int fd_fence_get_fd(struct pipe_screen *pscreen, - struct pipe_fence_handle *pfence); + struct pipe_fence_handle *pfence); bool fd_fence_is_fd(struct pipe_fence_handle *fence); struct fd_batch; -struct pipe_fence_handle * fd_fence_create(struct fd_batch *batch); +struct pipe_fence_handle *fd_fence_create(struct fd_batch *batch); - -void fd_fence_set_batch(struct pipe_fence_handle *fence, struct fd_batch *batch); +void fd_fence_set_batch(struct pipe_fence_handle *fence, + struct fd_batch *batch); struct tc_unflushed_batch_token; -struct pipe_fence_handle *fd_fence_create_unflushed(struct pipe_context *pctx, - struct tc_unflushed_batch_token *tc_token); +struct pipe_fence_handle * +fd_fence_create_unflushed(struct pipe_context *pctx, + struct tc_unflushed_batch_token 
*tc_token); #endif /* FREEDRENO_FENCE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c index 65f52cf..8747c92 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.c +++ b/src/gallium/drivers/freedreno/freedreno_gmem.c @@ -24,21 +24,21 @@ * Rob Clark */ -#include "util/debug.h" #include "pipe/p_state.h" +#include "util/debug.h" +#include "util/format/u_format.h" #include "util/hash_table.h" #include "util/u_dump.h" -#include "util/u_string.h" -#include "util/u_memory.h" #include "util/u_inlines.h" -#include "util/format/u_format.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "u_tracepoints.h" -#include "freedreno_gmem.h" #include "freedreno_context.h" #include "freedreno_fence.h" -#include "freedreno_resource.h" +#include "freedreno_gmem.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" #include "freedreno_tracepoints.h" #include "freedreno_util.h" @@ -72,7 +72,7 @@ */ #ifndef BIN_DEBUG -# define BIN_DEBUG 0 +#define BIN_DEBUG 0 #endif /* @@ -90,482 +90,487 @@ */ struct gmem_key { - uint16_t minx, miny; - uint16_t width, height; - uint8_t gmem_page_align; /* alignment in multiples of 0x1000 to reduce key size */ - uint8_t nr_cbufs; - uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; - uint8_t zsbuf_cpp[2]; + uint16_t minx, miny; + uint16_t width, height; + uint8_t + gmem_page_align; /* alignment in multiples of 0x1000 to reduce key size */ + uint8_t nr_cbufs; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; + uint8_t zsbuf_cpp[2]; }; static uint32_t gmem_key_hash(const void *_key) { - const struct gmem_key *key = _key; - return _mesa_hash_data(key, sizeof(*key)); + const struct gmem_key *key = _key; + return _mesa_hash_data(key, sizeof(*key)); } static bool gmem_key_equals(const void *_a, const void *_b) { - const struct gmem_key *a = _a; - const struct gmem_key *b = _b; - return memcmp(a, b, sizeof(*a)) == 0; + const struct gmem_key *a = _a; + const struct gmem_key *b = _b; + return memcmp(a, b, sizeof(*a)) == 0; } static void dump_gmem_key(const struct gmem_key *key) { - printf("{ .minx=%u, .miny=%u, .width=%u, .height=%u", - key->minx, key->miny, key->width, key->height); - printf(", .gmem_page_align=%u, .nr_cbufs=%u", - key->gmem_page_align, key->nr_cbufs); - printf(", .cbuf_cpp = {"); - for (unsigned i = 0; i < ARRAY_SIZE(key->cbuf_cpp); i++) - printf("%u,", key->cbuf_cpp[i]); - printf("}, .zsbuf_cpp = {"); - for (unsigned i = 0; i < ARRAY_SIZE(key->zsbuf_cpp); i++) - printf("%u,", key->zsbuf_cpp[i]); - printf("}},\n"); + printf("{ .minx=%u, .miny=%u, .width=%u, .height=%u", key->minx, key->miny, + key->width, key->height); + printf(", .gmem_page_align=%u, .nr_cbufs=%u", key->gmem_page_align, + key->nr_cbufs); + printf(", .cbuf_cpp = {"); + for (unsigned i = 0; i < ARRAY_SIZE(key->cbuf_cpp); i++) + printf("%u,", key->cbuf_cpp[i]); + printf("}, .zsbuf_cpp = {"); + for (unsigned i = 0; i < ARRAY_SIZE(key->zsbuf_cpp); i++) + printf("%u,", key->zsbuf_cpp[i]); + printf("}},\n"); } static void dump_gmem_state(const struct fd_gmem_stateobj *gmem) { - unsigned total = 0; - printf("GMEM LAYOUT: bin=%ux%u, nbins=%ux%u\n", - gmem->bin_w, gmem->bin_h, gmem->nbins_x, gmem->nbins_y); - for (int i = 0; i < ARRAY_SIZE(gmem->cbuf_base); i++) { - if (!gmem->cbuf_cpp[i]) - continue; + unsigned total = 0; + printf("GMEM LAYOUT: bin=%ux%u, nbins=%ux%u\n", gmem->bin_w, gmem->bin_h, + gmem->nbins_x, gmem->nbins_y); + for (int i = 0; i < ARRAY_SIZE(gmem->cbuf_base); i++) { + if (!gmem->cbuf_cpp[i]) + continue; - unsigned 
size = gmem->cbuf_cpp[i] * gmem->bin_w * gmem->bin_h; - printf(" cbuf[%d]: base=0x%06x, size=0x%x, cpp=%u\n", i, - gmem->cbuf_base[i], size, gmem->cbuf_cpp[i]); + unsigned size = gmem->cbuf_cpp[i] * gmem->bin_w * gmem->bin_h; + printf(" cbuf[%d]: base=0x%06x, size=0x%x, cpp=%u\n", i, + gmem->cbuf_base[i], size, gmem->cbuf_cpp[i]); - total = gmem->cbuf_base[i] + size; - } + total = gmem->cbuf_base[i] + size; + } - for (int i = 0; i < ARRAY_SIZE(gmem->zsbuf_base); i++) { - if (!gmem->zsbuf_cpp[i]) - continue; + for (int i = 0; i < ARRAY_SIZE(gmem->zsbuf_base); i++) { + if (!gmem->zsbuf_cpp[i]) + continue; - unsigned size = gmem->zsbuf_cpp[i] * gmem->bin_w * gmem->bin_h; - printf(" zsbuf[%d]: base=0x%06x, size=0x%x, cpp=%u\n", i, - gmem->zsbuf_base[i], size, gmem->zsbuf_cpp[i]); + unsigned size = gmem->zsbuf_cpp[i] * gmem->bin_w * gmem->bin_h; + printf(" zsbuf[%d]: base=0x%06x, size=0x%x, cpp=%u\n", i, + gmem->zsbuf_base[i], size, gmem->zsbuf_cpp[i]); - total = gmem->zsbuf_base[i] + size; - } + total = gmem->zsbuf_base[i] + size; + } - printf("total: 0x%06x (of 0x%06x)\n", total, - gmem->screen->gmemsize_bytes); + printf("total: 0x%06x (of 0x%06x)\n", total, gmem->screen->gmemsize_bytes); } static unsigned div_align(unsigned num, unsigned denom, unsigned al) { - return util_align_npot(DIV_ROUND_UP(num, denom), al); + return util_align_npot(DIV_ROUND_UP(num, denom), al); } static bool layout_gmem(struct gmem_key *key, uint32_t nbins_x, uint32_t nbins_y, - struct fd_gmem_stateobj *gmem) + struct fd_gmem_stateobj *gmem) { - struct fd_screen *screen = gmem->screen; - uint32_t gmem_align = key->gmem_page_align * 0x1000; - uint32_t total = 0, i; - - if ((nbins_x == 0) || (nbins_y == 0)) - return false; - - uint32_t bin_w, bin_h; - bin_w = div_align(key->width, nbins_x, screen->info.tile_align_w); - bin_h = div_align(key->height, nbins_y, screen->info.tile_align_h); - - if (bin_w > screen->info.tile_max_w) - return false; - - if (bin_h > screen->info.tile_max_h) - return false; - - gmem->bin_w = bin_w; - gmem->bin_h = bin_h; - - /* due to aligning bin_w/h, we could end up with one too - * many bins in either dimension, so recalculate: - */ - gmem->nbins_x = DIV_ROUND_UP(key->width, bin_w); - gmem->nbins_y = DIV_ROUND_UP(key->height, bin_h); - - for (i = 0; i < MAX_RENDER_TARGETS; i++) { - if (key->cbuf_cpp[i]) { - gmem->cbuf_base[i] = util_align_npot(total, gmem_align); - total = gmem->cbuf_base[i] + key->cbuf_cpp[i] * bin_w * bin_h; - } - } - - if (key->zsbuf_cpp[0]) { - gmem->zsbuf_base[0] = util_align_npot(total, gmem_align); - total = gmem->zsbuf_base[0] + key->zsbuf_cpp[0] * bin_w * bin_h; - } - - if (key->zsbuf_cpp[1]) { - gmem->zsbuf_base[1] = util_align_npot(total, gmem_align); - total = gmem->zsbuf_base[1] + key->zsbuf_cpp[1] * bin_w * bin_h; - } - - return total <= screen->gmemsize_bytes; + struct fd_screen *screen = gmem->screen; + uint32_t gmem_align = key->gmem_page_align * 0x1000; + uint32_t total = 0, i; + + if ((nbins_x == 0) || (nbins_y == 0)) + return false; + + uint32_t bin_w, bin_h; + bin_w = div_align(key->width, nbins_x, screen->info.tile_align_w); + bin_h = div_align(key->height, nbins_y, screen->info.tile_align_h); + + if (bin_w > screen->info.tile_max_w) + return false; + + if (bin_h > screen->info.tile_max_h) + return false; + + gmem->bin_w = bin_w; + gmem->bin_h = bin_h; + + /* due to aligning bin_w/h, we could end up with one too + * many bins in either dimension, so recalculate: + */ + gmem->nbins_x = DIV_ROUND_UP(key->width, bin_w); + gmem->nbins_y = 
DIV_ROUND_UP(key->height, bin_h); + + for (i = 0; i < MAX_RENDER_TARGETS; i++) { + if (key->cbuf_cpp[i]) { + gmem->cbuf_base[i] = util_align_npot(total, gmem_align); + total = gmem->cbuf_base[i] + key->cbuf_cpp[i] * bin_w * bin_h; + } + } + + if (key->zsbuf_cpp[0]) { + gmem->zsbuf_base[0] = util_align_npot(total, gmem_align); + total = gmem->zsbuf_base[0] + key->zsbuf_cpp[0] * bin_w * bin_h; + } + + if (key->zsbuf_cpp[1]) { + gmem->zsbuf_base[1] = util_align_npot(total, gmem_align); + total = gmem->zsbuf_base[1] + key->zsbuf_cpp[1] * bin_w * bin_h; + } + + return total <= screen->gmemsize_bytes; } static void calc_nbins(struct gmem_key *key, struct fd_gmem_stateobj *gmem) { - struct fd_screen *screen = gmem->screen; - uint32_t nbins_x = 1, nbins_y = 1; - uint32_t max_width = screen->info.tile_max_w; - uint32_t max_height = screen->info.tile_max_h; - - if (FD_DBG(MSGS)) { - debug_printf("binning input: cbuf cpp:"); - for (unsigned i = 0; i < key->nr_cbufs; i++) - debug_printf(" %d", key->cbuf_cpp[i]); - debug_printf(", zsbuf cpp: %d; %dx%d\n", - key->zsbuf_cpp[0], key->width, key->height); - } - - /* first, find a bin size that satisfies the maximum width/ - * height restrictions: - */ - while (div_align(key->width, nbins_x, screen->info.tile_align_w) > max_width) { - nbins_x++; - } - - while (div_align(key->height, nbins_y, screen->info.tile_align_h) > max_height) { - nbins_y++; - } - - /* then find a bin width/height that satisfies the memory - * constraints: - */ - while (!layout_gmem(key, nbins_x, nbins_y, gmem)) { - if (nbins_y > nbins_x) { - nbins_x++; - } else { - nbins_y++; - } - } - - /* Lets see if we can tweak the layout a bit and come up with - * something better: - */ - if ((((nbins_x - 1) * (nbins_y + 1)) < (nbins_x * nbins_y)) && - layout_gmem(key, nbins_x - 1, nbins_y + 1, gmem)) { - nbins_x--; - nbins_y++; - } else if ((((nbins_x + 1) * (nbins_y - 1)) < (nbins_x * nbins_y)) && - layout_gmem(key, nbins_x + 1, nbins_y - 1, gmem)) { - nbins_x++; - nbins_y--; - } - - layout_gmem(key, nbins_x, nbins_y, gmem); + struct fd_screen *screen = gmem->screen; + uint32_t nbins_x = 1, nbins_y = 1; + uint32_t max_width = screen->info.tile_max_w; + uint32_t max_height = screen->info.tile_max_h; + + if (FD_DBG(MSGS)) { + debug_printf("binning input: cbuf cpp:"); + for (unsigned i = 0; i < key->nr_cbufs; i++) + debug_printf(" %d", key->cbuf_cpp[i]); + debug_printf(", zsbuf cpp: %d; %dx%d\n", key->zsbuf_cpp[0], key->width, + key->height); + } + + /* first, find a bin size that satisfies the maximum width/ + * height restrictions: + */ + while (div_align(key->width, nbins_x, screen->info.tile_align_w) > + max_width) { + nbins_x++; + } + + while (div_align(key->height, nbins_y, screen->info.tile_align_h) > + max_height) { + nbins_y++; + } + + /* then find a bin width/height that satisfies the memory + * constraints: + */ + while (!layout_gmem(key, nbins_x, nbins_y, gmem)) { + if (nbins_y > nbins_x) { + nbins_x++; + } else { + nbins_y++; + } + } + + /* Lets see if we can tweak the layout a bit and come up with + * something better: + */ + if ((((nbins_x - 1) * (nbins_y + 1)) < (nbins_x * nbins_y)) && + layout_gmem(key, nbins_x - 1, nbins_y + 1, gmem)) { + nbins_x--; + nbins_y++; + } else if ((((nbins_x + 1) * (nbins_y - 1)) < (nbins_x * nbins_y)) && + layout_gmem(key, nbins_x + 1, nbins_y - 1, gmem)) { + nbins_x++; + nbins_y--; + } + + layout_gmem(key, nbins_x, nbins_y, gmem); } static struct fd_gmem_stateobj * gmem_stateobj_init(struct fd_screen *screen, struct gmem_key *key) { - struct 
fd_gmem_stateobj *gmem = - rzalloc(screen->gmem_cache.ht, struct fd_gmem_stateobj); - pipe_reference_init(&gmem->reference, 1); - gmem->screen = screen; - gmem->key = key; - list_inithead(&gmem->node); - - const unsigned npipes = screen->info.num_vsc_pipes; - uint32_t i, j, t, xoff, yoff; - uint32_t tpp_x, tpp_y; - int tile_n[npipes]; - - calc_nbins(key, gmem); - - DBG("using %d bins of size %dx%d", gmem->nbins_x * gmem->nbins_y, - gmem->bin_w, gmem->bin_h); - - memcpy(gmem->cbuf_cpp, key->cbuf_cpp, sizeof(key->cbuf_cpp)); - memcpy(gmem->zsbuf_cpp, key->zsbuf_cpp, sizeof(key->zsbuf_cpp)); - gmem->minx = key->minx; - gmem->miny = key->miny; - gmem->width = key->width; - gmem->height = key->height; - - if (BIN_DEBUG) { - dump_gmem_state(gmem); - dump_gmem_key(key); - } - - /* - * Assign tiles and pipes: - * - * At some point it might be worth playing with different - * strategies and seeing if that makes much impact on - * performance. - */ - -#define div_round_up(v, a) (((v) + (a) - 1) / (a)) - /* figure out number of tiles per pipe: */ - if (is_a20x(screen)) { - /* for a20x we want to minimize the number of "pipes" - * binning data has 3 bits for x/y (8x8) but the edges are used to - * cull off-screen vertices with hw binning, so we have 6x6 pipes - */ - tpp_x = 6; - tpp_y = 6; - } else { - tpp_x = tpp_y = 1; - while (div_round_up(gmem->nbins_y, tpp_y) > npipes) - tpp_y += 2; - while ((div_round_up(gmem->nbins_y, tpp_y) * - div_round_up(gmem->nbins_x, tpp_x)) > npipes) - tpp_x += 1; - } + struct fd_gmem_stateobj *gmem = + rzalloc(screen->gmem_cache.ht, struct fd_gmem_stateobj); + pipe_reference_init(&gmem->reference, 1); + gmem->screen = screen; + gmem->key = key; + list_inithead(&gmem->node); + + const unsigned npipes = screen->info.num_vsc_pipes; + uint32_t i, j, t, xoff, yoff; + uint32_t tpp_x, tpp_y; + int tile_n[npipes]; + + calc_nbins(key, gmem); + + DBG("using %d bins of size %dx%d", gmem->nbins_x * gmem->nbins_y, + gmem->bin_w, gmem->bin_h); + + memcpy(gmem->cbuf_cpp, key->cbuf_cpp, sizeof(key->cbuf_cpp)); + memcpy(gmem->zsbuf_cpp, key->zsbuf_cpp, sizeof(key->zsbuf_cpp)); + gmem->minx = key->minx; + gmem->miny = key->miny; + gmem->width = key->width; + gmem->height = key->height; + + if (BIN_DEBUG) { + dump_gmem_state(gmem); + dump_gmem_key(key); + } + + /* + * Assign tiles and pipes: + * + * At some point it might be worth playing with different + * strategies and seeing if that makes much impact on + * performance. 
+ */ + +#define div_round_up(v, a) (((v) + (a)-1) / (a)) + /* figure out number of tiles per pipe: */ + if (is_a20x(screen)) { + /* for a20x we want to minimize the number of "pipes" + * binning data has 3 bits for x/y (8x8) but the edges are used to + * cull off-screen vertices with hw binning, so we have 6x6 pipes + */ + tpp_x = 6; + tpp_y = 6; + } else { + tpp_x = tpp_y = 1; + while (div_round_up(gmem->nbins_y, tpp_y) > npipes) + tpp_y += 2; + while ((div_round_up(gmem->nbins_y, tpp_y) * + div_round_up(gmem->nbins_x, tpp_x)) > npipes) + tpp_x += 1; + } #ifdef DEBUG - tpp_x = env_var_as_unsigned("TPP_X", tpp_x); - tpp_y = env_var_as_unsigned("TPP_Y", tpp_x); + tpp_x = env_var_as_unsigned("TPP_X", tpp_x); + tpp_y = env_var_as_unsigned("TPP_Y", tpp_x); #endif - gmem->maxpw = tpp_x; - gmem->maxph = tpp_y; - - /* configure pipes: */ - xoff = yoff = 0; - for (i = 0; i < npipes; i++) { - struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - - if (xoff >= gmem->nbins_x) { - xoff = 0; - yoff += tpp_y; - } - - if (yoff >= gmem->nbins_y) { - break; - } - - pipe->x = xoff; - pipe->y = yoff; - pipe->w = MIN2(tpp_x, gmem->nbins_x - xoff); - pipe->h = MIN2(tpp_y, gmem->nbins_y - yoff); - - xoff += tpp_x; - } - - /* number of pipes to use for a20x */ - gmem->num_vsc_pipes = MAX2(1, i); - - for (; i < npipes; i++) { - struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - pipe->x = pipe->y = pipe->w = pipe->h = 0; - } - - if (BIN_DEBUG) { - printf("%dx%d ... tpp=%dx%d\n", gmem->nbins_x, gmem->nbins_y, tpp_x, tpp_y); - for (i = 0; i < ARRAY_SIZE(gmem->vsc_pipe); i++) { - struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; - printf("pipe[%d]: %ux%u @ %u,%u\n", i, - pipe->w, pipe->h, pipe->x, pipe->y); - } - } - - /* configure tiles: */ - t = 0; - yoff = key->miny; - memset(tile_n, 0, sizeof(tile_n)); - for (i = 0; i < gmem->nbins_y; i++) { - int bw, bh; - - xoff = key->minx; - - /* clip bin height: */ - bh = MIN2(gmem->bin_h, key->miny + key->height - yoff); - assert(bh > 0); - - for (j = 0; j < gmem->nbins_x; j++) { - struct fd_tile *tile = &gmem->tile[t]; - uint32_t p; - - assert(t < ARRAY_SIZE(gmem->tile)); - - /* pipe number: */ - p = ((i / tpp_y) * div_round_up(gmem->nbins_x, tpp_x)) + (j / tpp_x); - assert(p < gmem->num_vsc_pipes); - - /* clip bin width: */ - bw = MIN2(gmem->bin_w, key->minx + key->width - xoff); - assert(bw > 0); - - tile->n = !is_a20x(screen) ? 
tile_n[p]++ : - ((i % tpp_y + 1) << 3 | (j % tpp_x + 1)); - tile->p = p; - tile->bin_w = bw; - tile->bin_h = bh; - tile->xoff = xoff; - tile->yoff = yoff; - - if (BIN_DEBUG) { - printf("tile[%d]: p=%u, bin=%ux%u+%u+%u\n", t, - p, bw, bh, xoff, yoff); - } - - t++; - - xoff += bw; - } - - yoff += bh; - } - - if (BIN_DEBUG) { - t = 0; - for (i = 0; i < gmem->nbins_y; i++) { - for (j = 0; j < gmem->nbins_x; j++) { - struct fd_tile *tile = &gmem->tile[t++]; - printf("|p:%u n:%u|", tile->p, tile->n); - } - printf("\n"); - } - } - - return gmem; + gmem->maxpw = tpp_x; + gmem->maxph = tpp_y; + + /* configure pipes: */ + xoff = yoff = 0; + for (i = 0; i < npipes; i++) { + struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + + if (xoff >= gmem->nbins_x) { + xoff = 0; + yoff += tpp_y; + } + + if (yoff >= gmem->nbins_y) { + break; + } + + pipe->x = xoff; + pipe->y = yoff; + pipe->w = MIN2(tpp_x, gmem->nbins_x - xoff); + pipe->h = MIN2(tpp_y, gmem->nbins_y - yoff); + + xoff += tpp_x; + } + + /* number of pipes to use for a20x */ + gmem->num_vsc_pipes = MAX2(1, i); + + for (; i < npipes; i++) { + struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + pipe->x = pipe->y = pipe->w = pipe->h = 0; + } + + if (BIN_DEBUG) { + printf("%dx%d ... tpp=%dx%d\n", gmem->nbins_x, gmem->nbins_y, tpp_x, + tpp_y); + for (i = 0; i < ARRAY_SIZE(gmem->vsc_pipe); i++) { + struct fd_vsc_pipe *pipe = &gmem->vsc_pipe[i]; + printf("pipe[%d]: %ux%u @ %u,%u\n", i, pipe->w, pipe->h, pipe->x, + pipe->y); + } + } + + /* configure tiles: */ + t = 0; + yoff = key->miny; + memset(tile_n, 0, sizeof(tile_n)); + for (i = 0; i < gmem->nbins_y; i++) { + int bw, bh; + + xoff = key->minx; + + /* clip bin height: */ + bh = MIN2(gmem->bin_h, key->miny + key->height - yoff); + assert(bh > 0); + + for (j = 0; j < gmem->nbins_x; j++) { + struct fd_tile *tile = &gmem->tile[t]; + uint32_t p; + + assert(t < ARRAY_SIZE(gmem->tile)); + + /* pipe number: */ + p = ((i / tpp_y) * div_round_up(gmem->nbins_x, tpp_x)) + (j / tpp_x); + assert(p < gmem->num_vsc_pipes); + + /* clip bin width: */ + bw = MIN2(gmem->bin_w, key->minx + key->width - xoff); + assert(bw > 0); + + tile->n = !is_a20x(screen) ? 
tile_n[p]++ + : ((i % tpp_y + 1) << 3 | (j % tpp_x + 1)); + tile->p = p; + tile->bin_w = bw; + tile->bin_h = bh; + tile->xoff = xoff; + tile->yoff = yoff; + + if (BIN_DEBUG) { + printf("tile[%d]: p=%u, bin=%ux%u+%u+%u\n", t, p, bw, bh, xoff, + yoff); + } + + t++; + + xoff += bw; + } + + yoff += bh; + } + + if (BIN_DEBUG) { + t = 0; + for (i = 0; i < gmem->nbins_y; i++) { + for (j = 0; j < gmem->nbins_x; j++) { + struct fd_tile *tile = &gmem->tile[t++]; + printf("|p:%u n:%u|", tile->p, tile->n); + } + printf("\n"); + } + } + + return gmem; } void __fd_gmem_destroy(struct fd_gmem_stateobj *gmem) { - struct fd_gmem_cache *cache = &gmem->screen->gmem_cache; + struct fd_gmem_cache *cache = &gmem->screen->gmem_cache; - fd_screen_assert_locked(gmem->screen); + fd_screen_assert_locked(gmem->screen); - _mesa_hash_table_remove_key(cache->ht, gmem->key); - list_del(&gmem->node); + _mesa_hash_table_remove_key(cache->ht, gmem->key); + list_del(&gmem->node); - ralloc_free(gmem->key); - ralloc_free(gmem); + ralloc_free(gmem->key); + ralloc_free(gmem); } static struct gmem_key * gmem_key_init(struct fd_batch *batch, bool assume_zs, bool no_scis_opt) { - struct fd_screen *screen = batch->ctx->screen; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - bool has_zs = pfb->zsbuf && !!(batch->gmem_reason & (FD_GMEM_DEPTH_ENABLED | - FD_GMEM_STENCIL_ENABLED | FD_GMEM_CLEARS_DEPTH_STENCIL)); - struct gmem_key *key = rzalloc(screen->gmem_cache.ht, struct gmem_key); - - if (has_zs || assume_zs) { - struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); - key->zsbuf_cpp[0] = rsc->layout.cpp; - if (rsc->stencil) - key->zsbuf_cpp[1] = rsc->stencil->layout.cpp; - } else { - /* we might have a zsbuf, but it isn't used */ - batch->restore &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); - batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); - } - - key->nr_cbufs = pfb->nr_cbufs; - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - if (pfb->cbufs[i]) - key->cbuf_cpp[i] = util_format_get_blocksize(pfb->cbufs[i]->format); - else - key->cbuf_cpp[i] = 4; - /* if MSAA, color buffers are super-sampled in GMEM: */ - key->cbuf_cpp[i] *= pfb->samples; - } - - /* NOTE: on a6xx, the max-scissor-rect is handled in fd6_gmem, and - * we just rely on CP_COND_EXEC to skip bins with no geometry. - */ - if (no_scis_opt || is_a6xx(screen)) { - key->minx = 0; - key->miny = 0; - key->width = pfb->width; - key->height = pfb->height; - } else { - struct pipe_scissor_state *scissor = &batch->max_scissor; - - if (FD_DBG(NOSCIS)) { - scissor->minx = 0; - scissor->miny = 0; - scissor->maxx = pfb->width; - scissor->maxy = pfb->height; - } - - /* round down to multiple of alignment: */ - key->minx = scissor->minx & ~(screen->info.gmem_align_w - 1); - key->miny = scissor->miny & ~(screen->info.gmem_align_h - 1); - key->width = scissor->maxx - key->minx; - key->height = scissor->maxy - key->miny; - } - - if (is_a20x(screen) && batch->cleared) { - /* under normal circumstances the requirement would be 4K - * but the fast clear path requires an alignment of 32K - */ - key->gmem_page_align = 8; - } else if (is_a6xx(screen)) { - key->gmem_page_align = is_a650(screen) ? 3 : 1; - } else { - // TODO re-check this across gens.. 
maybe it should only - // be a single page in some cases: - key->gmem_page_align = 4; - } - - return key; + struct fd_screen *screen = batch->ctx->screen; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + bool has_zs = + pfb->zsbuf && + !!(batch->gmem_reason & (FD_GMEM_DEPTH_ENABLED | FD_GMEM_STENCIL_ENABLED | + FD_GMEM_CLEARS_DEPTH_STENCIL)); + struct gmem_key *key = rzalloc(screen->gmem_cache.ht, struct gmem_key); + + if (has_zs || assume_zs) { + struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture); + key->zsbuf_cpp[0] = rsc->layout.cpp; + if (rsc->stencil) + key->zsbuf_cpp[1] = rsc->stencil->layout.cpp; + } else { + /* we might have a zsbuf, but it isn't used */ + batch->restore &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); + batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); + } + + key->nr_cbufs = pfb->nr_cbufs; + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (pfb->cbufs[i]) + key->cbuf_cpp[i] = util_format_get_blocksize(pfb->cbufs[i]->format); + else + key->cbuf_cpp[i] = 4; + /* if MSAA, color buffers are super-sampled in GMEM: */ + key->cbuf_cpp[i] *= pfb->samples; + } + + /* NOTE: on a6xx, the max-scissor-rect is handled in fd6_gmem, and + * we just rely on CP_COND_EXEC to skip bins with no geometry. + */ + if (no_scis_opt || is_a6xx(screen)) { + key->minx = 0; + key->miny = 0; + key->width = pfb->width; + key->height = pfb->height; + } else { + struct pipe_scissor_state *scissor = &batch->max_scissor; + + if (FD_DBG(NOSCIS)) { + scissor->minx = 0; + scissor->miny = 0; + scissor->maxx = pfb->width; + scissor->maxy = pfb->height; + } + + /* round down to multiple of alignment: */ + key->minx = scissor->minx & ~(screen->info.gmem_align_w - 1); + key->miny = scissor->miny & ~(screen->info.gmem_align_h - 1); + key->width = scissor->maxx - key->minx; + key->height = scissor->maxy - key->miny; + } + + if (is_a20x(screen) && batch->cleared) { + /* under normal circumstances the requirement would be 4K + * but the fast clear path requires an alignment of 32K + */ + key->gmem_page_align = 8; + } else if (is_a6xx(screen)) { + key->gmem_page_align = is_a650(screen) ? 3 : 1; + } else { + // TODO re-check this across gens.. maybe it should only + // be a single page in some cases: + key->gmem_page_align = 4; + } + + return key; } static struct fd_gmem_stateobj * lookup_gmem_state(struct fd_batch *batch, bool assume_zs, bool no_scis_opt) { - struct fd_screen *screen = batch->ctx->screen; - struct fd_gmem_cache *cache = &screen->gmem_cache; - struct fd_gmem_stateobj *gmem = NULL; - - /* Lock before allocating gmem_key, since that a screen-wide - * ralloc pool and ralloc itself is not thread-safe. 
- */ - fd_screen_lock(screen); - - struct gmem_key *key = gmem_key_init(batch, assume_zs, no_scis_opt); - uint32_t hash = gmem_key_hash(key); - - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); - if (entry) { - ralloc_free(key); - goto found; - } - - /* limit the # of cached gmem states, discarding the least - * recently used state if needed: - */ - if (cache->ht->entries >= 20) { - struct fd_gmem_stateobj *last = - list_last_entry(&cache->lru, struct fd_gmem_stateobj, node); - fd_gmem_reference(&last, NULL); - } - - entry = _mesa_hash_table_insert_pre_hashed(cache->ht, - hash, key, gmem_stateobj_init(screen, key)); + struct fd_screen *screen = batch->ctx->screen; + struct fd_gmem_cache *cache = &screen->gmem_cache; + struct fd_gmem_stateobj *gmem = NULL; + + /* Lock before allocating gmem_key, since that a screen-wide + * ralloc pool and ralloc itself is not thread-safe. + */ + fd_screen_lock(screen); + + struct gmem_key *key = gmem_key_init(batch, assume_zs, no_scis_opt); + uint32_t hash = gmem_key_hash(key); + + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); + if (entry) { + ralloc_free(key); + goto found; + } + + /* limit the # of cached gmem states, discarding the least + * recently used state if needed: + */ + if (cache->ht->entries >= 20) { + struct fd_gmem_stateobj *last = + list_last_entry(&cache->lru, struct fd_gmem_stateobj, node); + fd_gmem_reference(&last, NULL); + } + + entry = _mesa_hash_table_insert_pre_hashed(cache->ht, hash, key, + gmem_stateobj_init(screen, key)); found: - fd_gmem_reference(&gmem, entry->data); - /* Move to the head of the LRU: */ - list_delinit(&gmem->node); - list_add(&gmem->node, &cache->lru); + fd_gmem_reference(&gmem, entry->data); + /* Move to the head of the LRU: */ + list_delinit(&gmem->node); + list_add(&gmem->node, &cache->lru); - fd_screen_unlock(screen); + fd_screen_unlock(screen); - return gmem; + return gmem; } /* @@ -573,178 +578,175 @@ found: */ static void -render_tiles(struct fd_batch *batch, struct fd_gmem_stateobj *gmem) - assert_dt +render_tiles(struct fd_batch *batch, struct fd_gmem_stateobj *gmem) assert_dt { - struct fd_context *ctx = batch->ctx; - int i; + struct fd_context *ctx = batch->ctx; + int i; - simple_mtx_lock(&ctx->gmem_lock); + simple_mtx_lock(&ctx->gmem_lock); - ctx->emit_tile_init(batch); + ctx->emit_tile_init(batch); - if (batch->restore) - ctx->stats.batch_restore++; + if (batch->restore) + ctx->stats.batch_restore++; - for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) { - struct fd_tile *tile = &gmem->tile[i]; + for (i = 0; i < (gmem->nbins_x * gmem->nbins_y); i++) { + struct fd_tile *tile = &gmem->tile[i]; - trace_start_tile(&batch->trace, tile->bin_h, - tile->yoff, tile->bin_w, tile->xoff); + trace_start_tile(&batch->trace, tile->bin_h, tile->yoff, tile->bin_w, + tile->xoff); - ctx->emit_tile_prep(batch, tile); + ctx->emit_tile_prep(batch, tile); - if (batch->restore) { - ctx->emit_tile_mem2gmem(batch, tile); - } + if (batch->restore) { + ctx->emit_tile_mem2gmem(batch, tile); + } - ctx->emit_tile_renderprep(batch, tile); + ctx->emit_tile_renderprep(batch, tile); - if (ctx->query_prepare_tile) - ctx->query_prepare_tile(batch, i, batch->gmem); + if (ctx->query_prepare_tile) + ctx->query_prepare_tile(batch, i, batch->gmem); - /* emit IB to drawcmds: */ - trace_start_draw_ib(&batch->trace); - if (ctx->emit_tile) { - ctx->emit_tile(batch, tile); - } else { - ctx->screen->emit_ib(batch->gmem, batch->draw); - } - 
trace_end_draw_ib(&batch->trace); - fd_reset_wfi(batch); + /* emit IB to drawcmds: */ + trace_start_draw_ib(&batch->trace); + if (ctx->emit_tile) { + ctx->emit_tile(batch, tile); + } else { + ctx->screen->emit_ib(batch->gmem, batch->draw); + } + trace_end_draw_ib(&batch->trace); + fd_reset_wfi(batch); - /* emit gmem2mem to transfer tile back to system memory: */ - ctx->emit_tile_gmem2mem(batch, tile); - } + /* emit gmem2mem to transfer tile back to system memory: */ + ctx->emit_tile_gmem2mem(batch, tile); + } - if (ctx->emit_tile_fini) - ctx->emit_tile_fini(batch); + if (ctx->emit_tile_fini) + ctx->emit_tile_fini(batch); - simple_mtx_unlock(&ctx->gmem_lock); + simple_mtx_unlock(&ctx->gmem_lock); } static void -render_sysmem(struct fd_batch *batch) - assert_dt +render_sysmem(struct fd_batch *batch) assert_dt { - struct fd_context *ctx = batch->ctx; + struct fd_context *ctx = batch->ctx; - ctx->emit_sysmem_prep(batch); + ctx->emit_sysmem_prep(batch); - if (ctx->query_prepare_tile) - ctx->query_prepare_tile(batch, 0, batch->gmem); + if (ctx->query_prepare_tile) + ctx->query_prepare_tile(batch, 0, batch->gmem); - if (!batch->nondraw) { - trace_start_draw_ib(&batch->trace); - } - /* emit IB to drawcmds: */ - ctx->screen->emit_ib(batch->gmem, batch->draw); + if (!batch->nondraw) { + trace_start_draw_ib(&batch->trace); + } + /* emit IB to drawcmds: */ + ctx->screen->emit_ib(batch->gmem, batch->draw); - if (!batch->nondraw) { - trace_end_draw_ib(&batch->trace); - } + if (!batch->nondraw) { + trace_end_draw_ib(&batch->trace); + } - fd_reset_wfi(batch); + fd_reset_wfi(batch); - if (ctx->emit_sysmem_fini) - ctx->emit_sysmem_fini(batch); + if (ctx->emit_sysmem_fini) + ctx->emit_sysmem_fini(batch); } static void flush_ring(struct fd_batch *batch) { - uint32_t timestamp = 0; - int out_fence_fd = -1; + uint32_t timestamp = 0; + int out_fence_fd = -1; - if (FD_DBG(NOHW)) - return; + if (FD_DBG(NOHW)) + return; - fd_submit_flush(batch->submit, batch->in_fence_fd, - batch->needs_out_fence_fd ? &out_fence_fd : NULL, - ×tamp); + fd_submit_flush(batch->submit, batch->in_fence_fd, + batch->needs_out_fence_fd ? &out_fence_fd : NULL, + ×tamp); - fd_fence_populate(batch->fence, timestamp, out_fence_fd); + fd_fence_populate(batch->fence, timestamp, out_fence_fd); } void fd_gmem_render_tiles(struct fd_batch *batch) { - struct fd_context *ctx = batch->ctx; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - bool sysmem = false; - - if (!batch->nondraw) { - trace_flush_batch(&batch->trace, batch, batch->cleared, - batch->gmem_reason, batch->num_draws); - trace_framebuffer_state(&batch->trace, pfb); - } - - if (ctx->emit_sysmem_prep && !batch->nondraw) { - if (fd_autotune_use_bypass(&ctx->autotune, batch) && - !FD_DBG(NOBYPASS)) { - sysmem = true; - } - - /* For ARB_framebuffer_no_attachments: */ - if ((pfb->nr_cbufs == 0) && !pfb->zsbuf) { - sysmem = true; - } - } - - if (FD_DBG(NOGMEM)) - sysmem = true; - - /* Layered rendering always needs bypass. */ - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - struct pipe_surface *psurf = pfb->cbufs[i]; - if (!psurf) - continue; - if (psurf->u.tex.first_layer < psurf->u.tex.last_layer) - sysmem = true; - } - - /* Tessellation doesn't seem to support tiled rendering so fall back to - * bypass. 
- */ - if (batch->tessellation) { - debug_assert(ctx->emit_sysmem_prep); - sysmem = true; - } - - fd_reset_wfi(batch); - - ctx->stats.batch_total++; - - if (batch->nondraw) { - DBG("%p: rendering non-draw", batch); - render_sysmem(batch); - ctx->stats.batch_nondraw++; - } else if (sysmem) { - trace_render_sysmem(&batch->trace); - if (ctx->query_prepare) - ctx->query_prepare(batch, 1); - render_sysmem(batch); - ctx->stats.batch_sysmem++; - } else { - struct fd_gmem_stateobj *gmem = lookup_gmem_state(batch, false, false); - batch->gmem_state = gmem; - trace_render_gmem(&batch->trace, gmem->nbins_x, gmem->nbins_y, - gmem->bin_w, gmem->bin_h); - if (ctx->query_prepare) - ctx->query_prepare(batch, gmem->nbins_x * gmem->nbins_y); - render_tiles(batch, gmem); - batch->gmem_state = NULL; - - fd_screen_lock(ctx->screen); - fd_gmem_reference(&gmem, NULL); - fd_screen_unlock(ctx->screen); - - ctx->stats.batch_gmem++; - } - - flush_ring(batch); - - u_trace_flush(&batch->trace); + struct fd_context *ctx = batch->ctx; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + bool sysmem = false; + + if (!batch->nondraw) { + trace_flush_batch(&batch->trace, batch, batch->cleared, + batch->gmem_reason, batch->num_draws); + trace_framebuffer_state(&batch->trace, pfb); + } + + if (ctx->emit_sysmem_prep && !batch->nondraw) { + if (fd_autotune_use_bypass(&ctx->autotune, batch) && !FD_DBG(NOBYPASS)) { + sysmem = true; + } + + /* For ARB_framebuffer_no_attachments: */ + if ((pfb->nr_cbufs == 0) && !pfb->zsbuf) { + sysmem = true; + } + } + + if (FD_DBG(NOGMEM)) + sysmem = true; + + /* Layered rendering always needs bypass. */ + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + struct pipe_surface *psurf = pfb->cbufs[i]; + if (!psurf) + continue; + if (psurf->u.tex.first_layer < psurf->u.tex.last_layer) + sysmem = true; + } + + /* Tessellation doesn't seem to support tiled rendering so fall back to + * bypass. + */ + if (batch->tessellation) { + debug_assert(ctx->emit_sysmem_prep); + sysmem = true; + } + + fd_reset_wfi(batch); + + ctx->stats.batch_total++; + + if (batch->nondraw) { + DBG("%p: rendering non-draw", batch); + render_sysmem(batch); + ctx->stats.batch_nondraw++; + } else if (sysmem) { + trace_render_sysmem(&batch->trace); + if (ctx->query_prepare) + ctx->query_prepare(batch, 1); + render_sysmem(batch); + ctx->stats.batch_sysmem++; + } else { + struct fd_gmem_stateobj *gmem = lookup_gmem_state(batch, false, false); + batch->gmem_state = gmem; + trace_render_gmem(&batch->trace, gmem->nbins_x, gmem->nbins_y, + gmem->bin_w, gmem->bin_h); + if (ctx->query_prepare) + ctx->query_prepare(batch, gmem->nbins_x * gmem->nbins_y); + render_tiles(batch, gmem); + batch->gmem_state = NULL; + + fd_screen_lock(ctx->screen); + fd_gmem_reference(&gmem, NULL); + fd_screen_unlock(ctx->screen); + + ctx->stats.batch_gmem++; + } + + flush_ring(batch); + + u_trace_flush(&batch->trace); } /* Determine a worst-case estimate (ie. 
assuming we don't eliminate an @@ -753,16 +755,16 @@ fd_gmem_render_tiles(struct fd_batch *batch) unsigned fd_gmem_estimate_bins_per_pipe(struct fd_batch *batch) { - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - struct fd_screen *screen = batch->ctx->screen; - struct fd_gmem_stateobj *gmem = lookup_gmem_state(batch, !!pfb->zsbuf, true); - unsigned nbins = gmem->maxpw * gmem->maxph; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + struct fd_screen *screen = batch->ctx->screen; + struct fd_gmem_stateobj *gmem = lookup_gmem_state(batch, !!pfb->zsbuf, true); + unsigned nbins = gmem->maxpw * gmem->maxph; - fd_screen_lock(screen); - fd_gmem_reference(&gmem, NULL); - fd_screen_unlock(screen); + fd_screen_lock(screen); + fd_gmem_reference(&gmem, NULL); + fd_screen_unlock(screen); - return nbins; + return nbins; } /* When deciding whether a tile needs mem2gmem, we need to take into @@ -772,27 +774,27 @@ fd_gmem_estimate_bins_per_pipe(struct fd_batch *batch) */ bool fd_gmem_needs_restore(struct fd_batch *batch, const struct fd_tile *tile, - uint32_t buffers) + uint32_t buffers) { - if (!(batch->restore & buffers)) - return false; + if (!(batch->restore & buffers)) + return false; - return true; + return true; } void fd_gmem_screen_init(struct pipe_screen *pscreen) { - struct fd_gmem_cache *cache = &fd_screen(pscreen)->gmem_cache; + struct fd_gmem_cache *cache = &fd_screen(pscreen)->gmem_cache; - cache->ht = _mesa_hash_table_create(NULL, gmem_key_hash, gmem_key_equals); - list_inithead(&cache->lru); + cache->ht = _mesa_hash_table_create(NULL, gmem_key_hash, gmem_key_equals); + list_inithead(&cache->lru); } void fd_gmem_screen_fini(struct pipe_screen *pscreen) { - struct fd_gmem_cache *cache = &fd_screen(pscreen)->gmem_cache; + struct fd_gmem_cache *cache = &fd_screen(pscreen)->gmem_cache; - _mesa_hash_table_destroy(cache->ht, NULL); + _mesa_hash_table_destroy(cache->ht, NULL); } diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h index f20425b..7785cef 100644 --- a/src/gallium/drivers/freedreno/freedreno_gmem.h +++ b/src/gallium/drivers/freedreno/freedreno_gmem.h @@ -34,37 +34,37 @@ /* per-pipe configuration for hw binning: */ struct fd_vsc_pipe { - uint8_t x, y, w, h; /* VSC_PIPE[p].CONFIG */ + uint8_t x, y, w, h; /* VSC_PIPE[p].CONFIG */ }; /* per-tile configuration for hw binning: */ struct fd_tile { - uint8_t p; /* index into vsc_pipe[]s */ - uint8_t n; /* slot within pipe */ - uint16_t bin_w, bin_h; - uint16_t xoff, yoff; + uint8_t p; /* index into vsc_pipe[]s */ + uint8_t n; /* slot within pipe */ + uint16_t bin_w, bin_h; + uint16_t xoff, yoff; }; struct fd_gmem_stateobj { - struct pipe_reference reference; - struct fd_screen *screen; - void *key; - - uint32_t cbuf_base[MAX_RENDER_TARGETS]; - uint32_t zsbuf_base[2]; - uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; - uint8_t zsbuf_cpp[2]; - uint16_t bin_h, nbins_y; - uint16_t bin_w, nbins_x; - uint16_t minx, miny; - uint16_t width, height; - uint16_t maxpw, maxph; /* maximum pipe width/height */ - uint8_t num_vsc_pipes; /* number of pipes for a20x */ - - struct fd_vsc_pipe vsc_pipe[32]; - struct fd_tile tile[2048]; - - struct list_head node; + struct pipe_reference reference; + struct fd_screen *screen; + void *key; + + uint32_t cbuf_base[MAX_RENDER_TARGETS]; + uint32_t zsbuf_base[2]; + uint8_t cbuf_cpp[MAX_RENDER_TARGETS]; + uint8_t zsbuf_cpp[2]; + uint16_t bin_h, nbins_y; + uint16_t bin_w, nbins_x; + uint16_t minx, miny; + uint16_t width, height; + uint16_t maxpw, 
maxph; /* maximum pipe width/height */ + uint8_t num_vsc_pipes; /* number of pipes for a20x */ + + struct fd_vsc_pipe vsc_pipe[32]; + struct fd_tile tile[2048]; + + struct list_head node; }; void __fd_gmem_destroy(struct fd_gmem_stateobj *gmem); @@ -72,17 +72,17 @@ void __fd_gmem_destroy(struct fd_gmem_stateobj *gmem); static inline void fd_gmem_reference(struct fd_gmem_stateobj **ptr, struct fd_gmem_stateobj *gmem) { - struct fd_gmem_stateobj *old_gmem = *ptr; + struct fd_gmem_stateobj *old_gmem = *ptr; - if (pipe_reference(&(*ptr)->reference, &gmem->reference)) - __fd_gmem_destroy(old_gmem); + if (pipe_reference(&(*ptr)->reference, &gmem->reference)) + __fd_gmem_destroy(old_gmem); - *ptr = gmem; + *ptr = gmem; } struct fd_gmem_cache { - struct hash_table *ht; - struct list_head lru; + struct hash_table *ht; + struct list_head lru; }; struct fd_batch; @@ -90,7 +90,7 @@ struct fd_batch; void fd_gmem_render_tiles(struct fd_batch *batch) assert_dt; unsigned fd_gmem_estimate_bins_per_pipe(struct fd_batch *batch); bool fd_gmem_needs_restore(struct fd_batch *batch, const struct fd_tile *tile, - uint32_t buffers); + uint32_t buffers); struct pipe_screen; void fd_gmem_screen_init(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c index 0712050..85747ed 100644 --- a/src/gallium/drivers/freedreno/freedreno_program.c +++ b/src/gallium/drivers/freedreno/freedreno_program.c @@ -29,233 +29,225 @@ #include "util/u_simple_shaders.h" -#include "freedreno_program.h" #include "freedreno_context.h" +#include "freedreno_program.h" static void -update_bound_stage(struct fd_context *ctx, enum pipe_shader_type shader, bool bound) - assert_dt +update_bound_stage(struct fd_context *ctx, enum pipe_shader_type shader, + bool bound) assert_dt { - if (bound) { - ctx->bound_shader_stages |= BIT(shader); - } else { - ctx->bound_shader_stages &= ~BIT(shader); - } + if (bound) { + ctx->bound_shader_stages |= BIT(shader); + } else { + ctx->bound_shader_stages &= ~BIT(shader); + } } static void -fd_vs_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_vs_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->prog.vs = hwcso; - fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG); - update_bound_stage(ctx, PIPE_SHADER_VERTEX, !!hwcso); + struct fd_context *ctx = fd_context(pctx); + ctx->prog.vs = hwcso; + fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG); + update_bound_stage(ctx, PIPE_SHADER_VERTEX, !!hwcso); } static void -fd_tcs_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_tcs_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->prog.hs = hwcso; - fd_context_dirty_shader(ctx, PIPE_SHADER_TESS_CTRL, FD_DIRTY_SHADER_PROG); - update_bound_stage(ctx, PIPE_SHADER_TESS_CTRL, !!hwcso); + struct fd_context *ctx = fd_context(pctx); + ctx->prog.hs = hwcso; + fd_context_dirty_shader(ctx, PIPE_SHADER_TESS_CTRL, FD_DIRTY_SHADER_PROG); + update_bound_stage(ctx, PIPE_SHADER_TESS_CTRL, !!hwcso); } static void -fd_tes_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_tes_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->prog.ds = hwcso; - fd_context_dirty_shader(ctx, PIPE_SHADER_TESS_EVAL, FD_DIRTY_SHADER_PROG); - update_bound_stage(ctx, PIPE_SHADER_TESS_EVAL, !!hwcso); + struct fd_context *ctx 
= fd_context(pctx); + ctx->prog.ds = hwcso; + fd_context_dirty_shader(ctx, PIPE_SHADER_TESS_EVAL, FD_DIRTY_SHADER_PROG); + update_bound_stage(ctx, PIPE_SHADER_TESS_EVAL, !!hwcso); } static void -fd_gs_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_gs_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->prog.gs = hwcso; - fd_context_dirty_shader(ctx, PIPE_SHADER_GEOMETRY, FD_DIRTY_SHADER_PROG); - update_bound_stage(ctx, PIPE_SHADER_GEOMETRY, !!hwcso); + struct fd_context *ctx = fd_context(pctx); + ctx->prog.gs = hwcso; + fd_context_dirty_shader(ctx, PIPE_SHADER_GEOMETRY, FD_DIRTY_SHADER_PROG); + update_bound_stage(ctx, PIPE_SHADER_GEOMETRY, !!hwcso); } static void -fd_fs_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_fs_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->prog.fs = hwcso; - fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_PROG); - update_bound_stage(ctx, PIPE_SHADER_FRAGMENT, !!hwcso); + struct fd_context *ctx = fd_context(pctx); + ctx->prog.fs = hwcso; + fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_PROG); + update_bound_stage(ctx, PIPE_SHADER_FRAGMENT, !!hwcso); } -static const char *solid_fs = - "FRAG \n" - "PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1 \n" - "DCL CONST[0] \n" - "DCL OUT[0], COLOR \n" - " 0: MOV OUT[0], CONST[0] \n" - " 1: END \n"; - -static const char *solid_vs = - "VERT \n" - "DCL IN[0] \n" - "DCL OUT[0], POSITION \n" - " 0: MOV OUT[0], IN[0] \n" - " 1: END \n"; - -static void * assemble_tgsi(struct pipe_context *pctx, - const char *src, bool frag) +static const char *solid_fs = "FRAG \n" + "PROPERTY FS_COLOR0_WRITES_ALL_CBUFS 1 \n" + "DCL CONST[0] \n" + "DCL OUT[0], COLOR \n" + " 0: MOV OUT[0], CONST[0] \n" + " 1: END \n"; + +static const char *solid_vs = "VERT \n" + "DCL IN[0] \n" + "DCL OUT[0], POSITION \n" + " 0: MOV OUT[0], IN[0] \n" + " 1: END \n"; + +static void * +assemble_tgsi(struct pipe_context *pctx, const char *src, bool frag) { - struct tgsi_token toks[32]; - struct pipe_shader_state cso = { - .tokens = toks, - }; - - bool ret = tgsi_text_translate(src, toks, ARRAY_SIZE(toks)); - assume(ret); - - if (frag) - return pctx->create_fs_state(pctx, &cso); - else - return pctx->create_vs_state(pctx, &cso); + struct tgsi_token toks[32]; + struct pipe_shader_state cso = { + .tokens = toks, + }; + + bool ret = tgsi_text_translate(src, toks, ARRAY_SIZE(toks)); + assume(ret); + + if (frag) + return pctx->create_fs_state(pctx, &cso); + else + return pctx->create_vs_state(pctx, &cso); } /* the correct semantic to use for the texcoord varying depends on pipe-cap: */ static enum tgsi_semantic texcoord_semantic(struct pipe_context *pctx) { - struct pipe_screen *pscreen = pctx->screen; + struct pipe_screen *pscreen = pctx->screen; - if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_TEXCOORD)) { - return TGSI_SEMANTIC_TEXCOORD; - } else { - return TGSI_SEMANTIC_GENERIC; - } + if (pscreen->get_param(pscreen, PIPE_CAP_TGSI_TEXCOORD)) { + return TGSI_SEMANTIC_TEXCOORD; + } else { + return TGSI_SEMANTIC_GENERIC; + } } static void * fd_prog_blit_vs(struct pipe_context *pctx) { - struct ureg_program *ureg; + struct ureg_program *ureg; - ureg = ureg_create(PIPE_SHADER_VERTEX); - if (!ureg) - return NULL; + ureg = ureg_create(PIPE_SHADER_VERTEX); + if (!ureg) + return NULL; - struct ureg_src in0 = ureg_DECL_vs_input(ureg, 0); - struct ureg_src in1 = ureg_DECL_vs_input(ureg, 1); + struct 
ureg_src in0 = ureg_DECL_vs_input(ureg, 0); + struct ureg_src in1 = ureg_DECL_vs_input(ureg, 1); - struct ureg_dst out0 = ureg_DECL_output(ureg, texcoord_semantic(pctx), 0); - struct ureg_dst out1 = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 1); + struct ureg_dst out0 = ureg_DECL_output(ureg, texcoord_semantic(pctx), 0); + struct ureg_dst out1 = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 1); - ureg_MOV(ureg, out0, in0); - ureg_MOV(ureg, out1, in1); + ureg_MOV(ureg, out0, in0); + ureg_MOV(ureg, out1, in1); - ureg_END(ureg); + ureg_END(ureg); - return ureg_create_shader_and_destroy(ureg, pctx); + return ureg_create_shader_and_destroy(ureg, pctx); } static void * fd_prog_blit_fs(struct pipe_context *pctx, int rts, bool depth) { - int i; - struct ureg_src tc; - struct ureg_program *ureg; - - debug_assert(rts <= MAX_RENDER_TARGETS); - - ureg = ureg_create(PIPE_SHADER_FRAGMENT); - if (!ureg) - return NULL; - - tc = ureg_DECL_fs_input( - ureg, texcoord_semantic(pctx), 0, TGSI_INTERPOLATE_PERSPECTIVE); - for (i = 0; i < rts; i++) - ureg_TEX(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, i), - TGSI_TEXTURE_2D, tc, ureg_DECL_sampler(ureg, i)); - if (depth) - ureg_TEX(ureg, - ureg_writemask( - ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), - TGSI_WRITEMASK_Z), - TGSI_TEXTURE_2D, tc, ureg_DECL_sampler(ureg, rts)); - - ureg_END(ureg); - - return ureg_create_shader_and_destroy(ureg, pctx); + int i; + struct ureg_src tc; + struct ureg_program *ureg; + + debug_assert(rts <= MAX_RENDER_TARGETS); + + ureg = ureg_create(PIPE_SHADER_FRAGMENT); + if (!ureg) + return NULL; + + tc = ureg_DECL_fs_input(ureg, texcoord_semantic(pctx), 0, + TGSI_INTERPOLATE_PERSPECTIVE); + for (i = 0; i < rts; i++) + ureg_TEX(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, i), + TGSI_TEXTURE_2D, tc, ureg_DECL_sampler(ureg, i)); + if (depth) + ureg_TEX(ureg, + ureg_writemask(ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), + TGSI_WRITEMASK_Z), + TGSI_TEXTURE_2D, tc, ureg_DECL_sampler(ureg, rts)); + + ureg_END(ureg); + + return ureg_create_shader_and_destroy(ureg, pctx); } - -void fd_prog_init(struct pipe_context *pctx) +void +fd_prog_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); - int i; - - pctx->bind_vs_state = fd_vs_state_bind; - pctx->bind_tcs_state = fd_tcs_state_bind; - pctx->bind_tes_state = fd_tes_state_bind; - pctx->bind_gs_state = fd_gs_state_bind; - pctx->bind_fs_state = fd_fs_state_bind; - - ctx->solid_prog.fs = assemble_tgsi(pctx, solid_fs, true); - ctx->solid_prog.vs = assemble_tgsi(pctx, solid_vs, false); - - if (ctx->screen->gpu_id >= 600) { - ctx->solid_layered_prog.fs = assemble_tgsi(pctx, solid_fs, true); - ctx->solid_layered_prog.vs = - util_make_layered_clear_vertex_shader(pctx); - } - - if (ctx->screen->gpu_id >= 500) - return; - - ctx->blit_prog[0].vs = fd_prog_blit_vs(pctx); - ctx->blit_prog[0].fs = fd_prog_blit_fs(pctx, 1, false); - - if (ctx->screen->gpu_id < 300) - return; - - for (i = 1; i < ctx->screen->max_rts; i++) { - ctx->blit_prog[i].vs = ctx->blit_prog[0].vs; - ctx->blit_prog[i].fs = fd_prog_blit_fs(pctx, i + 1, false); - } - - ctx->blit_z.vs = ctx->blit_prog[0].vs; - ctx->blit_z.fs = fd_prog_blit_fs(pctx, 0, true); - ctx->blit_zs.vs = ctx->blit_prog[0].vs; - ctx->blit_zs.fs = fd_prog_blit_fs(pctx, 1, true); + struct fd_context *ctx = fd_context(pctx); + int i; + + pctx->bind_vs_state = fd_vs_state_bind; + pctx->bind_tcs_state = fd_tcs_state_bind; + pctx->bind_tes_state = fd_tes_state_bind; + pctx->bind_gs_state = fd_gs_state_bind; + 
pctx->bind_fs_state = fd_fs_state_bind; + + ctx->solid_prog.fs = assemble_tgsi(pctx, solid_fs, true); + ctx->solid_prog.vs = assemble_tgsi(pctx, solid_vs, false); + + if (ctx->screen->gpu_id >= 600) { + ctx->solid_layered_prog.fs = assemble_tgsi(pctx, solid_fs, true); + ctx->solid_layered_prog.vs = util_make_layered_clear_vertex_shader(pctx); + } + + if (ctx->screen->gpu_id >= 500) + return; + + ctx->blit_prog[0].vs = fd_prog_blit_vs(pctx); + ctx->blit_prog[0].fs = fd_prog_blit_fs(pctx, 1, false); + + if (ctx->screen->gpu_id < 300) + return; + + for (i = 1; i < ctx->screen->max_rts; i++) { + ctx->blit_prog[i].vs = ctx->blit_prog[0].vs; + ctx->blit_prog[i].fs = fd_prog_blit_fs(pctx, i + 1, false); + } + + ctx->blit_z.vs = ctx->blit_prog[0].vs; + ctx->blit_z.fs = fd_prog_blit_fs(pctx, 0, true); + ctx->blit_zs.vs = ctx->blit_prog[0].vs; + ctx->blit_zs.fs = fd_prog_blit_fs(pctx, 1, true); } -void fd_prog_fini(struct pipe_context *pctx) +void +fd_prog_fini(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); - int i; + struct fd_context *ctx = fd_context(pctx); + int i; - pctx->delete_vs_state(pctx, ctx->solid_prog.vs); - pctx->delete_fs_state(pctx, ctx->solid_prog.fs); + pctx->delete_vs_state(pctx, ctx->solid_prog.vs); + pctx->delete_fs_state(pctx, ctx->solid_prog.fs); - if (ctx->screen->gpu_id >= 600) { - pctx->delete_vs_state(pctx, ctx->solid_layered_prog.vs); - pctx->delete_fs_state(pctx, ctx->solid_layered_prog.fs); - } + if (ctx->screen->gpu_id >= 600) { + pctx->delete_vs_state(pctx, ctx->solid_layered_prog.vs); + pctx->delete_fs_state(pctx, ctx->solid_layered_prog.fs); + } - if (ctx->screen->gpu_id >= 500) - return; + if (ctx->screen->gpu_id >= 500) + return; - pctx->delete_vs_state(pctx, ctx->blit_prog[0].vs); - pctx->delete_fs_state(pctx, ctx->blit_prog[0].fs); + pctx->delete_vs_state(pctx, ctx->blit_prog[0].vs); + pctx->delete_fs_state(pctx, ctx->blit_prog[0].fs); - if (ctx->screen->gpu_id < 300) - return; + if (ctx->screen->gpu_id < 300) + return; - for (i = 1; i < ctx->screen->max_rts; i++) - pctx->delete_fs_state(pctx, ctx->blit_prog[i].fs); - pctx->delete_fs_state(pctx, ctx->blit_z.fs); - pctx->delete_fs_state(pctx, ctx->blit_zs.fs); + for (i = 1; i < ctx->screen->max_rts; i++) + pctx->delete_fs_state(pctx, ctx->blit_prog[i].fs); + pctx->delete_fs_state(pctx, ctx->blit_z.fs); + pctx->delete_fs_state(pctx, ctx->blit_zs.fs); } diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c index 3a579fe..811922d 100644 --- a/src/gallium/drivers/freedreno/freedreno_query.c +++ b/src/gallium/drivers/freedreno/freedreno_query.c @@ -27,10 +27,10 @@ #include "pipe/p_state.h" #include "util/u_memory.h" +#include "freedreno_context.h" #include "freedreno_query.h" -#include "freedreno_query_sw.h" #include "freedreno_query_hw.h" -#include "freedreno_context.h" +#include "freedreno_query_sw.h" #include "freedreno_util.h" /* @@ -40,238 +40,233 @@ static struct pipe_query * fd_create_query(struct pipe_context *pctx, unsigned query_type, unsigned index) { - struct fd_context *ctx = fd_context(pctx); - struct fd_query *q = NULL; + struct fd_context *ctx = fd_context(pctx); + struct fd_query *q = NULL; - if (ctx->create_query) - q = ctx->create_query(ctx, query_type, index); - if (!q) - q = fd_sw_create_query(ctx, query_type, index); + if (ctx->create_query) + q = ctx->create_query(ctx, query_type, index); + if (!q) + q = fd_sw_create_query(ctx, query_type, index); - return (struct pipe_query *) q; + return (struct pipe_query *)q; 
} static void -fd_destroy_query(struct pipe_context *pctx, struct pipe_query *pq) - in_dt +fd_destroy_query(struct pipe_context *pctx, struct pipe_query *pq) in_dt { - struct fd_query *q = fd_query(pq); - q->funcs->destroy_query(fd_context(pctx), q); + struct fd_query *q = fd_query(pq); + q->funcs->destroy_query(fd_context(pctx), q); } static bool -fd_begin_query(struct pipe_context *pctx, struct pipe_query *pq) - in_dt +fd_begin_query(struct pipe_context *pctx, struct pipe_query *pq) in_dt { - struct fd_query *q = fd_query(pq); + struct fd_query *q = fd_query(pq); - q->funcs->begin_query(fd_context(pctx), q); + q->funcs->begin_query(fd_context(pctx), q); - return true; + return true; } static bool -fd_end_query(struct pipe_context *pctx, struct pipe_query *pq) - in_dt +fd_end_query(struct pipe_context *pctx, struct pipe_query *pq) in_dt { - struct fd_query *q = fd_query(pq); + struct fd_query *q = fd_query(pq); - /* there are a couple special cases, which don't have - * a matching ->begin_query(): - */ - if (skip_begin_query(q->type)) - fd_begin_query(pctx, pq); + /* there are a couple special cases, which don't have + * a matching ->begin_query(): + */ + if (skip_begin_query(q->type)) + fd_begin_query(pctx, pq); - q->funcs->end_query(fd_context(pctx), q); + q->funcs->end_query(fd_context(pctx), q); - return true; + return true; } static bool -fd_get_query_result(struct pipe_context *pctx, struct pipe_query *pq, - bool wait, union pipe_query_result *result) +fd_get_query_result(struct pipe_context *pctx, struct pipe_query *pq, bool wait, + union pipe_query_result *result) { - struct fd_query *q = fd_query(pq); + struct fd_query *q = fd_query(pq); - util_query_clear_result(result, q->type); + util_query_clear_result(result, q->type); - return q->funcs->get_query_result(fd_context(pctx), q, wait, result); + return q->funcs->get_query_result(fd_context(pctx), q, wait, result); } static void fd_render_condition(struct pipe_context *pctx, struct pipe_query *pq, - bool condition, enum pipe_render_cond_flag mode) - in_dt + bool condition, enum pipe_render_cond_flag mode) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->cond_query = pq; - ctx->cond_cond = condition; - ctx->cond_mode = mode; + struct fd_context *ctx = fd_context(pctx); + ctx->cond_query = pq; + ctx->cond_cond = condition; + ctx->cond_mode = mode; } -#define _Q(_name, _query_type, _type, _result_type) { \ - .name = _name, \ - .query_type = _query_type, \ - .type = PIPE_DRIVER_QUERY_TYPE_ ## _type, \ - .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type, \ - .group_id = ~(unsigned)0, \ -} +#define _Q(_name, _query_type, _type, _result_type) \ + { \ + .name = _name, .query_type = _query_type, \ + .type = PIPE_DRIVER_QUERY_TYPE_##_type, \ + .result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_##_result_type, \ + .group_id = ~(unsigned)0, \ + } -#define FQ(_name, _query_type, _type, _result_type) \ - _Q(_name, FD_QUERY_ ## _query_type, _type, _result_type) +#define FQ(_name, _query_type, _type, _result_type) \ + _Q(_name, FD_QUERY_##_query_type, _type, _result_type) -#define PQ(_name, _query_type, _type, _result_type) \ - _Q(_name, PIPE_QUERY_ ## _query_type, _type, _result_type) +#define PQ(_name, _query_type, _type, _result_type) \ + _Q(_name, PIPE_QUERY_##_query_type, _type, _result_type) static const struct pipe_driver_query_info sw_query_list[] = { - FQ("draw-calls", DRAW_CALLS, UINT64, AVERAGE), - FQ("batches", BATCH_TOTAL, UINT64, AVERAGE), - FQ("batches-sysmem", BATCH_SYSMEM, UINT64, AVERAGE), - FQ("batches-gmem", 
BATCH_GMEM, UINT64, AVERAGE), - FQ("batches-nondraw", BATCH_NONDRAW, UINT64, AVERAGE), - FQ("restores", BATCH_RESTORE, UINT64, AVERAGE), - PQ("prims-emitted", PRIMITIVES_EMITTED, UINT64, AVERAGE), - FQ("staging", STAGING_UPLOADS, UINT64, AVERAGE), - FQ("shadow", SHADOW_UPLOADS, UINT64, AVERAGE), - FQ("vsregs", VS_REGS, FLOAT, AVERAGE), - FQ("fsregs", FS_REGS, FLOAT, AVERAGE), + FQ("draw-calls", DRAW_CALLS, UINT64, AVERAGE), + FQ("batches", BATCH_TOTAL, UINT64, AVERAGE), + FQ("batches-sysmem", BATCH_SYSMEM, UINT64, AVERAGE), + FQ("batches-gmem", BATCH_GMEM, UINT64, AVERAGE), + FQ("batches-nondraw", BATCH_NONDRAW, UINT64, AVERAGE), + FQ("restores", BATCH_RESTORE, UINT64, AVERAGE), + PQ("prims-emitted", PRIMITIVES_EMITTED, UINT64, AVERAGE), + FQ("staging", STAGING_UPLOADS, UINT64, AVERAGE), + FQ("shadow", SHADOW_UPLOADS, UINT64, AVERAGE), + FQ("vsregs", VS_REGS, FLOAT, AVERAGE), + FQ("fsregs", FS_REGS, FLOAT, AVERAGE), }; static int -fd_get_driver_query_info(struct pipe_screen *pscreen, - unsigned index, struct pipe_driver_query_info *info) +fd_get_driver_query_info(struct pipe_screen *pscreen, unsigned index, + struct pipe_driver_query_info *info) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - if (!info) - return ARRAY_SIZE(sw_query_list) + screen->num_perfcntr_queries; + if (!info) + return ARRAY_SIZE(sw_query_list) + screen->num_perfcntr_queries; - if (index >= ARRAY_SIZE(sw_query_list)) { - index -= ARRAY_SIZE(sw_query_list); - if (index >= screen->num_perfcntr_queries) - return 0; - *info = screen->perfcntr_queries[index]; - return 1; - } + if (index >= ARRAY_SIZE(sw_query_list)) { + index -= ARRAY_SIZE(sw_query_list); + if (index >= screen->num_perfcntr_queries) + return 0; + *info = screen->perfcntr_queries[index]; + return 1; + } - *info = sw_query_list[index]; - return 1; + *info = sw_query_list[index]; + return 1; } static int fd_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index, - struct pipe_driver_query_group_info *info) + struct pipe_driver_query_group_info *info) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - if (!info) - return screen->num_perfcntr_groups; + if (!info) + return screen->num_perfcntr_groups; - if (index >= screen->num_perfcntr_groups) - return 0; + if (index >= screen->num_perfcntr_groups) + return 0; - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[index]; + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[index]; - info->name = g->name; - info->max_active_queries = g->num_counters; - info->num_queries = g->num_countables; + info->name = g->name; + info->max_active_queries = g->num_counters; + info->num_queries = g->num_countables; - return 1; + return 1; } static void -fd_set_active_query_state(struct pipe_context *pctx, bool enable) - assert_dt +fd_set_active_query_state(struct pipe_context *pctx, bool enable) assert_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->active_queries = enable; - ctx->update_active_queries = true; + struct fd_context *ctx = fd_context(pctx); + ctx->active_queries = enable; + ctx->update_active_queries = true; } static enum pipe_driver_query_type query_type(enum fd_perfcntr_type type) { -#define ENUM(t) case FD_PERFCNTR_ ## t: return PIPE_DRIVER_QUERY_ ## t - switch (type) { - ENUM(TYPE_UINT64); - ENUM(TYPE_UINT); - ENUM(TYPE_FLOAT); - ENUM(TYPE_PERCENTAGE); - ENUM(TYPE_BYTES); - ENUM(TYPE_MICROSECONDS); - ENUM(TYPE_HZ); - ENUM(TYPE_DBM); - ENUM(TYPE_TEMPERATURE); - 
ENUM(TYPE_VOLTS); - ENUM(TYPE_AMPS); - ENUM(TYPE_WATTS); - default: - unreachable("bad type"); - return 0; - } +#define ENUM(t) \ + case FD_PERFCNTR_##t: \ + return PIPE_DRIVER_QUERY_##t + switch (type) { + ENUM(TYPE_UINT64); + ENUM(TYPE_UINT); + ENUM(TYPE_FLOAT); + ENUM(TYPE_PERCENTAGE); + ENUM(TYPE_BYTES); + ENUM(TYPE_MICROSECONDS); + ENUM(TYPE_HZ); + ENUM(TYPE_DBM); + ENUM(TYPE_TEMPERATURE); + ENUM(TYPE_VOLTS); + ENUM(TYPE_AMPS); + ENUM(TYPE_WATTS); + default: + unreachable("bad type"); + return 0; + } } static enum pipe_driver_query_result_type query_result_type(enum fd_perfcntr_result_type type) { - switch (type) { - ENUM(RESULT_TYPE_AVERAGE); - ENUM(RESULT_TYPE_CUMULATIVE); - default: - unreachable("bad type"); - return 0; - } + switch (type) { + ENUM(RESULT_TYPE_AVERAGE); + ENUM(RESULT_TYPE_CUMULATIVE); + default: + unreachable("bad type"); + return 0; + } } static void setup_perfcntr_query_info(struct fd_screen *screen) { - unsigned num_queries = 0; - - for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) - num_queries += screen->perfcntr_groups[i].num_countables; - - screen->perfcntr_queries = - calloc(num_queries, sizeof(screen->perfcntr_queries[0])); - screen->num_perfcntr_queries = num_queries; - - unsigned idx = 0; - for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) { - const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i]; - for (unsigned j = 0; j < g->num_countables; j++) { - struct pipe_driver_query_info *info = - &screen->perfcntr_queries[idx]; - const struct fd_perfcntr_countable *c = - &g->countables[j]; - - info->name = c->name; - info->query_type = FD_QUERY_FIRST_PERFCNTR + idx; - info->type = query_type(c->query_type); - info->result_type = query_result_type(c->result_type); - info->group_id = i; - info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; - - idx++; - } - } + unsigned num_queries = 0; + + for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) + num_queries += screen->perfcntr_groups[i].num_countables; + + screen->perfcntr_queries = + calloc(num_queries, sizeof(screen->perfcntr_queries[0])); + screen->num_perfcntr_queries = num_queries; + + unsigned idx = 0; + for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) { + const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i]; + for (unsigned j = 0; j < g->num_countables; j++) { + struct pipe_driver_query_info *info = &screen->perfcntr_queries[idx]; + const struct fd_perfcntr_countable *c = &g->countables[j]; + + info->name = c->name; + info->query_type = FD_QUERY_FIRST_PERFCNTR + idx; + info->type = query_type(c->query_type); + info->result_type = query_result_type(c->result_type); + info->group_id = i; + info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH; + + idx++; + } + } } void fd_query_screen_init(struct pipe_screen *pscreen) { - pscreen->get_driver_query_info = fd_get_driver_query_info; - pscreen->get_driver_query_group_info = fd_get_driver_query_group_info; - setup_perfcntr_query_info(fd_screen(pscreen)); + pscreen->get_driver_query_info = fd_get_driver_query_info; + pscreen->get_driver_query_group_info = fd_get_driver_query_group_info; + setup_perfcntr_query_info(fd_screen(pscreen)); } void fd_query_context_init(struct pipe_context *pctx) { - pctx->create_query = fd_create_query; - pctx->destroy_query = fd_destroy_query; - pctx->begin_query = fd_begin_query; - pctx->end_query = fd_end_query; - pctx->get_query_result = fd_get_query_result; - pctx->set_active_query_state = fd_set_active_query_state; - pctx->render_condition = fd_render_condition; + pctx->create_query = 
fd_create_query; + pctx->destroy_query = fd_destroy_query; + pctx->begin_query = fd_begin_query; + pctx->end_query = fd_end_query; + pctx->get_query_result = fd_get_query_result; + pctx->set_active_query_state = fd_set_active_query_state; + pctx->render_condition = fd_render_condition; } diff --git a/src/gallium/drivers/freedreno/freedreno_query.h b/src/gallium/drivers/freedreno/freedreno_query.h index cad4fbb..2b641a7 100644 --- a/src/gallium/drivers/freedreno/freedreno_query.h +++ b/src/gallium/drivers/freedreno/freedreno_query.h @@ -37,43 +37,55 @@ struct fd_context; struct fd_query; struct fd_query_funcs { - void (*destroy_query)(struct fd_context *ctx, - struct fd_query *q) dt; - void (*begin_query)(struct fd_context *ctx, struct fd_query *q) dt; - void (*end_query)(struct fd_context *ctx, struct fd_query *q) dt; - bool (*get_query_result)(struct fd_context *ctx, - struct fd_query *q, bool wait, - union pipe_query_result *result); + void (*destroy_query)(struct fd_context *ctx, struct fd_query *q) dt; + void (*begin_query)(struct fd_context *ctx, struct fd_query *q) dt; + void (*end_query)(struct fd_context *ctx, struct fd_query *q) dt; + bool (*get_query_result)(struct fd_context *ctx, struct fd_query *q, + bool wait, union pipe_query_result *result); }; struct fd_query { - struct threaded_query base; + struct threaded_query base; - const struct fd_query_funcs *funcs; - int type; - unsigned index; + const struct fd_query_funcs *funcs; + int type; + unsigned index; }; static inline struct fd_query * fd_query(struct pipe_query *pq) { - return (struct fd_query *)pq; + return (struct fd_query *)pq; } -#define FD_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) -#define FD_QUERY_BATCH_TOTAL (PIPE_QUERY_DRIVER_SPECIFIC + 1) /* total # of batches (submits) */ -#define FD_QUERY_BATCH_SYSMEM (PIPE_QUERY_DRIVER_SPECIFIC + 2) /* batches using system memory (GMEM bypass) */ -#define FD_QUERY_BATCH_GMEM (PIPE_QUERY_DRIVER_SPECIFIC + 3) /* batches using GMEM */ -#define FD_QUERY_BATCH_NONDRAW (PIPE_QUERY_DRIVER_SPECIFIC + 4) /* compute/blit batches */ -#define FD_QUERY_BATCH_RESTORE (PIPE_QUERY_DRIVER_SPECIFIC + 5) /* batches requiring GMEM restore */ -#define FD_QUERY_STAGING_UPLOADS (PIPE_QUERY_DRIVER_SPECIFIC + 6) /* texture/buffer uploads using staging blit */ -#define FD_QUERY_SHADOW_UPLOADS (PIPE_QUERY_DRIVER_SPECIFIC + 7) /* texture/buffer uploads that shadowed rsc */ -#define FD_QUERY_VS_REGS (PIPE_QUERY_DRIVER_SPECIFIC + 8) /* avg # of VS registers (scaled up by 100x) */ -#define FD_QUERY_FS_REGS (PIPE_QUERY_DRIVER_SPECIFIC + 9) /* avg # of VS registers (scaled up by 100x) */ +#define FD_QUERY_DRAW_CALLS (PIPE_QUERY_DRIVER_SPECIFIC + 0) +#define FD_QUERY_BATCH_TOTAL \ + (PIPE_QUERY_DRIVER_SPECIFIC + 1) /* total # of batches (submits) */ +#define FD_QUERY_BATCH_SYSMEM \ + (PIPE_QUERY_DRIVER_SPECIFIC + \ + 2) /* batches using system memory (GMEM bypass) */ +#define FD_QUERY_BATCH_GMEM \ + (PIPE_QUERY_DRIVER_SPECIFIC + 3) /* batches using GMEM */ +#define FD_QUERY_BATCH_NONDRAW \ + (PIPE_QUERY_DRIVER_SPECIFIC + 4) /* compute/blit batches */ +#define FD_QUERY_BATCH_RESTORE \ + (PIPE_QUERY_DRIVER_SPECIFIC + 5) /* batches requiring GMEM restore */ +#define FD_QUERY_STAGING_UPLOADS \ + (PIPE_QUERY_DRIVER_SPECIFIC + \ + 6) /* texture/buffer uploads using staging blit */ +#define FD_QUERY_SHADOW_UPLOADS \ + (PIPE_QUERY_DRIVER_SPECIFIC + \ + 7) /* texture/buffer uploads that shadowed rsc */ +#define FD_QUERY_VS_REGS \ + (PIPE_QUERY_DRIVER_SPECIFIC + \ + 8) /* avg # of VS registers (scaled up by 
100x) */ +#define FD_QUERY_FS_REGS \ + (PIPE_QUERY_DRIVER_SPECIFIC + \ + 9) /* avg # of VS registers (scaled up by 100x) */ /* insert any new non-perfcntr queries here, the first perfcntr index * needs to come last! */ -#define FD_QUERY_FIRST_PERFCNTR (PIPE_QUERY_DRIVER_SPECIFIC + 10) +#define FD_QUERY_FIRST_PERFCNTR (PIPE_QUERY_DRIVER_SPECIFIC + 10) void fd_query_screen_init(struct pipe_screen *pscreen); void fd_query_context_init(struct pipe_context *pctx); @@ -81,42 +93,42 @@ void fd_query_context_init(struct pipe_context *pctx); static inline bool skip_begin_query(int type) { - switch (type) { - case PIPE_QUERY_TIMESTAMP: - case PIPE_QUERY_GPU_FINISHED: - return true; - default: - return false; - } + switch (type) { + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_GPU_FINISHED: + return true; + default: + return false; + } } /* maps query_type to sample provider idx: */ -static inline -int pidx(unsigned query_type) +static inline int +pidx(unsigned query_type) { - switch (query_type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - return 0; - case PIPE_QUERY_OCCLUSION_PREDICATE: - return 1; - case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: - return 2; - /* TODO currently queries only emitted in main pass (not in binning pass).. - * which is fine for occlusion query, but pretty much not anything else. - */ - case PIPE_QUERY_TIME_ELAPSED: - return 3; - case PIPE_QUERY_TIMESTAMP: - return 4; - - case PIPE_QUERY_PRIMITIVES_GENERATED: - return 5; - case PIPE_QUERY_PRIMITIVES_EMITTED: - return 6; - - default: - return -1; - } + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + return 0; + case PIPE_QUERY_OCCLUSION_PREDICATE: + return 1; + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + return 2; + /* TODO currently queries only emitted in main pass (not in binning pass).. + * which is fine for occlusion query, but pretty much not anything else. 
+ */ + case PIPE_QUERY_TIME_ELAPSED: + return 3; + case PIPE_QUERY_TIMESTAMP: + return 4; + + case PIPE_QUERY_PRIMITIVES_GENERATED: + return 5; + case PIPE_QUERY_PRIMITIVES_EMITTED: + return 6; + + default: + return -1; + } } #endif /* FREEDRENO_QUERY_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c index 6fb0816..2e31e79 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.c +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c @@ -24,225 +24,220 @@ * Rob Clark */ -#include "util/u_memory.h" #include "util/u_inlines.h" +#include "util/u_memory.h" -#include "freedreno_query_acc.h" #include "freedreno_context.h" +#include "freedreno_query_acc.h" #include "freedreno_resource.h" #include "freedreno_util.h" static void -fd_acc_destroy_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_acc_destroy_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_acc_query *aq = fd_acc_query(q); + struct fd_acc_query *aq = fd_acc_query(q); - DBG("%p", q); + DBG("%p", q); - pipe_resource_reference(&aq->prsc, NULL); - list_del(&aq->node); + pipe_resource_reference(&aq->prsc, NULL); + list_del(&aq->node); - free(aq->query_data); - free(aq); + free(aq->query_data); + free(aq); } static void realloc_query_bo(struct fd_context *ctx, struct fd_acc_query *aq) { - struct fd_resource *rsc; - void *map; + struct fd_resource *rsc; + void *map; - pipe_resource_reference(&aq->prsc, NULL); + pipe_resource_reference(&aq->prsc, NULL); - aq->prsc = pipe_buffer_create(&ctx->screen->base, - PIPE_BIND_QUERY_BUFFER, 0, 0x1000); + aq->prsc = + pipe_buffer_create(&ctx->screen->base, PIPE_BIND_QUERY_BUFFER, 0, 0x1000); - /* don't assume the buffer is zero-initialized: */ - rsc = fd_resource(aq->prsc); + /* don't assume the buffer is zero-initialized: */ + rsc = fd_resource(aq->prsc); - fd_bo_cpu_prep(rsc->bo, ctx->pipe, DRM_FREEDRENO_PREP_WRITE); + fd_bo_cpu_prep(rsc->bo, ctx->pipe, DRM_FREEDRENO_PREP_WRITE); - map = fd_bo_map(rsc->bo); - memset(map, 0, aq->size); - fd_bo_cpu_fini(rsc->bo); + map = fd_bo_map(rsc->bo); + memset(map, 0, aq->size); + fd_bo_cpu_fini(rsc->bo); } static void -fd_acc_query_pause(struct fd_acc_query *aq) - assert_dt +fd_acc_query_pause(struct fd_acc_query *aq) assert_dt { - const struct fd_acc_sample_provider *p = aq->provider; + const struct fd_acc_sample_provider *p = aq->provider; - if (!aq->batch) - return; + if (!aq->batch) + return; - p->pause(aq, aq->batch); - aq->batch = NULL; + p->pause(aq, aq->batch); + aq->batch = NULL; } static void -fd_acc_query_resume(struct fd_acc_query *aq, struct fd_batch *batch) - assert_dt +fd_acc_query_resume(struct fd_acc_query *aq, struct fd_batch *batch) assert_dt { - const struct fd_acc_sample_provider *p = aq->provider; + const struct fd_acc_sample_provider *p = aq->provider; - aq->batch = batch; - p->resume(aq, aq->batch); + aq->batch = batch; + p->resume(aq, aq->batch); - fd_screen_lock(batch->ctx->screen); - fd_batch_resource_write(batch, fd_resource(aq->prsc)); - fd_screen_unlock(batch->ctx->screen); + fd_screen_lock(batch->ctx->screen); + fd_batch_resource_write(batch, fd_resource(aq->prsc)); + fd_screen_unlock(batch->ctx->screen); } static void -fd_acc_begin_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_acc_begin_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_acc_query *aq = fd_acc_query(q); + struct fd_acc_query *aq = fd_acc_query(q); - DBG("%p", q); + DBG("%p", q); - /* ->begin_query() 
discards previous results, so realloc bo: */ - realloc_query_bo(ctx, aq); + /* ->begin_query() discards previous results, so realloc bo: */ + realloc_query_bo(ctx, aq); - /* Signal that we need to update the active queries on the next draw */ - ctx->update_active_queries = true; + /* Signal that we need to update the active queries on the next draw */ + ctx->update_active_queries = true; - /* add to active list: */ - assert(list_is_empty(&aq->node)); - list_addtail(&aq->node, &ctx->acc_active_queries); + /* add to active list: */ + assert(list_is_empty(&aq->node)); + list_addtail(&aq->node, &ctx->acc_active_queries); - /* TIMESTAMP/GPU_FINISHED and don't do normal bracketing at draw time, we - * need to just emit the capture at this moment. - */ - if (skip_begin_query(q->type)) { - struct fd_batch *batch = fd_context_batch_locked(ctx); - fd_acc_query_resume(aq, batch); - fd_batch_unlock_submit(batch); - fd_batch_reference(&batch, NULL); - } + /* TIMESTAMP/GPU_FINISHED and don't do normal bracketing at draw time, we + * need to just emit the capture at this moment. + */ + if (skip_begin_query(q->type)) { + struct fd_batch *batch = fd_context_batch_locked(ctx); + fd_acc_query_resume(aq, batch); + fd_batch_unlock_submit(batch); + fd_batch_reference(&batch, NULL); + } } static void -fd_acc_end_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_acc_end_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_acc_query *aq = fd_acc_query(q); + struct fd_acc_query *aq = fd_acc_query(q); - DBG("%p", q); + DBG("%p", q); - fd_acc_query_pause(aq); + fd_acc_query_pause(aq); - /* remove from active list: */ - list_delinit(&aq->node); + /* remove from active list: */ + list_delinit(&aq->node); } static bool -fd_acc_get_query_result(struct fd_context *ctx, struct fd_query *q, - bool wait, union pipe_query_result *result) +fd_acc_get_query_result(struct fd_context *ctx, struct fd_query *q, bool wait, + union pipe_query_result *result) { - struct fd_acc_query *aq = fd_acc_query(q); - const struct fd_acc_sample_provider *p = aq->provider; - struct fd_resource *rsc = fd_resource(aq->prsc); - - DBG("%p: wait=%d", q, wait); - - assert(list_is_empty(&aq->node)); - - /* if !wait, then check the last sample (the one most likely to - * not be ready yet) and bail if it is not ready: - */ - if (!wait) { - int ret; - - if (pending(rsc, false)) { - assert(!q->base.flushed); - tc_assert_driver_thread(ctx->tc); - - /* piglit spec@arb_occlusion_query@occlusion_query_conform - * test, and silly apps perhaps, get stuck in a loop trying - * to get query result forever with wait==false.. 
we don't - * wait to flush unnecessarily but we also don't want to - * spin forever: - */ - if (aq->no_wait_cnt++ > 5) { - fd_context_access_begin(ctx); - fd_batch_flush(rsc->track->write_batch); - fd_context_access_end(ctx); - } - return false; - } - - ret = fd_resource_wait(ctx, rsc, - DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC); - if (ret) - return false; - - fd_bo_cpu_fini(rsc->bo); - } - - if (rsc->track->write_batch) { - tc_assert_driver_thread(ctx->tc); - fd_context_access_begin(ctx); - fd_batch_flush(rsc->track->write_batch); - fd_context_access_end(ctx); - } - - /* get the result: */ - fd_resource_wait(ctx, rsc, DRM_FREEDRENO_PREP_READ); - - void *ptr = fd_bo_map(rsc->bo); - p->result(aq, ptr, result); - fd_bo_cpu_fini(rsc->bo); - - return true; + struct fd_acc_query *aq = fd_acc_query(q); + const struct fd_acc_sample_provider *p = aq->provider; + struct fd_resource *rsc = fd_resource(aq->prsc); + + DBG("%p: wait=%d", q, wait); + + assert(list_is_empty(&aq->node)); + + /* if !wait, then check the last sample (the one most likely to + * not be ready yet) and bail if it is not ready: + */ + if (!wait) { + int ret; + + if (pending(rsc, false)) { + assert(!q->base.flushed); + tc_assert_driver_thread(ctx->tc); + + /* piglit spec@arb_occlusion_query@occlusion_query_conform + * test, and silly apps perhaps, get stuck in a loop trying + * to get query result forever with wait==false.. we don't + * wait to flush unnecessarily but we also don't want to + * spin forever: + */ + if (aq->no_wait_cnt++ > 5) { + fd_context_access_begin(ctx); + fd_batch_flush(rsc->track->write_batch); + fd_context_access_end(ctx); + } + return false; + } + + ret = fd_resource_wait( + ctx, rsc, DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC); + if (ret) + return false; + + fd_bo_cpu_fini(rsc->bo); + } + + if (rsc->track->write_batch) { + tc_assert_driver_thread(ctx->tc); + fd_context_access_begin(ctx); + fd_batch_flush(rsc->track->write_batch); + fd_context_access_end(ctx); + } + + /* get the result: */ + fd_resource_wait(ctx, rsc, DRM_FREEDRENO_PREP_READ); + + void *ptr = fd_bo_map(rsc->bo); + p->result(aq, ptr, result); + fd_bo_cpu_fini(rsc->bo); + + return true; } static const struct fd_query_funcs acc_query_funcs = { - .destroy_query = fd_acc_destroy_query, - .begin_query = fd_acc_begin_query, - .end_query = fd_acc_end_query, - .get_query_result = fd_acc_get_query_result, + .destroy_query = fd_acc_destroy_query, + .begin_query = fd_acc_begin_query, + .end_query = fd_acc_end_query, + .get_query_result = fd_acc_get_query_result, }; struct fd_query * fd_acc_create_query2(struct fd_context *ctx, unsigned query_type, - unsigned index, const struct fd_acc_sample_provider *provider) + unsigned index, + const struct fd_acc_sample_provider *provider) { - struct fd_acc_query *aq; - struct fd_query *q; + struct fd_acc_query *aq; + struct fd_query *q; - aq = CALLOC_STRUCT(fd_acc_query); - if (!aq) - return NULL; + aq = CALLOC_STRUCT(fd_acc_query); + if (!aq) + return NULL; - DBG("%p: query_type=%u", aq, query_type); + DBG("%p: query_type=%u", aq, query_type); - aq->provider = provider; - aq->size = provider->size; + aq->provider = provider; + aq->size = provider->size; - list_inithead(&aq->node); + list_inithead(&aq->node); - q = &aq->base; - q->funcs = &acc_query_funcs; - q->type = query_type; - q->index = index; + q = &aq->base; + q->funcs = &acc_query_funcs; + q->type = query_type; + q->index = index; - return q; + return q; } struct fd_query * -fd_acc_create_query(struct fd_context *ctx, unsigned 
query_type, - unsigned index) +fd_acc_create_query(struct fd_context *ctx, unsigned query_type, unsigned index) { - int idx = pidx(query_type); + int idx = pidx(query_type); - if ((idx < 0) || !ctx->acc_sample_providers[idx]) - return NULL; + if ((idx < 0) || !ctx->acc_sample_providers[idx]) + return NULL; - return fd_acc_create_query2(ctx, query_type, index, - ctx->acc_sample_providers[idx]); + return fd_acc_create_query2(ctx, query_type, index, + ctx->acc_sample_providers[idx]); } /* Called at clear/draw/blit time to enable/disable the appropriate queries in @@ -252,35 +247,35 @@ fd_acc_create_query(struct fd_context *ctx, unsigned query_type, void fd_acc_query_update_batch(struct fd_batch *batch, bool disable_all) { - struct fd_context *ctx = batch->ctx; - - if (disable_all || ctx->update_active_queries) { - struct fd_acc_query *aq; - LIST_FOR_EACH_ENTRY(aq, &ctx->acc_active_queries, node) { - bool batch_change = aq->batch != batch; - bool was_active = aq->batch != NULL; - bool now_active = !disable_all && - (ctx->active_queries || aq->provider->always); - - if (was_active && (!now_active || batch_change)) - fd_acc_query_pause(aq); - if (now_active && (!was_active || batch_change)) - fd_acc_query_resume(aq, batch); - } - } - - ctx->update_active_queries = false; + struct fd_context *ctx = batch->ctx; + + if (disable_all || ctx->update_active_queries) { + struct fd_acc_query *aq; + LIST_FOR_EACH_ENTRY (aq, &ctx->acc_active_queries, node) { + bool batch_change = aq->batch != batch; + bool was_active = aq->batch != NULL; + bool now_active = + !disable_all && (ctx->active_queries || aq->provider->always); + + if (was_active && (!now_active || batch_change)) + fd_acc_query_pause(aq); + if (now_active && (!was_active || batch_change)) + fd_acc_query_resume(aq, batch); + } + } + + ctx->update_active_queries = false; } void fd_acc_query_register_provider(struct pipe_context *pctx, - const struct fd_acc_sample_provider *provider) + const struct fd_acc_sample_provider *provider) { - struct fd_context *ctx = fd_context(pctx); - int idx = pidx(provider->query_type); + struct fd_context *ctx = fd_context(pctx); + int idx = pidx(provider->query_type); - assert((0 <= idx) && (idx < MAX_HW_SAMPLE_PROVIDERS)); - assert(!ctx->acc_sample_providers[idx]); + assert((0 <= idx) && (idx < MAX_HW_SAMPLE_PROVIDERS)); + assert(!ctx->acc_sample_providers[idx]); - ctx->acc_sample_providers[idx] = provider; + ctx->acc_sample_providers[idx] = provider; } diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.h b/src/gallium/drivers/freedreno/freedreno_query_acc.h index 168813c..79dd399 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_acc.h +++ b/src/gallium/drivers/freedreno/freedreno_query_acc.h @@ -29,9 +29,8 @@ #include "util/list.h" -#include "freedreno_query.h" #include "freedreno_context.h" - +#include "freedreno_query.h" /* * Accumulated HW Queries: @@ -53,61 +52,64 @@ * span multiple batches, etc. 
*/ - struct fd_acc_query; struct fd_acc_sample_provider { - unsigned query_type; + unsigned query_type; - /* Set if the provider should still count while !ctx->active_queries */ - bool always; + /* Set if the provider should still count while !ctx->active_queries */ + bool always; - unsigned size; + unsigned size; - void (*resume)(struct fd_acc_query *aq, struct fd_batch *batch) dt; - void (*pause)(struct fd_acc_query *aq, struct fd_batch *batch) dt; + void (*resume)(struct fd_acc_query *aq, struct fd_batch *batch) dt; + void (*pause)(struct fd_acc_query *aq, struct fd_batch *batch) dt; - void (*result)(struct fd_acc_query *aq, void *buf, - union pipe_query_result *result); + void (*result)(struct fd_acc_query *aq, void *buf, + union pipe_query_result *result); }; struct fd_acc_query { - struct fd_query base; + struct fd_query base; - const struct fd_acc_sample_provider *provider; + const struct fd_acc_sample_provider *provider; - struct pipe_resource *prsc; + struct pipe_resource *prsc; - /* Pointer to the batch that our query has had resume() called on (if - * any). - */ - struct fd_batch *batch; + /* Pointer to the batch that our query has had resume() called on (if + * any). + */ + struct fd_batch *batch; - /* usually the same as provider->size but for batch queries we - * need to calculate the size dynamically when the query is - * allocated: - */ - unsigned size; + /* usually the same as provider->size but for batch queries we + * need to calculate the size dynamically when the query is + * allocated: + */ + unsigned size; - struct list_head node; /* list-node in ctx->active_acc_queries */ + struct list_head node; /* list-node in ctx->active_acc_queries */ - int no_wait_cnt; /* see fd_acc_get_query_result() */ + int no_wait_cnt; /* see fd_acc_get_query_result() */ - void *query_data; /* query specific data */ + void *query_data; /* query specific data */ }; static inline struct fd_acc_query * fd_acc_query(struct fd_query *q) { - return (struct fd_acc_query *)q; + return (struct fd_acc_query *)q; } -struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type, - unsigned index); -struct fd_query * fd_acc_create_query2(struct fd_context *ctx, unsigned query_type, - unsigned index, const struct fd_acc_sample_provider *provider); -void fd_acc_query_update_batch(struct fd_batch *batch, bool disable_all) assert_dt; -void fd_acc_query_register_provider(struct pipe_context *pctx, - const struct fd_acc_sample_provider *provider); +struct fd_query *fd_acc_create_query(struct fd_context *ctx, + unsigned query_type, unsigned index); +struct fd_query * +fd_acc_create_query2(struct fd_context *ctx, unsigned query_type, + unsigned index, + const struct fd_acc_sample_provider *provider); +void fd_acc_query_update_batch(struct fd_batch *batch, + bool disable_all) assert_dt; +void +fd_acc_query_register_provider(struct pipe_context *pctx, + const struct fd_acc_sample_provider *provider); #endif /* FREEDRENO_QUERY_ACC_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.c b/src/gallium/drivers/freedreno/freedreno_query_hw.c index 9eb0259..a4603f2 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_hw.c +++ b/src/gallium/drivers/freedreno/freedreno_query_hw.c @@ -25,337 +25,332 @@ */ #include "pipe/p_state.h" -#include "util/u_memory.h" #include "util/u_inlines.h" +#include "util/u_memory.h" -#include "freedreno_query_hw.h" #include "freedreno_context.h" +#include "freedreno_query_hw.h" #include "freedreno_resource.h" #include "freedreno_util.h" struct 
fd_hw_sample_period { - struct fd_hw_sample *start, *end; - struct list_head list; + struct fd_hw_sample *start, *end; + struct list_head list; }; static struct fd_hw_sample * get_sample(struct fd_batch *batch, struct fd_ringbuffer *ring, - unsigned query_type) - assert_dt + unsigned query_type) assert_dt { - struct fd_context *ctx = batch->ctx; - struct fd_hw_sample *samp = NULL; - int idx = pidx(query_type); + struct fd_context *ctx = batch->ctx; + struct fd_hw_sample *samp = NULL; + int idx = pidx(query_type); - assume(idx >= 0); /* query never would have been created otherwise */ + assume(idx >= 0); /* query never would have been created otherwise */ - if (!batch->sample_cache[idx]) { - struct fd_hw_sample *new_samp = - ctx->hw_sample_providers[idx]->get_sample(batch, ring); - fd_hw_sample_reference(ctx, &batch->sample_cache[idx], new_samp); - util_dynarray_append(&batch->samples, struct fd_hw_sample *, new_samp); - batch->needs_flush = true; - } + if (!batch->sample_cache[idx]) { + struct fd_hw_sample *new_samp = + ctx->hw_sample_providers[idx]->get_sample(batch, ring); + fd_hw_sample_reference(ctx, &batch->sample_cache[idx], new_samp); + util_dynarray_append(&batch->samples, struct fd_hw_sample *, new_samp); + batch->needs_flush = true; + } - fd_hw_sample_reference(ctx, &samp, batch->sample_cache[idx]); + fd_hw_sample_reference(ctx, &samp, batch->sample_cache[idx]); - return samp; + return samp; } static void clear_sample_cache(struct fd_batch *batch) { - int i; + int i; - for (i = 0; i < ARRAY_SIZE(batch->sample_cache); i++) - fd_hw_sample_reference(batch->ctx, &batch->sample_cache[i], NULL); + for (i = 0; i < ARRAY_SIZE(batch->sample_cache); i++) + fd_hw_sample_reference(batch->ctx, &batch->sample_cache[i], NULL); } static bool query_active_in_batch(struct fd_batch *batch, struct fd_hw_query *hq) { - int idx = pidx(hq->provider->query_type); - return batch->query_providers_active & (1 << idx); + int idx = pidx(hq->provider->query_type); + return batch->query_providers_active & (1 << idx); } static void resume_query(struct fd_batch *batch, struct fd_hw_query *hq, - struct fd_ringbuffer *ring) - assert_dt + struct fd_ringbuffer *ring) assert_dt { - int idx = pidx(hq->provider->query_type); - DBG("%p", hq); - assert(idx >= 0); /* query never would have been created otherwise */ - assert(!hq->period); - batch->query_providers_used |= (1 << idx); - batch->query_providers_active |= (1 << idx); - hq->period = slab_alloc_st(&batch->ctx->sample_period_pool); - list_inithead(&hq->period->list); - hq->period->start = get_sample(batch, ring, hq->base.type); - /* NOTE: slab_alloc_st() does not zero out the buffer: */ - hq->period->end = NULL; + int idx = pidx(hq->provider->query_type); + DBG("%p", hq); + assert(idx >= 0); /* query never would have been created otherwise */ + assert(!hq->period); + batch->query_providers_used |= (1 << idx); + batch->query_providers_active |= (1 << idx); + hq->period = slab_alloc_st(&batch->ctx->sample_period_pool); + list_inithead(&hq->period->list); + hq->period->start = get_sample(batch, ring, hq->base.type); + /* NOTE: slab_alloc_st() does not zero out the buffer: */ + hq->period->end = NULL; } static void pause_query(struct fd_batch *batch, struct fd_hw_query *hq, - struct fd_ringbuffer *ring) - assert_dt + struct fd_ringbuffer *ring) assert_dt { - ASSERTED int idx = pidx(hq->provider->query_type); - DBG("%p", hq); - assert(idx >= 0); /* query never would have been created otherwise */ - assert(hq->period && !hq->period->end); - 
assert(query_active_in_batch(batch, hq)); - batch->query_providers_active &= ~(1 << idx); - hq->period->end = get_sample(batch, ring, hq->base.type); - list_addtail(&hq->period->list, &hq->periods); - hq->period = NULL; + ASSERTED int idx = pidx(hq->provider->query_type); + DBG("%p", hq); + assert(idx >= 0); /* query never would have been created otherwise */ + assert(hq->period && !hq->period->end); + assert(query_active_in_batch(batch, hq)); + batch->query_providers_active &= ~(1 << idx); + hq->period->end = get_sample(batch, ring, hq->base.type); + list_addtail(&hq->period->list, &hq->periods); + hq->period = NULL; } static void destroy_periods(struct fd_context *ctx, struct fd_hw_query *hq) { - struct fd_hw_sample_period *period, *s; - LIST_FOR_EACH_ENTRY_SAFE(period, s, &hq->periods, list) { - fd_hw_sample_reference(ctx, &period->start, NULL); - fd_hw_sample_reference(ctx, &period->end, NULL); - list_del(&period->list); - slab_free_st(&ctx->sample_period_pool, period); - } + struct fd_hw_sample_period *period, *s; + LIST_FOR_EACH_ENTRY_SAFE (period, s, &hq->periods, list) { + fd_hw_sample_reference(ctx, &period->start, NULL); + fd_hw_sample_reference(ctx, &period->end, NULL); + list_del(&period->list); + slab_free_st(&ctx->sample_period_pool, period); + } } static void fd_hw_destroy_query(struct fd_context *ctx, struct fd_query *q) { - struct fd_hw_query *hq = fd_hw_query(q); + struct fd_hw_query *hq = fd_hw_query(q); - DBG("%p", q); + DBG("%p", q); - destroy_periods(ctx, hq); - list_del(&hq->list); + destroy_periods(ctx, hq); + list_del(&hq->list); - free(hq); + free(hq); } static void -fd_hw_begin_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_hw_begin_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_batch *batch = fd_context_batch_locked(ctx); - struct fd_hw_query *hq = fd_hw_query(q); + struct fd_batch *batch = fd_context_batch_locked(ctx); + struct fd_hw_query *hq = fd_hw_query(q); - DBG("%p", q); + DBG("%p", q); - /* begin_query() should clear previous results: */ - destroy_periods(ctx, hq); + /* begin_query() should clear previous results: */ + destroy_periods(ctx, hq); - if (batch && (ctx->active_queries || hq->provider->always)) - resume_query(batch, hq, batch->draw); + if (batch && (ctx->active_queries || hq->provider->always)) + resume_query(batch, hq, batch->draw); - /* add to active list: */ - assert(list_is_empty(&hq->list)); - list_addtail(&hq->list, &ctx->hw_active_queries); + /* add to active list: */ + assert(list_is_empty(&hq->list)); + list_addtail(&hq->list, &ctx->hw_active_queries); - fd_batch_unlock_submit(batch); - fd_batch_reference(&batch, NULL); + fd_batch_unlock_submit(batch); + fd_batch_reference(&batch, NULL); } static void -fd_hw_end_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_hw_end_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_batch *batch = fd_context_batch_locked(ctx); - struct fd_hw_query *hq = fd_hw_query(q); + struct fd_batch *batch = fd_context_batch_locked(ctx); + struct fd_hw_query *hq = fd_hw_query(q); - DBG("%p", q); + DBG("%p", q); - if (batch && (ctx->active_queries || hq->provider->always)) - pause_query(batch, hq, batch->draw); + if (batch && (ctx->active_queries || hq->provider->always)) + pause_query(batch, hq, batch->draw); - /* remove from active list: */ - list_delinit(&hq->list); + /* remove from active list: */ + list_delinit(&hq->list); - fd_batch_unlock_submit(batch); - fd_batch_reference(&batch, NULL); + fd_batch_unlock_submit(batch); + 
fd_batch_reference(&batch, NULL); } /* helper to get ptr to specified sample: */ -static void * sampptr(struct fd_hw_sample *samp, uint32_t n, void *ptr) +static void * +sampptr(struct fd_hw_sample *samp, uint32_t n, void *ptr) { - return ((char *)ptr) + (samp->tile_stride * n) + samp->offset; + return ((char *)ptr) + (samp->tile_stride * n) + samp->offset; } static bool -fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q, - bool wait, union pipe_query_result *result) +fd_hw_get_query_result(struct fd_context *ctx, struct fd_query *q, bool wait, + union pipe_query_result *result) { - struct fd_hw_query *hq = fd_hw_query(q); - const struct fd_hw_sample_provider *p = hq->provider; - struct fd_hw_sample_period *period; - - DBG("%p: wait=%d", q, wait); - - if (list_is_empty(&hq->periods)) - return true; - - assert(list_is_empty(&hq->list)); - assert(!hq->period); - - /* if !wait, then check the last sample (the one most likely to - * not be ready yet) and bail if it is not ready: - */ - if (!wait) { - int ret; - - period = LIST_ENTRY(struct fd_hw_sample_period, - hq->periods.prev, list); - - struct fd_resource *rsc = fd_resource(period->end->prsc); - - if (pending(rsc, false)) { - assert(!q->base.flushed); - tc_assert_driver_thread(ctx->tc); - - /* piglit spec@arb_occlusion_query@occlusion_query_conform - * test, and silly apps perhaps, get stuck in a loop trying - * to get query result forever with wait==false.. we don't - * wait to flush unnecessarily but we also don't want to - * spin forever: - */ - if (hq->no_wait_cnt++ > 5) { - fd_context_access_begin(ctx); - fd_batch_flush(rsc->track->write_batch); - fd_context_access_end(ctx); - } - return false; - } - - if (!rsc->bo) - return false; - - ret = fd_resource_wait(ctx, rsc, - DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC); - if (ret) - return false; - - fd_bo_cpu_fini(rsc->bo); - } - - /* sum the result across all sample periods: */ - LIST_FOR_EACH_ENTRY(period, &hq->periods, list) { - struct fd_hw_sample *start = period->start; - ASSERTED struct fd_hw_sample *end = period->end; - unsigned i; - - /* start and end samples should be from same batch: */ - assert(start->prsc == end->prsc); - assert(start->num_tiles == end->num_tiles); - - struct fd_resource *rsc = fd_resource(start->prsc); - - if (rsc->track->write_batch) { - tc_assert_driver_thread(ctx->tc); - fd_context_access_begin(ctx); - fd_batch_flush(rsc->track->write_batch); - fd_context_access_end(ctx); - } - - /* some piglit tests at least do query with no draws, I guess: */ - if (!rsc->bo) - continue; - - fd_resource_wait(ctx, rsc, DRM_FREEDRENO_PREP_READ); - - void *ptr = fd_bo_map(rsc->bo); - - for (i = 0; i < start->num_tiles; i++) { - p->accumulate_result(ctx, sampptr(period->start, i, ptr), - sampptr(period->end, i, ptr), result); - } - - fd_bo_cpu_fini(rsc->bo); - } - - return true; + struct fd_hw_query *hq = fd_hw_query(q); + const struct fd_hw_sample_provider *p = hq->provider; + struct fd_hw_sample_period *period; + + DBG("%p: wait=%d", q, wait); + + if (list_is_empty(&hq->periods)) + return true; + + assert(list_is_empty(&hq->list)); + assert(!hq->period); + + /* if !wait, then check the last sample (the one most likely to + * not be ready yet) and bail if it is not ready: + */ + if (!wait) { + int ret; + + period = LIST_ENTRY(struct fd_hw_sample_period, hq->periods.prev, list); + + struct fd_resource *rsc = fd_resource(period->end->prsc); + + if (pending(rsc, false)) { + assert(!q->base.flushed); + tc_assert_driver_thread(ctx->tc); + + /* piglit 
spec@arb_occlusion_query@occlusion_query_conform + * test, and silly apps perhaps, get stuck in a loop trying + * to get query result forever with wait==false.. we don't + * wait to flush unnecessarily but we also don't want to + * spin forever: + */ + if (hq->no_wait_cnt++ > 5) { + fd_context_access_begin(ctx); + fd_batch_flush(rsc->track->write_batch); + fd_context_access_end(ctx); + } + return false; + } + + if (!rsc->bo) + return false; + + ret = fd_resource_wait( + ctx, rsc, DRM_FREEDRENO_PREP_READ | DRM_FREEDRENO_PREP_NOSYNC); + if (ret) + return false; + + fd_bo_cpu_fini(rsc->bo); + } + + /* sum the result across all sample periods: */ + LIST_FOR_EACH_ENTRY (period, &hq->periods, list) { + struct fd_hw_sample *start = period->start; + ASSERTED struct fd_hw_sample *end = period->end; + unsigned i; + + /* start and end samples should be from same batch: */ + assert(start->prsc == end->prsc); + assert(start->num_tiles == end->num_tiles); + + struct fd_resource *rsc = fd_resource(start->prsc); + + if (rsc->track->write_batch) { + tc_assert_driver_thread(ctx->tc); + fd_context_access_begin(ctx); + fd_batch_flush(rsc->track->write_batch); + fd_context_access_end(ctx); + } + + /* some piglit tests at least do query with no draws, I guess: */ + if (!rsc->bo) + continue; + + fd_resource_wait(ctx, rsc, DRM_FREEDRENO_PREP_READ); + + void *ptr = fd_bo_map(rsc->bo); + + for (i = 0; i < start->num_tiles; i++) { + p->accumulate_result(ctx, sampptr(period->start, i, ptr), + sampptr(period->end, i, ptr), result); + } + + fd_bo_cpu_fini(rsc->bo); + } + + return true; } static const struct fd_query_funcs hw_query_funcs = { - .destroy_query = fd_hw_destroy_query, - .begin_query = fd_hw_begin_query, - .end_query = fd_hw_end_query, - .get_query_result = fd_hw_get_query_result, + .destroy_query = fd_hw_destroy_query, + .begin_query = fd_hw_begin_query, + .end_query = fd_hw_end_query, + .get_query_result = fd_hw_get_query_result, }; struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index) { - struct fd_hw_query *hq; - struct fd_query *q; - int idx = pidx(query_type); + struct fd_hw_query *hq; + struct fd_query *q; + int idx = pidx(query_type); - if ((idx < 0) || !ctx->hw_sample_providers[idx]) - return NULL; + if ((idx < 0) || !ctx->hw_sample_providers[idx]) + return NULL; - hq = CALLOC_STRUCT(fd_hw_query); - if (!hq) - return NULL; + hq = CALLOC_STRUCT(fd_hw_query); + if (!hq) + return NULL; - DBG("%p: query_type=%u", hq, query_type); + DBG("%p: query_type=%u", hq, query_type); - hq->provider = ctx->hw_sample_providers[idx]; + hq->provider = ctx->hw_sample_providers[idx]; - list_inithead(&hq->periods); - list_inithead(&hq->list); + list_inithead(&hq->periods); + list_inithead(&hq->list); - q = &hq->base; - q->funcs = &hw_query_funcs; - q->type = query_type; - q->index = index; + q = &hq->base; + q->funcs = &hw_query_funcs; + q->type = query_type; + q->index = index; - return q; + return q; } struct fd_hw_sample * fd_hw_sample_init(struct fd_batch *batch, uint32_t size) { - struct fd_hw_sample *samp = slab_alloc_st(&batch->ctx->sample_pool); - pipe_reference_init(&samp->reference, 1); - samp->size = size; - debug_assert(util_is_power_of_two_or_zero(size)); - batch->next_sample_offset = align(batch->next_sample_offset, size); - samp->offset = batch->next_sample_offset; - /* NOTE: slab_alloc_st() does not zero out the buffer: */ - samp->prsc = NULL; - samp->num_tiles = 0; - samp->tile_stride = 0; - batch->next_sample_offset += size; - - if (!batch->query_buf) { - 
struct pipe_screen *pscreen = &batch->ctx->screen->base; - struct pipe_resource templ = { - .target = PIPE_BUFFER, - .format = PIPE_FORMAT_R8_UNORM, - .bind = PIPE_BIND_QUERY_BUFFER, - .width0 = 0, /* create initially zero size buffer */ - .height0 = 1, - .depth0 = 1, - .array_size = 1, - .last_level = 0, - .nr_samples = 1, - }; - batch->query_buf = pscreen->resource_create(pscreen, &templ); - } - - pipe_resource_reference(&samp->prsc, batch->query_buf); - - return samp; + struct fd_hw_sample *samp = slab_alloc_st(&batch->ctx->sample_pool); + pipe_reference_init(&samp->reference, 1); + samp->size = size; + debug_assert(util_is_power_of_two_or_zero(size)); + batch->next_sample_offset = align(batch->next_sample_offset, size); + samp->offset = batch->next_sample_offset; + /* NOTE: slab_alloc_st() does not zero out the buffer: */ + samp->prsc = NULL; + samp->num_tiles = 0; + samp->tile_stride = 0; + batch->next_sample_offset += size; + + if (!batch->query_buf) { + struct pipe_screen *pscreen = &batch->ctx->screen->base; + struct pipe_resource templ = { + .target = PIPE_BUFFER, + .format = PIPE_FORMAT_R8_UNORM, + .bind = PIPE_BIND_QUERY_BUFFER, + .width0 = 0, /* create initially zero size buffer */ + .height0 = 1, + .depth0 = 1, + .array_size = 1, + .last_level = 0, + .nr_samples = 1, + }; + batch->query_buf = pscreen->resource_create(pscreen, &templ); + } + + pipe_resource_reference(&samp->prsc, batch->query_buf); + + return samp; } void __fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp) { - pipe_resource_reference(&samp->prsc, NULL); - slab_free_st(&ctx->sample_pool, samp); + pipe_resource_reference(&samp->prsc, NULL); + slab_free_st(&ctx->sample_pool, samp); } /* called from gmem code once total storage requirements are known (ie. 
@@ -364,60 +359,60 @@ __fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp) void fd_hw_query_prepare(struct fd_batch *batch, uint32_t num_tiles) { - uint32_t tile_stride = batch->next_sample_offset; + uint32_t tile_stride = batch->next_sample_offset; - if (tile_stride > 0) - fd_resource_resize(batch->query_buf, tile_stride * num_tiles); + if (tile_stride > 0) + fd_resource_resize(batch->query_buf, tile_stride * num_tiles); - batch->query_tile_stride = tile_stride; + batch->query_tile_stride = tile_stride; - while (batch->samples.size > 0) { - struct fd_hw_sample *samp = - util_dynarray_pop(&batch->samples, struct fd_hw_sample *); - samp->num_tiles = num_tiles; - samp->tile_stride = tile_stride; - fd_hw_sample_reference(batch->ctx, &samp, NULL); - } + while (batch->samples.size > 0) { + struct fd_hw_sample *samp = + util_dynarray_pop(&batch->samples, struct fd_hw_sample *); + samp->num_tiles = num_tiles; + samp->tile_stride = tile_stride; + fd_hw_sample_reference(batch->ctx, &samp, NULL); + } - /* reset things for next batch: */ - batch->next_sample_offset = 0; + /* reset things for next batch: */ + batch->next_sample_offset = 0; } void fd_hw_query_prepare_tile(struct fd_batch *batch, uint32_t n, - struct fd_ringbuffer *ring) + struct fd_ringbuffer *ring) { - uint32_t tile_stride = batch->query_tile_stride; - uint32_t offset = tile_stride * n; + uint32_t tile_stride = batch->query_tile_stride; + uint32_t offset = tile_stride * n; - /* bail if no queries: */ - if (tile_stride == 0) - return; + /* bail if no queries: */ + if (tile_stride == 0) + return; - fd_wfi(batch, ring); - OUT_PKT0 (ring, HW_QUERY_BASE_REG, 1); - OUT_RELOC(ring, fd_resource(batch->query_buf)->bo, offset, 0, 0); + fd_wfi(batch, ring); + OUT_PKT0(ring, HW_QUERY_BASE_REG, 1); + OUT_RELOC(ring, fd_resource(batch->query_buf)->bo, offset, 0, 0); } void fd_hw_query_update_batch(struct fd_batch *batch, bool disable_all) { - struct fd_context *ctx = batch->ctx; - - if (disable_all || ctx->update_active_queries) { - struct fd_hw_query *hq; - LIST_FOR_EACH_ENTRY(hq, &batch->ctx->hw_active_queries, list) { - bool was_active = query_active_in_batch(batch, hq); - bool now_active = !disable_all && - (ctx->active_queries || hq->provider->always); - - if (now_active && !was_active) - resume_query(batch, hq, batch->draw); - else if (was_active && !now_active) - pause_query(batch, hq, batch->draw); - } - } - clear_sample_cache(batch); + struct fd_context *ctx = batch->ctx; + + if (disable_all || ctx->update_active_queries) { + struct fd_hw_query *hq; + LIST_FOR_EACH_ENTRY (hq, &batch->ctx->hw_active_queries, list) { + bool was_active = query_active_in_batch(batch, hq); + bool now_active = + !disable_all && (ctx->active_queries || hq->provider->always); + + if (now_active && !was_active) + resume_query(batch, hq, batch->draw); + else if (was_active && !now_active) + pause_query(batch, hq, batch->draw); + } + } + clear_sample_cache(batch); } /* call the provider->enable() for all the hw queries that were active @@ -427,45 +422,44 @@ fd_hw_query_update_batch(struct fd_batch *batch, bool disable_all) void fd_hw_query_enable(struct fd_batch *batch, struct fd_ringbuffer *ring) { - struct fd_context *ctx = batch->ctx; - for (int idx = 0; idx < MAX_HW_SAMPLE_PROVIDERS; idx++) { - if (batch->query_providers_used & (1 << idx)) { - assert(ctx->hw_sample_providers[idx]); - if (ctx->hw_sample_providers[idx]->enable) - ctx->hw_sample_providers[idx]->enable(ctx, ring); - } - } + struct fd_context *ctx = batch->ctx; + for (int idx = 0; 
idx < MAX_HW_SAMPLE_PROVIDERS; idx++) { + if (batch->query_providers_used & (1 << idx)) { + assert(ctx->hw_sample_providers[idx]); + if (ctx->hw_sample_providers[idx]->enable) + ctx->hw_sample_providers[idx]->enable(ctx, ring); + } + } } void fd_hw_query_register_provider(struct pipe_context *pctx, - const struct fd_hw_sample_provider *provider) + const struct fd_hw_sample_provider *provider) { - struct fd_context *ctx = fd_context(pctx); - int idx = pidx(provider->query_type); + struct fd_context *ctx = fd_context(pctx); + int idx = pidx(provider->query_type); - assert((0 <= idx) && (idx < MAX_HW_SAMPLE_PROVIDERS)); - assert(!ctx->hw_sample_providers[idx]); + assert((0 <= idx) && (idx < MAX_HW_SAMPLE_PROVIDERS)); + assert(!ctx->hw_sample_providers[idx]); - ctx->hw_sample_providers[idx] = provider; + ctx->hw_sample_providers[idx] = provider; } void fd_hw_query_init(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - slab_create(&ctx->sample_pool, sizeof(struct fd_hw_sample), - 16); - slab_create(&ctx->sample_period_pool, sizeof(struct fd_hw_sample_period), - 16); + slab_create(&ctx->sample_pool, sizeof(struct fd_hw_sample), 16); + slab_create(&ctx->sample_period_pool, sizeof(struct fd_hw_sample_period), + 16); } void fd_hw_query_fini(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - slab_destroy(&ctx->sample_pool); - slab_destroy(&ctx->sample_period_pool); + slab_destroy(&ctx->sample_pool); + slab_destroy(&ctx->sample_period_pool); } diff --git a/src/gallium/drivers/freedreno/freedreno_query_hw.h b/src/gallium/drivers/freedreno/freedreno_query_hw.h index bb3f8b6..01fc9ce 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_hw.h +++ b/src/gallium/drivers/freedreno/freedreno_query_hw.h @@ -29,9 +29,8 @@ #include "util/list.h" -#include "freedreno_query.h" #include "freedreno_context.h" - +#include "freedreno_query.h" /* * HW Queries: @@ -69,97 +68,99 @@ */ struct fd_hw_sample_provider { - unsigned query_type; - - /* Set if the provider should still count while !ctx->active_queries */ - bool always; - - /* Optional hook for enabling a counter. Guaranteed to happen - * at least once before the first ->get_sample() in a batch. - */ - void (*enable)(struct fd_context *ctx, struct fd_ringbuffer *ring) dt; - - /* when a new sample is required, emit appropriate cmdstream - * and return a sample object: - */ - struct fd_hw_sample *(*get_sample)(struct fd_batch *batch, - struct fd_ringbuffer *ring) dt; - - /* accumulate the results from specified sample period: */ - void (*accumulate_result)(struct fd_context *ctx, - const void *start, const void *end, - union pipe_query_result *result); + unsigned query_type; + + /* Set if the provider should still count while !ctx->active_queries */ + bool always; + + /* Optional hook for enabling a counter. Guaranteed to happen + * at least once before the first ->get_sample() in a batch. 
+ */ + void (*enable)(struct fd_context *ctx, struct fd_ringbuffer *ring) dt; + + /* when a new sample is required, emit appropriate cmdstream + * and return a sample object: + */ + struct fd_hw_sample *(*get_sample)(struct fd_batch *batch, + struct fd_ringbuffer *ring)dt; + + /* accumulate the results from specified sample period: */ + void (*accumulate_result)(struct fd_context *ctx, const void *start, + const void *end, union pipe_query_result *result); }; struct fd_hw_sample { - struct pipe_reference reference; /* keep this first */ - - /* offset and size of the sample are know at the time the - * sample is constructed. - */ - uint32_t size; - uint32_t offset; - - /* backing object, offset/stride/etc are determined not when - * the sample is constructed, but when the batch is submitted. - * This way we can defer allocation until total # of requested - * samples, and total # of tiles, is known. - */ - struct pipe_resource *prsc; - uint32_t num_tiles; - uint32_t tile_stride; + struct pipe_reference reference; /* keep this first */ + + /* offset and size of the sample are know at the time the + * sample is constructed. + */ + uint32_t size; + uint32_t offset; + + /* backing object, offset/stride/etc are determined not when + * the sample is constructed, but when the batch is submitted. + * This way we can defer allocation until total # of requested + * samples, and total # of tiles, is known. + */ + struct pipe_resource *prsc; + uint32_t num_tiles; + uint32_t tile_stride; }; struct fd_hw_sample_period; struct fd_hw_query { - struct fd_query base; + struct fd_query base; - const struct fd_hw_sample_provider *provider; + const struct fd_hw_sample_provider *provider; - /* list of fd_hw_sample_periods: */ - struct list_head periods; + /* list of fd_hw_sample_periods: */ + struct list_head periods; - /* if active and not paused, the current sample period (not - * yet added to current_periods): - */ - struct fd_hw_sample_period *period; + /* if active and not paused, the current sample period (not + * yet added to current_periods): + */ + struct fd_hw_sample_period *period; - struct list_head list; /* list-node in batch->active_queries */ + struct list_head list; /* list-node in batch->active_queries */ - int no_wait_cnt; /* see fd_hw_get_query_result */ + int no_wait_cnt; /* see fd_hw_get_query_result */ }; static inline struct fd_hw_query * fd_hw_query(struct fd_query *q) { - return (struct fd_hw_query *)q; + return (struct fd_hw_query *)q; } -struct fd_query * fd_hw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index); +struct fd_query *fd_hw_create_query(struct fd_context *ctx, unsigned query_type, + unsigned index); /* helper for sample providers: */ -struct fd_hw_sample * fd_hw_sample_init(struct fd_batch *batch, uint32_t size); +struct fd_hw_sample *fd_hw_sample_init(struct fd_batch *batch, uint32_t size); /* don't call directly, use fd_hw_sample_reference() */ void __fd_hw_sample_destroy(struct fd_context *ctx, struct fd_hw_sample *samp); void fd_hw_query_prepare(struct fd_batch *batch, uint32_t num_tiles) assert_dt; void fd_hw_query_prepare_tile(struct fd_batch *batch, uint32_t n, - struct fd_ringbuffer *ring) assert_dt; + struct fd_ringbuffer *ring) assert_dt; void fd_hw_query_update_batch(struct fd_batch *batch, bool end_batch) assert_dt; -void fd_hw_query_enable(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt; -void fd_hw_query_register_provider(struct pipe_context *pctx, - const struct fd_hw_sample_provider *provider); +void 
fd_hw_query_enable(struct fd_batch *batch, + struct fd_ringbuffer *ring) assert_dt; +void +fd_hw_query_register_provider(struct pipe_context *pctx, + const struct fd_hw_sample_provider *provider); void fd_hw_query_init(struct pipe_context *pctx); void fd_hw_query_fini(struct pipe_context *pctx); static inline void -fd_hw_sample_reference(struct fd_context *ctx, - struct fd_hw_sample **ptr, struct fd_hw_sample *samp) +fd_hw_sample_reference(struct fd_context *ctx, struct fd_hw_sample **ptr, + struct fd_hw_sample *samp) { - struct fd_hw_sample *old_samp = *ptr; + struct fd_hw_sample *old_samp = *ptr; - if (pipe_reference(&(*ptr)->reference, &samp->reference)) - __fd_hw_sample_destroy(ctx, old_samp); - *ptr = samp; + if (pipe_reference(&(*ptr)->reference, &samp->reference)) + __fd_hw_sample_destroy(ctx, old_samp); + *ptr = samp; } #endif /* FREEDRENO_QUERY_HW_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.c b/src/gallium/drivers/freedreno/freedreno_query_sw.c index 8c774e5..2876b97 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_sw.c +++ b/src/gallium/drivers/freedreno/freedreno_query_sw.c @@ -25,13 +25,13 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/os_time.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "freedreno_query_sw.h" #include "freedreno_context.h" +#include "freedreno_query_sw.h" #include "freedreno_util.h" /* @@ -43,164 +43,161 @@ static void fd_sw_destroy_query(struct fd_context *ctx, struct fd_query *q) { - struct fd_sw_query *sq = fd_sw_query(q); - free(sq); + struct fd_sw_query *sq = fd_sw_query(q); + free(sq); } static uint64_t -read_counter(struct fd_context *ctx, int type) - assert_dt +read_counter(struct fd_context *ctx, int type) assert_dt { - switch (type) { - case PIPE_QUERY_PRIMITIVES_GENERATED: - return ctx->stats.prims_generated; - case PIPE_QUERY_PRIMITIVES_EMITTED: - return ctx->stats.prims_emitted; - case FD_QUERY_DRAW_CALLS: - return ctx->stats.draw_calls; - case FD_QUERY_BATCH_TOTAL: - return ctx->stats.batch_total; - case FD_QUERY_BATCH_SYSMEM: - return ctx->stats.batch_sysmem; - case FD_QUERY_BATCH_GMEM: - return ctx->stats.batch_gmem; - case FD_QUERY_BATCH_NONDRAW: - return ctx->stats.batch_nondraw; - case FD_QUERY_BATCH_RESTORE: - return ctx->stats.batch_restore; - case FD_QUERY_STAGING_UPLOADS: - return ctx->stats.staging_uploads; - case FD_QUERY_SHADOW_UPLOADS: - return ctx->stats.shadow_uploads; - case FD_QUERY_VS_REGS: - return ctx->stats.vs_regs; - case FD_QUERY_FS_REGS: - return ctx->stats.fs_regs; - } - return 0; + switch (type) { + case PIPE_QUERY_PRIMITIVES_GENERATED: + return ctx->stats.prims_generated; + case PIPE_QUERY_PRIMITIVES_EMITTED: + return ctx->stats.prims_emitted; + case FD_QUERY_DRAW_CALLS: + return ctx->stats.draw_calls; + case FD_QUERY_BATCH_TOTAL: + return ctx->stats.batch_total; + case FD_QUERY_BATCH_SYSMEM: + return ctx->stats.batch_sysmem; + case FD_QUERY_BATCH_GMEM: + return ctx->stats.batch_gmem; + case FD_QUERY_BATCH_NONDRAW: + return ctx->stats.batch_nondraw; + case FD_QUERY_BATCH_RESTORE: + return ctx->stats.batch_restore; + case FD_QUERY_STAGING_UPLOADS: + return ctx->stats.staging_uploads; + case FD_QUERY_SHADOW_UPLOADS: + return ctx->stats.shadow_uploads; + case FD_QUERY_VS_REGS: + return ctx->stats.vs_regs; + case FD_QUERY_FS_REGS: + return ctx->stats.fs_regs; + } + return 0; } static bool is_time_rate_query(struct fd_query *q) { - switch (q->type) { - 
case FD_QUERY_BATCH_TOTAL: - case FD_QUERY_BATCH_SYSMEM: - case FD_QUERY_BATCH_GMEM: - case FD_QUERY_BATCH_NONDRAW: - case FD_QUERY_BATCH_RESTORE: - case FD_QUERY_STAGING_UPLOADS: - case FD_QUERY_SHADOW_UPLOADS: - return true; - default: - return false; - } + switch (q->type) { + case FD_QUERY_BATCH_TOTAL: + case FD_QUERY_BATCH_SYSMEM: + case FD_QUERY_BATCH_GMEM: + case FD_QUERY_BATCH_NONDRAW: + case FD_QUERY_BATCH_RESTORE: + case FD_QUERY_STAGING_UPLOADS: + case FD_QUERY_SHADOW_UPLOADS: + return true; + default: + return false; + } } static bool is_draw_rate_query(struct fd_query *q) { - switch (q->type) { - case FD_QUERY_VS_REGS: - case FD_QUERY_FS_REGS: - return true; - default: - return false; - } + switch (q->type) { + case FD_QUERY_VS_REGS: + case FD_QUERY_FS_REGS: + return true; + default: + return false; + } } static void -fd_sw_begin_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_sw_begin_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_sw_query *sq = fd_sw_query(q); + struct fd_sw_query *sq = fd_sw_query(q); - ctx->stats_users++; + ctx->stats_users++; - sq->begin_value = read_counter(ctx, q->type); - if (is_time_rate_query(q)) { - sq->begin_time = os_time_get(); - } else if (is_draw_rate_query(q)) { - sq->begin_time = ctx->stats.draw_calls; - } + sq->begin_value = read_counter(ctx, q->type); + if (is_time_rate_query(q)) { + sq->begin_time = os_time_get(); + } else if (is_draw_rate_query(q)) { + sq->begin_time = ctx->stats.draw_calls; + } } static void -fd_sw_end_query(struct fd_context *ctx, struct fd_query *q) - assert_dt +fd_sw_end_query(struct fd_context *ctx, struct fd_query *q) assert_dt { - struct fd_sw_query *sq = fd_sw_query(q); + struct fd_sw_query *sq = fd_sw_query(q); - assert(ctx->stats_users > 0); - ctx->stats_users--; + assert(ctx->stats_users > 0); + ctx->stats_users--; - sq->end_value = read_counter(ctx, q->type); - if (is_time_rate_query(q)) { - sq->end_time = os_time_get(); - } else if (is_draw_rate_query(q)) { - sq->end_time = ctx->stats.draw_calls; - } + sq->end_value = read_counter(ctx, q->type); + if (is_time_rate_query(q)) { + sq->end_time = os_time_get(); + } else if (is_draw_rate_query(q)) { + sq->end_time = ctx->stats.draw_calls; + } } static bool -fd_sw_get_query_result(struct fd_context *ctx, struct fd_query *q, - bool wait, union pipe_query_result *result) +fd_sw_get_query_result(struct fd_context *ctx, struct fd_query *q, bool wait, + union pipe_query_result *result) { - struct fd_sw_query *sq = fd_sw_query(q); + struct fd_sw_query *sq = fd_sw_query(q); - result->u64 = sq->end_value - sq->begin_value; + result->u64 = sq->end_value - sq->begin_value; - if (is_time_rate_query(q)) { - double fps = (result->u64 * 1000000) / - (double)(sq->end_time - sq->begin_time); - result->u64 = (uint64_t)fps; - } else if (is_draw_rate_query(q)) { - double avg = ((double)result->u64) / - (double)(sq->end_time - sq->begin_time); - result->f = avg; - } + if (is_time_rate_query(q)) { + double fps = + (result->u64 * 1000000) / (double)(sq->end_time - sq->begin_time); + result->u64 = (uint64_t)fps; + } else if (is_draw_rate_query(q)) { + double avg = + ((double)result->u64) / (double)(sq->end_time - sq->begin_time); + result->f = avg; + } - return true; + return true; } static const struct fd_query_funcs sw_query_funcs = { - .destroy_query = fd_sw_destroy_query, - .begin_query = fd_sw_begin_query, - .end_query = fd_sw_end_query, - .get_query_result = fd_sw_get_query_result, + .destroy_query = fd_sw_destroy_query, + .begin_query = 
fd_sw_begin_query, + .end_query = fd_sw_end_query, + .get_query_result = fd_sw_get_query_result, }; struct fd_query * fd_sw_create_query(struct fd_context *ctx, unsigned query_type, unsigned index) { - struct fd_sw_query *sq; - struct fd_query *q; - - switch (query_type) { - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_PRIMITIVES_EMITTED: - case FD_QUERY_DRAW_CALLS: - case FD_QUERY_BATCH_TOTAL: - case FD_QUERY_BATCH_SYSMEM: - case FD_QUERY_BATCH_GMEM: - case FD_QUERY_BATCH_NONDRAW: - case FD_QUERY_BATCH_RESTORE: - case FD_QUERY_STAGING_UPLOADS: - case FD_QUERY_SHADOW_UPLOADS: - case FD_QUERY_VS_REGS: - case FD_QUERY_FS_REGS: - break; - default: - return NULL; - } - - sq = CALLOC_STRUCT(fd_sw_query); - if (!sq) - return NULL; - - q = &sq->base; - q->funcs = &sw_query_funcs; - q->type = query_type; - - return q; + struct fd_sw_query *sq; + struct fd_query *q; + + switch (query_type) { + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + case FD_QUERY_DRAW_CALLS: + case FD_QUERY_BATCH_TOTAL: + case FD_QUERY_BATCH_SYSMEM: + case FD_QUERY_BATCH_GMEM: + case FD_QUERY_BATCH_NONDRAW: + case FD_QUERY_BATCH_RESTORE: + case FD_QUERY_STAGING_UPLOADS: + case FD_QUERY_SHADOW_UPLOADS: + case FD_QUERY_VS_REGS: + case FD_QUERY_FS_REGS: + break; + default: + return NULL; + } + + sq = CALLOC_STRUCT(fd_sw_query); + if (!sq) + return NULL; + + q = &sq->base; + q->funcs = &sw_query_funcs; + q->type = query_type; + + return q; } diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.h b/src/gallium/drivers/freedreno/freedreno_query_sw.h index 967e4af..1403ec6 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_sw.h +++ b/src/gallium/drivers/freedreno/freedreno_query_sw.h @@ -36,18 +36,18 @@ */ struct fd_sw_query { - struct fd_query base; - uint64_t begin_value, end_value; - uint64_t begin_time, end_time; + struct fd_query base; + uint64_t begin_value, end_value; + uint64_t begin_time, end_time; }; static inline struct fd_sw_query * fd_sw_query(struct fd_query *q) { - return (struct fd_sw_query *)q; + return (struct fd_sw_query *)q; } -struct fd_query * fd_sw_create_query(struct fd_context *ctx, - unsigned query_type, unsigned index); +struct fd_query *fd_sw_create_query(struct fd_context *ctx, unsigned query_type, + unsigned index); #endif /* FREEDRENO_QUERY_SW_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index 28c7fb2..cb8038c 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -27,27 +27,27 @@ #include "util/format/u_format.h" #include "util/format/u_format_rgtc.h" #include "util/format/u_format_zs.h" +#include "util/set.h" +#include "util/u_drm.h" #include "util/u_inlines.h" -#include "util/u_transfer.h" #include "util/u_string.h" #include "util/u_surface.h" -#include "util/set.h" -#include "util/u_drm.h" +#include "util/u_transfer.h" #include "decode/util.h" -#include "freedreno_resource.h" #include "freedreno_batch_cache.h" #include "freedreno_blitter.h" +#include "freedreno_context.h" #include "freedreno_fence.h" +#include "freedreno_query_hw.h" +#include "freedreno_resource.h" #include "freedreno_screen.h" #include "freedreno_surface.h" -#include "freedreno_context.h" -#include "freedreno_query_hw.h" #include "freedreno_util.h" -#include "drm-uapi/drm_fourcc.h" #include +#include "drm-uapi/drm_fourcc.h" /* XXX this should go away, needed for 'struct winsys_handle' */ #include "frontend/drm_driver.h" @@ 
-58,7 +58,7 @@ * the layout(s) of the tiled modes, and whether they are the same * across generations. */ -#define FD_FORMAT_MOD_QCOM_TILED fourcc_mod_code(QCOM, 0xffffffff) +#define FD_FORMAT_MOD_QCOM_TILED fourcc_mod_code(QCOM, 0xffffffff) /** * Go through the entire state and see if the resource is bound @@ -67,179 +67,181 @@ * emitted so the GPU looks at the new backing bo. */ static void -rebind_resource_in_ctx(struct fd_context *ctx, struct fd_resource *rsc) - assert_dt +rebind_resource_in_ctx(struct fd_context *ctx, + struct fd_resource *rsc) assert_dt { - struct pipe_resource *prsc = &rsc->b.b; - - if (ctx->rebind_resource) - ctx->rebind_resource(ctx, rsc); - - /* VBOs */ - if (rsc->dirty & FD_DIRTY_VTXBUF) { - struct fd_vertexbuf_stateobj *vb = &ctx->vtx.vertexbuf; - for (unsigned i = 0; i < vb->count && !(ctx->dirty & FD_DIRTY_VTXBUF); i++) { - if (vb->vb[i].buffer.resource == prsc) - fd_context_dirty(ctx, FD_DIRTY_VTXBUF); - } - } - - const enum fd_dirty_3d_state per_stage_dirty = - FD_DIRTY_CONST | FD_DIRTY_TEX | FD_DIRTY_IMAGE | FD_DIRTY_SSBO; - - if (!(rsc->dirty & per_stage_dirty)) - return; - - /* per-shader-stage resources: */ - for (unsigned stage = 0; stage < PIPE_SHADER_TYPES; stage++) { - /* Constbufs.. note that constbuf[0] is normal uniforms emitted in - * cmdstream rather than by pointer.. - */ - if ((rsc->dirty & FD_DIRTY_CONST) && - !(ctx->dirty_shader[stage] & FD_DIRTY_CONST)) { - struct fd_constbuf_stateobj *cb = &ctx->constbuf[stage]; - const unsigned num_ubos = util_last_bit(cb->enabled_mask); - for (unsigned i = 1; i < num_ubos; i++) { - if (cb->cb[i].buffer == prsc) { - fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_CONST); - break; - } - } - } - - /* Textures */ - if ((rsc->dirty & FD_DIRTY_TEX) && - !(ctx->dirty_shader[stage] & FD_DIRTY_TEX)) { - struct fd_texture_stateobj *tex = &ctx->tex[stage]; - for (unsigned i = 0; i < tex->num_textures; i++) { - if (tex->textures[i] && (tex->textures[i]->texture == prsc)) { - fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_TEX); - break; - } - } - } - - /* Images */ - if ((rsc->dirty & FD_DIRTY_IMAGE) && - !(ctx->dirty_shader[stage] & FD_DIRTY_IMAGE)) { - struct fd_shaderimg_stateobj *si = &ctx->shaderimg[stage]; - const unsigned num_images = util_last_bit(si->enabled_mask); - for (unsigned i = 0; i < num_images; i++) { - if (si->si[i].resource == prsc) { - fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_IMAGE); - break; - } - } - } - - /* SSBOs */ - if ((rsc->dirty & FD_DIRTY_SSBO) && - !(ctx->dirty_shader[stage] & FD_DIRTY_SSBO)) { - struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[stage]; - const unsigned num_ssbos = util_last_bit(sb->enabled_mask); - for (unsigned i = 0; i < num_ssbos; i++) { - if (sb->sb[i].buffer == prsc) { - fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_SSBO); - break; - } - } - } - } + struct pipe_resource *prsc = &rsc->b.b; + + if (ctx->rebind_resource) + ctx->rebind_resource(ctx, rsc); + + /* VBOs */ + if (rsc->dirty & FD_DIRTY_VTXBUF) { + struct fd_vertexbuf_stateobj *vb = &ctx->vtx.vertexbuf; + for (unsigned i = 0; i < vb->count && !(ctx->dirty & FD_DIRTY_VTXBUF); + i++) { + if (vb->vb[i].buffer.resource == prsc) + fd_context_dirty(ctx, FD_DIRTY_VTXBUF); + } + } + + const enum fd_dirty_3d_state per_stage_dirty = + FD_DIRTY_CONST | FD_DIRTY_TEX | FD_DIRTY_IMAGE | FD_DIRTY_SSBO; + + if (!(rsc->dirty & per_stage_dirty)) + return; + + /* per-shader-stage resources: */ + for (unsigned stage = 0; stage < PIPE_SHADER_TYPES; stage++) { + /* Constbufs.. 
note that constbuf[0] is normal uniforms emitted in + * cmdstream rather than by pointer.. + */ + if ((rsc->dirty & FD_DIRTY_CONST) && + !(ctx->dirty_shader[stage] & FD_DIRTY_CONST)) { + struct fd_constbuf_stateobj *cb = &ctx->constbuf[stage]; + const unsigned num_ubos = util_last_bit(cb->enabled_mask); + for (unsigned i = 1; i < num_ubos; i++) { + if (cb->cb[i].buffer == prsc) { + fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_CONST); + break; + } + } + } + + /* Textures */ + if ((rsc->dirty & FD_DIRTY_TEX) && + !(ctx->dirty_shader[stage] & FD_DIRTY_TEX)) { + struct fd_texture_stateobj *tex = &ctx->tex[stage]; + for (unsigned i = 0; i < tex->num_textures; i++) { + if (tex->textures[i] && (tex->textures[i]->texture == prsc)) { + fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_TEX); + break; + } + } + } + + /* Images */ + if ((rsc->dirty & FD_DIRTY_IMAGE) && + !(ctx->dirty_shader[stage] & FD_DIRTY_IMAGE)) { + struct fd_shaderimg_stateobj *si = &ctx->shaderimg[stage]; + const unsigned num_images = util_last_bit(si->enabled_mask); + for (unsigned i = 0; i < num_images; i++) { + if (si->si[i].resource == prsc) { + fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_IMAGE); + break; + } + } + } + + /* SSBOs */ + if ((rsc->dirty & FD_DIRTY_SSBO) && + !(ctx->dirty_shader[stage] & FD_DIRTY_SSBO)) { + struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[stage]; + const unsigned num_ssbos = util_last_bit(sb->enabled_mask); + for (unsigned i = 0; i < num_ssbos; i++) { + if (sb->sb[i].buffer == prsc) { + fd_context_dirty_shader(ctx, stage, FD_DIRTY_SHADER_SSBO); + break; + } + } + } + } } static void -rebind_resource(struct fd_resource *rsc) - assert_dt +rebind_resource(struct fd_resource *rsc) assert_dt { - struct fd_screen *screen = fd_screen(rsc->b.b.screen); + struct fd_screen *screen = fd_screen(rsc->b.b.screen); - fd_screen_lock(screen); - fd_resource_lock(rsc); + fd_screen_lock(screen); + fd_resource_lock(rsc); - if (rsc->dirty) - list_for_each_entry (struct fd_context, ctx, &screen->context_list, node) - rebind_resource_in_ctx(ctx, rsc); + if (rsc->dirty) + list_for_each_entry (struct fd_context, ctx, &screen->context_list, node) + rebind_resource_in_ctx(ctx, rsc); - fd_resource_unlock(rsc); - fd_screen_unlock(screen); + fd_resource_unlock(rsc); + fd_screen_unlock(screen); } static inline void fd_resource_set_bo(struct fd_resource *rsc, struct fd_bo *bo) { - struct fd_screen *screen = fd_screen(rsc->b.b.screen); + struct fd_screen *screen = fd_screen(rsc->b.b.screen); - rsc->bo = bo; - rsc->seqno = p_atomic_inc_return(&screen->rsc_seqno); + rsc->bo = bo; + rsc->seqno = p_atomic_inc_return(&screen->rsc_seqno); } int -__fd_resource_wait(struct fd_context *ctx, struct fd_resource *rsc, - unsigned op, const char *func) +__fd_resource_wait(struct fd_context *ctx, struct fd_resource *rsc, unsigned op, + const char *func) { - if (op & DRM_FREEDRENO_PREP_NOSYNC) - return fd_bo_cpu_prep(rsc->bo, ctx->pipe, op); + if (op & DRM_FREEDRENO_PREP_NOSYNC) + return fd_bo_cpu_prep(rsc->bo, ctx->pipe, op); - int ret; + int ret; - perf_time_ctx(ctx, 10000, "%s: a busy \"%"PRSC_FMT"\" BO stalled", - func, PRSC_ARGS(&rsc->b.b)) { - ret = fd_bo_cpu_prep(rsc->bo, ctx->pipe, op); - } + perf_time_ctx(ctx, 10000, "%s: a busy \"%" PRSC_FMT "\" BO stalled", func, + PRSC_ARGS(&rsc->b.b)) + { + ret = fd_bo_cpu_prep(rsc->bo, ctx->pipe, op); + } - return ret; + return ret; } static void realloc_bo(struct fd_resource *rsc, uint32_t size) { - struct pipe_resource *prsc = &rsc->b.b; - struct fd_screen *screen = 
fd_screen(rsc->b.b.screen); - uint32_t flags = DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM | - COND(prsc->bind & PIPE_BIND_SCANOUT, DRM_FREEDRENO_GEM_SCANOUT); - /* TODO other flags? */ - - /* if we start using things other than write-combine, - * be sure to check for PIPE_RESOURCE_FLAG_MAP_COHERENT - */ - - if (rsc->bo) - fd_bo_del(rsc->bo); - - struct fd_bo *bo = fd_bo_new(screen->dev, size, flags, "%ux%ux%u@%u:%x", - prsc->width0, prsc->height0, prsc->depth0, rsc->layout.cpp, prsc->bind); - fd_resource_set_bo(rsc, bo); - - /* Zero out the UBWC area on allocation. This fixes intermittent failures - * with UBWC, which I suspect are due to the HW having a hard time - * interpreting arbitrary values populating the flags buffer when the BO - * was recycled through the bo cache (instead of fresh allocations from - * the kernel, which are zeroed). sleep(1) in this spot didn't work - * around the issue, but any memset value seems to. - */ - if (rsc->layout.ubwc) { - rsc->needs_ubwc_clear = true; - } - - util_range_set_empty(&rsc->valid_buffer_range); - fd_bc_invalidate_resource(rsc, true); + struct pipe_resource *prsc = &rsc->b.b; + struct fd_screen *screen = fd_screen(rsc->b.b.screen); + uint32_t flags = + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM | + COND(prsc->bind & PIPE_BIND_SCANOUT, DRM_FREEDRENO_GEM_SCANOUT); + /* TODO other flags? */ + + /* if we start using things other than write-combine, + * be sure to check for PIPE_RESOURCE_FLAG_MAP_COHERENT + */ + + if (rsc->bo) + fd_bo_del(rsc->bo); + + struct fd_bo *bo = + fd_bo_new(screen->dev, size, flags, "%ux%ux%u@%u:%x", prsc->width0, + prsc->height0, prsc->depth0, rsc->layout.cpp, prsc->bind); + fd_resource_set_bo(rsc, bo); + + /* Zero out the UBWC area on allocation. This fixes intermittent failures + * with UBWC, which I suspect are due to the HW having a hard time + * interpreting arbitrary values populating the flags buffer when the BO + * was recycled through the bo cache (instead of fresh allocations from + * the kernel, which are zeroed). sleep(1) in this spot didn't work + * around the issue, but any memset value seems to. + */ + if (rsc->layout.ubwc) { + rsc->needs_ubwc_clear = true; + } + + util_range_set_empty(&rsc->valid_buffer_range); + fd_bc_invalidate_resource(rsc, true); } static void -do_blit(struct fd_context *ctx, const struct pipe_blit_info *blit, bool fallback) - assert_dt +do_blit(struct fd_context *ctx, const struct pipe_blit_info *blit, + bool fallback) assert_dt { - struct pipe_context *pctx = &ctx->base; - - /* TODO size threshold too?? */ - if (fallback || !fd_blit(pctx, blit)) { - /* do blit on cpu: */ - util_resource_copy_region(pctx, - blit->dst.resource, blit->dst.level, blit->dst.box.x, - blit->dst.box.y, blit->dst.box.z, - blit->src.resource, blit->src.level, &blit->src.box); - } + struct pipe_context *pctx = &ctx->base; + + /* TODO size threshold too?? 
*/ + if (fallback || !fd_blit(pctx, blit)) { + /* do blit on cpu: */ + util_resource_copy_region(pctx, blit->dst.resource, blit->dst.level, + blit->dst.box.x, blit->dst.box.y, + blit->dst.box.z, blit->src.resource, + blit->src.level, &blit->src.box); + } } /** @@ -248,50 +250,50 @@ do_blit(struct fd_context *ctx, const struct pipe_blit_info *blit, bool fallback */ void fd_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *pdst, - struct pipe_resource *psrc) + struct pipe_resource *psrc) { - struct fd_context *ctx = fd_context(pctx); - struct fd_resource *dst = fd_resource(pdst); - struct fd_resource *src = fd_resource(psrc); - - DBG("pdst=%p, psrc=%p", pdst, psrc); - - /* This should only be called with buffers.. which side-steps some tricker - * cases, like a rsc that is in a batch-cache key... - */ - assert(pdst->target == PIPE_BUFFER); - assert(psrc->target == PIPE_BUFFER); - assert(dst->track->bc_batch_mask == 0); - assert(src->track->bc_batch_mask == 0); - assert(src->track->batch_mask == 0); - assert(src->track->write_batch == NULL); - assert(memcmp(&dst->layout, &src->layout, sizeof(dst->layout)) == 0); - - /* get rid of any references that batch-cache might have to us (which - * should empty/destroy rsc->batches hashset) - * - * Note that we aren't actually destroying dst, but we are replacing - * it's storage so we want to go thru the same motions of decoupling - * it's batch connections. - */ - fd_bc_invalidate_resource(dst, true); - rebind_resource(dst); - - fd_screen_lock(ctx->screen); - - fd_bo_del(dst->bo); - dst->bo = fd_bo_ref(src->bo); - - fd_resource_tracking_reference(&dst->track, src->track); - src->is_replacement = true; - - dst->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); - - fd_screen_unlock(ctx->screen); + struct fd_context *ctx = fd_context(pctx); + struct fd_resource *dst = fd_resource(pdst); + struct fd_resource *src = fd_resource(psrc); + + DBG("pdst=%p, psrc=%p", pdst, psrc); + + /* This should only be called with buffers.. which side-steps some tricker + * cases, like a rsc that is in a batch-cache key... + */ + assert(pdst->target == PIPE_BUFFER); + assert(psrc->target == PIPE_BUFFER); + assert(dst->track->bc_batch_mask == 0); + assert(src->track->bc_batch_mask == 0); + assert(src->track->batch_mask == 0); + assert(src->track->write_batch == NULL); + assert(memcmp(&dst->layout, &src->layout, sizeof(dst->layout)) == 0); + + /* get rid of any references that batch-cache might have to us (which + * should empty/destroy rsc->batches hashset) + * + * Note that we aren't actually destroying dst, but we are replacing + * it's storage so we want to go thru the same motions of decoupling + * it's batch connections. 
+ */ + fd_bc_invalidate_resource(dst, true); + rebind_resource(dst); + + fd_screen_lock(ctx->screen); + + fd_bo_del(dst->bo); + dst->bo = fd_bo_ref(src->bo); + + fd_resource_tracking_reference(&dst->track, src->track); + src->is_replacement = true; + + dst->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); + + fd_screen_unlock(ctx->screen); } -static void -flush_resource(struct fd_context *ctx, struct fd_resource *rsc, unsigned usage); +static void flush_resource(struct fd_context *ctx, struct fd_resource *rsc, + unsigned usage); /** * @rsc: the resource to shadow @@ -301,177 +303,179 @@ flush_resource(struct fd_context *ctx, struct fd_resource *rsc, unsigned usage); */ static bool fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, - unsigned level, const struct pipe_box *box, uint64_t modifier) - assert_dt + unsigned level, const struct pipe_box *box, + uint64_t modifier) assert_dt { - struct pipe_context *pctx = &ctx->base; - struct pipe_resource *prsc = &rsc->b.b; - bool fallback = false; - - if (prsc->next) - return false; - - /* If you have a sequence where there is a single rsc associated - * with the current render target, and then you end up shadowing - * that same rsc on the 3d pipe (u_blitter), because of how we - * swap the new shadow and rsc before the back-blit, you could end - * up confusing things into thinking that u_blitter's framebuffer - * state is the same as the current framebuffer state, which has - * the result of blitting to rsc rather than shadow. - * - * Normally we wouldn't want to unconditionally trigger a flush, - * since that defeats the purpose of shadowing, but this is a - * case where we'd have to flush anyways. - */ - if (rsc->track->write_batch == ctx->batch) - flush_resource(ctx, rsc, 0); - - /* TODO: somehow munge dimensions and format to copy unsupported - * render target format to something that is supported? - */ - if (!pctx->screen->is_format_supported(pctx->screen, - prsc->format, prsc->target, prsc->nr_samples, - prsc->nr_storage_samples, - PIPE_BIND_RENDER_TARGET)) - fallback = true; - - /* do shadowing back-blits on the cpu for buffers: */ - if (prsc->target == PIPE_BUFFER) - fallback = true; - - bool discard_whole_level = box && util_texrange_covers_whole_level(prsc, level, - box->x, box->y, box->z, box->width, box->height, box->depth); - - /* TODO need to be more clever about current level */ - if ((prsc->target >= PIPE_TEXTURE_2D) && box && !discard_whole_level) - return false; - - struct pipe_resource *pshadow = - pctx->screen->resource_create_with_modifiers(pctx->screen, - prsc, &modifier, 1); - - if (!pshadow) - return false; - - assert(!ctx->in_shadow); - ctx->in_shadow = true; - - /* get rid of any references that batch-cache might have to us (which - * should empty/destroy rsc->batches hashset) - */ - fd_bc_invalidate_resource(rsc, false); - rebind_resource(rsc); - - fd_screen_lock(ctx->screen); - - /* Swap the backing bo's, so shadow becomes the old buffer, - * blit from shadow to new buffer. From here on out, we - * cannot fail. - * - * Note that we need to do it in this order, otherwise if - * we go down cpu blit path, the recursive transfer_map() - * sees the wrong status.. - */ - struct fd_resource *shadow = fd_resource(pshadow); - - DBG("shadow: %p (%d, %p) -> %p (%d, %p)", rsc, rsc->b.b.reference.count, rsc->track, - shadow, shadow->b.b.reference.count, shadow->track); - - /* TODO valid_buffer_range?? 
*/ - swap(rsc->bo, shadow->bo); - swap(rsc->layout, shadow->layout); - rsc->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); - - /* at this point, the newly created shadow buffer is not referenced - * by any batches, but the existing rsc (probably) is. We need to - * transfer those references over: - */ - debug_assert(shadow->track->batch_mask == 0); - struct fd_batch *batch; - foreach_batch (batch, &ctx->screen->batch_cache, rsc->track->batch_mask) { - struct set_entry *entry = _mesa_set_search(batch->resources, rsc); - _mesa_set_remove(batch->resources, entry); - _mesa_set_add(batch->resources, shadow); - } - swap(rsc->track, shadow->track); - - fd_screen_unlock(ctx->screen); - - struct pipe_blit_info blit = {}; - blit.dst.resource = prsc; - blit.dst.format = prsc->format; - blit.src.resource = pshadow; - blit.src.format = pshadow->format; - blit.mask = util_format_get_mask(prsc->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - -#define set_box(field, val) do { \ - blit.dst.field = (val); \ - blit.src.field = (val); \ - } while (0) - - /* Disable occlusion queries during shadow blits. */ - bool saved_active_queries = ctx->active_queries; - pctx->set_active_query_state(pctx, false); - - /* blit the other levels in their entirety: */ - for (unsigned l = 0; l <= prsc->last_level; l++) { - if (box && l == level) - continue; - - /* just blit whole level: */ - set_box(level, l); - set_box(box.width, u_minify(prsc->width0, l)); - set_box(box.height, u_minify(prsc->height0, l)); - set_box(box.depth, u_minify(prsc->depth0, l)); - - for (int i = 0; i < prsc->array_size; i++) { - set_box(box.z, i); - do_blit(ctx, &blit, fallback); - } - } - - /* deal w/ current level specially, since we might need to split - * it up into a couple blits: - */ - if (box && !discard_whole_level) { - set_box(level, level); - - switch (prsc->target) { - case PIPE_BUFFER: - case PIPE_TEXTURE_1D: - set_box(box.y, 0); - set_box(box.z, 0); - set_box(box.height, 1); - set_box(box.depth, 1); - - if (box->x > 0) { - set_box(box.x, 0); - set_box(box.width, box->x); - - do_blit(ctx, &blit, fallback); - } - if ((box->x + box->width) < u_minify(prsc->width0, level)) { - set_box(box.x, box->x + box->width); - set_box(box.width, u_minify(prsc->width0, level) - (box->x + box->width)); - - do_blit(ctx, &blit, fallback); - } - break; - case PIPE_TEXTURE_2D: - /* TODO */ - default: - unreachable("TODO"); - } - } - - pctx->set_active_query_state(pctx, saved_active_queries); - - ctx->in_shadow = false; - - pipe_resource_reference(&pshadow, NULL); - - return true; + struct pipe_context *pctx = &ctx->base; + struct pipe_resource *prsc = &rsc->b.b; + bool fallback = false; + + if (prsc->next) + return false; + + /* If you have a sequence where there is a single rsc associated + * with the current render target, and then you end up shadowing + * that same rsc on the 3d pipe (u_blitter), because of how we + * swap the new shadow and rsc before the back-blit, you could end + * up confusing things into thinking that u_blitter's framebuffer + * state is the same as the current framebuffer state, which has + * the result of blitting to rsc rather than shadow. + * + * Normally we wouldn't want to unconditionally trigger a flush, + * since that defeats the purpose of shadowing, but this is a + * case where we'd have to flush anyways. + */ + if (rsc->track->write_batch == ctx->batch) + flush_resource(ctx, rsc, 0); + + /* TODO: somehow munge dimensions and format to copy unsupported + * render target format to something that is supported? 
+ */ + if (!pctx->screen->is_format_supported( + pctx->screen, prsc->format, prsc->target, prsc->nr_samples, + prsc->nr_storage_samples, PIPE_BIND_RENDER_TARGET)) + fallback = true; + + /* do shadowing back-blits on the cpu for buffers: */ + if (prsc->target == PIPE_BUFFER) + fallback = true; + + bool discard_whole_level = box && util_texrange_covers_whole_level( + prsc, level, box->x, box->y, box->z, + box->width, box->height, box->depth); + + /* TODO need to be more clever about current level */ + if ((prsc->target >= PIPE_TEXTURE_2D) && box && !discard_whole_level) + return false; + + struct pipe_resource *pshadow = pctx->screen->resource_create_with_modifiers( + pctx->screen, prsc, &modifier, 1); + + if (!pshadow) + return false; + + assert(!ctx->in_shadow); + ctx->in_shadow = true; + + /* get rid of any references that batch-cache might have to us (which + * should empty/destroy rsc->batches hashset) + */ + fd_bc_invalidate_resource(rsc, false); + rebind_resource(rsc); + + fd_screen_lock(ctx->screen); + + /* Swap the backing bo's, so shadow becomes the old buffer, + * blit from shadow to new buffer. From here on out, we + * cannot fail. + * + * Note that we need to do it in this order, otherwise if + * we go down cpu blit path, the recursive transfer_map() + * sees the wrong status.. + */ + struct fd_resource *shadow = fd_resource(pshadow); + + DBG("shadow: %p (%d, %p) -> %p (%d, %p)", rsc, rsc->b.b.reference.count, + rsc->track, shadow, shadow->b.b.reference.count, shadow->track); + + /* TODO valid_buffer_range?? */ + swap(rsc->bo, shadow->bo); + swap(rsc->layout, shadow->layout); + rsc->seqno = p_atomic_inc_return(&ctx->screen->rsc_seqno); + + /* at this point, the newly created shadow buffer is not referenced + * by any batches, but the existing rsc (probably) is. We need to + * transfer those references over: + */ + debug_assert(shadow->track->batch_mask == 0); + struct fd_batch *batch; + foreach_batch(batch, &ctx->screen->batch_cache, rsc->track->batch_mask) + { + struct set_entry *entry = _mesa_set_search(batch->resources, rsc); + _mesa_set_remove(batch->resources, entry); + _mesa_set_add(batch->resources, shadow); + } + swap(rsc->track, shadow->track); + + fd_screen_unlock(ctx->screen); + + struct pipe_blit_info blit = {}; + blit.dst.resource = prsc; + blit.dst.format = prsc->format; + blit.src.resource = pshadow; + blit.src.format = pshadow->format; + blit.mask = util_format_get_mask(prsc->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + +#define set_box(field, val) \ + do { \ + blit.dst.field = (val); \ + blit.src.field = (val); \ + } while (0) + + /* Disable occlusion queries during shadow blits. 
*/ + bool saved_active_queries = ctx->active_queries; + pctx->set_active_query_state(pctx, false); + + /* blit the other levels in their entirety: */ + for (unsigned l = 0; l <= prsc->last_level; l++) { + if (box && l == level) + continue; + + /* just blit whole level: */ + set_box(level, l); + set_box(box.width, u_minify(prsc->width0, l)); + set_box(box.height, u_minify(prsc->height0, l)); + set_box(box.depth, u_minify(prsc->depth0, l)); + + for (int i = 0; i < prsc->array_size; i++) { + set_box(box.z, i); + do_blit(ctx, &blit, fallback); + } + } + + /* deal w/ current level specially, since we might need to split + * it up into a couple blits: + */ + if (box && !discard_whole_level) { + set_box(level, level); + + switch (prsc->target) { + case PIPE_BUFFER: + case PIPE_TEXTURE_1D: + set_box(box.y, 0); + set_box(box.z, 0); + set_box(box.height, 1); + set_box(box.depth, 1); + + if (box->x > 0) { + set_box(box.x, 0); + set_box(box.width, box->x); + + do_blit(ctx, &blit, fallback); + } + if ((box->x + box->width) < u_minify(prsc->width0, level)) { + set_box(box.x, box->x + box->width); + set_box(box.width, + u_minify(prsc->width0, level) - (box->x + box->width)); + + do_blit(ctx, &blit, fallback); + } + break; + case PIPE_TEXTURE_2D: + /* TODO */ + default: + unreachable("TODO"); + } + } + + pctx->set_active_query_state(pctx, saved_active_queries); + + ctx->in_shadow = false; + + pipe_resource_reference(&pshadow, NULL); + + return true; } /** @@ -483,13 +487,13 @@ fd_try_shadow_resource(struct fd_context *ctx, struct fd_resource *rsc, void fd_resource_uncompress(struct fd_context *ctx, struct fd_resource *rsc) { - tc_assert_driver_thread(ctx->tc); + tc_assert_driver_thread(ctx->tc); - bool success = - fd_try_shadow_resource(ctx, rsc, 0, NULL, FD_FORMAT_MOD_QCOM_TILED); + bool success = + fd_try_shadow_resource(ctx, rsc, 0, NULL, FD_FORMAT_MOD_QCOM_TILED); - /* shadow should not fail in any cases where we need to uncompress: */ - debug_assert(success); + /* shadow should not fail in any cases where we need to uncompress: */ + debug_assert(success); } /** @@ -498,230 +502,226 @@ fd_resource_uncompress(struct fd_context *ctx, struct fd_resource *rsc) void fd_resource_dump(struct fd_resource *rsc, const char *name) { - fd_bo_cpu_prep(rsc->bo, NULL, DRM_FREEDRENO_PREP_READ); - printf("%s: \n", name); - dump_hex(fd_bo_map(rsc->bo), fd_bo_size(rsc->bo)); + fd_bo_cpu_prep(rsc->bo, NULL, DRM_FREEDRENO_PREP_READ); + printf("%s: \n", name); + dump_hex(fd_bo_map(rsc->bo), fd_bo_size(rsc->bo)); } static struct fd_resource * fd_alloc_staging(struct fd_context *ctx, struct fd_resource *rsc, - unsigned level, const struct pipe_box *box) + unsigned level, const struct pipe_box *box) { - struct pipe_context *pctx = &ctx->base; - struct pipe_resource tmpl = rsc->b.b; - - tmpl.width0 = box->width; - tmpl.height0 = box->height; - /* for array textures, box->depth is the array_size, otherwise - * for 3d textures, it is the depth: - */ - if (tmpl.array_size > 1) { - if (tmpl.target == PIPE_TEXTURE_CUBE) - tmpl.target = PIPE_TEXTURE_2D_ARRAY; - tmpl.array_size = box->depth; - tmpl.depth0 = 1; - } else { - tmpl.array_size = 1; - tmpl.depth0 = box->depth; - } - tmpl.last_level = 0; - tmpl.bind |= PIPE_BIND_LINEAR; - tmpl.usage = PIPE_USAGE_STAGING; - - struct pipe_resource *pstaging = - pctx->screen->resource_create(pctx->screen, &tmpl); - if (!pstaging) - return NULL; - - return fd_resource(pstaging); + struct pipe_context *pctx = &ctx->base; + struct pipe_resource tmpl = rsc->b.b; + + tmpl.width0 = box->width; + 
tmpl.height0 = box->height; + /* for array textures, box->depth is the array_size, otherwise + * for 3d textures, it is the depth: + */ + if (tmpl.array_size > 1) { + if (tmpl.target == PIPE_TEXTURE_CUBE) + tmpl.target = PIPE_TEXTURE_2D_ARRAY; + tmpl.array_size = box->depth; + tmpl.depth0 = 1; + } else { + tmpl.array_size = 1; + tmpl.depth0 = box->depth; + } + tmpl.last_level = 0; + tmpl.bind |= PIPE_BIND_LINEAR; + tmpl.usage = PIPE_USAGE_STAGING; + + struct pipe_resource *pstaging = + pctx->screen->resource_create(pctx->screen, &tmpl); + if (!pstaging) + return NULL; + + return fd_resource(pstaging); } static void -fd_blit_from_staging(struct fd_context *ctx, struct fd_transfer *trans) - assert_dt +fd_blit_from_staging(struct fd_context *ctx, + struct fd_transfer *trans) assert_dt { - struct pipe_resource *dst = trans->b.b.resource; - struct pipe_blit_info blit = {}; - - blit.dst.resource = dst; - blit.dst.format = dst->format; - blit.dst.level = trans->b.b.level; - blit.dst.box = trans->b.b.box; - blit.src.resource = trans->staging_prsc; - blit.src.format = trans->staging_prsc->format; - blit.src.level = 0; - blit.src.box = trans->staging_box; - blit.mask = util_format_get_mask(trans->staging_prsc->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - do_blit(ctx, &blit, false); + struct pipe_resource *dst = trans->b.b.resource; + struct pipe_blit_info blit = {}; + + blit.dst.resource = dst; + blit.dst.format = dst->format; + blit.dst.level = trans->b.b.level; + blit.dst.box = trans->b.b.box; + blit.src.resource = trans->staging_prsc; + blit.src.format = trans->staging_prsc->format; + blit.src.level = 0; + blit.src.box = trans->staging_box; + blit.mask = util_format_get_mask(trans->staging_prsc->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + + do_blit(ctx, &blit, false); } static void -fd_blit_to_staging(struct fd_context *ctx, struct fd_transfer *trans) - assert_dt +fd_blit_to_staging(struct fd_context *ctx, struct fd_transfer *trans) assert_dt { - struct pipe_resource *src = trans->b.b.resource; - struct pipe_blit_info blit = {}; - - blit.src.resource = src; - blit.src.format = src->format; - blit.src.level = trans->b.b.level; - blit.src.box = trans->b.b.box; - blit.dst.resource = trans->staging_prsc; - blit.dst.format = trans->staging_prsc->format; - blit.dst.level = 0; - blit.dst.box = trans->staging_box; - blit.mask = util_format_get_mask(trans->staging_prsc->format); - blit.filter = PIPE_TEX_FILTER_NEAREST; - - do_blit(ctx, &blit, false); + struct pipe_resource *src = trans->b.b.resource; + struct pipe_blit_info blit = {}; + + blit.src.resource = src; + blit.src.format = src->format; + blit.src.level = trans->b.b.level; + blit.src.box = trans->b.b.box; + blit.dst.resource = trans->staging_prsc; + blit.dst.format = trans->staging_prsc->format; + blit.dst.level = 0; + blit.dst.box = trans->staging_box; + blit.mask = util_format_get_mask(trans->staging_prsc->format); + blit.filter = PIPE_TEX_FILTER_NEAREST; + + do_blit(ctx, &blit, false); } -static void fd_resource_transfer_flush_region(struct pipe_context *pctx, - struct pipe_transfer *ptrans, - const struct pipe_box *box) +static void +fd_resource_transfer_flush_region(struct pipe_context *pctx, + struct pipe_transfer *ptrans, + const struct pipe_box *box) { - struct fd_resource *rsc = fd_resource(ptrans->resource); + struct fd_resource *rsc = fd_resource(ptrans->resource); - if (ptrans->resource->target == PIPE_BUFFER) - util_range_add(&rsc->b.b, &rsc->valid_buffer_range, - ptrans->box.x + box->x, - ptrans->box.x + box->x + 
box->width); + if (ptrans->resource->target == PIPE_BUFFER) + util_range_add(&rsc->b.b, &rsc->valid_buffer_range, + ptrans->box.x + box->x, + ptrans->box.x + box->x + box->width); } static void -flush_resource(struct fd_context *ctx, struct fd_resource *rsc, unsigned usage) - assert_dt +flush_resource(struct fd_context *ctx, struct fd_resource *rsc, + unsigned usage) assert_dt { - struct fd_batch *write_batch = NULL; - - fd_screen_lock(ctx->screen); - fd_batch_reference_locked(&write_batch, rsc->track->write_batch); - fd_screen_unlock(ctx->screen); - - if (usage & PIPE_MAP_WRITE) { - struct fd_batch *batch, *batches[32] = {}; - uint32_t batch_mask; - - /* This is a bit awkward, probably a fd_batch_flush_locked() - * would make things simpler.. but we need to hold the lock - * to iterate the batches which reference this resource. So - * we must first grab references under a lock, then flush. - */ - fd_screen_lock(ctx->screen); - batch_mask = rsc->track->batch_mask; - foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) - fd_batch_reference_locked(&batches[batch->idx], batch); - fd_screen_unlock(ctx->screen); - - foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) - fd_batch_flush(batch); - - foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) { - fd_batch_reference(&batches[batch->idx], NULL); - } - assert(rsc->track->batch_mask == 0); - } else if (write_batch) { - fd_batch_flush(write_batch); - } - - fd_batch_reference(&write_batch, NULL); - - assert(!rsc->track->write_batch); + struct fd_batch *write_batch = NULL; + + fd_screen_lock(ctx->screen); + fd_batch_reference_locked(&write_batch, rsc->track->write_batch); + fd_screen_unlock(ctx->screen); + + if (usage & PIPE_MAP_WRITE) { + struct fd_batch *batch, *batches[32] = {}; + uint32_t batch_mask; + + /* This is a bit awkward, probably a fd_batch_flush_locked() + * would make things simpler.. but we need to hold the lock + * to iterate the batches which reference this resource. So + * we must first grab references under a lock, then flush. 
+ */ + fd_screen_lock(ctx->screen); + batch_mask = rsc->track->batch_mask; + foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) + fd_batch_reference_locked(&batches[batch->idx], batch); + fd_screen_unlock(ctx->screen); + + foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) + fd_batch_flush(batch); + + foreach_batch(batch, &ctx->screen->batch_cache, batch_mask) + { + fd_batch_reference(&batches[batch->idx], NULL); + } + assert(rsc->track->batch_mask == 0); + } else if (write_batch) { + fd_batch_flush(write_batch); + } + + fd_batch_reference(&write_batch, NULL); + + assert(!rsc->track->write_batch); } static void -fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc) - in_dt +fd_flush_resource(struct pipe_context *pctx, struct pipe_resource *prsc) in_dt { - flush_resource(fd_context(pctx), fd_resource(prsc), PIPE_MAP_READ); + flush_resource(fd_context(pctx), fd_resource(prsc), PIPE_MAP_READ); } static void fd_resource_transfer_unmap(struct pipe_context *pctx, - struct pipe_transfer *ptrans) - in_dt /* TODO for threaded-ctx we'll need to split out unsynchronized path */ + struct pipe_transfer *ptrans) + in_dt /* TODO for threaded-ctx we'll need to split out unsynchronized path */ { - struct fd_context *ctx = fd_context(pctx); - struct fd_resource *rsc = fd_resource(ptrans->resource); - struct fd_transfer *trans = fd_transfer(ptrans); + struct fd_context *ctx = fd_context(pctx); + struct fd_resource *rsc = fd_resource(ptrans->resource); + struct fd_transfer *trans = fd_transfer(ptrans); - if (trans->staging_prsc) { - if (ptrans->usage & PIPE_MAP_WRITE) - fd_blit_from_staging(ctx, trans); - pipe_resource_reference(&trans->staging_prsc, NULL); - } + if (trans->staging_prsc) { + if (ptrans->usage & PIPE_MAP_WRITE) + fd_blit_from_staging(ctx, trans); + pipe_resource_reference(&trans->staging_prsc, NULL); + } - if (!(ptrans->usage & PIPE_MAP_UNSYNCHRONIZED)) { - fd_bo_cpu_fini(rsc->bo); - } + if (!(ptrans->usage & PIPE_MAP_UNSYNCHRONIZED)) { + fd_bo_cpu_fini(rsc->bo); + } - util_range_add(&rsc->b.b, &rsc->valid_buffer_range, - ptrans->box.x, - ptrans->box.x + ptrans->box.width); + util_range_add(&rsc->b.b, &rsc->valid_buffer_range, ptrans->box.x, + ptrans->box.x + ptrans->box.width); - pipe_resource_reference(&ptrans->resource, NULL); + pipe_resource_reference(&ptrans->resource, NULL); - assert(trans->b.staging == NULL); /* for threaded context only */ + assert(trans->b.staging == NULL); /* for threaded context only */ - /* Don't use pool_transfers_unsync. We are always in the driver - * thread. Freeing an object into a different pool is allowed. - */ - slab_free(&ctx->transfer_pool, ptrans); + /* Don't use pool_transfers_unsync. We are always in the driver + * thread. Freeing an object into a different pool is allowed. 
+ */ + slab_free(&ctx->transfer_pool, ptrans); } static unsigned translate_usage(unsigned usage) { - uint32_t op = 0; + uint32_t op = 0; - if (usage & PIPE_MAP_READ) - op |= DRM_FREEDRENO_PREP_READ; + if (usage & PIPE_MAP_READ) + op |= DRM_FREEDRENO_PREP_READ; - if (usage & PIPE_MAP_WRITE) - op |= DRM_FREEDRENO_PREP_WRITE; + if (usage & PIPE_MAP_WRITE) + op |= DRM_FREEDRENO_PREP_WRITE; - return op; + return op; } static void -invalidate_resource(struct fd_resource *rsc, unsigned usage) - assert_dt +invalidate_resource(struct fd_resource *rsc, unsigned usage) assert_dt { - bool needs_flush = pending(rsc, !!(usage & PIPE_MAP_WRITE)); - unsigned op = translate_usage(usage); - - if (needs_flush || fd_resource_busy(rsc, op)) { - rebind_resource(rsc); - realloc_bo(rsc, fd_bo_size(rsc->bo)); - } else { - util_range_set_empty(&rsc->valid_buffer_range); - } + bool needs_flush = pending(rsc, !!(usage & PIPE_MAP_WRITE)); + unsigned op = translate_usage(usage); + + if (needs_flush || fd_resource_busy(rsc, op)) { + rebind_resource(rsc); + realloc_bo(rsc, fd_bo_size(rsc->bo)); + } else { + util_range_set_empty(&rsc->valid_buffer_range); + } } static void * resource_transfer_map_unsync(struct pipe_context *pctx, - struct pipe_resource *prsc, - unsigned level, unsigned usage, - const struct pipe_box *box, - struct fd_transfer *trans) + struct pipe_resource *prsc, unsigned level, + unsigned usage, const struct pipe_box *box, + struct fd_transfer *trans) { - struct fd_resource *rsc = fd_resource(prsc); - enum pipe_format format = prsc->format; - uint32_t offset; - char *buf; + struct fd_resource *rsc = fd_resource(prsc); + enum pipe_format format = prsc->format; + uint32_t offset; + char *buf; - buf = fd_bo_map(rsc->bo); - offset = - box->y / util_format_get_blockheight(format) * trans->b.b.stride + - box->x / util_format_get_blockwidth(format) * rsc->layout.cpp + - fd_resource_offset(rsc, level, box->z); + buf = fd_bo_map(rsc->bo); + offset = box->y / util_format_get_blockheight(format) * trans->b.b.stride + + box->x / util_format_get_blockwidth(format) * rsc->layout.cpp + + fd_resource_offset(rsc, level, box->z); - if (usage & PIPE_MAP_WRITE) - rsc->valid = true; + if (usage & PIPE_MAP_WRITE) + rsc->valid = true; - return buf + offset; + return buf + offset; } /** @@ -730,358 +730,350 @@ resource_transfer_map_unsync(struct pipe_context *pctx, * either driver or frontend thread. */ static void * -resource_transfer_map(struct pipe_context *pctx, - struct pipe_resource *prsc, - unsigned level, unsigned usage, - const struct pipe_box *box, - struct fd_transfer *trans) - in_dt +resource_transfer_map(struct pipe_context *pctx, struct pipe_resource *prsc, + unsigned level, unsigned usage, + const struct pipe_box *box, + struct fd_transfer *trans) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_resource *rsc = fd_resource(prsc); - char *buf; - int ret = 0; - - tc_assert_driver_thread(ctx->tc); - - /* we always need a staging texture for tiled buffers: - * - * TODO we might sometimes want to *also* shadow the resource to avoid - * splitting a batch.. for ex, mid-frame texture uploads to a tiled - * texture. 
- */ - if (rsc->layout.tile_mode) { - struct fd_resource *staging_rsc; - - assert(prsc->target != PIPE_BUFFER); - - staging_rsc = fd_alloc_staging(ctx, rsc, level, box); - if (staging_rsc) { - trans->staging_prsc = &staging_rsc->b.b; - trans->b.b.stride = fd_resource_pitch(staging_rsc, 0); - trans->b.b.layer_stride = fd_resource_layer_stride(staging_rsc, 0); - trans->staging_box = *box; - trans->staging_box.x = 0; - trans->staging_box.y = 0; - trans->staging_box.z = 0; - - if (usage & PIPE_MAP_READ) { - fd_blit_to_staging(ctx, trans); - - fd_resource_wait(ctx, staging_rsc, - DRM_FREEDRENO_PREP_READ); - } - - buf = fd_bo_map(staging_rsc->bo); - - ctx->stats.staging_uploads++; - - return buf; - } - } - - if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { - invalidate_resource(rsc, usage); - } else { - struct fd_batch *write_batch = NULL; - - /* hold a reference, so it doesn't disappear under us: */ - fd_screen_lock(ctx->screen); - fd_batch_reference_locked(&write_batch, rsc->track->write_batch); - fd_screen_unlock(ctx->screen); - - if ((usage & PIPE_MAP_WRITE) && write_batch && - write_batch->back_blit) { - /* if only thing pending is a back-blit, we can discard it: */ - fd_batch_reset(write_batch); - } - - unsigned op = translate_usage(usage); - bool needs_flush = pending(rsc, !!(usage & PIPE_MAP_WRITE)); - - /* If the GPU is writing to the resource, or if it is reading from the - * resource and we're trying to write to it, flush the renders. - */ - bool busy = needs_flush || fd_resource_busy(rsc, op); - - /* if we need to flush/stall, see if we can make a shadow buffer - * to avoid this: - * - * TODO we could go down this path !reorder && !busy_for_read - * ie. we only *don't* want to go down this path if the blit - * will trigger a flush! - */ - if (ctx->screen->reorder && busy && !(usage & PIPE_MAP_READ) && - (usage & PIPE_MAP_DISCARD_RANGE)) { - assert(!(usage & TC_TRANSFER_MAP_NO_INVALIDATE)); - - /* try shadowing only if it avoids a flush, otherwise staging would - * be better: - */ - if (needs_flush && fd_try_shadow_resource(ctx, rsc, level, - box, DRM_FORMAT_MOD_LINEAR)) { - needs_flush = busy = false; - ctx->stats.shadow_uploads++; - } else { - struct fd_resource *staging_rsc; - - if (needs_flush) { - flush_resource(ctx, rsc, usage); - needs_flush = false; - } - - /* in this case, we don't need to shadow the whole resource, - * since any draw that references the previous contents has - * already had rendering flushed for all tiles. So we can - * use a staging buffer to do the upload. - */ - staging_rsc = fd_alloc_staging(ctx, rsc, level, box); - if (staging_rsc) { - trans->staging_prsc = &staging_rsc->b.b; - trans->b.b.stride = fd_resource_pitch(staging_rsc, 0); - trans->b.b.layer_stride = - fd_resource_layer_stride(staging_rsc, 0); - trans->staging_box = *box; - trans->staging_box.x = 0; - trans->staging_box.y = 0; - trans->staging_box.z = 0; - buf = fd_bo_map(staging_rsc->bo); - - fd_batch_reference(&write_batch, NULL); - - ctx->stats.staging_uploads++; - - return buf; - } - } - } - - if (needs_flush) { - flush_resource(ctx, rsc, usage); - needs_flush = false; - } - - fd_batch_reference(&write_batch, NULL); - - /* The GPU keeps track of how the various bo's are being used, and - * will wait if necessary for the proper operation to have - * completed. 
- */ - if (busy) { - ret = fd_resource_wait(ctx, rsc, op); - if (ret) - return NULL; - } - } - - return resource_transfer_map_unsync(pctx, prsc, level, usage, box, trans); + struct fd_context *ctx = fd_context(pctx); + struct fd_resource *rsc = fd_resource(prsc); + char *buf; + int ret = 0; + + tc_assert_driver_thread(ctx->tc); + + /* we always need a staging texture for tiled buffers: + * + * TODO we might sometimes want to *also* shadow the resource to avoid + * splitting a batch.. for ex, mid-frame texture uploads to a tiled + * texture. + */ + if (rsc->layout.tile_mode) { + struct fd_resource *staging_rsc; + + assert(prsc->target != PIPE_BUFFER); + + staging_rsc = fd_alloc_staging(ctx, rsc, level, box); + if (staging_rsc) { + trans->staging_prsc = &staging_rsc->b.b; + trans->b.b.stride = fd_resource_pitch(staging_rsc, 0); + trans->b.b.layer_stride = fd_resource_layer_stride(staging_rsc, 0); + trans->staging_box = *box; + trans->staging_box.x = 0; + trans->staging_box.y = 0; + trans->staging_box.z = 0; + + if (usage & PIPE_MAP_READ) { + fd_blit_to_staging(ctx, trans); + + fd_resource_wait(ctx, staging_rsc, DRM_FREEDRENO_PREP_READ); + } + + buf = fd_bo_map(staging_rsc->bo); + + ctx->stats.staging_uploads++; + + return buf; + } + } + + if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) { + invalidate_resource(rsc, usage); + } else { + struct fd_batch *write_batch = NULL; + + /* hold a reference, so it doesn't disappear under us: */ + fd_screen_lock(ctx->screen); + fd_batch_reference_locked(&write_batch, rsc->track->write_batch); + fd_screen_unlock(ctx->screen); + + if ((usage & PIPE_MAP_WRITE) && write_batch && write_batch->back_blit) { + /* if only thing pending is a back-blit, we can discard it: */ + fd_batch_reset(write_batch); + } + + unsigned op = translate_usage(usage); + bool needs_flush = pending(rsc, !!(usage & PIPE_MAP_WRITE)); + + /* If the GPU is writing to the resource, or if it is reading from the + * resource and we're trying to write to it, flush the renders. + */ + bool busy = needs_flush || fd_resource_busy(rsc, op); + + /* if we need to flush/stall, see if we can make a shadow buffer + * to avoid this: + * + * TODO we could go down this path !reorder && !busy_for_read + * ie. we only *don't* want to go down this path if the blit + * will trigger a flush! + */ + if (ctx->screen->reorder && busy && !(usage & PIPE_MAP_READ) && + (usage & PIPE_MAP_DISCARD_RANGE)) { + assert(!(usage & TC_TRANSFER_MAP_NO_INVALIDATE)); + + /* try shadowing only if it avoids a flush, otherwise staging would + * be better: + */ + if (needs_flush && fd_try_shadow_resource(ctx, rsc, level, box, + DRM_FORMAT_MOD_LINEAR)) { + needs_flush = busy = false; + ctx->stats.shadow_uploads++; + } else { + struct fd_resource *staging_rsc; + + if (needs_flush) { + flush_resource(ctx, rsc, usage); + needs_flush = false; + } + + /* in this case, we don't need to shadow the whole resource, + * since any draw that references the previous contents has + * already had rendering flushed for all tiles. So we can + * use a staging buffer to do the upload. 
+ */ + staging_rsc = fd_alloc_staging(ctx, rsc, level, box); + if (staging_rsc) { + trans->staging_prsc = &staging_rsc->b.b; + trans->b.b.stride = fd_resource_pitch(staging_rsc, 0); + trans->b.b.layer_stride = + fd_resource_layer_stride(staging_rsc, 0); + trans->staging_box = *box; + trans->staging_box.x = 0; + trans->staging_box.y = 0; + trans->staging_box.z = 0; + buf = fd_bo_map(staging_rsc->bo); + + fd_batch_reference(&write_batch, NULL); + + ctx->stats.staging_uploads++; + + return buf; + } + } + } + + if (needs_flush) { + flush_resource(ctx, rsc, usage); + needs_flush = false; + } + + fd_batch_reference(&write_batch, NULL); + + /* The GPU keeps track of how the various bo's are being used, and + * will wait if necessary for the proper operation to have + * completed. + */ + if (busy) { + ret = fd_resource_wait(ctx, rsc, op); + if (ret) + return NULL; + } + } + + return resource_transfer_map_unsync(pctx, prsc, level, usage, box, trans); } static unsigned improve_transfer_map_usage(struct fd_context *ctx, struct fd_resource *rsc, - unsigned usage, const struct pipe_box *box) - /* Not *strictly* true, but the access to things that must only be in driver- - * thread are protected by !(usage & TC_TRANSFER_MAP_THREADED_UNSYNC): - */ - in_dt + unsigned usage, const struct pipe_box *box) + /* Not *strictly* true, but the access to things that must only be in driver- + * thread are protected by !(usage & TC_TRANSFER_MAP_THREADED_UNSYNC): + */ + in_dt { - if (usage & TC_TRANSFER_MAP_NO_INVALIDATE) { - usage &= ~PIPE_MAP_DISCARD_WHOLE_RESOURCE; - usage &= ~PIPE_MAP_DISCARD_RANGE; - } - - if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) - usage |= PIPE_MAP_UNSYNCHRONIZED; - - if (!(usage & (TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED | - PIPE_MAP_UNSYNCHRONIZED))) { - if (ctx->in_shadow && !(usage & PIPE_MAP_READ)) { - usage |= PIPE_MAP_UNSYNCHRONIZED; - } else if ((usage & PIPE_MAP_WRITE) && - (rsc->b.b.target == PIPE_BUFFER) && - !util_ranges_intersect(&rsc->valid_buffer_range, - box->x, box->x + box->width)) { - /* We are trying to write to a previously uninitialized range. No need - * to synchronize. - */ - usage |= PIPE_MAP_UNSYNCHRONIZED; - } - } - - return usage; + if (usage & TC_TRANSFER_MAP_NO_INVALIDATE) { + usage &= ~PIPE_MAP_DISCARD_WHOLE_RESOURCE; + usage &= ~PIPE_MAP_DISCARD_RANGE; + } + + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) + usage |= PIPE_MAP_UNSYNCHRONIZED; + + if (!(usage & + (TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED | PIPE_MAP_UNSYNCHRONIZED))) { + if (ctx->in_shadow && !(usage & PIPE_MAP_READ)) { + usage |= PIPE_MAP_UNSYNCHRONIZED; + } else if ((usage & PIPE_MAP_WRITE) && (rsc->b.b.target == PIPE_BUFFER) && + !util_ranges_intersect(&rsc->valid_buffer_range, box->x, + box->x + box->width)) { + /* We are trying to write to a previously uninitialized range. No need + * to synchronize. 
+ */ + usage |= PIPE_MAP_UNSYNCHRONIZED; + } + } + + return usage; } static void * -fd_resource_transfer_map(struct pipe_context *pctx, - struct pipe_resource *prsc, - unsigned level, unsigned usage, - const struct pipe_box *box, - struct pipe_transfer **pptrans) +fd_resource_transfer_map(struct pipe_context *pctx, struct pipe_resource *prsc, + unsigned level, unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **pptrans) { - struct fd_context *ctx = fd_context(pctx); - struct fd_resource *rsc = fd_resource(prsc); - struct fd_transfer *trans; - struct pipe_transfer *ptrans; - - DBG("prsc=%p, level=%u, usage=%x, box=%dx%d+%d,%d", prsc, level, usage, - box->width, box->height, box->x, box->y); - - if ((usage & PIPE_MAP_DIRECTLY) && rsc->layout.tile_mode) { - DBG("CANNOT MAP DIRECTLY!\n"); - return NULL; - } - - if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) { - ptrans = slab_alloc(&ctx->transfer_pool_unsync); - } else { - ptrans = slab_alloc(&ctx->transfer_pool); - } - - if (!ptrans) - return NULL; - - /* slab_alloc_st() doesn't zero: */ - trans = fd_transfer(ptrans); - memset(trans, 0, sizeof(*trans)); - - usage = improve_transfer_map_usage(ctx, rsc, usage, box); - - pipe_resource_reference(&ptrans->resource, prsc); - ptrans->level = level; - ptrans->usage = usage; - ptrans->box = *box; - ptrans->stride = fd_resource_pitch(rsc, level); - ptrans->layer_stride = fd_resource_layer_stride(rsc, level); - - void *ret; - if (usage & PIPE_MAP_UNSYNCHRONIZED) { - ret = resource_transfer_map_unsync(pctx, prsc, level, usage, box, trans); - } else { - ret = resource_transfer_map(pctx, prsc, level, usage, box, trans); - } - - if (ret) { - *pptrans = ptrans; - } else { - fd_resource_transfer_unmap(pctx, ptrans); - } - - return ret; + struct fd_context *ctx = fd_context(pctx); + struct fd_resource *rsc = fd_resource(prsc); + struct fd_transfer *trans; + struct pipe_transfer *ptrans; + + DBG("prsc=%p, level=%u, usage=%x, box=%dx%d+%d,%d", prsc, level, usage, + box->width, box->height, box->x, box->y); + + if ((usage & PIPE_MAP_DIRECTLY) && rsc->layout.tile_mode) { + DBG("CANNOT MAP DIRECTLY!\n"); + return NULL; + } + + if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC) { + ptrans = slab_alloc(&ctx->transfer_pool_unsync); + } else { + ptrans = slab_alloc(&ctx->transfer_pool); + } + + if (!ptrans) + return NULL; + + /* slab_alloc_st() doesn't zero: */ + trans = fd_transfer(ptrans); + memset(trans, 0, sizeof(*trans)); + + usage = improve_transfer_map_usage(ctx, rsc, usage, box); + + pipe_resource_reference(&ptrans->resource, prsc); + ptrans->level = level; + ptrans->usage = usage; + ptrans->box = *box; + ptrans->stride = fd_resource_pitch(rsc, level); + ptrans->layer_stride = fd_resource_layer_stride(rsc, level); + + void *ret; + if (usage & PIPE_MAP_UNSYNCHRONIZED) { + ret = resource_transfer_map_unsync(pctx, prsc, level, usage, box, trans); + } else { + ret = resource_transfer_map(pctx, prsc, level, usage, box, trans); + } + + if (ret) { + *pptrans = ptrans; + } else { + fd_resource_transfer_unmap(pctx, ptrans); + } + + return ret; } static void -fd_resource_destroy(struct pipe_screen *pscreen, - struct pipe_resource *prsc) +fd_resource_destroy(struct pipe_screen *pscreen, struct pipe_resource *prsc) { - struct fd_resource *rsc = fd_resource(prsc); + struct fd_resource *rsc = fd_resource(prsc); - if (!rsc->is_replacement) - fd_bc_invalidate_resource(rsc, true); - if (rsc->bo) - fd_bo_del(rsc->bo); - if (rsc->lrz) - fd_bo_del(rsc->lrz); - if (rsc->scanout) - renderonly_scanout_destroy(rsc->scanout, 
fd_screen(pscreen)->ro); + if (!rsc->is_replacement) + fd_bc_invalidate_resource(rsc, true); + if (rsc->bo) + fd_bo_del(rsc->bo); + if (rsc->lrz) + fd_bo_del(rsc->lrz); + if (rsc->scanout) + renderonly_scanout_destroy(rsc->scanout, fd_screen(pscreen)->ro); - threaded_resource_deinit(prsc); + threaded_resource_deinit(prsc); - util_range_destroy(&rsc->valid_buffer_range); - simple_mtx_destroy(&rsc->lock); - fd_resource_tracking_reference(&rsc->track, NULL); + util_range_destroy(&rsc->valid_buffer_range); + simple_mtx_destroy(&rsc->lock); + fd_resource_tracking_reference(&rsc->track, NULL); - FREE(rsc); + FREE(rsc); } static uint64_t fd_resource_modifier(struct fd_resource *rsc) { - if (!rsc->layout.tile_mode) - return DRM_FORMAT_MOD_LINEAR; + if (!rsc->layout.tile_mode) + return DRM_FORMAT_MOD_LINEAR; - if (rsc->layout.ubwc_layer_size) - return DRM_FORMAT_MOD_QCOM_COMPRESSED; + if (rsc->layout.ubwc_layer_size) + return DRM_FORMAT_MOD_QCOM_COMPRESSED; - /* TODO invent a modifier for tiled but not UBWC buffers: */ - return DRM_FORMAT_MOD_INVALID; + /* TODO invent a modifier for tiled but not UBWC buffers: */ + return DRM_FORMAT_MOD_INVALID; } static bool -fd_resource_get_handle(struct pipe_screen *pscreen, - struct pipe_context *pctx, - struct pipe_resource *prsc, - struct winsys_handle *handle, - unsigned usage) +fd_resource_get_handle(struct pipe_screen *pscreen, struct pipe_context *pctx, + struct pipe_resource *prsc, struct winsys_handle *handle, + unsigned usage) { - struct fd_resource *rsc = fd_resource(prsc); + struct fd_resource *rsc = fd_resource(prsc); - rsc->b.is_shared = true; + rsc->b.is_shared = true; - handle->modifier = fd_resource_modifier(rsc); + handle->modifier = fd_resource_modifier(rsc); - DBG("%"PRSC_FMT", modifier=%"PRIx64, PRSC_ARGS(prsc), handle->modifier); + DBG("%" PRSC_FMT ", modifier=%" PRIx64, PRSC_ARGS(prsc), handle->modifier); - return fd_screen_bo_get_handle(pscreen, rsc->bo, rsc->scanout, - fd_resource_pitch(rsc, 0), handle); + return fd_screen_bo_get_handle(pscreen, rsc->bo, rsc->scanout, + fd_resource_pitch(rsc, 0), handle); } /* special case to resize query buf after allocated.. 
*/ void fd_resource_resize(struct pipe_resource *prsc, uint32_t sz) { - struct fd_resource *rsc = fd_resource(prsc); + struct fd_resource *rsc = fd_resource(prsc); - debug_assert(prsc->width0 == 0); - debug_assert(prsc->target == PIPE_BUFFER); - debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER); + debug_assert(prsc->width0 == 0); + debug_assert(prsc->target == PIPE_BUFFER); + debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER); - prsc->width0 = sz; - realloc_bo(rsc, fd_screen(prsc->screen)->setup_slices(rsc)); + prsc->width0 = sz; + realloc_bo(rsc, fd_screen(prsc->screen)->setup_slices(rsc)); } static void fd_resource_layout_init(struct pipe_resource *prsc) { - struct fd_resource *rsc = fd_resource(prsc); - struct fdl_layout *layout = &rsc->layout; + struct fd_resource *rsc = fd_resource(prsc); + struct fdl_layout *layout = &rsc->layout; - layout->format = prsc->format; + layout->format = prsc->format; - layout->width0 = prsc->width0; - layout->height0 = prsc->height0; - layout->depth0 = prsc->depth0; + layout->width0 = prsc->width0; + layout->height0 = prsc->height0; + layout->depth0 = prsc->depth0; - layout->cpp = util_format_get_blocksize(prsc->format); - layout->cpp *= fd_resource_nr_samples(prsc); - layout->cpp_shift = ffs(layout->cpp) - 1; + layout->cpp = util_format_get_blocksize(prsc->format); + layout->cpp *= fd_resource_nr_samples(prsc); + layout->cpp_shift = ffs(layout->cpp) - 1; } static struct fd_resource * -alloc_resource_struct(struct pipe_screen *pscreen, const struct pipe_resource *tmpl) +alloc_resource_struct(struct pipe_screen *pscreen, + const struct pipe_resource *tmpl) { - struct fd_resource *rsc = CALLOC_STRUCT(fd_resource); + struct fd_resource *rsc = CALLOC_STRUCT(fd_resource); - if (!rsc) - return NULL; + if (!rsc) + return NULL; - struct pipe_resource *prsc = &rsc->b.b; - *prsc = *tmpl; + struct pipe_resource *prsc = &rsc->b.b; + *prsc = *tmpl; - pipe_reference_init(&prsc->reference, 1); - prsc->screen = pscreen; + pipe_reference_init(&prsc->reference, 1); + prsc->screen = pscreen; - util_range_init(&rsc->valid_buffer_range); - simple_mtx_init(&rsc->lock, mtx_plain); + util_range_init(&rsc->valid_buffer_range); + simple_mtx_init(&rsc->lock, mtx_plain); - rsc->track = CALLOC_STRUCT(fd_resource_tracking); - if (!rsc->track) { - free(rsc); - return NULL; - } + rsc->track = CALLOC_STRUCT(fd_resource_tracking); + if (!rsc->track) { + free(rsc); + return NULL; + } - pipe_reference_init(&rsc->track->reference, 1); + pipe_reference_init(&rsc->track->reference, 1); - return rsc; + return rsc; } /** @@ -1093,116 +1085,118 @@ alloc_resource_struct(struct pipe_screen *pscreen, const struct pipe_resource *t */ static struct pipe_resource * fd_resource_allocate_and_resolve(struct pipe_screen *pscreen, - const struct pipe_resource *tmpl, - const uint64_t *modifiers, int count, uint32_t *psize) + const struct pipe_resource *tmpl, + const uint64_t *modifiers, int count, + uint32_t *psize) { - struct fd_screen *screen = fd_screen(pscreen); - struct fd_resource *rsc; - struct pipe_resource *prsc; - enum pipe_format format = tmpl->format; - uint32_t size; - - rsc = alloc_resource_struct(pscreen, tmpl); - if (!rsc) - return NULL; - - prsc = &rsc->b.b; - - DBG("%"PRSC_FMT, PRSC_ARGS(prsc)); - - threaded_resource_init(prsc); - - if (tmpl->bind & PIPE_BIND_SHARED) - rsc->b.is_shared = true; - - fd_resource_layout_init(prsc); - -#define LINEAR \ - (PIPE_BIND_SCANOUT | \ - PIPE_BIND_LINEAR | \ - PIPE_BIND_DISPLAY_TARGET) - - bool linear = drm_find_modifier(DRM_FORMAT_MOD_LINEAR, 
modifiers, count); - if (linear) { - perf_debug("%"PRSC_FMT": linear: DRM_FORMAT_MOD_LINEAR requested!", PRSC_ARGS(prsc)); - } else if (tmpl->bind & LINEAR) { - if (tmpl->usage != PIPE_USAGE_STAGING) - perf_debug("%"PRSC_FMT": linear: LINEAR bind requested!", PRSC_ARGS(prsc)); - linear = true; - } - - if (FD_DBG(NOTILE)) - linear = true; - - /* Normally, for non-shared buffers, allow buffer compression if - * not shared, otherwise only allow if QCOM_COMPRESSED modifier - * is requested: - * - * TODO we should probably also limit tiled in a similar way, - * except we don't have a format modifier for tiled. (We probably - * should.) - */ - bool allow_ubwc = false; - if (!linear) { - allow_ubwc = drm_find_modifier(DRM_FORMAT_MOD_INVALID, modifiers, count); - if (!allow_ubwc) { - perf_debug("%"PRSC_FMT": not UBWC: DRM_FORMAT_MOD_INVALID not requested!", - PRSC_ARGS(prsc)); - } - if (tmpl->bind & PIPE_BIND_SHARED) { - allow_ubwc = drm_find_modifier(DRM_FORMAT_MOD_QCOM_COMPRESSED, modifiers, count); - if (!allow_ubwc) { - perf_debug("%"PRSC_FMT": not UBWC: shared and DRM_FORMAT_MOD_QCOM_COMPRESSED not requested!", - PRSC_ARGS(prsc)); - linear = true; - } - } - } - - allow_ubwc &= !FD_DBG(NOUBWC); - - if (screen->tile_mode && - (tmpl->target != PIPE_BUFFER) && - !linear) { - rsc->layout.tile_mode = screen->tile_mode(prsc); - } - - rsc->internal_format = format; - - rsc->layout.ubwc = rsc->layout.tile_mode && is_a6xx(screen) && allow_ubwc; - - if (prsc->target == PIPE_BUFFER) { - assert(prsc->format == PIPE_FORMAT_R8_UNORM); - size = prsc->width0; - fdl_layout_buffer(&rsc->layout, size); - } else { - size = screen->setup_slices(rsc); - } - - /* special case for hw-query buffer, which we need to allocate before we - * know the size: - */ - if (size == 0) { - /* note, semi-intention == instead of & */ - debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER); - *psize = 0; - return prsc; - } - - /* Set the layer size if the (non-a6xx) backend hasn't done so. */ - if (rsc->layout.layer_first && !rsc->layout.layer_size) { - rsc->layout.layer_size = align(size, 4096); - size = rsc->layout.layer_size * prsc->array_size; - } - - if (FD_DBG(LAYOUT)) - fdl_dump_layout(&rsc->layout); - - /* Hand out the resolved size. */ - if (psize) - *psize = size; - - return prsc; + struct fd_screen *screen = fd_screen(pscreen); + struct fd_resource *rsc; + struct pipe_resource *prsc; + enum pipe_format format = tmpl->format; + uint32_t size; + + rsc = alloc_resource_struct(pscreen, tmpl); + if (!rsc) + return NULL; + + prsc = &rsc->b.b; + + DBG("%" PRSC_FMT, PRSC_ARGS(prsc)); + + threaded_resource_init(prsc); + + if (tmpl->bind & PIPE_BIND_SHARED) + rsc->b.is_shared = true; + + fd_resource_layout_init(prsc); + +#define LINEAR (PIPE_BIND_SCANOUT | PIPE_BIND_LINEAR | PIPE_BIND_DISPLAY_TARGET) + + bool linear = drm_find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count); + if (linear) { + perf_debug("%" PRSC_FMT ": linear: DRM_FORMAT_MOD_LINEAR requested!", + PRSC_ARGS(prsc)); + } else if (tmpl->bind & LINEAR) { + if (tmpl->usage != PIPE_USAGE_STAGING) + perf_debug("%" PRSC_FMT ": linear: LINEAR bind requested!", + PRSC_ARGS(prsc)); + linear = true; + } + + if (FD_DBG(NOTILE)) + linear = true; + + /* Normally, for non-shared buffers, allow buffer compression if + * not shared, otherwise only allow if QCOM_COMPRESSED modifier + * is requested: + * + * TODO we should probably also limit tiled in a similar way, + * except we don't have a format modifier for tiled. (We probably + * should.) 
+ */ + bool allow_ubwc = false; + if (!linear) { + allow_ubwc = drm_find_modifier(DRM_FORMAT_MOD_INVALID, modifiers, count); + if (!allow_ubwc) { + perf_debug("%" PRSC_FMT + ": not UBWC: DRM_FORMAT_MOD_INVALID not requested!", + PRSC_ARGS(prsc)); + } + if (tmpl->bind & PIPE_BIND_SHARED) { + allow_ubwc = + drm_find_modifier(DRM_FORMAT_MOD_QCOM_COMPRESSED, modifiers, count); + if (!allow_ubwc) { + perf_debug("%" PRSC_FMT + ": not UBWC: shared and DRM_FORMAT_MOD_QCOM_COMPRESSED " + "not requested!", + PRSC_ARGS(prsc)); + linear = true; + } + } + } + + allow_ubwc &= !FD_DBG(NOUBWC); + + if (screen->tile_mode && (tmpl->target != PIPE_BUFFER) && !linear) { + rsc->layout.tile_mode = screen->tile_mode(prsc); + } + + rsc->internal_format = format; + + rsc->layout.ubwc = rsc->layout.tile_mode && is_a6xx(screen) && allow_ubwc; + + if (prsc->target == PIPE_BUFFER) { + assert(prsc->format == PIPE_FORMAT_R8_UNORM); + size = prsc->width0; + fdl_layout_buffer(&rsc->layout, size); + } else { + size = screen->setup_slices(rsc); + } + + /* special case for hw-query buffer, which we need to allocate before we + * know the size: + */ + if (size == 0) { + /* note, semi-intention == instead of & */ + debug_assert(prsc->bind == PIPE_BIND_QUERY_BUFFER); + *psize = 0; + return prsc; + } + + /* Set the layer size if the (non-a6xx) backend hasn't done so. */ + if (rsc->layout.layer_first && !rsc->layout.layer_size) { + rsc->layout.layer_size = align(size, 4096); + size = rsc->layout.layer_size * prsc->array_size; + } + + if (FD_DBG(LAYOUT)) + fdl_dump_layout(&rsc->layout); + + /* Hand out the resolved size. */ + if (psize) + *psize = size; + + return prsc; } /** @@ -1210,67 +1204,67 @@ fd_resource_allocate_and_resolve(struct pipe_screen *pscreen, */ static struct pipe_resource * fd_resource_create_with_modifiers(struct pipe_screen *pscreen, - const struct pipe_resource *tmpl, - const uint64_t *modifiers, int count) + const struct pipe_resource *tmpl, + const uint64_t *modifiers, int count) { - struct fd_screen *screen = fd_screen(pscreen); - struct fd_resource *rsc; - struct pipe_resource *prsc; - uint32_t size; - - /* when using kmsro, scanout buffers are allocated on the display device - * create_with_modifiers() doesn't give us usage flags, so we have to - * assume that all calls with modifiers are scanout-possible - */ - if (screen->ro && - ((tmpl->bind & PIPE_BIND_SCANOUT) || - !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { - struct pipe_resource scanout_templat = *tmpl; - struct renderonly_scanout *scanout; - struct winsys_handle handle; - - /* note: alignment is wrong for a6xx */ - scanout_templat.width0 = align(tmpl->width0, screen->info.gmem_align_w); - - scanout = renderonly_scanout_for_resource(&scanout_templat, - screen->ro, &handle); - if (!scanout) - return NULL; - - renderonly_scanout_destroy(scanout, screen->ro); - - assert(handle.type == WINSYS_HANDLE_TYPE_FD); - rsc = fd_resource(pscreen->resource_from_handle(pscreen, tmpl, - &handle, - PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); - close(handle.handle); - if (!rsc) - return NULL; - - return &rsc->b.b; - } - - prsc = fd_resource_allocate_and_resolve(pscreen, tmpl, modifiers, count, &size); - if (!prsc) - return NULL; - rsc = fd_resource(prsc); - - realloc_bo(rsc, size); - if (!rsc->bo) - goto fail; - - return prsc; + struct fd_screen *screen = fd_screen(pscreen); + struct fd_resource *rsc; + struct pipe_resource *prsc; + uint32_t size; + + /* when using kmsro, scanout buffers are allocated on the display device + * create_with_modifiers() 
doesn't give us usage flags, so we have to + * assume that all calls with modifiers are scanout-possible + */ + if (screen->ro && + ((tmpl->bind & PIPE_BIND_SCANOUT) || + !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) { + struct pipe_resource scanout_templat = *tmpl; + struct renderonly_scanout *scanout; + struct winsys_handle handle; + + /* note: alignment is wrong for a6xx */ + scanout_templat.width0 = align(tmpl->width0, screen->info.gmem_align_w); + + scanout = + renderonly_scanout_for_resource(&scanout_templat, screen->ro, &handle); + if (!scanout) + return NULL; + + renderonly_scanout_destroy(scanout, screen->ro); + + assert(handle.type == WINSYS_HANDLE_TYPE_FD); + rsc = fd_resource(pscreen->resource_from_handle( + pscreen, tmpl, &handle, PIPE_HANDLE_USAGE_FRAMEBUFFER_WRITE)); + close(handle.handle); + if (!rsc) + return NULL; + + return &rsc->b.b; + } + + prsc = + fd_resource_allocate_and_resolve(pscreen, tmpl, modifiers, count, &size); + if (!prsc) + return NULL; + rsc = fd_resource(prsc); + + realloc_bo(rsc, size); + if (!rsc->bo) + goto fail; + + return prsc; fail: - fd_resource_destroy(pscreen, prsc); - return NULL; + fd_resource_destroy(pscreen, prsc); + return NULL; } static struct pipe_resource * fd_resource_create(struct pipe_screen *pscreen, - const struct pipe_resource *tmpl) + const struct pipe_resource *tmpl) { - const uint64_t mod = DRM_FORMAT_MOD_INVALID; - return fd_resource_create_with_modifiers(pscreen, tmpl, &mod, 1); + const uint64_t mod = DRM_FORMAT_MOD_INVALID; + return fd_resource_create_with_modifiers(pscreen, tmpl, &mod, 1); } /** @@ -1280,356 +1274,351 @@ fd_resource_create(struct pipe_screen *pscreen, */ static struct pipe_resource * fd_resource_from_handle(struct pipe_screen *pscreen, - const struct pipe_resource *tmpl, - struct winsys_handle *handle, unsigned usage) + const struct pipe_resource *tmpl, + struct winsys_handle *handle, unsigned usage) { - struct fd_screen *screen = fd_screen(pscreen); - struct fd_resource *rsc = alloc_resource_struct(pscreen, tmpl); + struct fd_screen *screen = fd_screen(pscreen); + struct fd_resource *rsc = alloc_resource_struct(pscreen, tmpl); - if (!rsc) - return NULL; + if (!rsc) + return NULL; - struct fdl_slice *slice = fd_resource_slice(rsc, 0); - struct pipe_resource *prsc = &rsc->b.b; + struct fdl_slice *slice = fd_resource_slice(rsc, 0); + struct pipe_resource *prsc = &rsc->b.b; - DBG("%"PRSC_FMT", modifier=%"PRIx64, PRSC_ARGS(prsc), handle->modifier); + DBG("%" PRSC_FMT ", modifier=%" PRIx64, PRSC_ARGS(prsc), handle->modifier); - threaded_resource_init(prsc); - rsc->b.is_shared = true; + threaded_resource_init(prsc); + rsc->b.is_shared = true; - fd_resource_layout_init(prsc); + fd_resource_layout_init(prsc); - struct fd_bo *bo = fd_screen_bo_from_handle(pscreen, handle); - if (!bo) - goto fail; + struct fd_bo *bo = fd_screen_bo_from_handle(pscreen, handle); + if (!bo) + goto fail; - fd_resource_set_bo(rsc, bo); + fd_resource_set_bo(rsc, bo); - rsc->internal_format = tmpl->format; - rsc->layout.pitch0 = handle->stride; - slice->offset = handle->offset; - slice->size0 = handle->stride * prsc->height0; + rsc->internal_format = tmpl->format; + rsc->layout.pitch0 = handle->stride; + slice->offset = handle->offset; + slice->size0 = handle->stride * prsc->height0; - /* use a pitchalign of gmem_align_w pixels, because GMEM resolve for - * lower alignments is not implemented (but possible for a6xx at least) - * - * for UBWC-enabled resources, layout_resource_for_modifier will further - * validate the pitch and 
set the right pitchalign - */ - rsc->layout.pitchalign = - fdl_cpp_shift(&rsc->layout) + util_logbase2(screen->info.gmem_align_w); + /* use a pitchalign of gmem_align_w pixels, because GMEM resolve for + * lower alignments is not implemented (but possible for a6xx at least) + * + * for UBWC-enabled resources, layout_resource_for_modifier will further + * validate the pitch and set the right pitchalign + */ + rsc->layout.pitchalign = + fdl_cpp_shift(&rsc->layout) + util_logbase2(screen->info.gmem_align_w); - /* apply the minimum pitchalign (note: actually 4 for a3xx but doesn't matter) */ - if (is_a6xx(screen) || is_a5xx(screen)) - rsc->layout.pitchalign = MAX2(rsc->layout.pitchalign, 6); - else - rsc->layout.pitchalign = MAX2(rsc->layout.pitchalign, 5); + /* apply the minimum pitchalign (note: actually 4 for a3xx but doesn't + * matter) */ + if (is_a6xx(screen) || is_a5xx(screen)) + rsc->layout.pitchalign = MAX2(rsc->layout.pitchalign, 6); + else + rsc->layout.pitchalign = MAX2(rsc->layout.pitchalign, 5); - if (rsc->layout.pitch0 < (prsc->width0 * rsc->layout.cpp) || - fd_resource_pitch(rsc, 0) != rsc->layout.pitch0) - goto fail; + if (rsc->layout.pitch0 < (prsc->width0 * rsc->layout.cpp) || + fd_resource_pitch(rsc, 0) != rsc->layout.pitch0) + goto fail; - assert(rsc->layout.cpp); + assert(rsc->layout.cpp); - if (screen->layout_resource_for_modifier(rsc, handle->modifier) < 0) - goto fail; + if (screen->layout_resource_for_modifier(rsc, handle->modifier) < 0) + goto fail; - if (screen->ro) { - rsc->scanout = - renderonly_create_gpu_import_for_resource(prsc, screen->ro, NULL); - /* failure is expected in some cases.. */ - } + if (screen->ro) { + rsc->scanout = + renderonly_create_gpu_import_for_resource(prsc, screen->ro, NULL); + /* failure is expected in some cases.. */ + } - rsc->valid = true; + rsc->valid = true; - return prsc; + return prsc; fail: - fd_resource_destroy(pscreen, prsc); - return NULL; + fd_resource_destroy(pscreen, prsc); + return NULL; } bool fd_render_condition_check(struct pipe_context *pctx) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - if (!ctx->cond_query) - return true; + if (!ctx->cond_query) + return true; - union pipe_query_result res = { 0 }; - bool wait = - ctx->cond_mode != PIPE_RENDER_COND_NO_WAIT && - ctx->cond_mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; + union pipe_query_result res = {0}; + bool wait = ctx->cond_mode != PIPE_RENDER_COND_NO_WAIT && + ctx->cond_mode != PIPE_RENDER_COND_BY_REGION_NO_WAIT; - if (pctx->get_query_result(pctx, ctx->cond_query, wait, &res)) - return (bool)res.u64 != ctx->cond_cond; + if (pctx->get_query_result(pctx, ctx->cond_query, wait, &res)) + return (bool)res.u64 != ctx->cond_cond; - return true; + return true; } static void -fd_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc) - in_dt +fd_invalidate_resource(struct pipe_context *pctx, + struct pipe_resource *prsc) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_resource *rsc = fd_resource(prsc); - - if (prsc->target == PIPE_BUFFER) { - /* Handle the glInvalidateBufferData() case: - */ - invalidate_resource(rsc, PIPE_MAP_READ | PIPE_MAP_WRITE); - } else if (rsc->track->write_batch) { - /* Handle the glInvalidateFramebuffer() case, telling us that - * we can skip resolve. 
- */ - - struct fd_batch *batch = rsc->track->write_batch; - struct pipe_framebuffer_state *pfb = &batch->framebuffer; - - if (pfb->zsbuf && pfb->zsbuf->texture == prsc) { - batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); - fd_context_dirty(ctx, FD_DIRTY_ZSA); - } - - for (unsigned i = 0; i < pfb->nr_cbufs; i++) { - if (pfb->cbufs[i] && pfb->cbufs[i]->texture == prsc) { - batch->resolve &= ~(PIPE_CLEAR_COLOR0 << i); - fd_context_dirty(ctx, FD_DIRTY_FRAMEBUFFER); - } - } - } - - rsc->valid = false; + struct fd_context *ctx = fd_context(pctx); + struct fd_resource *rsc = fd_resource(prsc); + + if (prsc->target == PIPE_BUFFER) { + /* Handle the glInvalidateBufferData() case: + */ + invalidate_resource(rsc, PIPE_MAP_READ | PIPE_MAP_WRITE); + } else if (rsc->track->write_batch) { + /* Handle the glInvalidateFramebuffer() case, telling us that + * we can skip resolve. + */ + + struct fd_batch *batch = rsc->track->write_batch; + struct pipe_framebuffer_state *pfb = &batch->framebuffer; + + if (pfb->zsbuf && pfb->zsbuf->texture == prsc) { + batch->resolve &= ~(FD_BUFFER_DEPTH | FD_BUFFER_STENCIL); + fd_context_dirty(ctx, FD_DIRTY_ZSA); + } + + for (unsigned i = 0; i < pfb->nr_cbufs; i++) { + if (pfb->cbufs[i] && pfb->cbufs[i]->texture == prsc) { + batch->resolve &= ~(PIPE_CLEAR_COLOR0 << i); + fd_context_dirty(ctx, FD_DIRTY_FRAMEBUFFER); + } + } + } + + rsc->valid = false; } static enum pipe_format fd_resource_get_internal_format(struct pipe_resource *prsc) { - return fd_resource(prsc)->internal_format; + return fd_resource(prsc)->internal_format; } static void fd_resource_set_stencil(struct pipe_resource *prsc, - struct pipe_resource *stencil) + struct pipe_resource *stencil) { - fd_resource(prsc)->stencil = fd_resource(stencil); + fd_resource(prsc)->stencil = fd_resource(stencil); } static struct pipe_resource * fd_resource_get_stencil(struct pipe_resource *prsc) { - struct fd_resource *rsc = fd_resource(prsc); - if (rsc->stencil) - return &rsc->stencil->b.b; - return NULL; + struct fd_resource *rsc = fd_resource(prsc); + if (rsc->stencil) + return &rsc->stencil->b.b; + return NULL; } static const struct u_transfer_vtbl transfer_vtbl = { - .resource_create = fd_resource_create, - .resource_destroy = fd_resource_destroy, - .transfer_map = fd_resource_transfer_map, - .transfer_flush_region = fd_resource_transfer_flush_region, - .transfer_unmap = fd_resource_transfer_unmap, - .get_internal_format = fd_resource_get_internal_format, - .set_stencil = fd_resource_set_stencil, - .get_stencil = fd_resource_get_stencil, + .resource_create = fd_resource_create, + .resource_destroy = fd_resource_destroy, + .transfer_map = fd_resource_transfer_map, + .transfer_flush_region = fd_resource_transfer_flush_region, + .transfer_unmap = fd_resource_transfer_unmap, + .get_internal_format = fd_resource_get_internal_format, + .set_stencil = fd_resource_set_stencil, + .get_stencil = fd_resource_get_stencil, }; static const uint64_t supported_modifiers[] = { - DRM_FORMAT_MOD_LINEAR, + DRM_FORMAT_MOD_LINEAR, }; static int fd_layout_resource_for_modifier(struct fd_resource *rsc, uint64_t modifier) { - switch (modifier) { - case DRM_FORMAT_MOD_LINEAR: - /* The dri gallium frontend will pass DRM_FORMAT_MOD_INVALID to us - * when it's called through any of the non-modifier BO create entry - * points. Other drivers will determine tiling from the kernel or - * other legacy backchannels, but for freedreno it just means - * LINEAR. 
*/ - case DRM_FORMAT_MOD_INVALID: - return 0; - default: - return -1; - } + switch (modifier) { + case DRM_FORMAT_MOD_LINEAR: + /* The dri gallium frontend will pass DRM_FORMAT_MOD_INVALID to us + * when it's called through any of the non-modifier BO create entry + * points. Other drivers will determine tiling from the kernel or + * other legacy backchannels, but for freedreno it just means + * LINEAR. */ + case DRM_FORMAT_MOD_INVALID: + return 0; + default: + return -1; + } } static struct pipe_resource * fd_resource_from_memobj(struct pipe_screen *pscreen, - const struct pipe_resource *tmpl, - struct pipe_memory_object *pmemobj, - uint64_t offset) + const struct pipe_resource *tmpl, + struct pipe_memory_object *pmemobj, uint64_t offset) { - struct fd_screen *screen = fd_screen(pscreen); - struct fd_memory_object *memobj = fd_memory_object(pmemobj); - struct pipe_resource *prsc; - struct fd_resource *rsc; - uint32_t size; - assert(memobj->bo); - - /* We shouldn't get a scanout buffer here. */ - assert(!(tmpl->bind & PIPE_BIND_SCANOUT)); - - uint64_t modifiers = DRM_FORMAT_MOD_INVALID; - if (tmpl->bind & PIPE_BIND_LINEAR) { - modifiers = DRM_FORMAT_MOD_LINEAR; - } else if (is_a6xx(screen) && tmpl->width0 >= FDL_MIN_UBWC_WIDTH) { - modifiers = DRM_FORMAT_MOD_QCOM_COMPRESSED; - } - - /* Allocate new pipe resource. */ - prsc = fd_resource_allocate_and_resolve(pscreen, tmpl, &modifiers, 1, &size); - if (!prsc) - return NULL; - rsc = fd_resource(prsc); - rsc->b.is_shared = true; - - /* bo's size has to be large enough, otherwise cleanup resource and fail - * gracefully. - */ - if (fd_bo_size(memobj->bo) < size) { - fd_resource_destroy(pscreen, prsc); - return NULL; - } - - /* Share the bo with the memory object. */ - fd_resource_set_bo(rsc, fd_bo_ref(memobj->bo)); - - return prsc; + struct fd_screen *screen = fd_screen(pscreen); + struct fd_memory_object *memobj = fd_memory_object(pmemobj); + struct pipe_resource *prsc; + struct fd_resource *rsc; + uint32_t size; + assert(memobj->bo); + + /* We shouldn't get a scanout buffer here. */ + assert(!(tmpl->bind & PIPE_BIND_SCANOUT)); + + uint64_t modifiers = DRM_FORMAT_MOD_INVALID; + if (tmpl->bind & PIPE_BIND_LINEAR) { + modifiers = DRM_FORMAT_MOD_LINEAR; + } else if (is_a6xx(screen) && tmpl->width0 >= FDL_MIN_UBWC_WIDTH) { + modifiers = DRM_FORMAT_MOD_QCOM_COMPRESSED; + } + + /* Allocate new pipe resource. */ + prsc = fd_resource_allocate_and_resolve(pscreen, tmpl, &modifiers, 1, &size); + if (!prsc) + return NULL; + rsc = fd_resource(prsc); + rsc->b.is_shared = true; + + /* bo's size has to be large enough, otherwise cleanup resource and fail + * gracefully. + */ + if (fd_bo_size(memobj->bo) < size) { + fd_resource_destroy(pscreen, prsc); + return NULL; + } + + /* Share the bo with the memory object. 
*/ + fd_resource_set_bo(rsc, fd_bo_ref(memobj->bo)); + + return prsc; } static struct pipe_memory_object * fd_memobj_create_from_handle(struct pipe_screen *pscreen, - struct winsys_handle *whandle, - bool dedicated) + struct winsys_handle *whandle, bool dedicated) { - struct fd_memory_object *memobj = CALLOC_STRUCT(fd_memory_object); - if (!memobj) - return NULL; + struct fd_memory_object *memobj = CALLOC_STRUCT(fd_memory_object); + if (!memobj) + return NULL; - struct fd_bo *bo = fd_screen_bo_from_handle(pscreen, whandle); - if (!bo) { - free(memobj); - return NULL; - } + struct fd_bo *bo = fd_screen_bo_from_handle(pscreen, whandle); + if (!bo) { + free(memobj); + return NULL; + } - memobj->b.dedicated = dedicated; - memobj->bo = bo; + memobj->b.dedicated = dedicated; + memobj->bo = bo; - return &memobj->b; + return &memobj->b; } static void fd_memobj_destroy(struct pipe_screen *pscreen, - struct pipe_memory_object *pmemobj) + struct pipe_memory_object *pmemobj) { - struct fd_memory_object *memobj = fd_memory_object(pmemobj); + struct fd_memory_object *memobj = fd_memory_object(pmemobj); - assert(memobj->bo); - fd_bo_del(memobj->bo); + assert(memobj->bo); + fd_bo_del(memobj->bo); - free(pmemobj); + free(pmemobj); } void fd_resource_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - bool fake_rgtc = screen->gpu_id < 400; - - pscreen->resource_create = u_transfer_helper_resource_create; - /* NOTE: u_transfer_helper does not yet support the _with_modifiers() - * variant: - */ - pscreen->resource_create_with_modifiers = fd_resource_create_with_modifiers; - pscreen->resource_from_handle = fd_resource_from_handle; - pscreen->resource_get_handle = fd_resource_get_handle; - pscreen->resource_destroy = u_transfer_helper_resource_destroy; - - pscreen->transfer_helper = u_transfer_helper_create(&transfer_vtbl, - true, false, fake_rgtc, true); - - if (!screen->layout_resource_for_modifier) - screen->layout_resource_for_modifier = fd_layout_resource_for_modifier; - if (!screen->supported_modifiers) { - screen->supported_modifiers = supported_modifiers; - screen->num_supported_modifiers = ARRAY_SIZE(supported_modifiers); - } - - /* GL_EXT_memory_object */ - pscreen->memobj_create_from_handle = fd_memobj_create_from_handle; - pscreen->memobj_destroy = fd_memobj_destroy; - pscreen->resource_from_memobj = fd_resource_from_memobj; + struct fd_screen *screen = fd_screen(pscreen); + bool fake_rgtc = screen->gpu_id < 400; + + pscreen->resource_create = u_transfer_helper_resource_create; + /* NOTE: u_transfer_helper does not yet support the _with_modifiers() + * variant: + */ + pscreen->resource_create_with_modifiers = fd_resource_create_with_modifiers; + pscreen->resource_from_handle = fd_resource_from_handle; + pscreen->resource_get_handle = fd_resource_get_handle; + pscreen->resource_destroy = u_transfer_helper_resource_destroy; + + pscreen->transfer_helper = + u_transfer_helper_create(&transfer_vtbl, true, false, fake_rgtc, true); + + if (!screen->layout_resource_for_modifier) + screen->layout_resource_for_modifier = fd_layout_resource_for_modifier; + if (!screen->supported_modifiers) { + screen->supported_modifiers = supported_modifiers; + screen->num_supported_modifiers = ARRAY_SIZE(supported_modifiers); + } + + /* GL_EXT_memory_object */ + pscreen->memobj_create_from_handle = fd_memobj_create_from_handle; + pscreen->memobj_destroy = fd_memobj_destroy; + pscreen->resource_from_memobj = fd_resource_from_memobj; } static void -fd_get_sample_position(struct 
pipe_context *context, - unsigned sample_count, unsigned sample_index, - float *pos_out) +fd_get_sample_position(struct pipe_context *context, unsigned sample_count, + unsigned sample_index, float *pos_out) { - /* The following is copied from nouveau/nv50 except for position - * values, which are taken from blob driver */ - static const uint8_t pos1[1][2] = { { 0x8, 0x8 } }; - static const uint8_t pos2[2][2] = { - { 0xc, 0xc }, { 0x4, 0x4 } }; - static const uint8_t pos4[4][2] = { - { 0x6, 0x2 }, { 0xe, 0x6 }, - { 0x2, 0xa }, { 0xa, 0xe } }; - /* TODO needs to be verified on supported hw */ - static const uint8_t pos8[8][2] = { - { 0x9, 0x5 }, { 0x7, 0xb }, - { 0xd, 0x9 }, { 0x5, 0x3 }, - { 0x3, 0xd }, { 0x1, 0x7 }, - { 0xb, 0xf }, { 0xf, 0x1 } }; - - const uint8_t (*ptr)[2]; - - switch (sample_count) { - case 1: - ptr = pos1; - break; - case 2: - ptr = pos2; - break; - case 4: - ptr = pos4; - break; - case 8: - ptr = pos8; - break; - default: - assert(0); - return; - } - - pos_out[0] = ptr[sample_index][0] / 16.0f; - pos_out[1] = ptr[sample_index][1] / 16.0f; + /* The following is copied from nouveau/nv50 except for position + * values, which are taken from blob driver */ + static const uint8_t pos1[1][2] = {{0x8, 0x8}}; + static const uint8_t pos2[2][2] = {{0xc, 0xc}, {0x4, 0x4}}; + static const uint8_t pos4[4][2] = {{0x6, 0x2}, + {0xe, 0x6}, + {0x2, 0xa}, + {0xa, 0xe}}; + /* TODO needs to be verified on supported hw */ + static const uint8_t pos8[8][2] = {{0x9, 0x5}, {0x7, 0xb}, {0xd, 0x9}, + {0x5, 0x3}, {0x3, 0xd}, {0x1, 0x7}, + {0xb, 0xf}, {0xf, 0x1}}; + + const uint8_t(*ptr)[2]; + + switch (sample_count) { + case 1: + ptr = pos1; + break; + case 2: + ptr = pos2; + break; + case 4: + ptr = pos4; + break; + case 8: + ptr = pos8; + break; + default: + assert(0); + return; + } + + pos_out[0] = ptr[sample_index][0] / 16.0f; + pos_out[1] = ptr[sample_index][1] / 16.0f; } static void -fd_blit_pipe(struct pipe_context *pctx, const struct pipe_blit_info *blit_info) - in_dt +fd_blit_pipe(struct pipe_context *pctx, + const struct pipe_blit_info *blit_info) in_dt { - /* wrap fd_blit to return void */ - fd_blit(pctx, blit_info); + /* wrap fd_blit to return void */ + fd_blit(pctx, blit_info); } void fd_resource_context_init(struct pipe_context *pctx) { - pctx->transfer_map = u_transfer_helper_transfer_map; - pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; - pctx->transfer_unmap = u_transfer_helper_transfer_unmap; - pctx->buffer_subdata = u_default_buffer_subdata; - pctx->texture_subdata = u_default_texture_subdata; - pctx->create_surface = fd_create_surface; - pctx->surface_destroy = fd_surface_destroy; - pctx->resource_copy_region = fd_resource_copy_region; - pctx->blit = fd_blit_pipe; - pctx->flush_resource = fd_flush_resource; - pctx->invalidate_resource = fd_invalidate_resource; - pctx->get_sample_position = fd_get_sample_position; + pctx->transfer_map = u_transfer_helper_transfer_map; + pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region; + pctx->transfer_unmap = u_transfer_helper_transfer_unmap; + pctx->buffer_subdata = u_default_buffer_subdata; + pctx->texture_subdata = u_default_texture_subdata; + pctx->create_surface = fd_create_surface; + pctx->surface_destroy = fd_surface_destroy; + pctx->resource_copy_region = fd_resource_copy_region; + pctx->blit = fd_blit_pipe; + pctx->flush_resource = fd_flush_resource; + pctx->invalidate_resource = fd_invalidate_resource; + pctx->get_sample_position = fd_get_sample_position; } diff --git 
a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h index 0982b2f..f786a60 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/src/gallium/drivers/freedreno/freedreno_resource.h @@ -28,32 +28,31 @@ #define FREEDRENO_RESOURCE_H_ #include "util/list.h" +#include "util/simple_mtx.h" #include "util/u_dump.h" #include "util/u_range.h" #include "util/u_transfer_helper.h" -#include "util/simple_mtx.h" +#include "freedreno/fdl/freedreno_layout.h" #include "freedreno_batch.h" #include "freedreno_util.h" -#include "freedreno/fdl/freedreno_layout.h" - - -#define PRSC_FMT \ - "p: target=%s, format=%s, %ux%ux%u, " \ - "array_size=%u, last_level=%u, " \ - "nr_samples=%u, usage=%u, bind=%x, flags=%x" -#define PRSC_ARGS(p) \ - (p), util_str_tex_target((p)->target, true), util_format_short_name((p)->format), \ - (p)->width0, (p)->height0, (p)->depth0, (p)->array_size, (p)->last_level, \ - (p)->nr_samples, (p)->usage, (p)->bind, (p)->flags +#define PRSC_FMT \ + "p: target=%s, format=%s, %ux%ux%u, " \ + "array_size=%u, last_level=%u, " \ + "nr_samples=%u, usage=%u, bind=%x, flags=%x" +#define PRSC_ARGS(p) \ + (p), util_str_tex_target((p)->target, true), \ + util_format_short_name((p)->format), (p)->width0, (p)->height0, \ + (p)->depth0, (p)->array_size, (p)->last_level, (p)->nr_samples, \ + (p)->usage, (p)->bind, (p)->flags enum fd_lrz_direction { - FD_LRZ_UNKNOWN, - /* Depth func less/less-than: */ - FD_LRZ_LESS, - /* Depth func greater/greater-than: */ - FD_LRZ_GREATER, + FD_LRZ_UNKNOWN, + /* Depth func less/less-than: */ + FD_LRZ_LESS, + /* Depth func greater/greater-than: */ + FD_LRZ_GREATER, }; /** @@ -75,257 +74,258 @@ enum fd_lrz_direction { * per-tile query results. */ struct fd_resource_tracking { - struct pipe_reference reference; - - /* bitmask of in-flight batches which reference this resource. Note - * that the batch doesn't hold reference to resources (but instead - * the fd_ringbuffer holds refs to the underlying fd_bo), but in case - * the resource is destroyed we need to clean up the batch's weak - * references to us. - */ - uint32_t batch_mask; - - /* reference to batch that writes this resource: */ - struct fd_batch *write_batch; - - /* Set of batches whose batch-cache key references this resource. - * We need to track this to know which batch-cache entries to - * invalidate if, for example, the resource is invalidated or - * shadowed. - */ - uint32_t bc_batch_mask; + struct pipe_reference reference; + + /* bitmask of in-flight batches which reference this resource. Note + * that the batch doesn't hold reference to resources (but instead + * the fd_ringbuffer holds refs to the underlying fd_bo), but in case + * the resource is destroyed we need to clean up the batch's weak + * references to us. + */ + uint32_t batch_mask; + + /* reference to batch that writes this resource: */ + struct fd_batch *write_batch; + + /* Set of batches whose batch-cache key references this resource. + * We need to track this to know which batch-cache entries to + * invalidate if, for example, the resource is invalidated or + * shadowed. 
+ */ + uint32_t bc_batch_mask; }; void __fd_resource_tracking_destroy(struct fd_resource_tracking *track); static inline void fd_resource_tracking_reference(struct fd_resource_tracking **ptr, - struct fd_resource_tracking *track) + struct fd_resource_tracking *track) { - struct fd_resource_tracking *old_track = *ptr; + struct fd_resource_tracking *old_track = *ptr; - if (pipe_reference(&(*ptr)->reference, &track->reference)) { - assert(!old_track->write_batch); - free(old_track); - } + if (pipe_reference(&(*ptr)->reference, &track->reference)) { + assert(!old_track->write_batch); + free(old_track); + } - *ptr = track; + *ptr = track; } /** * A resource (any buffer/texture/image/etc) */ struct fd_resource { - struct threaded_resource b; - struct fd_bo *bo; /* use fd_resource_set_bo() to write */ - enum pipe_format internal_format; - struct fdl_layout layout; - - /* buffer range that has been initialized */ - struct util_range valid_buffer_range; - bool valid; - struct renderonly_scanout *scanout; - - /* reference to the resource holding stencil data for a z32_s8 texture */ - /* TODO rename to secondary or auxiliary? */ - struct fd_resource *stencil; - - struct fd_resource_tracking *track; - - simple_mtx_t lock; - - /* bitmask of state this resource could potentially dirty when rebound, - * see rebind_resource() - */ - enum fd_dirty_3d_state dirty; - - /* Sequence # incremented each time bo changes: */ - uint16_t seqno; - - /* Is this buffer a replacement created by threaded_context to avoid - * a stall in PIPE_MAP_DISCARD_WHOLE_RESOURCE|PIPE_MAP_WRITE case? - * If so, it no longer "owns" it's rsc->track, and so should not - * invalidate when the rsc is destroyed. - */ - bool is_replacement : 1; - - /* Uninitialized resources with UBWC format need their UBWC flag data - * cleared before writes, as the UBWC state is read and used during - * writes, so undefined UBWC flag data results in undefined results. - */ - bool needs_ubwc_clear : 1; - - /* - * LRZ - * - * TODO lrz width/height/pitch should probably also move to - * fdl_layout - */ - bool lrz_valid : 1; - enum fd_lrz_direction lrz_direction : 2; - uint16_t lrz_width; // for lrz clear, does this differ from lrz_pitch? - uint16_t lrz_height; - uint16_t lrz_pitch; - struct fd_bo *lrz; + struct threaded_resource b; + struct fd_bo *bo; /* use fd_resource_set_bo() to write */ + enum pipe_format internal_format; + struct fdl_layout layout; + + /* buffer range that has been initialized */ + struct util_range valid_buffer_range; + bool valid; + struct renderonly_scanout *scanout; + + /* reference to the resource holding stencil data for a z32_s8 texture */ + /* TODO rename to secondary or auxiliary? */ + struct fd_resource *stencil; + + struct fd_resource_tracking *track; + + simple_mtx_t lock; + + /* bitmask of state this resource could potentially dirty when rebound, + * see rebind_resource() + */ + enum fd_dirty_3d_state dirty; + + /* Sequence # incremented each time bo changes: */ + uint16_t seqno; + + /* Is this buffer a replacement created by threaded_context to avoid + * a stall in PIPE_MAP_DISCARD_WHOLE_RESOURCE|PIPE_MAP_WRITE case? + * If so, it no longer "owns" it's rsc->track, and so should not + * invalidate when the rsc is destroyed. + */ + bool is_replacement : 1; + + /* Uninitialized resources with UBWC format need their UBWC flag data + * cleared before writes, as the UBWC state is read and used during + * writes, so undefined UBWC flag data results in undefined results. 
+ */ + bool needs_ubwc_clear : 1; + + /* + * LRZ + * + * TODO lrz width/height/pitch should probably also move to + * fdl_layout + */ + bool lrz_valid : 1; + enum fd_lrz_direction lrz_direction : 2; + uint16_t lrz_width; // for lrz clear, does this differ from lrz_pitch? + uint16_t lrz_height; + uint16_t lrz_pitch; + struct fd_bo *lrz; }; struct fd_memory_object { - struct pipe_memory_object b; - struct fd_bo *bo; + struct pipe_memory_object b; + struct fd_bo *bo; }; static inline struct fd_resource * fd_resource(struct pipe_resource *ptex) { - return (struct fd_resource *)ptex; + return (struct fd_resource *)ptex; } static inline const struct fd_resource * fd_resource_const(const struct pipe_resource *ptex) { - return (const struct fd_resource *)ptex; + return (const struct fd_resource *)ptex; } static inline struct fd_memory_object * -fd_memory_object (struct pipe_memory_object *pmemobj) +fd_memory_object(struct pipe_memory_object *pmemobj) { - return (struct fd_memory_object *)pmemobj; + return (struct fd_memory_object *)pmemobj; } static inline bool pending(struct fd_resource *rsc, bool write) { - /* if we have a pending GPU write, we are busy in any case: */ - if (rsc->track->write_batch) - return true; + /* if we have a pending GPU write, we are busy in any case: */ + if (rsc->track->write_batch) + return true; - /* if CPU wants to write, but we are pending a GPU read, we are busy: */ - if (write && rsc->track->batch_mask) - return true; + /* if CPU wants to write, but we are pending a GPU read, we are busy: */ + if (write && rsc->track->batch_mask) + return true; - if (rsc->stencil && pending(rsc->stencil, write)) - return true; + if (rsc->stencil && pending(rsc->stencil, write)) + return true; - return false; + return false; } static inline bool fd_resource_busy(struct fd_resource *rsc, unsigned op) { - return fd_bo_cpu_prep(rsc->bo, NULL, op | DRM_FREEDRENO_PREP_NOSYNC) != 0; + return fd_bo_cpu_prep(rsc->bo, NULL, op | DRM_FREEDRENO_PREP_NOSYNC) != 0; } -int __fd_resource_wait(struct fd_context *ctx, struct fd_resource *rsc, unsigned op, const char *func); -#define fd_resource_wait(ctx, rsc, op) __fd_resource_wait(ctx, rsc, op, __func__) +int __fd_resource_wait(struct fd_context *ctx, struct fd_resource *rsc, + unsigned op, const char *func); +#define fd_resource_wait(ctx, rsc, op) \ + __fd_resource_wait(ctx, rsc, op, __func__) static inline void fd_resource_lock(struct fd_resource *rsc) { - simple_mtx_lock(&rsc->lock); + simple_mtx_lock(&rsc->lock); } static inline void fd_resource_unlock(struct fd_resource *rsc) { - simple_mtx_unlock(&rsc->lock); + simple_mtx_unlock(&rsc->lock); } static inline void fd_resource_set_usage(struct pipe_resource *prsc, enum fd_dirty_3d_state usage) { - if (!prsc) - return; - struct fd_resource *rsc = fd_resource(prsc); - /* Bits are only ever ORed in, and we expect many set_usage() per - * resource, so do the quick check outside of the lock. - */ - if (likely(rsc->dirty & usage)) - return; - fd_resource_lock(rsc); - rsc->dirty |= usage; - fd_resource_unlock(rsc); + if (!prsc) + return; + struct fd_resource *rsc = fd_resource(prsc); + /* Bits are only ever ORed in, and we expect many set_usage() per + * resource, so do the quick check outside of the lock. 
+ */ + if (likely(rsc->dirty & usage)) + return; + fd_resource_lock(rsc); + rsc->dirty |= usage; + fd_resource_unlock(rsc); } static inline bool has_depth(enum pipe_format format) { - const struct util_format_description *desc = - util_format_description(format); - return util_format_has_depth(desc); + const struct util_format_description *desc = util_format_description(format); + return util_format_has_depth(desc); } struct fd_transfer { - struct threaded_transfer b; - struct pipe_resource *staging_prsc; - struct pipe_box staging_box; + struct threaded_transfer b; + struct pipe_resource *staging_prsc; + struct pipe_box staging_box; }; static inline struct fd_transfer * fd_transfer(struct pipe_transfer *ptrans) { - return (struct fd_transfer *)ptrans; + return (struct fd_transfer *)ptrans; } static inline struct fdl_slice * fd_resource_slice(struct fd_resource *rsc, unsigned level) { - assert(level <= rsc->b.b.last_level); - return &rsc->layout.slices[level]; + assert(level <= rsc->b.b.last_level); + return &rsc->layout.slices[level]; } static inline uint32_t fd_resource_layer_stride(struct fd_resource *rsc, unsigned level) { - return fdl_layer_stride(&rsc->layout, level); + return fdl_layer_stride(&rsc->layout, level); } /* get pitch (in bytes) for specified mipmap level */ static inline uint32_t fd_resource_pitch(struct fd_resource *rsc, unsigned level) { - if (is_a2xx(fd_screen(rsc->b.b.screen))) - return fdl2_pitch(&rsc->layout, level); + if (is_a2xx(fd_screen(rsc->b.b.screen))) + return fdl2_pitch(&rsc->layout, level); - return fdl_pitch(&rsc->layout, level); + return fdl_pitch(&rsc->layout, level); } /* get offset for specified mipmap level and texture/array layer */ static inline uint32_t fd_resource_offset(struct fd_resource *rsc, unsigned level, unsigned layer) { - uint32_t offset = fdl_surface_offset(&rsc->layout, level, layer); - debug_assert(offset < fd_bo_size(rsc->bo)); - return offset; + uint32_t offset = fdl_surface_offset(&rsc->layout, level, layer); + debug_assert(offset < fd_bo_size(rsc->bo)); + return offset; } static inline uint32_t fd_resource_ubwc_offset(struct fd_resource *rsc, unsigned level, unsigned layer) { - uint32_t offset = fdl_ubwc_offset(&rsc->layout, level, layer); - debug_assert(offset < fd_bo_size(rsc->bo)); - return offset; + uint32_t offset = fdl_ubwc_offset(&rsc->layout, level, layer); + debug_assert(offset < fd_bo_size(rsc->bo)); + return offset; } /* This might be a5xx specific, but higher mipmap levels are always linear: */ static inline bool fd_resource_level_linear(const struct pipe_resource *prsc, int level) { - struct fd_screen *screen = fd_screen(prsc->screen); - debug_assert(!is_a3xx(screen)); + struct fd_screen *screen = fd_screen(prsc->screen); + debug_assert(!is_a3xx(screen)); - return fdl_level_linear(&fd_resource_const(prsc)->layout, level); + return fdl_level_linear(&fd_resource_const(prsc)->layout, level); } static inline uint32_t fd_resource_tile_mode(struct pipe_resource *prsc, int level) { - return fdl_tile_mode(&fd_resource(prsc)->layout, level); + return fdl_tile_mode(&fd_resource(prsc)->layout, level); } static inline bool fd_resource_ubwc_enabled(struct fd_resource *rsc, int level) { - return fdl_ubwc_enabled(&rsc->layout, level); + return fdl_ubwc_enabled(&rsc->layout, level); } /* access # of samples, with 0 normalized to 1 (which is what we care about @@ -334,7 +334,7 @@ fd_resource_ubwc_enabled(struct fd_resource *rsc, int level) static inline unsigned fd_resource_nr_samples(struct pipe_resource *prsc) { - return MAX2(1, 
prsc->nr_samples); + return MAX2(1, prsc->nr_samples); } void fd_resource_screen_init(struct pipe_screen *pscreen); @@ -342,9 +342,11 @@ void fd_resource_context_init(struct pipe_context *pctx); uint32_t fd_setup_slices(struct fd_resource *rsc); void fd_resource_resize(struct pipe_resource *prsc, uint32_t sz); -void fd_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src) in_dt; -void fd_resource_uncompress(struct fd_context *ctx, struct fd_resource *rsc) assert_dt; +void fd_replace_buffer_storage(struct pipe_context *ctx, + struct pipe_resource *dst, + struct pipe_resource *src) in_dt; +void fd_resource_uncompress(struct fd_context *ctx, + struct fd_resource *rsc) assert_dt; void fd_resource_dump(struct fd_resource *rsc, const char *name); bool fd_render_condition_check(struct pipe_context *pctx) assert_dt; @@ -352,30 +354,28 @@ bool fd_render_condition_check(struct pipe_context *pctx) assert_dt; static inline bool fd_batch_references_resource(struct fd_batch *batch, struct fd_resource *rsc) { - return rsc->track->batch_mask & (1 << batch->idx); + return rsc->track->batch_mask & (1 << batch->idx); } static inline void -fd_batch_write_prep(struct fd_batch *batch, struct fd_resource *rsc) - assert_dt +fd_batch_write_prep(struct fd_batch *batch, struct fd_resource *rsc) assert_dt { - if (unlikely(rsc->needs_ubwc_clear)) { - batch->ctx->clear_ubwc(batch, rsc); - rsc->needs_ubwc_clear = false; - } + if (unlikely(rsc->needs_ubwc_clear)) { + batch->ctx->clear_ubwc(batch, rsc); + rsc->needs_ubwc_clear = false; + } } static inline void fd_batch_resource_read(struct fd_batch *batch, - struct fd_resource *rsc) - assert_dt + struct fd_resource *rsc) assert_dt { - /* Fast path: if we hit this then we know we don't have anyone else - * writing to it (since both _write and _read flush other writers), and - * that we've already recursed for stencil. - */ - if (unlikely(!fd_batch_references_resource(batch, rsc))) - fd_batch_resource_read_slowpath(batch, rsc); + /* Fast path: if we hit this then we know we don't have anyone else + * writing to it (since both _write and _read flush other writers), and + * that we've already recursed for stencil. 
+ */ + if (unlikely(!fd_batch_references_resource(batch, rsc))) + fd_batch_resource_read_slowpath(batch, rsc); } #endif /* FREEDRENO_RESOURCE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 337aa08..b184576 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -24,31 +24,30 @@ * Rob Clark */ - #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" #include "util/format/u_format.h" #include "util/format/u_format_s3tc.h" +#include "util/u_debug.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" #include "util/u_screen.h" #include "util/u_string.h" -#include "util/u_debug.h" #include "util/os_time.h" -#include "drm-uapi/drm_fourcc.h" #include #include #include +#include "drm-uapi/drm_fourcc.h" #include -#include "freedreno_screen.h" -#include "freedreno_resource.h" #include "freedreno_fence.h" #include "freedreno_query.h" +#include "freedreno_resource.h" +#include "freedreno_screen.h" #include "freedreno_util.h" #include "a2xx/fd2_screen.h" @@ -60,9 +59,9 @@ /* for fd_get_driver/device_uuid() */ #include "common/freedreno_uuid.h" -#include "ir3/ir3_nir.h" -#include "ir3/ir3_gallium.h" #include "a2xx/ir2.h" +#include "ir3/ir3_gallium.h" +#include "ir3/ir3_nir.h" /* clang-format off */ static const struct debug_named_value fd_debug_options[] = { @@ -107,72 +106,69 @@ bool fd_binning_enabled = true; static const char * fd_screen_get_name(struct pipe_screen *pscreen) { - static char buffer[128]; - snprintf(buffer, sizeof(buffer), "FD%03d", - fd_screen(pscreen)->device_id); - return buffer; + static char buffer[128]; + snprintf(buffer, sizeof(buffer), "FD%03d", fd_screen(pscreen)->device_id); + return buffer; } static const char * fd_screen_get_vendor(struct pipe_screen *pscreen) { - return "freedreno"; + return "freedreno"; } static const char * fd_screen_get_device_vendor(struct pipe_screen *pscreen) { - return "Qualcomm"; + return "Qualcomm"; } - static uint64_t fd_screen_get_timestamp(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - - if (screen->has_timestamp) { - uint64_t n; - fd_pipe_get_param(screen->pipe, FD_TIMESTAMP, &n); - debug_assert(screen->max_freq > 0); - return n * 1000000000 / screen->max_freq; - } else { - int64_t cpu_time = os_time_get() * 1000; - return cpu_time + screen->cpu_gpu_time_delta; - } - + struct fd_screen *screen = fd_screen(pscreen); + + if (screen->has_timestamp) { + uint64_t n; + fd_pipe_get_param(screen->pipe, FD_TIMESTAMP, &n); + debug_assert(screen->max_freq > 0); + return n * 1000000000 / screen->max_freq; + } else { + int64_t cpu_time = os_time_get() * 1000; + return cpu_time + screen->cpu_gpu_time_delta; + } } static void fd_screen_destroy(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - if (screen->pipe) - fd_pipe_del(screen->pipe); + if (screen->pipe) + fd_pipe_del(screen->pipe); - if (screen->dev) - fd_device_del(screen->dev); + if (screen->dev) + fd_device_del(screen->dev); - if (screen->ro) - screen->ro->destroy(screen->ro); + if (screen->ro) + screen->ro->destroy(screen->ro); - fd_bc_fini(&screen->batch_cache); - fd_gmem_screen_fini(pscreen); + fd_bc_fini(&screen->batch_cache); + fd_gmem_screen_fini(pscreen); - slab_destroy_parent(&screen->transfer_pool); + slab_destroy_parent(&screen->transfer_pool); - 
simple_mtx_destroy(&screen->lock); + simple_mtx_destroy(&screen->lock); - u_transfer_helper_destroy(pscreen->transfer_helper); + u_transfer_helper_destroy(pscreen->transfer_helper); - if (screen->compiler) - ir3_screen_fini(pscreen); + if (screen->compiler) + ir3_screen_fini(pscreen); - ralloc_free(screen->live_batches); + ralloc_free(screen->live_batches); - free(screen->perfcntr_queries); - free(screen); + free(screen->perfcntr_queries); + free(screen); } /* @@ -182,505 +178,526 @@ tables for things that differ if the delta is not too much.. static int fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) { - struct fd_screen *screen = fd_screen(pscreen); - - /* this is probably not totally correct.. but it's a start: */ - switch (param) { - /* Supported features (boolean caps). */ - case PIPE_CAP_NPOT_TEXTURES: - case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: - case PIPE_CAP_ANISOTROPIC_FILTER: - case PIPE_CAP_POINT_SPRITE: - case PIPE_CAP_BLEND_EQUATION_SEPARATE: - case PIPE_CAP_TEXTURE_SWIZZLE: - case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: - case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: - case PIPE_CAP_SEAMLESS_CUBE_MAP: - case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: - case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: - case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: - case PIPE_CAP_STRING_MARKER: - case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: - case PIPE_CAP_TEXTURE_BARRIER: - case PIPE_CAP_INVALIDATE_BUFFER: - case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: - case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: - case PIPE_CAP_NIR_COMPACT_ARRAYS: - return 1; - - case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: - case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: - return !is_a2xx(screen); - - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: - return is_a2xx(screen); - case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: - return !is_a2xx(screen); - - case PIPE_CAP_PACKED_UNIFORMS: - return !is_a2xx(screen); - - case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: - case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: - return screen->has_robustness; - - case PIPE_CAP_VERTEXID_NOBASE: - return is_a3xx(screen) || is_a4xx(screen); - - case PIPE_CAP_COMPUTE: - return has_compute(screen); - - case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: - case PIPE_CAP_PCI_GROUP: - case PIPE_CAP_PCI_BUS: - case PIPE_CAP_PCI_DEVICE: - case PIPE_CAP_PCI_FUNCTION: - return 0; - - case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: - case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: - case PIPE_CAP_VERTEX_SHADER_SATURATE: - case PIPE_CAP_PRIMITIVE_RESTART: - case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: - case PIPE_CAP_TGSI_INSTANCEID: - case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: - case PIPE_CAP_INDEP_BLEND_ENABLE: - case PIPE_CAP_INDEP_BLEND_FUNC: - case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: - case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: - case PIPE_CAP_CONDITIONAL_RENDER: - case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: - case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: - case PIPE_CAP_CLIP_HALFZ: - return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); - - case PIPE_CAP_FAKE_SW_MSAA: - return !fd_screen_get_param(pscreen, PIPE_CAP_TEXTURE_MULTISAMPLE); - - case PIPE_CAP_TEXTURE_MULTISAMPLE: - return is_a5xx(screen) || is_a6xx(screen); - - case PIPE_CAP_SURFACE_SAMPLE_COUNT: - return is_a6xx(screen); - - case PIPE_CAP_DEPTH_CLIP_DISABLE: - return is_a3xx(screen) || is_a4xx(screen) || is_a6xx(screen); - - case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: - return 
is_a6xx(screen); - - case PIPE_CAP_POLYGON_OFFSET_CLAMP: - return is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); - - case PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF: - return 0; - - case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: - if (is_a3xx(screen)) return 16; - if (is_a4xx(screen)) return 32; - if (is_a5xx(screen) || is_a6xx(screen)) return 64; - return 0; - case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: - /* We could possibly emulate more by pretending 2d/rect textures and - * splitting high bits of index into 2nd dimension.. - */ - if (is_a3xx(screen)) return 8192; - if (is_a4xx(screen)) return 16384; - - /* Note that the Vulkan blob on a540 and 640 report a - * maxTexelBufferElements of just 65536 (the GLES3.2 and Vulkan - * minimum). - */ - if (is_a5xx(screen) || is_a6xx(screen)) return 1 << 27; - return 0; - - case PIPE_CAP_TEXTURE_FLOAT_LINEAR: - case PIPE_CAP_CUBE_MAP_ARRAY: - case PIPE_CAP_SAMPLER_VIEW_TARGET: - case PIPE_CAP_TEXTURE_QUERY_LOD: - return is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); - - case PIPE_CAP_START_INSTANCE: - /* Note that a5xx can do this, it just can't (at least with - * current firmware) do draw_indirect with base_instance. - * Since draw_indirect is needed sooner (gles31 and gl40 vs - * gl42), hide base_instance on a5xx. :-/ - */ - return is_a4xx(screen); - - case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: - return is_a2xx(screen) ? 64 : 32; - - case PIPE_CAP_GLSL_FEATURE_LEVEL: - case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - if (is_a6xx(screen)) - return 330; - else if (is_ir3(screen)) - return 140; - else - return 120; - - case PIPE_CAP_ESSL_FEATURE_LEVEL: - /* we can probably enable 320 for a5xx too, but need to test: */ - if (is_a6xx(screen)) return 320; - if (is_a5xx(screen)) return 310; - if (is_ir3(screen)) return 300; - return 120; - - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - if (is_a6xx(screen)) return 64; - if (is_a5xx(screen)) return 4; - return 0; - - case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: - if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) - return 4; - return 0; - - /* TODO if we need this, do it in nir/ir3 backend to avoid breaking precompile: */ - case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - return 0; - - case PIPE_CAP_FBFETCH: - if (fd_device_version(screen->dev) >= FD_VERSION_GMEM_BASE && - is_a6xx(screen)) - return 1; - return 0; - case PIPE_CAP_SAMPLE_SHADING: - if (is_a6xx(screen)) return 1; - return 0; - - case PIPE_CAP_CONTEXT_PRIORITY_MASK: - return screen->priority_mask; - - case PIPE_CAP_DRAW_INDIRECT: - if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) - return 1; - return 0; - - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: - if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) - return 1; - return 0; - - case PIPE_CAP_LOAD_CONSTBUF: - /* name is confusing, but this turns on std430 packing */ - if (is_ir3(screen)) - return 1; - return 0; - - case PIPE_CAP_NIR_IMAGES_AS_DEREF: - return 0; - - case PIPE_CAP_MAX_VIEWPORTS: - return 1; - - case PIPE_CAP_MAX_VARYINGS: - return is_a6xx(screen) ? 31 : 16; - - case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: - /* We don't really have a limit on this, it all goes into the main - * memory buffer. Needs to be at least 120 / 4 (minimum requirement - * for GL_MAX_TESS_PATCH_COMPONENTS). - */ - return 128; - - case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: - return 64 * 1024 * 1024; - - case PIPE_CAP_SHAREABLE_SHADERS: - case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: - if (is_ir3(screen)) - return 1; - return 0; - - /* Geometry shaders.. 
*/ - case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: - return 512; - case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: - return 2048; - case PIPE_CAP_MAX_GS_INVOCATIONS: - return 32; - - /* Only a2xx has the half-border clamp mode in HW, just have mesa/st lower - * it for later HW. - */ - case PIPE_CAP_GL_CLAMP: - return is_a2xx(screen); - - case PIPE_CAP_CLIP_PLANES: - /* On a3xx, there is HW support for GL user clip planes that - * occasionally has to fall back to shader key-based lowering to clip - * distances in the VS, and we don't support clip distances so that is - * always shader-based lowering in the FS. - * - * On a4xx, there is no HW support for clip planes, so they are - * always lowered to clip distances. We also lack SW support for the - * HW's clip distances in HW, so we do shader-based lowering in the FS - * in the driver backend. - * - * On a5xx-a6xx, we have the HW clip distances hooked up, so we just let - * mesa/st lower desktop GL's clip planes to clip distances in the last - * vertex shader stage. - */ - return !is_a5xx(screen) && !is_a6xx(screen); - - /* Stream output. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - if (is_ir3(screen)) - return PIPE_MAX_SO_BUFFERS; - return 0; - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: - case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: - case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_TGSI_TEXCOORD: - if (is_ir3(screen)) - return 1; - return 0; - case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - return 1; - case PIPE_CAP_TGSI_FS_POINT_IS_SYSVAL: - return is_a2xx(screen); - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - if (is_ir3(screen)) - return 16 * 4; /* should only be shader out limit? */ - return 0; - - /* Texturing. */ - case PIPE_CAP_MAX_TEXTURE_2D_SIZE: - if (is_a6xx(screen) || is_a5xx(screen) || is_a4xx(screen)) - return 16384; - else - return 8192; - case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: - if (is_a6xx(screen) || is_a5xx(screen) || is_a4xx(screen)) - return 15; - else - return 14; - case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: - return 11; - - case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: - return (is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) ? 256 : 0; - - /* Render targets. */ - case PIPE_CAP_MAX_RENDER_TARGETS: - return screen->max_rts; - case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: - return (is_a3xx(screen) || is_a6xx(screen)) ? 1 : 0; - - /* Queries. 
*/ - case PIPE_CAP_OCCLUSION_QUERY: - return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); - case PIPE_CAP_QUERY_TIMESTAMP: - case PIPE_CAP_QUERY_TIME_ELAPSED: - /* only a4xx, requires new enough kernel so we know max_freq: */ - return (screen->max_freq > 0) && (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)); - - case PIPE_CAP_VENDOR_ID: - return 0x5143; - case PIPE_CAP_DEVICE_ID: - return 0xFFFFFFFF; - case PIPE_CAP_ACCELERATED: - return 1; - case PIPE_CAP_VIDEO_MEMORY: - DBG("FINISHME: The value returned is incorrect\n"); - return 10; - case PIPE_CAP_UMA: - return 1; - case PIPE_CAP_MEMOBJ: - return fd_device_version(screen->dev) >= FD_VERSION_MEMORY_FD; - case PIPE_CAP_NATIVE_FENCE_FD: - return fd_device_version(screen->dev) >= FD_VERSION_FENCE_FD; - case PIPE_CAP_FENCE_SIGNAL: - return screen->has_syncobj; - case PIPE_CAP_CULL_DISTANCE: - return is_a6xx(screen); - case PIPE_CAP_SHADER_STENCIL_EXPORT: - return is_a6xx(screen); - case PIPE_CAP_TWO_SIDED_COLOR: - return 0; - default: - return u_pipe_screen_get_param_defaults(pscreen, param); - } + struct fd_screen *screen = fd_screen(pscreen); + + /* this is probably not totally correct.. but it's a start: */ + switch (param) { + /* Supported features (boolean caps). */ + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES: + case PIPE_CAP_ANISOTROPIC_FILTER: + case PIPE_CAP_POINT_SPRITE: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + case PIPE_CAP_STRING_MARKER: + case PIPE_CAP_MIXED_COLOR_DEPTH_BITS: + case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_RGB_OVERRIDE_DST_ALPHA_BLEND: + case PIPE_CAP_GLSL_TESS_LEVELS_AS_INPUTS: + case PIPE_CAP_NIR_COMPACT_ARRAYS: + return 1; + + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + return !is_a2xx(screen); + + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + return is_a2xx(screen); + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + return !is_a2xx(screen); + + case PIPE_CAP_PACKED_UNIFORMS: + return !is_a2xx(screen); + + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: + return screen->has_robustness; + + case PIPE_CAP_VERTEXID_NOBASE: + return is_a3xx(screen) || is_a4xx(screen); + + case PIPE_CAP_COMPUTE: + return has_compute(screen); + + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: + return 0; + + case PIPE_CAP_FRAGMENT_SHADER_TEXTURE_LOD: + case PIPE_CAP_FRAGMENT_SHADER_DERIVATIVES: + case PIPE_CAP_VERTEX_SHADER_SATURATE: + case PIPE_CAP_PRIMITIVE_RESTART: + case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_CONDITIONAL_RENDER_INVERTED: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + case PIPE_CAP_CLIP_HALFZ: + return is_a3xx(screen) || 
is_a4xx(screen) || is_a5xx(screen) || + is_a6xx(screen); + + case PIPE_CAP_FAKE_SW_MSAA: + return !fd_screen_get_param(pscreen, PIPE_CAP_TEXTURE_MULTISAMPLE); + + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return is_a5xx(screen) || is_a6xx(screen); + + case PIPE_CAP_SURFACE_SAMPLE_COUNT: + return is_a6xx(screen); + + case PIPE_CAP_DEPTH_CLIP_DISABLE: + return is_a3xx(screen) || is_a4xx(screen) || is_a6xx(screen); + + case PIPE_CAP_DEPTH_CLIP_DISABLE_SEPARATE: + return is_a6xx(screen); + + case PIPE_CAP_POLYGON_OFFSET_CLAMP: + return is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); + + case PIPE_CAP_PREFER_IMM_ARRAYS_AS_CONSTBUF: + return 0; + + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + if (is_a3xx(screen)) + return 16; + if (is_a4xx(screen)) + return 32; + if (is_a5xx(screen) || is_a6xx(screen)) + return 64; + return 0; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + /* We could possibly emulate more by pretending 2d/rect textures and + * splitting high bits of index into 2nd dimension.. + */ + if (is_a3xx(screen)) + return 8192; + if (is_a4xx(screen)) + return 16384; + + /* Note that the Vulkan blob on a540 and 640 report a + * maxTexelBufferElements of just 65536 (the GLES3.2 and Vulkan + * minimum). + */ + if (is_a5xx(screen) || is_a6xx(screen)) + return 1 << 27; + return 0; + + case PIPE_CAP_TEXTURE_FLOAT_LINEAR: + case PIPE_CAP_CUBE_MAP_ARRAY: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + case PIPE_CAP_TEXTURE_QUERY_LOD: + return is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); + + case PIPE_CAP_START_INSTANCE: + /* Note that a5xx can do this, it just can't (at least with + * current firmware) do draw_indirect with base_instance. + * Since draw_indirect is needed sooner (gles31 and gl40 vs + * gl42), hide base_instance on a5xx. :-/ + */ + return is_a4xx(screen); + + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + return is_a2xx(screen) ? 64 : 32; + + case PIPE_CAP_GLSL_FEATURE_LEVEL: + case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: + if (is_a6xx(screen)) + return 330; + else if (is_ir3(screen)) + return 140; + else + return 120; + + case PIPE_CAP_ESSL_FEATURE_LEVEL: + /* we can probably enable 320 for a5xx too, but need to test: */ + if (is_a6xx(screen)) + return 320; + if (is_a5xx(screen)) + return 310; + if (is_ir3(screen)) + return 300; + return 120; + + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + if (is_a6xx(screen)) + return 64; + if (is_a5xx(screen)) + return 4; + return 0; + + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) + return 4; + return 0; + + /* TODO if we need this, do it in nir/ir3 backend to avoid breaking + * precompile: */ + case PIPE_CAP_FORCE_PERSAMPLE_INTERP: + return 0; + + case PIPE_CAP_FBFETCH: + if (fd_device_version(screen->dev) >= FD_VERSION_GMEM_BASE && + is_a6xx(screen)) + return 1; + return 0; + case PIPE_CAP_SAMPLE_SHADING: + if (is_a6xx(screen)) + return 1; + return 0; + + case PIPE_CAP_CONTEXT_PRIORITY_MASK: + return screen->priority_mask; + + case PIPE_CAP_DRAW_INDIRECT: + if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) + return 1; + return 0; + + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + if (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) + return 1; + return 0; + + case PIPE_CAP_LOAD_CONSTBUF: + /* name is confusing, but this turns on std430 packing */ + if (is_ir3(screen)) + return 1; + return 0; + + case PIPE_CAP_NIR_IMAGES_AS_DEREF: + return 0; + + case PIPE_CAP_MAX_VIEWPORTS: + return 1; + + case PIPE_CAP_MAX_VARYINGS: + return is_a6xx(screen) ? 
31 : 16; + + case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: + /* We don't really have a limit on this, it all goes into the main + * memory buffer. Needs to be at least 120 / 4 (minimum requirement + * for GL_MAX_TESS_PATCH_COMPONENTS). + */ + return 128; + + case PIPE_CAP_MAX_TEXTURE_UPLOAD_MEMORY_BUDGET: + return 64 * 1024 * 1024; + + case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY: + if (is_ir3(screen)) + return 1; + return 0; + + /* Geometry shaders.. */ + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + return 512; + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 2048; + case PIPE_CAP_MAX_GS_INVOCATIONS: + return 32; + + /* Only a2xx has the half-border clamp mode in HW, just have mesa/st lower + * it for later HW. + */ + case PIPE_CAP_GL_CLAMP: + return is_a2xx(screen); + + case PIPE_CAP_CLIP_PLANES: + /* On a3xx, there is HW support for GL user clip planes that + * occasionally has to fall back to shader key-based lowering to clip + * distances in the VS, and we don't support clip distances so that is + * always shader-based lowering in the FS. + * + * On a4xx, there is no HW support for clip planes, so they are + * always lowered to clip distances. We also lack SW support for the + * HW's clip distances in HW, so we do shader-based lowering in the FS + * in the driver backend. + * + * On a5xx-a6xx, we have the HW clip distances hooked up, so we just let + * mesa/st lower desktop GL's clip planes to clip distances in the last + * vertex shader stage. + */ + return !is_a5xx(screen) && !is_a6xx(screen); + + /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + if (is_ir3(screen)) + return PIPE_MAX_SO_BUFFERS; + return 0; + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS: + case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: + case PIPE_CAP_TGSI_TEXCOORD: + if (is_ir3(screen)) + return 1; + return 0; + case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + return 1; + case PIPE_CAP_TGSI_FS_POINT_IS_SYSVAL: + return is_a2xx(screen); + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + if (is_ir3(screen)) + return 16 * 4; /* should only be shader out limit? */ + return 0; + + /* Texturing. */ + case PIPE_CAP_MAX_TEXTURE_2D_SIZE: + if (is_a6xx(screen) || is_a5xx(screen) || is_a4xx(screen)) + return 16384; + else + return 8192; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + if (is_a6xx(screen) || is_a5xx(screen) || is_a4xx(screen)) + return 15; + else + return 14; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 11; + + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return (is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || + is_a6xx(screen)) + ? 256 + : 0; + + /* Render targets. */ + case PIPE_CAP_MAX_RENDER_TARGETS: + return screen->max_rts; + case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: + return (is_a3xx(screen) || is_a6xx(screen)) ? 1 : 0; + + /* Queries. 
*/ + case PIPE_CAP_OCCLUSION_QUERY: + return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || + is_a6xx(screen); + case PIPE_CAP_QUERY_TIMESTAMP: + case PIPE_CAP_QUERY_TIME_ELAPSED: + /* only a4xx, requires new enough kernel so we know max_freq: */ + return (screen->max_freq > 0) && + (is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)); + + case PIPE_CAP_VENDOR_ID: + return 0x5143; + case PIPE_CAP_DEVICE_ID: + return 0xFFFFFFFF; + case PIPE_CAP_ACCELERATED: + return 1; + case PIPE_CAP_VIDEO_MEMORY: + DBG("FINISHME: The value returned is incorrect\n"); + return 10; + case PIPE_CAP_UMA: + return 1; + case PIPE_CAP_MEMOBJ: + return fd_device_version(screen->dev) >= FD_VERSION_MEMORY_FD; + case PIPE_CAP_NATIVE_FENCE_FD: + return fd_device_version(screen->dev) >= FD_VERSION_FENCE_FD; + case PIPE_CAP_FENCE_SIGNAL: + return screen->has_syncobj; + case PIPE_CAP_CULL_DISTANCE: + return is_a6xx(screen); + case PIPE_CAP_SHADER_STENCIL_EXPORT: + return is_a6xx(screen); + case PIPE_CAP_TWO_SIDED_COLOR: + return 0; + default: + return u_pipe_screen_get_param_defaults(pscreen, param); + } } static float fd_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) { - switch (param) { - case PIPE_CAPF_MAX_LINE_WIDTH: - case PIPE_CAPF_MAX_LINE_WIDTH_AA: - /* NOTE: actual value is 127.0f, but this is working around a deqp - * bug.. dEQP-GLES3.functional.rasterization.primitives.lines_wide - * uses too small of a render target size, and gets confused when - * the lines start going offscreen. - * - * See: https://code.google.com/p/android/issues/detail?id=206513 - */ - if (FD_DBG(DEQP)) - return 48.0f; - return 127.0f; - case PIPE_CAPF_MAX_POINT_WIDTH: - case PIPE_CAPF_MAX_POINT_WIDTH_AA: - return 4092.0f; - case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: - return 16.0f; - case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: - return 15.0f; - case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: - case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: - return 0.0f; - } - mesa_loge("unknown paramf %d", param); - return 0; + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + /* NOTE: actual value is 127.0f, but this is working around a deqp + * bug.. dEQP-GLES3.functional.rasterization.primitives.lines_wide + * uses too small of a render target size, and gets confused when + * the lines start going offscreen. 
+ * + * See: https://code.google.com/p/android/issues/detail?id=206513 + */ + if (FD_DBG(DEQP)) + return 48.0f; + return 127.0f; + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return 4092.0f; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 15.0f; + case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE: + case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY: + return 0.0f; + } + mesa_loge("unknown paramf %d", param); + return 0; } static int fd_screen_get_shader_param(struct pipe_screen *pscreen, - enum pipe_shader_type shader, - enum pipe_shader_cap param) + enum pipe_shader_type shader, + enum pipe_shader_cap param) { - struct fd_screen *screen = fd_screen(pscreen); - - switch(shader) - { - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_VERTEX: - break; - case PIPE_SHADER_TESS_CTRL: - case PIPE_SHADER_TESS_EVAL: - case PIPE_SHADER_GEOMETRY: - if (is_a6xx(screen)) - break; - return 0; - case PIPE_SHADER_COMPUTE: - if (has_compute(screen)) - break; - return 0; - default: - mesa_loge("unknown shader type %d", shader); - return 0; - } - - /* this is probably not totally correct.. but it's a start: */ - switch (param) { - case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: - case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: - return 16384; - case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: - return 8; /* XXX */ - case PIPE_SHADER_CAP_MAX_INPUTS: - if (shader == PIPE_SHADER_GEOMETRY && is_a6xx(screen)) - return 16; - return is_a6xx(screen) ? 32 : 16; - case PIPE_SHADER_CAP_MAX_OUTPUTS: - return is_a6xx(screen) ? 32 : 16; - case PIPE_SHADER_CAP_MAX_TEMPS: - return 64; /* Max native temporaries. */ - case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: - /* NOTE: seems to be limit for a3xx is actually 512 but - * split between VS and FS. Use lower limit of 256 to - * avoid getting into impossible situations: - */ - return ((is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen)) ? 4096 : 64) * sizeof(float[4]); - case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - return is_ir3(screen) ? 16 : 1; - case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: - return 1; - case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: - case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: - /* Technically this should be the same as for TEMP/CONST, since - * everything is just normal registers. This is just temporary - * hack until load_input/store_output handle arrays in a similar - * way as load_var/store_var.. - * - * For tessellation stages, inputs are loaded using ldlw or ldg, both - * of which support indirection. - */ - return shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL; - case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: - case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: - /* a2xx compiler doesn't handle indirect: */ - return is_ir3(screen) ? 1 : 0; - case PIPE_SHADER_CAP_SUBROUTINES: - case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: - case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: - case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: - case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: - return 0; - case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 1; - case PIPE_SHADER_CAP_INTEGERS: - return is_ir3(screen) ? 
1 : 0; - case PIPE_SHADER_CAP_INT64_ATOMICS: - case PIPE_SHADER_CAP_FP16_DERIVATIVES: - case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: - case PIPE_SHADER_CAP_INT16: - case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: - return 0; - case PIPE_SHADER_CAP_FP16: - return ((is_a5xx(screen) || is_a6xx(screen)) && - (shader == PIPE_SHADER_COMPUTE || - shader == PIPE_SHADER_FRAGMENT) && - !FD_DBG(NOFP16)); - case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: - case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: - return 16; - case PIPE_SHADER_CAP_PREFERRED_IR: - return PIPE_SHADER_IR_NIR; - case PIPE_SHADER_CAP_SUPPORTED_IRS: - return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI); - case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: - return 32; - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: - if (is_a5xx(screen) || is_a6xx(screen)) { - /* a5xx (and a4xx for that matter) has one state-block - * for compute-shader SSBO's and another that is shared - * by VS/HS/DS/GS/FS.. so to simplify things for now - * just advertise SSBOs for FS and CS. We could possibly - * do what blob does, and partition the space for - * VS/HS/DS/GS/FS. The blob advertises: - * - * GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS: 4 - * GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS: 4 - * GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS: 4 - * GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS: 4 - * GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS: 4 - * GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS: 24 - * GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS: 24 - * - * I think that way we could avoid having to patch shaders - * for actual SSBO indexes by using a static partitioning. - * - * Note same state block is used for images and buffers, - * but images also need texture state for read access - * (isam/isam.3d) - */ - switch(shader) - { - case PIPE_SHADER_FRAGMENT: - case PIPE_SHADER_COMPUTE: - return 24; - default: - return 0; - } - } - return 0; - } - mesa_loge("unknown shader param %d", param); - return 0; + struct fd_screen *screen = fd_screen(pscreen); + + switch (shader) { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_VERTEX: + break; + case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: + case PIPE_SHADER_GEOMETRY: + if (is_a6xx(screen)) + break; + return 0; + case PIPE_SHADER_COMPUTE: + if (has_compute(screen)) + break; + return 0; + default: + mesa_loge("unknown shader type %d", shader); + return 0; + } + + /* this is probably not totally correct.. but it's a start: */ + switch (param) { + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + return 16384; + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 8; /* XXX */ + case PIPE_SHADER_CAP_MAX_INPUTS: + if (shader == PIPE_SHADER_GEOMETRY && is_a6xx(screen)) + return 16; + return is_a6xx(screen) ? 32 : 16; + case PIPE_SHADER_CAP_MAX_OUTPUTS: + return is_a6xx(screen) ? 32 : 16; + case PIPE_SHADER_CAP_MAX_TEMPS: + return 64; /* Max native temporaries. */ + case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: + /* NOTE: seems to be limit for a3xx is actually 512 but + * split between VS and FS. Use lower limit of 256 to + * avoid getting into impossible situations: + */ + return ((is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || + is_a6xx(screen)) + ? 4096 + : 64) * + sizeof(float[4]); + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return is_ir3(screen) ? 
16 : 1; + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + /* Technically this should be the same as for TEMP/CONST, since + * everything is just normal registers. This is just temporary + * hack until load_input/store_output handle arrays in a similar + * way as load_var/store_var.. + * + * For tessellation stages, inputs are loaded using ldlw or ldg, both + * of which support indirection. + */ + return shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL; + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + /* a2xx compiler doesn't handle indirect: */ + return is_ir3(screen) ? 1 : 0; + case PIPE_SHADER_CAP_SUBROUTINES: + case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS: + case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD: + case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS: + case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS: + return 0; + case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_INTEGERS: + return is_ir3(screen) ? 1 : 0; + case PIPE_SHADER_CAP_INT64_ATOMICS: + case PIPE_SHADER_CAP_FP16_DERIVATIVES: + case PIPE_SHADER_CAP_FP16_CONST_BUFFERS: + case PIPE_SHADER_CAP_INT16: + case PIPE_SHADER_CAP_GLSL_16BIT_CONSTS: + return 0; + case PIPE_SHADER_CAP_FP16: + return ( + (is_a5xx(screen) || is_a6xx(screen)) && + (shader == PIPE_SHADER_COMPUTE || shader == PIPE_SHADER_FRAGMENT) && + !FD_DBG(NOFP16)); + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS: + return 16; + case PIPE_SHADER_CAP_PREFERRED_IR: + return PIPE_SHADER_IR_NIR; + case PIPE_SHADER_CAP_SUPPORTED_IRS: + return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI); + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: + return 32; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + if (is_a5xx(screen) || is_a6xx(screen)) { + /* a5xx (and a4xx for that matter) has one state-block + * for compute-shader SSBO's and another that is shared + * by VS/HS/DS/GS/FS.. so to simplify things for now + * just advertise SSBOs for FS and CS. We could possibly + * do what blob does, and partition the space for + * VS/HS/DS/GS/FS. The blob advertises: + * + * GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS: 4 + * GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS: 4 + * GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS: 4 + * GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS: 4 + * GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS: 4 + * GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS: 24 + * GL_MAX_COMBINED_SHADER_STORAGE_BLOCKS: 24 + * + * I think that way we could avoid having to patch shaders + * for actual SSBO indexes by using a static partitioning. 
+ * + * Note same state block is used for images and buffers, + * but images also need texture state for read access + * (isam/isam.3d) + */ + switch (shader) { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_COMPUTE: + return 24; + default: + return 0; + } + } + return 0; + } + mesa_loge("unknown shader param %d", param); + return 0; } /* TODO depending on how much the limits differ for a3xx/a4xx, maybe move this @@ -688,423 +705,422 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, */ static int fd_get_compute_param(struct pipe_screen *pscreen, enum pipe_shader_ir ir_type, - enum pipe_compute_cap param, void *ret) + enum pipe_compute_cap param, void *ret) { - struct fd_screen *screen = fd_screen(pscreen); - const char * const ir = "ir3"; + struct fd_screen *screen = fd_screen(pscreen); + const char *const ir = "ir3"; - if (!has_compute(screen)) - return 0; + if (!has_compute(screen)) + return 0; -#define RET(x) do { \ - if (ret) \ - memcpy(ret, x, sizeof(x)); \ - return sizeof(x); \ -} while (0) +#define RET(x) \ + do { \ + if (ret) \ + memcpy(ret, x, sizeof(x)); \ + return sizeof(x); \ + } while (0) - switch (param) { - case PIPE_COMPUTE_CAP_ADDRESS_BITS: -// don't expose 64b pointer support yet, until ir3 supports 64b -// math, otherwise spir64 target is used and we get 64b pointer -// calculations that we can't do yet -// if (is_a5xx(screen)) -// RET((uint32_t []){ 64 }); - RET((uint32_t []){ 32 }); + switch (param) { + case PIPE_COMPUTE_CAP_ADDRESS_BITS: + // don't expose 64b pointer support yet, until ir3 supports 64b + // math, otherwise spir64 target is used and we get 64b pointer + // calculations that we can't do yet + // if (is_a5xx(screen)) + // RET((uint32_t []){ 64 }); + RET((uint32_t[]){32}); - case PIPE_COMPUTE_CAP_IR_TARGET: - if (ret) - sprintf(ret, "%s", ir); - return strlen(ir) * sizeof(char); + case PIPE_COMPUTE_CAP_IR_TARGET: + if (ret) + sprintf(ret, "%s", ir); + return strlen(ir) * sizeof(char); - case PIPE_COMPUTE_CAP_GRID_DIMENSION: - RET((uint64_t []) { 3 }); + case PIPE_COMPUTE_CAP_GRID_DIMENSION: + RET((uint64_t[]){3}); - case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: - RET(((uint64_t []) { 65535, 65535, 65535 })); + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE: + RET(((uint64_t[]){65535, 65535, 65535})); - case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: - RET(((uint64_t []) { 1024, 1024, 64 })); + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE: + RET(((uint64_t[]){1024, 1024, 64})); - case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: - RET((uint64_t []) { 1024 }); + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK: + RET((uint64_t[]){1024}); - case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: - RET((uint64_t []) { screen->ram_size }); + case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: + RET((uint64_t[]){screen->ram_size}); - case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: - RET((uint64_t []) { 32768 }); + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: + RET((uint64_t[]){32768}); - case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: - case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: - RET((uint64_t []) { 4096 }); + case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: + case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: + RET((uint64_t[]){4096}); - case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: - RET((uint64_t []) { screen->ram_size }); + case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE: + RET((uint64_t[]){screen->ram_size}); - case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: - RET((uint32_t []) { screen->max_freq / 1000000 }); + case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY: + RET((uint32_t[]){screen->max_freq / 1000000}); - case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: - RET((uint32_t []) { 9999 }); // TODO + 
case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS: + RET((uint32_t[]){9999}); // TODO - case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: - RET((uint32_t []) { 1 }); + case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED: + RET((uint32_t[]){1}); - case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: - RET((uint32_t []) { 32 }); // TODO + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: + RET((uint32_t[]){32}); // TODO - case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: - RET((uint64_t []) { 1024 }); // TODO - } + case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: + RET((uint64_t[]){1024}); // TODO + } - return 0; + return 0; } static const void * -fd_get_compiler_options(struct pipe_screen *pscreen, - enum pipe_shader_ir ir, unsigned shader) +fd_get_compiler_options(struct pipe_screen *pscreen, enum pipe_shader_ir ir, + unsigned shader) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - if (is_ir3(screen)) - return ir3_get_compiler_options(screen->compiler); + if (is_ir3(screen)) + return ir3_get_compiler_options(screen->compiler); - return ir2_get_compiler_options(); + return ir2_get_compiler_options(); } static struct disk_cache * fd_get_disk_shader_cache(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - if (is_ir3(screen)) { - struct ir3_compiler *compiler = screen->compiler; - return compiler->disk_cache; - } + if (is_ir3(screen)) { + struct ir3_compiler *compiler = screen->compiler; + return compiler->disk_cache; + } - return NULL; + return NULL; } bool -fd_screen_bo_get_handle(struct pipe_screen *pscreen, - struct fd_bo *bo, - struct renderonly_scanout *scanout, - unsigned stride, - struct winsys_handle *whandle) +fd_screen_bo_get_handle(struct pipe_screen *pscreen, struct fd_bo *bo, + struct renderonly_scanout *scanout, unsigned stride, + struct winsys_handle *whandle) { - whandle->stride = stride; - - if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) { - return fd_bo_get_name(bo, &whandle->handle) == 0; - } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) { - if (renderonly_get_handle(scanout, whandle)) - return true; - whandle->handle = fd_bo_handle(bo); - return true; - } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) { - whandle->handle = fd_bo_dmabuf(bo); - return true; - } else { - return false; - } + whandle->stride = stride; + + if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) { + return fd_bo_get_name(bo, &whandle->handle) == 0; + } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) { + if (renderonly_get_handle(scanout, whandle)) + return true; + whandle->handle = fd_bo_handle(bo); + return true; + } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) { + whandle->handle = fd_bo_dmabuf(bo); + return true; + } else { + return false; + } } static void fd_screen_query_dmabuf_modifiers(struct pipe_screen *pscreen, - enum pipe_format format, - int max, uint64_t *modifiers, - unsigned int *external_only, - int *count) + enum pipe_format format, int max, + uint64_t *modifiers, + unsigned int *external_only, int *count) { - struct fd_screen *screen = fd_screen(pscreen); - int i, num = 0; + struct fd_screen *screen = fd_screen(pscreen); + int i, num = 0; - max = MIN2(max, screen->num_supported_modifiers); + max = MIN2(max, screen->num_supported_modifiers); - if (!max) { - max = screen->num_supported_modifiers; - external_only = NULL; - modifiers = NULL; - } + if (!max) { + max = screen->num_supported_modifiers; + external_only = NULL; + modifiers = NULL; + } - for (i = 0; i < max; i++) { - if (modifiers) - 
modifiers[num] = screen->supported_modifiers[i]; + for (i = 0; i < max; i++) { + if (modifiers) + modifiers[num] = screen->supported_modifiers[i]; - if (external_only) - external_only[num] = 0; + if (external_only) + external_only[num] = 0; - num++; - } + num++; + } - *count = num; + *count = num; } static bool fd_screen_is_dmabuf_modifier_supported(struct pipe_screen *pscreen, - uint64_t modifier, - enum pipe_format format, - bool *external_only) + uint64_t modifier, + enum pipe_format format, + bool *external_only) { - struct fd_screen *screen = fd_screen(pscreen); - int i; + struct fd_screen *screen = fd_screen(pscreen); + int i; - for (i = 0; i < screen->num_supported_modifiers; i++) { - if (modifier == screen->supported_modifiers[i]) { - if (external_only) - *external_only = false; + for (i = 0; i < screen->num_supported_modifiers; i++) { + if (modifier == screen->supported_modifiers[i]) { + if (external_only) + *external_only = false; - return true; - } - } + return true; + } + } - return false; + return false; } struct fd_bo * fd_screen_bo_from_handle(struct pipe_screen *pscreen, - struct winsys_handle *whandle) + struct winsys_handle *whandle) { - struct fd_screen *screen = fd_screen(pscreen); - struct fd_bo *bo; - - if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) { - bo = fd_bo_from_name(screen->dev, whandle->handle); - } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) { - bo = fd_bo_from_handle(screen->dev, whandle->handle, 0); - } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) { - bo = fd_bo_from_dmabuf(screen->dev, whandle->handle); - } else { - DBG("Attempt to import unsupported handle type %d", whandle->type); - return NULL; - } - - if (!bo) { - DBG("ref name 0x%08x failed", whandle->handle); - return NULL; - } - - return bo; + struct fd_screen *screen = fd_screen(pscreen); + struct fd_bo *bo; + + if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) { + bo = fd_bo_from_name(screen->dev, whandle->handle); + } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) { + bo = fd_bo_from_handle(screen->dev, whandle->handle, 0); + } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) { + bo = fd_bo_from_dmabuf(screen->dev, whandle->handle); + } else { + DBG("Attempt to import unsupported handle type %d", whandle->type); + return NULL; + } + + if (!bo) { + DBG("ref name 0x%08x failed", whandle->handle); + return NULL; + } + + return bo; } -static void _fd_fence_ref(struct pipe_screen *pscreen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *pfence) +static void +_fd_fence_ref(struct pipe_screen *pscreen, struct pipe_fence_handle **ptr, + struct pipe_fence_handle *pfence) { - fd_fence_ref(ptr, pfence); + fd_fence_ref(ptr, pfence); } static void fd_screen_get_device_uuid(struct pipe_screen *pscreen, char *uuid) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - fd_get_device_uuid(uuid, screen->gpu_id); + fd_get_device_uuid(uuid, screen->gpu_id); } static void fd_screen_get_driver_uuid(struct pipe_screen *pscreen, char *uuid) { - fd_get_driver_uuid(uuid); + fd_get_driver_uuid(uuid); } struct pipe_screen * fd_screen_create(struct fd_device *dev, struct renderonly *ro) { - struct fd_screen *screen = CALLOC_STRUCT(fd_screen); - struct pipe_screen *pscreen; - uint64_t val; - - fd_mesa_debug = debug_get_option_fd_mesa_debug(); - - if (FD_DBG(NOBIN)) - fd_binning_enabled = false; - - if (!screen) - return NULL; - - pscreen = &screen->base; - - screen->dev = dev; - screen->ro = ro; - screen->refcnt = 1; - - // maybe this should be in 
context? - screen->pipe = fd_pipe_new(screen->dev, FD_PIPE_3D); - if (!screen->pipe) { - DBG("could not create 3d pipe"); - goto fail; - } - - if (fd_pipe_get_param(screen->pipe, FD_GMEM_SIZE, &val)) { - DBG("could not get GMEM size"); - goto fail; - } - screen->gmemsize_bytes = env_var_as_unsigned("FD_MESA_GMEM", val); - - if (fd_device_version(dev) >= FD_VERSION_GMEM_BASE) { - fd_pipe_get_param(screen->pipe, FD_GMEM_BASE, &screen->gmem_base); - } - - if (fd_pipe_get_param(screen->pipe, FD_DEVICE_ID, &val)) { - DBG("could not get device-id"); - goto fail; - } - screen->device_id = val; - - if (fd_pipe_get_param(screen->pipe, FD_MAX_FREQ, &val)) { - DBG("could not get gpu freq"); - /* this limits what performance related queries are - * supported but is not fatal - */ - screen->max_freq = 0; - } else { - screen->max_freq = val; - if (fd_pipe_get_param(screen->pipe, FD_TIMESTAMP, &val) == 0) - screen->has_timestamp = true; - } - - if (fd_pipe_get_param(screen->pipe, FD_GPU_ID, &val)) { - DBG("could not get gpu-id"); - goto fail; - } - screen->gpu_id = val; - - if (fd_pipe_get_param(screen->pipe, FD_CHIP_ID, &val)) { - DBG("could not get chip-id"); - /* older kernels may not have this property: */ - unsigned core = screen->gpu_id / 100; - unsigned major = (screen->gpu_id % 100) / 10; - unsigned minor = screen->gpu_id % 10; - unsigned patch = 0; /* assume the worst */ - val = (patch & 0xff) | ((minor & 0xff) << 8) | - ((major & 0xff) << 16) | ((core & 0xff) << 24); - } - screen->chip_id = val; - - if (fd_pipe_get_param(screen->pipe, FD_NR_RINGS, &val)) { - DBG("could not get # of rings"); - screen->priority_mask = 0; - } else { - /* # of rings equates to number of unique priority values: */ - screen->priority_mask = (1 << val) - 1; - } - - if (fd_device_version(dev) >= FD_VERSION_ROBUSTNESS) - screen->has_robustness = true; - - screen->has_syncobj = fd_has_syncobj(screen->dev); - - struct sysinfo si; - sysinfo(&si); - screen->ram_size = si.totalram; - - DBG("Pipe Info:"); - DBG(" GPU-id: %d", screen->gpu_id); - DBG(" Chip-id: 0x%08x", screen->chip_id); - DBG(" GMEM size: 0x%08x", screen->gmemsize_bytes); - - /* explicitly checking for GPU revisions that are known to work. This - * may be overly conservative for a3xx, where spoofing the gpu_id with - * the blob driver seems to generate identical cmdstream dumps. But - * on a2xx, there seem to be small differences between the GPU revs - * so it is probably better to actually test first on real hardware - * before enabling: - * - * If you have a different adreno version, feel free to add it to one - * of the cases below and see what happens. And if it works, please - * send a patch ;-) - */ - switch (screen->gpu_id) { - case 200: - case 201: - case 205: - case 220: - fd2_screen_init(pscreen); - break; - case 305: - case 307: - case 320: - case 330: - fd3_screen_init(pscreen); - break; - case 405: - case 420: - case 430: - fd4_screen_init(pscreen); - break; - case 510: - case 530: - case 540: - fd5_screen_init(pscreen); - break; - case 618: - case 630: - case 640: - case 650: - fd6_screen_init(pscreen); - break; - default: - mesa_loge("unsupported GPU: a%03d", screen->gpu_id); - goto fail; - } - - freedreno_dev_info_init(&screen->info, screen->gpu_id); - - if (FD_DBG(PERFC)) { - screen->perfcntr_groups = fd_perfcntrs(screen->gpu_id, - &screen->num_perfcntr_groups); - } - - /* NOTE: don't enable if we have too old of a kernel to support - * growable cmdstream buffers, since memory requirement for cmdstream - * buffers would be too much otherwise. 
- */ - if (fd_device_version(dev) >= FD_VERSION_UNLIMITED_CMDS) - screen->reorder = !FD_DBG(INORDER); - - if (BATCH_DEBUG) - screen->live_batches = _mesa_pointer_set_create(NULL); - - fd_bc_init(&screen->batch_cache); - - list_inithead(&screen->context_list); - - (void) simple_mtx_init(&screen->lock, mtx_plain); - - pscreen->destroy = fd_screen_destroy; - pscreen->get_param = fd_screen_get_param; - pscreen->get_paramf = fd_screen_get_paramf; - pscreen->get_shader_param = fd_screen_get_shader_param; - pscreen->get_compute_param = fd_get_compute_param; - pscreen->get_compiler_options = fd_get_compiler_options; - pscreen->get_disk_shader_cache = fd_get_disk_shader_cache; - - fd_resource_screen_init(pscreen); - fd_query_screen_init(pscreen); - fd_gmem_screen_init(pscreen); - - pscreen->get_name = fd_screen_get_name; - pscreen->get_vendor = fd_screen_get_vendor; - pscreen->get_device_vendor = fd_screen_get_device_vendor; - - pscreen->get_timestamp = fd_screen_get_timestamp; - - pscreen->fence_reference = _fd_fence_ref; - pscreen->fence_finish = fd_fence_finish; - pscreen->fence_get_fd = fd_fence_get_fd; - - pscreen->query_dmabuf_modifiers = fd_screen_query_dmabuf_modifiers; - pscreen->is_dmabuf_modifier_supported = fd_screen_is_dmabuf_modifier_supported; - - pscreen->get_device_uuid = fd_screen_get_device_uuid; - pscreen->get_driver_uuid = fd_screen_get_driver_uuid; - - slab_create_parent(&screen->transfer_pool, sizeof(struct fd_transfer), 16); - - return pscreen; + struct fd_screen *screen = CALLOC_STRUCT(fd_screen); + struct pipe_screen *pscreen; + uint64_t val; + + fd_mesa_debug = debug_get_option_fd_mesa_debug(); + + if (FD_DBG(NOBIN)) + fd_binning_enabled = false; + + if (!screen) + return NULL; + + pscreen = &screen->base; + + screen->dev = dev; + screen->ro = ro; + screen->refcnt = 1; + + // maybe this should be in context? 
+ screen->pipe = fd_pipe_new(screen->dev, FD_PIPE_3D); + if (!screen->pipe) { + DBG("could not create 3d pipe"); + goto fail; + } + + if (fd_pipe_get_param(screen->pipe, FD_GMEM_SIZE, &val)) { + DBG("could not get GMEM size"); + goto fail; + } + screen->gmemsize_bytes = env_var_as_unsigned("FD_MESA_GMEM", val); + + if (fd_device_version(dev) >= FD_VERSION_GMEM_BASE) { + fd_pipe_get_param(screen->pipe, FD_GMEM_BASE, &screen->gmem_base); + } + + if (fd_pipe_get_param(screen->pipe, FD_DEVICE_ID, &val)) { + DBG("could not get device-id"); + goto fail; + } + screen->device_id = val; + + if (fd_pipe_get_param(screen->pipe, FD_MAX_FREQ, &val)) { + DBG("could not get gpu freq"); + /* this limits what performance related queries are + * supported but is not fatal + */ + screen->max_freq = 0; + } else { + screen->max_freq = val; + if (fd_pipe_get_param(screen->pipe, FD_TIMESTAMP, &val) == 0) + screen->has_timestamp = true; + } + + if (fd_pipe_get_param(screen->pipe, FD_GPU_ID, &val)) { + DBG("could not get gpu-id"); + goto fail; + } + screen->gpu_id = val; + + if (fd_pipe_get_param(screen->pipe, FD_CHIP_ID, &val)) { + DBG("could not get chip-id"); + /* older kernels may not have this property: */ + unsigned core = screen->gpu_id / 100; + unsigned major = (screen->gpu_id % 100) / 10; + unsigned minor = screen->gpu_id % 10; + unsigned patch = 0; /* assume the worst */ + val = (patch & 0xff) | ((minor & 0xff) << 8) | ((major & 0xff) << 16) | + ((core & 0xff) << 24); + } + screen->chip_id = val; + + if (fd_pipe_get_param(screen->pipe, FD_NR_RINGS, &val)) { + DBG("could not get # of rings"); + screen->priority_mask = 0; + } else { + /* # of rings equates to number of unique priority values: */ + screen->priority_mask = (1 << val) - 1; + } + + if (fd_device_version(dev) >= FD_VERSION_ROBUSTNESS) + screen->has_robustness = true; + + screen->has_syncobj = fd_has_syncobj(screen->dev); + + struct sysinfo si; + sysinfo(&si); + screen->ram_size = si.totalram; + + DBG("Pipe Info:"); + DBG(" GPU-id: %d", screen->gpu_id); + DBG(" Chip-id: 0x%08x", screen->chip_id); + DBG(" GMEM size: 0x%08x", screen->gmemsize_bytes); + + /* explicitly checking for GPU revisions that are known to work. This + * may be overly conservative for a3xx, where spoofing the gpu_id with + * the blob driver seems to generate identical cmdstream dumps. But + * on a2xx, there seem to be small differences between the GPU revs + * so it is probably better to actually test first on real hardware + * before enabling: + * + * If you have a different adreno version, feel free to add it to one + * of the cases below and see what happens. And if it works, please + * send a patch ;-) + */ + switch (screen->gpu_id) { + case 200: + case 201: + case 205: + case 220: + fd2_screen_init(pscreen); + break; + case 305: + case 307: + case 320: + case 330: + fd3_screen_init(pscreen); + break; + case 405: + case 420: + case 430: + fd4_screen_init(pscreen); + break; + case 510: + case 530: + case 540: + fd5_screen_init(pscreen); + break; + case 618: + case 630: + case 640: + case 650: + fd6_screen_init(pscreen); + break; + default: + mesa_loge("unsupported GPU: a%03d", screen->gpu_id); + goto fail; + } + + freedreno_dev_info_init(&screen->info, screen->gpu_id); + + if (FD_DBG(PERFC)) { + screen->perfcntr_groups = + fd_perfcntrs(screen->gpu_id, &screen->num_perfcntr_groups); + } + + /* NOTE: don't enable if we have too old of a kernel to support + * growable cmdstream buffers, since memory requirement for cmdstream + * buffers would be too much otherwise. 
+ */ + if (fd_device_version(dev) >= FD_VERSION_UNLIMITED_CMDS) + screen->reorder = !FD_DBG(INORDER); + + if (BATCH_DEBUG) + screen->live_batches = _mesa_pointer_set_create(NULL); + + fd_bc_init(&screen->batch_cache); + + list_inithead(&screen->context_list); + + (void)simple_mtx_init(&screen->lock, mtx_plain); + + pscreen->destroy = fd_screen_destroy; + pscreen->get_param = fd_screen_get_param; + pscreen->get_paramf = fd_screen_get_paramf; + pscreen->get_shader_param = fd_screen_get_shader_param; + pscreen->get_compute_param = fd_get_compute_param; + pscreen->get_compiler_options = fd_get_compiler_options; + pscreen->get_disk_shader_cache = fd_get_disk_shader_cache; + + fd_resource_screen_init(pscreen); + fd_query_screen_init(pscreen); + fd_gmem_screen_init(pscreen); + + pscreen->get_name = fd_screen_get_name; + pscreen->get_vendor = fd_screen_get_vendor; + pscreen->get_device_vendor = fd_screen_get_device_vendor; + + pscreen->get_timestamp = fd_screen_get_timestamp; + + pscreen->fence_reference = _fd_fence_ref; + pscreen->fence_finish = fd_fence_finish; + pscreen->fence_get_fd = fd_fence_get_fd; + + pscreen->query_dmabuf_modifiers = fd_screen_query_dmabuf_modifiers; + pscreen->is_dmabuf_modifier_supported = + fd_screen_is_dmabuf_modifier_supported; + + pscreen->get_device_uuid = fd_screen_get_device_uuid; + pscreen->get_driver_uuid = fd_screen_get_driver_uuid; + + slab_create_parent(&screen->transfer_pool, sizeof(struct fd_transfer), 16); + + return pscreen; fail: - fd_screen_destroy(pscreen); - return NULL; + fd_screen_destroy(pscreen); + return NULL; } diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h index 1aa9db5..cf8b2dd 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.h +++ b/src/gallium/drivers/freedreno/freedreno_screen.h @@ -27,18 +27,18 @@ #ifndef FREEDRENO_SCREEN_H_ #define FREEDRENO_SCREEN_H_ +#include "common/freedreno_dev_info.h" #include "drm/freedreno_drmif.h" #include "drm/freedreno_ringbuffer.h" #include "perfcntrs/freedreno_perfcntr.h" -#include "common/freedreno_dev_info.h" #include "pipe/p_screen.h" +#include "renderonly/renderonly.h" #include "util/debug.h" +#include "util/simple_mtx.h" +#include "util/slab.h" #include "util/u_memory.h" #include "util/u_queue.h" -#include "util/slab.h" -#include "util/simple_mtx.h" -#include "renderonly/renderonly.h" #include "freedreno_batch_cache.h" #include "freedreno_gmem.h" @@ -50,149 +50,148 @@ struct fd_bo; * generation backend can override this with screen->gmem_reason_mask */ enum fd_gmem_reason { - FD_GMEM_CLEARS_DEPTH_STENCIL = BIT(0), - FD_GMEM_DEPTH_ENABLED = BIT(1), - FD_GMEM_STENCIL_ENABLED = BIT(2), - FD_GMEM_BLEND_ENABLED = BIT(3), - FD_GMEM_LOGICOP_ENABLED = BIT(4), - FD_GMEM_FB_READ = BIT(5), + FD_GMEM_CLEARS_DEPTH_STENCIL = BIT(0), + FD_GMEM_DEPTH_ENABLED = BIT(1), + FD_GMEM_STENCIL_ENABLED = BIT(2), + FD_GMEM_BLEND_ENABLED = BIT(3), + FD_GMEM_LOGICOP_ENABLED = BIT(4), + FD_GMEM_FB_READ = BIT(5), }; struct fd_screen { - struct pipe_screen base; + struct pipe_screen base; - struct list_head context_list; + struct list_head context_list; - simple_mtx_t lock; + simple_mtx_t lock; - /* it would be tempting to use pipe_reference here, but that - * really doesn't work well if it isn't the first member of - * the struct, so not quite so awesome to be adding refcnting - * further down the inheritance hierarchy: - */ - int refcnt; + /* it would be tempting to use pipe_reference here, but that + * really doesn't work well if it isn't the first member 
of + * the struct, so not quite so awesome to be adding refcnting + * further down the inheritance hierarchy: + */ + int refcnt; - /* place for winsys to stash it's own stuff: */ - void *winsys_priv; + /* place for winsys to stash it's own stuff: */ + void *winsys_priv; - struct slab_parent_pool transfer_pool; + struct slab_parent_pool transfer_pool; - uint64_t gmem_base; - uint32_t gmemsize_bytes; - uint32_t device_id; - uint32_t gpu_id; /* 220, 305, etc */ - uint32_t chip_id; /* coreid:8 majorrev:8 minorrev:8 patch:8 */ - uint32_t max_freq; - uint32_t ram_size; - uint32_t max_rts; /* max # of render targets */ - uint32_t priority_mask; - bool has_timestamp; - bool has_robustness; - bool has_syncobj; + uint64_t gmem_base; + uint32_t gmemsize_bytes; + uint32_t device_id; + uint32_t gpu_id; /* 220, 305, etc */ + uint32_t chip_id; /* coreid:8 majorrev:8 minorrev:8 patch:8 */ + uint32_t max_freq; + uint32_t ram_size; + uint32_t max_rts; /* max # of render targets */ + uint32_t priority_mask; + bool has_timestamp; + bool has_robustness; + bool has_syncobj; - struct freedreno_dev_info info; + struct freedreno_dev_info info; - /* Bitmask of gmem_reasons that do not force GMEM path over bypass - * for current generation. - */ - enum fd_gmem_reason gmem_reason_mask; + /* Bitmask of gmem_reasons that do not force GMEM path over bypass + * for current generation. + */ + enum fd_gmem_reason gmem_reason_mask; - unsigned num_perfcntr_groups; - const struct fd_perfcntr_group *perfcntr_groups; + unsigned num_perfcntr_groups; + const struct fd_perfcntr_group *perfcntr_groups; - /* generated at startup from the perfcntr groups: */ - unsigned num_perfcntr_queries; - struct pipe_driver_query_info *perfcntr_queries; + /* generated at startup from the perfcntr groups: */ + unsigned num_perfcntr_queries; + struct pipe_driver_query_info *perfcntr_queries; - void *compiler; /* currently unused for a2xx */ - struct util_queue compile_queue; /* currently unused for a2xx */ + void *compiler; /* currently unused for a2xx */ + struct util_queue compile_queue; /* currently unused for a2xx */ - struct fd_device *dev; + struct fd_device *dev; - /* NOTE: we still need a pipe associated with the screen in a few - * places, like screen->get_timestamp(). For anything context - * related, use ctx->pipe instead. - */ - struct fd_pipe *pipe; + /* NOTE: we still need a pipe associated with the screen in a few + * places, like screen->get_timestamp(). For anything context + * related, use ctx->pipe instead. 
+ */ + struct fd_pipe *pipe; - uint32_t (*setup_slices)(struct fd_resource *rsc); - unsigned (*tile_mode)(const struct pipe_resource *prsc); - int (*layout_resource_for_modifier)(struct fd_resource *rsc, uint64_t modifier); + uint32_t (*setup_slices)(struct fd_resource *rsc); + unsigned (*tile_mode)(const struct pipe_resource *prsc); + int (*layout_resource_for_modifier)(struct fd_resource *rsc, + uint64_t modifier); - /* indirect-branch emit: */ - void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringbuffer *target); + /* indirect-branch emit: */ + void (*emit_ib)(struct fd_ringbuffer *ring, struct fd_ringbuffer *target); - /* simple gpu "memcpy": */ - void (*mem_to_mem)(struct fd_ringbuffer *ring, struct pipe_resource *dst, - unsigned dst_off, struct pipe_resource *src, unsigned src_off, - unsigned sizedwords); + /* simple gpu "memcpy": */ + void (*mem_to_mem)(struct fd_ringbuffer *ring, struct pipe_resource *dst, + unsigned dst_off, struct pipe_resource *src, + unsigned src_off, unsigned sizedwords); - int64_t cpu_gpu_time_delta; + int64_t cpu_gpu_time_delta; - struct fd_batch_cache batch_cache; - struct fd_gmem_cache gmem_cache; + struct fd_batch_cache batch_cache; + struct fd_gmem_cache gmem_cache; - bool reorder; + bool reorder; - uint16_t rsc_seqno; - uint16_t ctx_seqno; + uint16_t rsc_seqno; + uint16_t ctx_seqno; - unsigned num_supported_modifiers; - const uint64_t *supported_modifiers; + unsigned num_supported_modifiers; + const uint64_t *supported_modifiers; - struct renderonly *ro; + struct renderonly *ro; - /* when BATCH_DEBUG is enabled, tracking for fd_batch's which are not yet - * freed: - */ - struct set *live_batches; + /* when BATCH_DEBUG is enabled, tracking for fd_batch's which are not yet + * freed: + */ + struct set *live_batches; }; static inline struct fd_screen * fd_screen(struct pipe_screen *pscreen) { - return (struct fd_screen *)pscreen; + return (struct fd_screen *)pscreen; } static inline void fd_screen_lock(struct fd_screen *screen) { - simple_mtx_lock(&screen->lock); + simple_mtx_lock(&screen->lock); } static inline void fd_screen_unlock(struct fd_screen *screen) { - simple_mtx_unlock(&screen->lock); + simple_mtx_unlock(&screen->lock); } static inline void fd_screen_assert_locked(struct fd_screen *screen) { - simple_mtx_assert_locked(&screen->lock); + simple_mtx_assert_locked(&screen->lock); } -bool fd_screen_bo_get_handle(struct pipe_screen *pscreen, - struct fd_bo *bo, - struct renderonly_scanout *scanout, - unsigned stride, - struct winsys_handle *whandle); -struct fd_bo * fd_screen_bo_from_handle(struct pipe_screen *pscreen, - struct winsys_handle *whandle); +bool fd_screen_bo_get_handle(struct pipe_screen *pscreen, struct fd_bo *bo, + struct renderonly_scanout *scanout, + unsigned stride, struct winsys_handle *whandle); +struct fd_bo *fd_screen_bo_from_handle(struct pipe_screen *pscreen, + struct winsys_handle *whandle); -struct pipe_screen * -fd_screen_create(struct fd_device *dev, struct renderonly *ro); +struct pipe_screen *fd_screen_create(struct fd_device *dev, + struct renderonly *ro); static inline boolean is_a20x(struct fd_screen *screen) { - return (screen->gpu_id >= 200) && (screen->gpu_id < 210); + return (screen->gpu_id >= 200) && (screen->gpu_id < 210); } static inline boolean is_a2xx(struct fd_screen *screen) { - return (screen->gpu_id >= 200) && (screen->gpu_id < 300); + return (screen->gpu_id >= 200) && (screen->gpu_id < 300); } /* is a3xx patch revision 0? 
*/ @@ -200,50 +199,51 @@ is_a2xx(struct fd_screen *screen) static inline boolean is_a3xx_p0(struct fd_screen *screen) { - return (screen->chip_id & 0xff0000ff) == 0x03000000; + return (screen->chip_id & 0xff0000ff) == 0x03000000; } static inline boolean is_a3xx(struct fd_screen *screen) { - return (screen->gpu_id >= 300) && (screen->gpu_id < 400); + return (screen->gpu_id >= 300) && (screen->gpu_id < 400); } static inline boolean is_a4xx(struct fd_screen *screen) { - return (screen->gpu_id >= 400) && (screen->gpu_id < 500); + return (screen->gpu_id >= 400) && (screen->gpu_id < 500); } static inline boolean is_a5xx(struct fd_screen *screen) { - return (screen->gpu_id >= 500) && (screen->gpu_id < 600); + return (screen->gpu_id >= 500) && (screen->gpu_id < 600); } static inline boolean is_a6xx(struct fd_screen *screen) { - return (screen->gpu_id >= 600) && (screen->gpu_id < 700); + return (screen->gpu_id >= 600) && (screen->gpu_id < 700); } static inline boolean is_a650(struct fd_screen *screen) { - return screen->gpu_id == 650; + return screen->gpu_id == 650; } /* is it using the ir3 compiler (shader isa introduced with a3xx)? */ static inline boolean is_ir3(struct fd_screen *screen) { - return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || is_a6xx(screen); + return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen) || + is_a6xx(screen); } static inline bool has_compute(struct fd_screen *screen) { - return is_a5xx(screen) || is_a6xx(screen); + return is_a5xx(screen) || is_a6xx(screen); } #endif /* FREEDRENO_SCREEN_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index ab008cf..8e12fdb 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -26,16 +26,16 @@ #include "pipe/p_state.h" #include "util/u_dual_blend.h" -#include "util/u_string.h" -#include "util/u_memory.h" #include "util/u_helpers.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "freedreno_state.h" #include "freedreno_context.h" -#include "freedreno_resource.h" -#include "freedreno_texture.h" #include "freedreno_gmem.h" #include "freedreno_query_hw.h" +#include "freedreno_resource.h" +#include "freedreno_state.h" +#include "freedreno_texture.h" #include "freedreno_util.h" /* All the generic state handling.. 
In case of CSO's that are specific @@ -44,67 +44,61 @@ */ static void -update_draw_cost(struct fd_context *ctx) - assert_dt +update_draw_cost(struct fd_context *ctx) assert_dt { - struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - - ctx->draw_cost = pfb->nr_cbufs; - for (unsigned i = 0; i < pfb->nr_cbufs; i++) - if (fd_blend_enabled(ctx, i)) - ctx->draw_cost++; - if (fd_depth_enabled(ctx)) - ctx->draw_cost++; - if (fd_depth_write_enabled(ctx)) - ctx->draw_cost++; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + + ctx->draw_cost = pfb->nr_cbufs; + for (unsigned i = 0; i < pfb->nr_cbufs; i++) + if (fd_blend_enabled(ctx, i)) + ctx->draw_cost++; + if (fd_depth_enabled(ctx)) + ctx->draw_cost++; + if (fd_depth_write_enabled(ctx)) + ctx->draw_cost++; } static void fd_set_blend_color(struct pipe_context *pctx, - const struct pipe_blend_color *blend_color) - in_dt + const struct pipe_blend_color *blend_color) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->blend_color = *blend_color; - fd_context_dirty(ctx, FD_DIRTY_BLEND_COLOR); + struct fd_context *ctx = fd_context(pctx); + ctx->blend_color = *blend_color; + fd_context_dirty(ctx, FD_DIRTY_BLEND_COLOR); } static void fd_set_stencil_ref(struct pipe_context *pctx, - const struct pipe_stencil_ref stencil_ref) - in_dt + const struct pipe_stencil_ref stencil_ref) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->stencil_ref = stencil_ref; - fd_context_dirty(ctx, FD_DIRTY_STENCIL_REF); + struct fd_context *ctx = fd_context(pctx); + ctx->stencil_ref = stencil_ref; + fd_context_dirty(ctx, FD_DIRTY_STENCIL_REF); } static void fd_set_clip_state(struct pipe_context *pctx, - const struct pipe_clip_state *clip) - in_dt + const struct pipe_clip_state *clip) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->ucp = *clip; - fd_context_dirty(ctx, FD_DIRTY_UCP); + struct fd_context *ctx = fd_context(pctx); + ctx->ucp = *clip; + fd_context_dirty(ctx, FD_DIRTY_UCP); } static void -fd_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask) - in_dt +fd_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->sample_mask = (uint16_t)sample_mask; - fd_context_dirty(ctx, FD_DIRTY_SAMPLE_MASK); + struct fd_context *ctx = fd_context(pctx); + ctx->sample_mask = (uint16_t)sample_mask; + fd_context_dirty(ctx, FD_DIRTY_SAMPLE_MASK); } static void -fd_set_min_samples(struct pipe_context *pctx, unsigned min_samples) - in_dt +fd_set_min_samples(struct pipe_context *pctx, unsigned min_samples) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->min_samples = min_samples; - fd_context_dirty(ctx, FD_DIRTY_MIN_SAMPLES); + struct fd_context *ctx = fd_context(pctx); + ctx->min_samples = min_samples; + fd_context_dirty(ctx, FD_DIRTY_MIN_SAMPLES); } /* notes from calim on #dri-devel: @@ -116,613 +110,584 @@ fd_set_min_samples(struct pipe_context *pctx, unsigned min_samples) * index>0 will be UBO's.. 
well, I'll worry about that later */ static void -fd_set_constant_buffer(struct pipe_context *pctx, - enum pipe_shader_type shader, uint index, - bool take_ownership, - const struct pipe_constant_buffer *cb) - in_dt +fd_set_constant_buffer(struct pipe_context *pctx, enum pipe_shader_type shader, + uint index, bool take_ownership, + const struct pipe_constant_buffer *cb) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_constbuf_stateobj *so = &ctx->constbuf[shader]; + struct fd_context *ctx = fd_context(pctx); + struct fd_constbuf_stateobj *so = &ctx->constbuf[shader]; - util_copy_constant_buffer(&so->cb[index], cb, take_ownership); + util_copy_constant_buffer(&so->cb[index], cb, take_ownership); - /* Note that gallium frontends can unbind constant buffers by - * passing NULL here. - */ - if (unlikely(!cb)) { - so->enabled_mask &= ~(1 << index); - return; - } + /* Note that gallium frontends can unbind constant buffers by + * passing NULL here. + */ + if (unlikely(!cb)) { + so->enabled_mask &= ~(1 << index); + return; + } - so->enabled_mask |= 1 << index; + so->enabled_mask |= 1 << index; - fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_CONST); - fd_resource_set_usage(cb->buffer, FD_DIRTY_CONST); + fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_CONST); + fd_resource_set_usage(cb->buffer, FD_DIRTY_CONST); - if (index > 0) { - assert(!cb->user_buffer); - ctx->dirty |= FD_DIRTY_RESOURCE; - } + if (index > 0) { + assert(!cb->user_buffer); + ctx->dirty |= FD_DIRTY_RESOURCE; + } } static void -fd_set_shader_buffers(struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start, unsigned count, - const struct pipe_shader_buffer *buffers, - unsigned writable_bitmask) - in_dt +fd_set_shader_buffers(struct pipe_context *pctx, enum pipe_shader_type shader, + unsigned start, unsigned count, + const struct pipe_shader_buffer *buffers, + unsigned writable_bitmask) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_shaderbuf_stateobj *so = &ctx->shaderbuf[shader]; - const unsigned modified_bits = u_bit_consecutive(start, count); + struct fd_context *ctx = fd_context(pctx); + struct fd_shaderbuf_stateobj *so = &ctx->shaderbuf[shader]; + const unsigned modified_bits = u_bit_consecutive(start, count); - so->enabled_mask &= ~modified_bits; - so->writable_mask &= ~modified_bits; - so->writable_mask |= writable_bitmask << start; + so->enabled_mask &= ~modified_bits; + so->writable_mask &= ~modified_bits; + so->writable_mask |= writable_bitmask << start; - for (unsigned i = 0; i < count; i++) { - unsigned n = i + start; - struct pipe_shader_buffer *buf = &so->sb[n]; + for (unsigned i = 0; i < count; i++) { + unsigned n = i + start; + struct pipe_shader_buffer *buf = &so->sb[n]; - if (buffers && buffers[i].buffer) { - if ((buf->buffer == buffers[i].buffer) && - (buf->buffer_offset == buffers[i].buffer_offset) && - (buf->buffer_size == buffers[i].buffer_size)) - continue; + if (buffers && buffers[i].buffer) { + if ((buf->buffer == buffers[i].buffer) && + (buf->buffer_offset == buffers[i].buffer_offset) && + (buf->buffer_size == buffers[i].buffer_size)) + continue; - buf->buffer_offset = buffers[i].buffer_offset; - buf->buffer_size = buffers[i].buffer_size; - pipe_resource_reference(&buf->buffer, buffers[i].buffer); + buf->buffer_offset = buffers[i].buffer_offset; + buf->buffer_size = buffers[i].buffer_size; + pipe_resource_reference(&buf->buffer, buffers[i].buffer); - fd_resource_set_usage(buffers[i].buffer, FD_DIRTY_SSBO); + 
fd_resource_set_usage(buffers[i].buffer, FD_DIRTY_SSBO); - so->enabled_mask |= BIT(n); - } else { - pipe_resource_reference(&buf->buffer, NULL); - } - } + so->enabled_mask |= BIT(n); + } else { + pipe_resource_reference(&buf->buffer, NULL); + } + } - fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_SSBO); + fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_SSBO); } void -fd_set_shader_images(struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start, unsigned count, - unsigned unbind_num_trailing_slots, - const struct pipe_image_view *images) - in_dt +fd_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader, + unsigned start, unsigned count, + unsigned unbind_num_trailing_slots, + const struct pipe_image_view *images) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader]; + struct fd_context *ctx = fd_context(pctx); + struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader]; - unsigned mask = 0; + unsigned mask = 0; - if (images) { - for (unsigned i = 0; i < count; i++) { - unsigned n = i + start; - struct pipe_image_view *buf = &so->si[n]; + if (images) { + for (unsigned i = 0; i < count; i++) { + unsigned n = i + start; + struct pipe_image_view *buf = &so->si[n]; - if ((buf->resource == images[i].resource) && - (buf->format == images[i].format) && - (buf->access == images[i].access) && - !memcmp(&buf->u, &images[i].u, sizeof(buf->u))) - continue; + if ((buf->resource == images[i].resource) && + (buf->format == images[i].format) && + (buf->access == images[i].access) && + !memcmp(&buf->u, &images[i].u, sizeof(buf->u))) + continue; - mask |= BIT(n); - util_copy_image_view(buf, &images[i]); + mask |= BIT(n); + util_copy_image_view(buf, &images[i]); - if (buf->resource) { - fd_resource_set_usage(buf->resource, FD_DIRTY_IMAGE); - so->enabled_mask |= BIT(n); - } else { - so->enabled_mask &= ~BIT(n); - } - } - } else { - mask = (BIT(count) - 1) << start; + if (buf->resource) { + fd_resource_set_usage(buf->resource, FD_DIRTY_IMAGE); + so->enabled_mask |= BIT(n); + } else { + so->enabled_mask &= ~BIT(n); + } + } + } else { + mask = (BIT(count) - 1) << start; - for (unsigned i = 0; i < count; i++) { - unsigned n = i + start; - struct pipe_image_view *img = &so->si[n]; + for (unsigned i = 0; i < count; i++) { + unsigned n = i + start; + struct pipe_image_view *img = &so->si[n]; - pipe_resource_reference(&img->resource, NULL); - } + pipe_resource_reference(&img->resource, NULL); + } - so->enabled_mask &= ~mask; - } + so->enabled_mask &= ~mask; + } - for (unsigned i = 0; i < unbind_num_trailing_slots; i++) - pipe_resource_reference(&so->si[i + start + count].resource, NULL); + for (unsigned i = 0; i < unbind_num_trailing_slots; i++) + pipe_resource_reference(&so->si[i + start + count].resource, NULL); - so->enabled_mask &= ~(BITFIELD_MASK(unbind_num_trailing_slots) << (start + count)); + so->enabled_mask &= + ~(BITFIELD_MASK(unbind_num_trailing_slots) << (start + count)); - fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_IMAGE); + fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_IMAGE); } static void fd_set_framebuffer_state(struct pipe_context *pctx, - const struct pipe_framebuffer_state *framebuffer) - in_dt + const struct pipe_framebuffer_state *framebuffer) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct pipe_framebuffer_state *cso; + struct fd_context *ctx = fd_context(pctx); + struct pipe_framebuffer_state *cso; - DBG("%ux%u, %u layers, %u samples", - 
framebuffer->width, framebuffer->height, - framebuffer->layers, framebuffer->samples); + DBG("%ux%u, %u layers, %u samples", framebuffer->width, framebuffer->height, + framebuffer->layers, framebuffer->samples); - cso = &ctx->framebuffer; + cso = &ctx->framebuffer; - if (util_framebuffer_state_equal(cso, framebuffer)) - return; + if (util_framebuffer_state_equal(cso, framebuffer)) + return; - /* Do this *after* checking that the framebuffer state is actually - * changing. In the fd_blitter_clear() path, we get a pfb update - * to restore the current pfb state, which should not trigger us - * to flush (as that can cause the batch to be freed at a point - * before fd_clear() returns, but after the point where it expects - * flushes to potentially happen. - */ - fd_context_switch_from(ctx); + /* Do this *after* checking that the framebuffer state is actually + * changing. In the fd_blitter_clear() path, we get a pfb update + * to restore the current pfb state, which should not trigger us + * to flush (as that can cause the batch to be freed at a point + * before fd_clear() returns, but after the point where it expects + * flushes to potentially happen. + */ + fd_context_switch_from(ctx); - util_copy_framebuffer_state(cso, framebuffer); + util_copy_framebuffer_state(cso, framebuffer); - cso->samples = util_framebuffer_get_num_samples(cso); + cso->samples = util_framebuffer_get_num_samples(cso); - if (ctx->screen->reorder) { - struct fd_batch *old_batch = NULL; + if (ctx->screen->reorder) { + struct fd_batch *old_batch = NULL; - fd_batch_reference(&old_batch, ctx->batch); + fd_batch_reference(&old_batch, ctx->batch); - if (likely(old_batch)) - fd_batch_finish_queries(old_batch); + if (likely(old_batch)) + fd_batch_finish_queries(old_batch); - fd_batch_reference(&ctx->batch, NULL); - fd_context_all_dirty(ctx); - ctx->update_active_queries = true; + fd_batch_reference(&ctx->batch, NULL); + fd_context_all_dirty(ctx); + ctx->update_active_queries = true; - if (old_batch && old_batch->blit && !old_batch->back_blit) { - /* for blits, there is not really much point in hanging on - * to the uncommitted batch (ie. you probably don't blit - * multiple times to the same surface), so we might as - * well go ahead and flush this one: - */ - fd_batch_flush(old_batch); - } + if (old_batch && old_batch->blit && !old_batch->back_blit) { + /* for blits, there is not really much point in hanging on + * to the uncommitted batch (ie. 
you probably don't blit + * multiple times to the same surface), so we might as + * well go ahead and flush this one: + */ + fd_batch_flush(old_batch); + } - fd_batch_reference(&old_batch, NULL); - } else if (ctx->batch) { - DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush, - framebuffer->cbufs[0], framebuffer->zsbuf); - fd_batch_flush(ctx->batch); - } + fd_batch_reference(&old_batch, NULL); + } else if (ctx->batch) { + DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush, + framebuffer->cbufs[0], framebuffer->zsbuf); + fd_batch_flush(ctx->batch); + } - fd_context_dirty(ctx, FD_DIRTY_FRAMEBUFFER); + fd_context_dirty(ctx, FD_DIRTY_FRAMEBUFFER); - ctx->disabled_scissor.minx = 0; - ctx->disabled_scissor.miny = 0; - ctx->disabled_scissor.maxx = cso->width; - ctx->disabled_scissor.maxy = cso->height; + ctx->disabled_scissor.minx = 0; + ctx->disabled_scissor.miny = 0; + ctx->disabled_scissor.maxx = cso->width; + ctx->disabled_scissor.maxy = cso->height; - fd_context_dirty(ctx, FD_DIRTY_SCISSOR); - update_draw_cost(ctx); + fd_context_dirty(ctx, FD_DIRTY_SCISSOR); + update_draw_cost(ctx); } static void fd_set_polygon_stipple(struct pipe_context *pctx, - const struct pipe_poly_stipple *stipple) - in_dt + const struct pipe_poly_stipple *stipple) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->stipple = *stipple; - fd_context_dirty(ctx, FD_DIRTY_STIPPLE); + struct fd_context *ctx = fd_context(pctx); + ctx->stipple = *stipple; + fd_context_dirty(ctx, FD_DIRTY_STIPPLE); } static void -fd_set_scissor_states(struct pipe_context *pctx, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *scissor) - in_dt +fd_set_scissor_states(struct pipe_context *pctx, unsigned start_slot, + unsigned num_scissors, + const struct pipe_scissor_state *scissor) in_dt { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - ctx->scissor = *scissor; - fd_context_dirty(ctx, FD_DIRTY_SCISSOR); + ctx->scissor = *scissor; + fd_context_dirty(ctx, FD_DIRTY_SCISSOR); } static void -fd_set_viewport_states(struct pipe_context *pctx, - unsigned start_slot, - unsigned num_viewports, - const struct pipe_viewport_state *viewport) - in_dt +fd_set_viewport_states(struct pipe_context *pctx, unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *viewport) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct pipe_scissor_state *scissor = &ctx->viewport_scissor; - float minx, miny, maxx, maxy; + struct fd_context *ctx = fd_context(pctx); + struct pipe_scissor_state *scissor = &ctx->viewport_scissor; + float minx, miny, maxx, maxy; - ctx->viewport = *viewport; + ctx->viewport = *viewport; - /* see si_get_scissor_from_viewport(): */ + /* see si_get_scissor_from_viewport(): */ - /* Convert (-1, -1) and (1, 1) from clip space into window space. */ - minx = -viewport->scale[0] + viewport->translate[0]; - miny = -viewport->scale[1] + viewport->translate[1]; - maxx = viewport->scale[0] + viewport->translate[0]; - maxy = viewport->scale[1] + viewport->translate[1]; + /* Convert (-1, -1) and (1, 1) from clip space into window space. */ + minx = -viewport->scale[0] + viewport->translate[0]; + miny = -viewport->scale[1] + viewport->translate[1]; + maxx = viewport->scale[0] + viewport->translate[0]; + maxy = viewport->scale[1] + viewport->translate[1]; - /* Handle inverted viewports. */ - if (minx > maxx) { - swap(minx, maxx); - } - if (miny > maxy) { - swap(miny, maxy); - } + /* Handle inverted viewports. 
*/ + if (minx > maxx) { + swap(minx, maxx); + } + if (miny > maxy) { + swap(miny, maxy); + } - const float max_dims = ctx->screen->gpu_id >= 400 ? 16384.f : 4096.f; + const float max_dims = ctx->screen->gpu_id >= 400 ? 16384.f : 4096.f; - /* Clamp, convert to integer and round up the max bounds. */ - scissor->minx = CLAMP(minx, 0.f, max_dims); - scissor->miny = CLAMP(miny, 0.f, max_dims); - scissor->maxx = CLAMP(ceilf(maxx), 0.f, max_dims); - scissor->maxy = CLAMP(ceilf(maxy), 0.f, max_dims); + /* Clamp, convert to integer and round up the max bounds. */ + scissor->minx = CLAMP(minx, 0.f, max_dims); + scissor->miny = CLAMP(miny, 0.f, max_dims); + scissor->maxx = CLAMP(ceilf(maxx), 0.f, max_dims); + scissor->maxy = CLAMP(ceilf(maxy), 0.f, max_dims); - fd_context_dirty(ctx, FD_DIRTY_VIEWPORT); + fd_context_dirty(ctx, FD_DIRTY_VIEWPORT); } static void -fd_set_vertex_buffers(struct pipe_context *pctx, - unsigned start_slot, unsigned count, - unsigned unbind_num_trailing_slots, - bool take_ownership, - const struct pipe_vertex_buffer *vb) - in_dt +fd_set_vertex_buffers(struct pipe_context *pctx, unsigned start_slot, + unsigned count, unsigned unbind_num_trailing_slots, + bool take_ownership, + const struct pipe_vertex_buffer *vb) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_vertexbuf_stateobj *so = &ctx->vtx.vertexbuf; - int i; - - /* on a2xx, pitch is encoded in the vtx fetch instruction, so - * we need to mark VTXSTATE as dirty as well to trigger patching - * and re-emitting the vtx shader: - */ - if (ctx->screen->gpu_id < 300) { - for (i = 0; i < count; i++) { - bool new_enabled = vb && vb[i].buffer.resource; - bool old_enabled = so->vb[i].buffer.resource != NULL; - uint32_t new_stride = vb ? vb[i].stride : 0; - uint32_t old_stride = so->vb[i].stride; - if ((new_enabled != old_enabled) || (new_stride != old_stride)) { - fd_context_dirty(ctx, FD_DIRTY_VTXSTATE); - break; - } - } - } - - util_set_vertex_buffers_mask(so->vb, &so->enabled_mask, vb, start_slot, - count, unbind_num_trailing_slots, - take_ownership); - so->count = util_last_bit(so->enabled_mask); - - if (!vb) - return; - - fd_context_dirty(ctx, FD_DIRTY_VTXBUF); - - for (unsigned i = 0; i < count; i++) { - assert(!vb[i].is_user_buffer); - fd_resource_set_usage(vb[i].buffer.resource, FD_DIRTY_VTXBUF); - } + struct fd_context *ctx = fd_context(pctx); + struct fd_vertexbuf_stateobj *so = &ctx->vtx.vertexbuf; + int i; + + /* on a2xx, pitch is encoded in the vtx fetch instruction, so + * we need to mark VTXSTATE as dirty as well to trigger patching + * and re-emitting the vtx shader: + */ + if (ctx->screen->gpu_id < 300) { + for (i = 0; i < count; i++) { + bool new_enabled = vb && vb[i].buffer.resource; + bool old_enabled = so->vb[i].buffer.resource != NULL; + uint32_t new_stride = vb ? 
vb[i].stride : 0; + uint32_t old_stride = so->vb[i].stride; + if ((new_enabled != old_enabled) || (new_stride != old_stride)) { + fd_context_dirty(ctx, FD_DIRTY_VTXSTATE); + break; + } + } + } + + util_set_vertex_buffers_mask(so->vb, &so->enabled_mask, vb, start_slot, + count, unbind_num_trailing_slots, + take_ownership); + so->count = util_last_bit(so->enabled_mask); + + if (!vb) + return; + + fd_context_dirty(ctx, FD_DIRTY_VTXBUF); + + for (unsigned i = 0; i < count; i++) { + assert(!vb[i].is_user_buffer); + fd_resource_set_usage(vb[i].buffer.resource, FD_DIRTY_VTXBUF); + } } static void -fd_blend_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_blend_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct pipe_blend_state *cso = hwcso; - bool old_is_dual = ctx->blend ? - ctx->blend->rt[0].blend_enable && util_blend_state_is_dual(ctx->blend, 0) : - false; - bool new_is_dual = cso ? - cso->rt[0].blend_enable && util_blend_state_is_dual(cso, 0) : - false; - ctx->blend = hwcso; - fd_context_dirty(ctx, FD_DIRTY_BLEND); - if (old_is_dual != new_is_dual) - fd_context_dirty(ctx, FD_DIRTY_BLEND_DUAL); - update_draw_cost(ctx); + struct fd_context *ctx = fd_context(pctx); + struct pipe_blend_state *cso = hwcso; + bool old_is_dual = ctx->blend ? ctx->blend->rt[0].blend_enable && + util_blend_state_is_dual(ctx->blend, 0) + : false; + bool new_is_dual = + cso ? cso->rt[0].blend_enable && util_blend_state_is_dual(cso, 0) : false; + ctx->blend = hwcso; + fd_context_dirty(ctx, FD_DIRTY_BLEND); + if (old_is_dual != new_is_dual) + fd_context_dirty(ctx, FD_DIRTY_BLEND_DUAL); + update_draw_cost(ctx); } static void -fd_blend_state_delete(struct pipe_context *pctx, void *hwcso) - in_dt +fd_blend_state_delete(struct pipe_context *pctx, void *hwcso) in_dt { - FREE(hwcso); + FREE(hwcso); } static void -fd_rasterizer_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_rasterizer_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct pipe_scissor_state *old_scissor = fd_context_get_scissor(ctx); - bool discard = ctx->rasterizer && ctx->rasterizer->rasterizer_discard; - - ctx->rasterizer = hwcso; - fd_context_dirty(ctx, FD_DIRTY_RASTERIZER); - - if (ctx->rasterizer && ctx->rasterizer->scissor) { - ctx->current_scissor = &ctx->scissor; - } else { - ctx->current_scissor = &ctx->disabled_scissor; - } - - /* if scissor enable bit changed we need to mark scissor - * state as dirty as well: - * NOTE: we can do a shallow compare, since we only care - * if it changed to/from &ctx->disable_scissor - */ - if (old_scissor != fd_context_get_scissor(ctx)) - fd_context_dirty(ctx, FD_DIRTY_SCISSOR); - - if (ctx->rasterizer && (discard != ctx->rasterizer->rasterizer_discard)) - fd_context_dirty(ctx, FD_DIRTY_RASTERIZER_DISCARD); + struct fd_context *ctx = fd_context(pctx); + struct pipe_scissor_state *old_scissor = fd_context_get_scissor(ctx); + bool discard = ctx->rasterizer && ctx->rasterizer->rasterizer_discard; + + ctx->rasterizer = hwcso; + fd_context_dirty(ctx, FD_DIRTY_RASTERIZER); + + if (ctx->rasterizer && ctx->rasterizer->scissor) { + ctx->current_scissor = &ctx->scissor; + } else { + ctx->current_scissor = &ctx->disabled_scissor; + } + + /* if scissor enable bit changed we need to mark scissor + * state as dirty as well: + * NOTE: we can do a shallow compare, since we only care + * if it changed to/from &ctx->disable_scissor + */ + if (old_scissor != fd_context_get_scissor(ctx)) + 
fd_context_dirty(ctx, FD_DIRTY_SCISSOR); + + if (ctx->rasterizer && (discard != ctx->rasterizer->rasterizer_discard)) + fd_context_dirty(ctx, FD_DIRTY_RASTERIZER_DISCARD); } static void -fd_rasterizer_state_delete(struct pipe_context *pctx, void *hwcso) - in_dt +fd_rasterizer_state_delete(struct pipe_context *pctx, void *hwcso) in_dt { - FREE(hwcso); + FREE(hwcso); } static void -fd_zsa_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_zsa_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->zsa = hwcso; - fd_context_dirty(ctx, FD_DIRTY_ZSA); - update_draw_cost(ctx); + struct fd_context *ctx = fd_context(pctx); + ctx->zsa = hwcso; + fd_context_dirty(ctx, FD_DIRTY_ZSA); + update_draw_cost(ctx); } static void -fd_zsa_state_delete(struct pipe_context *pctx, void *hwcso) - in_dt +fd_zsa_state_delete(struct pipe_context *pctx, void *hwcso) in_dt { - FREE(hwcso); + FREE(hwcso); } static void * fd_vertex_state_create(struct pipe_context *pctx, unsigned num_elements, - const struct pipe_vertex_element *elements) + const struct pipe_vertex_element *elements) { - struct fd_vertex_stateobj *so = CALLOC_STRUCT(fd_vertex_stateobj); + struct fd_vertex_stateobj *so = CALLOC_STRUCT(fd_vertex_stateobj); - if (!so) - return NULL; + if (!so) + return NULL; - memcpy(so->pipe, elements, sizeof(*elements) * num_elements); - so->num_elements = num_elements; + memcpy(so->pipe, elements, sizeof(*elements) * num_elements); + so->num_elements = num_elements; - return so; + return so; } static void -fd_vertex_state_delete(struct pipe_context *pctx, void *hwcso) - in_dt +fd_vertex_state_delete(struct pipe_context *pctx, void *hwcso) in_dt { - FREE(hwcso); + FREE(hwcso); } static void -fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso) - in_dt +fd_vertex_state_bind(struct pipe_context *pctx, void *hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->vtx.vtx = hwcso; - fd_context_dirty(ctx, FD_DIRTY_VTXSTATE); + struct fd_context *ctx = fd_context(pctx); + ctx->vtx.vtx = hwcso; + fd_context_dirty(ctx, FD_DIRTY_VTXSTATE); } static struct pipe_stream_output_target * fd_create_stream_output_target(struct pipe_context *pctx, - struct pipe_resource *prsc, unsigned buffer_offset, - unsigned buffer_size) + struct pipe_resource *prsc, + unsigned buffer_offset, unsigned buffer_size) { - struct fd_stream_output_target *target; - struct fd_resource *rsc = fd_resource(prsc); + struct fd_stream_output_target *target; + struct fd_resource *rsc = fd_resource(prsc); - target = CALLOC_STRUCT(fd_stream_output_target); - if (!target) - return NULL; + target = CALLOC_STRUCT(fd_stream_output_target); + if (!target) + return NULL; - pipe_reference_init(&target->base.reference, 1); - pipe_resource_reference(&target->base.buffer, prsc); + pipe_reference_init(&target->base.reference, 1); + pipe_resource_reference(&target->base.buffer, prsc); - target->base.context = pctx; - target->base.buffer_offset = buffer_offset; - target->base.buffer_size = buffer_size; + target->base.context = pctx; + target->base.buffer_offset = buffer_offset; + target->base.buffer_size = buffer_size; - target->offset_buf = pipe_buffer_create(pctx->screen, - PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, sizeof(uint32_t)); + target->offset_buf = pipe_buffer_create( + pctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, sizeof(uint32_t)); - assert(rsc->b.b.target == PIPE_BUFFER); - util_range_add(&rsc->b.b, &rsc->valid_buffer_range, - buffer_offset, buffer_offset + buffer_size); + 
assert(rsc->b.b.target == PIPE_BUFFER); + util_range_add(&rsc->b.b, &rsc->valid_buffer_range, buffer_offset, + buffer_offset + buffer_size); - return &target->base; + return &target->base; } static void fd_stream_output_target_destroy(struct pipe_context *pctx, - struct pipe_stream_output_target *target) + struct pipe_stream_output_target *target) { - struct fd_stream_output_target *cso = fd_stream_output_target(target); + struct fd_stream_output_target *cso = fd_stream_output_target(target); - pipe_resource_reference(&cso->base.buffer, NULL); - pipe_resource_reference(&cso->offset_buf, NULL); + pipe_resource_reference(&cso->base.buffer, NULL); + pipe_resource_reference(&cso->offset_buf, NULL); - FREE(target); + FREE(target); } static void -fd_set_stream_output_targets(struct pipe_context *pctx, - unsigned num_targets, struct pipe_stream_output_target **targets, - const unsigned *offsets) - in_dt +fd_set_stream_output_targets(struct pipe_context *pctx, unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_streamout_stateobj *so = &ctx->streamout; - unsigned i; + struct fd_context *ctx = fd_context(pctx); + struct fd_streamout_stateobj *so = &ctx->streamout; + unsigned i; - debug_assert(num_targets <= ARRAY_SIZE(so->targets)); + debug_assert(num_targets <= ARRAY_SIZE(so->targets)); - /* Older targets need sw stats enabled for streamout emulation in VS: */ - if (ctx->screen->gpu_id < 500) { - if (num_targets && !so->num_targets) { - ctx->stats_users++; - } else if (so->num_targets && !num_targets) { - ctx->stats_users--; - } - } + /* Older targets need sw stats enabled for streamout emulation in VS: */ + if (ctx->screen->gpu_id < 500) { + if (num_targets && !so->num_targets) { + ctx->stats_users++; + } else if (so->num_targets && !num_targets) { + ctx->stats_users--; + } + } - for (i = 0; i < num_targets; i++) { - boolean changed = targets[i] != so->targets[i]; - boolean reset = (offsets[i] != (unsigned)-1); + for (i = 0; i < num_targets; i++) { + boolean changed = targets[i] != so->targets[i]; + boolean reset = (offsets[i] != (unsigned)-1); - so->reset |= (reset << i); + so->reset |= (reset << i); - if (!changed && !reset) - continue; + if (!changed && !reset) + continue; - /* Note that all SO targets will be reset at once at a - * BeginTransformFeedback(). - */ - if (reset) { - so->offsets[i] = offsets[i]; - ctx->streamout.verts_written = 0; - } + /* Note that all SO targets will be reset at once at a + * BeginTransformFeedback(). 
+ */ + if (reset) { + so->offsets[i] = offsets[i]; + ctx->streamout.verts_written = 0; + } - pipe_so_target_reference(&so->targets[i], targets[i]); - } + pipe_so_target_reference(&so->targets[i], targets[i]); + } - for (; i < so->num_targets; i++) { - pipe_so_target_reference(&so->targets[i], NULL); - } + for (; i < so->num_targets; i++) { + pipe_so_target_reference(&so->targets[i], NULL); + } - so->num_targets = num_targets; + so->num_targets = num_targets; - fd_context_dirty(ctx, FD_DIRTY_STREAMOUT); + fd_context_dirty(ctx, FD_DIRTY_STREAMOUT); } static void -fd_bind_compute_state(struct pipe_context *pctx, void *state) - in_dt +fd_bind_compute_state(struct pipe_context *pctx, void *state) in_dt { - struct fd_context *ctx = fd_context(pctx); - ctx->compute = state; - /* NOTE: Don't mark FD_DIRTY_PROG for compute specific state */ - ctx->dirty_shader[PIPE_SHADER_COMPUTE] |= FD_DIRTY_SHADER_PROG; + struct fd_context *ctx = fd_context(pctx); + ctx->compute = state; + /* NOTE: Don't mark FD_DIRTY_PROG for compute specific state */ + ctx->dirty_shader[PIPE_SHADER_COMPUTE] |= FD_DIRTY_SHADER_PROG; } static void -fd_set_compute_resources(struct pipe_context *pctx, - unsigned start, unsigned count, struct pipe_surface **prscs) - in_dt +fd_set_compute_resources(struct pipe_context *pctx, unsigned start, + unsigned count, struct pipe_surface **prscs) in_dt { - // TODO + // TODO } /* used by clover to bind global objects, returning the bo address * via handles[n] */ static void -fd_set_global_binding(struct pipe_context *pctx, - unsigned first, unsigned count, struct pipe_resource **prscs, - uint32_t **handles) - in_dt +fd_set_global_binding(struct pipe_context *pctx, unsigned first, unsigned count, + struct pipe_resource **prscs, uint32_t **handles) in_dt { - struct fd_context *ctx = fd_context(pctx); - struct fd_global_bindings_stateobj *so = &ctx->global_bindings; - unsigned mask = 0; - - if (prscs) { - for (unsigned i = 0; i < count; i++) { - unsigned n = i + first; - - mask |= BIT(n); - - pipe_resource_reference(&so->buf[n], prscs[i]); - - if (so->buf[n]) { - struct fd_resource *rsc = fd_resource(so->buf[n]); - uint64_t iova = fd_bo_get_iova(rsc->bo); - // TODO need to scream if iova > 32b or fix gallium API.. - *handles[i] += iova; - } - - if (prscs[i]) - so->enabled_mask |= BIT(n); - else - so->enabled_mask &= ~BIT(n); - } - } else { - mask = (BIT(count) - 1) << first; - - for (unsigned i = 0; i < count; i++) { - unsigned n = i + first; - pipe_resource_reference(&so->buf[n], NULL); - } - - so->enabled_mask &= ~mask; - } - + struct fd_context *ctx = fd_context(pctx); + struct fd_global_bindings_stateobj *so = &ctx->global_bindings; + unsigned mask = 0; + + if (prscs) { + for (unsigned i = 0; i < count; i++) { + unsigned n = i + first; + + mask |= BIT(n); + + pipe_resource_reference(&so->buf[n], prscs[i]); + + if (so->buf[n]) { + struct fd_resource *rsc = fd_resource(so->buf[n]); + uint64_t iova = fd_bo_get_iova(rsc->bo); + // TODO need to scream if iova > 32b or fix gallium API.. 
+ *handles[i] += iova; + } + + if (prscs[i]) + so->enabled_mask |= BIT(n); + else + so->enabled_mask &= ~BIT(n); + } + } else { + mask = (BIT(count) - 1) << first; + + for (unsigned i = 0; i < count; i++) { + unsigned n = i + first; + pipe_resource_reference(&so->buf[n], NULL); + } + + so->enabled_mask &= ~mask; + } } void fd_state_init(struct pipe_context *pctx) { - pctx->set_blend_color = fd_set_blend_color; - pctx->set_stencil_ref = fd_set_stencil_ref; - pctx->set_clip_state = fd_set_clip_state; - pctx->set_sample_mask = fd_set_sample_mask; - pctx->set_min_samples = fd_set_min_samples; - pctx->set_constant_buffer = fd_set_constant_buffer; - pctx->set_shader_buffers = fd_set_shader_buffers; - pctx->set_shader_images = fd_set_shader_images; - pctx->set_framebuffer_state = fd_set_framebuffer_state; - pctx->set_polygon_stipple = fd_set_polygon_stipple; - pctx->set_scissor_states = fd_set_scissor_states; - pctx->set_viewport_states = fd_set_viewport_states; - - pctx->set_vertex_buffers = fd_set_vertex_buffers; - - pctx->bind_blend_state = fd_blend_state_bind; - pctx->delete_blend_state = fd_blend_state_delete; - - pctx->bind_rasterizer_state = fd_rasterizer_state_bind; - pctx->delete_rasterizer_state = fd_rasterizer_state_delete; - - pctx->bind_depth_stencil_alpha_state = fd_zsa_state_bind; - pctx->delete_depth_stencil_alpha_state = fd_zsa_state_delete; - - if (!pctx->create_vertex_elements_state) - pctx->create_vertex_elements_state = fd_vertex_state_create; - pctx->delete_vertex_elements_state = fd_vertex_state_delete; - pctx->bind_vertex_elements_state = fd_vertex_state_bind; - - pctx->create_stream_output_target = fd_create_stream_output_target; - pctx->stream_output_target_destroy = fd_stream_output_target_destroy; - pctx->set_stream_output_targets = fd_set_stream_output_targets; - - if (has_compute(fd_screen(pctx->screen))) { - pctx->bind_compute_state = fd_bind_compute_state; - pctx->set_compute_resources = fd_set_compute_resources; - pctx->set_global_binding = fd_set_global_binding; - } + pctx->set_blend_color = fd_set_blend_color; + pctx->set_stencil_ref = fd_set_stencil_ref; + pctx->set_clip_state = fd_set_clip_state; + pctx->set_sample_mask = fd_set_sample_mask; + pctx->set_min_samples = fd_set_min_samples; + pctx->set_constant_buffer = fd_set_constant_buffer; + pctx->set_shader_buffers = fd_set_shader_buffers; + pctx->set_shader_images = fd_set_shader_images; + pctx->set_framebuffer_state = fd_set_framebuffer_state; + pctx->set_polygon_stipple = fd_set_polygon_stipple; + pctx->set_scissor_states = fd_set_scissor_states; + pctx->set_viewport_states = fd_set_viewport_states; + + pctx->set_vertex_buffers = fd_set_vertex_buffers; + + pctx->bind_blend_state = fd_blend_state_bind; + pctx->delete_blend_state = fd_blend_state_delete; + + pctx->bind_rasterizer_state = fd_rasterizer_state_bind; + pctx->delete_rasterizer_state = fd_rasterizer_state_delete; + + pctx->bind_depth_stencil_alpha_state = fd_zsa_state_bind; + pctx->delete_depth_stencil_alpha_state = fd_zsa_state_delete; + + if (!pctx->create_vertex_elements_state) + pctx->create_vertex_elements_state = fd_vertex_state_create; + pctx->delete_vertex_elements_state = fd_vertex_state_delete; + pctx->bind_vertex_elements_state = fd_vertex_state_bind; + + pctx->create_stream_output_target = fd_create_stream_output_target; + pctx->stream_output_target_destroy = fd_stream_output_target_destroy; + pctx->set_stream_output_targets = fd_set_stream_output_targets; + + if (has_compute(fd_screen(pctx->screen))) { + pctx->bind_compute_state = 
fd_bind_compute_state; + pctx->set_compute_resources = fd_set_compute_resources; + pctx->set_global_binding = fd_set_global_binding; + } } diff --git a/src/gallium/drivers/freedreno/freedreno_state.h b/src/gallium/drivers/freedreno/freedreno_state.h index b6ba3fb..255049b 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.h +++ b/src/gallium/drivers/freedreno/freedreno_state.h @@ -30,41 +30,41 @@ #include "pipe/p_context.h" #include "freedreno_context.h" -static inline bool fd_depth_enabled(struct fd_context *ctx) - assert_dt +static inline bool +fd_depth_enabled(struct fd_context *ctx) assert_dt { - return ctx->zsa && ctx->zsa->depth_enabled; + return ctx->zsa && ctx->zsa->depth_enabled; } -static inline bool fd_depth_write_enabled(struct fd_context *ctx) - assert_dt +static inline bool +fd_depth_write_enabled(struct fd_context *ctx) assert_dt { - return ctx->zsa && ctx->zsa->depth_writemask; + return ctx->zsa && ctx->zsa->depth_writemask; } -static inline bool fd_stencil_enabled(struct fd_context *ctx) - assert_dt +static inline bool +fd_stencil_enabled(struct fd_context *ctx) assert_dt { - return ctx->zsa && ctx->zsa->stencil[0].enabled; + return ctx->zsa && ctx->zsa->stencil[0].enabled; } -static inline bool fd_blend_enabled(struct fd_context *ctx, unsigned n) - assert_dt +static inline bool +fd_blend_enabled(struct fd_context *ctx, unsigned n) assert_dt { - return ctx->blend && ctx->blend->rt[n].blend_enable; + return ctx->blend && ctx->blend->rt[n].blend_enable; } -static inline bool fd_depth_clamp_enabled(struct fd_context *ctx) - assert_dt +static inline bool +fd_depth_clamp_enabled(struct fd_context *ctx) assert_dt { - return !(ctx->rasterizer->depth_clip_near && ctx->rasterizer->depth_clip_far); + return !(ctx->rasterizer->depth_clip_near && + ctx->rasterizer->depth_clip_far); } void fd_set_shader_images(struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start, unsigned count, - unsigned unbind_num_trailing_slots, - const struct pipe_image_view *images); + enum pipe_shader_type shader, unsigned start, + unsigned count, unsigned unbind_num_trailing_slots, + const struct pipe_image_view *images); void fd_state_init(struct pipe_context *pctx); diff --git a/src/gallium/drivers/freedreno/freedreno_surface.c b/src/gallium/drivers/freedreno/freedreno_surface.c index 3a6617c..1fbc09e 100644 --- a/src/gallium/drivers/freedreno/freedreno_surface.c +++ b/src/gallium/drivers/freedreno/freedreno_surface.c @@ -28,47 +28,45 @@ #include "freedreno_resource.h" #include "freedreno_util.h" -#include "util/u_memory.h" #include "util/u_inlines.h" +#include "util/u_memory.h" struct pipe_surface * -fd_create_surface(struct pipe_context *pctx, - struct pipe_resource *ptex, - const struct pipe_surface *surf_tmpl) +fd_create_surface(struct pipe_context *pctx, struct pipe_resource *ptex, + const struct pipe_surface *surf_tmpl) { - struct fd_surface* surface = CALLOC_STRUCT(fd_surface); - - if (!surface) - return NULL; + struct fd_surface *surface = CALLOC_STRUCT(fd_surface); + if (!surface) + return NULL; - struct pipe_surface *psurf = &surface->base; - unsigned level = surf_tmpl->u.tex.level; + struct pipe_surface *psurf = &surface->base; + unsigned level = surf_tmpl->u.tex.level; - pipe_reference_init(&psurf->reference, 1); - pipe_resource_reference(&psurf->texture, ptex); + pipe_reference_init(&psurf->reference, 1); + pipe_resource_reference(&psurf->texture, ptex); - psurf->context = pctx; - psurf->format = surf_tmpl->format; - psurf->width = u_minify(ptex->width0, level); - 
psurf->height = u_minify(ptex->height0, level); - psurf->nr_samples = surf_tmpl->nr_samples; + psurf->context = pctx; + psurf->format = surf_tmpl->format; + psurf->width = u_minify(ptex->width0, level); + psurf->height = u_minify(ptex->height0, level); + psurf->nr_samples = surf_tmpl->nr_samples; - if (ptex->target == PIPE_BUFFER) { - psurf->u.buf.first_element = surf_tmpl->u.buf.first_element; - psurf->u.buf.last_element = surf_tmpl->u.buf.last_element; - } else { - psurf->u.tex.level = level; - psurf->u.tex.first_layer = surf_tmpl->u.tex.first_layer; - psurf->u.tex.last_layer = surf_tmpl->u.tex.last_layer; - } + if (ptex->target == PIPE_BUFFER) { + psurf->u.buf.first_element = surf_tmpl->u.buf.first_element; + psurf->u.buf.last_element = surf_tmpl->u.buf.last_element; + } else { + psurf->u.tex.level = level; + psurf->u.tex.first_layer = surf_tmpl->u.tex.first_layer; + psurf->u.tex.last_layer = surf_tmpl->u.tex.last_layer; + } - return &surface->base; + return &surface->base; } void fd_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf) { - pipe_resource_reference(&psurf->texture, NULL); - FREE(psurf); + pipe_resource_reference(&psurf->texture, NULL); + FREE(psurf); } diff --git a/src/gallium/drivers/freedreno/freedreno_surface.h b/src/gallium/drivers/freedreno/freedreno_surface.h index 8898f89..2c2e24e 100644 --- a/src/gallium/drivers/freedreno/freedreno_surface.h +++ b/src/gallium/drivers/freedreno/freedreno_surface.h @@ -30,18 +30,18 @@ #include "pipe/p_state.h" struct fd_surface { - struct pipe_surface base; + struct pipe_surface base; }; static inline struct fd_surface * fd_surface(struct pipe_surface *psurf) { - return (struct fd_surface *)psurf; + return (struct fd_surface *)psurf; } -struct pipe_surface* fd_create_surface(struct pipe_context *pctx, - struct pipe_resource *ptex, - const struct pipe_surface *surf_tmpl); +struct pipe_surface *fd_create_surface(struct pipe_context *pctx, + struct pipe_resource *ptex, + const struct pipe_surface *surf_tmpl); void fd_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf); #endif /* FREEDRENO_SURFACE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_texture.c b/src/gallium/drivers/freedreno/freedreno_texture.c index 6868c2c..1703a5a 100644 --- a/src/gallium/drivers/freedreno/freedreno_texture.c +++ b/src/gallium/drivers/freedreno/freedreno_texture.c @@ -25,158 +25,158 @@ */ #include "pipe/p_state.h" -#include "util/u_string.h" -#include "util/u_memory.h" #include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" -#include "freedreno_texture.h" #include "freedreno_context.h" #include "freedreno_resource.h" +#include "freedreno_texture.h" #include "freedreno_util.h" static void fd_sampler_state_delete(struct pipe_context *pctx, void *hwcso) { - FREE(hwcso); + FREE(hwcso); } static void fd_sampler_view_destroy(struct pipe_context *pctx, - struct pipe_sampler_view *view) + struct pipe_sampler_view *view) { - pipe_resource_reference(&view->texture, NULL); - FREE(view); + pipe_resource_reference(&view->texture, NULL); + FREE(view); } -static void bind_sampler_states(struct fd_texture_stateobj *tex, - unsigned start, unsigned nr, void **hwcso) +static void +bind_sampler_states(struct fd_texture_stateobj *tex, unsigned start, + unsigned nr, void **hwcso) { - unsigned i; - - for (i = 0; i < nr; i++) { - unsigned p = i + start; - tex->samplers[p] = hwcso[i]; - if (tex->samplers[p]) - tex->valid_samplers |= (1 << p); - else - tex->valid_samplers &= ~(1 << p); - } - - 
tex->num_samplers = util_last_bit(tex->valid_samplers); + unsigned i; + + for (i = 0; i < nr; i++) { + unsigned p = i + start; + tex->samplers[p] = hwcso[i]; + if (tex->samplers[p]) + tex->valid_samplers |= (1 << p); + else + tex->valid_samplers &= ~(1 << p); + } + + tex->num_samplers = util_last_bit(tex->valid_samplers); } -static void set_sampler_views(struct fd_texture_stateobj *tex, - unsigned start, unsigned nr, - unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) +static void +set_sampler_views(struct fd_texture_stateobj *tex, unsigned start, unsigned nr, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) { - unsigned i; - unsigned samplers = 0; - - for (i = 0; i < nr; i++) { - struct pipe_sampler_view *view = views ? views[i] : NULL; - unsigned p = i + start; - pipe_sampler_view_reference(&tex->textures[p], view); - if (tex->textures[p]) { - fd_resource_set_usage(tex->textures[p]->texture, FD_DIRTY_TEX); - tex->valid_textures |= (1 << p); - } else { - tex->valid_textures &= ~(1 << p); - } - } - for (; i < nr + unbind_num_trailing_slots; i++) { - unsigned p = i + start; - pipe_sampler_view_reference(&tex->textures[p], NULL); - tex->valid_textures &= ~(1 << p); - } - - tex->num_textures = util_last_bit(tex->valid_textures); - - for (i = 0; i < tex->num_textures; i++) { - uint nr_samples = fd_resource_nr_samples(tex->textures[i]->texture); - samplers |= (nr_samples >> 1) << (i * 2); - } - - tex->samples = samplers; + unsigned i; + unsigned samplers = 0; + + for (i = 0; i < nr; i++) { + struct pipe_sampler_view *view = views ? views[i] : NULL; + unsigned p = i + start; + pipe_sampler_view_reference(&tex->textures[p], view); + if (tex->textures[p]) { + fd_resource_set_usage(tex->textures[p]->texture, FD_DIRTY_TEX); + tex->valid_textures |= (1 << p); + } else { + tex->valid_textures &= ~(1 << p); + } + } + for (; i < nr + unbind_num_trailing_slots; i++) { + unsigned p = i + start; + pipe_sampler_view_reference(&tex->textures[p], NULL); + tex->valid_textures &= ~(1 << p); + } + + tex->num_textures = util_last_bit(tex->valid_textures); + + for (i = 0; i < tex->num_textures; i++) { + uint nr_samples = fd_resource_nr_samples(tex->textures[i]->texture); + samplers |= (nr_samples >> 1) << (i * 2); + } + + tex->samples = samplers; } void -fd_sampler_states_bind(struct pipe_context *pctx, - enum pipe_shader_type shader, unsigned start, - unsigned nr, void **hwcso) - in_dt +fd_sampler_states_bind(struct pipe_context *pctx, enum pipe_shader_type shader, + unsigned start, unsigned nr, void **hwcso) in_dt { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - bind_sampler_states(&ctx->tex[shader], start, nr, hwcso); - fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_TEX); + bind_sampler_states(&ctx->tex[shader], start, nr, hwcso); + fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_TEX); } void fd_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader, - unsigned start, unsigned nr, unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views) - in_dt + unsigned start, unsigned nr, + unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views) in_dt { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - set_sampler_views(&ctx->tex[shader], start, nr, unbind_num_trailing_slots, views); - fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_TEX); + set_sampler_views(&ctx->tex[shader], start, nr, unbind_num_trailing_slots, + views); + 
fd_context_dirty_shader(ctx, shader, FD_DIRTY_SHADER_TEX); } void fd_texture_init(struct pipe_context *pctx) { - if (!pctx->delete_sampler_state) - pctx->delete_sampler_state = fd_sampler_state_delete; - if (!pctx->sampler_view_destroy) - pctx->sampler_view_destroy = fd_sampler_view_destroy; + if (!pctx->delete_sampler_state) + pctx->delete_sampler_state = fd_sampler_state_delete; + if (!pctx->sampler_view_destroy) + pctx->sampler_view_destroy = fd_sampler_view_destroy; } /* helper for setting up border-color buffer for a3xx/a4xx: */ void fd_setup_border_colors(struct fd_texture_stateobj *tex, void *ptr, - unsigned offset) + unsigned offset) { - unsigned i, j; - - for (i = 0; i < tex->num_samplers; i++) { - struct pipe_sampler_state *sampler = tex->samplers[i]; - uint16_t *bcolor = (uint16_t *)((uint8_t *)ptr + - (BORDERCOLOR_SIZE * offset) + - (BORDERCOLOR_SIZE * i)); - uint32_t *bcolor32 = (uint32_t *)&bcolor[16]; - - if (!sampler) - continue; - - /* - * XXX HACK ALERT XXX - * - * The border colors need to be swizzled in a particular - * format-dependent order. Even though samplers don't know about - * formats, we can assume that with a GL state tracker, there's a - * 1:1 correspondence between sampler and texture. Take advantage - * of that knowledge. - */ - if (i < tex->num_textures && tex->textures[i]) { - const struct util_format_description *desc = - util_format_description(tex->textures[i]->format); - for (j = 0; j < 4; j++) { - if (desc->swizzle[j] >= 4) - continue; - - const struct util_format_channel_description *chan = - &desc->channel[desc->swizzle[j]]; - if (chan->pure_integer) { - bcolor32[desc->swizzle[j] + 4] = sampler->border_color.i[j]; - bcolor[desc->swizzle[j] + 8] = sampler->border_color.i[j]; - } else { - bcolor32[desc->swizzle[j]] = fui(sampler->border_color.f[j]); - bcolor[desc->swizzle[j]] = - _mesa_float_to_half(sampler->border_color.f[j]); - } - } - } - } + unsigned i, j; + + for (i = 0; i < tex->num_samplers; i++) { + struct pipe_sampler_state *sampler = tex->samplers[i]; + uint16_t *bcolor = + (uint16_t *)((uint8_t *)ptr + (BORDERCOLOR_SIZE * offset) + + (BORDERCOLOR_SIZE * i)); + uint32_t *bcolor32 = (uint32_t *)&bcolor[16]; + + if (!sampler) + continue; + + /* + * XXX HACK ALERT XXX + * + * The border colors need to be swizzled in a particular + * format-dependent order. Even though samplers don't know about + * formats, we can assume that with a GL state tracker, there's a + * 1:1 correspondence between sampler and texture. Take advantage + * of that knowledge. 
+ */ + if (i < tex->num_textures && tex->textures[i]) { + const struct util_format_description *desc = + util_format_description(tex->textures[i]->format); + for (j = 0; j < 4; j++) { + if (desc->swizzle[j] >= 4) + continue; + + const struct util_format_channel_description *chan = + &desc->channel[desc->swizzle[j]]; + if (chan->pure_integer) { + bcolor32[desc->swizzle[j] + 4] = sampler->border_color.i[j]; + bcolor[desc->swizzle[j] + 8] = sampler->border_color.i[j]; + } else { + bcolor32[desc->swizzle[j]] = fui(sampler->border_color.f[j]); + bcolor[desc->swizzle[j]] = + _mesa_float_to_half(sampler->border_color.f[j]); + } + } + } + } } diff --git a/src/gallium/drivers/freedreno/freedreno_texture.h b/src/gallium/drivers/freedreno/freedreno_texture.h index f38dd9b..7cb523c 100644 --- a/src/gallium/drivers/freedreno/freedreno_texture.h +++ b/src/gallium/drivers/freedreno/freedreno_texture.h @@ -30,14 +30,13 @@ #include "pipe/p_context.h" void fd_sampler_states_bind(struct pipe_context *pctx, - enum pipe_shader_type shader, unsigned start, - unsigned nr, void **hwcso); + enum pipe_shader_type shader, unsigned start, + unsigned nr, void **hwcso); void fd_set_sampler_views(struct pipe_context *pctx, - enum pipe_shader_type shader, - unsigned start, unsigned nr, - unsigned unbind_num_trailing_slots, - struct pipe_sampler_view **views); + enum pipe_shader_type shader, unsigned start, + unsigned nr, unsigned unbind_num_trailing_slots, + struct pipe_sampler_view **views); void fd_texture_init(struct pipe_context *pctx); @@ -70,6 +69,6 @@ struct fd_texture_stateobj; */ #define BORDERCOLOR_SIZE 0x40 void fd_setup_border_colors(struct fd_texture_stateobj *tex, void *ptr, - unsigned offset); + unsigned offset); #endif /* FREEDRENO_TEXTURE_H_ */ diff --git a/src/gallium/drivers/freedreno/freedreno_util.c b/src/gallium/drivers/freedreno/freedreno_util.c index f5bc502..a964656 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.c +++ b/src/gallium/drivers/freedreno/freedreno_util.c @@ -34,35 +34,35 @@ int32_t marker_cnt; enum adreno_rb_depth_format fd_pipe2depth(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_Z16_UNORM: - return DEPTHX_16; - case PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - case PIPE_FORMAT_X8Z24_UNORM: - case PIPE_FORMAT_S8_UINT_Z24_UNORM: - return DEPTHX_24_8; - case PIPE_FORMAT_Z32_FLOAT: - case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: - return DEPTHX_32; - default: - return ~0; - } + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return DEPTHX_16; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return DEPTHX_24_8; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return DEPTHX_32; + default: + return ~0; + } } enum pc_di_index_size fd_pipe2index(enum pipe_format format) { - switch (format) { - case PIPE_FORMAT_I8_UINT: - return INDEX_SIZE_8_BIT; - case PIPE_FORMAT_I16_UINT: - return INDEX_SIZE_16_BIT; - case PIPE_FORMAT_I32_UINT: - return INDEX_SIZE_32_BIT; - default: - return ~0; - } + switch (format) { + case PIPE_FORMAT_I8_UINT: + return INDEX_SIZE_8_BIT; + case PIPE_FORMAT_I16_UINT: + return INDEX_SIZE_16_BIT; + case PIPE_FORMAT_I32_UINT: + return INDEX_SIZE_32_BIT; + default: + return ~0; + } } /* we need to special case a bit the depth/stencil restore, because we are @@ -73,106 +73,106 @@ fd_pipe2index(enum pipe_format format) enum pipe_format fd_gmem_restore_format(enum pipe_format format) { - switch (format) { - case 
PIPE_FORMAT_Z24X8_UNORM: - case PIPE_FORMAT_Z24_UNORM_S8_UINT: - return PIPE_FORMAT_R8G8B8A8_UNORM; - case PIPE_FORMAT_Z16_UNORM: - return PIPE_FORMAT_R8G8_UNORM; - case PIPE_FORMAT_S8_UINT: - return PIPE_FORMAT_R8_UNORM; - default: - return format; - } + switch (format) { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return PIPE_FORMAT_R8G8B8A8_UNORM; + case PIPE_FORMAT_Z16_UNORM: + return PIPE_FORMAT_R8G8_UNORM; + case PIPE_FORMAT_S8_UINT: + return PIPE_FORMAT_R8_UNORM; + default: + return format; + } } enum adreno_rb_blend_factor fd_blend_factor(unsigned factor) { - switch (factor) { - case PIPE_BLENDFACTOR_ONE: - return FACTOR_ONE; - case PIPE_BLENDFACTOR_SRC_COLOR: - return FACTOR_SRC_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA: - return FACTOR_SRC_ALPHA; - case PIPE_BLENDFACTOR_DST_ALPHA: - return FACTOR_DST_ALPHA; - case PIPE_BLENDFACTOR_DST_COLOR: - return FACTOR_DST_COLOR; - case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: - return FACTOR_SRC_ALPHA_SATURATE; - case PIPE_BLENDFACTOR_CONST_COLOR: - return FACTOR_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_CONST_ALPHA: - return FACTOR_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_ZERO: - case 0: - return FACTOR_ZERO; - case PIPE_BLENDFACTOR_INV_SRC_COLOR: - return FACTOR_ONE_MINUS_SRC_COLOR; - case PIPE_BLENDFACTOR_INV_SRC_ALPHA: - return FACTOR_ONE_MINUS_SRC_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_ALPHA: - return FACTOR_ONE_MINUS_DST_ALPHA; - case PIPE_BLENDFACTOR_INV_DST_COLOR: - return FACTOR_ONE_MINUS_DST_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_COLOR: - return FACTOR_ONE_MINUS_CONSTANT_COLOR; - case PIPE_BLENDFACTOR_INV_CONST_ALPHA: - return FACTOR_ONE_MINUS_CONSTANT_ALPHA; - case PIPE_BLENDFACTOR_INV_SRC1_COLOR: - return FACTOR_ONE_MINUS_SRC1_COLOR; - case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: - return FACTOR_ONE_MINUS_SRC1_ALPHA; - case PIPE_BLENDFACTOR_SRC1_COLOR: - return FACTOR_SRC1_COLOR; - case PIPE_BLENDFACTOR_SRC1_ALPHA: - return FACTOR_SRC1_ALPHA; - default: - DBG("invalid blend factor: %x", factor); - return 0; - } + switch (factor) { + case PIPE_BLENDFACTOR_ONE: + return FACTOR_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return FACTOR_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return FACTOR_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return FACTOR_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return FACTOR_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return FACTOR_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return FACTOR_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return FACTOR_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + case 0: + return FACTOR_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return FACTOR_ONE_MINUS_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return FACTOR_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return FACTOR_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return FACTOR_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return FACTOR_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return FACTOR_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return FACTOR_ONE_MINUS_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return FACTOR_ONE_MINUS_SRC1_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return FACTOR_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return FACTOR_SRC1_ALPHA; + default: + DBG("invalid blend factor: %x", factor); + return 0; + } } enum adreno_pa_su_sc_draw fd_polygon_mode(unsigned mode) { - switch (mode) { - case PIPE_POLYGON_MODE_POINT: 
- return PC_DRAW_POINTS; - case PIPE_POLYGON_MODE_LINE: - return PC_DRAW_LINES; - case PIPE_POLYGON_MODE_FILL: - return PC_DRAW_TRIANGLES; - default: - DBG("invalid polygon mode: %u", mode); - return 0; - } + switch (mode) { + case PIPE_POLYGON_MODE_POINT: + return PC_DRAW_POINTS; + case PIPE_POLYGON_MODE_LINE: + return PC_DRAW_LINES; + case PIPE_POLYGON_MODE_FILL: + return PC_DRAW_TRIANGLES; + default: + DBG("invalid polygon mode: %u", mode); + return 0; + } } enum adreno_stencil_op fd_stencil_op(unsigned op) { - switch (op) { - case PIPE_STENCIL_OP_KEEP: - return STENCIL_KEEP; - case PIPE_STENCIL_OP_ZERO: - return STENCIL_ZERO; - case PIPE_STENCIL_OP_REPLACE: - return STENCIL_REPLACE; - case PIPE_STENCIL_OP_INCR: - return STENCIL_INCR_CLAMP; - case PIPE_STENCIL_OP_DECR: - return STENCIL_DECR_CLAMP; - case PIPE_STENCIL_OP_INCR_WRAP: - return STENCIL_INCR_WRAP; - case PIPE_STENCIL_OP_DECR_WRAP: - return STENCIL_DECR_WRAP; - case PIPE_STENCIL_OP_INVERT: - return STENCIL_INVERT; - default: - DBG("invalid stencil op: %u", op); - return 0; - } + switch (op) { + case PIPE_STENCIL_OP_KEEP: + return STENCIL_KEEP; + case PIPE_STENCIL_OP_ZERO: + return STENCIL_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return STENCIL_REPLACE; + case PIPE_STENCIL_OP_INCR: + return STENCIL_INCR_CLAMP; + case PIPE_STENCIL_OP_DECR: + return STENCIL_DECR_CLAMP; + case PIPE_STENCIL_OP_INCR_WRAP: + return STENCIL_INCR_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return STENCIL_DECR_WRAP; + case PIPE_STENCIL_OP_INVERT: + return STENCIL_INVERT; + default: + DBG("invalid stencil op: %u", op); + return 0; + } } diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 90b21a6..dd7658b 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -33,16 +33,16 @@ #include "pipe/p_format.h" #include "pipe/p_state.h" #include "util/compiler.h" +#include "util/half_float.h" #include "util/log.h" #include "util/u_debug.h" -#include "util/u_math.h" -#include "util/half_float.h" #include "util/u_dynarray.h" +#include "util/u_math.h" #include "util/u_pack_color.h" -#include "disasm.h" #include "adreno_common.xml.h" #include "adreno_pm4.xml.h" +#include "disasm.h" enum adreno_rb_depth_format fd_pipe2depth(enum pipe_format format); enum pc_di_index_size fd_pipe2index(enum pipe_format format); @@ -98,51 +98,57 @@ enum fd_debug_flag { extern int fd_mesa_debug; extern bool fd_binning_enabled; -#define FD_DBG(category) unlikely(fd_mesa_debug & FD_DBG_##category) +#define FD_DBG(category) unlikely(fd_mesa_debug &FD_DBG_##category) -#define DBG(fmt, ...) \ - do { if (FD_DBG(MSGS)) \ - mesa_logd("%s:%d: "fmt, \ - __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) +#define DBG(fmt, ...) \ + do { \ + if (FD_DBG(MSGS)) \ + mesa_logd("%s:%d: " fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + } while (0) -#define perf_debug_ctx(ctx, ...) do { \ - if (FD_DBG(PERF)) \ - mesa_logw(__VA_ARGS__); \ - struct fd_context *__c = (ctx); \ - if (__c) \ - pipe_debug_message(&__c->debug, PERF_INFO, __VA_ARGS__); \ - } while(0) +#define perf_debug_ctx(ctx, ...) \ + do { \ + if (FD_DBG(PERF)) \ + mesa_logw(__VA_ARGS__); \ + struct fd_context *__c = (ctx); \ + if (__c) \ + pipe_debug_message(&__c->debug, PERF_INFO, __VA_ARGS__); \ + } while (0) #define perf_debug(...) perf_debug_ctx(NULL, __VA_ARGS__) -#define perf_time_ctx(ctx, limit_ns, fmt, ...) 
for( \ - struct __perf_time_state __s = { \ - .t = -__perf_get_time(ctx), \ - }; \ - !__s.done; \ - ({ \ - __s.t += __perf_get_time(ctx); \ - __s.done = true; \ - if (__s.t > (limit_ns)) { \ - perf_debug_ctx(ctx, fmt " (%.03f ms)", ##__VA_ARGS__, (double)__s.t / 1000000.0); \ - } \ - })) - -#define perf_time(limit_ns, fmt, ...) perf_time_ctx(NULL, limit_ns, fmt, ##__VA_ARGS__) +#define perf_time_ctx(ctx, limit_ns, fmt, ...) \ + for (struct __perf_time_state __s = \ + { \ + .t = -__perf_get_time(ctx), \ + }; \ + !__s.done; ({ \ + __s.t += __perf_get_time(ctx); \ + __s.done = true; \ + if (__s.t > (limit_ns)) { \ + perf_debug_ctx(ctx, fmt " (%.03f ms)", ##__VA_ARGS__, \ + (double)__s.t / 1000000.0); \ + } \ + })) + +#define perf_time(limit_ns, fmt, ...) \ + perf_time_ctx(NULL, limit_ns, fmt, ##__VA_ARGS__) struct __perf_time_state { - int64_t t; - bool done; + int64_t t; + bool done; }; /* static inline would be nice here, except 'struct fd_context' is not * defined yet: */ -#define __perf_get_time(ctx) \ - ((FD_DBG(PERF) || \ - ({ struct fd_context *__c = (ctx); \ - unlikely(__c && __c->debug.debug_message); })) ? \ - os_time_get_nano() : 0) +#define __perf_get_time(ctx) \ + ((FD_DBG(PERF) || ({ \ + struct fd_context *__c = (ctx); \ + unlikely(__c && __c->debug.debug_message); \ + })) \ + ? os_time_get_nano() \ + : 0) struct fd_context; @@ -193,181 +199,174 @@ extern lock_cap_t fd_context_access_cap; */ static inline void fd_context_access_begin(struct fd_context *ctx) - acquire_cap(fd_context_access_cap) + acquire_cap(fd_context_access_cap) { } static inline void -fd_context_access_end(struct fd_context *ctx) - release_cap(fd_context_access_cap) +fd_context_access_end(struct fd_context *ctx) release_cap(fd_context_access_cap) { } - /* for conditionally setting boolean flag(s): */ #define COND(bool, val) ((bool) ? 
(val) : 0) #define CP_REG(reg) ((0x4 << 16) | ((unsigned int)((reg) - (0x2000)))) -static inline uint32_t DRAW(enum pc_di_primtype prim_type, - enum pc_di_src_sel source_select, enum pc_di_index_size index_size, - enum pc_di_vis_cull_mode vis_cull_mode, - uint8_t instances) +static inline uint32_t +DRAW(enum pc_di_primtype prim_type, enum pc_di_src_sel source_select, + enum pc_di_index_size index_size, enum pc_di_vis_cull_mode vis_cull_mode, + uint8_t instances) { - return (prim_type << 0) | - (source_select << 6) | - ((index_size & 1) << 11) | - ((index_size >> 1) << 13) | - (vis_cull_mode << 9) | - (1 << 14) | - (instances << 24); + return (prim_type << 0) | (source_select << 6) | ((index_size & 1) << 11) | + ((index_size >> 1) << 13) | (vis_cull_mode << 9) | (1 << 14) | + (instances << 24); } -static inline uint32_t DRAW_A20X(enum pc_di_primtype prim_type, - enum pc_di_face_cull_sel faceness_cull_select, - enum pc_di_src_sel source_select, enum pc_di_index_size index_size, - bool pre_fetch_cull_enable, - bool grp_cull_enable, - uint16_t count) +static inline uint32_t +DRAW_A20X(enum pc_di_primtype prim_type, + enum pc_di_face_cull_sel faceness_cull_select, + enum pc_di_src_sel source_select, enum pc_di_index_size index_size, + bool pre_fetch_cull_enable, bool grp_cull_enable, uint16_t count) { - return (prim_type << 0) | - (source_select << 6) | - (faceness_cull_select << 8) | - ((index_size & 1) << 11) | - ((index_size >> 1) << 13) | - (pre_fetch_cull_enable << 14) | - (grp_cull_enable << 15) | - (count << 16); + return (prim_type << 0) | (source_select << 6) | + (faceness_cull_select << 8) | ((index_size & 1) << 11) | + ((index_size >> 1) << 13) | (pre_fetch_cull_enable << 14) | + (grp_cull_enable << 15) | (count << 16); } /* for tracking cmdstream positions that need to be patched: */ struct fd_cs_patch { - uint32_t *cs; - uint32_t val; + uint32_t *cs; + uint32_t val; }; #define fd_patch_num_elements(buf) ((buf)->size / sizeof(struct fd_cs_patch)) -#define fd_patch_element(buf, i) util_dynarray_element(buf, struct fd_cs_patch, i) +#define fd_patch_element(buf, i) \ + util_dynarray_element(buf, struct fd_cs_patch, i) static inline enum pipe_format pipe_surface_format(struct pipe_surface *psurf) { - if (!psurf) - return PIPE_FORMAT_NONE; - return psurf->format; + if (!psurf) + return PIPE_FORMAT_NONE; + return psurf->format; } static inline bool fd_surface_half_precision(const struct pipe_surface *psurf) { - enum pipe_format format; + enum pipe_format format; - if (!psurf) - return true; + if (!psurf) + return true; - format = psurf->format; + format = psurf->format; - /* colors are provided in consts, which go through cov.f32f16, which will - * break these values - */ - if (util_format_is_pure_integer(format)) - return false; + /* colors are provided in consts, which go through cov.f32f16, which will + * break these values + */ + if (util_format_is_pure_integer(format)) + return false; - /* avoid losing precision on 32-bit float formats */ - if (util_format_is_float(format) && - util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == 32) - return false; + /* avoid losing precision on 32-bit float formats */ + if (util_format_is_float(format) && + util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, 0) == + 32) + return false; - return true; + return true; } static inline unsigned fd_sampler_first_level(const struct pipe_sampler_view *view) { - if (view->target == PIPE_BUFFER) - return 0; - return view->u.tex.first_level; + if (view->target == 
PIPE_BUFFER) + return 0; + return view->u.tex.first_level; } static inline unsigned fd_sampler_last_level(const struct pipe_sampler_view *view) { - if (view->target == PIPE_BUFFER) - return 0; - return view->u.tex.last_level; + if (view->target == PIPE_BUFFER) + return 0; + return view->u.tex.last_level; } static inline bool fd_half_precision(struct pipe_framebuffer_state *pfb) { - unsigned i; + unsigned i; - for (i = 0; i < pfb->nr_cbufs; i++) - if (!fd_surface_half_precision(pfb->cbufs[i])) - return false; + for (i = 0; i < pfb->nr_cbufs; i++) + if (!fd_surface_half_precision(pfb->cbufs[i])) + return false; - return true; + return true; } static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx); /* like OUT_RING() but appends a cmdstream patch point to 'buf' */ static inline void -OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data, - struct util_dynarray *buf) +OUT_RINGP(struct fd_ringbuffer *ring, uint32_t data, struct util_dynarray *buf) { - if (LOG_DWORDS) { - DBG("ring[%p]: OUT_RINGP %04x: %08x", ring, - (uint32_t)(ring->cur - ring->start), data); - } - util_dynarray_append(buf, struct fd_cs_patch, ((struct fd_cs_patch){ - .cs = ring->cur++, - .val = data, - })); + if (LOG_DWORDS) { + DBG("ring[%p]: OUT_RINGP %04x: %08x", ring, + (uint32_t)(ring->cur - ring->start), data); + } + util_dynarray_append(buf, struct fd_cs_patch, + ((struct fd_cs_patch){ + .cs = ring->cur++, + .val = data, + })); } static inline void -__OUT_IB(struct fd_ringbuffer *ring, bool prefetch, struct fd_ringbuffer *target) +__OUT_IB(struct fd_ringbuffer *ring, bool prefetch, + struct fd_ringbuffer *target) { - if (target->cur == target->start) - return; - - unsigned count = fd_ringbuffer_cmd_count(target); - - /* for debug after a lock up, write a unique counter value - * to scratch6 for each IB, to make it easier to match up - * register dumps to cmdstream. The combination of IB and - * DRAW (scratch7) is enough to "triangulate" the particular - * draw that caused lockup. - */ - emit_marker(ring, 6); - - for (unsigned i = 0; i < count; i++) { - uint32_t dwords; - OUT_PKT3(ring, prefetch ? CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, 2); - dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; - assert(dwords > 0); - OUT_RING(ring, dwords); - OUT_PKT2(ring); - } - - emit_marker(ring, 6); + if (target->cur == target->start) + return; + + unsigned count = fd_ringbuffer_cmd_count(target); + + /* for debug after a lock up, write a unique counter value + * to scratch6 for each IB, to make it easier to match up + * register dumps to cmdstream. The combination of IB and + * DRAW (scratch7) is enough to "triangulate" the particular + * draw that caused lockup. + */ + emit_marker(ring, 6); + + for (unsigned i = 0; i < count; i++) { + uint32_t dwords; + OUT_PKT3(ring, prefetch ? 
CP_INDIRECT_BUFFER_PFE : CP_INDIRECT_BUFFER_PFD, + 2); + dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; + assert(dwords > 0); + OUT_RING(ring, dwords); + OUT_PKT2(ring); + } + + emit_marker(ring, 6); } static inline void __OUT_IB5(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) { - if (target->cur == target->start) - return; - - unsigned count = fd_ringbuffer_cmd_count(target); - - for (unsigned i = 0; i < count; i++) { - uint32_t dwords; - OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); - dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; - assert(dwords > 0); - OUT_RING(ring, dwords); - } + if (target->cur == target->start) + return; + + unsigned count = fd_ringbuffer_cmd_count(target); + + for (unsigned i = 0; i < count; i++) { + uint32_t dwords; + OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); + dwords = fd_ringbuffer_emit_reloc_ring_full(ring, target, i) / 4; + assert(dwords > 0); + OUT_RING(ring, dwords); + } } /* CP_SCRATCH_REG4 is used to hold base address for query results: */ @@ -377,39 +376,43 @@ __OUT_IB5(struct fd_ringbuffer *ring, struct fd_ringbuffer *target) #define HW_QUERY_BASE_REG REG_AXXX_CP_SCRATCH_REG4 #ifdef DEBUG -# define __EMIT_MARKER 1 +#define __EMIT_MARKER 1 #else -# define __EMIT_MARKER 0 +#define __EMIT_MARKER 0 #endif static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx) { - extern int32_t marker_cnt; - unsigned reg = REG_AXXX_CP_SCRATCH_REG0 + scratch_idx; - assert(reg != HW_QUERY_BASE_REG); - if (reg == HW_QUERY_BASE_REG) - return; - if (__EMIT_MARKER) { - OUT_WFI5(ring); - OUT_PKT0(ring, reg, 1); - OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); - } + extern int32_t marker_cnt; + unsigned reg = REG_AXXX_CP_SCRATCH_REG0 + scratch_idx; + assert(reg != HW_QUERY_BASE_REG); + if (reg == HW_QUERY_BASE_REG) + return; + if (__EMIT_MARKER) { + OUT_WFI5(ring); + OUT_PKT0(ring, reg, 1); + OUT_RING(ring, p_atomic_inc_return(&marker_cnt)); + } } static inline uint32_t pack_rgba(enum pipe_format format, const float *rgba) { - union util_color uc; - util_pack_color(rgba, format, &uc); - return uc.ui[0]; + union util_color uc; + util_pack_color(rgba, format, &uc); + return uc.ui[0]; } /* * swap - swap value of @a and @b */ -#define swap(a, b) \ - do { __typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +#define swap(a, b) \ + do { \ + __typeof(a) __tmp = (a); \ + (a) = (b); \ + (b) = __tmp; \ + } while (0) #define BIT(bit) (1u << bit) @@ -420,16 +423,20 @@ pack_rgba(enum pipe_format format, const float *rgba) static inline enum a3xx_msaa_samples fd_msaa_samples(unsigned samples) { - switch (samples) { - default: - debug_assert(0); - FALLTHROUGH; - case 0: - case 1: return MSAA_ONE; - case 2: return MSAA_TWO; - case 4: return MSAA_FOUR; - case 8: return MSAA_EIGHT; - } + switch (samples) { + default: + debug_assert(0); + FALLTHROUGH; + case 0: + case 1: + return MSAA_ONE; + case 2: + return MSAA_TWO; + case 4: + return MSAA_FOUR; + case 8: + return MSAA_EIGHT; + } } /* @@ -439,31 +446,34 @@ fd_msaa_samples(unsigned samples) static inline enum a4xx_state_block fd4_stage2shadersb(gl_shader_stage type) { - switch (type) { - case MESA_SHADER_VERTEX: - return SB4_VS_SHADER; - case MESA_SHADER_FRAGMENT: - return SB4_FS_SHADER; - case MESA_SHADER_COMPUTE: - case MESA_SHADER_KERNEL: - return SB4_CS_SHADER; - default: - unreachable("bad shader type"); - return ~0; - } + switch (type) { + case MESA_SHADER_VERTEX: + return SB4_VS_SHADER; + case MESA_SHADER_FRAGMENT: + return SB4_FS_SHADER; + case MESA_SHADER_COMPUTE: + case 
MESA_SHADER_KERNEL: + return SB4_CS_SHADER; + default: + unreachable("bad shader type"); + return ~0; + } } static inline enum a4xx_index_size fd4_size2indextype(unsigned index_size) { - switch (index_size) { - case 1: return INDEX4_SIZE_8_BIT; - case 2: return INDEX4_SIZE_16_BIT; - case 4: return INDEX4_SIZE_32_BIT; - } - DBG("unsupported index size: %d", index_size); - assert(0); - return INDEX4_SIZE_32_BIT; + switch (index_size) { + case 1: + return INDEX4_SIZE_8_BIT; + case 2: + return INDEX4_SIZE_16_BIT; + case 4: + return INDEX4_SIZE_32_BIT; + } + DBG("unsupported index size: %d", index_size); + assert(0); + return INDEX4_SIZE_32_BIT; } #endif /* FREEDRENO_UTIL_H_ */ diff --git a/src/gallium/drivers/freedreno/gmemtool.c b/src/gallium/drivers/freedreno/gmemtool.c index b60d25c..0628b70 100644 --- a/src/gallium/drivers/freedreno/gmemtool.c +++ b/src/gallium/drivers/freedreno/gmemtool.c @@ -81,110 +81,105 @@ static const struct gmem_key keys[] = { /* clang-format on */ struct gpu_info { - const char *name; - uint32_t gpu_id; - uint8_t gmem_page_align; - uint32_t gmemsize_bytes; + const char *name; + uint32_t gpu_id; + uint8_t gmem_page_align; + uint32_t gmemsize_bytes; }; -#define SZ_128K 0x00020000 -#define SZ_256K 0x00040000 -#define SZ_512K 0x00080000 -#define SZ_1M 0x00100000 +#define SZ_128K 0x00020000 +#define SZ_256K 0x00040000 +#define SZ_512K 0x00080000 +#define SZ_1M 0x00100000 /* keep sorted by gpu name: */ static const struct gpu_info gpu_infos[] = { - { "a306", 307, 4, SZ_128K }, - { "a405", 405, 4, SZ_256K }, - { "a530", 530, 4, SZ_1M }, - { "a618", 618, 1, SZ_512K }, - { "a630", 630, 1, SZ_1M }, - { "a650", 630, 1, SZ_1M + SZ_128K }, + {"a306", 307, 4, SZ_128K}, {"a405", 405, 4, SZ_256K}, + {"a530", 530, 4, SZ_1M}, {"a618", 618, 1, SZ_512K}, + {"a630", 630, 1, SZ_1M}, {"a650", 630, 1, SZ_1M + SZ_128K}, }; - static const struct option opts[] = { - { .name = "gpu", .has_arg = 1, NULL, 'g' }, - { .name = "help", .has_arg = 0, NULL, 'h' }, - { .name = "verbose", .has_arg = 0, NULL, 'v' }, - {} -}; + {.name = "gpu", .has_arg = 1, NULL, 'g'}, + {.name = "help", .has_arg = 0, NULL, 'h'}, + {.name = "verbose", .has_arg = 0, NULL, 'v'}, + {}}; static void usage(void) { - fprintf(stderr, "Usage:\n\n" - "\tgmemtool [-hv] [-g GPU]\n\n" - "Options:\n" - "\t-g, --gpu=GPU - use GMEM size/alignment/etc settings for the specified GPU\n" - "\t-h, --help - this usage message\n" - "\t-v, --verbose - dump more verbose output\n" - "\n" - ); - fprintf(stderr, "Where GPU is one of:\n"); - for (int i = 0; i < ARRAY_SIZE(gpu_infos); i++) - fprintf(stderr, "\t%s\n", gpu_infos[i].name); - exit(2); + fprintf(stderr, "Usage:\n\n" + "\tgmemtool [-hv] [-g GPU]\n\n" + "Options:\n" + "\t-g, --gpu=GPU - use GMEM size/alignment/etc settings " + "for the specified GPU\n" + "\t-h, --help - this usage message\n" + "\t-v, --verbose - dump more verbose output\n" + "\n"); + fprintf(stderr, "Where GPU is one of:\n"); + for (int i = 0; i < ARRAY_SIZE(gpu_infos); i++) + fprintf(stderr, "\t%s\n", gpu_infos[i].name); + exit(2); } int main(int argc, char **argv) { - const char *gpu_name = "a630"; - int c; - - while ((c = getopt_long(argc, argv, "g:hv", opts, NULL)) != -1) { - switch (c) { - case 'g': - gpu_name = optarg; - break; - case 'v': - bin_debug = true; - break; - case 'h': - default: - usage(); - } - } - - const struct gpu_info *gpu_info = NULL; - - for (int i = 0; i < ARRAY_SIZE(gpu_infos); i++) { - if (strcmp(gpu_name, gpu_infos[i].name) == 0) { - gpu_info = &gpu_infos[i]; - break; - } - } - - if (!gpu_info) 
{ - printf("unrecognized gpu name: %s\n", gpu_name); - usage(); - } - - /* Setup a fake screen with enough GMEM related configuration - * to make gmem_stateobj_init() happy: - */ - struct fd_screen screen = { - .gpu_id = gpu_info->gpu_id, - .gmemsize_bytes = gpu_info->gmemsize_bytes, - }; - - freedreno_dev_info_init(&screen.info, gpu_info->gpu_id); - - /* And finally run thru all the GMEM keys: */ - for (int i = 0; i < ARRAY_SIZE(keys); i++) { - struct gmem_key key = keys[i]; - key.gmem_page_align = gpu_info->gmem_page_align; - struct fd_gmem_stateobj *gmem = gmem_stateobj_init(&screen, &key); - dump_gmem_state(gmem); - - assert((gmem->bin_w * gmem->nbins_x) >= key.width); - assert((gmem->bin_h * gmem->nbins_y) >= key.height); - assert(gmem->bin_w < screen.info.tile_max_w); - assert(gmem->bin_h < screen.info.tile_max_h); - - ralloc_free(gmem); - } - - return 0; + const char *gpu_name = "a630"; + int c; + + while ((c = getopt_long(argc, argv, "g:hv", opts, NULL)) != -1) { + switch (c) { + case 'g': + gpu_name = optarg; + break; + case 'v': + bin_debug = true; + break; + case 'h': + default: + usage(); + } + } + + const struct gpu_info *gpu_info = NULL; + + for (int i = 0; i < ARRAY_SIZE(gpu_infos); i++) { + if (strcmp(gpu_name, gpu_infos[i].name) == 0) { + gpu_info = &gpu_infos[i]; + break; + } + } + + if (!gpu_info) { + printf("unrecognized gpu name: %s\n", gpu_name); + usage(); + } + + /* Setup a fake screen with enough GMEM related configuration + * to make gmem_stateobj_init() happy: + */ + struct fd_screen screen = { + .gpu_id = gpu_info->gpu_id, + .gmemsize_bytes = gpu_info->gmemsize_bytes, + }; + + freedreno_dev_info_init(&screen.info, gpu_info->gpu_id); + + /* And finally run thru all the GMEM keys: */ + for (int i = 0; i < ARRAY_SIZE(keys); i++) { + struct gmem_key key = keys[i]; + key.gmem_page_align = gpu_info->gmem_page_align; + struct fd_gmem_stateobj *gmem = gmem_stateobj_init(&screen, &key); + dump_gmem_state(gmem); + + assert((gmem->bin_w * gmem->nbins_x) >= key.width); + assert((gmem->bin_h * gmem->nbins_y) >= key.height); + assert(gmem->bin_w < screen.info.tile_max_w); + assert(gmem->bin_h < screen.info.tile_max_h); + + ralloc_free(gmem); + } + + return 0; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cache.c b/src/gallium/drivers/freedreno/ir3/ir3_cache.c index dddcbb9..467f21b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cache.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cache.c @@ -24,163 +24,164 @@ * Rob Clark */ -#include "util/ralloc.h" #include "util/hash_table.h" +#include "util/ralloc.h" #define XXH_INLINE_ALL #include "util/xxhash.h" #include "ir3_cache.h" #include "ir3_gallium.h" - static uint32_t key_hash(const void *_key) { - const struct ir3_cache_key *key = _key; - return XXH32(key, sizeof(*key), 0); + const struct ir3_cache_key *key = _key; + return XXH32(key, sizeof(*key), 0); } static bool key_equals(const void *_a, const void *_b) { - const struct ir3_cache_key *a = _a; - const struct ir3_cache_key *b = _b; - // TODO we could optimize the key shader-variant key comparison by not - // ignoring has_per_samp.. not really sure if that helps.. - return memcmp(a, b, sizeof(struct ir3_cache_key)) == 0; + const struct ir3_cache_key *a = _a; + const struct ir3_cache_key *b = _b; + // TODO we could optimize the key shader-variant key comparison by not + // ignoring has_per_samp.. not really sure if that helps.. 
+ return memcmp(a, b, sizeof(struct ir3_cache_key)) == 0; } struct ir3_cache { - /* cache mapping gallium/etc shader state-objs + shader-key to backend - * specific state-object - */ - struct hash_table *ht; + /* cache mapping gallium/etc shader state-objs + shader-key to backend + * specific state-object + */ + struct hash_table *ht; - const struct ir3_cache_funcs *funcs; - void *data; + const struct ir3_cache_funcs *funcs; + void *data; }; -struct ir3_cache * ir3_cache_create(const struct ir3_cache_funcs *funcs, void *data) +struct ir3_cache * +ir3_cache_create(const struct ir3_cache_funcs *funcs, void *data) { - struct ir3_cache *cache = rzalloc(NULL, struct ir3_cache); + struct ir3_cache *cache = rzalloc(NULL, struct ir3_cache); - cache->ht = _mesa_hash_table_create(cache, key_hash, key_equals); - cache->funcs = funcs; - cache->data = data; + cache->ht = _mesa_hash_table_create(cache, key_hash, key_equals); + cache->funcs = funcs; + cache->data = data; - return cache; + return cache; } -void ir3_cache_destroy(struct ir3_cache *cache) +void +ir3_cache_destroy(struct ir3_cache *cache) { - if (!cache) - return; + if (!cache) + return; - /* _mesa_hash_table_destroy is so *almost* useful.. */ - hash_table_foreach(cache->ht, entry) { - cache->funcs->destroy_state(cache->data, entry->data); - } + /* _mesa_hash_table_destroy is so *almost* useful.. */ + hash_table_foreach(cache->ht, entry) + { + cache->funcs->destroy_state(cache->data, entry->data); + } - ralloc_free(cache); + ralloc_free(cache); } struct ir3_program_state * ir3_cache_lookup(struct ir3_cache *cache, const struct ir3_cache_key *key, - struct pipe_debug_callback *debug) + struct pipe_debug_callback *debug) { - uint32_t hash = key_hash(key); - struct hash_entry *entry = - _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); - - if (entry) { - return entry->data; - } - - if (key->hs) - debug_assert(key->ds); - - struct ir3_shader *shaders[MESA_SHADER_STAGES] = { - [MESA_SHADER_VERTEX] = ir3_get_shader(key->vs), - [MESA_SHADER_TESS_CTRL] = ir3_get_shader(key->hs), - [MESA_SHADER_TESS_EVAL] = ir3_get_shader(key->ds), - [MESA_SHADER_GEOMETRY] = ir3_get_shader(key->gs), - [MESA_SHADER_FRAGMENT] = ir3_get_shader(key->fs), - }; - - struct ir3_shader_variant *variants[MESA_SHADER_STAGES]; - struct ir3_shader_key shader_key = key->key; - - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < MESA_SHADER_STAGES; stage++) { - if (shaders[stage]) { - variants[stage] = - ir3_shader_variant(shaders[stage], shader_key, false, debug); - if (!variants[stage]) - return NULL; - } else { - variants[stage] = NULL; - } - } - - struct ir3_compiler *compiler = shaders[MESA_SHADER_VERTEX]->compiler; - uint32_t safe_constlens = ir3_trim_constlen(variants, compiler); - shader_key.safe_constlen = true; - - for (gl_shader_stage stage = MESA_SHADER_VERTEX; - stage < MESA_SHADER_STAGES; stage++) { - if (safe_constlens & (1 << stage)) { - variants[stage] = - ir3_shader_variant(shaders[stage], shader_key, false, debug); - if (!variants[stage]) - return NULL; - } - } - - struct ir3_shader_variant *bs; - - if (ir3_has_binning_vs(&key->key)) { - shader_key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX)); - bs = ir3_shader_variant(shaders[MESA_SHADER_VERTEX], key->key, true, debug); - if (!bs) - return NULL; - } else { - bs = variants[MESA_SHADER_VERTEX]; - } - - struct ir3_program_state *state = - cache->funcs->create_state(cache->data, bs, - variants[MESA_SHADER_VERTEX], - variants[MESA_SHADER_TESS_CTRL], - 
variants[MESA_SHADER_TESS_EVAL], - variants[MESA_SHADER_GEOMETRY], - variants[MESA_SHADER_FRAGMENT], - &key->key); - state->key = *key; - - /* NOTE: uses copy of key in state obj, because pointer passed by caller - * is probably on the stack - */ - _mesa_hash_table_insert_pre_hashed(cache->ht, hash, &state->key, state); - - return state; + uint32_t hash = key_hash(key); + struct hash_entry *entry = + _mesa_hash_table_search_pre_hashed(cache->ht, hash, key); + + if (entry) { + return entry->data; + } + + if (key->hs) + debug_assert(key->ds); + + struct ir3_shader *shaders[MESA_SHADER_STAGES] = { + [MESA_SHADER_VERTEX] = ir3_get_shader(key->vs), + [MESA_SHADER_TESS_CTRL] = ir3_get_shader(key->hs), + [MESA_SHADER_TESS_EVAL] = ir3_get_shader(key->ds), + [MESA_SHADER_GEOMETRY] = ir3_get_shader(key->gs), + [MESA_SHADER_FRAGMENT] = ir3_get_shader(key->fs), + }; + + struct ir3_shader_variant *variants[MESA_SHADER_STAGES]; + struct ir3_shader_key shader_key = key->key; + + for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES; + stage++) { + if (shaders[stage]) { + variants[stage] = + ir3_shader_variant(shaders[stage], shader_key, false, debug); + if (!variants[stage]) + return NULL; + } else { + variants[stage] = NULL; + } + } + + struct ir3_compiler *compiler = shaders[MESA_SHADER_VERTEX]->compiler; + uint32_t safe_constlens = ir3_trim_constlen(variants, compiler); + shader_key.safe_constlen = true; + + for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < MESA_SHADER_STAGES; + stage++) { + if (safe_constlens & (1 << stage)) { + variants[stage] = + ir3_shader_variant(shaders[stage], shader_key, false, debug); + if (!variants[stage]) + return NULL; + } + } + + struct ir3_shader_variant *bs; + + if (ir3_has_binning_vs(&key->key)) { + shader_key.safe_constlen = !!(safe_constlens & (1 << MESA_SHADER_VERTEX)); + bs = + ir3_shader_variant(shaders[MESA_SHADER_VERTEX], key->key, true, debug); + if (!bs) + return NULL; + } else { + bs = variants[MESA_SHADER_VERTEX]; + } + + struct ir3_program_state *state = cache->funcs->create_state( + cache->data, bs, variants[MESA_SHADER_VERTEX], + variants[MESA_SHADER_TESS_CTRL], variants[MESA_SHADER_TESS_EVAL], + variants[MESA_SHADER_GEOMETRY], variants[MESA_SHADER_FRAGMENT], + &key->key); + state->key = *key; + + /* NOTE: uses copy of key in state obj, because pointer passed by caller + * is probably on the stack + */ + _mesa_hash_table_insert_pre_hashed(cache->ht, hash, &state->key, state); + + return state; } /* call when an API level state object is destroyed, to invalidate * cache entries which reference that state object. 
*/ -void ir3_cache_invalidate(struct ir3_cache *cache, void *stobj) +void +ir3_cache_invalidate(struct ir3_cache *cache, void *stobj) { - if (!cache) - return; - - hash_table_foreach(cache->ht, entry) { - const struct ir3_cache_key *key = entry->key; - if ((key->fs == stobj) || (key->vs == stobj) || - (key->ds == stobj) || (key->hs == stobj) || - (key->gs == stobj)) { - cache->funcs->destroy_state(cache->data, entry->data); - _mesa_hash_table_remove(cache->ht, entry); - return; - } - } + if (!cache) + return; + + hash_table_foreach(cache->ht, entry) + { + const struct ir3_cache_key *key = entry->key; + if ((key->fs == stobj) || (key->vs == stobj) || (key->ds == stobj) || + (key->hs == stobj) || (key->gs == stobj)) { + cache->funcs->destroy_state(cache->data, entry->data); + _mesa_hash_table_remove(cache->ht, entry); + return; + } + } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cache.h b/src/gallium/drivers/freedreno/ir3/ir3_cache.h index 71708ad..26135d9 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cache.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_cache.h @@ -37,8 +37,8 @@ /* key into program state cache */ struct ir3_cache_key { - struct ir3_shader_state *vs, *hs, *ds, *gs, *fs; // 5 pointers - struct ir3_shader_key key; // 7 dwords + struct ir3_shader_state *vs, *hs, *ds, *gs, *fs; // 5 pointers + struct ir3_shader_key key; // 7 dwords }; /* per-gen backend program state object should subclass this for it's @@ -46,33 +46,31 @@ struct ir3_cache_key { * allocated on the stack */ struct ir3_program_state { - struct ir3_cache_key key; + struct ir3_cache_key key; }; struct ir3_cache_funcs { - struct ir3_program_state *(*create_state)(void *data, - struct ir3_shader_variant *bs, /* binning pass vs */ - struct ir3_shader_variant *vs, - struct ir3_shader_variant *hs, - struct ir3_shader_variant *ds, - struct ir3_shader_variant *gs, - struct ir3_shader_variant *fs, - const struct ir3_shader_key *key); - void (*destroy_state)(void *data, struct ir3_program_state *state); + struct ir3_program_state *(*create_state)( + void *data, struct ir3_shader_variant *bs, /* binning pass vs */ + struct ir3_shader_variant *vs, struct ir3_shader_variant *hs, + struct ir3_shader_variant *ds, struct ir3_shader_variant *gs, + struct ir3_shader_variant *fs, const struct ir3_shader_key *key); + void (*destroy_state)(void *data, struct ir3_program_state *state); }; struct ir3_cache; /* construct a shader cache. Free with ralloc_free() */ -struct ir3_cache * ir3_cache_create(const struct ir3_cache_funcs *funcs, void *data); +struct ir3_cache *ir3_cache_create(const struct ir3_cache_funcs *funcs, + void *data); void ir3_cache_destroy(struct ir3_cache *cache); /* debug callback is used for shader-db logs in case the lookup triggers * shader variant compilation. */ -struct ir3_program_state * ir3_cache_lookup(struct ir3_cache *cache, - const struct ir3_cache_key *key, - struct pipe_debug_callback *debug); +struct ir3_program_state *ir3_cache_lookup(struct ir3_cache *cache, + const struct ir3_cache_key *key, + struct pipe_debug_callback *debug); /* call when an API level state object is destroyed, to invalidate * cache entries which reference that state object. 
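/*
 * Illustrative sketch (not part of this patch): how a per-gen backend could
 * hook into the ir3_cache interface shown in ir3_cache.h above.  The names
 * example_create_state(), example_destroy_state() and example_bind_program()
 * are hypothetical stand-ins; only the ir3_cache_* entry points and the
 * ir3_cache_funcs / ir3_cache_key layouts come from the header itself.
 */
struct ir3_program_state *
example_create_state(void *data, struct ir3_shader_variant *bs,
                     struct ir3_shader_variant *vs,
                     struct ir3_shader_variant *hs,
                     struct ir3_shader_variant *ds,
                     struct ir3_shader_variant *gs,
                     struct ir3_shader_variant *fs,
                     const struct ir3_shader_key *key);
void example_destroy_state(void *data, struct ir3_program_state *state);

static const struct ir3_cache_funcs example_cache_funcs = {
   .create_state = example_create_state,   /* hypothetical backend hook */
   .destroy_state = example_destroy_state, /* hypothetical backend hook */
};

static void
example_bind_program(struct ir3_cache *cache, const struct ir3_cache_key *key,
                     struct pipe_debug_callback *debug)
{
   /* looks up (or, on a miss, compiles via create_state above) the backend
    * program state for this combination of shader state objects + shader key:
    */
   struct ir3_program_state *state = ir3_cache_lookup(cache, key, debug);
   if (!state)
      return;

   /* ... emit/bind 'state' on the hardware ... */
}

/* A cache of this kind would typically be created once per context with
 * ir3_cache_create(&example_cache_funcs, ctx), invalidated with
 * ir3_cache_invalidate(cache, stobj) whenever an API-level shader state
 * object is destroyed, and torn down with ir3_cache_destroy()/ralloc_free().
 */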
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index 60123a3..1594ecd 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -24,32 +24,32 @@ * Rob Clark */ -#include -#include -#include +#include #include +#include #include -#include #include -#include -#include +#include +#include +#include +#include #include "nir/tgsi_to_nir.h" +#include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_text.h" -#include "tgsi/tgsi_dump.h" +#include "ir3/instr-a3xx.h" +#include "ir3/ir3.h" #include "ir3/ir3_compiler.h" #include "ir3/ir3_gallium.h" #include "ir3/ir3_nir.h" -#include "ir3/instr-a3xx.h" -#include "ir3/ir3.h" #include "main/mtypes.h" -#include "compiler/glsl/standalone.h" -#include "compiler/glsl/glsl_to_nir.h" #include "compiler/glsl/gl_nir.h" +#include "compiler/glsl/glsl_to_nir.h" +#include "compiler/glsl/standalone.h" #include "compiler/nir_types.h" #include "compiler/spirv/nir_spirv.h" @@ -58,384 +58,379 @@ static void dump_info(struct ir3_shader_variant *so, const char *str) { - uint32_t *bin; - const char *type = ir3_shader_stage(so); - bin = ir3_shader_assemble(so); - printf("; %s: %s\n", type, str); - ir3_shader_disasm(so, bin, stdout); + uint32_t *bin; + const char *type = ir3_shader_stage(so); + bin = ir3_shader_assemble(so); + printf("; %s: %s\n", type, str); + ir3_shader_disasm(so, bin, stdout); } static void insert_sorted(struct exec_list *var_list, nir_variable *new_var) { - nir_foreach_variable_in_list(var, var_list) { - if (var->data.location > new_var->data.location) { - exec_node_insert_node_before(&var->node, &new_var->node); - return; - } - } - exec_list_push_tail(var_list, &new_var->node); + nir_foreach_variable_in_list (var, var_list) { + if (var->data.location > new_var->data.location) { + exec_node_insert_node_before(&var->node, &new_var->node); + return; + } + } + exec_list_push_tail(var_list, &new_var->node); } static void sort_varyings(nir_shader *nir, nir_variable_mode mode) { - struct exec_list new_list; - exec_list_make_empty(&new_list); - nir_foreach_variable_with_modes_safe(var, nir, mode) { - exec_node_remove(&var->node); - insert_sorted(&new_list, var); - } - exec_list_append(&nir->variables, &new_list); + struct exec_list new_list; + exec_list_make_empty(&new_list); + nir_foreach_variable_with_modes_safe(var, nir, mode) + { + exec_node_remove(&var->node); + insert_sorted(&new_list, var); + } + exec_list_append(&nir->variables, &new_list); } static void fixup_varying_slots(nir_shader *nir, nir_variable_mode mode) { - nir_foreach_variable_with_modes(var, nir, mode) { - if (var->data.location >= VARYING_SLOT_VAR0) { - var->data.location += 9; - } else if ((var->data.location >= VARYING_SLOT_TEX0) && - (var->data.location <= VARYING_SLOT_TEX7)) { - var->data.location += VARYING_SLOT_VAR0 - VARYING_SLOT_TEX0; - } - } + nir_foreach_variable_with_modes(var, nir, mode) + { + if (var->data.location >= VARYING_SLOT_VAR0) { + var->data.location += 9; + } else if ((var->data.location >= VARYING_SLOT_TEX0) && + (var->data.location <= VARYING_SLOT_TEX7)) { + var->data.location += VARYING_SLOT_VAR0 - VARYING_SLOT_TEX0; + } + } } static struct ir3_compiler *compiler; static nir_shader * -load_glsl(unsigned num_files, char* const* files, gl_shader_stage stage) +load_glsl(unsigned num_files, char *const *files, gl_shader_stage stage) { - static const struct standalone_options options = { - .glsl_version = 310, - .do_link = true, - 
.lower_precision = true, - }; - struct gl_shader_program *prog; - const nir_shader_compiler_options *nir_options = - ir3_get_compiler_options(compiler); - static struct gl_context local_ctx; - - prog = standalone_compile_shader(&options, num_files, files, &local_ctx); - if (!prog) - errx(1, "couldn't parse `%s'", files[0]); - - nir_shader *nir = glsl_to_nir(&local_ctx, prog, stage, nir_options); - - /* required NIR passes: */ - if (nir_options->lower_all_io_to_temps || - nir->info.stage == MESA_SHADER_VERTEX || - nir->info.stage == MESA_SHADER_GEOMETRY) { - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), - true, true); - } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { - NIR_PASS_V(nir, nir_lower_io_to_temporaries, - nir_shader_get_entrypoint(nir), - true, false); - } - - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - - NIR_PASS_V(nir, nir_split_var_copies); - NIR_PASS_V(nir, nir_lower_var_copies); - nir_print_shader(nir, stdout); - NIR_PASS_V(nir, gl_nir_lower_atomics, prog, true); - NIR_PASS_V(nir, gl_nir_lower_buffers, prog); - NIR_PASS_V(nir, nir_lower_atomics_to_ssbo); - nir_print_shader(nir, stdout); - - switch (stage) { - case MESA_SHADER_VERTEX: - nir_assign_var_locations(nir, nir_var_shader_in, - &nir->num_inputs, - ir3_glsl_type_size); - - /* Re-lower global vars, to deal with any dead VS inputs. */ - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - - sort_varyings(nir, nir_var_shader_out); - nir_assign_var_locations(nir, nir_var_shader_out, - &nir->num_outputs, - ir3_glsl_type_size); - fixup_varying_slots(nir, nir_var_shader_out); - break; - case MESA_SHADER_FRAGMENT: - sort_varyings(nir, nir_var_shader_in); - nir_assign_var_locations(nir, nir_var_shader_in, - &nir->num_inputs, - ir3_glsl_type_size); - fixup_varying_slots(nir, nir_var_shader_in); - nir_assign_var_locations(nir, nir_var_shader_out, - &nir->num_outputs, - ir3_glsl_type_size); - break; - case MESA_SHADER_COMPUTE: - case MESA_SHADER_KERNEL: - break; - default: - errx(1, "unhandled shader stage: %d", stage); - } - - nir_assign_var_locations(nir, nir_var_uniform, - &nir->num_uniforms, - ir3_glsl_type_size); - - NIR_PASS_V(nir, nir_lower_system_values); - NIR_PASS_V(nir, nir_lower_compute_system_values, NULL); - - NIR_PASS_V(nir, nir_lower_frexp); - NIR_PASS_V(nir, nir_lower_io, - nir_var_shader_in | nir_var_shader_out | nir_var_uniform, - ir3_glsl_type_size, (nir_lower_io_options)0); - NIR_PASS_V(nir, gl_nir_lower_samplers, prog); - - return nir; + static const struct standalone_options options = { + .glsl_version = 310, + .do_link = true, + .lower_precision = true, + }; + struct gl_shader_program *prog; + const nir_shader_compiler_options *nir_options = + ir3_get_compiler_options(compiler); + static struct gl_context local_ctx; + + prog = standalone_compile_shader(&options, num_files, files, &local_ctx); + if (!prog) + errx(1, "couldn't parse `%s'", files[0]); + + nir_shader *nir = glsl_to_nir(&local_ctx, prog, stage, nir_options); + + /* required NIR passes: */ + if (nir_options->lower_all_io_to_temps || + nir->info.stage == MESA_SHADER_VERTEX || + nir->info.stage == MESA_SHADER_GEOMETRY) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, true); + } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { + NIR_PASS_V(nir, nir_lower_io_to_temporaries, + nir_shader_get_entrypoint(nir), true, false); + } + + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + 
NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + + NIR_PASS_V(nir, nir_split_var_copies); + NIR_PASS_V(nir, nir_lower_var_copies); + nir_print_shader(nir, stdout); + NIR_PASS_V(nir, gl_nir_lower_atomics, prog, true); + NIR_PASS_V(nir, gl_nir_lower_buffers, prog); + NIR_PASS_V(nir, nir_lower_atomics_to_ssbo); + nir_print_shader(nir, stdout); + + switch (stage) { + case MESA_SHADER_VERTEX: + nir_assign_var_locations(nir, nir_var_shader_in, &nir->num_inputs, + ir3_glsl_type_size); + + /* Re-lower global vars, to deal with any dead VS inputs. */ + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + + sort_varyings(nir, nir_var_shader_out); + nir_assign_var_locations(nir, nir_var_shader_out, &nir->num_outputs, + ir3_glsl_type_size); + fixup_varying_slots(nir, nir_var_shader_out); + break; + case MESA_SHADER_FRAGMENT: + sort_varyings(nir, nir_var_shader_in); + nir_assign_var_locations(nir, nir_var_shader_in, &nir->num_inputs, + ir3_glsl_type_size); + fixup_varying_slots(nir, nir_var_shader_in); + nir_assign_var_locations(nir, nir_var_shader_out, &nir->num_outputs, + ir3_glsl_type_size); + break; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + break; + default: + errx(1, "unhandled shader stage: %d", stage); + } + + nir_assign_var_locations(nir, nir_var_uniform, &nir->num_uniforms, + ir3_glsl_type_size); + + NIR_PASS_V(nir, nir_lower_system_values); + NIR_PASS_V(nir, nir_lower_compute_system_values, NULL); + + NIR_PASS_V(nir, nir_lower_frexp); + NIR_PASS_V(nir, nir_lower_io, + nir_var_shader_in | nir_var_shader_out | nir_var_uniform, + ir3_glsl_type_size, (nir_lower_io_options)0); + NIR_PASS_V(nir, gl_nir_lower_samplers, prog); + + return nir; } static int read_file(const char *filename, void **ptr, size_t *size) { - int fd, ret; - struct stat st; + int fd, ret; + struct stat st; - *ptr = MAP_FAILED; + *ptr = MAP_FAILED; - fd = open(filename, O_RDONLY); - if (fd == -1) { - warnx("couldn't open `%s'", filename); - return 1; - } + fd = open(filename, O_RDONLY); + if (fd == -1) { + warnx("couldn't open `%s'", filename); + return 1; + } - ret = fstat(fd, &st); - if (ret) - errx(1, "couldn't stat `%s'", filename); + ret = fstat(fd, &st); + if (ret) + errx(1, "couldn't stat `%s'", filename); - *size = st.st_size; - *ptr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); - if (*ptr == MAP_FAILED) - errx(1, "couldn't map `%s'", filename); + *size = st.st_size; + *ptr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); + if (*ptr == MAP_FAILED) + errx(1, "couldn't map `%s'", filename); - close(fd); + close(fd); - return 0; + return 0; } -static void debug_func(void *priv, enum nir_spirv_debug_level level, - size_t spirv_offset, const char *message) +static void +debug_func(void *priv, enum nir_spirv_debug_level level, size_t spirv_offset, + const char *message) { -// printf("%s\n", message); + // printf("%s\n", message); } static nir_shader * load_spirv(const char *filename, const char *entry, gl_shader_stage stage) { - const struct spirv_to_nir_options spirv_options = { - /* these caps are just make-believe */ - .caps = { - .draw_parameters = true, - .float64 = true, - .image_read_without_format = true, - .image_write_without_format = true, - .int64 = true, - .variable_pointers = true, - }, - .debug = { - .func = debug_func, - } - }; - nir_shader *nir; - void *buf; - size_t size; - - read_file(filename, &buf, &size); - - nir = spirv_to_nir(buf, size / 4, - NULL, 0, /* spec_entries */ - stage, entry, - &spirv_options, - ir3_get_compiler_options(compiler)); 
- - nir_print_shader(nir, stdout); - - return nir; + const struct spirv_to_nir_options spirv_options = { + /* these caps are just make-believe */ + .caps = + { + .draw_parameters = true, + .float64 = true, + .image_read_without_format = true, + .image_write_without_format = true, + .int64 = true, + .variable_pointers = true, + }, + .debug = { + .func = debug_func, + }}; + nir_shader *nir; + void *buf; + size_t size; + + read_file(filename, &buf, &size); + + nir = spirv_to_nir(buf, size / 4, NULL, 0, /* spec_entries */ + stage, entry, &spirv_options, + ir3_get_compiler_options(compiler)); + + nir_print_shader(nir, stdout); + + return nir; } static const char *shortopts = "g:hv"; static const struct option longopts[] = { - { "gpu", required_argument, 0, 'g' }, - { "help", no_argument, 0, 'h' }, - { "verbose", no_argument, 0, 'v' }, + {"gpu", required_argument, 0, 'g'}, + {"help", no_argument, 0, 'h'}, + {"verbose", no_argument, 0, 'v'}, }; static void print_usage(void) { - printf("Usage: ir3_compiler [OPTIONS]... \n"); - printf(" -g, --gpu GPU_ID - specify gpu-id (default 320)\n"); - printf(" -h, --help - show this message\n"); - printf(" -v, --verbose - verbose compiler/debug messages\n"); + printf("Usage: ir3_compiler [OPTIONS]... \n"); + printf(" -g, --gpu GPU_ID - specify gpu-id (default 320)\n"); + printf(" -h, --help - show this message\n"); + printf(" -v, --verbose - verbose compiler/debug messages\n"); } int main(int argc, char **argv) { - int ret = 0, opt; - char *filenames[2]; - int num_files = 0; - unsigned stage = 0; - struct ir3_shader_key key = {}; - unsigned gpu_id = 320; - const char *info; - const char *spirv_entry = NULL; - void *ptr; - bool from_tgsi = false; - size_t size; - - while ((opt = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { - switch (opt) { - case 'g': - gpu_id = strtol(optarg, NULL, 0); - break; - case 'v': - ir3_shader_debug |= IR3_DBG_OPTMSGS | IR3_DBG_DISASM; - break; - default: - printf("unrecognized arg: %c\n", opt); - /* fallthrough */ - case 'h': - print_usage(); - return 0; - } - } - - if (optind >= argc) { - fprintf(stderr, "no file specified!\n"); - print_usage(); - return 0; - } - - unsigned n = optind; - while (n < argc) { - char *filename = argv[n]; - char *ext = strrchr(filename, '.'); - - if (strcmp(ext, ".tgsi") == 0) { - if (num_files != 0) - errx(1, "in TGSI mode, only a single file may be specified"); - from_tgsi = true; - } else if (strcmp(ext, ".spv") == 0) { - if (num_files != 0) - errx(1, "in SPIR-V mode, only a single file may be specified"); - stage = MESA_SHADER_COMPUTE; - filenames[num_files++] = filename; - n++; - if (n == argc) - errx(1, "in SPIR-V mode, an entry point must be specified"); - spirv_entry = argv[n]; - n++; - } else if (strcmp(ext, ".comp") == 0) { - if (from_tgsi || spirv_entry) - errx(1, "cannot mix GLSL/TGSI/SPIRV"); - if (num_files >= ARRAY_SIZE(filenames)) - errx(1, "too many GLSL files"); - stage = MESA_SHADER_COMPUTE; - } else if (strcmp(ext, ".frag") == 0) { - if (from_tgsi || spirv_entry) - errx(1, "cannot mix GLSL/TGSI/SPIRV"); - if (num_files >= ARRAY_SIZE(filenames)) - errx(1, "too many GLSL files"); - stage = MESA_SHADER_FRAGMENT; - } else if (strcmp(ext, ".vert") == 0) { - if (from_tgsi) - errx(1, "cannot mix GLSL and TGSI"); - if (num_files >= ARRAY_SIZE(filenames)) - errx(1, "too many GLSL files"); - stage = MESA_SHADER_VERTEX; - } else { - print_usage(); - return -1; - } - - filenames[num_files++] = filename; - - n++; - } - - nir_shader *nir; - - compiler = 
ir3_compiler_create(NULL, gpu_id); - - if (from_tgsi) { - struct tgsi_token toks[65536]; - const nir_shader_compiler_options *nir_options = - ir3_get_compiler_options(compiler); - - ret = read_file(filenames[0], &ptr, &size); - if (ret) { - print_usage(); - return ret; - } - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) - printf("%s\n", (char *)ptr); - - if (!tgsi_text_translate(ptr, toks, ARRAY_SIZE(toks))) - errx(1, "could not parse `%s'", filenames[0]); - - if (ir3_shader_debug & IR3_DBG_OPTMSGS) - tgsi_dump(toks, 0); - - nir = tgsi_to_nir_noscreen(toks, nir_options); - NIR_PASS_V(nir, nir_lower_global_vars_to_local); - } else if (spirv_entry) { - nir = load_spirv(filenames[0], spirv_entry, stage); - - NIR_PASS_V(nir, nir_lower_io, - nir_var_shader_in | nir_var_shader_out, - ir3_glsl_type_size, (nir_lower_io_options)0); - - /* TODO do this somewhere else */ - nir_lower_int64(nir); - nir_lower_system_values(nir); - nir_lower_compute_system_values(nir, NULL); - } else if (num_files > 0) { - nir = load_glsl(num_files, filenames, stage); - } else { - print_usage(); - return -1; - } - - ir3_finalize_nir(compiler, nir); - ir3_nir_post_finalize(compiler, nir); - - struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader)); - shader->compiler = compiler; - shader->type = stage; - shader->nir = nir; - - struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v)); - v->type = shader->type; - v->shader = shader; - v->key = key; - v->const_state = rzalloc_size(v, sizeof(*v->const_state)); - - shader->variants = v; - shader->variant_count = 1; - - ir3_nir_lower_variant(v, nir); - - info = "NIR compiler"; - ret = ir3_compile_shader_nir(compiler, v); - if (ret) { - fprintf(stderr, "compiler failed!\n"); - return ret; - } - dump_info(v, info); - - return 0; + int ret = 0, opt; + char *filenames[2]; + int num_files = 0; + unsigned stage = 0; + struct ir3_shader_key key = {}; + unsigned gpu_id = 320; + const char *info; + const char *spirv_entry = NULL; + void *ptr; + bool from_tgsi = false; + size_t size; + + while ((opt = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != + -1) { + switch (opt) { + case 'g': + gpu_id = strtol(optarg, NULL, 0); + break; + case 'v': + ir3_shader_debug |= IR3_DBG_OPTMSGS | IR3_DBG_DISASM; + break; + default: + printf("unrecognized arg: %c\n", opt); + /* fallthrough */ + case 'h': + print_usage(); + return 0; + } + } + + if (optind >= argc) { + fprintf(stderr, "no file specified!\n"); + print_usage(); + return 0; + } + + unsigned n = optind; + while (n < argc) { + char *filename = argv[n]; + char *ext = strrchr(filename, '.'); + + if (strcmp(ext, ".tgsi") == 0) { + if (num_files != 0) + errx(1, "in TGSI mode, only a single file may be specified"); + from_tgsi = true; + } else if (strcmp(ext, ".spv") == 0) { + if (num_files != 0) + errx(1, "in SPIR-V mode, only a single file may be specified"); + stage = MESA_SHADER_COMPUTE; + filenames[num_files++] = filename; + n++; + if (n == argc) + errx(1, "in SPIR-V mode, an entry point must be specified"); + spirv_entry = argv[n]; + n++; + } else if (strcmp(ext, ".comp") == 0) { + if (from_tgsi || spirv_entry) + errx(1, "cannot mix GLSL/TGSI/SPIRV"); + if (num_files >= ARRAY_SIZE(filenames)) + errx(1, "too many GLSL files"); + stage = MESA_SHADER_COMPUTE; + } else if (strcmp(ext, ".frag") == 0) { + if (from_tgsi || spirv_entry) + errx(1, "cannot mix GLSL/TGSI/SPIRV"); + if (num_files >= ARRAY_SIZE(filenames)) + errx(1, "too many GLSL files"); + stage = MESA_SHADER_FRAGMENT; + } else if (strcmp(ext, ".vert") == 0) { + if 
(from_tgsi) + errx(1, "cannot mix GLSL and TGSI"); + if (num_files >= ARRAY_SIZE(filenames)) + errx(1, "too many GLSL files"); + stage = MESA_SHADER_VERTEX; + } else { + print_usage(); + return -1; + } + + filenames[num_files++] = filename; + + n++; + } + + nir_shader *nir; + + compiler = ir3_compiler_create(NULL, gpu_id); + + if (from_tgsi) { + struct tgsi_token toks[65536]; + const nir_shader_compiler_options *nir_options = + ir3_get_compiler_options(compiler); + + ret = read_file(filenames[0], &ptr, &size); + if (ret) { + print_usage(); + return ret; + } + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) + printf("%s\n", (char *)ptr); + + if (!tgsi_text_translate(ptr, toks, ARRAY_SIZE(toks))) + errx(1, "could not parse `%s'", filenames[0]); + + if (ir3_shader_debug & IR3_DBG_OPTMSGS) + tgsi_dump(toks, 0); + + nir = tgsi_to_nir_noscreen(toks, nir_options); + NIR_PASS_V(nir, nir_lower_global_vars_to_local); + } else if (spirv_entry) { + nir = load_spirv(filenames[0], spirv_entry, stage); + + NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, + ir3_glsl_type_size, (nir_lower_io_options)0); + + /* TODO do this somewhere else */ + nir_lower_int64(nir); + nir_lower_system_values(nir); + nir_lower_compute_system_values(nir, NULL); + } else if (num_files > 0) { + nir = load_glsl(num_files, filenames, stage); + } else { + print_usage(); + return -1; + } + + ir3_finalize_nir(compiler, nir); + ir3_nir_post_finalize(compiler, nir); + + struct ir3_shader *shader = rzalloc_size(NULL, sizeof(*shader)); + shader->compiler = compiler; + shader->type = stage; + shader->nir = nir; + + struct ir3_shader_variant *v = rzalloc_size(shader, sizeof(*v)); + v->type = shader->type; + v->shader = shader; + v->key = key; + v->const_state = rzalloc_size(v, sizeof(*v->const_state)); + + shader->variants = v; + shader->variant_count = 1; + + ir3_nir_lower_variant(v, nir); + + info = "NIR compiler"; + ret = ir3_compile_shader_nir(compiler, v); + if (ret) { + fprintf(stderr, "compiler failed!\n"); + return ret; + } + dump_info(v, info); + + return 0; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index 71cab01..d01b036 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -40,48 +40,47 @@ static bool is_stateobj(struct fd_ringbuffer *ring); static void emit_const_user(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, - uint32_t size, const uint32_t *user_buffer); + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t size, const uint32_t *user_buffer); static void emit_const_bo(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, - uint32_t offset, uint32_t size, - struct fd_bo *bo); - -static void emit_const_prsc(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, uint32_t regid, - uint32_t offset, uint32_t size, - struct pipe_resource *buffer) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t offset, uint32_t size, struct fd_bo *bo); + +static void +emit_const_prsc(struct fd_ringbuffer *ring, const struct ir3_shader_variant *v, + uint32_t regid, uint32_t offset, uint32_t size, + struct pipe_resource *buffer) { - struct fd_resource *rsc = fd_resource(buffer); - emit_const_bo(ring, v, regid, offset, size, rsc->bo); + struct fd_resource *rsc = fd_resource(buffer); + emit_const_bo(ring, v, regid, offset, size, rsc->bo); } static void emit_const_ptrs(struct fd_ringbuffer *ring, - const struct 
ir3_shader_variant *v, uint32_t dst_offset, - uint32_t num, struct fd_bo **bos, uint32_t *offsets); + const struct ir3_shader_variant *v, + uint32_t dst_offset, uint32_t num, + struct fd_bo **bos, uint32_t *offsets); static void emit_const_asserts(struct fd_ringbuffer *ring, - const struct ir3_shader_variant *v, - uint32_t regid, uint32_t sizedwords) + const struct ir3_shader_variant *v, uint32_t regid, + uint32_t sizedwords) { - assert((regid % 4) == 0); - assert((sizedwords % 4) == 0); - assert(regid + sizedwords <= v->constlen * 4); + assert((regid % 4) == 0); + assert((sizedwords % 4) == 0); + assert(regid + sizedwords <= v->constlen * 4); } static void -ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) - assert_dt +ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) assert_dt { - /* when we emit const state via ring (IB2) we need a WFI, but when - * it is emit'd via stateobj, we don't - */ - if (is_stateobj(ring)) - return; + /* when we emit const state via ring (IB2) we need a WFI, but when + * it is emit'd via stateobj, we don't + */ + if (is_stateobj(ring)) + return; - fd_wfi(batch, ring); + fd_wfi(batch, ring); } /** @@ -94,17 +93,17 @@ ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring) * Returns size in dwords. */ static inline void -ir3_user_consts_size(struct ir3_ubo_analysis_state *state, - unsigned *packets, unsigned *size) +ir3_user_consts_size(struct ir3_ubo_analysis_state *state, unsigned *packets, + unsigned *size) { - *packets = *size = 0; - - for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { - if (state->range[i].start < state->range[i].end) { - *size += state->range[i].end - state->range[i].start; - (*packets)++; - } - } + *packets = *size = 0; + + for (uint32_t i = 0; i < ARRAY_SIZE(state->range); i++) { + if (state->range[i].start < state->range[i].end) { + *size += state->range[i].end - state->range[i].start; + (*packets)++; + } + } } /** @@ -113,36 +112,37 @@ ir3_user_consts_size(struct ir3_ubo_analysis_state *state, */ static inline void ir3_emit_constant_data(struct fd_screen *screen, - const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - const struct ir3_const_state *const_state = ir3_const_state(v); - const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; - - for (unsigned i = 0; i < state->num_enabled; i++) { - unsigned ubo = state->range[i].ubo.block; - if (ubo != const_state->constant_data_ubo) - continue; - - uint32_t size = state->range[i].end - state->range[i].start; - - /* Pre-a6xx, we might have ranges enabled in the shader that aren't - * used in the binning variant. - */ - if (16 * v->constlen <= state->range[i].offset) - continue; - - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, (16 * v->constlen) - state->range[i].offset); - - if (size == 0) - continue; - - emit_const_bo(ring, v, state->range[i].offset / 4, - v->info.constant_data_offset + state->range[i].start, - size / 4, v->bo); - } + const struct ir3_const_state *const_state = ir3_const_state(v); + const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; + + for (unsigned i = 0; i < state->num_enabled; i++) { + unsigned ubo = state->range[i].ubo.block; + if (ubo != const_state->constant_data_ubo) + continue; + + uint32_t size = state->range[i].end - state->range[i].start; + + /* Pre-a6xx, we might have ranges enabled in the shader that aren't + * used in the binning variant. 
+ */ + if (16 * v->constlen <= state->range[i].offset) + continue; + + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (16 * v->constlen) - state->range[i].offset); + + if (size == 0) + continue; + + emit_const_bo(ring, v, state->range[i].offset / 4, + v->info.constant_data_offset + state->range[i].start, + size / 4, v->bo); + } } /** @@ -151,310 +151,314 @@ ir3_emit_constant_data(struct fd_screen *screen, * shader). */ static inline void -ir3_emit_user_consts(struct fd_screen *screen, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) +ir3_emit_user_consts(struct fd_screen *screen, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, + struct fd_constbuf_stateobj *constbuf) { - const struct ir3_const_state *const_state = ir3_const_state(v); - const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; - - for (unsigned i = 0; i < state->num_enabled; i++) { - assert(!state->range[i].ubo.bindless); - unsigned ubo = state->range[i].ubo.block; - if (!(constbuf->enabled_mask & (1 << ubo)) || - ubo == const_state->constant_data_ubo) { - continue; - } - struct pipe_constant_buffer *cb = &constbuf->cb[ubo]; - - uint32_t size = state->range[i].end - state->range[i].start; - uint32_t offset = cb->buffer_offset + state->range[i].start; - - /* Pre-a6xx, we might have ranges enabled in the shader that aren't - * used in the binning variant. - */ - if (16 * v->constlen <= state->range[i].offset) - continue; - - /* and even if the start of the const buffer is before - * first_immediate, the end may not be: - */ - size = MIN2(size, (16 * v->constlen) - state->range[i].offset); - - if (size == 0) - continue; - - /* things should be aligned to vec4: */ - debug_assert((state->range[i].offset % 16) == 0); - debug_assert((size % 16) == 0); - debug_assert((offset % 16) == 0); - - if (cb->user_buffer) { - emit_const_user(ring, v, state->range[i].offset / 4, - size / 4, cb->user_buffer + state->range[i].start); - } else { - emit_const_prsc(ring, v, state->range[i].offset / 4, - offset, size / 4, cb->buffer); - } - } + const struct ir3_const_state *const_state = ir3_const_state(v); + const struct ir3_ubo_analysis_state *state = &const_state->ubo_state; + + for (unsigned i = 0; i < state->num_enabled; i++) { + assert(!state->range[i].ubo.bindless); + unsigned ubo = state->range[i].ubo.block; + if (!(constbuf->enabled_mask & (1 << ubo)) || + ubo == const_state->constant_data_ubo) { + continue; + } + struct pipe_constant_buffer *cb = &constbuf->cb[ubo]; + + uint32_t size = state->range[i].end - state->range[i].start; + uint32_t offset = cb->buffer_offset + state->range[i].start; + + /* Pre-a6xx, we might have ranges enabled in the shader that aren't + * used in the binning variant. 
+ */ + if (16 * v->constlen <= state->range[i].offset) + continue; + + /* and even if the start of the const buffer is before + * first_immediate, the end may not be: + */ + size = MIN2(size, (16 * v->constlen) - state->range[i].offset); + + if (size == 0) + continue; + + /* things should be aligned to vec4: */ + debug_assert((state->range[i].offset % 16) == 0); + debug_assert((size % 16) == 0); + debug_assert((offset % 16) == 0); + + if (cb->user_buffer) { + emit_const_user(ring, v, state->range[i].offset / 4, size / 4, + cb->user_buffer + state->range[i].start); + } else { + emit_const_prsc(ring, v, state->range[i].offset / 4, offset, size / 4, + cb->buffer); + } + } } static inline void ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t offset = const_state->offsets.ubo; - - /* a6xx+ uses UBO state and ldc instead of pointers emitted in - * const state and ldg: - */ - if (ctx->screen->gpu_id >= 600) - return; - - if (v->constlen > offset) { - uint32_t params = const_state->num_ubos; - uint32_t offsets[params]; - struct fd_bo *bos[params]; - - for (uint32_t i = 0; i < params; i++) { - if (i == const_state->constant_data_ubo) { - bos[i] = v->bo; - offsets[i] = v->info.constant_data_offset; - continue; - } - - struct pipe_constant_buffer *cb = &constbuf->cb[i]; - - /* If we have user pointers (constbuf 0, aka GL uniforms), upload - * them to a buffer now, and save it in the constbuf so that we - * don't have to reupload until they get changed. - */ - if (cb->user_buffer) { - struct pipe_context *pctx = &ctx->base; - u_upload_data(pctx->stream_uploader, 0, - cb->buffer_size, - 64, - cb->user_buffer, - &cb->buffer_offset, &cb->buffer); - cb->user_buffer = NULL; - } - - if ((constbuf->enabled_mask & (1 << i)) && cb->buffer) { - offsets[i] = cb->buffer_offset; - bos[i] = fd_resource(cb->buffer)->bo; - } else { - offsets[i] = 0; - bos[i] = NULL; - } - } - - assert(offset * 4 + params <= v->constlen * 4); - - emit_const_ptrs(ring, v, offset * 4, params, bos, offsets); - } + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.ubo; + + /* a6xx+ uses UBO state and ldc instead of pointers emitted in + * const state and ldg: + */ + if (ctx->screen->gpu_id >= 600) + return; + + if (v->constlen > offset) { + uint32_t params = const_state->num_ubos; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + if (i == const_state->constant_data_ubo) { + bos[i] = v->bo; + offsets[i] = v->info.constant_data_offset; + continue; + } + + struct pipe_constant_buffer *cb = &constbuf->cb[i]; + + /* If we have user pointers (constbuf 0, aka GL uniforms), upload + * them to a buffer now, and save it in the constbuf so that we + * don't have to reupload until they get changed. 
+ */ + if (cb->user_buffer) { + struct pipe_context *pctx = &ctx->base; + u_upload_data(pctx->stream_uploader, 0, cb->buffer_size, 64, + cb->user_buffer, &cb->buffer_offset, &cb->buffer); + cb->user_buffer = NULL; + } + + if ((constbuf->enabled_mask & (1 << i)) && cb->buffer) { + offsets[i] = cb->buffer_offset; + bos[i] = fd_resource(cb->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + assert(offset * 4 + params <= v->constlen * 4); + + emit_const_ptrs(ring, v, offset * 4, params, bos, offsets); + } } static inline void -ir3_emit_ssbo_sizes(struct fd_screen *screen, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_shaderbuf_stateobj *sb) +ir3_emit_ssbo_sizes(struct fd_screen *screen, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, + struct fd_shaderbuf_stateobj *sb) { - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t offset = const_state->offsets.ssbo_sizes; - if (v->constlen > offset) { - uint32_t sizes[align(const_state->ssbo_size.count, 4)]; - unsigned mask = const_state->ssbo_size.mask; - - while (mask) { - unsigned index = u_bit_scan(&mask); - unsigned off = const_state->ssbo_size.off[index]; - sizes[off] = sb->sb[index].buffer_size; - } - - emit_const_user(ring, v, offset * 4, ARRAY_SIZE(sizes), sizes); - } + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.ssbo_sizes; + if (v->constlen > offset) { + uint32_t sizes[align(const_state->ssbo_size.count, 4)]; + unsigned mask = const_state->ssbo_size.mask; + + while (mask) { + unsigned index = u_bit_scan(&mask); + unsigned off = const_state->ssbo_size.off[index]; + sizes[off] = sb->sb[index].buffer_size; + } + + emit_const_user(ring, v, offset * 4, ARRAY_SIZE(sizes), sizes); + } } static inline void -ir3_emit_image_dims(struct fd_screen *screen, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si) +ir3_emit_image_dims(struct fd_screen *screen, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, + struct fd_shaderimg_stateobj *si) { - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t offset = const_state->offsets.image_dims; - if (v->constlen > offset) { - uint32_t dims[align(const_state->image_dims.count, 4)]; - unsigned mask = const_state->image_dims.mask; - - while (mask) { - struct pipe_image_view *img; - struct fd_resource *rsc; - unsigned index = u_bit_scan(&mask); - unsigned off = const_state->image_dims.off[index]; - - img = &si->si[index]; - rsc = fd_resource(img->resource); - - dims[off + 0] = util_format_get_blocksize(img->format); - if (img->resource->target != PIPE_BUFFER) { - struct fdl_slice *slice = - fd_resource_slice(rsc, img->u.tex.level); - /* note for 2d/cube/etc images, even if re-interpreted - * as a different color format, the pixel size should - * be the same, so use original dimensions for y and z - * stride: - */ - dims[off + 1] = fd_resource_pitch(rsc, img->u.tex.level); - /* see corresponding logic in fd_resource_offset(): */ - if (rsc->layout.layer_first) { - dims[off + 2] = rsc->layout.layer_size; - } else { - dims[off + 2] = slice->size0; - } - } else { - /* For buffer-backed images, the log2 of the format's - * bytes-per-pixel is placed on the 2nd slot. This is useful - * when emitting image_size instructions, for which we need - * to divide by bpp for image buffers. 
Since the bpp - * can only be power-of-two, the division is implemented - * as a SHR, and for that it is handy to have the log2 of - * bpp as a constant. (log2 = first-set-bit - 1) - */ - dims[off + 1] = ffs(dims[off + 0]) - 1; - } - } - uint32_t size = MIN2(ARRAY_SIZE(dims), v->constlen * 4 - offset * 4); - - emit_const_user(ring, v, offset * 4, size, dims); - } + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.image_dims; + if (v->constlen > offset) { + uint32_t dims[align(const_state->image_dims.count, 4)]; + unsigned mask = const_state->image_dims.mask; + + while (mask) { + struct pipe_image_view *img; + struct fd_resource *rsc; + unsigned index = u_bit_scan(&mask); + unsigned off = const_state->image_dims.off[index]; + + img = &si->si[index]; + rsc = fd_resource(img->resource); + + dims[off + 0] = util_format_get_blocksize(img->format); + if (img->resource->target != PIPE_BUFFER) { + struct fdl_slice *slice = fd_resource_slice(rsc, img->u.tex.level); + /* note for 2d/cube/etc images, even if re-interpreted + * as a different color format, the pixel size should + * be the same, so use original dimensions for y and z + * stride: + */ + dims[off + 1] = fd_resource_pitch(rsc, img->u.tex.level); + /* see corresponding logic in fd_resource_offset(): */ + if (rsc->layout.layer_first) { + dims[off + 2] = rsc->layout.layer_size; + } else { + dims[off + 2] = slice->size0; + } + } else { + /* For buffer-backed images, the log2 of the format's + * bytes-per-pixel is placed on the 2nd slot. This is useful + * when emitting image_size instructions, for which we need + * to divide by bpp for image buffers. Since the bpp + * can only be power-of-two, the division is implemented + * as a SHR, and for that it is handy to have the log2 of + * bpp as a constant. (log2 = first-set-bit - 1) + */ + dims[off + 1] = ffs(dims[off + 0]) - 1; + } + } + uint32_t size = MIN2(ARRAY_SIZE(dims), v->constlen * 4 - offset * 4); + + emit_const_user(ring, v, offset * 4, size, dims); + } } static inline void -ir3_emit_immediates(struct fd_screen *screen, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring) +ir3_emit_immediates(struct fd_screen *screen, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t base = const_state->offsets.immediate; - int size = DIV_ROUND_UP(const_state->immediates_count, 4); - - /* truncate size to avoid writing constants that shader - * does not use: - */ - size = MIN2(size + base, v->constlen) - base; - - /* convert out of vec4: */ - base *= 4; - size *= 4; - - if (size > 0) - emit_const_user(ring, v, base, size, const_state->immediates); - - /* NIR constant data has the same lifetime as immediates, so upload it - * now, too. - */ - ir3_emit_constant_data(screen, v, ring); + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t base = const_state->offsets.immediate; + int size = DIV_ROUND_UP(const_state->immediates_count, 4); + + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; + + /* convert out of vec4: */ + base *= 4; + size *= 4; + + if (size > 0) + emit_const_user(ring, v, base, size, const_state->immediates); + + /* NIR constant data has the same lifetime as immediates, so upload it + * now, too. 
+ */ + ir3_emit_constant_data(screen, v, ring); } static inline void ir3_emit_link_map(struct fd_screen *screen, - const struct ir3_shader_variant *producer, - const struct ir3_shader_variant *v, struct fd_ringbuffer *ring) + const struct ir3_shader_variant *producer, + const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t base = const_state->offsets.primitive_map; - int size = DIV_ROUND_UP(v->input_size, 4); + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t base = const_state->offsets.primitive_map; + int size = DIV_ROUND_UP(v->input_size, 4); - /* truncate size to avoid writing constants that shader - * does not use: - */ - size = MIN2(size + base, v->constlen) - base; + /* truncate size to avoid writing constants that shader + * does not use: + */ + size = MIN2(size + base, v->constlen) - base; - /* convert out of vec4: */ - base *= 4; - size *= 4; + /* convert out of vec4: */ + base *= 4; + size *= 4; - if (size > 0) - emit_const_user(ring, v, base, size, producer->output_loc); + if (size > 0) + emit_const_user(ring, v, base, size, producer->output_loc); } /* emit stream-out buffers: */ static inline void emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v, - struct fd_ringbuffer *ring) + struct fd_ringbuffer *ring) { - /* streamout addresses after driver-params: */ - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t offset = const_state->offsets.tfbo; - if (v->constlen > offset) { - struct fd_streamout_stateobj *so = &ctx->streamout; - struct ir3_stream_output_info *info = &v->shader->stream_output; - uint32_t params = 4; - uint32_t offsets[params]; - struct fd_bo *bos[params]; - - for (uint32_t i = 0; i < params; i++) { - struct pipe_stream_output_target *target = so->targets[i]; - - if (target) { - offsets[i] = (so->offsets[i] * info->stride[i] * 4) + - target->buffer_offset; - bos[i] = fd_resource(target->buffer)->bo; - } else { - offsets[i] = 0; - bos[i] = NULL; - } - } - - assert(offset * 4 + params <= v->constlen * 4); - - emit_const_ptrs(ring, v, offset * 4, params, bos, offsets); - } + /* streamout addresses after driver-params: */ + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.tfbo; + if (v->constlen > offset) { + struct fd_streamout_stateobj *so = &ctx->streamout; + struct ir3_stream_output_info *info = &v->shader->stream_output; + uint32_t params = 4; + uint32_t offsets[params]; + struct fd_bo *bos[params]; + + for (uint32_t i = 0; i < params; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + + if (target) { + offsets[i] = + (so->offsets[i] * info->stride[i] * 4) + target->buffer_offset; + bos[i] = fd_resource(target->buffer)->bo; + } else { + offsets[i] = 0; + bos[i] = NULL; + } + } + + assert(offset * 4 + params <= v->constlen * 4); + + emit_const_ptrs(ring, v, offset * 4, params, bos, offsets); + } } static inline void -emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, enum pipe_shader_type t) - assert_dt +emit_common_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + enum pipe_shader_type t) assert_dt { - enum fd_dirty_shader_state dirty = ctx->dirty_shader[t]; - - /* When we use CP_SET_DRAW_STATE objects to emit constant state, - * if we emit any of it we need to emit all. 
This is because - * we are using the same state-group-id each time for uniform - * state, and if previous update is never evaluated (due to no - * visible primitives in the current tile) then the new stateobj - * completely replaces the old one. - * - * Possibly if we split up different parts of the const state to - * different state-objects we could avoid this. - */ - if (dirty && is_stateobj(ring)) - dirty = ~0; - - if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) { - struct fd_constbuf_stateobj *constbuf; - bool shader_dirty; - - constbuf = &ctx->constbuf[t]; - shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG); - - ring_wfi(ctx->batch, ring); - - ir3_emit_user_consts(ctx->screen, v, ring, constbuf); - ir3_emit_ubos(ctx, v, ring, constbuf); - if (shader_dirty) - ir3_emit_immediates(ctx->screen, v, ring); - } - - if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_SSBO)) { - struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t]; - ring_wfi(ctx->batch, ring); - ir3_emit_ssbo_sizes(ctx->screen, v, ring, sb); - } - - if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) { - struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t]; - ring_wfi(ctx->batch, ring); - ir3_emit_image_dims(ctx->screen, v, ring, si); - } + enum fd_dirty_shader_state dirty = ctx->dirty_shader[t]; + + /* When we use CP_SET_DRAW_STATE objects to emit constant state, + * if we emit any of it we need to emit all. This is because + * we are using the same state-group-id each time for uniform + * state, and if previous update is never evaluated (due to no + * visible primitives in the current tile) then the new stateobj + * completely replaces the old one. + * + * Possibly if we split up different parts of the const state to + * different state-objects we could avoid this. + */ + if (dirty && is_stateobj(ring)) + dirty = ~0; + + if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) { + struct fd_constbuf_stateobj *constbuf; + bool shader_dirty; + + constbuf = &ctx->constbuf[t]; + shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG); + + ring_wfi(ctx->batch, ring); + + ir3_emit_user_consts(ctx->screen, v, ring, constbuf); + ir3_emit_ubos(ctx, v, ring, constbuf); + if (shader_dirty) + ir3_emit_immediates(ctx->screen, v, ring); + } + + if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_SSBO)) { + struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t]; + ring_wfi(ctx->batch, ring); + ir3_emit_ssbo_sizes(ctx->screen, v, ring, sb); + } + + if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) { + struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t]; + ring_wfi(ctx->batch, ring); + ir3_emit_image_dims(ctx->screen, v, ring, si); + } } static inline void @@ -462,171 +466,167 @@ ir3_emit_vs_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_context *ctx, const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draw) - assert_dt + const struct pipe_draw_start_count *draw) assert_dt { - assert(v->need_driver_params); - - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t offset = const_state->offsets.driver_param; - uint32_t vertex_params[IR3_DP_VS_COUNT] = { - [IR3_DP_DRAWID] = 0, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */ - [IR3_DP_VTXID_BASE] = info->index_size ? 
- info->index_bias : draw->start, - [IR3_DP_INSTID_BASE] = info->start_instance, - [IR3_DP_VTXCNT_MAX] = ctx->streamout.max_tf_vtx, - }; - if (v->key.ucp_enables) { - struct pipe_clip_state *ucp = &ctx->ucp; - unsigned pos = IR3_DP_UCP0_X; - for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) { - for (unsigned j = 0; j < 4; j++) { - vertex_params[pos] = fui(ucp->ucp[i][j]); - pos++; - } - } - } - - /* Only emit as many params as needed, i.e. up to the highest enabled UCP - * plane. However a binning pass may drop even some of these, so limit to - * program max. - */ - const uint32_t vertex_params_size = MIN2( - const_state->num_driver_params, - (v->constlen - offset) * 4); - assert(vertex_params_size <= IR3_DP_VS_COUNT); - - bool needs_vtxid_base = - ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0); - - /* for indirect draw, we need to copy VTXID_BASE from - * indirect-draw parameters buffer.. which is annoying - * and means we can't easily emit these consts in cmd - * stream so need to copy them to bo. - */ - if (indirect && needs_vtxid_base) { - struct pipe_resource *vertex_params_rsc = - pipe_buffer_create(&ctx->screen->base, - PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM, - vertex_params_size * 4); - unsigned src_off = indirect->offset;; - void *ptr; - - ptr = fd_bo_map(fd_resource(vertex_params_rsc)->bo); - memcpy(ptr, vertex_params, vertex_params_size * 4); - - if (info->index_size) { - /* indexed draw, index_bias is 4th field: */ - src_off += 3 * 4; - } else { - /* non-indexed draw, start is 3rd field: */ - src_off += 2 * 4; - } - - /* copy index_bias or start from draw params: */ - ctx->screen->mem_to_mem(ring, vertex_params_rsc, 0, - indirect->buffer, src_off, 1); - - emit_const_prsc(ring, v, offset * 4, 0, - vertex_params_size, vertex_params_rsc); - - pipe_resource_reference(&vertex_params_rsc, NULL); - } else { - emit_const_user(ring, v, offset * 4, - vertex_params_size, vertex_params); - } - - /* if needed, emit stream-out buffer addresses: */ - if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { - emit_tfbos(ctx, v, ring); - } + assert(v->need_driver_params); + + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.driver_param; + uint32_t vertex_params[IR3_DP_VS_COUNT] = { + [IR3_DP_DRAWID] = 0, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */ + [IR3_DP_VTXID_BASE] = info->index_size ? info->index_bias : draw->start, + [IR3_DP_INSTID_BASE] = info->start_instance, + [IR3_DP_VTXCNT_MAX] = ctx->streamout.max_tf_vtx, + }; + if (v->key.ucp_enables) { + struct pipe_clip_state *ucp = &ctx->ucp; + unsigned pos = IR3_DP_UCP0_X; + for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) { + for (unsigned j = 0; j < 4; j++) { + vertex_params[pos] = fui(ucp->ucp[i][j]); + pos++; + } + } + } + + /* Only emit as many params as needed, i.e. up to the highest enabled UCP + * plane. However a binning pass may drop even some of these, so limit to + * program max. + */ + const uint32_t vertex_params_size = + MIN2(const_state->num_driver_params, (v->constlen - offset) * 4); + assert(vertex_params_size <= IR3_DP_VS_COUNT); + + bool needs_vtxid_base = + ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != + regid(63, 0); + + /* for indirect draw, we need to copy VTXID_BASE from + * indirect-draw parameters buffer.. which is annoying + * and means we can't easily emit these consts in cmd + * stream so need to copy them to bo. 
+ */ + if (indirect && needs_vtxid_base) { + struct pipe_resource *vertex_params_rsc = + pipe_buffer_create(&ctx->screen->base, PIPE_BIND_CONSTANT_BUFFER, + PIPE_USAGE_STREAM, vertex_params_size * 4); + unsigned src_off = indirect->offset; + ; + void *ptr; + + ptr = fd_bo_map(fd_resource(vertex_params_rsc)->bo); + memcpy(ptr, vertex_params, vertex_params_size * 4); + + if (info->index_size) { + /* indexed draw, index_bias is 4th field: */ + src_off += 3 * 4; + } else { + /* non-indexed draw, start is 3rd field: */ + src_off += 2 * 4; + } + + /* copy index_bias or start from draw params: */ + ctx->screen->mem_to_mem(ring, vertex_params_rsc, 0, indirect->buffer, + src_off, 1); + + emit_const_prsc(ring, v, offset * 4, 0, vertex_params_size, + vertex_params_rsc); + + pipe_resource_reference(&vertex_params_rsc, NULL); + } else { + emit_const_user(ring, v, offset * 4, vertex_params_size, vertex_params); + } + + /* if needed, emit stream-out buffer addresses: */ + if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { + emit_tfbos(ctx, v, ring); + } } static inline void -ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, const struct pipe_draw_info *info, +ir3_emit_vs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_draw_info *info, const struct pipe_draw_indirect_info *indirect, - const struct pipe_draw_start_count *draw) - assert_dt + const struct pipe_draw_start_count *draw) assert_dt { - debug_assert(v->type == MESA_SHADER_VERTEX); + debug_assert(v->type == MESA_SHADER_VERTEX); - emit_common_consts(v, ring, ctx, PIPE_SHADER_VERTEX); + emit_common_consts(v, ring, ctx, PIPE_SHADER_VERTEX); - /* emit driver params every time: */ - if (info && v->need_driver_params) { - ring_wfi(ctx->batch, ring); - ir3_emit_vs_driver_params(v, ring, ctx, info, indirect, draw); - } + /* emit driver params every time: */ + if (info && v->need_driver_params) { + ring_wfi(ctx->batch, ring); + ir3_emit_vs_driver_params(v, ring, ctx, info, indirect, draw); + } } static inline void -ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx) - assert_dt +ir3_emit_fs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx) assert_dt { - debug_assert(v->type == MESA_SHADER_FRAGMENT); + debug_assert(v->type == MESA_SHADER_FRAGMENT); - emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT); + emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT); } /* emit compute-shader consts: */ static inline void -ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_context *ctx, const struct pipe_grid_info *info) - assert_dt +ir3_emit_cs_consts(const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_context *ctx, + const struct pipe_grid_info *info) assert_dt { - debug_assert(gl_shader_stage_is_compute(v->type)); - - emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE); - - /* emit compute-shader driver-params: */ - const struct ir3_const_state *const_state = ir3_const_state(v); - uint32_t offset = const_state->offsets.driver_param; - if (v->constlen > offset) { - ring_wfi(ctx->batch, ring); - - if (info->indirect) { - struct pipe_resource *indirect = NULL; - unsigned indirect_offset; - - /* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs - * to be aligned more strongly than 4 bytes. So in this case - * we need a temporary buffer to copy NumWorkGroups.xyz to. 
- * - * TODO if previous compute job is writing to info->indirect, - * we might need a WFI.. but since we currently flush for each - * compute job, we are probably ok for now. - */ - if (info->indirect_offset & 0xf) { - indirect = pipe_buffer_create(&ctx->screen->base, - PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM, - 0x1000); - indirect_offset = 0; - - ctx->screen->mem_to_mem(ring, indirect, 0, info->indirect, - info->indirect_offset, 3); - } else { - pipe_resource_reference(&indirect, info->indirect); - indirect_offset = info->indirect_offset; - } - - emit_const_prsc(ring, v, offset * 4, indirect_offset, 16, indirect); - - pipe_resource_reference(&indirect, NULL); - } else { - uint32_t compute_params[IR3_DP_CS_COUNT] = { - [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0], - [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1], - [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2], - [IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0], - [IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1], - [IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2], - }; - uint32_t size = MIN2(const_state->num_driver_params, - v->constlen * 4 - offset * 4); - - emit_const_user(ring, v, offset * 4, size, compute_params); - } - } + debug_assert(gl_shader_stage_is_compute(v->type)); + + emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE); + + /* emit compute-shader driver-params: */ + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.driver_param; + if (v->constlen > offset) { + ring_wfi(ctx->batch, ring); + + if (info->indirect) { + struct pipe_resource *indirect = NULL; + unsigned indirect_offset; + + /* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs + * to be aligned more strongly than 4 bytes. So in this case + * we need a temporary buffer to copy NumWorkGroups.xyz to. + * + * TODO if previous compute job is writing to info->indirect, + * we might need a WFI.. but since we currently flush for each + * compute job, we are probably ok for now. 
+ */ + if (info->indirect_offset & 0xf) { + indirect = pipe_buffer_create(&ctx->screen->base, + PIPE_BIND_COMMAND_ARGS_BUFFER, + PIPE_USAGE_STREAM, 0x1000); + indirect_offset = 0; + + ctx->screen->mem_to_mem(ring, indirect, 0, info->indirect, + info->indirect_offset, 3); + } else { + pipe_resource_reference(&indirect, info->indirect); + indirect_offset = info->indirect_offset; + } + + emit_const_prsc(ring, v, offset * 4, indirect_offset, 16, indirect); + + pipe_resource_reference(&indirect, NULL); + } else { + uint32_t compute_params[IR3_DP_CS_COUNT] = { + [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0], + [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1], + [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2], + [IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0], + [IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1], + [IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2], + }; + uint32_t size = + MIN2(const_state->num_driver_params, v->constlen * 4 - offset * 4); + + emit_const_user(ring, v, offset * 4, size, compute_params); + } + } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index d0c4c23..18e3a86 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -24,14 +24,14 @@ * Rob Clark */ -#include "pipe/p_state.h" #include "pipe/p_screen.h" -#include "util/u_string.h" -#include "util/u_memory.h" -#include "util/u_inlines.h" -#include "util/format/u_format.h" +#include "pipe/p_state.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_parse.h" +#include "util/format/u_format.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" #include "nir/tgsi_to_nir.h" @@ -39,10 +39,10 @@ #include "freedreno_util.h" #include "ir3/ir3_cache.h" -#include "ir3/ir3_shader.h" -#include "ir3/ir3_gallium.h" #include "ir3/ir3_compiler.h" +#include "ir3/ir3_gallium.h" #include "ir3/ir3_nir.h" +#include "ir3/ir3_shader.h" /** * The hardware cso for shader state @@ -51,10 +51,10 @@ * plumb in async compile. 
*/ struct ir3_shader_state { - struct ir3_shader *shader; + struct ir3_shader *shader; - /* Fence signalled when async compile is completed: */ - struct util_queue_fence ready; + /* Fence signalled when async compile is completed: */ + struct util_queue_fence ready; }; /** @@ -68,211 +68,197 @@ struct ir3_shader_state { static bool initial_variants_synchronous(struct fd_context *ctx) { - return unlikely(ctx->debug.debug_message) || - FD_DBG(SHADERDB) || FD_DBG(SERIALC); + return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) || + FD_DBG(SERIALC); } static void -dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug) +dump_shader_info(struct ir3_shader_variant *v, + struct pipe_debug_callback *debug) { - if (!FD_DBG(SHADERDB)) - return; - - pipe_debug_message(debug, SHADER_INFO, - "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, " - "%u dwords, %u last-baryf, %u half, %u full, %u constlen, " - "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, " - "%u sstall, %u (ss), %u (sy), %d waves, %d max_sun, %d loops\n", - ir3_shader_stage(v), - v->info.instrs_count, - v->info.nops_count, - v->info.instrs_count - v->info.nops_count, - v->info.mov_count, - v->info.cov_count, - v->info.sizedwords, - v->info.last_baryf, - v->info.max_half_reg + 1, - v->info.max_reg + 1, - v->constlen, - v->info.instrs_per_cat[0], - v->info.instrs_per_cat[1], - v->info.instrs_per_cat[2], - v->info.instrs_per_cat[3], - v->info.instrs_per_cat[4], - v->info.instrs_per_cat[5], - v->info.instrs_per_cat[6], - v->info.instrs_per_cat[7], - v->info.sstall, - v->info.ss, v->info.sy, - v->info.max_waves, - v->max_sun, v->loops); + if (!FD_DBG(SHADERDB)) + return; + + pipe_debug_message( + debug, SHADER_INFO, + "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, " + "%u dwords, %u last-baryf, %u half, %u full, %u constlen, " + "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, " + "%u sstall, %u (ss), %u (sy), %d waves, %d max_sun, %d loops\n", + ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count, + v->info.instrs_count - v->info.nops_count, v->info.mov_count, + v->info.cov_count, v->info.sizedwords, v->info.last_baryf, + v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen, + v->info.instrs_per_cat[0], v->info.instrs_per_cat[1], + v->info.instrs_per_cat[2], v->info.instrs_per_cat[3], + v->info.instrs_per_cat[4], v->info.instrs_per_cat[5], + v->info.instrs_per_cat[6], v->info.instrs_per_cat[7], v->info.sstall, + v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops); } static void upload_shader_variant(struct ir3_shader_variant *v) { - struct shader_info *info = &v->shader->nir->info; - struct ir3_compiler *compiler = v->shader->compiler; + struct shader_info *info = &v->shader->nir->info; + struct ir3_compiler *compiler = v->shader->compiler; - assert(!v->bo); + assert(!v->bo); - v->bo = fd_bo_new(compiler->dev, v->info.size, - DRM_FREEDRENO_GEM_CACHE_WCOMBINE | - DRM_FREEDRENO_GEM_TYPE_KMEM, - "%s:%s", ir3_shader_stage(v), info->name); + v->bo = + fd_bo_new(compiler->dev, v->info.size, + DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM, + "%s:%s", ir3_shader_stage(v), info->name); - /* Always include shaders in kernel crash dumps. */ - fd_bo_mark_for_dump(v->bo); + /* Always include shaders in kernel crash dumps. 
*/ + fd_bo_mark_for_dump(v->bo); - memcpy(fd_bo_map(v->bo), v->bin, v->info.size); + memcpy(fd_bo_map(v->bo), v->bin, v->info.size); } struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key, - bool binning_pass, struct pipe_debug_callback *debug) + bool binning_pass, struct pipe_debug_callback *debug) { - struct ir3_shader_variant *v; - bool created = false; - - /* Some shader key values may not be used by a given ir3_shader (for - * example, fragment shader saturates in the vertex shader), so clean out - * those flags to avoid recompiling. - */ - ir3_key_clear_unused(&key, shader); - - v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created); - - if (created) { - if (shader->initial_variants_done) { - pipe_debug_message(debug, SHADER_INFO, - "%s shader: recompiling at draw time: global 0x%08x, vfsamples %x/%x, astc %x/%x\n", - ir3_shader_stage(v), - key.global, - key.vsamples, key.fsamples, - key.vastc_srgb, key.fastc_srgb); - - } - - dump_shader_info(v, debug); - upload_shader_variant(v); - - if (v->binning) { - upload_shader_variant(v->binning); - dump_shader_info(v->binning, debug); - } - } - - return v; + struct ir3_shader_variant *v; + bool created = false; + + /* Some shader key values may not be used by a given ir3_shader (for + * example, fragment shader saturates in the vertex shader), so clean out + * those flags to avoid recompiling. + */ + ir3_key_clear_unused(&key, shader); + + v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created); + + if (created) { + if (shader->initial_variants_done) { + pipe_debug_message(debug, SHADER_INFO, + "%s shader: recompiling at draw time: global " + "0x%08x, vfsamples %x/%x, astc %x/%x\n", + ir3_shader_stage(v), key.global, key.vsamples, + key.fsamples, key.vastc_srgb, key.fastc_srgb); + } + + dump_shader_info(v, debug); + upload_shader_variant(v); + + if (v->binning) { + upload_shader_variant(v->binning); + dump_shader_info(v->binning, debug); + } + } + + return v; } static void copy_stream_out(struct ir3_stream_output_info *i, - const struct pipe_stream_output_info *p) + const struct pipe_stream_output_info *p) { - STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride)); - STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output)); - - i->num_outputs = p->num_outputs; - for (int n = 0; n < ARRAY_SIZE(i->stride); n++) - i->stride[n] = p->stride[n]; - - for (int n = 0; n < ARRAY_SIZE(i->output); n++) { - i->output[n].register_index = p->output[n].register_index; - i->output[n].start_component = p->output[n].start_component; - i->output[n].num_components = p->output[n].num_components; - i->output[n].output_buffer = p->output[n].output_buffer; - i->output[n].dst_offset = p->output[n].dst_offset; - i->output[n].stream = p->output[n].stream; - } + STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride)); + STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output)); + + i->num_outputs = p->num_outputs; + for (int n = 0; n < ARRAY_SIZE(i->stride); n++) + i->stride[n] = p->stride[n]; + + for (int n = 0; n < ARRAY_SIZE(i->output); n++) { + i->output[n].register_index = p->output[n].register_index; + i->output[n].start_component = p->output[n].start_component; + i->output[n].num_components = p->output[n].num_components; + i->output[n].output_buffer = p->output[n].output_buffer; + i->output[n].dst_offset = p->output[n].dst_offset; + i->output[n].stream = p->output[n].stream; + } } static void create_initial_variants(struct ir3_shader_state *hwcso, - struct 
pipe_debug_callback *debug) + struct pipe_debug_callback *debug) { - struct ir3_shader *shader = hwcso->shader; - struct ir3_compiler *compiler = shader->compiler; - nir_shader *nir = shader->nir; - - /* Compile standard variants immediately to try to avoid draw-time stalls - * to run the compiler. - */ - struct ir3_shader_key key = { - .tessellation = IR3_TESS_NONE, - .ucp_enables = MASK(nir->info.clip_distance_array_size), - .msaa = true, - }; - - switch (nir->info.stage) { - case MESA_SHADER_TESS_EVAL: - key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode); - break; - - case MESA_SHADER_TESS_CTRL: - /* The primitive_mode field, while it exists for TCS, is not - * populated (since separable shaders between TCS/TES are legal, - * so TCS wouldn't have access to TES's declaration). Make a - * guess so that we shader-db something plausible for TCS. - */ - if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER) - key.tessellation = IR3_TESS_TRIANGLES; - else - key.tessellation = IR3_TESS_ISOLINES; - break; - - case MESA_SHADER_GEOMETRY: - key.has_gs = true; - break; - - default: - break; - } - - key.safe_constlen = false; - struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug); - if (!v) - return; - - if (v->constlen > compiler->max_const_safe) { - key.safe_constlen = true; - ir3_shader_variant(shader, key, false, debug); - } - - /* For vertex shaders, also compile initial binning pass shader: */ - if (nir->info.stage == MESA_SHADER_VERTEX) { - key.safe_constlen = false; - v = ir3_shader_variant(shader, key, true, debug); - if (!v) - return; - - if (v->constlen > compiler->max_const_safe) { - key.safe_constlen = true; - ir3_shader_variant(shader, key, true, debug); - } - } - - shader->initial_variants_done = true; + struct ir3_shader *shader = hwcso->shader; + struct ir3_compiler *compiler = shader->compiler; + nir_shader *nir = shader->nir; + + /* Compile standard variants immediately to try to avoid draw-time stalls + * to run the compiler. + */ + struct ir3_shader_key key = { + .tessellation = IR3_TESS_NONE, + .ucp_enables = MASK(nir->info.clip_distance_array_size), + .msaa = true, + }; + + switch (nir->info.stage) { + case MESA_SHADER_TESS_EVAL: + key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode); + break; + + case MESA_SHADER_TESS_CTRL: + /* The primitive_mode field, while it exists for TCS, is not + * populated (since separable shaders between TCS/TES are legal, + * so TCS wouldn't have access to TES's declaration). Make a + * guess so that we shader-db something plausible for TCS. 
+ */ + if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER) + key.tessellation = IR3_TESS_TRIANGLES; + else + key.tessellation = IR3_TESS_ISOLINES; + break; + + case MESA_SHADER_GEOMETRY: + key.has_gs = true; + break; + + default: + break; + } + + key.safe_constlen = false; + struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug); + if (!v) + return; + + if (v->constlen > compiler->max_const_safe) { + key.safe_constlen = true; + ir3_shader_variant(shader, key, false, debug); + } + + /* For vertex shaders, also compile initial binning pass shader: */ + if (nir->info.stage == MESA_SHADER_VERTEX) { + key.safe_constlen = false; + v = ir3_shader_variant(shader, key, true, debug); + if (!v) + return; + + if (v->constlen > compiler->max_const_safe) { + key.safe_constlen = true; + ir3_shader_variant(shader, key, true, debug); + } + } + + shader->initial_variants_done = true; } static void create_initial_variants_async(void *job, int thread_index) { - struct ir3_shader_state *hwcso = job; - struct pipe_debug_callback debug = {}; + struct ir3_shader_state *hwcso = job; + struct pipe_debug_callback debug = {}; - create_initial_variants(hwcso, &debug); + create_initial_variants(hwcso, &debug); } static void create_initial_compute_variants_async(void *job, int thread_index) { - struct ir3_shader_state *hwcso = job; - struct ir3_shader *shader = hwcso->shader; - struct pipe_debug_callback debug = {}; - static struct ir3_shader_key key; /* static is implicitly zeroed */ + struct ir3_shader_state *hwcso = job; + struct ir3_shader *shader = hwcso->shader; + struct pipe_debug_callback debug = {}; + static struct ir3_shader_key key; /* static is implicitly zeroed */ - ir3_shader_variant(shader, key, false, &debug); - shader->initial_variants_done = true; + ir3_shader_variant(shader, key, false, &debug); + shader->initial_variants_done = true; } /* a bit annoying that compute-shader and normal shader state objects @@ -280,172 +266,172 @@ create_initial_compute_variants_async(void *job, int thread_index) */ void * ir3_shader_compute_state_create(struct pipe_context *pctx, - const struct pipe_compute_state *cso) + const struct pipe_compute_state *cso) { - struct fd_context *ctx = fd_context(pctx); - - /* req_input_mem will only be non-zero for cl kernels (ie. clover). - * This isn't a perfect test because I guess it is possible (but - * uncommon) for none for the kernel parameters to be a global, - * but ctx->set_global_bindings() can't fail, so this is the next - * best place to fail if we need a newer version of kernel driver: - */ - if ((cso->req_input_mem > 0) && - fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) { - return NULL; - } - - struct ir3_compiler *compiler = ctx->screen->compiler; - nir_shader *nir; - - if (cso->ir_type == PIPE_SHADER_IR_NIR) { - /* we take ownership of the reference: */ - nir = (nir_shader *)cso->prog; - } else { - debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI); - if (ir3_shader_debug & IR3_DBG_DISASM) { - tgsi_dump(cso->prog, 0); - } - nir = tgsi_to_nir(cso->prog, pctx->screen, false); - } - - struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL); - struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso)); - - util_queue_fence_init(&hwcso->ready); - hwcso->shader = shader; - - /* Immediately compile a standard variant. We have so few variants in our - * shaders, that doing so almost eliminates draw-time recompiles. 
(This - * is also how we get data from shader-db's ./run) - */ - - if (initial_variants_synchronous(ctx)) { - static struct ir3_shader_key key; /* static is implicitly zeroed */ - ir3_shader_variant(shader, key, false, &ctx->debug); - shader->initial_variants_done = true; - } else { - struct fd_screen *screen = ctx->screen; - util_queue_add_job(&screen->compile_queue, hwcso, - &hwcso->ready, create_initial_compute_variants_async, - NULL, 0); - } - - return hwcso; + struct fd_context *ctx = fd_context(pctx); + + /* req_input_mem will only be non-zero for cl kernels (ie. clover). + * This isn't a perfect test because I guess it is possible (but + * uncommon) for none for the kernel parameters to be a global, + * but ctx->set_global_bindings() can't fail, so this is the next + * best place to fail if we need a newer version of kernel driver: + */ + if ((cso->req_input_mem > 0) && + fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) { + return NULL; + } + + struct ir3_compiler *compiler = ctx->screen->compiler; + nir_shader *nir; + + if (cso->ir_type == PIPE_SHADER_IR_NIR) { + /* we take ownership of the reference: */ + nir = (nir_shader *)cso->prog; + } else { + debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI); + if (ir3_shader_debug & IR3_DBG_DISASM) { + tgsi_dump(cso->prog, 0); + } + nir = tgsi_to_nir(cso->prog, pctx->screen, false); + } + + struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL); + struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso)); + + util_queue_fence_init(&hwcso->ready); + hwcso->shader = shader; + + /* Immediately compile a standard variant. We have so few variants in our + * shaders, that doing so almost eliminates draw-time recompiles. (This + * is also how we get data from shader-db's ./run) + */ + + if (initial_variants_synchronous(ctx)) { + static struct ir3_shader_key key; /* static is implicitly zeroed */ + ir3_shader_variant(shader, key, false, &ctx->debug); + shader->initial_variants_done = true; + } else { + struct fd_screen *screen = ctx->screen; + util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready, + create_initial_compute_variants_async, NULL, 0); + } + + return hwcso; } void * -ir3_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso) +ir3_shader_state_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso) { - struct fd_context *ctx = fd_context(pctx); - struct ir3_compiler *compiler = ctx->screen->compiler; - struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso)); - - /* - * Convert to nir (if necessary): - */ - - nir_shader *nir; - if (cso->type == PIPE_SHADER_IR_NIR) { - /* we take ownership of the reference: */ - nir = cso->ir.nir; - } else { - debug_assert(cso->type == PIPE_SHADER_IR_TGSI); - if (ir3_shader_debug & IR3_DBG_DISASM) { - tgsi_dump(cso->tokens, 0); - } - nir = tgsi_to_nir(cso->tokens, pctx->screen, false); - } - - /* - * Create ir3_shader: - * - * This part is cheap, it doesn't compile initial variants - */ - - struct ir3_stream_output_info stream_output = {}; - copy_stream_out(&stream_output, &cso->stream_output); - - hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output); - - /* - * Create initial variants to avoid draw-time stalls. 
This is - * normally done asynchronously, unless debug is enabled (which - * will be the case for shader-db) - */ - - util_queue_fence_init(&hwcso->ready); - - if (initial_variants_synchronous(ctx)) { - create_initial_variants(hwcso, &ctx->debug); - } else { - util_queue_add_job(&ctx->screen->compile_queue, hwcso, - &hwcso->ready, create_initial_variants_async, - NULL, 0); - } - - return hwcso; + struct fd_context *ctx = fd_context(pctx); + struct ir3_compiler *compiler = ctx->screen->compiler; + struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso)); + + /* + * Convert to nir (if necessary): + */ + + nir_shader *nir; + if (cso->type == PIPE_SHADER_IR_NIR) { + /* we take ownership of the reference: */ + nir = cso->ir.nir; + } else { + debug_assert(cso->type == PIPE_SHADER_IR_TGSI); + if (ir3_shader_debug & IR3_DBG_DISASM) { + tgsi_dump(cso->tokens, 0); + } + nir = tgsi_to_nir(cso->tokens, pctx->screen, false); + } + + /* + * Create ir3_shader: + * + * This part is cheap, it doesn't compile initial variants + */ + + struct ir3_stream_output_info stream_output = {}; + copy_stream_out(&stream_output, &cso->stream_output); + + hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output); + + /* + * Create initial variants to avoid draw-time stalls. This is + * normally done asynchronously, unless debug is enabled (which + * will be the case for shader-db) + */ + + util_queue_fence_init(&hwcso->ready); + + if (initial_variants_synchronous(ctx)) { + create_initial_variants(hwcso, &ctx->debug); + } else { + util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready, + create_initial_variants_async, NULL, 0); + } + + return hwcso; } void ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso) { - struct fd_context *ctx = fd_context(pctx); - struct fd_screen *screen = ctx->screen; - struct ir3_shader_state *hwcso = _hwcso; - struct ir3_shader *so = hwcso->shader; - - ir3_cache_invalidate(ctx->shader_cache, hwcso); - - /* util_queue_drop_job() guarantees that either: - * 1) job did not execute - * 2) job completed - * - * In either case the fence is signaled - */ - util_queue_drop_job(&screen->compile_queue, &hwcso->ready); - - /* free the uploaded shaders, since this is handled outside of the - * shared ir3 code (ie. not used by turnip): - */ - for (struct ir3_shader_variant *v = so->variants; v; v = v->next) { - fd_bo_del(v->bo); - v->bo = NULL; - - if (v->binning && v->binning->bo) { - fd_bo_del(v->binning->bo); - v->binning->bo = NULL; - } - } - - ir3_shader_destroy(so); - util_queue_fence_destroy(&hwcso->ready); - free(hwcso); + struct fd_context *ctx = fd_context(pctx); + struct fd_screen *screen = ctx->screen; + struct ir3_shader_state *hwcso = _hwcso; + struct ir3_shader *so = hwcso->shader; + + ir3_cache_invalidate(ctx->shader_cache, hwcso); + + /* util_queue_drop_job() guarantees that either: + * 1) job did not execute + * 2) job completed + * + * In either case the fence is signaled + */ + util_queue_drop_job(&screen->compile_queue, &hwcso->ready); + + /* free the uploaded shaders, since this is handled outside of the + * shared ir3 code (ie. 
not used by turnip): + */ + for (struct ir3_shader_variant *v = so->variants; v; v = v->next) { + fd_bo_del(v->bo); + v->bo = NULL; + + if (v->binning && v->binning->bo) { + fd_bo_del(v->binning->bo); + v->binning->bo = NULL; + } + } + + ir3_shader_destroy(so); + util_queue_fence_destroy(&hwcso->ready); + free(hwcso); } struct ir3_shader * ir3_get_shader(struct ir3_shader_state *hwcso) { - if (!hwcso) - return NULL; - - struct ir3_shader *shader = hwcso->shader; - perf_time(1000, "waited for %s:%s:%s variants", - _mesa_shader_stage_to_abbrev(shader->type), - shader->nir->info.name, shader->nir->info.label) { - /* wait for initial variants to compile: */ - util_queue_fence_wait(&hwcso->ready); - } - - return shader; + if (!hwcso) + return NULL; + + struct ir3_shader *shader = hwcso->shader; + perf_time(1000, "waited for %s:%s:%s variants", + _mesa_shader_stage_to_abbrev(shader->type), shader->nir->info.name, + shader->nir->info.label) + { + /* wait for initial variants to compile: */ + util_queue_fence_wait(&hwcso->ready); + } + + return shader; } struct shader_info * ir3_get_shader_info(struct ir3_shader_state *hwcso) { - if (!hwcso) - return NULL; - return &hwcso->shader->nir->info; + if (!hwcso) + return NULL; + return &hwcso->shader->nir->info; } /* fixup dirty shader state in case some "unrelated" (from the state- @@ -455,147 +441,151 @@ ir3_get_shader_info(struct ir3_shader_state *hwcso) void ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key) { - struct fd_context *ctx = fd_context(pctx); + struct fd_context *ctx = fd_context(pctx); - if (!ir3_shader_key_equal(ctx->last.key, key)) { - if (ir3_shader_key_changes_fs(ctx->last.key, key)) { - fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT, FD_DIRTY_SHADER_PROG); - } + if (!ir3_shader_key_equal(ctx->last.key, key)) { + if (ir3_shader_key_changes_fs(ctx->last.key, key)) { + fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT, + FD_DIRTY_SHADER_PROG); + } - if (ir3_shader_key_changes_vs(ctx->last.key, key)) { - fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG); - } + if (ir3_shader_key_changes_vs(ctx->last.key, key)) { + fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG); + } - /* NOTE: currently only a6xx has gs/tess, but needs no - * gs/tess specific lowering. - */ + /* NOTE: currently only a6xx has gs/tess, but needs no + * gs/tess specific lowering. + */ - *ctx->last.key = *key; - } + *ctx->last.key = *key; + } } static void ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir, bool optimize) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - ir3_finalize_nir(screen->compiler, nir); + ir3_finalize_nir(screen->compiler, nir); } static void -ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen, unsigned max_threads) +ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen, + unsigned max_threads) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - /* This function doesn't allow a greater number of threads than - * the queue had at its creation. - */ - util_queue_adjust_num_threads(&screen->compile_queue, max_threads); + /* This function doesn't allow a greater number of threads than + * the queue had at its creation. 
+ */ + util_queue_adjust_num_threads(&screen->compile_queue, max_threads); } static bool ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen, - void *shader, enum pipe_shader_type shader_type) + void *shader, + enum pipe_shader_type shader_type) { - struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader; + struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader; - return util_queue_fence_is_signalled(&hwcso->ready); + return util_queue_fence_is_signalled(&hwcso->ready); } void ir3_prog_init(struct pipe_context *pctx) { - pctx->create_vs_state = ir3_shader_state_create; - pctx->delete_vs_state = ir3_shader_state_delete; + pctx->create_vs_state = ir3_shader_state_create; + pctx->delete_vs_state = ir3_shader_state_delete; - pctx->create_tcs_state = ir3_shader_state_create; - pctx->delete_tcs_state = ir3_shader_state_delete; + pctx->create_tcs_state = ir3_shader_state_create; + pctx->delete_tcs_state = ir3_shader_state_delete; - pctx->create_tes_state = ir3_shader_state_create; - pctx->delete_tes_state = ir3_shader_state_delete; + pctx->create_tes_state = ir3_shader_state_create; + pctx->delete_tes_state = ir3_shader_state_delete; - pctx->create_gs_state = ir3_shader_state_create; - pctx->delete_gs_state = ir3_shader_state_delete; + pctx->create_gs_state = ir3_shader_state_create; + pctx->delete_gs_state = ir3_shader_state_delete; - pctx->create_fs_state = ir3_shader_state_create; - pctx->delete_fs_state = ir3_shader_state_delete; + pctx->create_fs_state = ir3_shader_state_create; + pctx->delete_fs_state = ir3_shader_state_delete; } void ir3_screen_init(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); - - screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); - - /* TODO do we want to limit things to # of fast cores, or just limit - * based on total # of both big and little cores. The little cores - * tend to be in-order and probably much slower for compiling than - * big cores. OTOH if they are sitting idle, maybe it is useful to - * use them? - */ - unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1; - - util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads, - UTIL_QUEUE_INIT_RESIZE_IF_FULL | - UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY); - - pscreen->finalize_nir = ir3_screen_finalize_nir; - pscreen->set_max_shader_compiler_threads = - ir3_set_max_shader_compiler_threads; - pscreen->is_parallel_shader_compilation_finished = - ir3_is_parallel_shader_compilation_finished; + struct fd_screen *screen = fd_screen(pscreen); + + screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); + + /* TODO do we want to limit things to # of fast cores, or just limit + * based on total # of both big and little cores. The little cores + * tend to be in-order and probably much slower for compiling than + * big cores. OTOH if they are sitting idle, maybe it is useful to + * use them? 
+ */ + unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1; + + util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads, + UTIL_QUEUE_INIT_RESIZE_IF_FULL | + UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY); + + pscreen->finalize_nir = ir3_screen_finalize_nir; + pscreen->set_max_shader_compiler_threads = + ir3_set_max_shader_compiler_threads; + pscreen->is_parallel_shader_compilation_finished = + ir3_is_parallel_shader_compilation_finished; } void ir3_screen_fini(struct pipe_screen *pscreen) { - struct fd_screen *screen = fd_screen(pscreen); + struct fd_screen *screen = fd_screen(pscreen); - util_queue_destroy(&screen->compile_queue); - ir3_compiler_destroy(screen->compiler); - screen->compiler = NULL; + util_queue_destroy(&screen->compile_queue); + ir3_compiler_destroy(screen->compiler); + screen->compiler = NULL; } void -ir3_update_max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v) +ir3_update_max_tf_vtx(struct fd_context *ctx, + const struct ir3_shader_variant *v) { - struct fd_streamout_stateobj *so = &ctx->streamout; - struct ir3_stream_output_info *info = &v->shader->stream_output; - uint32_t maxvtxcnt = 0x7fffffff; - - if (v->shader->stream_output.num_outputs == 0) - ctx->streamout.max_tf_vtx = 0; - if (so->num_targets == 0) - ctx->streamout.max_tf_vtx = 0; - - /* offset to write to is: - * - * total_vtxcnt = vtxcnt + offsets[i] - * offset = total_vtxcnt * stride[i] - * - * offset = vtxcnt * stride[i] ; calculated in shader - * + offsets[i] * stride[i] ; calculated at emit_tfbos() - * - * assuming for each vtx, each target buffer will have data written - * up to 'offset + stride[i]', that leaves maxvtxcnt as: - * - * buffer_size = (maxvtxcnt * stride[i]) + stride[i] - * maxvtxcnt = (buffer_size - stride[i]) / stride[i] - * - * but shader is actually doing a less-than (rather than less-than- - * equal) check, so we can drop the -stride[i]. - * - * TODO is assumption about `offset + stride[i]` legit? - */ - for (unsigned i = 0; i < so->num_targets; i++) { - struct pipe_stream_output_target *target = so->targets[i]; - unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */ - if (target) { - uint32_t max = target->buffer_size / stride; - maxvtxcnt = MIN2(maxvtxcnt, max); - } - } - - ctx->streamout.max_tf_vtx = maxvtxcnt; + struct fd_streamout_stateobj *so = &ctx->streamout; + struct ir3_stream_output_info *info = &v->shader->stream_output; + uint32_t maxvtxcnt = 0x7fffffff; + + if (v->shader->stream_output.num_outputs == 0) + ctx->streamout.max_tf_vtx = 0; + if (so->num_targets == 0) + ctx->streamout.max_tf_vtx = 0; + + /* offset to write to is: + * + * total_vtxcnt = vtxcnt + offsets[i] + * offset = total_vtxcnt * stride[i] + * + * offset = vtxcnt * stride[i] ; calculated in shader + * + offsets[i] * stride[i] ; calculated at emit_tfbos() + * + * assuming for each vtx, each target buffer will have data written + * up to 'offset + stride[i]', that leaves maxvtxcnt as: + * + * buffer_size = (maxvtxcnt * stride[i]) + stride[i] + * maxvtxcnt = (buffer_size - stride[i]) / stride[i] + * + * but shader is actually doing a less-than (rather than less-than- + * equal) check, so we can drop the -stride[i]. + * + * TODO is assumption about `offset + stride[i]` legit? 
+ */ + for (unsigned i = 0; i < so->num_targets; i++) { + struct pipe_stream_output_target *target = so->targets[i]; + unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */ + if (target) { + uint32_t max = target->buffer_size / stride; + maxvtxcnt = MIN2(maxvtxcnt, max); + } + } + + ctx->streamout.max_tf_vtx = maxvtxcnt; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h index da17bc6..9f3011f 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.h @@ -27,9 +27,9 @@ #ifndef IR3_GALLIUM_H_ #define IR3_GALLIUM_H_ -#include "pipe/p_state.h" -#include "pipe/p_screen.h" #include "ir3/ir3_shader.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" #include "freedreno_util.h" @@ -39,19 +39,21 @@ */ struct ir3_shader_state; -struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, - struct ir3_shader_key key, bool binning_pass, - struct pipe_debug_callback *debug); +struct ir3_shader_variant * +ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key, + bool binning_pass, struct pipe_debug_callback *debug); -void * ir3_shader_compute_state_create(struct pipe_context *pctx, - const struct pipe_compute_state *cso); -void * ir3_shader_state_create(struct pipe_context *pctx, const struct pipe_shader_state *cso); +void *ir3_shader_compute_state_create(struct pipe_context *pctx, + const struct pipe_compute_state *cso); +void *ir3_shader_state_create(struct pipe_context *pctx, + const struct pipe_shader_state *cso); void ir3_shader_state_delete(struct pipe_context *pctx, void *hwcso); -struct ir3_shader * ir3_get_shader(struct ir3_shader_state *hwcso); -struct shader_info * ir3_get_shader_info(struct ir3_shader_state *hwcso); +struct ir3_shader *ir3_get_shader(struct ir3_shader_state *hwcso); +struct shader_info *ir3_get_shader_info(struct ir3_shader_state *hwcso); -void ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key) assert_dt; +void ir3_fixup_shader_state(struct pipe_context *pctx, + struct ir3_shader_key *key) assert_dt; void ir3_prog_init(struct pipe_context *pctx); void ir3_screen_init(struct pipe_screen *pscreen); @@ -63,20 +65,21 @@ void ir3_screen_fini(struct pipe_screen *pscreen); */ static inline bool ir3_point_sprite(const struct ir3_shader_variant *fs, int i, - uint32_t sprite_coord_enable, bool *coord_mode) + uint32_t sprite_coord_enable, bool *coord_mode) { - gl_varying_slot slot = fs->inputs[i].slot; - switch (slot) { - case VARYING_SLOT_PNTC: - *coord_mode = true; - return true; - case VARYING_SLOT_TEX0 ... VARYING_SLOT_TEX7: - return !!(sprite_coord_enable & BITFIELD_BIT(slot - VARYING_SLOT_TEX0)); - default: - return false; - } + gl_varying_slot slot = fs->inputs[i].slot; + switch (slot) { + case VARYING_SLOT_PNTC: + *coord_mode = true; + return true; + case VARYING_SLOT_TEX0 ... VARYING_SLOT_TEX7: + return !!(sprite_coord_enable & BITFIELD_BIT(slot - VARYING_SLOT_TEX0)); + default: + return false; + } } -void ir3_update_max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v) assert_dt; +void ir3_update_max_tf_vtx(struct fd_context *ctx, + const struct ir3_shader_variant *v) assert_dt; #endif /* IR3_GALLIUM_H_ */
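
As a side note (not part of the commit), the buffer-image path in ir3_emit_image_dims() relies on bpp being a power of two so that a divide by bpp can be done with a shift, storing log2(bpp) = ffs(bpp) - 1 in the dims constant. A minimal standalone sketch of that identity, with made-up byte counts, looks like this:

```c
/* Sketch of the power-of-two trick noted in ir3_emit_image_dims():
 * bytes / bpp == bytes >> log2(bpp), and log2(bpp) == ffs(bpp) - 1
 * when bpp is a power of two.  Values here are illustrative only.
 */
#include <assert.h>
#include <stdint.h>
#include <strings.h> /* ffs() */

static uint32_t
texels_from_bytes(uint32_t bytes, uint32_t bpp)
{
   assert(bpp && !(bpp & (bpp - 1))); /* power of two, as the driver assumes */
   return bytes >> (ffs(bpp) - 1);    /* same result as bytes / bpp */
}
```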
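
Similarly, the derivation in the ir3_update_max_tf_vtx() comment reduces to buffer_size / (stride * 4) per target, taking the minimum across targets, since the shader's strict less-than check lets the -stride term be dropped. A hypothetical worked example (buffer sizes and strides invented for illustration):

```c
/* Standalone sketch of the stream-out vertex limit from
 * ir3_update_max_tf_vtx(): strides are in dwords, buffer sizes in bytes.
 */
#include <stdint.h>
#include <stdio.h>

#define MIN2(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
   uint32_t buffer_size[2] = {65536, 4096}; /* bytes, made-up targets */
   uint32_t stride_dwords[2] = {4, 1};      /* dwords written per vertex */
   uint32_t maxvtxcnt = 0x7fffffff;

   for (unsigned i = 0; i < 2; i++) {
      uint32_t stride = stride_dwords[i] * 4; /* dwords -> bytes */
      maxvtxcnt = MIN2(maxvtxcnt, buffer_size[i] / stride);
   }

   printf("max_tf_vtx = %u\n", maxvtxcnt); /* 1024 for these sizes */
   return 0;
}
```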