From 543b2331d7b45a29ccd3530daa2389e87e65d89b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 8 Nov 2011 21:58:27 +0100 Subject: [PATCH] r600g: implement transform feedback r600: DONE. r700: MOSTLY (done but locks up). Evergreen: MOSTLY (done but doesn't work for an unknown reason). The kernel support will come soon. --- src/gallium/drivers/r600/eg_asm.c | 29 ++ src/gallium/drivers/r600/evergreen_hw_context.c | 35 +++ src/gallium/drivers/r600/evergreen_state.c | 7 +- src/gallium/drivers/r600/evergreend.h | 45 ++++ src/gallium/drivers/r600/r600.h | 21 ++ src/gallium/drivers/r600/r600_asm.c | 115 ++++++++ src/gallium/drivers/r600/r600_asm.h | 2 + src/gallium/drivers/r600/r600_blit.c | 2 + src/gallium/drivers/r600/r600_hw_context.c | 335 ++++++++++++++++++++++++ src/gallium/drivers/r600/r600_hw_context_priv.h | 7 + src/gallium/drivers/r600/r600_pipe.c | 15 +- src/gallium/drivers/r600/r600_pipe.h | 15 ++ src/gallium/drivers/r600/r600_query.c | 16 +- src/gallium/drivers/r600/r600_shader.c | 101 +++++++ src/gallium/drivers/r600/r600_sq.h | 4 + src/gallium/drivers/r600/r600_state.c | 9 +- src/gallium/drivers/r600/r600_state_common.c | 77 +++++- src/gallium/drivers/r600/r600d.h | 51 ++++ 18 files changed, 873 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c index f6b8631..877e162 100644 --- a/src/gallium/drivers/r600/eg_asm.c +++ b/src/gallium/drivers/r600/eg_asm.c @@ -73,6 +73,35 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf) bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program); id++; break; + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3: + case 
EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3: + bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | + S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | + S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | + S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type); + bc->bytecode[id] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) | + cf->output.inst | + S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size); + if (bc->chip_class == EVERGREEN) /* no EOP on cayman */ + bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program); + id++; + break; case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP: case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE: case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP: diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index 96e8d18..bd1d969 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -1241,3 +1241,38 @@ void evergreen_context_flush_dest_caches(struct r600_context *ctx) ctx->flags &= 
~R600_CONTEXT_DST_CACHES_DIRTY; } + +void evergreen_flush_vgt_streamout(struct r600_context *ctx) +{ /* Appends 12 dwords to ctx->pm4: zero CP_STRMOUT_CNTL, emit an SO_VGTSTREAMOUT_FLUSH event, then WAIT_REG_MEM on CP_STRMOUT_CNTL until OFFSET_UPDATE_DONE reads back as 1. */ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_0084FC_CP_STRMOUT_CNTL - EVERGREEN_CONFIG_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = 0; /* value written to CP_STRMOUT_CNTL */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0); + ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */ + ctx->pm4[ctx->pm4_cdwords++] = R_0084FC_CP_STRMOUT_CNTL >> 2; /* register */ + ctx->pm4[ctx->pm4_cdwords++] = 0; + ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */ + ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */ + ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */ +} + +void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit) +{ /* buffer_enable_bit is used as a bitmask of enabled buffers. Nonzero: enable stream 0 and program the mask (6 dwords); zero: only clear STREAMOUT_0_EN (3 dwords). Streams 1-3 are never enabled here. */ + if (buffer_enable_bit) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(1); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit); /* the macro keeps only the low 4 bits of the mask */ + } else { /* VGT_STRMOUT_BUFFER_CONFIG is left untouched on this path */ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(0); + } +} diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index d0c02d5..6f5d6f7 100644 --- 
a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -927,8 +927,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx, state->fill_back != PIPE_POLYGON_MODE_FILL); r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL, S_028814_PROVOKING_VTX_LAST(prov_vtx) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | S_028814_FACE(!state->front_ccw) | S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) | S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) | @@ -1688,6 +1688,9 @@ void evergreen_init_state_functions(struct r600_pipe_context *rctx) rctx->context.sampler_view_destroy = r600_sampler_view_destroy; rctx->context.redefine_user_buffer = u_default_redefine_user_buffer; rctx->context.texture_barrier = evergreen_texture_barrier; + rctx->context.create_stream_output_target = r600_create_so_target; + rctx->context.stream_output_target_destroy = r600_so_target_destroy; + rctx->context.set_stream_output_targets = r600_set_so_targets; } static void cayman_init_config(struct r600_pipe_context *rctx) diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index 6baa2a7..68b77b4 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -50,6 +50,8 @@ #define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 #define EVENT_TYPE_ZPASS_DONE 0x15 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 +#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f + #define EVENT_TYPE(x) ((x) << 0) #define EVENT_INDEX(x) ((x) << 8) /* 0 - any non-TS event @@ -82,6 +84,7 @@ #define PKT3_MEM_SEMAPHORE 0x39 #define PKT3_MPEG_INDEX 0x3A #define PKT3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_EQUAL 3 #define 
PKT3_MEM_WRITE 0x3D #define PKT3_INDIRECT_BUFFER 0x32 #define PKT3_CP_INTERRUPT 0x40 @@ -118,6 +121,12 @@ #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate)) /* Registers */ +#define R_0084FC_CP_STRMOUT_CNTL 0x000084FC +#define S_0084FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0) +#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0 0x008960 /* read-only */ +#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1 0x008964 /* read-only */ +#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2 0x008968 /* read-only */ +#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3 0x00896C /* read-only */ #define R_008C00_SQ_CONFIG 0x00008C00 #define S_008C00_VC_ENABLE(x) (((x) & 0x1) << 0) #define G_008C00_VC_ENABLE(x) (((x) >> 0) & 0x1) @@ -1723,6 +1732,33 @@ #define R_028AC0_DB_SRESULTS_COMPARE_STATE0 0x00028AC0 #define R_028AC4_DB_SRESULTS_COMPARE_STATE1 0x00028AC4 #define R_028AC8_DB_PRELOAD_CONTROL 0x00028AC8 +#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 0x028AD0 +#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 0x028AD4 +#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0 0x028AD8 +#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0 0x028ADC +#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1 0x028AE0 +#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1 0x028AE4 +#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1 0x028AE8 +#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1 0x028AEC +#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2 0x028AF0 +#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2 0x028AF4 +#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2 0x028AF8 +#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2 0x028AFC +#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3 0x028B00 +#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3 0x028B04 +#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3 0x028B08 +#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3 0x028B0C +#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0 0x028B10 +#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1 0x028B14 +#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2 0x028B18 +#define 
R_028B1C_VGT_STRMOUT_BASE_OFFSET_3 0x028B1C +#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET 0x028B28 +#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C +#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30 +#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0 0x028B44 +#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1 0x028B48 +#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2 0x028B4C +#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3 0x028B50 #define R_028B54_VGT_SHADER_STAGES_EN 0x00028B54 #define R_028B70_DB_ALPHA_TO_MASK 0x00028B70 #define R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL 0x00028B78 @@ -1750,7 +1786,16 @@ #define G_028B8C_OFFSET(x) (((x) >> 0) & 0xFFFFFFFF) #define C_028B8C_OFFSET 0x00000000 #define R_028B94_VGT_STRMOUT_CONFIG 0x00028B94 +#define S_028B94_STREAMOUT_0_EN(x) (((x) & 0x1) << 0) +#define S_028B94_STREAMOUT_1_EN(x) (((x) & 0x1) << 1) +#define S_028B94_STREAMOUT_2_EN(x) (((x) & 0x1) << 2) +#define S_028B94_STREAMOUT_3_EN(x) (((x) & 0x1) << 3) +#define S_028B94_RAST_STREAM(x) (((x) & 0x7) << 4) #define R_028B98_VGT_STRMOUT_BUFFER_CONFIG 0x00028B98 +#define S_028B98_STREAM_0_BUFFER_EN(x) (((x) & 0xf) << 0) +#define S_028B98_STREAM_1_BUFFER_EN(x) (((x) & 0xf) << 4) +#define S_028B98_STREAM_2_BUFFER_EN(x) (((x) & 0xf) << 8) +#define S_028B98_STREAM_3_BUFFER_EN(x) (((x) & 0xf) << 12) #define R_028C00_PA_SC_LINE_CNTL 0x00028C00 #define R_028C04_PA_SC_AA_CONFIG 0x00028C04 #define R_028C08_PA_SU_VTX_CNTL 0x00028C08 diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h index e697406..fbd12fb 100644 --- a/src/gallium/drivers/r600/r600.h +++ b/src/gallium/drivers/r600/r600.h @@ -167,6 +167,7 @@ struct r600_query { union { uint64_t u64; boolean b; + struct pipe_query_data_so_statistics so; } result; /* The kind of query */ unsigned type; @@ -187,6 +188,15 @@ struct r600_query { struct list_head list; }; +struct r600_so_target { + struct pipe_stream_output_target b; + + /* The buffer where BUFFER_FILLED_SIZE is stored. 
*/ + struct r600_resource *filled_size; + unsigned stride; /* copied from vs_shader_so_strides[i] by r600_context_streamout_begin; presumably bytes, since it is shifted right by 2 to form VTX_STRIDE in DW -- TODO confirm */ + unsigned so_index; /* slot i of ctx->so_targets, recorded by r600_context_streamout_begin */ +}; + #define R600_CONTEXT_DRAW_PENDING (1 << 0) #define R600_CONTEXT_DST_CACHES_DIRTY (1 << 1) #define R600_CONTEXT_CHECK_EVENT_FLUSH (1 << 2) @@ -218,6 +228,7 @@ struct r600_context { /* The list of active queries. Only one query of each type can be active. */ struct list_head active_query_list; unsigned num_cs_dw_queries_suspend; + unsigned num_cs_dw_streamout_end; /* dwords r600_need_cs_space keeps in reserve for streamout_end; nonzero makes r600_context_flush call r600_context_streamout_end */ unsigned backend_mask; unsigned max_db; /* for OQ */ @@ -229,6 +240,12 @@ struct r600_context { struct r600_range fs_resources; int num_ps_resources, num_vs_resources, num_fs_resources; boolean have_depth_texture, have_depth_fb; + + unsigned num_so_targets; + struct r600_so_target *so_targets[PIPE_MAX_SO_BUFFERS]; + boolean streamout_start; /* when TRUE, the next r600_context_draw calls r600_context_streamout_begin and resets this to FALSE */ + unsigned streamout_append_bitmask; /* bit i set: buffer i continues from the offset stored in its filled_size buffer instead of starting from the beginning */ + unsigned *vs_shader_so_strides; /* per-buffer strides read by r600_context_streamout_begin; where it is assigned is not visible in this part of the patch */ }; struct r600_draw { @@ -268,6 +285,10 @@ void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fen void r600_context_flush_all(struct r600_context *ctx, unsigned flush_flags); void r600_context_flush_dest_caches(struct r600_context *ctx); +void r600_context_streamout_begin(struct r600_context *ctx); +void r600_context_streamout_end(struct r600_context *ctx); +void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t); + int evergreen_context_init(struct r600_context *ctx, struct r600_screen *screen); void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw); void evergreen_context_flush_dest_caches(struct r600_context *ctx); diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 1ab16f2..e617b16 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -1672,6 +1672,21 @@ static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode cf->output.inst | S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program); break; + case 
V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3: + bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) | + S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) | + S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) | + S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type); + bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) | + cf->output.inst | + S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) | + S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask); + break; case V_SQ_CF_WORD1_SQ_CF_INST_JUMP: case V_SQ_CF_WORD1_SQ_CF_INST_ELSE: case V_SQ_CF_WORD1_SQ_CF_INST_POP: @@ -1730,6 +1745,22 @@ int r600_bytecode_build(struct r600_bytecode *bc) case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE: case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT: case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2: + case 
EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3: case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP: case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE: case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP: @@ -1760,6 +1791,10 @@ int r600_bytecode_build(struct r600_bytecode *bc) case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE: case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT: case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3: case V_SQ_CF_WORD1_SQ_CF_INST_JUMP: case V_SQ_CF_WORD1_SQ_CF_INST_ELSE: case V_SQ_CF_WORD1_SQ_CF_INST_POP: @@ -1849,6 +1884,22 @@ int r600_bytecode_build(struct r600_bytecode *bc) break; case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT: case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3: + 
case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3: case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL: case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END: case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE: @@ -1923,6 +1974,10 @@ int r600_bytecode_build(struct r600_bytecode *bc) break; case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT: case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3: case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL: case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END: case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE: @@ -2057,6 +2112,44 @@ void r600_bytecode_dump(struct r600_bytecode *bc) fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count); fprintf(stderr, "EOP:%X\n", cf->output.end_of_program); break; + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3: + case 
EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2: + case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3: + fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id], + (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) - + EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4, + (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) - + EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4); + fprintf(stderr, "GPR:%X ", cf->output.gpr); + fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size); + fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base); + fprintf(stderr, "TYPE:%X\n", cf->output.type); + id++; + fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id], + (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) - + EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4, + (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) - + EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4); + fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size); + fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask); + fprintf(stderr, "BARRIER:%X ", cf->output.barrier); + fprintf(stderr, "INST:%d ", cf->output.inst); + fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count); + fprintf(stderr, "EOP:%X\n", cf->output.end_of_program); + break; case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP: case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE: case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP: @@ -2125,6 +2218,28 @@ void r600_bytecode_dump(struct r600_bytecode *bc) fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count); fprintf(stderr, "EOP:%X\n", cf->output.end_of_program); break; + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0: + case 
V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2: + case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3: + fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id], + R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) - + R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0)); + fprintf(stderr, "GPR:%X ", cf->output.gpr); + fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size); + fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base); + fprintf(stderr, "TYPE:%X\n", cf->output.type); + id++; + fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id], + R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) - + R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0)); + fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size); + fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask); + fprintf(stderr, "BARRIER:%X ", cf->output.barrier); + fprintf(stderr, "INST:%d ", cf->output.inst); + fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count); + fprintf(stderr, "EOP:%X\n", cf->output.end_of_program); + break; case V_SQ_CF_WORD1_SQ_CF_INST_JUMP: case V_SQ_CF_WORD1_SQ_CF_INST_ELSE: case V_SQ_CF_WORD1_SQ_CF_INST_POP: diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 0fd4467..d0ff75d 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -107,6 +107,8 @@ struct r600_bytecode_vtx { struct r600_bytecode_output { unsigned array_base; + unsigned array_size; + unsigned comp_mask; unsigned type; unsigned end_of_program; diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index 9326dc6..313ed12 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -65,6 +65,8 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op 
util_blitter_save_vertex_buffers(rctx->blitter, rctx->vbuf_mgr->nr_vertex_buffers, rctx->vbuf_mgr->vertex_buffer); + util_blitter_save_so_targets(rctx->blitter, rctx->ctx.num_so_targets, + (struct pipe_stream_output_target**)rctx->ctx.so_targets); if (op & R600_SAVE_FRAMEBUFFER) util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer); diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 52e0be7..1dba966 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -943,6 +943,9 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, /* Count in queries_suspend. */ num_dw += ctx->num_cs_dw_queries_suspend; + /* Count in streamout_end at the end of CS. */ + num_dw += ctx->num_cs_dw_streamout_end; + /* Count in render_condition(NULL) at the end of CS. */ if (ctx->predicate_drawing) { num_dw += 3; @@ -1471,6 +1474,12 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw) r600_context_block_resource_emit_dirty(ctx, dirty_block); } + /* Enable stream out if needed. 
*/ + if (ctx->streamout_start) { + r600_context_streamout_begin(ctx); + ctx->streamout_start = FALSE; + } + /* draw packet */ pm4 = &ctx->pm4[ctx->pm4_cdwords]; @@ -1503,6 +1512,7 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags) { struct r600_block *enable_block = NULL; bool queries_suspended = false; + bool streamout_suspended = false; if (ctx->pm4_cdwords == ctx->init_dwords) return; @@ -1513,6 +1523,11 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags) queries_suspended = true; } + if (ctx->num_cs_dw_streamout_end) { + r600_context_streamout_end(ctx); + streamout_suspended = true; + } + if (ctx->screen->chip_class >= EVERGREEN) evergreen_context_flush_dest_caches(ctx); else @@ -1542,6 +1557,11 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags) r600_init_cs(ctx); + if (streamout_suspended) { + ctx->streamout_start = TRUE; + ctx->streamout_append_bitmask = ~0; + } + /* resume queries */ if (queries_suspended) { r600_context_queries_resume(ctx); @@ -1636,6 +1656,44 @@ static boolean r600_query_result(struct r600_context *ctx, struct r600_query *qu results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; } break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + /* SAMPLE_STREAMOUTSTATS stores this structure: + * { + * u64 NumPrimitivesWritten; + * u64 PrimitiveStorageNeeded; + * } + * We only need NumPrimitivesWritten here. */ + while (results_base != query->results_end) { + query->result.u64 += + r600_query_read_result(map + results_base, 2, 6, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + /* Here we read PrimitiveStorageNeeded. 
*/ + while (results_base != query->results_end) { + query->result.u64 += + r600_query_read_result(map + results_base, 0, 4, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_SO_STATISTICS: + while (results_base != query->results_end) { + query->result.so.num_primitives_written += + r600_query_read_result(map + results_base, 2, 6, true); + query->result.so.primitives_storage_needed += + r600_query_read_result(map + results_base, 0, 4, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + while (results_base != query->results_end) { + query->result.b = query->result.b || + r600_query_read_result(map + results_base, 2, 6, true) != + r600_query_read_result(map + results_base, 0, 4, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; default: assert(0); } @@ -1679,6 +1737,15 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query) break; case PIPE_QUERY_TIME_ELAPSED: break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE); + results = (u32*)((char*)results + query->results_end); + memset(results, 0, query->result_size); + ctx->ws->buffer_unmap(query->buffer->buf); + break; default: assert(0); } @@ -1692,6 +1759,15 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query) ctx->pm4[ctx->pm4_cdwords++] = query->results_end; ctx->pm4[ctx->pm4_cdwords++] = 0; break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + ctx->pm4[ctx->pm4_cdwords++] = 
EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3); + ctx->pm4[ctx->pm4_cdwords++] = query->results_end; + ctx->pm4[ctx->pm4_cdwords++] = 0; + break; case PIPE_QUERY_TIME_ELAPSED: ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); @@ -1720,6 +1796,15 @@ void r600_query_end(struct r600_context *ctx, struct r600_query *query) ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8; ctx->pm4[ctx->pm4_cdwords++] = 0; break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3); + ctx->pm4[ctx->pm4_cdwords++] = query->results_end + query->result_size/2; + ctx->pm4[ctx->pm4_cdwords++] = 0; + break; case PIPE_QUERY_TIME_ELAPSED: ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); @@ -1798,6 +1883,14 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query->result_size = 16; query->num_cs_dw = 8; break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. 
*/ + query->result_size = 32; + query->num_cs_dw = 6; + break; default: assert(0); FREE(query); @@ -1832,20 +1925,28 @@ boolean r600_context_query_result(struct r600_context *ctx, { boolean *result_b = (boolean*)vresult; uint64_t *result_u64 = (uint64_t*)vresult; + struct pipe_query_data_so_statistics *result_so = + (struct pipe_query_data_so_statistics*)vresult; if (!r600_query_result(ctx, query, wait)) return FALSE; switch (query->type) { case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: *result_u64 = query->result.u64; break; case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: *result_b = query->result.b; break; case PIPE_QUERY_TIME_ELAPSED: *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq; break; + case PIPE_QUERY_SO_STATISTICS: + *result_so = query->result.so; + break; default: assert(0); } @@ -1872,3 +1973,237 @@ void r600_context_queries_resume(struct r600_context *ctx) r600_query_begin(ctx, query); } } + +static void r600_flush_vgt_streamout(struct r600_context *ctx) +{ /* Appends 12 dwords to ctx->pm4: zero CP_STRMOUT_CNTL, emit an SO_VGTSTREAMOUT_FLUSH event, then WAIT_REG_MEM on CP_STRMOUT_CNTL until OFFSET_UPDATE_DONE reads back as 1. */ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_008490_CP_STRMOUT_CNTL - R600_CONFIG_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = 0; /* value written to CP_STRMOUT_CNTL */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0); + ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */ + ctx->pm4[ctx->pm4_cdwords++] = R_008490_CP_STRMOUT_CNTL >> 2; /* register */ + ctx->pm4[ctx->pm4_cdwords++] = 0; + ctx->pm4[ctx->pm4_cdwords++] = S_008490_OFFSET_UPDATE_DONE(1); /* reference value */ + ctx->pm4[ctx->pm4_cdwords++] = S_008490_OFFSET_UPDATE_DONE(1); /* mask */ + ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */ +} + 
+static void r600_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit) /* Emits 6 dwords when buffer_enable_bit is non-zero (VGT_STRMOUT_EN = 1, then VGT_STRMOUT_BUFFER_EN = the mask) and 3 dwords otherwise (VGT_STRMOUT_EN = 0). */ +{ + if (buffer_enable_bit) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028AB0_VGT_STRMOUT_EN - R600_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = S_028AB0_STREAMOUT(1); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028B20_VGT_STRMOUT_BUFFER_EN - R600_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = buffer_enable_bit; + } else { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028AB0_VGT_STRMOUT_EN - R600_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = S_028AB0_STREAMOUT(0); + } +} +/* Starts streamout on the bound targets: reserves CS space, flushes VGT streamout, enables the bound buffers, then for each target programs BUFFER_SIZE, VTX_STRIDE and BUFFER_BASE (with a write reloc) and sets the start offset either from the target's filled_size buffer (append bit set) or from buffer_offset. Also records in num_cs_dw_streamout_end how many dwords r600_context_streamout_end will emit, and includes them in the reservation. */ +void r600_context_streamout_begin(struct r600_context *ctx) +{ + struct r600_so_target **t = ctx->so_targets; + unsigned *strides = ctx->vs_shader_so_strides; + unsigned buffer_en, i, update_flags = 0; + + buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) | + (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) | + (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) | + (ctx->num_so_targets >= 4 && t[3] ? 8 : 0); + + ctx->num_cs_dw_streamout_end = + 12 + /* flush_vgt_streamout */ + util_bitcount(buffer_en) * 8 + + 8; + + r600_need_cs_space(ctx, + 12 + /* flush_vgt_streamout */ + 6 + /* enables */ + util_bitcount(buffer_en & ctx->streamout_append_bitmask) * (7 + 8) + /* per append target: 7 dw for the BUFFER_SIZE/VTX_STRIDE/BUFFER_BASE regs with reloc, 8 dw for STRMOUT_BUFFER_UPDATE with reloc */ + util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * (7 + 6) + /* per non-append target: the same 7 dw, 6 dw for STRMOUT_BUFFER_UPDATE */ + (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770 ? 
2 : 0) + + ctx->num_cs_dw_streamout_end, TRUE); + + if (ctx->screen->chip_class >= EVERGREEN) { + evergreen_flush_vgt_streamout(ctx); + evergreen_set_streamout_enable(ctx, buffer_en); + } else { + r600_flush_vgt_streamout(ctx); + r600_set_streamout_enable(ctx, buffer_en); + } + + for (i = 0; i < ctx->num_so_targets; i++) { + if (t[i]) { + t[i]->stride = strides[i]; + t[i]->so_index = i; + + update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + + 16*i - R600_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = (t[i]->b.buffer_offset + + t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */ + ctx->pm4[ctx->pm4_cdwords++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* BUFFER_BASE */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = + r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE); + + if (ctx->streamout_append_bitmask & (1 << i)) { + /* Append. */ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0); + ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = + r600_context_bo_reloc(ctx, t[i]->filled_size, + RADEON_USAGE_READ); + } else { + /* Start from the beginning. 
*/ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0); + ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + ctx->pm4[ctx->pm4_cdwords++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + } + } + } + + if (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = update_flags; + } +} +/* Stops streamout: flushes VGT streamout, stores each bound buffer's filled size into its filled_size buffer (write reloc), disables streamout, then emits a cache flush event (families below RV770) or a SURFACE_SYNC over the written SO buffers. Reserves no CS space itself; it relies on the num_cs_dw_streamout_end dwords reserved by r600_context_streamout_begin and resets that counter to 0. */ +void r600_context_streamout_end(struct r600_context *ctx) +{ + struct r600_so_target **t = ctx->so_targets; + unsigned i, flush_flags = 0; + + if (ctx->screen->chip_class >= EVERGREEN) { + evergreen_flush_vgt_streamout(ctx); + } else { + r600_flush_vgt_streamout(ctx); + } + + for (i = 0; i < ctx->num_so_targets; i++) { + if (t[i]) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0); + ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address lo */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address hi */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = + r600_context_bo_reloc(ctx, t[i]->filled_size, + RADEON_USAGE_WRITE); + + flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i; + } + } + + if (ctx->screen->chip_class >= EVERGREEN) { + evergreen_set_streamout_enable(ctx, 0); + } else { + r600_set_streamout_enable(ctx, 0); + } + + if (ctx->screen->family < CHIP_RV770) { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | 
EVENT_INDEX(0); + } else { + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, 0); + ctx->pm4[ctx->pm4_cdwords++] = flush_flags; /* CP_COHER_CNTL */ + ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* CP_COHER_SIZE */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* CP_COHER_BASE */ + ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A; /* POLL_INTERVAL */ + } + + ctx->num_cs_dw_streamout_end = 0; + + /* XXX print some debug info. NOTE(review): temporary; maps each filled_size buffer on every streamout end and dereferences the mapping unchecked - presumably to be removed once streamout works. */ + for (i = 0; i < ctx->num_so_targets; i++) { + if (!t[i]) + continue; + + uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs, RADEON_USAGE_READ); + printf("FILLED_SIZE%i: %u\n", i, *ptr); + ctx->ws->buffer_unmap(t[i]->filled_size->buf); + } +} +/* Prepares a draw whose vertex count comes from streamout target t: sets the draw-opaque offset to 0 and the vertex stride to t->stride in dwords, then copies the value stored in t->filled_size into VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE with COPY_DW. The reservation covers the 14 live dwords and the 21 dwords of the disabled block below. */ +void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t) +{ + r600_need_cs_space(ctx, 14 + 21, TRUE); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - R600_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = 0; + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - R600_CONTEXT_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = t->stride >> 2; + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0); + ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG; + ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */ + ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, t->filled_size, + RADEON_USAGE_READ); + +#if 0 /* I have not found this useful yet. 
*/ + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0); + ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG; + ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + ctx->pm4[ctx->pm4_cdwords++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */ + ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */ + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_0085F0_CP_COHER_CNTL - R600_CONFIG_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index; + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + ctx->pm4[ctx->pm4_cdwords++] = (R_0085F8_CP_COHER_BASE - R600_CONFIG_REG_OFFSET) >> 2; + ctx->pm4[ctx->pm4_cdwords++] = t->b.buffer_offset >> 2; + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0); + ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer, + RADEON_USAGE_WRITE); + + ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0); + ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */ + ctx->pm4[ctx->pm4_cdwords++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */ + ctx->pm4[ctx->pm4_cdwords++] = 0; + ctx->pm4[ctx->pm4_cdwords++] = 0; /* reference value */ + ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* mask */ + ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */ +#endif +} diff --git a/src/gallium/drivers/r600/r600_hw_context_priv.h b/src/gallium/drivers/r600/r600_hw_context_priv.h index bea6135..206de7e 100644 --- a/src/gallium/drivers/r600/r600_hw_context_priv.h +++ b/src/gallium/drivers/r600/r600_hw_context_priv.h @@ -76,6 +76,13 @@ void r600_context_reg(struct r600_context *ctx, void r600_init_cs(struct r600_context *ctx); int r600_resource_init(struct r600_context *ctx, struct r600_range *range, unsigned offset, unsigned nblocks, 
unsigned stride, struct r600_reg *reg, int nreg, unsigned offset_base); +/* + * evergreen_hw_context.c + */ +void evergreen_flush_vgt_streamout(struct r600_context *ctx); +void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit); + + static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r600_resource *rbo, enum radeon_bo_usage usage) { diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 97c6808..22fefaa 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -378,6 +378,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_PRIMITIVE_RESTART: case PIPE_CAP_CONDITIONAL_RENDER: case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: return 1; /* Supported except the original R600. */ @@ -391,17 +392,21 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return family >= CHIP_CEDAR ? 1 : 0; /* Unsupported features. */ - case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS: - case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: - case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: - case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: case PIPE_CAP_TGSI_INSTANCEID: case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: case PIPE_CAP_SCALED_RESOLVE: return 0; + /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 4 : 0; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS: + return 16; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return 16*4; + /* Texturing. 
*/ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index a127eed..46443af 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -133,6 +133,8 @@ struct r600_pipe_shader { struct r600_vertex_element vertex_elements; struct tgsi_token *tokens; unsigned sprite_coord_enable; + struct pipe_stream_output_info so; + unsigned so_strides[4]; }; struct r600_pipe_sampler_state { @@ -348,6 +350,19 @@ void r600_delete_ps_shader(struct pipe_context *ctx, void *state); void r600_delete_vs_shader(struct pipe_context *ctx, void *state); void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index, struct pipe_resource *buffer); +struct pipe_stream_output_target * +r600_create_so_target(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size); +void r600_so_target_destroy(struct pipe_context *ctx, + struct pipe_stream_output_target *target); +void r600_set_so_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + unsigned append_bitmask); + + void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info); /* diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index ec0d91f..ee2d04b 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -100,7 +100,21 @@ static void r600_render_condition(struct pipe_context *ctx, } rctx->ctx.predicate_drawing = true; - r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag); + + switch (rquery->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case 
PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag); + break; + default: + assert(0); + } } void r600_init_query_functions(struct r600_pipe_context *rctx) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 61f88f4..ad4aded 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -119,6 +119,19 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *s if (dump_shaders) { fprintf(stderr, "--------------------------------------------------------------\n"); tgsi_dump(shader->tokens, 0); + + if (shader->so.num_outputs) { + unsigned i; + fprintf(stderr, "STREAMOUT\n"); + for (i = 0; i < shader->so.num_outputs; i++) { + fprintf(stderr, " %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i, + shader->so.output[i].output_buffer, shader->so.output[i].register_index, + shader->so.output[i].register_mask & 1 ? "x" : "_", + (shader->so.output[i].register_mask >> 1) & 1 ? "y" : "_", + (shader->so.output[i].register_mask >> 2) & 1 ? "z" : "_", + (shader->so.output[i].register_mask >> 3) & 1 ? "w" : "_"); + } + } } r = r600_shader_from_tgsi(rctx, shader); if (r) { @@ -681,6 +694,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi { struct r600_shader *shader = &pipeshader->shader; struct tgsi_token *tokens = pipeshader->tokens; + struct pipe_stream_output_info so = pipeshader->so; struct tgsi_full_immediate *immediate; struct tgsi_full_property *property; struct r600_shader_ctx ctx; @@ -847,6 +861,93 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi } } + /* Add stream outputs. 
*/ + if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) { + unsigned buffer_offset[PIPE_MAX_SO_BUFFERS] = {0}; + + for (i = 0; i < so.num_outputs; i++) { + struct r600_bytecode_output output; + unsigned comps; + + if (so.output[i].output_buffer >= 4) { + R600_ERR("exceeded the max number of stream output buffers, got: %d\n", + so.output[i].output_buffer); + r = -EINVAL; + goto out_err; + } + + switch (so.output[i].register_mask) { + case TGSI_WRITEMASK_XYZW: + comps = 4; + break; + case TGSI_WRITEMASK_XYZ: + comps = 3; + break; + case TGSI_WRITEMASK_XY: + comps = 2; + break; + case TGSI_WRITEMASK_X: + comps = 1; + break; + default: + R600_ERR("streamout: invalid register_mask, got: %x\n", + so.output[i].register_mask); + r = -EINVAL; + goto out_err; + } + + memset(&output, 0, sizeof(struct r600_bytecode_output)); + output.gpr = shader->output[so.output[i].register_index].gpr; + output.elem_size = 0; + output.array_base = buffer_offset[so.output[i].output_buffer]; + output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE; + output.burst_count = 1; + output.barrier = 1; + output.array_size = 0; + output.comp_mask = so.output[i].register_mask; + if (ctx.bc->chip_class >= EVERGREEN) { + switch (so.output[i].output_buffer) { + case 0: + output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0; + break; + case 1: + output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1; + break; + case 2: + output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2; + break; + case 3: + output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3; + break; + } + } else { + switch (so.output[i].output_buffer) { + case 0: + output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0; + break; + case 1: + output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1; + break; + case 2: + output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2; + break; + case 3: + output.inst = 
V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3; + break; + } + } + r = r600_bytecode_add_output(ctx.bc, &output); + if (r) + goto out_err; + + buffer_offset[so.output[i].output_buffer] += comps; + } + + for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { + pipeshader->so_strides[i] = buffer_offset[i] * 4; + } + } + /* export output */ j = 0; for (i = 0, pos0 = 0; i < noutput; i++) { diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h index 56ed35e..b9c4126 100644 --- a/src/gallium/drivers/r600/r600_sq.h +++ b/src/gallium/drivers/r600/r600_sq.h @@ -114,6 +114,10 @@ #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS 0x00000001 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM 0x00000002 #define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_SX 0x00000003 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE 0x00000000 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND 0x00000001 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ 0x00000002 +#define V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND 0x00000003 #define S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) & 0x7F) << 15) #define G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x) (((x) >> 15) & 0x7F) #define C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR 0xFFC07FFF diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 8410cfe..7f44035 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -933,7 +933,7 @@ static void *r600_create_dsa_state(struct pipe_context *ctx, } static void *r600_create_rs_state(struct pipe_context *ctx, - const struct pipe_rasterizer_state *state) + const struct pipe_rasterizer_state *state) { struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx; struct r600_pipe_rasterizer *rs = CALLOC_STRUCT(r600_pipe_rasterizer); @@ -978,8 +978,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx, state->fill_back != PIPE_POLYGON_MODE_FILL); r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL, 
S_028814_PROVOKING_VTX_LAST(prov_vtx) | - S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | - S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | S_028814_FACE(!state->front_ccw) | S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) | S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) | @@ -1758,6 +1758,9 @@ void r600_init_state_functions(struct r600_pipe_context *rctx) rctx->context.sampler_view_destroy = r600_sampler_view_destroy; rctx->context.redefine_user_buffer = u_default_redefine_user_buffer; rctx->context.texture_barrier = r600_texture_barrier; + rctx->context.create_stream_output_target = r600_create_so_target; + rctx->context.stream_output_target_destroy = r600_so_target_destroy; + rctx->context.set_stream_output_targets = r600_set_so_targets; } void r600_adjust_gprs(struct r600_pipe_context *rctx) diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index d6ffda4..9f6f514 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -247,10 +247,11 @@ void *r600_create_vertex_elements(struct pipe_context *ctx, void *r600_create_shader_state(struct pipe_context *ctx, const struct pipe_shader_state *state) { - struct r600_pipe_shader *shader = CALLOC_STRUCT(r600_pipe_shader); + struct r600_pipe_shader *shader = CALLOC_STRUCT(r600_pipe_shader); int r; shader->tokens = tgsi_dup_tokens(state->tokens); + shader->so = state->stream_output; r = r600_pipe_shader_create(ctx, shader); if (r) { @@ -412,6 +413,71 @@ void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index, pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL); } +struct pipe_stream_output_target * /* Creates a streamout target with reference count 1 that references buffer, records buffer_offset and buffer_size, and owns a 4-byte zero-filled filled_size buffer. Returns NULL only when the target struct itself cannot be allocated. */ +r600_create_so_target(struct pipe_context 
*ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx; + struct r600_so_target *t; + void *ptr; + + t = CALLOC_STRUCT(r600_so_target); + if (!t) { + return NULL; + } + + t->b.reference.count = 1; + t->b.context = ctx; + pipe_resource_reference(&t->b.buffer, buffer); + t->b.buffer_offset = buffer_offset; + t->b.buffer_size = buffer_size; + /* 4-byte buffer that r600_context_streamout_end stores the filled size into; zeroed here through a CPU mapping. NOTE(review): neither the pipe_buffer_create result nor the buffer_map result is checked before being dereferenced, and t is not released on that path - confirm and add failure handling. */ + t->filled_size = (struct r600_resource*) + pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STATIC, 4); + ptr = rctx->ws->buffer_map(t->filled_size->buf, rctx->ctx.cs, PIPE_TRANSFER_WRITE); + memset(ptr, 0, t->filled_size->buf->size); + rctx->ws->buffer_unmap(t->filled_size->buf); + + return &t->b; +} +/* Drops the target's references to its output buffer and to its filled_size buffer, then frees the target struct. */ +void r600_so_target_destroy(struct pipe_context *ctx, + struct pipe_stream_output_target *target) +{ + struct r600_so_target *t = (struct r600_so_target*)target; + pipe_resource_reference(&t->b.buffer, NULL); + pipe_resource_reference((struct pipe_resource**)&t->filled_size, NULL); + FREE(t); +} +/* Binds a new set of streamout targets: calls r600_context_streamout_end when targets were previously bound, references targets[0..num_targets) into ctx.so_targets, unreferences any leftover old entries, and records num_targets and append_bitmask. streamout_start is set when at least one target is bound. */ +void r600_set_so_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + unsigned append_bitmask) +{ + struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx; + unsigned i; + + /* Stop streamout. */ + if (rctx->ctx.num_so_targets) { + r600_context_streamout_end(&rctx->ctx); + } + + /* Set the new targets. 
*/ + for (i = 0; i < num_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->ctx.so_targets[i], targets[i]); + } + for (; i < rctx->ctx.num_so_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->ctx.so_targets[i], NULL); + } + + rctx->ctx.num_so_targets = num_targets; + rctx->ctx.streamout_start = num_targets != 0; + rctx->ctx.streamout_append_bitmask = append_bitmask; +} + static void r600_vertex_buffer_update(struct r600_pipe_context *rctx) { struct r600_pipe_resource_state *rstate; @@ -528,7 +594,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo) struct pipe_index_buffer ib = {}; unsigned prim, mask, ls_mask = 0; - if (!info.count || + if ((!info.count && (info.indexed || !info.count_from_stream_output)) || (info.indexed && !rctx->vbuf_mgr->index_buffer.buffer) || !r600_conv_pipe_prim(info.mode, &prim)) { return; @@ -572,8 +638,15 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo) } else { info.index_bias = info.start; rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX; + if (info.count_from_stream_output) { + rdraw.vgt_draw_initiator |= S_0287F0_USE_OPAQUE(1); + + r600_context_draw_opaque_count(&rctx->ctx, (struct r600_so_target*)info.count_from_stream_output); + } } + rctx->ctx.vs_shader_so_strides = rctx->vs_shader->so_strides; + mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1; if (rctx->vgt.id != R600_PIPE_STATE_VGT) { diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 7a2fe02..ccdf82e 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -76,10 +76,23 @@ #define PKT3_DRAW_INDEX_IMMD 0x2E #define PKT3_NUM_INSTANCES 0x2F #define PKT3_STRMOUT_BUFFER_UPDATE 0x34 +#define STRMOUT_STORE_BUFFER_FILLED_SIZE 1 +#define STRMOUT_OFFSET_SOURCE(x) (((x) & 0x3) << 1) +#define STRMOUT_OFFSET_FROM_PACKET 0 +#define 
STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE 1 +#define STRMOUT_OFFSET_FROM_MEM 2 +#define STRMOUT_OFFSET_NONE 3 +#define STRMOUT_SELECT_BUFFER(x) (((x) & 0x3) << 8) #define PKT3_INDIRECT_BUFFER_MP 0x38 #define PKT3_MEM_SEMAPHORE 0x39 #define PKT3_MPEG_INDEX 0x3A +#define PKT3_COPY_DW 0x3B +#define COPY_DW_SRC_IS_REG (0 << 0) +#define COPY_DW_SRC_IS_MEM (1 << 0) +#define COPY_DW_DST_IS_REG (0 << 1) +#define COPY_DW_DST_IS_MEM (1 << 1) #define PKT3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_EQUAL 3 #define PKT3_MEM_WRITE 0x3D #define PKT3_INDIRECT_BUFFER 0x32 #define PKT3_CP_INTERRUPT 0x40 @@ -106,6 +119,8 @@ #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 #define EVENT_TYPE_ZPASS_DONE 0x15 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 +#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20 #define EVENT_TYPE(x) ((x) << 0) #define EVENT_INDEX(x) ((x) << 8) /* 0 - any non-TS event @@ -147,6 +162,12 @@ #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PRED_S(predicate)) /* Registers */ +#define R_008490_CP_STRMOUT_CNTL 0x008490 +#define S_008490_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0) +#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0 0x008960 /* read-only */ +#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1 0x008964 /* read-only */ +#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2 0x008968 /* read-only */ +#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3 0x00896C /* read-only */ #define R_008C00_SQ_CONFIG 0x00008C00 #define S_008C00_VC_ENABLE(x) (((x) & 0x1) << 0) #define G_008C00_VC_ENABLE(x) (((x) >> 0) & 0x1) @@ -3144,6 +3165,26 @@ #define S_028AB8_VTX_CNT_EN(x) (((x) & 0x1) << 0) #define G_028AB8_VTX_CNT_EN(x) (((x) >> 0) & 0x1) #define C_028AB8_VTX_CNT_EN 0xFFFFFFFE +#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 0x028AD0 +#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 0x028AD4 +#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0 0x028AD8 +#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0 
0x028ADC +#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1 0x028AE0 +#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1 0x028AE4 +#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1 0x028AE8 +#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1 0x028AEC +#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2 0x028AF0 +#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2 0x028AF4 +#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2 0x028AF8 +#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2 0x028AFC +#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3 0x028B00 +#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3 0x028B04 +#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3 0x028B08 +#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3 0x028B0C +#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0 0x028B10 +#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1 0x028B14 +#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2 0x028B18 +#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3 0x028B1C #define R_028B20_VGT_STRMOUT_BUFFER_EN 0x028B20 #define S_028B20_BUFFER_0_EN(x) (((x) & 0x1) << 0) #define G_028B20_BUFFER_0_EN(x) (((x) >> 0) & 0x1) @@ -3157,6 +3198,13 @@ #define S_028B20_BUFFER_3_EN(x) (((x) & 0x1) << 3) #define G_028B20_BUFFER_3_EN(x) (((x) >> 3) & 0x1) #define C_028B20_BUFFER_3_EN 0xFFFFFFF7 +#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET 0x028B28 +#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C +#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30 +#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0 0x028B44 +#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1 0x028B48 +#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2 0x028B4C +#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3 0x028B50 #define R_028C20_PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX 0x028C20 #define S_028C20_S4_X(x) (((x) & 0xF) << 0) #define G_028C20_S4_X(x) (((x) >> 0) & 0xF) @@ -3280,6 +3328,9 @@ #define S_0085F0_CR2_ACTION_ENA(x) (((x) & 0x1) << 31) #define G_0085F0_CR2_ACTION_ENA(x) (((x) >> 31) & 0x1) #define C_0085F0_CR2_ACTION_ENA 0x7FFFFFFF +#define R_0085F4_CP_COHER_SIZE 0x0085F4 +#define R_0085F8_CP_COHER_BASE 0x0085F8 
+#define R_0085FC_CP_COHER_STATUS 0x0085FC #define R_02812C_CB_CLEAR_ALPHA 0x02812C -- 2.7.4