r600g: implement transform feedback
author Marek Olšák <maraeo@gmail.com>
Tue, 8 Nov 2011 20:58:27 +0000 (21:58 +0100)
committer Marek Olšák <maraeo@gmail.com>
Sat, 17 Dec 2011 13:56:21 +0000 (14:56 +0100)
r600: DONE.
r700: MOSTLY (done but locks up).
Evergreen: MOSTLY (done but doesn't work for an unknown reason).

The kernel support will come soon.

18 files changed:
src/gallium/drivers/r600/eg_asm.c
src/gallium/drivers/r600/evergreen_hw_context.c
src/gallium/drivers/r600/evergreen_state.c
src/gallium/drivers/r600/evergreend.h
src/gallium/drivers/r600/r600.h
src/gallium/drivers/r600/r600_asm.c
src/gallium/drivers/r600/r600_asm.h
src/gallium/drivers/r600/r600_blit.c
src/gallium/drivers/r600/r600_hw_context.c
src/gallium/drivers/r600/r600_hw_context_priv.h
src/gallium/drivers/r600/r600_pipe.c
src/gallium/drivers/r600/r600_pipe.h
src/gallium/drivers/r600/r600_query.c
src/gallium/drivers/r600/r600_shader.c
src/gallium/drivers/r600/r600_sq.h
src/gallium/drivers/r600/r600_state.c
src/gallium/drivers/r600/r600_state_common.c
src/gallium/drivers/r600/r600d.h

index f6b8631..877e162 100644 (file)
@@ -73,6 +73,35 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
                        bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
                id++;
                break;
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
+               bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
+               bc->bytecode[id] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
+                       cf->output.inst |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size);
+               if (bc->chip_class == EVERGREEN) /* no EOP on cayman */
+                       bc->bytecode[id] |= S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
+               id++;
+               break;
        case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
        case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
        case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
index 96e8d18..bd1d969 100644 (file)
@@ -1241,3 +1241,38 @@ void evergreen_context_flush_dest_caches(struct r600_context *ctx)
 
        ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY;
 }
+
+void evergreen_flush_vgt_streamout(struct r600_context *ctx) /* Appends 12 dwords to ctx->pm4: clear CP_STRMOUT_CNTL, issue an SO_VGTSTREAMOUT_FLUSH event, then wait on OFFSET_UPDATE_DONE. No space check is done here; NOTE(review): presumably the caller reserves command-stream space first - confirm. */
+{
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); /* packet 1: SET_CONFIG_REG header */
+       ctx->pm4[ctx->pm4_cdwords++] = (R_0084FC_CP_STRMOUT_CNTL - EVERGREEN_CONFIG_REG_OFFSET) >> 2; /* register given as a dword index relative to EVERGREEN_CONFIG_REG_OFFSET */
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* value written: zero, so OFFSET_UPDATE_DONE (bit 0) starts out cleared before the wait below */
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0); /* packet 2: EVENT_WRITE header */
+       ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0); /* the streamout flush event, with event index 0 */
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0); /* packet 3: WAIT_REG_MEM header followed by six payload dwords */
+       ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+       ctx->pm4[ctx->pm4_cdwords++] = R_0084FC_CP_STRMOUT_CNTL >> 2;  /* register, as an absolute dword index (no config-base subtraction here, unlike the SET_CONFIG_REG above) */
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* NOTE(review): presumably the upper address dword, unused when polling a register - confirm */
+       ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */
+       ctx->pm4[ctx->pm4_cdwords++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */
+       ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+}
+
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit) /* Appends SET_CONTEXT_REG packets to ctx->pm4 that turn stream 0 on (6 dwords) when buffer_enable_bit is non-zero, or off (3 dwords) when it is zero. No space check is done here. */
+{
+       if (buffer_enable_bit) { /* any non-zero value enables streamout */
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); /* first packet: one context register */
+               ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; /* dword index relative to EVERGREEN_CONTEXT_REG_OFFSET */
+               ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(1); /* only stream 0 is enabled; the STREAMOUT_1..3_EN fields stay zero */
+
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); /* second packet: one context register */
+               ctx->pm4[ctx->pm4_cdwords++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; /* dword index relative to EVERGREEN_CONTEXT_REG_OFFSET */
+               ctx->pm4[ctx->pm4_cdwords++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit); /* the field macro keeps only the low 4 bits of buffer_enable_bit */
+       } else { /* disable path: VGT_STRMOUT_BUFFER_CONFIG is deliberately or incidentally left untouched */
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); /* single packet: one context register */
+               ctx->pm4[ctx->pm4_cdwords++] = (R_028B94_VGT_STRMOUT_CONFIG - EVERGREEN_CONTEXT_REG_OFFSET) >> 2; /* dword index relative to EVERGREEN_CONTEXT_REG_OFFSET */
+               ctx->pm4[ctx->pm4_cdwords++] = S_028B94_STREAMOUT_0_EN(0); /* writes zero to the whole register, clearing STREAMOUT_0_EN */
+       }
+}
index d0c02d5..6f5d6f7 100644 (file)
@@ -927,8 +927,8 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
                                state->fill_back != PIPE_POLYGON_MODE_FILL);
        r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL,
                S_028814_PROVOKING_VTX_LAST(prov_vtx) |
-               S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-               S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+               S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+               S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
                S_028814_FACE(!state->front_ccw) |
                S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
                S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
@@ -1688,6 +1688,9 @@ void evergreen_init_state_functions(struct r600_pipe_context *rctx)
        rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
        rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
        rctx->context.texture_barrier = evergreen_texture_barrier;
+       rctx->context.create_stream_output_target = r600_create_so_target;
+       rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+       rctx->context.set_stream_output_targets = r600_set_so_targets;
 }
 
 static void cayman_init_config(struct r600_pipe_context *rctx)
index 6baa2a7..68b77b4 100644 (file)
@@ -50,6 +50,8 @@
 #define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
 #define EVENT_TYPE_ZPASS_DONE                  0x15
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH       0x1f
+
 #define                EVENT_TYPE(x)                           ((x) << 0)
 #define                EVENT_INDEX(x)                          ((x) << 8)
                 /* 0 - any non-TS event
@@ -82,6 +84,7 @@
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A
 #define PKT3_WAIT_REG_MEM                      0x3C
+#define                WAIT_REG_MEM_EQUAL              3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_CP_INTERRUPT                      0x40
 #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate))
 
 /* Registers */
+#define R_0084FC_CP_STRMOUT_CNTL                    0x000084FC
+#define   S_0084FC_OFFSET_UPDATE_DONE(x)               (((x) & 0x1) << 0)
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0    0x008960 /* read-only */
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1    0x008964 /* read-only */
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2    0x008968 /* read-only */
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3    0x00896C /* read-only */
 #define R_008C00_SQ_CONFIG                           0x00008C00
 #define   S_008C00_VC_ENABLE(x)                        (((x) & 0x1) << 0)
 #define   G_008C00_VC_ENABLE(x)                        (((x) >> 0) & 0x1)
 #define R_028AC0_DB_SRESULTS_COMPARE_STATE0          0x00028AC0
 #define R_028AC4_DB_SRESULTS_COMPARE_STATE1          0x00028AC4
 #define R_028AC8_DB_PRELOAD_CONTROL                  0x00028AC8
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0          0x028AD0
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0           0x028AD4
+#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0          0x028AD8
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0        0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1          0x028AE0
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1           0x028AE4
+#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1          0x028AE8
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1        0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2          0x028AF0
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2           0x028AF4
+#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2          0x028AF8
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2        0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3          0x028B00
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3           0x028B04
+#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3          0x028B08
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3        0x028B0C
+#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0          0x028B10
+#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1          0x028B14
+#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2          0x028B18
+#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3          0x028B1C
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET             0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30
+#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0       0x028B44
+#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1       0x028B48
+#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2       0x028B4C
+#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3       0x028B50
 #define R_028B54_VGT_SHADER_STAGES_EN                0x00028B54
 #define R_028B70_DB_ALPHA_TO_MASK                    0x00028B70
 #define R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL       0x00028B78
 #define   G_028B8C_OFFSET(x)                           (((x) >> 0) & 0xFFFFFFFF)
 #define   C_028B8C_OFFSET                              0x00000000
 #define R_028B94_VGT_STRMOUT_CONFIG                  0x00028B94
+#define   S_028B94_STREAMOUT_0_EN(x)                   (((x) & 0x1) << 0)
+#define   S_028B94_STREAMOUT_1_EN(x)                   (((x) & 0x1) << 1)
+#define   S_028B94_STREAMOUT_2_EN(x)                   (((x) & 0x1) << 2)
+#define   S_028B94_STREAMOUT_3_EN(x)                   (((x) & 0x1) << 3)
+#define   S_028B94_RAST_STREAM(x)                      (((x) & 0x7) << 4)
 #define R_028B98_VGT_STRMOUT_BUFFER_CONFIG           0x00028B98
+#define   S_028B98_STREAM_0_BUFFER_EN(x)               (((x) & 0xf) << 0)
+#define   S_028B98_STREAM_1_BUFFER_EN(x)               (((x) & 0xf) << 4)
+#define   S_028B98_STREAM_2_BUFFER_EN(x)               (((x) & 0xf) << 8)
+#define   S_028B98_STREAM_3_BUFFER_EN(x)               (((x) & 0xf) << 12)
 #define R_028C00_PA_SC_LINE_CNTL                     0x00028C00
 #define R_028C04_PA_SC_AA_CONFIG                     0x00028C04
 #define R_028C08_PA_SU_VTX_CNTL                      0x00028C08
index e697406..fbd12fb 100644 (file)
@@ -167,6 +167,7 @@ struct r600_query {
        union {
                uint64_t                        u64;
                boolean                         b;
+               struct pipe_query_data_so_statistics so;
        } result;
        /* The kind of query */
        unsigned                                type;
@@ -187,6 +188,15 @@ struct r600_query {
        struct list_head                        list;
 };
 
+struct r600_so_target { /* Driver-side stream-output target wrapping the gallium pipe_stream_output_target. */
+       struct pipe_stream_output_target b; /* keep as the first member: r600_blitter_begin casts the r600_so_target* array to pipe_stream_output_target** */
+
+       /* The buffer where BUFFER_FILLED_SIZE is stored. */
+       struct r600_resource    *filled_size;
+       unsigned                stride; /* NOTE(review): presumably the vertex stride used for this target; units and who sets it are not visible here - confirm */
+       unsigned                so_index; /* NOTE(review): presumably the streamout buffer slot this target is bound to - confirm against r600_set_so_targets */
+};
+
 #define R600_CONTEXT_DRAW_PENDING      (1 << 0)
 #define R600_CONTEXT_DST_CACHES_DIRTY  (1 << 1)
 #define R600_CONTEXT_CHECK_EVENT_FLUSH (1 << 2)
@@ -218,6 +228,7 @@ struct r600_context {
        /* The list of active queries. Only one query of each type can be active. */
        struct list_head        active_query_list;
        unsigned                num_cs_dw_queries_suspend;
+       unsigned                num_cs_dw_streamout_end;
 
        unsigned                backend_mask;
        unsigned                max_db; /* for OQ */
@@ -229,6 +240,12 @@ struct r600_context {
        struct r600_range fs_resources;
        int num_ps_resources, num_vs_resources, num_fs_resources;
        boolean                 have_depth_texture, have_depth_fb;
+
+       unsigned                        num_so_targets;
+       struct r600_so_target           *so_targets[PIPE_MAX_SO_BUFFERS];
+       boolean                         streamout_start;
+       unsigned                        streamout_append_bitmask;
+       unsigned                        *vs_shader_so_strides;
 };
 
 struct r600_draw {
@@ -268,6 +285,10 @@ void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fen
 void r600_context_flush_all(struct r600_context *ctx, unsigned flush_flags);
 void r600_context_flush_dest_caches(struct r600_context *ctx);
 
+void r600_context_streamout_begin(struct r600_context *ctx);
+void r600_context_streamout_end(struct r600_context *ctx);
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t);
+
 int evergreen_context_init(struct r600_context *ctx, struct r600_screen *screen);
 void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw);
 void evergreen_context_flush_dest_caches(struct r600_context *ctx);
index 1ab16f2..e617b16 100644 (file)
@@ -1672,6 +1672,21 @@ static int r600_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode
                        cf->output.inst |
                        S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
                break;
+       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
+               bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
+               bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
+                       cf->output.inst |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE(cf->output.array_size) |
+                       S_SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK(cf->output.comp_mask);
+               break;
        case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
        case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
        case V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1730,6 +1745,22 @@ int r600_bytecode_build(struct r600_bytecode *bc)
                        case EG_V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
                        case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
                        case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1760,6 +1791,10 @@ int r600_bytecode_build(struct r600_bytecode *bc)
                        case V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE:
                        case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
                        case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
                        case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
                        case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
                        case V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -1849,6 +1884,22 @@ int r600_bytecode_build(struct r600_bytecode *bc)
                                break;
                        case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
                        case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
@@ -1923,6 +1974,10 @@ int r600_bytecode_build(struct r600_bytecode *bc)
                                break;
                        case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
                        case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
                        case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
                        case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
                        case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
@@ -2057,6 +2112,44 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
                                fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
                                fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
                                break;
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2_BUF3:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF0:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF1:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF2:
+                       case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3_BUF3:
+                               fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
+                                       (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+                                        EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
+                                       (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+                                        EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
+                               fprintf(stderr, "GPR:%X ", cf->output.gpr);
+                               fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
+                               fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
+                               fprintf(stderr, "TYPE:%X\n", cf->output.type);
+                               id++;
+                               fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i_BUF%i ", id, bc->bytecode[id],
+                                       (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+                                        EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) / 4,
+                                       (EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+                                        EG_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0)) % 4);
+                               fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
+                               fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
+                               fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
+                               fprintf(stderr, "INST:%d ", cf->output.inst);
+                               fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
+                               fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
+                               break;
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
                        case EG_V_SQ_CF_WORD1_SQ_CF_INST_POP:
@@ -2125,6 +2218,28 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
                                fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
                                fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
                                break;
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2:
+                       case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3:
+                               fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
+                                       R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+                                       R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
+                               fprintf(stderr, "GPR:%X ", cf->output.gpr);
+                               fprintf(stderr, "ELEM_SIZE:%i ", cf->output.elem_size);
+                               fprintf(stderr, "ARRAY_BASE:%i ", cf->output.array_base);
+                               fprintf(stderr, "TYPE:%X\n", cf->output.type);
+                               id++;
+                               fprintf(stderr, "%04d %08X EXPORT MEM_STREAM%i ", id, bc->bytecode[id],
+                                       R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->inst) -
+                                       R600_G_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0));
+                               fprintf(stderr, "ARRAY_SIZE:%i ", cf->output.array_size);
+                               fprintf(stderr, "COMP_MASK:%X ", cf->output.comp_mask);
+                               fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
+                               fprintf(stderr, "INST:%d ", cf->output.inst);
+                               fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
+                               fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
+                               break;
                        case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
                        case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
                        case V_SQ_CF_WORD1_SQ_CF_INST_POP:
index 0fd4467..d0ff75d 100644 (file)
@@ -107,6 +107,8 @@ struct r600_bytecode_vtx {
 
 struct r600_bytecode_output {
        unsigned                        array_base;
+       unsigned                        array_size;
+       unsigned                        comp_mask;
        unsigned                        type;
        unsigned                        end_of_program;
 
index 9326dc6..313ed12 100644 (file)
@@ -65,6 +65,8 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
        util_blitter_save_vertex_buffers(rctx->blitter,
                                         rctx->vbuf_mgr->nr_vertex_buffers,
                                         rctx->vbuf_mgr->vertex_buffer);
+       util_blitter_save_so_targets(rctx->blitter, rctx->ctx.num_so_targets,
+                                    (struct pipe_stream_output_target**)rctx->ctx.so_targets);
 
        if (op & R600_SAVE_FRAMEBUFFER)
                util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer);
index 52e0be7..1dba966 100644 (file)
@@ -943,6 +943,9 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
        /* Count in queries_suspend. */
        num_dw += ctx->num_cs_dw_queries_suspend;
 
+       /* Count in streamout_end at the end of CS. */
+       num_dw += ctx->num_cs_dw_streamout_end;
+
        /* Count in render_condition(NULL) at the end of CS. */
        if (ctx->predicate_drawing) {
                num_dw += 3;
@@ -1471,6 +1474,12 @@ void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw)
                r600_context_block_resource_emit_dirty(ctx, dirty_block);
        }
 
+       /* Enable stream out if needed. */
+       if (ctx->streamout_start) {
+               r600_context_streamout_begin(ctx);
+               ctx->streamout_start = FALSE;
+       }
+
        /* draw packet */
        pm4 = &ctx->pm4[ctx->pm4_cdwords];
 
@@ -1503,6 +1512,7 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
 {
        struct r600_block *enable_block = NULL;
        bool queries_suspended = false;
+       bool streamout_suspended = false;
 
        if (ctx->pm4_cdwords == ctx->init_dwords)
                return;
@@ -1513,6 +1523,11 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
                queries_suspended = true;
        }
 
+       if (ctx->num_cs_dw_streamout_end) {
+               r600_context_streamout_end(ctx);
+               streamout_suspended = true;
+       }
+
        if (ctx->screen->chip_class >= EVERGREEN)
                evergreen_context_flush_dest_caches(ctx);
        else
@@ -1542,6 +1557,11 @@ void r600_context_flush(struct r600_context *ctx, unsigned flags)
 
        r600_init_cs(ctx);
 
+       if (streamout_suspended) {
+               ctx->streamout_start = TRUE;
+               ctx->streamout_append_bitmask = ~0;
+       }
+
        /* resume queries */
        if (queries_suspended) {
                r600_context_queries_resume(ctx);
@@ -1636,6 +1656,44 @@ static boolean r600_query_result(struct r600_context *ctx, struct r600_query *qu
                        results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
                }
                break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+               /* SAMPLE_STREAMOUTSTATS stores this structure:
+                * {
+                *    u64 NumPrimitivesWritten;
+                *    u64 PrimitiveStorageNeeded;
+                * }
+                * We only need NumPrimitivesWritten here. */
+               while (results_base != query->results_end) {
+                       query->result.u64 +=
+                               r600_query_read_result(map + results_base, 2, 6, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+               /* Here we read PrimitiveStorageNeeded. */
+               while (results_base != query->results_end) {
+                       query->result.u64 +=
+                               r600_query_read_result(map + results_base, 0, 4, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_SO_STATISTICS:
+               while (results_base != query->results_end) {
+                       query->result.so.num_primitives_written +=
+                               r600_query_read_result(map + results_base, 2, 6, true);
+                       query->result.so.primitives_storage_needed +=
+                               r600_query_read_result(map + results_base, 0, 4, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               while (results_base != query->results_end) {
+                       query->result.b = query->result.b ||
+                               r600_query_read_result(map + results_base, 2, 6, true) !=
+                               r600_query_read_result(map + results_base, 0, 4, true);
+                       results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0;
+               }
+               break;
        default:
                assert(0);
        }
@@ -1679,6 +1737,15 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE);
+               results = (u32*)((char*)results + query->results_end);
+               memset(results, 0, query->result_size);
+               ctx->ws->buffer_unmap(query->buffer->buf);
+               break;
        default:
                assert(0);
        }
@@ -1692,6 +1759,15 @@ void r600_query_begin(struct r600_context *ctx, struct r600_query *query)
                ctx->pm4[ctx->pm4_cdwords++] = query->results_end;
                ctx->pm4[ctx->pm4_cdwords++] = 0;
                break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+               ctx->pm4[ctx->pm4_cdwords++] = query->results_end;
+               ctx->pm4[ctx->pm4_cdwords++] = 0;
+               break;
        case PIPE_QUERY_TIME_ELAPSED:
                ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
@@ -1720,6 +1796,15 @@ void r600_query_end(struct r600_context *ctx, struct r600_query *query)
                ctx->pm4[ctx->pm4_cdwords++] = query->results_end + 8;
                ctx->pm4[ctx->pm4_cdwords++] = 0;
                break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 2, 0);
+               ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3);
+               ctx->pm4[ctx->pm4_cdwords++] = query->results_end + query->result_size/2;
+               ctx->pm4[ctx->pm4_cdwords++] = 0;
+               break;
        case PIPE_QUERY_TIME_ELAPSED:
                ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0);
                ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
@@ -1798,6 +1883,14 @@ struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned
                query->result_size = 16;
                query->num_cs_dw = 8;
                break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               /* NumPrimitivesWritten, PrimitiveStorageNeeded. */
+               query->result_size = 32;
+               query->num_cs_dw = 6;
+               break;
        default:
                assert(0);
                FREE(query);
@@ -1832,20 +1925,28 @@ boolean r600_context_query_result(struct r600_context *ctx,
 {
        boolean *result_b = (boolean*)vresult;
        uint64_t *result_u64 = (uint64_t*)vresult;
+       struct pipe_query_data_so_statistics *result_so =
+               (struct pipe_query_data_so_statistics*)vresult;
 
        if (!r600_query_result(ctx, query, wait))
                return FALSE;
 
        switch (query->type) {
        case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
                *result_u64 = query->result.u64;
                break;
        case PIPE_QUERY_OCCLUSION_PREDICATE:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
                *result_b = query->result.b;
                break;
        case PIPE_QUERY_TIME_ELAPSED:
                *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq;
                break;
+       case PIPE_QUERY_SO_STATISTICS:
+               *result_so = query->result.so;
+               break;
        default:
                assert(0);
        }
@@ -1872,3 +1973,237 @@ void r600_context_queries_resume(struct r600_context *ctx)
                r600_query_begin(ctx, query);
        }
 }
+
+/* Emit the R6xx/R7xx streamout flush sequence: clear CP_STRMOUT_CNTL,
+ * raise the SO_VGTSTREAMOUT_FLUSH event, then wait until the
+ * OFFSET_UPDATE_DONE bit of CP_STRMOUT_CNTL reads back as set.
+ *
+ * Appends 12 dwords to ctx->pm4. This function does not reserve CS space
+ * itself; the callers account for these dwords. */
+static void r600_flush_vgt_streamout(struct r600_context *ctx)
+{
+       const uint32_t packets[] = {
+               /* CP_STRMOUT_CNTL = 0 */
+               PKT3(PKT3_SET_CONFIG_REG, 1, 0),
+               (R_008490_CP_STRMOUT_CNTL - R600_CONFIG_REG_OFFSET) >> 2,
+               0,
+
+               /* streamout flush event */
+               PKT3(PKT3_EVENT_WRITE, 0, 0),
+               EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0),
+
+               /* wait for (CP_STRMOUT_CNTL & mask) == reference */
+               PKT3(PKT3_WAIT_REG_MEM, 5, 0),
+               WAIT_REG_MEM_EQUAL,             /* compare function */
+               R_008490_CP_STRMOUT_CNTL >> 2,  /* register */
+               0,
+               S_008490_OFFSET_UPDATE_DONE(1), /* reference value */
+               S_008490_OFFSET_UPDATE_DONE(1), /* mask */
+               4,                              /* poll interval */
+       };
+       unsigned n;
+
+       for (n = 0; n < sizeof(packets) / sizeof(packets[0]); n++)
+               ctx->pm4[ctx->pm4_cdwords++] = packets[n];
+}
+
+/* Program VGT_STRMOUT_EN and, when at least one buffer is enabled,
+ * VGT_STRMOUT_BUFFER_EN with the given per-buffer mask.
+ *
+ * A zero buffer_enable_bit turns streamout off and emits 3 dwords;
+ * a non-zero mask emits 6 dwords. */
+static void r600_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit)
+{
+       /* VGT_STRMOUT_EN is written in both cases; only the value differs. */
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = (R_028AB0_VGT_STRMOUT_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = buffer_enable_bit ? S_028AB0_STREAMOUT(1)
+                                                        : S_028AB0_STREAMOUT(0);
+
+       if (!buffer_enable_bit)
+               return;
+
+       /* Per-buffer enable mask. */
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = (R_028B20_VGT_STRMOUT_BUFFER_EN - R600_CONTEXT_REG_OFFSET) >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = buffer_enable_bit;
+}
+
+/* Start streaming out to the currently bound streamout targets.
+ *
+ * Flushes the VGT streamout state, enables streamout for every bound
+ * target among the first four slots, programs each target's size and
+ * vertex stride, and sets its write offset: either loaded from the
+ * target's filled_size buffer (append) or taken from the target's
+ * buffer_offset (start from the beginning).
+ *
+ * Also records in ctx->num_cs_dw_streamout_end how many dwords
+ * r600_context_streamout_end() will need. */
+void r600_context_streamout_begin(struct r600_context *ctx)
+{
+       struct r600_so_target **t = ctx->so_targets;
+       unsigned *strides = ctx->vs_shader_so_strides;
+       unsigned buffer_en, i, update_flags = 0;
+
+       /* One enable bit per bound target. */
+       buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) |
+                   (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) |
+                   (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) |
+                   (ctx->num_so_targets >= 4 && t[3] ? 8 : 0);
+
+       /* NOTE(review): this is already non-zero when r600_need_cs_space()
+        * runs below; if that call ends up flushing, r600_context_flush()
+        * will emit streamout_end before this begin - confirm intended. */
+       ctx->num_cs_dw_streamout_end =
+               12 + /* flush_vgt_streamout */
+               util_bitcount(buffer_en) * 8 + /* STRMOUT_BUFFER_UPDATE + reloc */
+               8; /* set_streamout_enable(0) + cache flush / SURFACE_SYNC */
+
+       /* Every enabled buffer emits SET_CONTEXT_REG (5 dwords) plus a reloc
+        * NOP (2 dwords) before its STRMOUT_BUFFER_UPDATE, so those 7 dwords
+        * must be reserved as well. */
+       r600_need_cs_space(ctx,
+                          12 + /* flush_vgt_streamout */
+                          6 + /* enables */
+                          util_bitcount(buffer_en) * 7 + /* SET_CONTEXT_REG + reloc */
+                          util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 +
+                          util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 +
+                          (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770 ? 2 : 0) +
+                          ctx->num_cs_dw_streamout_end, TRUE);
+
+       if (ctx->screen->chip_class >= EVERGREEN) {
+               evergreen_flush_vgt_streamout(ctx);
+               evergreen_set_streamout_enable(ctx, buffer_en);
+       } else {
+               r600_flush_vgt_streamout(ctx);
+               r600_set_streamout_enable(ctx, buffer_en);
+       }
+
+       for (i = 0; i < ctx->num_so_targets; i++) {
+               if (t[i]) {
+                       /* Remember the stride and slot on the target;
+                        * r600_context_draw_opaque_count() reads them. */
+                       t[i]->stride = strides[i];
+                       t[i]->so_index = i;
+
+                       update_flags |= SURFACE_BASE_UPDATE_STRMOUT(i);
+
+                       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0);
+                       ctx->pm4[ctx->pm4_cdwords++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 +
+                                                       16*i - R600_CONTEXT_REG_OFFSET) >> 2;
+                       ctx->pm4[ctx->pm4_cdwords++] = (t[i]->b.buffer_offset +
+                                                       t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */
+                       ctx->pm4[ctx->pm4_cdwords++] = strides[i] >> 2;            /* VTX_STRIDE (in DW) */
+                       ctx->pm4[ctx->pm4_cdwords++] = 0;                          /* BUFFER_BASE */
+
+                       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+                       ctx->pm4[ctx->pm4_cdwords++] =
+                               r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer),
+                                                     RADEON_USAGE_WRITE);
+
+                       if (ctx->streamout_append_bitmask & (1 << i)) {
+                               /* Append. */
+                               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+                               ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+                                                              STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */
+
+                               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+                               ctx->pm4[ctx->pm4_cdwords++] =
+                                       r600_context_bo_reloc(ctx,  t[i]->filled_size,
+                                                             RADEON_USAGE_READ);
+                       } else {
+                               /* Start from the beginning. */
+                               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+                               ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+                                                              STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+                               ctx->pm4[ctx->pm4_cdwords++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */
+                               ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+                       }
+               }
+       }
+
+       /* Only the families strictly between R600 and RV770 get this packet. */
+       if (ctx->screen->family > CHIP_R600 && ctx->screen->family < CHIP_RV770) {
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_BASE_UPDATE, 0, 0);
+               ctx->pm4[ctx->pm4_cdwords++] = update_flags;
+       }
+}
+
+/* Stop streaming out.
+ *
+ * Flushes the VGT streamout state, stores each bound target's filled
+ * size into its filled_size buffer, disables streamout and flushes the
+ * streamout destination caches. Resets ctx->num_cs_dw_streamout_end
+ * to 0. */
+void r600_context_streamout_end(struct r600_context *ctx)
+{
+       struct r600_so_target **t = ctx->so_targets;
+       unsigned i, flush_flags = 0;
+
+       if (ctx->screen->chip_class >= EVERGREEN) {
+               evergreen_flush_vgt_streamout(ctx);
+       } else {
+               r600_flush_vgt_streamout(ctx);
+       }
+
+       for (i = 0; i < ctx->num_so_targets; i++) {
+               if (t[i]) {
+                       /* The destination address is supplied by the reloc below. */
+                       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0);
+                       ctx->pm4[ctx->pm4_cdwords++] = STRMOUT_SELECT_BUFFER(i) |
+                                                      STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
+                                                      STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */
+                       ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address lo */
+                       ctx->pm4[ctx->pm4_cdwords++] = 0; /* dst address hi */
+                       ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+                       ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+                       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+                       ctx->pm4[ctx->pm4_cdwords++] =
+                               r600_context_bo_reloc(ctx,  t[i]->filled_size,
+                                                     RADEON_USAGE_WRITE);
+
+                       /* Collect the SOn_DEST_BASE_ENA bit for the sync below. */
+                       flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i;
+               }
+       }
+
+       if (ctx->screen->chip_class >= EVERGREEN) {
+               evergreen_set_streamout_enable(ctx, 0);
+       } else {
+               r600_set_streamout_enable(ctx, 0);
+       }
+
+       /* Families before RV770 use a cache flush event; RV770 and later
+        * use SURFACE_SYNC on the written streamout buffers. */
+       if (ctx->screen->family < CHIP_RV770) {
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_EVENT_WRITE, 0, 0);
+               ctx->pm4[ctx->pm4_cdwords++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0);
+       } else {
+               ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SURFACE_SYNC, 3, 0);
+               ctx->pm4[ctx->pm4_cdwords++] = flush_flags;     /* CP_COHER_CNTL */
+               ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff;      /* CP_COHER_SIZE */
+               ctx->pm4[ctx->pm4_cdwords++] = 0;               /* CP_COHER_BASE */
+               ctx->pm4[ctx->pm4_cdwords++] = 0x0000000A;      /* POLL_INTERVAL */
+       }
+
+       ctx->num_cs_dw_streamout_end = 0;
+
+       /* XXX print some debug info */
+       for (i = 0; i < ctx->num_so_targets; i++) {
+               uint32_t *ptr;
+
+               if (!t[i])
+                       continue;
+
+               /* buffer_map() takes a PIPE_TRANSFER_* usage, not a
+                * RADEON_USAGE_* relocation flag; map read-only. */
+               ptr = ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs, PIPE_TRANSFER_READ);
+               if (ptr) {
+                       printf("FILLED_SIZE%i: %u\n", i, *ptr);
+                       ctx->ws->buffer_unmap(t[i]->filled_size->buf);
+               }
+       }
+}
+
+/* Emit the state for a draw whose vertex count is derived from a
+ * streamout target: zero VGT_STRMOUT_DRAW_OPAQUE_OFFSET, program the
+ * vertex stride from t->stride (in dwords), and copy one dword from
+ * t->filled_size into VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE.
+ *
+ * t->stride is the value recorded on the target by
+ * r600_context_streamout_begin(). */
+void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t)
+{
+       /* 14 dwords are emitted below; the extra 21 match the size of the
+        * disabled "#if 0" block at the end of this function. */
+       r600_need_cs_space(ctx, 14 + 21, TRUE);
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - R600_CONTEXT_REG_OFFSET) >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = 0;
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - R600_CONTEXT_REG_OFFSET) >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = t->stride >> 2;
+
+       /* Memory -> register copy; the source address is left as 0 here and
+        * the source buffer is named by the reloc NOP that follows. */
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG;
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address lo */
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* src address hi */
+       ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx,  t->filled_size,
+                                                            RADEON_USAGE_READ);
+
+       /* Disabled: would copy the filled size into CP_COHER_SIZE, set up
+        * CP_COHER_CNTL/CP_COHER_BASE for the target buffer and wait for
+        * CP_COHER_STATUS to read 0. */
+#if 0 /* I have not found this useful yet. */
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_COPY_DW, 4, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG;
+       ctx->pm4[ctx->pm4_cdwords++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+       ctx->pm4[ctx->pm4_cdwords++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* unused */
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = (R_0085F0_CP_COHER_CNTL - R600_CONFIG_REG_OFFSET) >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index;
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = (R_0085F8_CP_COHER_BASE - R600_CONFIG_REG_OFFSET) >> 2;
+       ctx->pm4[ctx->pm4_cdwords++] = t->b.buffer_offset >> 2;
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_NOP, 0, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer,
+                                                            RADEON_USAGE_WRITE);
+
+       ctx->pm4[ctx->pm4_cdwords++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0);
+       ctx->pm4[ctx->pm4_cdwords++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */
+       ctx->pm4[ctx->pm4_cdwords++] = R_0085FC_CP_COHER_STATUS >> 2;  /* register */
+       ctx->pm4[ctx->pm4_cdwords++] = 0;
+       ctx->pm4[ctx->pm4_cdwords++] = 0; /* reference value */
+       ctx->pm4[ctx->pm4_cdwords++] = 0xffffffff; /* mask */
+       ctx->pm4[ctx->pm4_cdwords++] = 4; /* poll interval */
+#endif
+}
index bea6135..206de7e 100644 (file)
@@ -76,6 +76,13 @@ void r600_context_reg(struct r600_context *ctx,
 void r600_init_cs(struct r600_context *ctx);
 int r600_resource_init(struct r600_context *ctx, struct r600_range *range, unsigned offset, unsigned nblocks, unsigned stride, struct r600_reg *reg, int nreg, unsigned offset_base);
 
+/*
+ * evergreen_hw_context.c
+ */
+void evergreen_flush_vgt_streamout(struct r600_context *ctx);
+void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit);
+
+
 static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r600_resource *rbo,
                                             enum radeon_bo_usage usage)
 {
index 97c6808..22fefaa 100644 (file)
@@ -378,6 +378,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_PRIMITIVE_RESTART:
        case PIPE_CAP_CONDITIONAL_RENDER:
        case PIPE_CAP_TEXTURE_BARRIER:
+       case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
                return 1;
 
        /* Supported except the original R600. */
@@ -391,17 +392,21 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
                return family >= CHIP_CEDAR ? 1 : 0;
 
        /* Unsupported features. */
-       case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-       case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
-       case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-       case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-       case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
        case PIPE_CAP_TGSI_INSTANCEID:
        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
        case PIPE_CAP_SCALED_RESOLVE:
                return 0;
 
+       /* Stream output. */
+       case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
+               return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 4 : 0;
+       case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_ATTRIBS:
+               return 16;
+       case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
+       case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
+               return 16*4;
+
        /* Texturing. */
        case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
        case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
index a127eed..46443af 100644 (file)
@@ -133,6 +133,8 @@ struct r600_pipe_shader {
        struct r600_vertex_element      vertex_elements;
        struct tgsi_token               *tokens;
        unsigned        sprite_coord_enable;
+       struct pipe_stream_output_info  so;
+       unsigned                        so_strides[4];
 };
 
 struct r600_pipe_sampler_state {
@@ -348,6 +350,19 @@ void r600_delete_ps_shader(struct pipe_context *ctx, void *state);
 void r600_delete_vs_shader(struct pipe_context *ctx, void *state);
 void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
                              struct pipe_resource *buffer);
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+                     struct pipe_resource *buffer,
+                     unsigned buffer_offset,
+                     unsigned buffer_size);
+void r600_so_target_destroy(struct pipe_context *ctx,
+                           struct pipe_stream_output_target *target);
+void r600_set_so_targets(struct pipe_context *ctx,
+                        unsigned num_targets,
+                        struct pipe_stream_output_target **targets,
+                        unsigned append_bitmask);
+
+
 void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info);
 
 /*
index ec0d91f..ee2d04b 100644 (file)
@@ -100,7 +100,21 @@ static void r600_render_condition(struct pipe_context *ctx,
        }
 
        rctx->ctx.predicate_drawing = true;
-       r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+
+       switch (rquery->type) {
+       case PIPE_QUERY_OCCLUSION_COUNTER:
+       case PIPE_QUERY_OCCLUSION_PREDICATE:
+               r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
+               break;
+       case PIPE_QUERY_PRIMITIVES_EMITTED:
+       case PIPE_QUERY_PRIMITIVES_GENERATED:
+       case PIPE_QUERY_SO_STATISTICS:
+       case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+               r600_query_predication(&rctx->ctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
+               break;
+       default:
+               assert(0);
+       }
 }
 
 void r600_init_query_functions(struct r600_pipe_context *rctx)
index 61f88f4..ad4aded 100644 (file)
@@ -119,6 +119,19 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *s
        if (dump_shaders) {
                fprintf(stderr, "--------------------------------------------------------------\n");
                tgsi_dump(shader->tokens, 0);
+
+               if (shader->so.num_outputs) {
+                       unsigned i;
+                       fprintf(stderr, "STREAMOUT\n");
+                       for (i = 0; i < shader->so.num_outputs; i++) {
+                               fprintf(stderr, "  %i: MEM_STREAM0_BUF%i OUT[%i].%s%s%s%s\n", i,
+                                       shader->so.output[i].output_buffer, shader->so.output[i].register_index,
+                                       shader->so.output[i].register_mask & 1 ? "x" : "_",
+                                       (shader->so.output[i].register_mask >> 1) & 1 ? "y" : "_",
+                                       (shader->so.output[i].register_mask >> 2) & 1 ? "z" : "_",
+                                       (shader->so.output[i].register_mask >> 3) & 1 ? "w" : "_");
+                       }
+               }
        }
        r = r600_shader_from_tgsi(rctx, shader);
        if (r) {
@@ -681,6 +694,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
 {
        struct r600_shader *shader = &pipeshader->shader;
        struct tgsi_token *tokens = pipeshader->tokens;
+       struct pipe_stream_output_info so = pipeshader->so;
        struct tgsi_full_immediate *immediate;
        struct tgsi_full_property *property;
        struct r600_shader_ctx ctx;
@@ -847,6 +861,93 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi
                }
        }
 
+       /* Add stream outputs. */
+       if (ctx.type == TGSI_PROCESSOR_VERTEX && so.num_outputs) {
+               unsigned buffer_offset[PIPE_MAX_SO_BUFFERS] = {0};
+
+               for (i = 0; i < so.num_outputs; i++) {
+                       struct r600_bytecode_output output;
+                       unsigned comps;
+
+                       if (so.output[i].output_buffer >= 4) {
+                               R600_ERR("exceeded the max number of stream output buffers, got: %d\n",
+                                        so.output[i].output_buffer);
+                               r = -EINVAL;
+                               goto out_err;
+                       }
+
+                       switch (so.output[i].register_mask) {
+                       case TGSI_WRITEMASK_XYZW:
+                               comps = 4;
+                               break;
+                       case TGSI_WRITEMASK_XYZ:
+                               comps = 3;
+                               break;
+                       case TGSI_WRITEMASK_XY:
+                               comps = 2;
+                               break;
+                       case TGSI_WRITEMASK_X:
+                               comps = 1;
+                               break;
+                       default:
+                               R600_ERR("streamout: invalid register_mask, got: %x\n",
+                                        so.output[i].register_mask);
+                               r = -EINVAL;
+                               goto out_err;
+                       }
+
+                       memset(&output, 0, sizeof(struct r600_bytecode_output));
+                       output.gpr = shader->output[so.output[i].register_index].gpr;
+                       output.elem_size = 0;
+                       output.array_base = buffer_offset[so.output[i].output_buffer];
+                       output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+                       output.burst_count = 1;
+                       output.barrier = 1;
+                       output.array_size = 0;
+                       output.comp_mask = so.output[i].register_mask;
+                       if (ctx.bc->chip_class >= EVERGREEN) {
+                               switch (so.output[i].output_buffer) {
+                               case 0:
+                                       output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF0;
+                                       break;
+                               case 1:
+                                       output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF1;
+                                       break;
+                               case 2:
+                                       output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF2;
+                                       break;
+                               case 3:
+                                       output.inst = EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0_BUF3;
+                                       break;
+                               }
+                       } else {
+                               switch (so.output[i].output_buffer) {
+                               case 0:
+                                       output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM0;
+                                       break;
+                               case 1:
+                                       output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM1;
+                                       break;
+                               case 2:
+                                       output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM2;
+                                       break;
+                               case 3:
+                                       output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_MEM_STREAM3;
+                                       break;
+                               }
+                       }
+                       r = r600_bytecode_add_output(ctx.bc, &output);
+                       if (r)
+                               goto out_err;
+
+                       buffer_offset[so.output[i].output_buffer] += comps;
+               }
+
+               for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
+                       pipeshader->so_strides[i] = buffer_offset[i] * 4;
+               }
+       }
+
        /* export output */
        j = 0;
        for (i = 0, pos0 = 0; i < noutput; i++) {
index 56ed35e..b9c4126 100644 (file)
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS                 0x00000001
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM               0x00000002
 #define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_SX                  0x00000003
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE               0x00000000
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND           0x00000001
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ                0x00000002
+#define     V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND            0x00000003
 #define   S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) & 0x7F) << 15)
 #define   G_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(x)                       (((x) >> 15) & 0x7F)
 #define   C_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR                          0xFFC07FFF
index 8410cfe..7f44035 100644 (file)
@@ -933,7 +933,7 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
 }
 
 static void *r600_create_rs_state(struct pipe_context *ctx,
-                                       const struct pipe_rasterizer_state *state)
+                                 const struct pipe_rasterizer_state *state)
 {
        struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
        struct r600_pipe_rasterizer *rs = CALLOC_STRUCT(r600_pipe_rasterizer);
@@ -978,8 +978,8 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
                                state->fill_back != PIPE_POLYGON_MODE_FILL);
        r600_pipe_state_add_reg(rstate, R_028814_PA_SU_SC_MODE_CNTL,
                S_028814_PROVOKING_VTX_LAST(prov_vtx) |
-               S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-               S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+               S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+               S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
                S_028814_FACE(!state->front_ccw) |
                S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
                S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
@@ -1758,6 +1758,9 @@ void r600_init_state_functions(struct r600_pipe_context *rctx)
        rctx->context.sampler_view_destroy = r600_sampler_view_destroy;
        rctx->context.redefine_user_buffer = u_default_redefine_user_buffer;
        rctx->context.texture_barrier = r600_texture_barrier;
+       rctx->context.create_stream_output_target = r600_create_so_target;
+       rctx->context.stream_output_target_destroy = r600_so_target_destroy;
+       rctx->context.set_stream_output_targets = r600_set_so_targets;
 }
 
 void r600_adjust_gprs(struct r600_pipe_context *rctx)
index d6ffda4..9f6f514 100644 (file)
@@ -247,10 +247,11 @@ void *r600_create_vertex_elements(struct pipe_context *ctx,
 void *r600_create_shader_state(struct pipe_context *ctx,
                               const struct pipe_shader_state *state)
 {
-       struct r600_pipe_shader *shader =  CALLOC_STRUCT(r600_pipe_shader);
+       struct r600_pipe_shader *shader = CALLOC_STRUCT(r600_pipe_shader);
        int r;
 
        shader->tokens = tgsi_dup_tokens(state->tokens);
+       shader->so = state->stream_output;
 
        r =  r600_pipe_shader_create(ctx, shader);
        if (r) {
@@ -412,6 +413,71 @@ void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index,
                pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL);
 }
 
+/**
+ * Create a stream-output (transform feedback) target covering a range of
+ * \p buffer.
+ *
+ * The target takes its own reference on \p buffer and additionally owns a
+ * 4-byte "filled_size" buffer, which is zero-initialized here through a
+ * CPU mapping.
+ *
+ * \param ctx            context the target is created for
+ * \param buffer         resource the stream output is written to
+ * \param buffer_offset  stored as-is in the target
+ * \param buffer_size    stored as-is in the target
+ * \return the new target with a reference count of 1, or NULL when the
+ *         target struct or the filled-size buffer cannot be allocated,
+ *         or when the filled-size buffer cannot be mapped.
+ */
+struct pipe_stream_output_target *
+r600_create_so_target(struct pipe_context *ctx,
+                     struct pipe_resource *buffer,
+                     unsigned buffer_offset,
+                     unsigned buffer_size)
+{
+       struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+       struct r600_so_target *t;
+       void *ptr;
+
+       t = CALLOC_STRUCT(r600_so_target);
+       if (!t) {
+               return NULL;
+       }
+
+       t->b.reference.count = 1;
+       t->b.context = ctx;
+       pipe_resource_reference(&t->b.buffer, buffer);
+       t->b.buffer_offset = buffer_offset;
+       t->b.buffer_size = buffer_size;
+
+       t->filled_size = (struct r600_resource*)
+               pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STATIC, 4);
+       if (!t->filled_size) {
+               /* Without this check the map call below dereferences NULL. */
+               goto fail;
+       }
+
+       ptr = rctx->ws->buffer_map(t->filled_size->buf, rctx->ctx.cs, PIPE_TRANSFER_WRITE);
+       if (!ptr) {
+               /* Mapping failed: never hand a NULL pointer to memset. */
+               goto fail;
+       }
+       memset(ptr, 0, t->filled_size->buf->size);
+       rctx->ws->buffer_unmap(t->filled_size->buf);
+
+       return &t->b;
+
+fail:
+       /* Release in the same way r600_so_target_destroy does; filled_size
+        * may still be NULL here, which the reference helper tolerates. */
+       pipe_resource_reference(&t->b.buffer, NULL);
+       pipe_resource_reference((struct pipe_resource**)&t->filled_size, NULL);
+       FREE(t);
+       return NULL;
+}
+
+/**
+ * Destroy a stream-output target.
+ *
+ * Drops the target's references on the output buffer and on its
+ * filled-size buffer, then frees the target structure itself.
+ * \p ctx is not used.
+ */
+void r600_so_target_destroy(struct pipe_context *ctx,
+                           struct pipe_stream_output_target *target)
+{
+       struct r600_so_target *so_target = (struct r600_so_target*)target;
+       struct pipe_resource **filled_size_ref =
+               (struct pipe_resource**)&so_target->filled_size;
+
+       pipe_resource_reference(&so_target->b.buffer, NULL);
+       pipe_resource_reference(filled_size_ref, NULL);
+       FREE(so_target);
+}
+
+/**
+ * Bind a new set of stream-output targets to the context.
+ *
+ * If targets are currently bound, streamout is ended first. The first
+ * \p num_targets slots then take references on the new targets, and any
+ * previously bound slots beyond that count are released. Finally the
+ * target count, the "start streamout" flag (set only when at least one
+ * target is bound) and the append bitmask are recorded in the context.
+ */
+void r600_set_so_targets(struct pipe_context *ctx,
+                        unsigned num_targets,
+                        struct pipe_stream_output_target **targets,
+                        unsigned append_bitmask)
+{
+       struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
+       unsigned slot = 0;
+
+       /* End the streamout that is in progress on the old targets. */
+       if (rctx->ctx.num_so_targets != 0) {
+               r600_context_streamout_end(&rctx->ctx);
+       }
+
+       /* Reference the incoming targets ... */
+       while (slot < num_targets) {
+               pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->ctx.so_targets[slot],
+                                        targets[slot]);
+               slot++;
+       }
+       /* ... and unreference whatever is left over from the previous set. */
+       while (slot < rctx->ctx.num_so_targets) {
+               pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->ctx.so_targets[slot],
+                                        NULL);
+               slot++;
+       }
+
+       rctx->ctx.num_so_targets = num_targets;
+       rctx->ctx.streamout_append_bitmask = append_bitmask;
+       rctx->ctx.streamout_start = (num_targets != 0);
+}
+
 static void r600_vertex_buffer_update(struct r600_pipe_context *rctx)
 {
        struct r600_pipe_resource_state *rstate;
@@ -528,7 +594,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
        struct pipe_index_buffer ib = {};
        unsigned prim, mask, ls_mask = 0;
 
-       if (!info.count ||
+       if ((!info.count && (info.indexed || !info.count_from_stream_output)) ||
            (info.indexed && !rctx->vbuf_mgr->index_buffer.buffer) ||
            !r600_conv_pipe_prim(info.mode, &prim)) {
                return;
@@ -572,8 +638,15 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
        } else {
                info.index_bias = info.start;
                rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX;
+               if (info.count_from_stream_output) {
+                       rdraw.vgt_draw_initiator |= S_0287F0_USE_OPAQUE(1);
+
+                       r600_context_draw_opaque_count(&rctx->ctx, (struct r600_so_target*)info.count_from_stream_output);
+               }
        }
 
+       rctx->ctx.vs_shader_so_strides = rctx->vs_shader->so_strides;
+
        mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1;
 
        if (rctx->vgt.id != R600_PIPE_STATE_VGT) {
index 7a2fe02..ccdf82e 100644 (file)
 #define PKT3_DRAW_INDEX_IMMD                   0x2E
 #define PKT3_NUM_INSTANCES                     0x2F
 #define PKT3_STRMOUT_BUFFER_UPDATE             0x34
+#define                STRMOUT_STORE_BUFFER_FILLED_SIZE        1 /* value 1, i.e. bit 0; NOTE(review): presumably a flag in the PKT3_STRMOUT_BUFFER_UPDATE control word - confirm */
+#define                STRMOUT_OFFSET_SOURCE(x)        (((x) & 0x3) << 1) /* 2-bit field placed at bits 2:1 */
+#define                        STRMOUT_OFFSET_FROM_PACKET              0 /* NOTE(review): 0-3 fit the 2-bit STRMOUT_OFFSET_SOURCE field; presumably its arguments - confirm */
+#define                        STRMOUT_OFFSET_FROM_VGT_FILLED_SIZE     1
+#define                        STRMOUT_OFFSET_FROM_MEM                 2
+#define                        STRMOUT_OFFSET_NONE                     3
+#define                STRMOUT_SELECT_BUFFER(x)        (((x) & 0x3) << 8) /* 2-bit field placed at bits 9:8 */
 #define PKT3_INDIRECT_BUFFER_MP                0x38
 #define PKT3_MEM_SEMAPHORE                     0x39
 #define PKT3_MPEG_INDEX                        0x3A
+#define PKT3_COPY_DW                          0x3B /* NOTE(review): COPY_DW_* below are presumably control flags for this packet - confirm */
+#define                COPY_DW_SRC_IS_REG              (0 << 0) /* bit 0 clear */
+#define                COPY_DW_SRC_IS_MEM              (1 << 0) /* bit 0 set */
+#define                COPY_DW_DST_IS_REG              (0 << 1) /* bit 1 clear */
+#define                COPY_DW_DST_IS_MEM              (1 << 1) /* bit 1 set */
 #define PKT3_WAIT_REG_MEM                      0x3C
+#define                WAIT_REG_MEM_EQUAL              3
 #define PKT3_MEM_WRITE                         0x3D
 #define PKT3_INDIRECT_BUFFER                   0x32
 #define PKT3_CP_INTERRUPT                      0x40
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14
 #define EVENT_TYPE_ZPASS_DONE                  0x15
 #define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT   0x16
+#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH       0x1f
+#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS       0x20
 #define                EVENT_TYPE(x)                           ((x) << 0)
 #define                EVENT_INDEX(x)                          ((x) << 8)
                 /* 0 - any non-TS event
 #define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PRED_S(predicate))
 
 /* Registers */
+#define R_008490_CP_STRMOUT_CNTL                    0x008490
+#define   S_008490_OFFSET_UPDATE_DONE(x)               (((x) & 0x1) << 0)
+#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0    0x008960 /* read-only */
+#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1    0x008964 /* read-only */
+#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2    0x008968 /* read-only */
+#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3    0x00896C /* read-only */
 #define R_008C00_SQ_CONFIG                           0x00008C00
 #define   S_008C00_VC_ENABLE(x)                        (((x) & 0x1) << 0)
 #define   G_008C00_VC_ENABLE(x)                        (((x) >> 0) & 0x1)
 #define   S_028AB8_VTX_CNT_EN(x)                       (((x) & 0x1) << 0)
 #define   G_028AB8_VTX_CNT_EN(x)                       (((x) >> 0) & 0x1)
 #define   C_028AB8_VTX_CNT_EN                          0xFFFFFFFE
+#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0          0x028AD0 /* buffer 0 group: SIZE/VTX_STRIDE/BASE/OFFSET at +0x0/+0x4/+0x8/+0xC */
+#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0           0x028AD4
+#define R_028AD8_VGT_STRMOUT_BUFFER_BASE_0          0x028AD8
+#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0        0x028ADC
+#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1          0x028AE0 /* buffer 1 group: buffer 0 group + 0x10 */
+#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1           0x028AE4
+#define R_028AE8_VGT_STRMOUT_BUFFER_BASE_1          0x028AE8
+#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1        0x028AEC
+#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2          0x028AF0 /* buffer 2 group: buffer 0 group + 0x20 */
+#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2           0x028AF4
+#define R_028AF8_VGT_STRMOUT_BUFFER_BASE_2          0x028AF8
+#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2        0x028AFC
+#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3          0x028B00 /* buffer 3 group: buffer 0 group + 0x30 */
+#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3           0x028B04
+#define R_028B08_VGT_STRMOUT_BUFFER_BASE_3          0x028B08
+#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3        0x028B0C
+#define R_028B10_VGT_STRMOUT_BASE_OFFSET_0          0x028B10 /* BASE_OFFSET_n lives at 0x028B10 + 4 * n */
+#define R_028B14_VGT_STRMOUT_BASE_OFFSET_1          0x028B14
+#define R_028B18_VGT_STRMOUT_BASE_OFFSET_2          0x028B18
+#define R_028B1C_VGT_STRMOUT_BASE_OFFSET_3          0x028B1C
 #define R_028B20_VGT_STRMOUT_BUFFER_EN               0x028B20
 #define   S_028B20_BUFFER_0_EN(x)                      (((x) & 0x1) << 0)
 #define   G_028B20_BUFFER_0_EN(x)                      (((x) >> 0) & 0x1)
 #define   S_028B20_BUFFER_3_EN(x)                      (((x) & 0x1) << 3)
 #define   G_028B20_BUFFER_3_EN(x)                      (((x) >> 3) & 0x1)
 #define   C_028B20_BUFFER_3_EN                         0xFFFFFFF7
+#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET             0x028B28
+#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 0x028B2C
+#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30
+#define R_028B44_VGT_STRMOUT_BASE_OFFSET_HI_0       0x028B44
+#define R_028B48_VGT_STRMOUT_BASE_OFFSET_HI_1       0x028B48
+#define R_028B4C_VGT_STRMOUT_BASE_OFFSET_HI_2       0x028B4C
+#define R_028B50_VGT_STRMOUT_BASE_OFFSET_HI_3       0x028B50
 #define R_028C20_PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX    0x028C20
 #define   S_028C20_S4_X(x)                             (((x) & 0xF) << 0)
 #define   G_028C20_S4_X(x)                             (((x) >> 0) & 0xF)
 #define   S_0085F0_CR2_ACTION_ENA(x)                   (((x) & 0x1) << 31)
 #define   G_0085F0_CR2_ACTION_ENA(x)                   (((x) >> 31) & 0x1)
 #define   C_0085F0_CR2_ACTION_ENA                      0x7FFFFFFF
+#define R_0085F4_CP_COHER_SIZE                       0x0085F4
+#define R_0085F8_CP_COHER_BASE                       0x0085F8
+#define R_0085FC_CP_COHER_STATUS                     0x0085FC
 
 
 #define R_02812C_CB_CLEAR_ALPHA                      0x02812C