From 2840bec56f79347b95dec5458b20d4a46d1aa445 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 12 Jun 2013 14:38:59 -0700 Subject: [PATCH] r600g/compute: Accept LDS size from the LLVM backend And allocate the correct amount before dispatching the kernel. Tested-by: Aaron Watry --- src/gallium/drivers/r600/evergreen_compute.c | 53 +++++++++++++++------- .../drivers/r600/evergreen_compute_internal.h | 1 + src/gallium/drivers/r600/evergreen_state.c | 6 +-- src/gallium/drivers/r600/r600_asm.h | 1 + src/gallium/drivers/r600/r600_llvm.c | 3 ++ 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index b16c9d9..226933b 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -211,7 +211,8 @@ void *evergreen_create_compute_state( #endif shader->ctx = (struct r600_context*)ctx; - shader->local_size = cso->req_local_mem; ///TODO: assert it + /* XXX: We ignore cso->req_local_mem, because we compute this value + * ourselves on a per-kernel basis. */ shader->private_size = cso->req_private_mem; shader->input_size = cso->req_input_mem; @@ -327,13 +328,13 @@ static void evergreen_emit_direct_dispatch( { int i; struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; unsigned num_waves; unsigned num_pipes = rctx->screen->info.r600_max_pipes; unsigned wave_divisor = (16 * num_pipes); int group_size = 1; int grid_size = 1; - /* XXX: Enable lds and get size from cs_shader_state */ - unsigned lds_size = 0; + unsigned lds_size = shader->active_kernel->bc.nlds_dw; /* Calculate group_size/grid_size */ for (i = 0; i < 3; i++) { @@ -348,16 +349,10 @@ static void evergreen_emit_direct_dispatch( num_waves = (block_layout[0] * block_layout[1] * block_layout[2] + wave_divisor - 1) / wave_divisor; - COMPUTE_DBG(rctx->screen, "Using %u pipes, there are %u wavefronts per thread block\n", - num_pipes, num_waves); - - /* XXX: Partition the LDS between PS/CS. By default half (4096 dwords - * on Evergreen) oes to Pixel Shaders and half goes to Compute Shaders. - * We may need to allocat the entire LDS space for Compute Shaders. - * - * EG: R_008E2C_SQ_LDS_RESOURCE_MGMT := S_008E2C_NUM_LS_LDS(lds_dwords) - * CM: CM_R_0286FC_SPI_LDS_MGMT := S_0286FC_NUM_LS_LDS(lds_dwords) - */ + COMPUTE_DBG(rctx->screen, "Using %u pipes, " + "%u wavefronts per thread block, " + "allocating %u dwords lds.\n", + num_pipes, num_waves, lds_size); r600_write_config_reg(cs, R_008970_VGT_NUM_INDICES, group_size); @@ -374,6 +369,14 @@ static void evergreen_emit_direct_dispatch( r600_write_value(cs, block_layout[1]); /* R_0286F0_SPI_COMPUTE_NUM_THREAD_Y */ r600_write_value(cs, block_layout[2]); /* R_0286F4_SPI_COMPUTE_NUM_THREAD_Z */ + if (rctx->chip_class < CAYMAN) { + assert(lds_size <= 8192); + } else { + /* Cayman appears to have a slightly smaller limit, see the + * value of CM_R_0286FC_SPI_LDS_MGMT.NUM_LS_LDS */ + assert(lds_size <= 8160); + } + r600_write_compute_context_reg(cs, CM_R_0288E8_SQ_LDS_ALLOC, lds_size | (num_waves << 14)); @@ -517,12 +520,14 @@ static void evergreen_launch_grid( struct r600_context *ctx = (struct r600_context *)ctx_; #ifdef HAVE_OPENCL - COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc); struct r600_pipe_compute *shader = ctx->cs_shader_state.shader; - if (!shader->kernels[pc].code_bo) { + struct r600_kernel *kernel = &shader->kernels[pc]; + + COMPUTE_DBG(ctx->screen, "*** evergreen_launch_grid: pc = %u\n", pc); + + if (!kernel->code_bo) { void *p; - struct r600_kernel *kernel = &shader->kernels[pc]; struct r600_bytecode *bc = &kernel->bc; LLVMModuleRef mod = kernel->llvm_module; boolean use_kill = false; @@ -551,7 +556,7 @@ static void evergreen_launch_grid( ctx->ws->buffer_unmap(kernel->code_bo->cs_buf); } #endif - + shader->active_kernel = kernel; ctx->cs_shader_state.kernel_index = pc; evergreen_compute_upload_input(ctx_, block_layout, grid_layout, input); compute_emit_cs(ctx, block_layout, grid_layout); @@ -792,6 +797,20 @@ void evergreen_init_atom_start_compute_cs(struct r600_context *ctx) r600_store_value(cb, S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); } + /* Give the compute shader all the available LDS space. + * NOTE: This only sets the maximum number of dwords that a compute + * shader can allocate. When a shader is executed, we still need to + * allocate the appropriate amount of LDS dwords using the + * CM_R_0288E8_SQ_LDS_ALLOC register. + */ + if (ctx->chip_class < CAYMAN) { + r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT, + S_008E2C_NUM_PS_LDS(0x0000) | S_008E2C_NUM_LS_LDS(8192)); + } else { + r600_store_context_reg(cb, CM_R_0286FC_SPI_LDS_MGMT, + S_0286FC_NUM_PS_LDS(0) | + S_0286FC_NUM_LS_LDS(255)); /* 255 * 32 = 8160 dwords */ + } /* Context Registers */ diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h index f904d61..c524da2 100644 --- a/src/gallium/drivers/r600/evergreen_compute_internal.h +++ b/src/gallium/drivers/r600/evergreen_compute_internal.h @@ -42,6 +42,7 @@ struct r600_pipe_compute { unsigned num_kernels; struct r600_kernel *kernels; + struct r600_kernel *active_kernel; unsigned local_size; unsigned private_size; unsigned input_size; diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 3ebb157..72a2fe2 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2974,9 +2974,6 @@ void evergreen_init_common_regs(struct r600_command_buffer *cb, r600_store_value(cb, tmp); /* R_008C0C_SQ_GPR_RESOURCE_MGMT_3 */ } - r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT, - S_008E2C_NUM_PS_LDS(0x1000) | S_008E2C_NUM_LS_LDS(0x1000)); - r600_store_context_reg(cb, R_028A4C_PA_SC_MODE_CNTL_1, 0); /* The cs checker requires this register to be set. */ @@ -3195,6 +3192,9 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) tmp |= S_008C28_NUM_LS_STACK_ENTRIES(num_ls_stack_entries); r600_store_value(cb, tmp); /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 */ + r600_store_config_reg(cb, R_008E2C_SQ_LDS_RESOURCE_MGMT, + S_008E2C_NUM_PS_LDS(0x1000) | S_008E2C_NUM_LS_LDS(0x1000)); + r600_store_config_reg(cb, R_009100_SPI_CONFIG_CNTL, 0); r600_store_config_reg(cb, R_00913C_SPI_CONFIG_CNTL_1, S_00913C_VTX_DONE_DELAY(4)); diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 6ab5dac..82c6c8d 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -203,6 +203,7 @@ struct r600_bytecode { unsigned ncf; unsigned ngpr; unsigned nstack; + unsigned nlds_dw; unsigned nresource; unsigned force_add_cf; uint32_t *bytecode; diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c index c1809b3..03a68e4 100644 --- a/src/gallium/drivers/r600/r600_llvm.c +++ b/src/gallium/drivers/r600/r600_llvm.c @@ -640,6 +640,9 @@ unsigned r600_llvm_compile( case R_02880C_DB_SHADER_CONTROL: *use_kill = G_02880C_KILL_ENABLE(value); break; + case CM_R_0288E8_SQ_LDS_ALLOC: + bc->nlds_dw = value; + break; } } -- 2.7.4