From 9d78abbef80ae79c9f81056d19eaee9a4e81aeb3 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 17 May 2018 23:17:17 -0700 Subject: [PATCH] intel/compiler: Add and use helpers for working with KSP indices The pixel shader dispatch table is kind-of a confusing mess. This adds some helpers for dealing with it and for easily extracting the correct data from wm_prog_data. Reviewed-by: Matt Turner --- src/intel/blorp/blorp_genX_exec.h | 65 ++++++++++++-------- src/intel/compiler/brw_compiler.h | 85 +++++++++++++++++++++++++++ src/intel/vulkan/genX_pipeline.c | 18 +++--- src/mesa/drivers/dri/i965/gen4_blorp_exec.h | 16 +++-- src/mesa/drivers/dri/i965/genX_state_upload.c | 54 +++++++++++------ 5 files changed, 183 insertions(+), 55 deletions(-) diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h index 4800c7d..d2eba27 100644 --- a/src/intel/blorp/blorp_genX_exec.h +++ b/src/intel/blorp/blorp_genX_exec.h @@ -763,17 +763,22 @@ blorp_emit_ps_config(struct blorp_batch *batch, } if (prog_data) { - ps.DispatchGRFStartRegisterForConstantSetupData0 = - prog_data->base.dispatch_grf_start_reg; - ps.DispatchGRFStartRegisterForConstantSetupData2 = - prog_data->dispatch_grf_start_reg_2; - ps._8PixelDispatchEnable = prog_data->dispatch_8; ps._16PixelDispatchEnable = prog_data->dispatch_16; - ps.KernelStartPointer0 = params->wm_prog_kernel; - ps.KernelStartPointer2 = - params->wm_prog_kernel + prog_data->prog_offset_2; + ps.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); + ps.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); + + ps.KernelStartPointer0 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, ps, 0); + ps.KernelStartPointer1 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, ps, 1); + ps.KernelStartPointer2 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, ps, 2); } /* 3DSTATE_PS expects the number of threads per PSD, which is always 64 @@ -867,17 +872,22 @@ blorp_emit_ps_config(struct blorp_batch *batch, #endif if (prog_data) { + ps._8PixelDispatchEnable = prog_data->dispatch_8; + ps._16PixelDispatchEnable = prog_data->dispatch_16; + ps.DispatchGRFStartRegisterForConstantSetupData0 = - prog_data->base.dispatch_grf_start_reg; + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); ps.DispatchGRFStartRegisterForConstantSetupData2 = - prog_data->dispatch_grf_start_reg_2; - - ps.KernelStartPointer0 = params->wm_prog_kernel; - ps.KernelStartPointer2 = - params->wm_prog_kernel + prog_data->prog_offset_2; + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); - ps._8PixelDispatchEnable = prog_data->dispatch_8; - ps._16PixelDispatchEnable = prog_data->dispatch_16; + ps.KernelStartPointer0 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, ps, 0); + ps.KernelStartPointer1 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, ps, 1); + ps.KernelStartPointer2 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, ps, 2); ps.AttributeEnable = prog_data->num_varying_inputs > 0; } else { @@ -929,18 +939,23 @@ blorp_emit_ps_config(struct blorp_batch *batch, if (prog_data) { wm.ThreadDispatchEnable = true; - wm.DispatchGRFStartRegisterForConstantSetupData0 = - prog_data->base.dispatch_grf_start_reg; - wm.DispatchGRFStartRegisterForConstantSetupData2 = - prog_data->dispatch_grf_start_reg_2; - - wm.KernelStartPointer0 = params->wm_prog_kernel; - wm.KernelStartPointer2 = - params->wm_prog_kernel + prog_data->prog_offset_2; - wm._8PixelDispatchEnable = prog_data->dispatch_8; wm._16PixelDispatchEnable = prog_data->dispatch_16; + wm.DispatchGRFStartRegisterForConstantSetupData0 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0); + wm.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1); + wm.DispatchGRFStartRegisterForConstantSetupData2 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2); + + wm.KernelStartPointer0 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 0); + wm.KernelStartPointer1 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 1); + wm.KernelStartPointer2 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 2); + wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs; } diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 8b4e6fe..a8ae243 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -745,6 +745,91 @@ struct brw_wm_prog_data { int urb_setup[VARYING_SLOT_MAX]; }; +/** Returns the SIMD width corresponding to a given KSP index + * + * The "Variable Pixel Dispatch" table in the PRM (which can be found, for + * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to + * kernel start pointer (KSP) indices that is based on what dispatch widths + * are enabled. This function provides, effectively, the reverse mapping. + * + * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD + * width of 8, 16, or 32 is returned. If the KSP is invalid, 0 is returned. + */ +static inline unsigned +brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled, + bool simd16_enabled, bool simd32_enabled) +{ + /* This function strictly ignores contiguous dispatch */ + switch (ksp_idx) { + case 0: + return simd8_enabled ? 8 : + (simd16_enabled && !simd32_enabled) ? 16 : + (simd32_enabled && !simd16_enabled) ? 32 : 0; + case 1: + return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0; + case 2: + return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0; + default: + unreachable("Invalid KSP index"); + } +} + +#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \ + brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \ + (wm_state)._16PixelDispatchEnable, \ + (wm_state)._32PixelDispatchEnable) + +#define brw_wm_state_has_ksp(wm_state, ksp_idx) \ + (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0) + +static inline uint32_t +_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data, + unsigned ksp_idx) +{ + switch (ksp_idx) { + case 0: return 0; + case 1: return 0; + case 2: return prog_data->prog_offset_2; + default: + unreachable("Invalid KSP index"); + } +} + +#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_prog_offset(prog_data, ksp_idx) + +static inline uint8_t +_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data, + unsigned ksp_idx) +{ + switch (ksp_idx) { + case 0: return prog_data->base.dispatch_grf_start_reg; + case 1: return 0; + case 2: return prog_data->dispatch_grf_start_reg_2; + default: + unreachable("Invalid KSP index"); + } +} + +#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ksp_idx) + +static inline uint8_t +_brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data, + unsigned ksp_idx) +{ + switch (ksp_idx) { + case 0: return prog_data->reg_blocks_0; + case 1: return 0; + case 2: return prog_data->reg_blocks_2; + default: + unreachable("Invalid KSP index"); + } +} + +#define brw_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \ + _brw_wm_prog_data_reg_blocks(prog_data, ksp_idx) + struct brw_push_const_block { unsigned dwords; /* Dword count, not reg aligned */ unsigned regs; diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c index 462c594..80165b8 100644 --- a/src/intel/vulkan/genX_pipeline.c +++ b/src/intel/vulkan/genX_pipeline.c @@ -1488,14 +1488,17 @@ emit_3dstate_ps(struct anv_pipeline *pipeline, #endif anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) { - ps.KernelStartPointer0 = fs_bin->kernel.offset; - ps.KernelStartPointer1 = 0; - ps.KernelStartPointer2 = fs_bin->kernel.offset + - wm_prog_data->prog_offset_2; ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; ps._32PixelDispatchEnable = false; + ps.KernelStartPointer0 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); + ps.KernelStartPointer1 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); + ps.KernelStartPointer2 = fs_bin->kernel.offset + + brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); + ps.SingleProgramFlow = false; ps.VectorMaskEnable = true; ps.SamplerCount = get_sampler_count(fs_bin); @@ -1526,10 +1529,11 @@ emit_3dstate_ps(struct anv_pipeline *pipeline, #endif ps.DispatchGRFStartRegisterForConstantSetupData0 = - wm_prog_data->base.dispatch_grf_start_reg; - ps.DispatchGRFStartRegisterForConstantSetupData1 = 0; + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1); ps.DispatchGRFStartRegisterForConstantSetupData2 = - wm_prog_data->dispatch_grf_start_reg_2; + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2); ps.PerThreadScratchSpace = get_scratch_space(fs_bin); ps.ScratchSpaceBasePointer = diff --git a/src/mesa/drivers/dri/i965/gen4_blorp_exec.h b/src/mesa/drivers/dri/i965/gen4_blorp_exec.h index e59bc9f..e3b90f1 100644 --- a/src/mesa/drivers/dri/i965/gen4_blorp_exec.h +++ b/src/mesa/drivers/dri/i965/gen4_blorp_exec.h @@ -136,13 +136,17 @@ blorp_emit_wm_state(struct blorp_batch *batch, #if GEN_GEN == 4 wm.KernelStartPointer0 = instruction_state_address(batch, params->wm_prog_kernel); - wm.GRFRegisterCount0 = prog_data->reg_blocks_0; + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0); #else - wm.KernelStartPointer0 = params->wm_prog_kernel; - wm.GRFRegisterCount0 = prog_data->reg_blocks_0; - wm.KernelStartPointer2 = - params->wm_prog_kernel + prog_data->prog_offset_2; - wm.GRFRegisterCount2 = prog_data->reg_blocks_2; + wm.KernelStartPointer0 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 0); + wm.KernelStartPointer1 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 1); + wm.KernelStartPointer2 = params->wm_prog_kernel + + brw_wm_prog_data_prog_offset(prog_data, wm, 2); + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0); + wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(prog_data, wm, 1); + wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(prog_data, wm, 2); #endif } diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c index b9803d3..189245d 100644 --- a/src/mesa/drivers/dri/i965/genX_state_upload.c +++ b/src/mesa/drivers/dri/i965/genX_state_upload.c @@ -1899,43 +1899,57 @@ genX(upload_wm)(struct brw_context *brw) #if GEN_GEN == 4 /* On gen4, we only have one shader kernel */ - if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) { + if (brw_wm_state_has_ksp(wm, 0)) { + assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0); wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset); - wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0; + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); wm.DispatchGRFStartRegisterForConstantSetupData0 = - wm_prog_data->base.dispatch_grf_start_reg; + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); } #elif GEN_GEN == 5 /* On gen5, we have multiple shader kernels but only one GRF start * register for all kernels */ - wm.KernelStartPointer0 = stage_state->prog_offset; + wm.KernelStartPointer0 = stage_state->prog_offset + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); + wm.KernelStartPointer1 = stage_state->prog_offset + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); wm.KernelStartPointer2 = stage_state->prog_offset + - wm_prog_data->prog_offset_2; + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); - wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0; - wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2; + wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); + wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1); + wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2); wm.DispatchGRFStartRegisterForConstantSetupData0 = wm_prog_data->base.dispatch_grf_start_reg; /* Dispatch GRF Start should be the same for all shaders on gen5 */ - if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) { + if (brw_wm_state_has_ksp(wm, 1)) { assert(wm_prog_data->base.dispatch_grf_start_reg == - wm_prog_data->dispatch_grf_start_reg_2); + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1)); + } + if (brw_wm_state_has_ksp(wm, 2)) { + assert(wm_prog_data->base.dispatch_grf_start_reg == + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2)); } #elif GEN_GEN == 6 /* On gen5, we have multiple shader kernels and we no longer specify a * register count for each one. */ - wm.KernelStartPointer0 = stage_state->prog_offset; + wm.KernelStartPointer0 = stage_state->prog_offset + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); + wm.KernelStartPointer1 = stage_state->prog_offset + + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); wm.KernelStartPointer2 = stage_state->prog_offset + - wm_prog_data->prog_offset_2; + brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); wm.DispatchGRFStartRegisterForConstantSetupData0 = - wm_prog_data->base.dispatch_grf_start_reg; + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); + wm.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1); wm.DispatchGRFStartRegisterForConstantSetupData2 = - wm_prog_data->dispatch_grf_start_reg_2; + brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2); #endif #if GEN_GEN <= 5 @@ -4015,14 +4029,20 @@ genX(upload_ps)(struct brw_context *brw) ps._8PixelDispatchEnable = prog_data->dispatch_8; ps._16PixelDispatchEnable = prog_data->dispatch_16; + ps.DispatchGRFStartRegisterForConstantSetupData0 = - prog_data->base.dispatch_grf_start_reg; + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); + ps.DispatchGRFStartRegisterForConstantSetupData1 = + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); ps.DispatchGRFStartRegisterForConstantSetupData2 = - prog_data->dispatch_grf_start_reg_2; + brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); - ps.KernelStartPointer0 = stage_state->prog_offset; + ps.KernelStartPointer0 = stage_state->prog_offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 0); + ps.KernelStartPointer1 = stage_state->prog_offset + + brw_wm_prog_data_prog_offset(prog_data, ps, 1); ps.KernelStartPointer2 = stage_state->prog_offset + - prog_data->prog_offset_2; + brw_wm_prog_data_prog_offset(prog_data, ps, 2); if (prog_data->base.total_scratch) { ps.ScratchSpaceBasePointer = -- 2.7.4