iris: Stop using cs_prog_data->threads
authorCaio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Thu, 9 Apr 2020 23:54:25 +0000 (16:54 -0700)
committerCaio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Fri, 10 Apr 2020 02:23:12 +0000 (19:23 -0700)
This is a preparation for dropping this field since this value is
expected to be calculated by the drivers now for variable group size
case.  And also the field would get in the way of brw_compile_cs
producing multiple SIMD variants (like FS).

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4504>

src/gallium/drivers/iris/iris_context.h
src/gallium/drivers/iris/iris_program.c
src/gallium/drivers/iris/iris_state.c

index e6a7af1..d1952ed 100644 (file)
@@ -847,6 +847,7 @@ void iris_init_perfquery_functions(struct pipe_context *ctx);
 void iris_update_compiled_shaders(struct iris_context *ice);
 void iris_update_compiled_compute_shader(struct iris_context *ice);
 void iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+                                    unsigned threads,
                                     uint32_t *dst);
 
 
index 3e8abcd..c876540 100644 (file)
@@ -2020,13 +2020,14 @@ iris_update_compiled_compute_shader(struct iris_context *ice)
 
 void
 iris_fill_cs_push_const_buffer(struct brw_cs_prog_data *cs_prog_data,
+                               unsigned threads,
                                uint32_t *dst)
 {
-   assert(brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads) > 0);
+   assert(brw_cs_push_const_total_size(cs_prog_data, threads) > 0);
    assert(cs_prog_data->push.cross_thread.size == 0);
    assert(cs_prog_data->push.per_thread.dwords == 1);
    assert(cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
-   for (unsigned t = 0; t < cs_prog_data->threads; t++)
+   for (unsigned t = 0; t < threads; t++)
       dst[8 * t] = t;
 }
 
index 5ea1cd8..f572c02 100644 (file)
@@ -4365,7 +4365,6 @@ iris_store_cs_state(struct iris_context *ice,
    iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
       desc.KernelStartPointer = KSP(shader);
       desc.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
-      desc.NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads;
       desc.SharedLocalMemorySize =
          encode_slm_size(GEN_GEN, prog_data->total_shared);
       desc.BarrierEnable = cs_prog_data->uses_barrier;
@@ -6471,6 +6470,9 @@ iris_upload_compute_state(struct iris_context *ice,
    struct brw_stage_prog_data *prog_data = shader->prog_data;
    struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
 
+   const uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2];
+   const unsigned threads = DIV_ROUND_UP(group_size, cs_prog_data->simd_size);
+
    /* Always pin the binder.  If we're emitting new binding table pointers,
     * we need it.  If not, we're probably inheriting old tables via the
     * context, and need it anyway.  Since true zero-bindings cases are
@@ -6532,7 +6534,7 @@ iris_upload_compute_state(struct iris_context *ice,
          vfe.URBEntryAllocationSize = 2;
 
          vfe.CURBEAllocationSize =
-            ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
+            ALIGN(cs_prog_data->push.per_thread.regs * threads +
                   cs_prog_data->push.cross_thread.regs, 2);
       }
    }
@@ -6544,7 +6546,7 @@ iris_upload_compute_state(struct iris_context *ice,
              cs_prog_data->push.per_thread.dwords == 1 &&
              cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
       const unsigned push_const_size =
-         brw_cs_push_const_total_size(cs_prog_data, cs_prog_data->threads);
+         brw_cs_push_const_total_size(cs_prog_data, threads);
       uint32_t *curbe_data_map =
          stream_state(batch, ice->state.dynamic_uploader,
                       &ice->state.last_res.cs_thread_ids,
@@ -6552,7 +6554,7 @@ iris_upload_compute_state(struct iris_context *ice,
                       &curbe_data_offset);
       assert(curbe_data_map);
       memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
-      iris_fill_cs_push_const_buffer(cs_prog_data, curbe_data_map);
+      iris_fill_cs_push_const_buffer(cs_prog_data, threads, curbe_data_map);
 
       iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
          curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
@@ -6569,6 +6571,7 @@ iris_upload_compute_state(struct iris_context *ice,
       iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
          idd.SamplerStatePointer = shs->sampler_table.offset;
          idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
+         idd.NumberofThreadsinGPGPUThreadGroup = threads;
       }
 
       for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
@@ -6583,7 +6586,6 @@ iris_upload_compute_state(struct iris_context *ice,
       }
    }
 
-   uint32_t group_size = grid->block[0] * grid->block[1] * grid->block[2];
    uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);
    uint32_t right_mask;
 
@@ -6618,7 +6620,7 @@ iris_upload_compute_state(struct iris_context *ice,
       ggw.SIMDSize                   = cs_prog_data->simd_size / 16;
       ggw.ThreadDepthCounterMaximum  = 0;
       ggw.ThreadHeightCounterMaximum = 0;
-      ggw.ThreadWidthCounterMaximum  = cs_prog_data->threads - 1;
+      ggw.ThreadWidthCounterMaximum  = threads - 1;
       ggw.ThreadGroupIDXDimension    = grid->grid[0];
       ggw.ThreadGroupIDYDimension    = grid->grid[1];
       ggw.ThreadGroupIDZDimension    = grid->grid[2];