anv_device_release_bo(device, pool->bos[i][s]);
}
}
+
+ for (unsigned i = 0; i < 16; i++) {
+ if (pool->surf_states[i].map != NULL) {
+ anv_state_pool_free(&device->surface_state_pool,
+ pool->surf_states[i]);
+ }
+ }
}
struct anv_bo *
assert(scratch_size_log2 < 16);
assert(stage < ARRAY_SIZE(pool->bos));
+
+ const struct intel_device_info *devinfo = &device->info;
+
+ /* On GFX version 12.5, scratch access changed to a surface-based model.
+ * Instead of each shader type having its own layout based on IDs passed
+ * from the relevant fixed-function unit, all scratch access is based on
+ * thread IDs like it always has been for compute.
+ */
+ if (devinfo->verx10 >= 125)
+ stage = MESA_SHADER_COMPUTE;
+
struct anv_bo *bo = p_atomic_read(&pool->bos[scratch_size_log2][stage]);
if (bo != NULL)
return bo;
- const struct intel_device_info *devinfo = &device->info;
-
unsigned subslices = MAX2(device->physical->subslice_total, 1);
/* The documentation for 3DSTATE_PS "Scratch Space Base Pointer" says:
* For, Gfx11+, scratch space allocation is based on the number of threads
* in the base configuration.
*/
- if (devinfo->ver == 12)
+ if (devinfo->verx10 == 125)
+ subslices = 32;
+ else if (devinfo->ver == 12)
subslices = (devinfo->is_dg1 || devinfo->gt == 2 ? 6 : 2);
else if (devinfo->ver == 11)
subslices = 8;
}
}
+uint32_t
+anv_scratch_pool_get_surf(struct anv_device *device,
+ struct anv_scratch_pool *pool,
+ unsigned per_thread_scratch)
+{ /* Look up, or create on first use, the scratch surface state for the
+ * given per-thread scratch size and return its offset (state.offset) as
+ * allocated from device->surface_state_pool.  Returns 0 when
+ * per_thread_scratch is 0.  One entry is cached per size bucket in
+ * pool->surfs[], so repeated calls with the same size return the same
+ * value.
+ */
+ if (per_thread_scratch == 0)
+ return 0;
+
+ unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); /* bucket index into pool->surfs[] and pool->surf_states[] */
+ assert(scratch_size_log2 < 16);
+
+ uint32_t surf = p_atomic_read(&pool->surfs[scratch_size_log2]);
+ if (surf > 0) /* fast path: a non-zero entry has already been stored for this bucket */
+ return surf;
+
+ struct anv_bo *bo =
+ anv_scratch_pool_alloc(device, pool, MESA_SHADER_COMPUTE,
+ per_thread_scratch); /* NOTE(review): bo is dereferenced below (bo->size) without a
+ * NULL check -- confirm anv_scratch_pool_alloc cannot fail here. */
+ struct anv_address addr = { .bo = bo };
+
+ struct anv_state state =
+ anv_state_pool_alloc(&device->surface_state_pool,
+ device->isl_dev.ss.size, 64); /* third argument 64 is presumably the alignment -- confirm */
+
+ isl_buffer_fill_state(&device->isl_dev, state.map,
+ .address = anv_address_physical(addr),
+ .size_B = bo->size,
+ .mocs = anv_mocs(device, bo, 0),
+ .format = ISL_FORMAT_RAW,
+ .swizzle = ISL_SWIZZLE_IDENTITY,
+ .stride_B = per_thread_scratch,
+ .is_scratch = true); /* describes the whole BO as a raw buffer with one per_thread_scratch-sized stride per element */
+
+ uint32_t current = p_atomic_cmpxchg(&pool->surfs[scratch_size_log2],
+ 0, state.offset); /* store our offset only if the slot still holds 0 */
+ if (current) { /* slot was already filled (presumably by a concurrent caller): drop our state, use the existing offset */
+ anv_state_pool_free(&device->surface_state_pool, state);
+ return current;
+ } else { /* our offset was stored: keep the state so the pool can free it later */
+ pool->surf_states[scratch_size_log2] = state;
+ return state.offset;
+ }
+}
+
VkResult
anv_bo_cache_init(struct anv_bo_cache *cache)
{
struct anv_scratch_pool {
/* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */
struct anv_bo *bos[16][MESA_SHADER_STAGES];
+ uint32_t surfs[16]; /* Per scratch-size bucket: the surface state offset handed out by
+ * anv_scratch_pool_get_surf(); 0 means no surface has been created yet. */
+ struct anv_state surf_states[16]; /* State allocation recorded for each bucket whose surfs[] entry was
+ * populated; entries with a non-NULL map are freed with the pool. */
};
void anv_scratch_pool_init(struct anv_device *device,
struct anv_scratch_pool *pool,
gl_shader_stage stage,
unsigned per_thread_scratch);
+uint32_t anv_scratch_pool_get_surf(struct anv_device *device,
+ struct anv_scratch_pool *pool,
+ unsigned per_thread_scratch); /* Returns 0 when per_thread_scratch is 0; otherwise the offset of a
+ * scratch surface state, cached in the pool per scratch-size bucket. */
/** Implements a BO cache that ensures a 1-1 mapping of GEM BOs to anv_bos */
struct anv_bo_cache {
return MIN2(count_by_4, 4);
}
-static struct anv_address
+static UNUSED struct anv_address
get_scratch_address(struct anv_pipeline *pipeline,
gl_shader_stage stage,
const struct anv_shader_bin *bin)
};
}
-static uint32_t
+static UNUSED uint32_t /* Encodes the shader's total scratch size as
+ * ffs(total_scratch / 2048); the result is written to the
+ * PerThreadScratchSpace fields of the stage packets.
+ * NOTE(review): UNUSED presumably silences unused-function warnings for
+ * GFX_VERx10 >= 125 builds, where the visible callers are compiled out. */
get_scratch_space(const struct anv_shader_bin *bin)
{
return ffs(bin->prog_data->total_scratch / 2048);
}
+static UNUSED uint32_t
+get_scratch_surf(struct anv_pipeline *pipeline,
+ const struct anv_shader_bin *bin)
+{ /* Returns the value written to the ScratchSpaceBuffer packet fields: the
+ * scratch surface offset for this shader's total_scratch, taken from the
+ * device-wide scratch pool and shifted right by 4.  Yields 0 when the
+ * shader uses no scratch, because the pool lookup returns 0 in that case.
+ * NOTE(review): the >> 4 presumably converts the byte offset into the
+ * units the ScratchSpaceBuffer field expects -- confirm against the
+ * packet definition.
+ */
+ return anv_scratch_pool_get_surf(pipeline->device,
+ &pipeline->device->scratch_pool,
+ bin->prog_data->total_scratch) >> 4;
+}
+
static void
emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
{
vs_prog_data->base.cull_distance_mask;
#endif
+#if GFX_VERx10 >= 125
+ vs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, vs_bin);
+#else
vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
vs.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
+#endif
}
}
tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
#endif
-
+#if GFX_VERx10 >= 125
+ hs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tcs_bin);
+#else
hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
hs.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
+#endif
#if GFX_VER == 12
/* Patch Count threshold specifies the maximum number of patches that
tes_prog_data->base.cull_distance_mask;
#endif
+#if GFX_VERx10 >= 125
+ ds.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, tes_bin);
+#else
ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
ds.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
+#endif
}
}
gs_prog_data->base.cull_distance_mask;
#endif
+#if GFX_VERx10 >= 125
+ gs.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, gs_bin);
+#else
gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
gs.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
+#endif
}
}
ps.DispatchGRFStartRegisterForConstantSetupData2 =
brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
+#if GFX_VERx10 >= 125
+ ps.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, fs_bin);
+#else
ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
ps.ScratchSpaceBasePointer =
get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
+#endif
}
}
anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
cfe.MaximumNumberofThreads =
devinfo->max_cs_threads * subslices - 1;
- /* TODO: Enable gfx12-hp scratch support*/
- assert(get_scratch_space(cs_bin) == 0);
+ cfe.ScratchSpaceBuffer = get_scratch_surf(&pipeline->base, cs_bin);
}
}