From ac993ae8287ea374a073c7a99fac84dfd773c475 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Fri, 18 Nov 2022 15:51:18 +0100
Subject: [PATCH] rusticl/kernel: make use of cso info

Signed-off-by: Karol Herbst
Reviewed-by: Alyssa Rosenzweig
Part-of:
---
 src/gallium/frontends/rusticl/api/kernel.rs        |   5 +-
 src/gallium/frontends/rusticl/core/device.rs       |   5 +
 src/gallium/frontends/rusticl/core/kernel.rs       | 107 ++++++++++++---------
 src/gallium/frontends/rusticl/mesa/pipe/context.rs |   9 ++
 4 files changed, 76 insertions(+), 50 deletions(-)

diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs
index e9d5ec2..2cb7ef6 100644
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
             CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
             CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
-                cl_prop::<usize>(dev.subgroups() as usize)
+                cl_prop::<usize>(kernel.preferred_simd_size(&dev))
             }
             CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
-            // TODO
-            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
+            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
             // CL_INVALID_VALUE if param_name is not one of the supported values
             _ => return Err(CL_INVALID_VALUE),
         })
diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs
index 77c51b0..f2450c8 100644
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
     fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
     fn delete_compute_state(&self, cso: *mut c_void);
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
 
     fn unmap(&self, tx: PipeTransfer);
 }
@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
         self.lock.delete_compute_state(cso)
     }
 
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        self.lock.compute_state_info(state)
+    }
+
     fn unmap(&self, tx: PipeTransfer) {
         tx.with_ctx(&self.lock);
     }
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index 2810f24..e1ce6f0 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -258,6 +258,7 @@ struct KernelDevStateInner {
     nir: NirShader,
     constant_buffer: Option<Arc<PipeResource>>,
     cso: *mut c_void,
+    info: pipe_compute_state_object_info,
 }
 
 struct KernelDevState {
@@ -279,21 +280,25 @@ impl KernelDevState {
         let states = nirs
             .into_iter()
             .map(|(dev, nir)| {
-                let cso = if dev.shareable_shaders() {
-                    dev.helper_ctx()
-                        .create_compute_state(&nir, nir.shared_size())
-                } else {
-                    ptr::null_mut()
-                };
-
+                let mut cso = dev
+                    .helper_ctx()
+                    .create_compute_state(&nir, nir.shared_size());
+                let info = dev.helper_ctx().compute_state_info(cso);
                 let cb = Self::create_nir_constant_buffer(&dev, &nir);
 
+                // if we can't share the cso between threads, destroy it now.
+                if !dev.shareable_shaders() {
+                    dev.helper_ctx().delete_compute_state(cso);
+                    cso = ptr::null_mut();
+                };
+
                 (
                     dev,
                     KernelDevStateInner {
                         nir: nir,
                         constant_buffer: cb,
                         cso: cso,
+                        info: info,
                     },
                 )
             })
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
     val.try_into().unwrap()
 }
 
-fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
-    let mut threads = d.max_threads_per_block() as u32;
-    let dim_threads = d.max_block_sizes();
-    let subgroups = d.subgroups();
-
-    if !block.contains(&0) {
-        for i in 0..3 {
-            // we already made sure everything is fine
-            grid[i] /= block[i];
-        }
-        return;
-    }
-
-    for i in 0..3 {
-        let t = cmp::min(threads, dim_threads[i] as u32);
-        let gcd = gcd(t, grid[i]);
-
-        block[i] = gcd;
-        grid[i] /= gcd;
-
-        // update limits
-        threads /= block[i];
-    }
-
-    // if we didn't fill the subgroup we can do a bit better if we have threads remaining
-    let total_threads = block[0] * block[1] * block[2];
-    if threads != 1 && total_threads < subgroups {
-        for i in 0..3 {
-            if grid[i] * total_threads < threads {
-                block[i] *= grid[i];
-                grid[i] = 1;
-                // can only do it once as nothing is cleanly divisible
-                break;
-            }
-        }
-    }
-}
-
 impl Kernel {
     pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
         let (mut nirs, args, internal_args, attributes_string) =
@@ -895,6 +862,44 @@ impl Kernel {
         })
     }
 
+    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+        let mut threads = self.max_threads_per_block(d) as u32;
+        let dim_threads = d.max_block_sizes();
+        let subgroups = self.preferred_simd_size(d) as u32;
+
+        if !block.contains(&0) {
+            for i in 0..3 {
+                // we already made sure everything is fine
+                grid[i] /= block[i];
+            }
+            return;
+        }
+
+        for i in 0..3 {
+            let t = cmp::min(threads, dim_threads[i] as u32);
+            let gcd = gcd(t, grid[i]);
+
+            block[i] = gcd;
+            grid[i] /= gcd;
+
+            // update limits
+            threads /= block[i];
+        }
+
+        // if we didn't fill the subgroup we can do a bit better if we have threads remaining
+        let total_threads = block[0] * block[1] * block[2];
+        if threads != 1 && total_threads < subgroups {
+            for i in 0..3 {
+                if grid[i] * total_threads < threads {
+                    block[i] *= grid[i];
+                    grid[i] = 1;
+                    // can only do it once as nothing is cleanly divisible
+                    break;
+                }
+            }
+        }
+    }
+
     // the painful part is that host threads are allowed to modify the kernel object once it was
     // enqueued, so return a closure with all required data included.
     pub fn launch(
@@ -928,7 +933,7 @@
             &[0; 4]
         };
 
-        optimize_local_size(&q.device, &mut grid, &mut block);
+        self.optimize_local_size(&q.device, &mut grid, &mut block);
 
         for (arg, val) in self.args.iter().zip(&self.values) {
             if arg.dead {
@@ -1225,7 +1230,15 @@ impl Kernel {
     }
 
     pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
-        self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+        self.dev_state.get(dev).info.private_memory.into()
+    }
+
+    pub fn max_threads_per_block(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.max_threads as usize
+    }
+
+    pub fn preferred_simd_size(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.preferred_simd_size as usize
     }
 
     pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
diff --git a/src/gallium/frontends/rusticl/mesa/pipe/context.rs b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
index a4d83fc..abd234b 100644
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@@ -319,6 +319,14 @@ impl PipeContext {
         unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
     }
 
+    pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        let mut info = pipe_compute_state_object_info::default();
+        unsafe {
+            self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
+        }
+        info
+    }
+
     pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
         unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
     }
@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
         & has_required_feature!(context, delete_compute_state)
         & has_required_feature!(context, delete_sampler_state)
         & has_required_feature!(context, flush)
+        & has_required_feature!(context, get_compute_state_info)
        & has_required_feature!(context, launch_grid)
        & has_required_feature!(context, memory_barrier)
        & has_required_feature!(context, resource_copy_region)
-- 
2.7.4
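P.S. for anyone reviewing the local-size heuristic without the rest of kernel.rs
handy: the logic is self-contained enough to run standalone. Below is a sketch of
the same algorithm with the CSO-derived, per-kernel limits passed in as plain
parameters instead of being read through self.dev_state; the 256-thread limit,
SIMD size 32, and the device dimension limits in main() are made-up example
values, not taken from any real driver.

// Standalone sketch of optimize_local_size(). The per-kernel limits that the
// patch now reads from pipe_compute_state_object_info (max_threads,
// preferred_simd_size) are function parameters here.
fn gcd(mut a: u32, mut b: u32) -> u32 {
    while b != 0 {
        let t = b;
        b = a % b;
        a = t;
    }
    a
}

fn optimize_local_size(
    mut threads: u32,      // kernel's max threads per block (info.max_threads)
    dim_threads: [u32; 3], // per-dimension device block-size limits
    subgroups: u32,        // preferred SIMD size (info.preferred_simd_size)
    grid: &mut [u32; 3],
    block: &mut [u32; 3],
) {
    // if the application chose a local size, just divide the grid by it
    if !block.contains(&0) {
        for i in 0..3 {
            grid[i] /= block[i];
        }
        return;
    }

    // per dimension, pick the largest block size that divides the grid evenly
    // while staying within the remaining thread budget
    for i in 0..3 {
        let t = threads.min(dim_threads[i]);
        let g = gcd(t, grid[i]);

        block[i] = g;
        grid[i] /= g;

        // update the remaining thread budget
        threads /= block[i];
    }

    // if the block doesn't fill a subgroup yet, fold a whole grid dimension in
    let total_threads = block[0] * block[1] * block[2];
    if threads != 1 && total_threads < subgroups {
        for i in 0..3 {
            if grid[i] * total_threads < threads {
                block[i] *= grid[i];
                grid[i] = 1;
                // can only do it once as nothing is cleanly divisible
                break;
            }
        }
    }
}

fn main() {
    // e.g. a 1000x1x1 global size on a kernel limited to 256 threads with
    // SIMD 32: gcd(256, 1000) = 8, so block = [8, 1, 1], grid = [125, 1, 1]
    let mut grid = [1000u32, 1, 1];
    let mut block = [0u32; 3];
    optimize_local_size(256, [1024, 1024, 64], 32, &mut grid, &mut block);
    println!("block = {block:?}, grid = {grid:?}");
}

The user-visible difference from the patch is exactly the first and third
parameters: before, the free function used the device-wide
d.max_threads_per_block() and d.subgroups(), so a register-heavy kernel whose
CSO reports a lower thread limit could get a block larger than it can actually
run; with the per-kernel info, both the launch heuristic and the
CL_KERNEL_WORK_GROUP_SIZE / CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
queries reflect what the compiled kernel supports.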