From ac993ae8287ea374a073c7a99fac84dfd773c475 Mon Sep 17 00:00:00 2001
From: Karol Herbst
Date: Fri, 18 Nov 2022 15:51:18 +0100
Subject: [PATCH] rusticl/kernel: make use of cso info

Signed-off-by: Karol Herbst
Reviewed-by: Alyssa Rosenzweig
Part-of:
---
 src/gallium/frontends/rusticl/api/kernel.rs        |   5 +-
 src/gallium/frontends/rusticl/core/device.rs       |   5 +
 src/gallium/frontends/rusticl/core/kernel.rs       | 107 ++++++++++++---------
 src/gallium/frontends/rusticl/mesa/pipe/context.rs |   9 ++
 4 files changed, 76 insertions(+), 50 deletions(-)

diff --git a/src/gallium/frontends/rusticl/api/kernel.rs b/src/gallium/frontends/rusticl/api/kernel.rs
index e9d5ec2..2cb7ef6 100644
--- a/src/gallium/frontends/rusticl/api/kernel.rs
+++ b/src/gallium/frontends/rusticl/api/kernel.rs
@@ -89,11 +89,10 @@ impl CLInfoObj<cl_kernel_work_group_info, cl_device_id> for cl_kernel {
             CL_KERNEL_COMPILE_WORK_GROUP_SIZE => cl_prop::<[usize; 3]>(kernel.work_group_size),
             CL_KERNEL_LOCAL_MEM_SIZE => cl_prop::<cl_ulong>(kernel.local_mem_size(&dev)),
             CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE => {
-                cl_prop::<usize>(dev.subgroups() as usize)
+                cl_prop::<usize>(kernel.preferred_simd_size(&dev))
             }
             CL_KERNEL_PRIVATE_MEM_SIZE => cl_prop::<cl_ulong>(kernel.priv_mem_size(&dev)),
-            // TODO
-            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(dev.subgroups() as usize),
+            CL_KERNEL_WORK_GROUP_SIZE => cl_prop::<usize>(kernel.max_threads_per_block(&dev)),
             // CL_INVALID_VALUE if param_name is not one of the supported values
             _ => return Err(CL_INVALID_VALUE),
         })
diff --git a/src/gallium/frontends/rusticl/core/device.rs b/src/gallium/frontends/rusticl/core/device.rs
index 77c51b0..f2450c8 100644
--- a/src/gallium/frontends/rusticl/core/device.rs
+++ b/src/gallium/frontends/rusticl/core/device.rs
@@ -75,6 +75,7 @@ pub trait HelperContextWrapper {
     fn create_compute_state(&self, nir: &NirShader, static_local_mem: u32) -> *mut c_void;
     fn delete_compute_state(&self, cso: *mut c_void);
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info;
 
     fn unmap(&self, tx: PipeTransfer);
 }
@@ -159,6 +160,10 @@ impl<'a> HelperContextWrapper for HelperContext<'a> {
         self.lock.delete_compute_state(cso)
     }
 
+    fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        self.lock.compute_state_info(state)
+    }
+
     fn unmap(&self, tx: PipeTransfer) {
         tx.with_ctx(&self.lock);
     }
diff --git a/src/gallium/frontends/rusticl/core/kernel.rs b/src/gallium/frontends/rusticl/core/kernel.rs
index 2810f24..e1ce6f0 100644
--- a/src/gallium/frontends/rusticl/core/kernel.rs
+++ b/src/gallium/frontends/rusticl/core/kernel.rs
@@ -258,6 +258,7 @@ struct KernelDevStateInner {
     nir: NirShader,
     constant_buffer: Option<Arc<PipeResource>>,
     cso: *mut c_void,
+    info: pipe_compute_state_object_info,
 }
 
 struct KernelDevState {
@@ -279,21 +280,25 @@ impl KernelDevState {
         let states = nirs
             .into_iter()
             .map(|(dev, nir)| {
-                let cso = if dev.shareable_shaders() {
-                    dev.helper_ctx()
-                        .create_compute_state(&nir, nir.shared_size())
-                } else {
-                    ptr::null_mut()
-                };
-
+                let mut cso = dev
+                    .helper_ctx()
+                    .create_compute_state(&nir, nir.shared_size());
+                let info = dev.helper_ctx().compute_state_info(cso);
                 let cb = Self::create_nir_constant_buffer(&dev, &nir);
 
+                // if we can't share the cso between threads, destroy it now.
+                if !dev.shareable_shaders() {
+                    dev.helper_ctx().delete_compute_state(cso);
+                    cso = ptr::null_mut();
+                };
+
                 (
                     dev,
                     KernelDevStateInner {
                         nir: nir,
                         constant_buffer: cb,
                         cso: cso,
+                        info: info,
                     },
                 )
             })
@@ -829,44 +834,6 @@ fn extract<'a, const S: usize>(buf: &'a mut &[u8]) -> &'a [u8; S] {
     val.try_into().unwrap()
 }
 
-fn optimize_local_size(d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
-    let mut threads = d.max_threads_per_block() as u32;
-    let dim_threads = d.max_block_sizes();
-    let subgroups = d.subgroups();
-
-    if !block.contains(&0) {
-        for i in 0..3 {
-            // we already made sure everything is fine
-            grid[i] /= block[i];
-        }
-        return;
-    }
-
-    for i in 0..3 {
-        let t = cmp::min(threads, dim_threads[i] as u32);
-        let gcd = gcd(t, grid[i]);
-
-        block[i] = gcd;
-        grid[i] /= gcd;
-
-        // update limits
-        threads /= block[i];
-    }
-
-    // if we didn't fill the subgroup we can do a bit better if we have threads remaining
-    let total_threads = block[0] * block[1] * block[2];
-    if threads != 1 && total_threads < subgroups {
-        for i in 0..3 {
-            if grid[i] * total_threads < threads {
-                block[i] *= grid[i];
-                grid[i] = 1;
-                // can only do it once as nothing is cleanly divisible
-                break;
-            }
-        }
-    }
-}
-
 impl Kernel {
     pub fn new(name: String, prog: Arc<Program>, args: Vec<spirv::SPIRVKernelArg>) -> Arc<Kernel> {
         let (mut nirs, args, internal_args, attributes_string) =
@@ -895,6 +862,44 @@ impl Kernel {
         })
     }
 
+    fn optimize_local_size(&self, d: &Device, grid: &mut [u32; 3], block: &mut [u32; 3]) {
+        let mut threads = self.max_threads_per_block(d) as u32;
+        let dim_threads = d.max_block_sizes();
+        let subgroups = self.preferred_simd_size(d) as u32;
+
+        if !block.contains(&0) {
+            for i in 0..3 {
+                // we already made sure everything is fine
+                grid[i] /= block[i];
+            }
+            return;
+        }
+
+        for i in 0..3 {
+            let t = cmp::min(threads, dim_threads[i] as u32);
+            let gcd = gcd(t, grid[i]);
+
+            block[i] = gcd;
+            grid[i] /= gcd;
+
+            // update limits
+            threads /= block[i];
+        }
+
+        // if we didn't fill the subgroup we can do a bit better if we have threads remaining
+        let total_threads = block[0] * block[1] * block[2];
+        if threads != 1 && total_threads < subgroups {
+            for i in 0..3 {
+                if grid[i] * total_threads < threads {
+                    block[i] *= grid[i];
+                    grid[i] = 1;
+                    // can only do it once as nothing is cleanly divisible
+                    break;
+                }
+            }
+        }
+    }
+
     // the painful part is that host threads are allowed to modify the kernel object once it was
     // enqueued, so return a closure with all required data included.
     pub fn launch(
@@ -928,7 +933,7 @@
             &[0; 4]
         };
 
-        optimize_local_size(&q.device, &mut grid, &mut block);
+        self.optimize_local_size(&q.device, &mut grid, &mut block);
 
         for (arg, val) in self.args.iter().zip(&self.values) {
             if arg.dead {
@@ -1225,7 +1230,15 @@ impl Kernel {
     }
 
     pub fn priv_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
-        self.dev_state.get(dev).nir.scratch_size() as cl_ulong
+        self.dev_state.get(dev).info.private_memory.into()
+    }
+
+    pub fn max_threads_per_block(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.max_threads as usize
+    }
+
+    pub fn preferred_simd_size(&self, dev: &Device) -> usize {
+        self.dev_state.get(dev).info.preferred_simd_size as usize
     }
 
     pub fn local_mem_size(&self, dev: &Arc<Device>) -> cl_ulong {
diff --git a/src/gallium/frontends/rusticl/mesa/pipe/context.rs b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
index a4d83fc..abd234b 100644
--- a/src/gallium/frontends/rusticl/mesa/pipe/context.rs
+++ b/src/gallium/frontends/rusticl/mesa/pipe/context.rs
@@ -319,6 +319,14 @@ impl PipeContext {
         unsafe { self.pipe.as_ref().delete_compute_state.unwrap()(self.pipe.as_ptr(), state) }
     }
 
+    pub fn compute_state_info(&self, state: *mut c_void) -> pipe_compute_state_object_info {
+        let mut info = pipe_compute_state_object_info::default();
+        unsafe {
+            self.pipe.as_ref().get_compute_state_info.unwrap()(self.pipe.as_ptr(), state, &mut info)
+        }
+        info
+    }
+
     pub fn create_sampler_state(&self, state: &pipe_sampler_state) -> *mut c_void {
         unsafe { self.pipe.as_ref().create_sampler_state.unwrap()(self.pipe.as_ptr(), state) }
     }
@@ -530,6 +538,7 @@ fn has_required_cbs(context: &pipe_context) -> bool {
         & has_required_feature!(context, delete_compute_state)
         & has_required_feature!(context, delete_sampler_state)
         & has_required_feature!(context, flush)
+        & has_required_feature!(context, get_compute_state_info)
        & has_required_feature!(context, launch_grid)
        & has_required_feature!(context, memory_barrier)
        & has_required_feature!(context, resource_copy_region)
-- 
2.7.4
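P.S. for anyone reviewing the local-size heuristic without the rest of kernel.rs
handy: the logic is self-contained enough to run standalone. Below is a sketch of
the same algorithm with the CSO-derived, per-kernel limits passed in as plain
parameters instead of being read through self.dev_state; the 256-thread limit,
SIMD size 32, and the device dimension limits in main() are made-up example
values, not taken from any real driver.

// Standalone sketch of optimize_local_size(). The per-kernel limits that the
// patch now reads from pipe_compute_state_object_info (max_threads,
// preferred_simd_size) are function parameters here.
fn gcd(mut a: u32, mut b: u32) -> u32 {
    while b != 0 {
        let t = b;
        b = a % b;
        a = t;
    }
    a
}

fn optimize_local_size(
    mut threads: u32,      // kernel's max threads per block (info.max_threads)
    dim_threads: [u32; 3], // per-dimension device block-size limits
    subgroups: u32,        // preferred SIMD size (info.preferred_simd_size)
    grid: &mut [u32; 3],
    block: &mut [u32; 3],
) {
    // if the application chose a local size, just divide the grid by it
    if !block.contains(&0) {
        for i in 0..3 {
            grid[i] /= block[i];
        }
        return;
    }

    // per dimension, pick the largest block size that divides the grid evenly
    // while staying within the remaining thread budget
    for i in 0..3 {
        let t = threads.min(dim_threads[i]);
        let g = gcd(t, grid[i]);

        block[i] = g;
        grid[i] /= g;

        // update the remaining thread budget
        threads /= block[i];
    }

    // if the block doesn't fill a subgroup yet, fold a whole grid dimension in
    let total_threads = block[0] * block[1] * block[2];
    if threads != 1 && total_threads < subgroups {
        for i in 0..3 {
            if grid[i] * total_threads < threads {
                block[i] *= grid[i];
                grid[i] = 1;
                // can only do it once as nothing is cleanly divisible
                break;
            }
        }
    }
}

fn main() {
    // e.g. a 1000x1x1 global size on a kernel limited to 256 threads with
    // SIMD 32: gcd(256, 1000) = 8, so block = [8, 1, 1], grid = [125, 1, 1]
    let mut grid = [1000u32, 1, 1];
    let mut block = [0u32; 3];
    optimize_local_size(256, [1024, 1024, 64], 32, &mut grid, &mut block);
    println!("block = {block:?}, grid = {grid:?}");
}

The user-visible difference from the patch is exactly the first and third
parameters: before, the free function used the device-wide
d.max_threads_per_block() and d.subgroups(), so a register-heavy kernel whose
CSO reports a lower thread limit could get a block larger than it can actually
run; with the per-kernel info, both the launch heuristic and the
CL_KERNEL_WORK_GROUP_SIZE / CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
queries reflect what the compiled kernel supports.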