vulkan/runtime: add compute astc decoder helper functions
authorYogesh Mohan Marimuthu <yogesh.mohanmarimuthu@amd.com>
Thu, 7 Sep 2023 11:33:03 +0000 (17:03 +0530)
committerMarge Bot <emma+marge@anholt.net>
Wed, 11 Oct 2023 19:28:40 +0000 (19:28 +0000)
The astc compute decode and lut creation code is copied
from https://github.com/Themaister/Granite/

Always set DECODE_8BIT idea is copied from
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19886

v2: use astc glsl shader code (Chia-I Wu)
v3: fix 32bit compilation error (Christopher Snowhill)
v4: use pitch to copy in vk_create_fill_image_visible_mem() function
    pass correct layer to decode_astc()
v5: use existing ASTCLutHolder (Chia-I Wu)
v6: use only staging buffer (Chia-I Wu)
    use texel buffer for partition table (Chia-I Wu)
v7: use 2DArray for input and output
v8: check for == mem_property (Chia-I Wu)
    do not use vk_common* functions (Chia-I Wu)
    squash single buffer patch (Chia-I Wu)
    fix for minTexelBufferOffsetAlignment (Chia-I Wu)
    avoid wasting 4 slots (Chia-I Wu)
    remove partition_tbl_mask (Chia-I Wu)
    remove wrong bindings count (Chia-I Wu)
    use binding names from glsl code (Chia-I Wu)
    use ARRAY_SIZE (Chia-I Wu)
    use VkFormat for getting partition table index (Chia-I Wu)
    fix mutex lock (Chia-I Wu)
    image layout should be based on function call (Chia-I Wu)
    VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE is wrong (Chia-I Wu)
    add vk_texcompress_astc tag to helpder functions (Chia-I Wu)
    remove write_desc_set_count (Chia-I Wu)
    use desc_i++ (Chia-I Wu)
    add assert for desc_i count at end (Chia-I Wu)
    remove unused vk_create_map_texel_buffer() function (Chia-I Wu)
    dynamically create the lut offset (Chia-I Wu)
    offset not to pass as push contant (Chia-I Wu)
v9: use correct stoage and sampled flags (Chia-I Wu)
    always pass single_buf_size (Chia-I Wu)
    query drivers for minTexelBufferOffsetAlignment (Chia-I Wu)
    remove blank lines (Chia-I Wu)
    remove unnecessary if check in destroy (Chia-I Wu)
    name label as unlock instead of fail and pass (Chia-I Wu)
    use prog_glslang.found() (Chia-I Wu)
    add offset,extent check to astc shader (Chia-I Wu)
v10: prog_glslang can be undefined in meson.build (Chia-I Wu)
v11: remove with_texcompress_astc and use required in find_program (Chia-I Wu)
v12: offset are aligned to blk size (Chia-I Wu)
v13: texel_blk_start should be under vulkan if check (Chia-I Wu)
     dst image layout is always VK_IMAGE_LAYOUT_GENERAL (Chia-I Wu)

Reviewed-by: Chia-I Wu <olvaffe@gmail.com>
Acked-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24672>

meson.build
src/compiler/glsl/astc_decoder.glsl
src/vulkan/runtime/meson.build
src/vulkan/runtime/vk_texcompress_astc.c [new file with mode: 0644]
src/vulkan/runtime/vk_texcompress_astc.h [new file with mode: 0644]

index 6f3cc93..722469f 100644 (file)
@@ -564,8 +564,8 @@ if vdpau_drivers_path == ''
   vdpau_drivers_path = join_paths(get_option('libdir'), 'vdpau')
 endif
 
-if with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk
-  prog_glslang = find_program('glslangValidator', native : true)
+prog_glslang = find_program('glslangValidator', native : true, required : with_vulkan_overlay_layer or with_aco_tests or with_amd_vk or with_intel_vk)
+if prog_glslang.found()
   if run_command(prog_glslang, [ '--quiet', '--version' ], check : false).returncode() == 0
     glslang_quiet = ['--quiet']
   else
index af1d913..2f46a1d 100644 (file)
@@ -32,22 +32,28 @@ precision highp uimage2D;
 #ifdef VULKAN
 
 precision highp utextureBuffer;
-precision highp utexture2D;
+precision highp utexture2DArray;
+precision highp uimage2DArray;
 
 #extension GL_EXT_samplerless_texture_functions : require
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z = 4) in;
 
-layout(set = 0, binding = 0) writeonly uniform uimage2D OutputImage;
-layout(set = 0, binding = 1) uniform utexture2D PayloadInput;
+layout(set = 0, binding = 0) writeonly uniform uimage2DArray OutputImage2Darray;
+layout(set = 0, binding = 1) uniform utexture2DArray PayloadInput2Darray;
 layout(set = 0, binding = 2) uniform utextureBuffer LUTRemainingBitsToEndpointQuantizer;
 layout(set = 0, binding = 3) uniform utextureBuffer LUTEndpointUnquantize;
 layout(set = 0, binding = 4) uniform utextureBuffer LUTWeightQuantizer;
 layout(set = 0, binding = 5) uniform utextureBuffer LUTWeightUnquantize;
 layout(set = 0, binding = 6) uniform utextureBuffer LUTTritQuintDecode;
-layout(set = 0, binding = 7) uniform utexture2D LUTPartitionTable;
+layout(set = 0, binding = 7) uniform utextureBuffer LUTPartitionTable;
 
 layout(constant_id = 2) const bool DECODE_8BIT = false;
 
+layout(push_constant, std430) uniform pc {
+   ivec2 texel_blk_start;
+   ivec2 texel_end;
+};
+
 #else /* VULKAN */
 
 layout(local_size_x = %u, local_size_y = %u, local_size_z = 4) in;
@@ -146,6 +152,9 @@ ivec4 build_coord()
     ivec2 payload_coord = ivec2(gl_WorkGroupID.xy) * 2;
     payload_coord.x += int(gl_LocalInvocationID.z) & 1;
     payload_coord.y += (int(gl_LocalInvocationID.z) >> 1) & 1;
+#ifdef VULKAN
+    payload_coord += texel_blk_start;
+#endif /* VULKAN */
     ivec2 coord = payload_coord * ivec2(gl_WorkGroupSize.xy);
     coord += ivec2(gl_LocalInvocationID.xy);
     return ivec4(coord, payload_coord);
@@ -1140,7 +1149,11 @@ void decode_endpoint(out ivec4 ep0, out ivec4 ep1, out int decode_mode,
 
 void emit_decode_error(ivec2 coord)
 {
+#ifdef VULKAN
+    imageStore(OutputImage2Darray, ivec3(coord, gl_WorkGroupID.z), error_color);
+#else /* VULKAN */
     imageStore(OutputImage, coord, error_color);
+#endif /* VULKAN */
 }
 
 int compute_num_endpoint_pairs(int num_partitions, int cem)
@@ -1233,12 +1246,22 @@ ivec4 void_extent_color(uvec4 payload, out int decode_mode)
 void main()
 {
     ivec4 coord = build_coord();
+#ifdef VULKAN
+    if (any(greaterThanEqual(coord.xy, texel_end.xy)))
+        return;
+#else /* VULKAN */
     if (any(greaterThanEqual(coord.xy, imageSize(OutputImage))))
         return;
+#endif /* VULKAN */
 
     ivec2 pixel_coord = ivec2(gl_LocalInvocationID.xy);
     int linear_pixel = int(gl_WorkGroupSize.x) * pixel_coord.y + pixel_coord.x;
-    uvec4 payload = texelFetch(PayloadInput, coord.zw, 0);
+    uvec4 payload;
+#ifdef VULKAN
+    payload = texelFetch(PayloadInput2Darray,ivec3(coord.zw, gl_WorkGroupID.z), 0);
+#else /* VULKAN */
+    payload = texelFetch(PayloadInput, coord.zw, 0);
+#endif /* VULKAN */
 
     BlockMode block_mode = decode_block_mode(payload);
     CHECK_DECODE_ERROR();
@@ -1260,7 +1283,12 @@ void main()
         {
             int lut_x = pixel_coord.x + int(gl_WorkGroupSize.x) * (block_mode.seed & 31);
             int lut_y = pixel_coord.y + int(gl_WorkGroupSize.y) * (block_mode.seed >> 5);
+#ifdef VULKAN
+            int lut_width = int(gl_WorkGroupSize.x) * 32;
+            partition_index = int(texelFetch(LUTPartitionTable, lut_y * lut_width + lut_x).x);
+#else /* VULKAN */
             partition_index = int(texelFetch(LUTPartitionTable, ivec2(lut_x, lut_y), 0).x);
+#endif /* VULKAN */
             partition_index = (partition_index >> (2 * block_mode.num_partitions - 4)) & 3;
         }
 
@@ -1315,7 +1343,11 @@ void main()
 
     if (DECODE_8BIT)
     {
+#ifdef VULKAN
+        imageStore(OutputImage2Darray, ivec3(coord.xy, gl_WorkGroupID.z), uvec4(final_color >> 8));
+#else /* VULKAN */
         imageStore(OutputImage, coord.xy, uvec4(final_color >> 8));
+#endif /* VULKAN */
     }
     else
     {
@@ -1324,6 +1356,10 @@ void main()
             encoded = uvec4(final_color);
         else
             encoded = decode_fp16(final_color, decode_mode);
+#ifdef VULKAN
+        imageStore(OutputImage2Darray, ivec3(coord.xy, gl_WorkGroupID.z), encoded);
+#else /* VULKAN */
         imageStore(OutputImage, coord.xy, encoded);
+#endif /* VULKAN */
     }
 }
index fb26b8a..889e0bb 100644 (file)
@@ -135,6 +135,16 @@ if with_platform_android
   vulkan_runtime_deps += dep_android
 endif
 
+if prog_glslang.found()
+  vulkan_runtime_files += files('vk_texcompress_astc.c', 'vk_texcompress_astc.h')
+  vulkan_runtime_files += custom_target(
+    'astc_spv.h',
+    input : astc_decoder_glsl_file,
+    output : 'astc_spv.h',
+    command : [prog_glslang, '-V', '-S', 'comp', '-x', '-o', '@OUTPUT@', '@INPUT@'] + glslang_quiet,
+    )
+endif
+
 vk_common_entrypoints = custom_target(
   'vk_common_entrypoints',
   input : [vk_entrypoints_gen, vk_api_xml],
diff --git a/src/vulkan/runtime/vk_texcompress_astc.c b/src/vulkan/runtime/vk_texcompress_astc.c
new file mode 100644 (file)
index 0000000..c195db8
--- /dev/null
@@ -0,0 +1,637 @@
+/* Copyright (c) 2017-2023 Hans-Kristian Arntzen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "vk_texcompress_astc.h"
+#include "util/texcompress_astc_luts_wrap.h"
+#include "vk_alloc.h"
+#include "vk_buffer.h"
+#include "vk_device.h"
+#include "vk_format.h"
+#include "vk_image.h"
+#include "vk_physical_device.h"
+
+/* type_indexes_mask bits are set/clear for support memory type index as per
+ * struct VkPhysicalDeviceMemoryProperties.memoryTypes[] */
+static uint32_t
+get_mem_type_index(struct vk_device *device, uint32_t type_indexes_mask,
+                   VkMemoryPropertyFlags mem_property)
+{
+   const struct vk_physical_device_dispatch_table *disp = &device->physical->dispatch_table;
+   VkPhysicalDevice _phy_device = vk_physical_device_to_handle(device->physical);
+
+   VkPhysicalDeviceMemoryProperties2 props2 = {
+      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2,
+      .pNext = NULL,
+   };
+   disp->GetPhysicalDeviceMemoryProperties2(_phy_device, &props2);
+
+   for (uint32_t i = 0; i < props2.memoryProperties.memoryTypeCount; i++) {
+      if ((type_indexes_mask & (1 << i)) &&
+          ((props2.memoryProperties.memoryTypes[i].propertyFlags & mem_property) == mem_property)) {
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+static VkResult
+vk_create_buffer(struct vk_device *device, VkAllocationCallbacks *allocator,
+                 VkDeviceSize size, VkMemoryPropertyFlags mem_prop_flags,
+                 VkBufferUsageFlags usage_flags, VkBuffer *vk_buf,
+                 VkDeviceMemory *vk_mem)
+{
+   VkResult result;
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+
+   VkBufferCreateInfo buffer_create_info = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+      .size = size,
+      .usage = usage_flags,
+      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+   };
+   result =
+      disp->CreateBuffer(_device, &buffer_create_info, allocator, vk_buf);
+   if (unlikely(result != VK_SUCCESS))
+      return result;
+
+   VkBufferMemoryRequirementsInfo2 mem_req_info = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
+      .buffer = *vk_buf,
+   };
+   VkMemoryRequirements2 mem_req = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
+   };
+   disp->GetBufferMemoryRequirements2(_device, &mem_req_info, &mem_req);
+
+   uint32_t mem_type_index = get_mem_type_index(
+      device, mem_req.memoryRequirements.memoryTypeBits, mem_prop_flags);
+   if (mem_type_index == -1)
+      return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+
+   VkMemoryAllocateInfo alloc_info = {
+      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
+      .allocationSize = mem_req.memoryRequirements.size,
+      .memoryTypeIndex = mem_type_index,
+   };
+   result = disp->AllocateMemory(_device, &alloc_info, allocator, vk_mem);
+   if (unlikely(result != VK_SUCCESS))
+      return result;
+
+   disp->BindBufferMemory(_device, *vk_buf, *vk_mem, 0);
+
+   return result;
+}
+
+static VkResult
+create_buffer_view(struct vk_device *device, VkAllocationCallbacks *allocator,
+                   VkBufferView *buf_view, VkBuffer buf, VkFormat format, VkDeviceSize size,
+                   VkDeviceSize offset)
+{
+   VkResult result;
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+
+   VkBufferViewCreateInfo buffer_view_create_info = {
+      .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO,
+      .buffer = buf,
+      .format = format,
+      .offset = offset,
+      .range = size,
+   };
+   result = disp->CreateBufferView(_device, &buffer_view_create_info,
+                                   allocator, buf_view);
+   return result;
+}
+
+static uint8_t
+get_partition_table_index(VkFormat format)
+{
+   switch (format) {
+   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
+      return 0;
+   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
+      return 1;
+   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
+      return 2;
+   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
+      return 3;
+   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
+      return 4;
+   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
+      return 5;
+   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
+      return 6;
+   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
+      return 7;
+   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
+      return 8;
+   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
+      return 9;
+   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
+      return 10;
+   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
+      return 11;
+   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
+      return 12;
+   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
+      return 13;
+   default:
+      unreachable("bad astc format\n");
+      return 0;
+   }
+}
+
+static VkResult
+astc_prepare_buffer(struct vk_device *device,
+                    struct vk_texcompress_astc_state *astc,
+                    VkAllocationCallbacks *allocator,
+                    VkDeviceSize minTexelBufferOffsetAlignment,
+                    uint8_t *single_buf_ptr,
+                    VkDeviceSize *single_buf_size)
+{
+   VkResult result;
+   astc_decoder_lut_holder astc_lut_holder;
+   VkDeviceSize offset = 0;
+
+   _mesa_init_astc_decoder_luts(&astc_lut_holder);
+
+   const astc_decoder_lut *luts[] = {
+      &astc_lut_holder.color_endpoint,
+      &astc_lut_holder.color_endpoint_unquant,
+      &astc_lut_holder.weights,
+      &astc_lut_holder.weights_unquant,
+      &astc_lut_holder.trits_quints,
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(luts); i++) {
+      offset = align(offset, minTexelBufferOffsetAlignment);
+      if (single_buf_ptr) {
+         memcpy(single_buf_ptr + offset, luts[i]->data, luts[i]->size_B);
+         result = create_buffer_view(device, allocator, &astc->luts_buf_view[i], astc->luts_buf,
+                                     vk_format_from_pipe_format(luts[i]->format), luts[i]->size_B,
+                                     offset);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+      offset += luts[i]->size_B;
+   }
+
+   const VkFormat formats[] = {
+      VK_FORMAT_ASTC_4x4_UNORM_BLOCK,
+      VK_FORMAT_ASTC_5x4_UNORM_BLOCK,
+      VK_FORMAT_ASTC_5x5_UNORM_BLOCK,
+      VK_FORMAT_ASTC_6x5_UNORM_BLOCK,
+      VK_FORMAT_ASTC_6x6_UNORM_BLOCK,
+      VK_FORMAT_ASTC_8x5_UNORM_BLOCK,
+      VK_FORMAT_ASTC_8x6_UNORM_BLOCK,
+      VK_FORMAT_ASTC_8x8_UNORM_BLOCK,
+      VK_FORMAT_ASTC_10x5_UNORM_BLOCK,
+      VK_FORMAT_ASTC_10x6_UNORM_BLOCK,
+      VK_FORMAT_ASTC_10x8_UNORM_BLOCK,
+      VK_FORMAT_ASTC_10x10_UNORM_BLOCK,
+      VK_FORMAT_ASTC_12x10_UNORM_BLOCK,
+      VK_FORMAT_ASTC_12x12_UNORM_BLOCK,
+   };
+
+   for (uint32_t i = 0; i < ARRAY_SIZE(formats); i++) {
+      unsigned lut_width;
+      unsigned lut_height;
+      const void *lut_data = _mesa_get_astc_decoder_partition_table(
+            vk_format_get_blockwidth(formats[i]),
+            vk_format_get_blockheight(formats[i]),
+            &lut_width, &lut_height);
+      const unsigned lut_size = lut_width * lut_height;
+
+      offset = align(offset, minTexelBufferOffsetAlignment);
+      if (single_buf_ptr) {
+         memcpy(single_buf_ptr + offset, lut_data, lut_width * lut_height);
+
+         result = create_buffer_view(device, allocator, &astc->partition_tbl_buf_view[i],
+                                     astc->luts_buf, VK_FORMAT_R8_UINT, lut_width * lut_height,
+                                     offset);
+         if (result != VK_SUCCESS)
+            return result;
+      }
+      offset += lut_size;
+   }
+
+   *single_buf_size = offset;
+   return result;
+}
+
+static VkResult
+create_fill_all_luts_vulkan(struct vk_device *device,
+                            VkAllocationCallbacks *allocator,
+                            struct vk_texcompress_astc_state *astc)
+{
+   VkResult result;
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+   VkPhysicalDevice _phy_device = vk_physical_device_to_handle(device->physical);
+   const struct vk_physical_device_dispatch_table *phy_disp = &device->physical->dispatch_table;
+   VkDeviceSize single_buf_size;
+   uint8_t *single_buf_ptr;
+
+   VkPhysicalDeviceProperties2 phy_dev_prop = {
+      .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
+      .pNext = NULL,
+   };
+   phy_disp->GetPhysicalDeviceProperties2(_phy_device, &phy_dev_prop);
+
+   /* get the single_buf_size */
+   result = astc_prepare_buffer(device, astc, allocator,
+                                phy_dev_prop.properties.limits.minTexelBufferOffsetAlignment,
+                                NULL, &single_buf_size);
+
+   /* create gpu buffer for all the luts */
+   result = vk_create_buffer(device, allocator, single_buf_size,
+                             VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                             VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+                             VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT,
+                             &astc->luts_buf, &astc->luts_mem);
+   if (unlikely(result != VK_SUCCESS))
+      return result;
+
+   disp->MapMemory(_device, astc->luts_mem, 0, VK_WHOLE_SIZE, 0, (void*)&single_buf_ptr);
+
+   /* fill all the luts and create views */
+   result = astc_prepare_buffer(device, astc, allocator,
+                                phy_dev_prop.properties.limits.minTexelBufferOffsetAlignment,
+                                single_buf_ptr, &single_buf_size);
+
+   disp->UnmapMemory(_device, astc->luts_mem);
+   return result;
+}
+
+static VkResult
+create_layout(struct vk_device *device, VkAllocationCallbacks *allocator,
+              struct vk_texcompress_astc_state *astc)
+{
+   VkResult result;
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+
+   VkDescriptorSetLayoutBinding bindings[] = {
+      {
+         .binding = 0, /* OutputImage2DArray */
+         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 1, /* PayloadInput2DArray */
+         .descriptorType = VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 2, /* LUTRemainingBitsToEndpointQuantizer */
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 3, /* LUTEndpointUnquantize */
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 4, /* LUTWeightQuantizer */
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 5, /* LUTWeightUnquantize */
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 6, /* LUTTritQuintDecode */
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+      {
+         .binding = 7, /* LUTPartitionTable */
+         .descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER,
+         .descriptorCount = 1,
+         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
+         .pImmutableSamplers = NULL,
+      },
+   };
+
+   VkDescriptorSetLayoutCreateInfo ds_create_info = {
+      .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+      .flags = VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR,
+      .bindingCount = ARRAY_SIZE(bindings),
+      .pBindings = bindings,
+   };
+
+   result = disp->CreateDescriptorSetLayout(_device, &ds_create_info,
+                                            allocator, &astc->ds_layout);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   VkPipelineLayoutCreateInfo pl_create_info = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+      .setLayoutCount = 1,
+      .pSetLayouts = &astc->ds_layout,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges = &(VkPushConstantRange){VK_SHADER_STAGE_COMPUTE_BIT, 0, 16},
+   };
+   result = disp->CreatePipelineLayout(_device, &pl_create_info, allocator,
+                                       &astc->p_layout);
+fail:
+   return result;
+}
+
+static const uint32_t astc_spv[] = {
+#include "astc_spv.h"
+};
+
+static VkResult
+vk_astc_create_shader_module(struct vk_device *device,
+                             VkAllocationCallbacks *allocator,
+                             struct vk_texcompress_astc_state *astc)
+{
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+
+   VkShaderModuleCreateInfo shader_module_create_info = {
+      .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
+      .pNext = NULL,
+      .flags = 0,
+      .codeSize = sizeof(astc_spv),
+      .pCode = astc_spv,
+   };
+
+   return disp->CreateShaderModule(_device, &shader_module_create_info,
+                                   allocator, &astc->shader_module);
+}
+
+static VkResult
+create_astc_decode_pipeline(struct vk_device *device,
+                            VkAllocationCallbacks *allocator,
+                            struct vk_texcompress_astc_state *astc,
+                            VkPipelineCache pipeline_cache, VkFormat format)
+{
+   VkResult result;
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+   VkPipeline pipeline;
+   uint8_t t_i;
+
+   t_i = get_partition_table_index(format);
+
+   uint32_t special_data[3] = {
+      vk_format_get_blockwidth(format),
+      vk_format_get_blockheight(format),
+      true,
+   };
+   VkSpecializationMapEntry special_map_entry[3] = {{
+                                                       .constantID = 0,
+                                                       .offset = 0,
+                                                       .size = 4,
+                                                    },
+                                                    {
+                                                       .constantID = 1,
+                                                       .offset = 4,
+                                                       .size = 4,
+                                                    },
+                                                    {
+                                                       .constantID = 2,
+                                                       .offset = 8,
+                                                       .size = 4,
+                                                    }};
+
+   VkSpecializationInfo specialization_info = {
+      .mapEntryCount = 3,
+      .pMapEntries = special_map_entry,
+      .dataSize = 12,
+      .pData = special_data,
+   };
+
+   /* compute shader */
+   VkPipelineShaderStageCreateInfo pipeline_shader_stage = {
+      .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+      .stage = VK_SHADER_STAGE_COMPUTE_BIT,
+      .module = astc->shader_module,
+      .pName = "main",
+      .pSpecializationInfo = &specialization_info,
+   };
+
+   VkComputePipelineCreateInfo vk_pipeline_info = {
+      .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
+      .stage = pipeline_shader_stage,
+      .flags = 0,
+      .layout = astc->p_layout,
+   };
+
+   result = disp->CreateComputePipelines(
+      _device, pipeline_cache, 1, &vk_pipeline_info, allocator, &pipeline);
+   if (result != VK_SUCCESS)
+      return result;
+
+   astc->pipeline[t_i] = pipeline;
+   astc->pipeline_mask |= (1 << t_i);
+
+   return result;
+}
+
+VkPipeline
+vk_texcompress_astc_get_decode_pipeline(struct vk_device *device, VkAllocationCallbacks *allocator,
+                                        struct vk_texcompress_astc_state *astc, VkPipelineCache pipeline_cache,
+                                        VkFormat format)
+{
+   VkResult result;
+   uint8_t t_i = get_partition_table_index(format);
+
+   simple_mtx_lock(&astc->mutex);
+
+   if (astc->pipeline[t_i])
+      goto unlock;
+
+   if (!astc->shader_module) {
+      result = vk_astc_create_shader_module(device, allocator, astc);
+      if (result != VK_SUCCESS)
+         goto unlock;
+   }
+
+   create_astc_decode_pipeline(device, allocator, astc, pipeline_cache, format);
+
+unlock:
+   simple_mtx_unlock(&astc->mutex);
+   return astc->pipeline[t_i];
+}
+
+static inline void
+fill_desc_image_info_struct(VkDescriptorImageInfo *info, VkImageView img_view,
+                            VkImageLayout img_layout)
+{
+   info->sampler = VK_NULL_HANDLE;
+   info->imageView = img_view;
+   info->imageLayout = img_layout;
+}
+
+static inline void
+fill_write_descriptor_set_image(VkWriteDescriptorSet *set, uint8_t bind_i,
+                                VkDescriptorType desc_type, VkDescriptorImageInfo *image_info)
+{
+   set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+   set->pNext = NULL;
+   set->dstSet = VK_NULL_HANDLE;
+   set->dstBinding = bind_i;
+   set->dstArrayElement = 0;
+   set->descriptorCount = 1;
+   set->descriptorType = desc_type;
+   set->pImageInfo = image_info;
+   set->pBufferInfo = NULL;
+   set->pTexelBufferView = NULL;
+}
+
+static inline void
+fill_write_descriptor_set_uniform_texel(VkWriteDescriptorSet *set,
+                                        uint8_t bind_i,
+                                        VkBufferView *buf_view)
+{
+   set->sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+   set->pNext = NULL;
+   set->dstSet = VK_NULL_HANDLE;
+   set->dstBinding = bind_i;
+   set->dstArrayElement = 0;
+   set->descriptorCount = 1;
+   set->descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER;
+   set->pImageInfo = NULL;
+   set->pBufferInfo = NULL;
+   set->pTexelBufferView = buf_view;
+}
+
+void
+vk_texcompress_astc_fill_write_descriptor_sets(struct vk_texcompress_astc_state *astc,
+                                               struct vk_texcompress_astc_write_descriptor_set *set,
+                                               VkImageView src_img_view, VkImageLayout src_img_layout,
+                                               VkImageView dst_img_view,
+                                               VkFormat format)
+{
+   unsigned desc_i;
+
+   desc_i = 0;
+   fill_desc_image_info_struct(&set->dst_desc_image_info, dst_img_view, VK_IMAGE_LAYOUT_GENERAL);
+   fill_write_descriptor_set_image(&set->descriptor_set[desc_i], desc_i,
+                                   VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, &set->dst_desc_image_info);
+   desc_i++;
+   fill_desc_image_info_struct(&set->src_desc_image_info, src_img_view, src_img_layout);
+   fill_write_descriptor_set_image(&set->descriptor_set[desc_i], desc_i,
+                                   VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE, &set->src_desc_image_info);
+   /* fill luts descriptor */
+   desc_i++;
+   for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_LUTS; i++) {
+      fill_write_descriptor_set_uniform_texel(&set->descriptor_set[desc_i + i], desc_i + i,
+                                              &astc->luts_buf_view[i]);
+   }
+   desc_i += VK_TEXCOMPRESS_ASTC_NUM_LUTS;
+   uint8_t t_i = get_partition_table_index(format);
+   fill_write_descriptor_set_uniform_texel(&set->descriptor_set[desc_i], desc_i,
+                                           &astc->partition_tbl_buf_view[t_i]);
+   desc_i++;
+   assert(desc_i == ARRAY_SIZE(set->descriptor_set));
+}
+
+VkResult
+vk_texcompress_astc_init(struct vk_device *device, VkAllocationCallbacks *allocator,
+                         VkPipelineCache pipeline_cache,
+                         struct vk_texcompress_astc_state **astc)
+{
+   VkResult result;
+
+   /* astc memory to be freed as part of vk_astc_decode_finish() */
+   *astc = vk_zalloc(allocator, sizeof(struct vk_texcompress_astc_state), 8,
+                     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (*astc == NULL)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   simple_mtx_init(&(*astc)->mutex, mtx_plain);
+
+   result = create_fill_all_luts_vulkan(device, allocator, *astc);
+   if (result != VK_SUCCESS)
+      goto fail;
+
+   result = create_layout(device, allocator, *astc);
+
+fail:
+   return result;
+}
+
+void
+vk_texcompress_astc_finish(struct vk_device *device,
+                           VkAllocationCallbacks *allocator,
+                           struct vk_texcompress_astc_state *astc)
+{
+   VkDevice _device = vk_device_to_handle(device);
+   const struct vk_device_dispatch_table *disp = &device->dispatch_table;
+
+   while (astc->pipeline_mask) {
+      uint8_t t_i = u_bit_scan(&astc->pipeline_mask);
+      disp->DestroyPipeline(_device, astc->pipeline[t_i], allocator);
+   }
+
+   disp->DestroyPipelineLayout(_device, astc->p_layout, allocator);
+   disp->DestroyShaderModule(_device, astc->shader_module, allocator);
+   disp->DestroyDescriptorSetLayout(_device, astc->ds_layout, allocator);
+
+   for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_LUTS; i++)
+      disp->DestroyBufferView(_device, astc->luts_buf_view[i], allocator);
+
+   for (unsigned i = 0; i < VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES; i++)
+      disp->DestroyBufferView(_device, astc->partition_tbl_buf_view[i], allocator);
+
+   disp->DestroyBuffer(_device, astc->luts_buf, allocator);
+   disp->FreeMemory(_device, astc->luts_mem, allocator);
+
+   vk_free(allocator, astc);
+}
diff --git a/src/vulkan/runtime/vk_texcompress_astc.h b/src/vulkan/runtime/vk_texcompress_astc.h
new file mode 100644 (file)
index 0000000..e307af5
--- /dev/null
@@ -0,0 +1,121 @@
+/* Copyright (c) 2017-2023 Hans-Kristian Arntzen
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef VK_TEXCOMPRESS_ASTC_H
+#define VK_TEXCOMPRESS_ASTC_H
+
+#include "vk_device.h"
+
+/* luts order matching astc glsl shader below,
+ * 0 - color endpoint
+ * 1 - color endpoint unquant
+ * 2 - weights
+ * 3 - weights unquant
+ * 4 - trits quints
+ */
+#define VK_TEXCOMPRESS_ASTC_NUM_LUTS 5
+#define VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES 14
+#define VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT 8
+
+struct vk_texcompress_astc_state {
+   /* single buffer is allocated for all luts */
+   VkDeviceMemory luts_mem;
+   VkBuffer luts_buf;
+
+   VkBufferView luts_buf_view[VK_TEXCOMPRESS_ASTC_NUM_LUTS];
+   VkBufferView partition_tbl_buf_view[VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES];
+
+   simple_mtx_t mutex;
+   VkDescriptorSetLayout ds_layout;
+   VkPipelineLayout p_layout;
+   VkPipeline pipeline[VK_TEXCOMPRESS_ASTC_NUM_PARTITION_TABLES];
+   uint32_t pipeline_mask;
+   VkShaderModule shader_module;
+};
+
+struct vk_texcompress_astc_write_descriptor_set {
+   VkWriteDescriptorSet descriptor_set[VK_TEXCOMPRESS_ASTC_WRITE_DESC_SET_COUNT];
+   VkDescriptorImageInfo dst_desc_image_info;
+   VkDescriptorImageInfo src_desc_image_info;
+};
+
+void
+vk_texcompress_astc_fill_write_descriptor_sets(struct vk_texcompress_astc_state *astc,
+                                               struct vk_texcompress_astc_write_descriptor_set *set,
+                                               VkImageView src_img_view, VkImageLayout src_img_layout,
+                                               VkImageView dst_img_view,
+                                               VkFormat format);
+VkPipeline vk_texcompress_astc_get_decode_pipeline(struct vk_device *device,
+                                                   VkAllocationCallbacks *allocator,
+                                                   struct vk_texcompress_astc_state *astc,
+                                                   VkPipelineCache pipeline_cache,
+                                                   VkFormat format);
+VkResult vk_texcompress_astc_init(struct vk_device *device,
+                                  VkAllocationCallbacks *allocator,
+                                  VkPipelineCache pipeline_cache,
+                                  struct vk_texcompress_astc_state **astc);
+void vk_texcompress_astc_finish(struct vk_device *device,
+                                VkAllocationCallbacks *allocator,
+                                struct vk_texcompress_astc_state *astc);
+
+static inline VkFormat
+vk_texcompress_astc_emulation_format(VkFormat format)
+{
+   /* TODO: From VK_EXT_astc_Decode_mode spec, VK_FORMAT_R16G16B16A16_SFLOAT is the default
+    * option. VK_FORMAT_R8G8B8A8_UNORM is only acceptable image quality option.
+    */
+   switch (format) {
+   case VK_FORMAT_ASTC_4x4_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_5x4_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_5x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_6x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_6x6_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_8x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_8x6_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_8x8_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x5_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x6_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x8_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_10x10_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_12x10_UNORM_BLOCK:
+   case VK_FORMAT_ASTC_12x12_UNORM_BLOCK:
+      return VK_FORMAT_R8G8B8A8_UNORM;
+   case VK_FORMAT_ASTC_4x4_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_5x4_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_5x5_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_6x5_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_6x6_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_8x5_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_8x6_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_8x8_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_10x5_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_10x6_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_10x8_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_10x10_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_12x10_SRGB_BLOCK:
+   case VK_FORMAT_ASTC_12x12_SRGB_BLOCK:
+      return VK_FORMAT_R8G8B8A8_SRGB;
+   default:
+      return VK_FORMAT_UNDEFINED;
+   }
+}
+
+#endif /* VK_TEXCOMPRESS_ASTC_H */