lavapipe: NV_device_generated_commands
author     Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
           Fri, 2 Jun 2023 14:22:11 +0000 (10:22 -0400)
committer  Marge Bot <emma+marge@anholt.net>
           Thu, 8 Jun 2023 01:22:01 +0000 (01:22 +0000)
this adds sufficient handling to pass the vkd3d-proton tests as well
as a CTS run on zink, which will have to be enough since there is no
VK-CTS coverage for this extension

it works by dynamically generating a vk_cmd_queue list of commands,
just like the one the regular cmd queue would generate, with the minor
change that the final link has a nulled next pointer; this correctly
handles buffer copies, where the last link would otherwise keep a next
pointer into the original cmd list
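
roughly, the preprocessing path builds the list like this (condensed
from the lvp_execute.c hunks below):

   struct list_head *list = (void*)(map + pre->preprocessOffset);
   list_inithead(list);
   /* ...append one vk_cmd_queue_entry per token per sequence... */

   /* vk_cmd_queue will copy the binary and break the list, so null
    * the tail pointer */
   list->prev->next = NULL;

and lvp_execute_cmd_buffer() stops iterating once it hits the nulled
link:

   if (!cmd->cmd_link.next)
      break;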

Acked-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23394>

src/gallium/frontends/lavapipe/lvp_device.c
src/gallium/frontends/lavapipe/lvp_execute.c
src/gallium/frontends/lavapipe/lvp_pipeline.c
src/gallium/frontends/lavapipe/lvp_private.h

diff --git a/src/gallium/frontends/lavapipe/lvp_device.c b/src/gallium/frontends/lavapipe/lvp_device.c
index f6f6a1c..1de6709 100644
--- a/src/gallium/frontends/lavapipe/lvp_device.c
+++ b/src/gallium/frontends/lavapipe/lvp_device.c
@@ -211,6 +211,7 @@ static const struct vk_device_extension_table lvp_device_extensions_supported =
    .EXT_robustness2                       = true,
    .GOOGLE_decorate_string                = true,
    .GOOGLE_hlsl_functionality1            = true,
+   .NV_device_generated_commands          = true,
 };
 
 static int
@@ -514,6 +515,9 @@ lvp_get_features(const struct lvp_physical_device *pdevice,
       .robustImageAccess2 = true,
       .nullDescriptor = true,
 
+      /* VK_NV_device_generated_commands */
+      .deviceGeneratedCommands = true,
+
       /* VK_EXT_primitive_topology_list_restart */
       .primitiveTopologyListRestart = true,
       .primitiveTopologyPatchListRestart = true,
@@ -1135,6 +1139,20 @@ VKAPI_ATTR void VKAPI_CALL lvp_GetPhysicalDeviceProperties2(
                                         PIPE_CAP_RASTERIZER_SUBPIXEL_BITS);
          break;
       }
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DEVICE_GENERATED_COMMANDS_PROPERTIES_NV: {
+         VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV *properties =
+            (VkPhysicalDeviceDeviceGeneratedCommandsPropertiesNV *)ext;
+         properties->maxGraphicsShaderGroupCount = 1<<12;
+         properties->maxIndirectSequenceCount = 1<<20;
+         properties->maxIndirectCommandsTokenCount = MAX_DGC_TOKENS;
+         properties->maxIndirectCommandsStreamCount = MAX_DGC_STREAMS;
+         properties->maxIndirectCommandsTokenOffset = 2047;
+         properties->maxIndirectCommandsStreamStride = 2048;
+         properties->minSequencesCountBufferOffsetAlignment = 4;
+         properties->minSequencesIndexBufferOffsetAlignment = 4;
+         properties->minIndirectCommandsBufferOffsetAlignment = 4;
+         break;
+      }
       case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_INLINE_UNIFORM_BLOCK_PROPERTIES: {
          VkPhysicalDeviceInlineUniformBlockProperties *properties =
             (VkPhysicalDeviceInlineUniformBlockProperties *)ext;
@@ -2341,6 +2359,126 @@ VKAPI_ATTR void VKAPI_CALL lvp_GetPrivateDataEXT(
                                    privateDataSlot, pData);
 }
 
+VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateIndirectCommandsLayoutNV(
+    VkDevice                                    _device,
+    const VkIndirectCommandsLayoutCreateInfoNV* pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkIndirectCommandsLayoutNV*                 pIndirectCommandsLayout)
+{
+   LVP_FROM_HANDLE(lvp_device, device, _device);
+   struct lvp_indirect_command_layout *dlayout;
+
+   size_t size = sizeof(*dlayout) + pCreateInfo->tokenCount * sizeof(VkIndirectCommandsLayoutTokenNV);
+
+   dlayout =
+      vk_zalloc2(&device->vk.alloc, pAllocator, size, alignof(struct lvp_indirect_command_layout),
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!dlayout)
+      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   vk_object_base_init(&device->vk, &dlayout->base, VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV);
+
+   dlayout->stream_count = pCreateInfo->streamCount;
+   dlayout->token_count = pCreateInfo->tokenCount;
+   for (unsigned i = 0; i < pCreateInfo->streamCount; i++)
+      dlayout->stream_strides[i] = pCreateInfo->pStreamStrides[i];
+   typed_memcpy(dlayout->tokens, pCreateInfo->pTokens, pCreateInfo->tokenCount);
+
+   *pIndirectCommandsLayout = lvp_indirect_command_layout_to_handle(dlayout);
+   return VK_SUCCESS;
+}
+
+VKAPI_ATTR void VKAPI_CALL lvp_DestroyIndirectCommandsLayoutNV(
+    VkDevice                                    _device,
+    VkIndirectCommandsLayoutNV                  indirectCommandsLayout,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   LVP_FROM_HANDLE(lvp_device, device, _device);
+   VK_FROM_HANDLE(lvp_indirect_command_layout, layout, indirectCommandsLayout);
+
+   if (!layout)
+      return;
+
+   vk_object_base_finish(&layout->base);
+   vk_free2(&device->vk.alloc, pAllocator, layout);
+}
+
+enum vk_cmd_type
+lvp_nv_dgc_token_to_cmd_type(const VkIndirectCommandsLayoutTokenNV *token)
+{
+   switch (token->tokenType) {
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SHADER_GROUP_NV:
+         return VK_CMD_BIND_PIPELINE_SHADER_GROUP_NV;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV:
+         if (token->indirectStateFlags & VK_INDIRECT_STATE_FLAG_FRONTFACE_BIT_NV) {
+            return VK_CMD_SET_FRONT_FACE;
+         }
+         assert(!"unknown token type!");
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
+         return VK_CMD_PUSH_CONSTANTS;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
+         return VK_CMD_BIND_INDEX_BUFFER;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
+         return VK_CMD_BIND_VERTEX_BUFFERS2;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
+         return VK_CMD_DRAW_INDEXED_INDIRECT;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
+         return VK_CMD_DRAW_INDIRECT;
+      // only available if VK_EXT_mesh_shader is supported
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV:
+         return VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT;
+      // only available if VK_NV_mesh_shader is supported
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_TASKS_NV:
+         unreachable("NV_mesh_shader unsupported!");
+      default:
+         unreachable("unknown token type");
+   }
+   return UINT32_MAX;
+}
+
+VKAPI_ATTR void VKAPI_CALL lvp_GetGeneratedCommandsMemoryRequirementsNV(
+    VkDevice                                    device,
+    const VkGeneratedCommandsMemoryRequirementsInfoNV* pInfo,
+    VkMemoryRequirements2*                      pMemoryRequirements)
+{
+   VK_FROM_HANDLE(lvp_indirect_command_layout, dlayout, pInfo->indirectCommandsLayout);
+
+   size_t size = sizeof(struct list_head);
+
+   for (unsigned i = 0; i < dlayout->token_count; i++) {
+      const VkIndirectCommandsLayoutTokenNV *token = &dlayout->tokens[i];
+      UNUSED struct vk_cmd_queue_entry *cmd;
+      enum vk_cmd_type type = lvp_nv_dgc_token_to_cmd_type(token);
+      size += vk_cmd_queue_type_sizes[type];
+
+      switch (token->tokenType) {
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV:
+         size += sizeof(*cmd->u.bind_vertex_buffers.buffers);
+         size += sizeof(*cmd->u.bind_vertex_buffers.offsets);
+         size += sizeof(*cmd->u.bind_vertex_buffers2.sizes) + sizeof(*cmd->u.bind_vertex_buffers2.strides);
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV:
+         size += token->pushconstantSize;
+         break;
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SHADER_GROUP_NV:
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV:
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV:
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV:
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV:
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_TASKS_NV:
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV:
+         break;
+      default:
+         unreachable("unknown type!");
+      }
+   }
+
+   /* every sequence gets its own copy of each token's command */
+   size *= pInfo->maxSequencesCount;
+
+   pMemoryRequirements->memoryRequirements.memoryTypeBits = 1;
+   pMemoryRequirements->memoryRequirements.alignment = 4;
+   pMemoryRequirements->memoryRequirements.size = align(size, pMemoryRequirements->memoryRequirements.alignment);
+}
+
 VKAPI_ATTR void VKAPI_CALL lvp_GetPhysicalDeviceExternalFenceProperties(
    VkPhysicalDevice                           physicalDevice,
    const VkPhysicalDeviceExternalFenceInfo    *pExternalFenceInfo,
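
For context, a minimal app-side sketch of driving the entrypoints
above. This is standard VK_NV_device_generated_commands usage, not part
of the patch; `pipeline` is assumed to be a graphics pipeline created
with VK_PIPELINE_CREATE_INDIRECT_BINDABLE_BIT_NV, and the NV
entrypoints are assumed to have been loaded via vkGetDeviceProcAddr:

   /* one stream, two tokens: bind an index buffer, then draw indexed */
   VkIndirectCommandsLayoutTokenNV tokens[2] = {
      {
         .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV,
         .tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV,
         .stream = 0,
         .offset = 0,
      },
      {
         .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_TOKEN_NV,
         .tokenType = VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV,
         .stream = 0,
         .offset = sizeof(VkBindIndexBufferIndirectCommandNV),
      },
   };
   const uint32_t stride = sizeof(VkBindIndexBufferIndirectCommandNV) +
                           sizeof(VkDrawIndexedIndirectCommand);
   VkIndirectCommandsLayoutCreateInfoNV lci = {
      .sType = VK_STRUCTURE_TYPE_INDIRECT_COMMANDS_LAYOUT_CREATE_INFO_NV,
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .tokenCount = 2,
      .pTokens = tokens,
      .streamCount = 1,
      .pStreamStrides = &stride,
   };
   VkIndirectCommandsLayoutNV layout;
   vkCreateIndirectCommandsLayoutNV(device, &lci, NULL, &layout);

   /* size the preprocess buffer for the worst-case sequence count */
   VkGeneratedCommandsMemoryRequirementsInfoNV mri = {
      .sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_MEMORY_REQUIREMENTS_INFO_NV,
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .pipeline = pipeline,
      .indirectCommandsLayout = layout,
      .maxSequencesCount = 1024,
   };
   VkMemoryRequirements2 reqs = { .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2 };
   vkGetGeneratedCommandsMemoryRequirementsNV(device, &mri, &reqs);
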
diff --git a/src/gallium/frontends/lavapipe/lvp_execute.c b/src/gallium/frontends/lavapipe/lvp_execute.c
index 54960d4..9ee9be1 100644
--- a/src/gallium/frontends/lavapipe/lvp_execute.c
+++ b/src/gallium/frontends/lavapipe/lvp_execute.c
@@ -855,10 +855,9 @@ handle_graphics_layout(struct rendering_state *state, gl_shader_stage stage, str
    }
 }
 
-static void handle_graphics_pipeline(struct vk_cmd_queue_entry *cmd,
+static void handle_graphics_pipeline(struct lvp_pipeline *pipeline,
                                      struct rendering_state *state)
 {
-   LVP_FROM_HANDLE(lvp_pipeline, pipeline, cmd->u.bind_pipeline.pipeline);
    const struct vk_graphics_pipeline_state *ps = &pipeline->graphics_state;
    lvp_pipeline_shaders_compile(pipeline);
    bool dynamic_tess_origin = BITSET_TEST(ps->dynamic, MESA_VK_DYNAMIC_TS_DOMAIN_ORIGIN);
@@ -1198,7 +1197,7 @@ static void handle_pipeline(struct vk_cmd_queue_entry *cmd,
       handle_compute_pipeline(cmd, state);
       handle_pipeline_access(state, MESA_SHADER_COMPUTE);
    } else {
-      handle_graphics_pipeline(cmd, state);
+      handle_graphics_pipeline(pipeline, state);
       lvp_forall_gfx_stage(sh) {
          handle_pipeline_access(state, sh);
       }
@@ -1206,6 +1205,19 @@ static void handle_pipeline(struct vk_cmd_queue_entry *cmd,
    state->push_size[pipeline->is_compute_pipeline] = pipeline->layout->push_constant_size;
 }
 
+static void
+handle_graphics_pipeline_group(struct vk_cmd_queue_entry *cmd, struct rendering_state *state)
+{
+   assert(cmd->u.bind_pipeline_shader_group_nv.pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS);
+   LVP_FROM_HANDLE(lvp_pipeline, pipeline, cmd->u.bind_pipeline_shader_group_nv.pipeline);
+   if (cmd->u.bind_pipeline_shader_group_nv.group_index)
+      pipeline = lvp_pipeline_from_handle(pipeline->groups[cmd->u.bind_pipeline_shader_group_nv.group_index - 1]);
+   handle_graphics_pipeline(pipeline, state);
+   lvp_forall_gfx_stage(sh)
+      handle_pipeline_access(state, sh);
+   state->push_size[pipeline->is_compute_pipeline] = pipeline->layout->push_constant_size;
+}
+
 static void handle_vertex_buffers2(struct vk_cmd_queue_entry *cmd,
                                    struct rendering_state *state)
 {
@@ -2864,11 +2876,13 @@ static void handle_index_buffer(struct vk_cmd_queue_entry *cmd,
    default:
       break;
    }
-   state->index_offset = ib->offset;
-   if (ib->buffer)
+   if (ib->buffer) {
+      state->index_offset = ib->offset;
       state->index_buffer = lvp_buffer_from_handle(ib->buffer)->bo;
-   else
+   } else {
+      state->index_offset = 0;
       state->index_buffer = state->device->zero_buffer;
+   }
 
    state->ib_dirty = true;
 }
@@ -4227,6 +4241,233 @@ static void handle_draw_mesh_tasks_indirect_count(struct vk_cmd_queue_entry *cmd
    state->pctx->draw_mesh_tasks(state->pctx, &state->dispatch_info);
 }
 
+static VkBuffer
+get_buffer(struct rendering_state *state, uint8_t *ptr, size_t *offset)
+{
+   simple_mtx_lock(&state->device->bda_lock);
+   hash_table_foreach(&state->device->bda, he) {
+      const uint8_t *bda = he->key;
+      if (ptr < bda)
+         continue;
+      struct lvp_buffer *buffer = he->data;
+      if (bda + buffer->size > ptr) {
+         *offset = ptr - bda;
+         simple_mtx_unlock(&state->device->bda_lock);
+         return lvp_buffer_to_handle(buffer);
+      }
+   }
+   fprintf(stderr, "unrecognized BDA!\n");
+   abort();
+}
+
+static size_t
+process_sequence(struct rendering_state *state,
+                 VkPipeline pipeline, struct lvp_indirect_command_layout *dlayout,
+                 struct list_head *list, uint8_t *pbuf, size_t max_size,
+                 uint8_t **map_streams, const VkIndirectCommandsStreamNV *pstreams, uint32_t seq)
+{
+   size_t size = 0;
+   for (uint32_t t = 0; t < dlayout->token_count; t++) {
+      const VkIndirectCommandsLayoutTokenNV *token = &dlayout->tokens[t];
+      uint32_t stride = dlayout->stream_strides[token->stream];
+      uint8_t *stream = map_streams[token->stream];
+      uint32_t offset = stride * seq + token->offset;
+      uint32_t draw_offset = offset + pstreams[token->stream].offset;
+      void *input = stream + offset;
+
+      struct vk_cmd_queue_entry *cmd = (struct vk_cmd_queue_entry*)(pbuf + size);
+      size_t cmd_size = vk_cmd_queue_type_sizes[lvp_nv_dgc_token_to_cmd_type(token)];
+      uint8_t *cmdptr = (void*)(pbuf + size + cmd_size);
+      if (max_size < size + cmd_size)
+         abort();
+      cmd->type = lvp_nv_dgc_token_to_cmd_type(token);
+
+      switch (token->tokenType) {
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_SHADER_GROUP_NV: {
+         VkBindShaderGroupIndirectCommandNV *bind = input;
+         cmd->u.bind_pipeline_shader_group_nv.pipeline_bind_point = VK_PIPELINE_BIND_POINT_GRAPHICS;
+         cmd->u.bind_pipeline_shader_group_nv.pipeline = pipeline;
+         cmd->u.bind_pipeline_shader_group_nv.group_index = bind->groupIndex;
+         break;
+      }
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_STATE_FLAGS_NV: {
+         VkSetStateFlagsIndirectCommandNV *state = input;
+         if (token->indirectStateFlags & VK_INDIRECT_STATE_FLAG_FRONTFACE_BIT_NV) {
+            if (state->data & BITFIELD_BIT(VK_FRONT_FACE_CLOCKWISE)) {
+               cmd->u.set_front_face.front_face = VK_FRONT_FACE_CLOCKWISE;
+            } else {
+               cmd->u.set_front_face.front_face = VK_FRONT_FACE_COUNTER_CLOCKWISE;
+            }
+         } else {
+            /* skip this if unrecognized state flag */
+            continue;
+         }
+         break;
+      }
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_PUSH_CONSTANT_NV: {
+         uint32_t *data = input;
+         cmd_size += token->pushconstantSize;
+         if (max_size < size + cmd_size)
+            abort();
+         cmd->u.push_constants.layout = token->pushconstantPipelineLayout;
+         cmd->u.push_constants.stage_flags = token->pushconstantShaderStageFlags;
+         cmd->u.push_constants.offset = token->pushconstantOffset;
+         cmd->u.push_constants.size = token->pushconstantSize;
+         cmd->u.push_constants.values = (void*)cmdptr;
+         memcpy(cmd->u.push_constants.values, data, token->pushconstantSize);
+         break;
+      }
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_INDEX_BUFFER_NV: {
+         VkBindIndexBufferIndirectCommandNV *data = input;
+         cmd->u.bind_index_buffer.offset = 0;
+         if (data->bufferAddress)
+            cmd->u.bind_index_buffer.buffer = get_buffer(state, (void*)(uintptr_t)data->bufferAddress, (size_t*)&cmd->u.bind_index_buffer.offset);
+         else
+            cmd->u.bind_index_buffer.buffer = VK_NULL_HANDLE;
+         cmd->u.bind_index_buffer.index_type = data->indexType;
+         for (unsigned i = 0; i < token->indexTypeCount; i++) {
+            if (data->indexType == token->pIndexTypeValues[i]) {
+               cmd->u.bind_index_buffer.index_type = token->pIndexTypes[i];
+               break;
+            }
+         }
+         break;
+      }
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_VERTEX_BUFFER_NV: {
+         VkBindVertexBufferIndirectCommandNV *data = input;
+         cmd_size += sizeof(*cmd->u.bind_vertex_buffers.buffers) + sizeof(*cmd->u.bind_vertex_buffers.offsets);
+         cmd_size += sizeof(*cmd->u.bind_vertex_buffers2.sizes) + sizeof(*cmd->u.bind_vertex_buffers2.strides);
+         if (max_size < size + cmd_size)
+            abort();
+
+         cmd->u.bind_vertex_buffers2.first_binding = token->vertexBindingUnit;
+         cmd->u.bind_vertex_buffers2.binding_count = 1;
+
+         cmd->u.bind_vertex_buffers2.buffers = (void*)cmdptr;
+         cmd->u.bind_vertex_buffers2.offsets = (void*)(cmdptr + sizeof(*cmd->u.bind_vertex_buffers2.buffers));
+         cmd->u.bind_vertex_buffers2.offsets[0] = 0;
+         cmd->u.bind_vertex_buffers2.buffers[0] = data->bufferAddress ? get_buffer(state, (void*)(uintptr_t)data->bufferAddress, (size_t*)&cmd->u.bind_vertex_buffers2.offsets[0]) : VK_NULL_HANDLE;
+
+         if (token->vertexDynamicStride) {
+            cmd->u.bind_vertex_buffers2.strides = (void*)(cmdptr + sizeof(*cmd->u.bind_vertex_buffers2.buffers) + sizeof(*cmd->u.bind_vertex_buffers2.offsets) + sizeof(*cmd->u.bind_vertex_buffers2.sizes));
+            cmd->u.bind_vertex_buffers2.strides[0] = data->stride;
+         } else {
+            cmd->u.bind_vertex_buffers2.strides = NULL;
+         }
+         break;
+      }
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_INDEXED_NV: {
+         cmd->u.draw_indexed_indirect.buffer = pstreams[token->stream].buffer;
+         cmd->u.draw_indexed_indirect.offset = draw_offset;
+         cmd->u.draw_indexed_indirect.draw_count = 1;
+         cmd->u.draw_indexed_indirect.stride = 0;
+         break;
+      }
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_NV: {
+         cmd->u.draw_indirect.buffer = pstreams[token->stream].buffer;
+         cmd->u.draw_indirect.offset = draw_offset;
+         cmd->u.draw_indirect.draw_count = 1;
+         cmd->u.draw_indirect.stride = 0;
+         break;
+      }
+      // only available if VK_EXT_mesh_shader is supported
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_MESH_TASKS_NV: {
+         cmd->u.draw_mesh_tasks_indirect_ext.buffer = pstreams[token->stream].buffer;
+         cmd->u.draw_mesh_tasks_indirect_ext.offset = draw_offset;
+         cmd->u.draw_mesh_tasks_indirect_ext.draw_count = 1;
+         cmd->u.draw_mesh_tasks_indirect_ext.stride = 0;
+         break;
+      }
+      // only available if VK_NV_mesh_shader is supported
+      case VK_INDIRECT_COMMANDS_TOKEN_TYPE_DRAW_TASKS_NV:
+         unreachable("NV_mesh_shader unsupported!");
+      default:
+         unreachable("unknown token type");
+         break;
+      }
+      size += cmd_size;
+      list_addtail(&cmd->cmd_link, list);
+   }
+   return size;
+}
+
+static void
+handle_preprocess_generated_commands(struct vk_cmd_queue_entry *cmd, struct rendering_state *state)
+{
+   VkGeneratedCommandsInfoNV *pre = cmd->u.preprocess_generated_commands_nv.generated_commands_info;
+   VK_FROM_HANDLE(lvp_indirect_command_layout, dlayout, pre->indirectCommandsLayout);
+   struct pipe_transfer *stream_maps[16];
+   uint8_t *streams[16];
+   for (unsigned i = 0; i < pre->streamCount; i++) {
+      struct lvp_buffer *buf = lvp_buffer_from_handle(pre->pStreams[i].buffer);
+      streams[i] = pipe_buffer_map(state->pctx, buf->bo, PIPE_MAP_READ, &stream_maps[i]);
+      streams[i] += pre->pStreams[i].offset;
+   }
+   LVP_FROM_HANDLE(lvp_buffer, pbuf, pre->preprocessBuffer);
+   LVP_FROM_HANDLE(lvp_buffer, seqc, pre->sequencesCountBuffer);
+   LVP_FROM_HANDLE(lvp_buffer, seqi, pre->sequencesIndexBuffer);
+
+   unsigned seq_count = pre->sequencesCount;
+   if (seqc) {
+      unsigned count = 0;
+      pipe_buffer_read(state->pctx, seqc->bo, pre->sequencesCountOffset, sizeof(uint32_t), &count);
+      seq_count = MIN2(count, seq_count);
+   }
+   uint32_t *seq = NULL;
+   struct pipe_transfer *seq_map = NULL;
+   if (seqi) {
+      seq = pipe_buffer_map(state->pctx, seqi->bo, PIPE_MAP_READ, &seq_map);
+      seq = (uint32_t*)(((uint8_t*)seq) + pre->sequencesIndexOffset);
+   }
+
+   struct pipe_transfer *pmap;
+   uint8_t *p = pipe_buffer_map(state->pctx, pbuf->bo, PIPE_MAP_WRITE, &pmap);
+   p += pre->preprocessOffset;
+   struct list_head *list = (void*)p;
+   size_t size = sizeof(struct list_head);
+   size_t max_size = pre->preprocessSize;
+   if (size > max_size)
+      abort();
+   list_inithead(list);
+
+   size_t offset = size;
+   for (unsigned i = 0; i < seq_count; i++) {
+      uint32_t s = seq ? seq[i] : i;
+      offset += process_sequence(state, pre->pipeline, dlayout, list, p + offset, max_size, streams, pre->pStreams, s);
+   }
+
+   /* vk_cmd_queue will copy the binary and break the list, so null the tail pointer */
+   list->prev->next = NULL;
+
+   for (unsigned i = 0; i < pre->streamCount; i++)
+      state->pctx->buffer_unmap(state->pctx, stream_maps[i]);
+   state->pctx->buffer_unmap(state->pctx, pmap);
+   if (seq_map)
+      state->pctx->buffer_unmap(state->pctx, seq_map);
+}
+
+static void
+handle_execute_generated_commands(struct vk_cmd_queue_entry *cmd, struct rendering_state *state, bool print_cmds)
+{
+   VkGeneratedCommandsInfoNV *gen = cmd->u.execute_generated_commands_nv.generated_commands_info;
+   struct vk_cmd_execute_generated_commands_nv *exec = &cmd->u.execute_generated_commands_nv;
+   if (!exec->is_preprocessed) {
+      struct vk_cmd_queue_entry pre;
+      pre.u.preprocess_generated_commands_nv.generated_commands_info = exec->generated_commands_info;
+      handle_preprocess_generated_commands(&pre, state);
+   }
+   LVP_FROM_HANDLE(lvp_buffer, pbuf, gen->preprocessBuffer);
+   struct pipe_transfer *pmap;
+   uint8_t *p = pipe_buffer_map(state->pctx, pbuf->bo, PIPE_MAP_WRITE, &pmap);
+   p += gen->preprocessOffset;
+   struct list_head *list = (void*)p;
+
+   lvp_execute_cmd_buffer(list, state, print_cmds);
+
+   state->pctx->buffer_unmap(state->pctx, pmap);
+}
+
 void lvp_add_enqueue_cmd_entrypoints(struct vk_device_dispatch_table *disp)
 {
    struct vk_device_dispatch_table cmd_enqueue_dispatch;
@@ -4350,6 +4591,11 @@ void lvp_add_enqueue_cmd_entrypoints(struct vk_device_dispatch_table *disp)
    ENQUEUE_CMD(CmdDrawMeshTasksEXT)
    ENQUEUE_CMD(CmdDrawMeshTasksIndirectEXT)
    ENQUEUE_CMD(CmdDrawMeshTasksIndirectCountEXT)
+
+   ENQUEUE_CMD(CmdBindPipelineShaderGroupNV)
+   ENQUEUE_CMD(CmdPreprocessGeneratedCommandsNV)
+   ENQUEUE_CMD(CmdExecuteGeneratedCommandsNV)
+
 #undef ENQUEUE_CMD
 }
 
@@ -4681,6 +4927,15 @@ static void lvp_execute_cmd_buffer(struct list_head *cmds,
          emit_state(state);
          handle_draw_mesh_tasks_indirect_count(cmd, state);
          break;
+      case VK_CMD_BIND_PIPELINE_SHADER_GROUP_NV:
+         handle_graphics_pipeline_group(cmd, state);
+         break;
+      case VK_CMD_PREPROCESS_GENERATED_COMMANDS_NV:
+         handle_preprocess_generated_commands(cmd, state);
+         break;
+      case VK_CMD_EXECUTE_GENERATED_COMMANDS_NV:
+         handle_execute_generated_commands(cmd, state, print_cmds);
+         break;
       default:
          fprintf(stderr, "Unsupported command %s\n", vk_cmd_queue_type_names[cmd->type]);
          unreachable("Unsupported command");
@@ -4688,6 +4943,8 @@ static void lvp_execute_cmd_buffer(struct list_head *cmds,
       }
       first = false;
       did_flush = false;
+      if (!cmd->cmd_link.next)
+         break;
    }
 }
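
For context again, a sketch of the record-time half that feeds the
handle_preprocess_generated_commands()/handle_execute_generated_commands()
paths above (illustrative; `layout`, `stream_buf`, and `preprocess_buf`
are assumed to exist, with `preprocess_buf` sized from the
memory-requirements query). During preprocessing, each token's data is
read at stream map + VkIndirectCommandsStreamNV::offset +
stream stride * sequence index + token->offset:

   VkGeneratedCommandsInfoNV info = {
      .sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_INFO_NV,
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .pipeline = pipeline,
      .indirectCommandsLayout = layout,
      .streamCount = 1,
      .pStreams = &(VkIndirectCommandsStreamNV){
         .buffer = stream_buf,
         .offset = 0,
      },
      .sequencesCount = 1024,
      .preprocessBuffer = preprocess_buf,
      .preprocessOffset = 0,
      .preprocessSize = reqs.memoryRequirements.size,
   };

   /* explicit preprocess is optional: with isPreprocessed = VK_FALSE,
    * lavapipe preprocesses lazily inside the execute handler */
   vkCmdPreprocessGeneratedCommandsNV(cmdbuf, &info);
   vkCmdExecuteGeneratedCommandsNV(cmdbuf, VK_TRUE /* isPreprocessed */, &info);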
 
diff --git a/src/gallium/frontends/lavapipe/lvp_pipeline.c b/src/gallium/frontends/lavapipe/lvp_pipeline.c
index 99b8161..421a05f 100644
--- a/src/gallium/frontends/lavapipe/lvp_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_pipeline.c
@@ -81,6 +81,11 @@ lvp_pipeline_destroy(struct lvp_device *device, struct lvp_pipeline *pipeline)
    if (pipeline->layout)
       vk_pipeline_layout_unref(&device->vk, &pipeline->layout->vk);
 
+   for (unsigned i = 0; i < pipeline->num_groups; i++) {
+      LVP_FROM_HANDLE(lvp_pipeline, p, pipeline->groups[i]);
+      lvp_pipeline_destroy(device, p);
+   }
+
    vk_free(&device->vk.alloc, pipeline->state_data);
    vk_object_base_finish(&pipeline->base);
    vk_free(&device->vk.alloc, pipeline);
@@ -991,7 +996,8 @@ lvp_graphics_pipeline_create(
    VkDevice _device,
    VkPipelineCache _cache,
    const VkGraphicsPipelineCreateInfo *pCreateInfo,
-   VkPipeline *pPipeline)
+   VkPipeline *pPipeline,
+   bool group)
 {
    LVP_FROM_HANDLE(lvp_device, device, _device);
    LVP_FROM_HANDLE(lvp_pipeline_cache, cache, _cache);
@@ -1000,7 +1006,12 @@ lvp_graphics_pipeline_create(
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
 
-   pipeline = vk_zalloc(&device->vk.alloc, sizeof(*pipeline), 8,
+   size_t size = 0;
+   const VkGraphicsPipelineShaderGroupsCreateInfoNV *groupinfo = vk_find_struct_const(pCreateInfo, GRAPHICS_PIPELINE_SHADER_GROUPS_CREATE_INFO_NV);
+   if (!group && groupinfo)
+      size += (groupinfo->groupCount + groupinfo->pipelineCount) * sizeof(VkPipeline);
+
+   pipeline = vk_zalloc(&device->vk.alloc, sizeof(*pipeline) + size, 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (pipeline == NULL)
       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -1013,9 +1024,28 @@ lvp_graphics_pipeline_create(
       vk_free(&device->vk.alloc, pipeline);
       return result;
    }
+   if (!group && groupinfo) {
+      VkGraphicsPipelineCreateInfo pci = *pCreateInfo;
+      for (unsigned i = 0; i < groupinfo->groupCount; i++) {
+         const VkGraphicsShaderGroupCreateInfoNV *g = &groupinfo->pGroups[i];
+         pci.pVertexInputState = g->pVertexInputState;
+         pci.pTessellationState = g->pTessellationState;
+         pci.pStages = g->pStages;
+         pci.stageCount = g->stageCount;
+         result = lvp_graphics_pipeline_create(_device, _cache, &pci, &pipeline->groups[i], true);
+         if (result != VK_SUCCESS) {
+            lvp_pipeline_destroy(device, pipeline);
+            return result;
+         }
+         pipeline->num_groups++;
+      }
+      for (unsigned i = 0; i < groupinfo->pipelineCount; i++)
+         pipeline->groups[pipeline->num_groups + i] = groupinfo->pPipelines[i];
+      pipeline->num_groups_total = groupinfo->groupCount + groupinfo->pipelineCount;
+   }
 
    VkPipelineCreationFeedbackCreateInfo *feedback = (void*)vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
-   if (feedback) {
+   if (feedback && !group) {
       feedback->pPipelineCreationFeedback->duration = os_time_get_nano() - t0;
       feedback->pPipelineCreationFeedback->flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
       memset(feedback->pPipelineStageCreationFeedbacks, 0, sizeof(VkPipelineCreationFeedback) * feedback->pipelineStageCreationFeedbackCount);
@@ -1043,7 +1073,8 @@ VKAPI_ATTR VkResult VKAPI_CALL lvp_CreateGraphicsPipelines(
          r = lvp_graphics_pipeline_create(_device,
                                           pipelineCache,
                                           &pCreateInfos[i],
-                                          &pPipelines[i]);
+                                          &pPipelines[i],
+                                          false);
       if (r != VK_SUCCESS) {
          result = r;
          pPipelines[i] = VK_NULL_HANDLE;
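
A sketch of the shader-group path implemented above (illustrative;
`stages_a`/`stages_b` and `vi` are assumed to be filled-in shader-stage
arrays and vertex-input state, and per the spec pGroups[0] must match
the pipeline's own state):

   VkGraphicsShaderGroupCreateInfoNV groups[2] = {
      { .sType = VK_STRUCTURE_TYPE_GRAPHICS_SHADER_GROUP_CREATE_INFO_NV,
        .stageCount = 2, .pStages = stages_a,
        .pVertexInputState = &vi, .pTessellationState = NULL },
      { .sType = VK_STRUCTURE_TYPE_GRAPHICS_SHADER_GROUP_CREATE_INFO_NV,
        .stageCount = 2, .pStages = stages_b,
        .pVertexInputState = &vi, .pTessellationState = NULL },
   };
   VkGraphicsPipelineShaderGroupsCreateInfoNV sgci = {
      .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_SHADER_GROUPS_CREATE_INFO_NV,
      .groupCount = 2,
      .pGroups = groups,
   };
   /* chain sgci into VkGraphicsPipelineCreateInfo::pNext at creation,
    * then either bind a group directly or let a SHADER_GROUP token in a
    * generated command stream do it: */
   vkCmdBindPipelineShaderGroupNV(cmdbuf, VK_PIPELINE_BIND_POINT_GRAPHICS,
                                  pipeline, 1 /* select a baked-in group */);
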
diff --git a/src/gallium/frontends/lavapipe/lvp_private.h b/src/gallium/frontends/lavapipe/lvp_private.h
index fbbf315..fbad94b 100644
--- a/src/gallium/frontends/lavapipe/lvp_private.h
+++ b/src/gallium/frontends/lavapipe/lvp_private.h
@@ -86,6 +86,8 @@ extern "C" {
 #define MAX_PUSH_DESCRIPTORS 32
 #define MAX_DESCRIPTOR_UNIFORM_BLOCK_SIZE 4096
 #define MAX_PER_STAGE_DESCRIPTOR_UNIFORM_BLOCKS 8
+#define MAX_DGC_STREAMS 16
+#define MAX_DGC_TOKENS 16
 
 #ifdef _WIN32
 #define lvp_printflike(a, b)
@@ -490,6 +492,10 @@ struct lvp_pipeline {
    bool library;
    bool compiled;
    bool used;
+
+   unsigned num_groups;
+   unsigned num_groups_total;
+   VkPipeline groups[0];
 };
 
 void
@@ -541,6 +547,16 @@ struct lvp_cmd_buffer {
    uint8_t push_constants[MAX_PUSH_CONSTANTS_SIZE];
 };
 
+struct lvp_indirect_command_layout {
+   struct vk_object_base base;
+   uint8_t stream_count;
+   uint8_t token_count;
+   uint16_t stream_strides[MAX_DGC_STREAMS];
+   VkPipelineBindPoint bind_point;
+   VkIndirectCommandsLayoutUsageFlagsNV flags;
+   VkIndirectCommandsLayoutTokenNV tokens[0];
+};
+
 extern const struct vk_command_buffer_ops lvp_cmd_buffer_ops;
 
 static inline const struct lvp_descriptor_set_layout *
@@ -598,6 +614,8 @@ VK_DEFINE_NONDISP_HANDLE_CASTS(lvp_query_pool, base, VkQueryPool,
                                VK_OBJECT_TYPE_QUERY_POOL)
 VK_DEFINE_NONDISP_HANDLE_CASTS(lvp_sampler, base, VkSampler,
                                VK_OBJECT_TYPE_SAMPLER)
+VK_DEFINE_NONDISP_HANDLE_CASTS(lvp_indirect_command_layout, base, VkIndirectCommandsLayoutNV,
+                               VK_OBJECT_TYPE_INDIRECT_COMMANDS_LAYOUT_NV)
 
 struct lvp_write_descriptor {
    uint32_t dst_binding;
@@ -672,6 +690,8 @@ void
 lvp_inline_uniforms(nir_shader *nir, const struct lvp_shader *shader, const uint32_t *uniform_values, uint32_t ubo);
 void *
 lvp_shader_compile(struct lvp_device *device, struct lvp_shader *shader, nir_shader *nir);
+enum vk_cmd_type
+lvp_nv_dgc_token_to_cmd_type(const VkIndirectCommandsLayoutTokenNV *token);
 #ifdef __cplusplus
 }
 #endif