habanalabs: add driver support for internal cb scheduling
author     Ofir Bitton <obitton@habana.ai>
           Wed, 18 Nov 2020 13:46:57 +0000 (15:46 +0200)
committer  Oded Gabbay <ogabbay@kernel.org>
           Wed, 27 Jan 2021 19:03:50 +0000 (21:03 +0200)
In order to support scenarios in which the driver needs access to
HW components but cannot access them directly, add support for
scheduling command buffers internally.
These command buffers will be transmitted as part of the next user
command submission.
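
For reference, here is a minimal sketch of the producer side this
mechanism serves, based only on the structures added in this patch
(the helper name queue_internal_cb() and its caller are assumptions,
not part of the patch): an internal driver flow allocates a
struct hl_pending_cb, points it at a kernel-owned CB, and appends it
to ctx->pending_cb_list under pending_cb_lock. hl_submit_pending_cb()
then transmits it during the next user CS ioctl, or
hl_pending_cb_list_flush() releases it when the context is destroyed.

/*
 * Illustrative sketch only -- not part of this patch. The caller is
 * assumed to already hold a reference on @cb; the pending list takes
 * ownership of that reference (hl_pending_cb_list_flush() drops it if
 * the CB is never submitted).
 */
static int queue_internal_cb(struct hl_ctx *ctx, struct hl_cb *cb,
				u32 cb_size, u32 hw_queue_id)
{
	struct hl_pending_cb *pending_cb;

	pending_cb = kzalloc(sizeof(*pending_cb), GFP_KERNEL);
	if (!pending_cb)
		return -ENOMEM;

	pending_cb->cb = cb;
	pending_cb->cb_size = cb_size;
	pending_cb->hw_queue_id = hw_queue_id;

	spin_lock(&ctx->pending_cb_lock);
	list_add_tail(&pending_cb->cb_node, &ctx->pending_cb_list);
	spin_unlock(&ctx->pending_cb_lock);

	return 0;
}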

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/context.c
drivers/misc/habanalabs/common/habanalabs.h

diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 3affb35..a5e9bb0 100644
@@ -585,6 +585,18 @@ void hl_cs_rollback_all(struct hl_device *hdev)
        }
 }
 
+void hl_pending_cb_list_flush(struct hl_ctx *ctx)
+{
+       struct hl_pending_cb *pending_cb, *tmp;
+
+       list_for_each_entry_safe(pending_cb, tmp,
+                       &ctx->pending_cb_list, cb_node) {
+               list_del(&pending_cb->cb_node);
+               hl_cb_put(pending_cb->cb);
+               kfree(pending_cb);
+       }
+}
+
 static void job_wq_completion(struct work_struct *work)
 {
        struct hl_cs_job *job = container_of(work, struct hl_cs_job,
@@ -954,6 +966,129 @@ out:
        return rc;
 }
 
+static int pending_cb_create_job(struct hl_device *hdev, struct hl_ctx *ctx,
+               struct hl_cs *cs, struct hl_cb *cb, u32 size, u32 hw_queue_id)
+{
+       struct hw_queue_properties *hw_queue_prop;
+       struct hl_cs_counters_atomic *cntr;
+       struct hl_cs_job *job;
+
+       hw_queue_prop = &hdev->asic_prop.hw_queues_props[hw_queue_id];
+       cntr = &hdev->aggregated_cs_counters;
+
+       job = hl_cs_allocate_job(hdev, hw_queue_prop->type, true);
+       if (!job) {
+               atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
+               atomic64_inc(&cntr->out_of_mem_drop_cnt);
+               dev_err(hdev->dev, "Failed to allocate a new job\n");
+               return -ENOMEM;
+       }
+
+       job->id = 0;
+       job->cs = cs;
+       job->user_cb = cb;
+       atomic_inc(&job->user_cb->cs_cnt);
+       job->user_cb_size = size;
+       job->hw_queue_id = hw_queue_id;
+       job->patched_cb = job->user_cb;
+       job->job_cb_size = job->user_cb_size;
+
+       /* increment refcount as we get a completion for external queues */
+       cs_get(cs);
+
+       cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+       list_add_tail(&job->cs_node, &cs->job_list);
+
+       hl_debugfs_add_job(hdev, job);
+
+       return 0;
+}
+
+static int hl_submit_pending_cb(struct hl_fpriv *hpriv)
+{
+       struct hl_device *hdev = hpriv->hdev;
+       struct hl_ctx *ctx = hpriv->ctx;
+       struct hl_pending_cb *pending_cb, *tmp;
+       struct list_head local_cb_list;
+       struct hl_cs *cs;
+       struct hl_cb *cb;
+       u32 hw_queue_id;
+       u32 cb_size;
+       int process_list, rc = 0;
+
+       if (list_empty(&ctx->pending_cb_list))
+               return 0;
+
+       process_list = atomic_cmpxchg(&ctx->thread_pending_cb_token, 1, 0);
+
+       /* Only a single thread is allowed to process the list */
+       if (!process_list)
+               return 0;
+
+       if (list_empty(&ctx->pending_cb_list))
+               goto free_pending_cb_token;
+
+       /* move all list elements to a local list */
+       INIT_LIST_HEAD(&local_cb_list);
+       spin_lock(&ctx->pending_cb_lock);
+       list_for_each_entry_safe(pending_cb, tmp, &ctx->pending_cb_list,
+                                                               cb_node)
+               list_move_tail(&pending_cb->cb_node, &local_cb_list);
+       spin_unlock(&ctx->pending_cb_lock);
+
+       rc = allocate_cs(hdev, ctx, CS_TYPE_DEFAULT, &cs);
+       if (rc)
+               goto add_list_elements;
+
+       hl_debugfs_add_cs(cs);
+
+       /* Iterate through pending cb list, create jobs and add to CS */
+       list_for_each_entry(pending_cb, &local_cb_list, cb_node) {
+               cb = pending_cb->cb;
+               cb_size = pending_cb->cb_size;
+               hw_queue_id = pending_cb->hw_queue_id;
+
+               rc = pending_cb_create_job(hdev, ctx, cs, cb, cb_size,
+                                                               hw_queue_id);
+               if (rc)
+                       goto free_cs_object;
+       }
+
+       rc = hl_hw_queue_schedule_cs(cs);
+       if (rc) {
+               if (rc != -EAGAIN)
+                       dev_err(hdev->dev,
+                               "Failed to submit CS %d.%llu (%d)\n",
+                               ctx->asid, cs->sequence, rc);
+               goto free_cs_object;
+       }
+
+       /* pending cb was scheduled successfully */
+       list_for_each_entry_safe(pending_cb, tmp, &local_cb_list, cb_node) {
+               list_del(&pending_cb->cb_node);
+               kfree(pending_cb);
+       }
+
+       cs_put(cs);
+
+       goto free_pending_cb_token;
+
+free_cs_object:
+       cs_rollback(hdev, cs);
+       cs_put(cs);
+add_list_elements:
+       spin_lock(&ctx->pending_cb_lock);
+       list_for_each_entry_safe_reverse(pending_cb, tmp, &local_cb_list,
+                                                               cb_node)
+               list_move(&pending_cb->cb_node, &ctx->pending_cb_list);
+       spin_unlock(&ctx->pending_cb_lock);
+free_pending_cb_token:
+       atomic_set(&ctx->thread_pending_cb_token, 1);
+
+       return rc;
+}
+
 static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
                                u64 *cs_seq)
 {
@@ -1353,6 +1488,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
        if (rc)
                goto out;
 
+       rc = hl_submit_pending_cb(hpriv);
+       if (rc)
+               goto out;
+
        cs_type = hl_cs_get_cs_type(args->in.cs_flags &
                                        ~HL_CS_FLAGS_FORCE_RESTORE);
        chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 3d86b83..829fe98 100644
@@ -15,6 +15,11 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
        u64 idle_mask = 0;
        int i;
 
+       /* Release all allocated pending CBs. These CBs were never
+        * scheduled, so it is safe to release them here.
+        */
+       hl_pending_cb_list_flush(ctx);
+
        /*
         * If we arrived here, there are no jobs waiting for this context
         * on its queues so we can safely remove it.
@@ -142,8 +147,11 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
        kref_init(&ctx->refcount);
 
        ctx->cs_sequence = 1;
+       INIT_LIST_HEAD(&ctx->pending_cb_list);
+       spin_lock_init(&ctx->pending_cb_lock);
        spin_lock_init(&ctx->cs_lock);
        atomic_set(&ctx->thread_ctx_switch_token, 1);
+       atomic_set(&ctx->thread_pending_cb_token, 1);
        ctx->thread_ctx_switch_wait_token = 0;
        ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
                                sizeof(struct hl_fence *),
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index eb43fb3..8e0553b 100644
@@ -1017,6 +1017,20 @@ struct hl_cs_counters_atomic {
 };
 
 /**
+ * struct hl_pending_cb - pending command buffer structure
+ * @cb_node: cb node in pending cb list
+ * @cb: command buffer to send in next submission
+ * @cb_size: command buffer size
+ * @hw_queue_id: destination queue id
+ */
+struct hl_pending_cb {
+       struct list_head        cb_node;
+       struct hl_cb            *cb;
+       u32                     cb_size;
+       u32                     hw_queue_id;
+};
+
+/**
  * struct hl_ctx - user/kernel context.
  * @mem_hash: holds mapping from virtual address to virtual memory area
  *             descriptor (hl_vm_phys_pg_list or hl_userptr).
@@ -1031,6 +1045,8 @@ struct hl_cs_counters_atomic {
  * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
  *            MMU hash or walking the PGT requires talking this lock.
  * @debugfs_list: node in debugfs list of contexts.
+ * @pending_cb_list: list of pending command buffers waiting to be sent as
+ *                  part of the next user command submission.
  * @cs_counters: context command submission counters.
  * @cb_va_pool: device VA pool for command buffers which are mapped to the
  *              device's MMU.
@@ -1039,11 +1055,17 @@ struct hl_cs_counters_atomic {
  *                     index to cs_pending array.
  * @dram_default_hops: array that holds all hops addresses needed for default
  *                     DRAM mapping.
+ * @pending_cb_lock: spinlock to protect pending cb list
  * @cs_lock: spinlock to protect cs_sequence.
  * @dram_phys_mem: amount of used physical DRAM memory by this context.
  * @thread_ctx_switch_token: token to prevent multiple threads of the same
  *                             context from running the context switch phase.
  *                             Only a single thread should run it.
+ * @thread_pending_cb_token: token to prevent multiple threads from processing
+ *                             the pending CB list. Only a single thread should
+ *                             process the list since it is protected by a
+ *                             spinlock and we don't want to halt the entire
+ *                             command submission sequence.
  * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
  *                             the context switch phase from moving to their
  *                             execution phase before the context switch phase
@@ -1062,13 +1084,16 @@ struct hl_ctx {
        struct mutex                    mem_hash_lock;
        struct mutex                    mmu_lock;
        struct list_head                debugfs_list;
+       struct list_head                pending_cb_list;
        struct hl_cs_counters_atomic    cs_counters;
        struct gen_pool                 *cb_va_pool;
        u64                             cs_sequence;
        u64                             *dram_default_hops;
+       spinlock_t                      pending_cb_lock;
        spinlock_t                      cs_lock;
        atomic64_t                      dram_phys_mem;
        atomic_t                        thread_ctx_switch_token;
+       atomic_t                        thread_pending_cb_token;
        u32                             thread_ctx_switch_wait_token;
        u32                             asid;
        u32                             handle;
@@ -2143,6 +2168,7 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx);
 void hl_cb_va_pool_fini(struct hl_ctx *ctx);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
+void hl_pending_cb_list_flush(struct hl_ctx *ctx);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
                enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 void hl_sob_reset_error(struct kref *ref);