habanalabs: Separate CS job completion from its deallocation
author Tomer Tayar <ttayar@habana.ai>
Mon, 10 Aug 2020 14:30:35 +0000 (17:30 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Mon, 30 Nov 2020 08:47:33 +0000 (10:47 +0200)
Current CS jobs are no longer needed after their completion.
However, jobs of future workload might be in use even after they are
completed. To allow that, the patch adds a refcount to the job object,
and decouples its completion handling from its deallocation.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/debugfs.c
drivers/misc/habanalabs/common/habanalabs.h

index 536e095..91bdd6d 100644 (file)
@@ -155,6 +155,18 @@ static void cs_put(struct hl_cs *cs)
        kref_put(&cs->refcount, cs_do_release);
 }
 
+static void cs_job_do_release(struct kref *ref)
+{
+       struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
+
+       kfree(job);
+}
+
+static void cs_job_put(struct hl_cs_job *job)
+{
+       kref_put(&job->refcount, cs_job_do_release);
+}
+
 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
 {
        /*
@@ -227,7 +239,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
        return rc;
 }
 
-static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
+static void complete_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
        struct hl_cs *cs = job->cs;
 
@@ -276,7 +288,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
                        job->queue_type == QUEUE_TYPE_HW)
                cs_put(cs);
 
-       kfree(job);
+       cs_job_put(job);
 }
 
 static void cs_do_release(struct kref *ref)
@@ -290,13 +302,13 @@ static void cs_do_release(struct kref *ref)
        /*
         * Although if we reached here it means that all external jobs have
         * finished, because each one of them took refcnt to CS, we still
-        * need to go over the internal jobs and free them. Otherwise, we
+        * need to go over the internal jobs and complete them. Otherwise, we
         * will have leaked memory and what's worse, the CS object (and
         * potentially the CTX object) could be released, while the JOB
         * still holds a pointer to them (but no reference).
         */
        list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-               free_job(hdev, job);
+               complete_job(hdev, job);
 
        if (!cs->submitted) {
                /* In case the wait for signal CS was submitted, the put occurs
@@ -507,7 +519,7 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
        struct hl_cs_job *job, *tmp;
 
        list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-               free_job(hdev, job);
+               complete_job(hdev, job);
 }
 
 void hl_cs_rollback_all(struct hl_device *hdev)
@@ -539,7 +551,7 @@ static void job_wq_completion(struct work_struct *work)
        struct hl_device *hdev = cs->ctx->hdev;
 
        /* job is no longer needed */
-       free_job(hdev, job);
+       complete_job(hdev, job);
 }
 
 static int validate_queue_index(struct hl_device *hdev,
@@ -647,6 +659,7 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
        if (!job)
                return NULL;
 
+       kref_init(&job->refcount);
        job->queue_type = queue_type;
        job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
index 912ddfa..b44193e 100644 (file)
@@ -168,18 +168,19 @@ static int command_submission_jobs_show(struct seq_file *s, void *data)
                if (first) {
                        first = false;
                        seq_puts(s, "\n");
-                       seq_puts(s, " JOB ID   CS ID    CTX ASID   H/W Queue\n");
-                       seq_puts(s, "---------------------------------------\n");
+                       seq_puts(s, " JOB ID   CS ID    CTX ASID   JOB RefCnt   H/W Queue\n");
+                       seq_puts(s, "----------------------------------------------------\n");
                }
                if (job->cs)
                        seq_printf(s,
-                               "    %02d       %llu         %d         %d\n",
+                               "   %02d      %llu        %d          %d           %d\n",
                                job->id, job->cs->sequence, job->cs->ctx->asid,
-                               job->hw_queue_id);
+                               kref_read(&job->refcount), job->hw_queue_id);
                else
                        seq_printf(s,
-                               "    %02d       0         %d         %d\n",
-                               job->id, HL_KERNEL_ASID_ID, job->hw_queue_id);
+                               "   %02d      0        %d          %d           %d\n",
+                               job->id, HL_KERNEL_ASID_ID,
+                               kref_read(&job->refcount), job->hw_queue_id);
        }
 
        spin_unlock(&dev_entry->cs_job_spinlock);
index 0f4f8ef..81ff340 100644 (file)
@@ -1142,6 +1142,7 @@ struct hl_cs {
  * @userptr_list: linked-list of userptr mappings that belong to this job and
  *                     wait for completion.
  * @debugfs_list: node in debugfs list of command submission jobs.
+ * @refcount: reference counter for usage of the CS job.
  * @queue_type: the type of the H/W queue this job is submitted to.
  * @id: the id of this job inside a CS.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
@@ -1165,6 +1166,7 @@ struct hl_cs_job {
        struct work_struct      finish_work;
        struct list_head        userptr_list;
        struct list_head        debugfs_list;
+       struct kref             refcount;
        enum hl_queue_type      queue_type;
        u32                     id;
        u32                     hw_queue_id;