habanalabs: do MMU prefetch as deferred work
author Ohad Sharabi <osharabi@habana.ai>
Mon, 11 Apr 2022 06:31:32 +0000 (09:31 +0300)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 22 May 2022 19:01:21 +0000 (21:01 +0200)
When the user requests to prefetch MMU translations, the driver will
no longer block the user until the prefetch is done.
Instead, the prefetch work is delegated to a WQ, which performs it
in the background.
This way, the prefetch can progress without blocking the user at all.
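
In outline, the patch applies the standard fire-and-forget workqueue
pattern. Below is a minimal sketch of that pattern; prefetch_work,
prefetch_handler and queue_prefetch are simplified, illustrative names,
not the driver's actual identifiers (those are in the mmu.c hunk below):

	#include <linux/types.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	/* sketch only: each request gets its own heap-allocated work item */
	struct prefetch_work {
		struct work_struct work;	/* embedded so container_of() recovers us */
		u64 va;
		u64 size;
	};

	static void prefetch_handler(struct work_struct *work)
	{
		struct prefetch_work *pfw =
			container_of(work, struct prefetch_work, work);

		/* ... prefetch translations for [pfw->va, pfw->va + pfw->size) ... */

		kfree(pfw);	/* the work item owns its own allocation */
	}

	static int queue_prefetch(struct workqueue_struct *wq, u64 va, u64 size)
	{
		struct prefetch_work *pfw = kmalloc(sizeof(*pfw), GFP_KERNEL);

		if (!pfw)
			return -ENOMEM;

		INIT_WORK(&pfw->work, prefetch_handler);
		pfw->va = va;
		pfw->size = size;

		/* queue_work() returns immediately; the caller never blocks */
		queue_work(wq, &pfw->work);
		return 0;
	}

Because the handler runs after the ioctl has returned, the patch also
takes a reference on the context before queueing and drops it in the
handler, and it flushes the queue on reset so no work item outlives the
device state it touches.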

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/memory.c
drivers/misc/habanalabs/common/mmu/mmu.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c

diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 0908ac3..b4f14c6 100644
@@ -612,11 +612,18 @@ static int device_early_init(struct hl_device *hdev)
                goto free_eq_wq;
        }
 
+       hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+       if (!hdev->pf_wq) {
+               dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
+               rc = -ENOMEM;
+               goto free_ts_free_wq;
+       }
+
        hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
                                        GFP_KERNEL);
        if (!hdev->hl_chip_info) {
                rc = -ENOMEM;
-               goto free_ts_free_wq;
+               goto free_pf_wq;
        }
 
        rc = hl_mmu_if_set_funcs(hdev);
@@ -655,6 +662,8 @@ free_cb_mgr:
        hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
 free_chip_info:
        kfree(hdev->hl_chip_info);
+free_pf_wq:
+       destroy_workqueue(hdev->pf_wq);
 free_ts_free_wq:
        destroy_workqueue(hdev->ts_free_obj_wq);
 free_eq_wq:
@@ -695,6 +704,7 @@ static void device_early_fini(struct hl_device *hdev)
 
        kfree(hdev->hl_chip_info);
 
+       destroy_workqueue(hdev->pf_wq);
        destroy_workqueue(hdev->ts_free_obj_wq);
        destroy_workqueue(hdev->eq_wq);
        destroy_workqueue(hdev->device_reset_work.wq);
@@ -891,6 +901,9 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r
        /* Go over all the queues, release all CS and their jobs */
        hl_cs_rollback_all(hdev, skip_wq_flush);
 
+       /* flush the MMU prefetch workqueue */
+       flush_workqueue(hdev->pf_wq);
+
        /* Release all pending user interrupts, each pending user interrupt
         * holds a reference to user context
         */
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 8977ec6..632037b 100644
@@ -1402,8 +1402,7 @@ struct hl_asic_funcs {
                                        u32 flags);
        int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
                                u32 flags, u32 asid, u64 va, u64 size);
-       int (*mmu_prefetch_cache_range)(struct hl_device *hdev, u32 flags, u32 asid, u64 va,
-                               u64 size);
+       int (*mmu_prefetch_cache_range)(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size);
        int (*send_heartbeat)(struct hl_device *hdev);
        int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data);
        bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
@@ -2476,6 +2475,24 @@ struct hl_mmu_funcs {
 };
 
 /**
+ * struct hl_prefetch_work - MMU prefetch work descriptor
+ * @pf_work: actual work struct.
+ * @ctx: compute context.
+ * @va: virtual address to pre-fetch.
+ * @size: pre-fetch size.
+ * @flags: operation flags.
+ * @asid: ASID for maintenance operation.
+ */
+struct hl_prefetch_work {
+       struct work_struct      pf_work;
+       struct hl_ctx           *ctx;
+       u64                     va;
+       u64                     size;
+       u32                     flags;
+       u32                     asid;
+};
+
+/*
  * number of user contexts allowed to call wait_for_multi_cs ioctl in
  * parallel
  */
@@ -2648,6 +2665,7 @@ struct hl_reset_info {
  *         context.
  * @eq_wq: work queue of event queue for executing work in process context.
  * @ts_free_obj_wq: work queue for timestamp registration objects release.
+ * @pf_wq: work queue for MMU pre-fetch operations.
  * @kernel_ctx: Kernel driver context structure.
  * @kernel_queues: array of hl_hw_queue.
  * @cs_mirror_list: CS mirror list for TDR.
@@ -2760,6 +2778,7 @@ struct hl_reset_info {
  * @supports_wait_for_multi_cs: true if wait for multi CS is supported
  * @is_compute_ctx_active: Whether there is an active compute context executing.
  * @compute_ctx_in_release: true if the current compute context is being released.
+ * @supports_mmu_prefetch: true if prefetch is supported, otherwise false.
  */
 struct hl_device {
        struct pci_dev                  *pdev;
@@ -2781,6 +2800,7 @@ struct hl_device {
        struct workqueue_struct         **cq_wq;
        struct workqueue_struct         *eq_wq;
        struct workqueue_struct         *ts_free_obj_wq;
+       struct workqueue_struct         *pf_wq;
        struct hl_ctx                   *kernel_ctx;
        struct hl_hw_queue              *kernel_queues;
        struct list_head                cs_mirror_list;
@@ -2882,6 +2902,7 @@ struct hl_device {
        u8                              stream_master_qid_arr_size;
        u8                              is_compute_ctx_active;
        u8                              compute_ctx_in_release;
+       u8                              supports_mmu_prefetch;
 
        /* Parameters for bring-up */
        u64                             nic_ports_mask;
@@ -3163,7 +3184,7 @@ int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size);
 int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags);
 int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
                                        u32 flags, u32 asid, u64 va, u64 size);
-int hl_mmu_prefetch_cache_range(struct hl_device *hdev, u32 flags, u32 asid, u64 va, u64 size);
+int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size);
 u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte);
 u64 hl_mmu_get_hop_pte_phys_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop,
                                        u8 hop_idx, u64 hop_addr, u64 virt_addr);
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index ecf3c09..087a556 100644
@@ -1102,21 +1102,24 @@ static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
  *   map a device virtual block to this pages and return the start address of
  *   this block.
  */
-static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
-               u64 *device_addr)
+static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr)
 {
-       struct hl_device *hdev = ctx->hdev;
-       struct hl_vm *vm = &hdev->vm;
        struct hl_vm_phys_pg_pack *phys_pg_pack;
+       enum hl_va_range_type va_range_type = 0;
+       struct hl_device *hdev = ctx->hdev;
        struct hl_userptr *userptr = NULL;
+       u32 handle = 0, va_block_align;
        struct hl_vm_hash_node *hnode;
+       struct hl_vm *vm = &hdev->vm;
        struct hl_va_range *va_range;
-       enum vm_type *vm_type;
+       bool is_userptr, do_prefetch;
        u64 ret_vaddr, hint_addr;
-       u32 handle = 0, va_block_align;
+       enum vm_type *vm_type;
        int rc;
-       bool is_userptr = args->flags & HL_MEM_USERPTR;
-       enum hl_va_range_type va_range_type = 0;
+
+       /* set map flags */
+       is_userptr = args->flags & HL_MEM_USERPTR;
+       do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);
 
        /* Assume failure */
        *device_addr = 0;
@@ -1250,15 +1253,19 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
        if (rc)
                goto map_err;
 
-       if (args->flags & HL_MEM_PREFETCH) {
-               rc = hl_mmu_prefetch_cache_range(hdev, *vm_type, ctx->asid, ret_vaddr,
+       mutex_unlock(&ctx->mmu_lock);
+
+       /*
+        * prefetch is done upon user's request. it is performed in a WQ and so can
+        * run outside the MMU lock. the operation itself is already protected by the MMU lock
+        */
+       if (do_prefetch) {
+               rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr,
                                                        phys_pg_pack->total_size);
                if (rc)
                        goto map_err;
        }
 
-       mutex_unlock(&ctx->mmu_lock);
-
        ret_vaddr += phys_pg_pack->offset;
 
        hnode->ptr = vm_type;
diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c
index 04e53af..ae9b492 100644
@@ -665,15 +665,53 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
        return rc;
 }
 
-int hl_mmu_prefetch_cache_range(struct hl_device *hdev, u32 flags, u32 asid, u64 va, u64 size)
+static void hl_mmu_prefetch_work_function(struct work_struct *work)
 {
-       int rc;
+       struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work);
+       struct hl_ctx *ctx = pfw->ctx;
 
-       rc = hdev->asic_funcs->mmu_prefetch_cache_range(hdev, flags, asid, va, size);
-       if (rc)
-               dev_err_ratelimited(hdev->dev, "MMU cache range prefetch failed\n");
+       if (!hl_device_operational(ctx->hdev, NULL))
+               goto put_ctx;
 
-       return rc;
+       mutex_lock(&ctx->mmu_lock);
+
+       ctx->hdev->asic_funcs->mmu_prefetch_cache_range(ctx, pfw->flags, pfw->asid,
+                                                               pfw->va, pfw->size);
+
+       mutex_unlock(&ctx->mmu_lock);
+
+put_ctx:
+       /*
+        * context was taken in the common mmu prefetch function - see comment there about
+        * context handling.
+        */
+       hl_ctx_put(ctx);
+       kfree(pfw);
+}
+
+int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size)
+{
+       struct hl_prefetch_work *handle_pf_work;
+
+       handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL);
+       if (!handle_pf_work)
+               return -ENOMEM;
+
+       INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function);
+       handle_pf_work->ctx = ctx;
+       handle_pf_work->va = va;
+       handle_pf_work->size = size;
+       handle_pf_work->flags = flags;
+       handle_pf_work->asid = asid;
+
+       /*
+        * as the actual prefetch is done in a WQ we must get the context (and put it
+        * at the end of the work function)
+        */
+       hl_ctx_get(ctx->hdev, ctx);
+       queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work);
+
+       return 0;
 }
 
 u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 1c38853..96a8331 100644
@@ -9174,12 +9174,6 @@ static void gaudi_get_valid_dram_page_orders(struct hl_info_dev_memalloc_page_si
        info->page_order_bitmask = 0;
 }
 
-static int gaudi_mmu_prefetch_cache_range(struct hl_device *hdev, u32 flags, u32 asid, u64 va,
-                                               u64 size)
-{
-       return 0;
-}
-
 static ssize_t infineon_ver_show(struct device *dev, struct device_attribute *attr, char *buf)
 {
        struct hl_device *hdev = dev_get_drvdata(dev);
@@ -9244,7 +9238,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .write_pte = gaudi_write_pte,
        .mmu_invalidate_cache = gaudi_mmu_invalidate_cache,
        .mmu_invalidate_cache_range = gaudi_mmu_invalidate_cache_range,
-       .mmu_prefetch_cache_range = gaudi_mmu_prefetch_cache_range,
+       .mmu_prefetch_cache_range = NULL,
        .send_heartbeat = gaudi_send_heartbeat,
        .debug_coresight = gaudi_debug_coresight,
        .is_device_idle = gaudi_is_device_idle,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index f2d4362..4cde505 100644
@@ -5428,12 +5428,6 @@ static int goya_get_monitor_dump(struct hl_device *hdev, void *data)
        return -EOPNOTSUPP;
 }
 
-static int goya_mmu_prefetch_cache_range(struct hl_device *hdev, u32 flags, u32 asid, u64 va,
-                                       u64 size)
-{
-       return 0;
-}
-
 static int goya_scrub_device_dram(struct hl_device *hdev, u64 val)
 {
        return -EOPNOTSUPP;
@@ -5480,7 +5474,7 @@ static const struct hl_asic_funcs goya_funcs = {
        .write_pte = goya_write_pte,
        .mmu_invalidate_cache = goya_mmu_invalidate_cache,
        .mmu_invalidate_cache_range = goya_mmu_invalidate_cache_range,
-       .mmu_prefetch_cache_range = goya_mmu_prefetch_cache_range,
+       .mmu_prefetch_cache_range = NULL,
        .send_heartbeat = goya_send_heartbeat,
        .debug_coresight = goya_debug_coresight,
        .is_device_idle = goya_is_device_idle,