habanalabs/gaudi2: new API to control engine cores running mode
authorTal Cohen <talcohen@habana.ai>
Thu, 7 Jul 2022 15:42:47 +0000 (18:42 +0300)
committerOded Gabbay <ogabbay@kernel.org>
Sun, 18 Sep 2022 10:29:51 +0000 (13:29 +0300)
In the current flow, halting the engine cores is done via command
buffers that user space builds and sends to the driver.

This flow is broken: since sending a workload is an asynchronous
operation, user space does not know when the cores actually halt.

Therefore, the application cannot free the memory that is mapped
to the engine cores.

The new API allows user space to control the engine cores running
mode. The API call is synchronous: it returns only after the cores
have been set to the requested mode.
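
As a rough user-space sketch (assuming the uapi definitions added
below; the helper name and the <misc/habanalabs.h> include path are
illustrative, while HL_IOCTL_CS is the existing CS ioctl from the
uapi header):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <misc/habanalabs.h>

    /* Ask the driver to halt the given engine cores. Blocks until
     * all of the cores have acked the halt request.
     */
    static int halt_engine_cores(int fd, const uint32_t *core_ids,
                                 uint32_t num_cores)
    {
            union hl_cs_args args;

            memset(&args, 0, sizeof(args));
            args.in.engine_cores = (uint64_t) (uintptr_t) core_ids;
            args.in.num_engine_cores = num_cores;
            args.in.core_command = HL_ENGINE_CORE_HALT;
            args.in.cs_flags = HL_CS_FLAGS_ENGINE_CORE_COMMAND;

            return ioctl(fd, HL_IOCTL_CS, &args);
    }

Once the call returns successfully, it is safe to free the memory
that is mapped to the halted cores.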

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi2/gaudi2.c
drivers/misc/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h
include/uapi/misc/habanalabs.h

index 304e4f3b0e7ec0a6c27dc7253e7a2b362f296cf0..cf411851567876368d58c63d1d93001ace4ce35c 100644 (file)
@@ -13,7 +13,7 @@
 
 #define HL_CS_FLAGS_TYPE_MASK  (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
                        HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
-                       HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
+                       HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND)
 
 
 #define MAX_TS_ITER_NUM 10
@@ -1244,6 +1244,8 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
                return CS_RESERVE_SIGNALS;
        else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
                return CS_UNRESERVE_SIGNALS;
+       else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
+               return CS_TYPE_ENGINE_CORE;
        else
                return CS_TYPE_DEFAULT;
 }
@@ -2355,6 +2357,41 @@ out:
        return rc;
 }
 
+static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
+                                               u32 num_engine_cores, u32 core_command)
+{
+       int rc;
+       struct hl_device *hdev = hpriv->hdev;
+       void __user *engine_cores_arr;
+       u32 *cores;
+
+       if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
+               dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
+               return -EINVAL;
+       }
+
+       if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) {
+               dev_err(hdev->dev, "Engine core command is invalid\n");
+               return -EINVAL;
+       }
+
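+       /* Copy the user-space array of engine core IDs into kernel memory */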
+       engine_cores_arr = (void __user *) (uintptr_t) engine_cores;
+       cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL);
+       if (!cores)
+               return -ENOMEM;
+
+       if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) {
+               dev_err(hdev->dev, "Failed to copy core-ids array from user\n");
+               kfree(cores);
+               return -EFAULT;
+       }
+
+       rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command);
+       kfree(cores);
+
+       return rc;
+}
+
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
        union hl_cs_args *args = data;
@@ -2407,6 +2444,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
                rc = cs_ioctl_unreserve_signals(hpriv,
                                        args->in.encaps_sig_handle_id);
                break;
+       case CS_TYPE_ENGINE_CORE:
+               rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
+                               args->in.num_engine_cores, args->in.core_command);
+               break;
        default:
                rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
                                                args->in.cs_flags,
index 8d9e96c6092a63245c37bce16a38d59e3ce48fd8..ae3f5832fe58273079ec3f579035e21085e1fe54 100644 (file)
@@ -345,7 +345,8 @@ enum hl_cs_type {
        CS_TYPE_WAIT,
        CS_TYPE_COLLECTIVE_WAIT,
        CS_RESERVE_SIGNALS,
-       CS_UNRESERVE_SIGNALS
+       CS_UNRESERVE_SIGNALS,
+       CS_TYPE_ENGINE_CORE
 };
 
 /*
@@ -617,6 +618,7 @@ struct hl_hints_range {
  *                                      which the property supports_user_set_page_size is true
  *                                      (i.e. the DRAM supports multiple page sizes), otherwise
 *                                      it shall be equal to dram_page_size.
+ * @num_engine_cores: number of engine CPU cores
  * @collective_first_sob: first sync object available for collective use
  * @collective_first_mon: first monitor available for collective use
  * @sync_stream_first_sob: first sync object available for sync stream use
@@ -737,6 +739,7 @@ struct asic_fixed_properties {
        u32                             faulty_dram_cluster_map;
        u32                             xbar_edge_enabled_mask;
        u32                             device_mem_alloc_default_page_size;
+       u32                             num_engine_cores;
        u16                             collective_first_sob;
        u16                             collective_first_mon;
        u16                             sync_stream_first_sob;
@@ -1511,6 +1514,7 @@ struct engines_data {
  * @check_if_razwi_happened: check if there was a razwi due to RR violation.
  * @access_dev_mem: access device memory
  * @set_dram_bar_base: set the base of the DRAM BAR
+ * @set_engine_cores: set a config command to engine cores
  */
 struct hl_asic_funcs {
        int (*early_init)(struct hl_device *hdev);
@@ -1645,6 +1649,8 @@ struct hl_asic_funcs {
        int (*access_dev_mem)(struct hl_device *hdev, enum pci_region region_type,
                                u64 addr, u64 *val, enum debugfs_access_type acc_type);
        u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
+       int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids,
+                                       u32 num_cores, u32 core_command);
 };
 
 
index 9ccde0258ecadbe24148ece193b4d60431aa85d3..676419961f867201d6c96ea40ff41b4c6571d524 100644 (file)
@@ -1989,6 +1989,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev)
                prop->pmmu_huge.end_addr = VA_HOST_SPACE_HPAGE_END;
        }
 
+       prop->num_engine_cores = CPU_ID_MAX;
        prop->cfg_size = CFG_SIZE;
        prop->max_asid = MAX_ASID;
        prop->num_of_events = GAUDI2_EVENT_SIZE;
@@ -3751,14 +3752,16 @@ static void gaudi2_stop_dec(struct hl_device *hdev)
        gaudi2_stop_pcie_dec(hdev);
 }
 
-static void gaudi2_halt_arc(struct hl_device *hdev, u32 cpu_id)
+static void gaudi2_set_arc_running_mode(struct hl_device *hdev, u32 cpu_id, u32 run_mode)
 {
        u32 reg_base, reg_val;
 
        reg_base = gaudi2_arc_blocks_bases[cpu_id];
+       if (run_mode == HL_ENGINE_CORE_RUN)
+               reg_val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_RUN_REQ_MASK, 1);
+       else
+               reg_val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_HALT_REQ_MASK, 1);
 
-       /* Halt ARC */
-       reg_val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_HALT_REQ_MASK, 1);
        WREG32(reg_base + ARC_HALT_REQ_OFFSET, reg_val);
 }
 
@@ -3768,10 +3771,37 @@ static void gaudi2_halt_arcs(struct hl_device *hdev)
 
        for (arc_id = CPU_ID_SCHED_ARC0; arc_id < CPU_ID_MAX; arc_id++) {
                if (gaudi2_is_arc_enabled(hdev, arc_id))
-                       gaudi2_halt_arc(hdev, arc_id);
+                       gaudi2_set_arc_running_mode(hdev, arc_id, HL_ENGINE_CORE_HALT);
        }
 }
 
+static int gaudi2_verify_arc_running_mode(struct hl_device *hdev, u32 cpu_id, u32 run_mode)
+{
+       int rc;
+       u32 reg_base, val, ack_mask, timeout_usec = 100000;
+
+       if (hdev->pldm)
+               timeout_usec *= 100;
+
+       reg_base = gaudi2_arc_blocks_bases[cpu_id];
+       if (run_mode == HL_ENGINE_CORE_RUN)
+               ack_mask = ARC_FARM_ARC0_AUX_RUN_HALT_ACK_RUN_ACK_MASK;
+       else
+               ack_mask = ARC_FARM_ARC0_AUX_RUN_HALT_ACK_HALT_ACK_MASK;
+
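+       /* Poll until the ARC acknowledges the run/halt request */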
+       rc = hl_poll_timeout(hdev, reg_base + ARC_HALT_ACK_OFFSET,
+                               val, ((val & ack_mask) == ack_mask),
+                               1000, timeout_usec);
+
+       if (!rc) {
+               /* Clear the run/halt request now that the core acked */
+               val = FIELD_PREP(ARC_FARM_ARC0_AUX_RUN_HALT_REQ_RUN_REQ_MASK, 0);
+               WREG32(reg_base + ARC_HALT_REQ_OFFSET, val);
+       }
+
+       return rc;
+}
+
 static void gaudi2_reset_arcs(struct hl_device *hdev)
 {
        struct gaudi2_device *gaudi2 = hdev->asic_specific;
@@ -3796,8 +3826,39 @@ static void gaudi2_nic_qmans_manual_flush(struct hl_device *hdev)
 
        queue_id = GAUDI2_QUEUE_ID_NIC_0_0;
 
-       for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++, queue_id += NUM_OF_PQ_PER_QMAN)
+       for (i = 0 ; i < NIC_NUMBER_OF_ENGINES ; i++, queue_id += NUM_OF_PQ_PER_QMAN) {
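+               /* Skip disabled NIC ports */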
+               if (!(hdev->nic_ports_mask & BIT(i)))
+                       continue;
+
                gaudi2_qman_manual_flush_common(hdev, queue_id);
+       }
+}
+
+static int gaudi2_set_engine_cores(struct hl_device *hdev, u32 *core_ids,
+                                       u32 num_cores, u32 core_command)
+{
+       int i, rc;
+
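+       /* First, request the new running mode on every enabled core */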
+       for (i = 0 ; i < num_cores ; i++) {
+               if (gaudi2_is_arc_enabled(hdev, core_ids[i]))
+                       gaudi2_set_arc_running_mode(hdev, core_ids[i], core_command);
+       }
+
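+       /* Then poll each enabled core for an ack of the requested mode */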
+       for (i = 0 ; i < num_cores ; i++) {
+               if (gaudi2_is_arc_enabled(hdev, core_ids[i])) {
+                       rc = gaudi2_verify_arc_running_mode(hdev, core_ids[i], core_command);
+
+                       if (rc) {
+                               dev_err(hdev->dev, "failed to %s arc: %d\n",
+                                       (core_command == HL_ENGINE_CORE_HALT) ?
+                                       "HALT" : "RUN", core_ids[i]);
+                               return rc;
+                       }
+               }
+       }
+
+       return 0;
 }
 
 static void gaudi2_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
@@ -9968,6 +10029,7 @@ static const struct hl_asic_funcs gaudi2_funcs = {
        .mmu_get_real_page_size = gaudi2_mmu_get_real_page_size,
        .access_dev_mem = hl_access_dev_mem,
        .set_dram_bar_base = gaudi2_set_hbm_bar_base,
+       .set_engine_cores = gaudi2_set_engine_cores,
 };
 
 void gaudi2_set_asic_funcs(struct hl_device *hdev)
index d0e2c68a639fbd8a48f3ce77ca98de9f456afaed..bfda4223bdc89d76b06017fdfc0aa8b63df76914 100644 (file)
 #define SFT_IF_RTR_OFFSET      (mmSFT0_HBW_RTR_IF1_RTR_H3_BASE - mmSFT0_HBW_RTR_IF0_RTR_H3_BASE)
 
 #define ARC_HALT_REQ_OFFSET    (mmARC_FARM_ARC0_AUX_RUN_HALT_REQ - mmARC_FARM_ARC0_AUX_BASE)
+#define ARC_HALT_ACK_OFFSET    (mmARC_FARM_ARC0_AUX_RUN_HALT_ACK - mmARC_FARM_ARC0_AUX_BASE)
 
 #define ARC_REGION_CFG_OFFSET(region) \
        (mmARC_FARM_ARC0_AUX_ARC_REGION_CFG_0 + (region * 4) - mmARC_FARM_ARC0_AUX_BASE)
index 0da8894ab94a052c87b12ccc35067f25e1028cfd..f51c6ae4f94d0956c7d0fefd1b838ebb4771930c 100644 (file)
@@ -1361,17 +1361,47 @@ struct hl_cs_chunk {
 #define HL_CS_FLAGS_RESERVE_SIGNALS_ONLY       0x1000
 #define HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY     0x2000
 
+/*
+ * The engine cores command is merged into the existing CS ioctls.
+ * Use it to control the engine cores running mode.
+ */
+#define HL_CS_FLAGS_ENGINE_CORE_COMMAND                0x4000
+
 #define HL_CS_STATUS_SUCCESS           0
 
 #define HL_MAX_JOBS_PER_CS             512
 
+/* HL_ENGINE_CORE_ values
+ *
+ * HL_ENGINE_CORE_HALT: engine core halt
+ * HL_ENGINE_CORE_RUN:  engine core run
+ */
+#define HL_ENGINE_CORE_HALT    (1 << 0)
+#define HL_ENGINE_CORE_RUN     (1 << 1)
+
 struct hl_cs_in {
 
-       /* this holds address of array of hl_cs_chunk for restore phase */
-       __u64 chunks_restore;
+       union {
+               struct {
+                       /* this holds address of array of hl_cs_chunk for restore phase */
+                       __u64 chunks_restore;
 
-       /* holds address of array of hl_cs_chunk for execution phase */
-       __u64 chunks_execute;
+                       /* holds address of array of hl_cs_chunk for execution phase */
+                       __u64 chunks_execute;
+               };
+
+               /* Valid only when HL_CS_FLAGS_ENGINE_CORE_COMMAND is set */
+               struct {
+                       /* this holds address of array of uint32 for engine_cores */
+                       __u64 engine_cores;
+
+                       /* number of engine cores in engine_cores array */
+                       __u32 num_engine_cores;
+
+                       /* the core command to be sent towards engine cores */
+                       __u32 core_command;
+               };
+       };
 
        union {
                /*