habanalabs: modify multi-CS to wait on stream masters
authorOhad Sharabi <osharabi@habana.ai>
Sun, 1 Aug 2021 20:02:07 +0000 (23:02 +0300)
committerOded Gabbay <ogabbay@kernel.org>
Wed, 1 Sep 2021 15:38:24 +0000 (18:38 +0300)
During the integration, the multi-CS requirements were refined:
- The multi CS call shall wait on "per-ASIC" predefined stream masters
  instead of a set of streams.
- Stream masters are a set of QIDs used by the upper SW layers (synapse)
  for completion (each must be an external/HW queue).

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_submission.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/hw_queue.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/gaudi/gaudiP.h
drivers/misc/habanalabs/goya/goya.c

index d71bd48..3a67265 100644 (file)
@@ -487,14 +487,15 @@ static void force_complete_multi_cs(struct hl_device *hdev)
  *
  * @hdev: pointer to habanalabs device structure
  * @cs: CS structure
- *
- * The function signals waiting entity that its waiting stream has common
- * stream with the completed CS.
+ * The function signals a waiting entity whose stream masters overlap with
+ * those of the completed CS.
  * For example:
- * - a completed CS worked on streams 0 and 1, multi CS completion
- *   is actively waiting on stream 3. don't send signal as no common stream
- * - a completed CS worked on streams 0 and 1, multi CS completion
- *   is actively waiting on streams 1 and 3. send signal as stream 1 is common
+ * - a completed CS worked on stream master QID 4, multi CS completion
+ *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
+ *   common stream master QID
+ * - a completed CS worked on stream master QID 4, multi CS completion
+ *   is actively waiting on stream master QIDs 3, 4. send signal as stream
+ *   master QID 4 is common
  */
 static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 {
@@ -518,10 +519,11 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
                 * complete if:
                 * 1. still waiting for completion
                 * 2. the completed CS has at least one overlapping stream
-                *    with the streams in the completion
+                *    master with the stream masters in the completion
                 */
                if (mcs_compl->used &&
-                               (fence->stream_map & mcs_compl->stream_map)) {
+                               (fence->stream_master_qid_map &
+                                       mcs_compl->stream_master_qid_map)) {
                        /* extract the timestamp only of first completed CS */
                        if (!mcs_compl->timestamp)
                                mcs_compl->timestamp =
@@ -1228,6 +1230,17 @@ static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
        return 0;
 }
 
+static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
+{
+       int i;
+
+       for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
+               if (qid == hdev->stream_master_qid_arr[i])
+                       return BIT(i); /* bit i <=> entry i of the array */
+
+       return 0; /* qid is not in the stream master QID array */
+}
+
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
                                u32 num_chunks, u64 *cs_seq, u32 flags,
                                u32 encaps_signals_handle, u32 timeout)
@@ -1241,7 +1254,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
        struct hl_cs *cs;
        struct hl_cb *cb;
        u64 user_sequence;
-       u8 stream_map = 0;
+       u8 stream_master_qid_map = 0;
        int rc, i;
 
        cntr = &hdev->aggregated_cs_counters;
@@ -1310,7 +1323,9 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
                         * queues of this CS
                         */
                        if (hdev->supports_wait_for_multi_cs)
-                               stream_map |= BIT((chunk->queue_index % 4));
+                               stream_master_qid_map |=
+                                       get_stream_master_qid_mask(hdev,
+                                                       chunk->queue_index);
                }
 
                job = hl_cs_allocate_job(hdev, queue_type,
@@ -1378,7 +1393,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
         * fence object for multi-CS completion
         */
        if (hdev->supports_wait_for_multi_cs)
-               cs->fence->stream_map = stream_map;
+               cs->fence->stream_master_qid_map = stream_master_qid_map;
 
        rc = hl_hw_queue_schedule_cs(cs);
        if (rc) {
@@ -2332,7 +2347,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
                        break;
                }
 
-               mcs_data->stream_map |= fence->stream_map;
+               mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
 
                if (status == CS_WAIT_STATUS_BUSY)
                        continue;
@@ -2394,7 +2409,8 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  * hl_wait_multi_cs_completion_init - init completion structure
  *
  * @hdev: pointer to habanalabs device structure
- * @stream_map: stream map, set bit indicates stream to wait on
+ * @stream_master_bitmap: stream master QIDs map, set bit indicates stream
+ *                        master QID to wait on
  *
  * @return valid completion struct pointer on success, otherwise error pointer
  *
@@ -2404,7 +2420,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  */
 static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
                                                        struct hl_device *hdev,
-                                                       u8 stream_map)
+                                                       u8 stream_master_bitmap)
 {
        struct multi_cs_completion *mcs_compl;
        int i;
@@ -2416,7 +2432,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
                if (!mcs_compl->used) {
                        mcs_compl->used = 1;
                        mcs_compl->timestamp = 0;
-                       mcs_compl->stream_map = stream_map;
+                       mcs_compl->stream_master_qid_map = stream_master_bitmap;
                        reinit_completion(&mcs_compl->completion);
                        spin_unlock(&mcs_compl->lock);
                        break;
@@ -2464,7 +2480,7 @@ static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
        long completion_rc;
 
        mcs_compl = hl_wait_multi_cs_completion_init(hdev,
-                                                       mcs_data->stream_map);
+                                       mcs_data->stream_master_qid_map);
        if (IS_ERR(mcs_compl))
                return PTR_ERR(mcs_compl);
 
index 5c7f26e..c4a482b 100644 (file)
@@ -592,18 +592,18 @@ struct asic_fixed_properties {
  * @completion: fence is implemented using completion
  * @refcount: refcount for this fence
  * @cs_sequence: sequence of the corresponding command submission
+ * @stream_master_qid_map: stream masters QID bitmap representing all stream
+ *                         master QIDs that multi-CS is waiting on
  * @error: mark this fence with error
  * @timestamp: timestamp upon completion
- * @stream_map: streams bitmap to represent all streams that multi cs is
- *              waiting on
  */
 struct hl_fence {
        struct completion       completion;
        struct kref             refcount;
        u64                     cs_sequence;
+       u32                     stream_master_qid_map;
        int                     error;
        ktime_t                 timestamp;
-       u8                      stream_map;
 };
 
 /**
@@ -1160,6 +1160,7 @@ struct fw_load_mgr {
  * @state_dump_init: initialize constants required for state dump
  * @get_sob_addr: get SOB base address offset.
  * @set_pci_memory_regions: setting properties of PCI memory regions
+ * @get_stream_master_qid_arr: get pointer to stream masters QID array
  */
 struct hl_asic_funcs {
        int (*early_init)(struct hl_device *hdev);
@@ -1289,6 +1290,7 @@ struct hl_asic_funcs {
        void (*state_dump_init)(struct hl_device *hdev);
        u32 (*get_sob_addr)(struct hl_device *hdev, u32 sob_id);
        void (*set_pci_memory_regions)(struct hl_device *hdev);
+       u32* (*get_stream_master_qid_arr)(void);
 };
 
 
@@ -2263,16 +2265,16 @@ struct hl_mmu_funcs {
  * @completion: completion of any of the CS in the list
  * @lock: spinlock for the completion structure
  * @timestamp: timestamp for the multi-CS completion
+ * @stream_master_qid_map: bitmap of all stream masters on which the multi-CS
+ *                        is waiting
  * @used: 1 if in use, otherwise 0
- * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
- *              is waiting
  */
 struct multi_cs_completion {
        struct completion       completion;
        spinlock_t              lock;
        s64                     timestamp;
+       u32                     stream_master_qid_map;
        u8                      used;
-       u8                      stream_map;
 };
 
 /**
@@ -2284,9 +2286,9 @@ struct multi_cs_completion {
  * @timestamp: timestamp of first completed CS
  * @wait_status: wait for CS status
  * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
+ * @stream_master_qid_map: bitmap of all stream master QIDs on which the
+ *                         multi-CS is waiting
  * @arr_len: fence_arr and seq_arr array length
- * @stream_map: bitmap of all HW/external queues streams on which the multi-CS
- *              is waiting
  * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
  * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
  */
@@ -2298,8 +2300,8 @@ struct multi_cs_data {
        s64             timestamp;
        long            wait_status;
        u32             completion_bitmap;
+       u32             stream_master_qid_map;
        u8              arr_len;
-       u8              stream_map;
        u8              gone_cs;
        u8              update_ts;
 };
@@ -2520,6 +2522,7 @@ struct hl_device {
 
        struct multi_cs_completion      multi_cs_completion[
                                                        MULTI_CS_MAX_USER_CTX];
+       u32                             *stream_master_qid_arr;
        atomic64_t                      dram_used_mem;
        u64                             timeout_jiffies;
        u64                             max_power;
@@ -2570,6 +2573,7 @@ struct hl_device {
        u8                              skip_reset_on_timeout;
        u8                              device_cpu_is_halted;
        u8                              supports_wait_for_multi_cs;
+       u8                              stream_master_qid_arr_size;
 
        /* Parameters for bring-up */
        u64                             nic_ports_mask;
index 6d3becc..76b7de8 100644 (file)
@@ -721,7 +721,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
                /* update stream map of the first CS */
                if (hdev->supports_wait_for_multi_cs)
-                       staged_cs->fence->stream_map |= cs->fence->stream_map;
+                       staged_cs->fence->stream_master_qid_map |=
+                                       cs->fence->stream_master_qid_map;
        }
 
        list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
index 27d996a..a05688c 100644 (file)
 
 #define MONITOR_SOB_STRING_SIZE                256
 
+static u32 gaudi_stream_master[GAUDI_STREAM_MASTER_ARR_SIZE] = { /* QIDs */
+       GAUDI_QUEUE_ID_DMA_0_0,
+       GAUDI_QUEUE_ID_DMA_0_1,
+       GAUDI_QUEUE_ID_DMA_0_2,
+       GAUDI_QUEUE_ID_DMA_0_3,
+       GAUDI_QUEUE_ID_DMA_1_0,
+       GAUDI_QUEUE_ID_DMA_1_1,
+       GAUDI_QUEUE_ID_DMA_1_2,
+       GAUDI_QUEUE_ID_DMA_1_3
+}; /* handed out via gaudi_get_stream_master_qid_arr() */
+
 static const char gaudi_irq_name[GAUDI_MSI_ENTRIES][GAUDI_MAX_STRING_LEN] = {
                "gaudi cq 0_0", "gaudi cq 0_1", "gaudi cq 0_2", "gaudi cq 0_3",
                "gaudi cq 1_0", "gaudi cq 1_1", "gaudi cq 1_2", "gaudi cq 1_3",
@@ -1870,6 +1881,9 @@ static int gaudi_sw_init(struct hl_device *hdev)
        hdev->supports_wait_for_multi_cs = true;
 
        hdev->asic_funcs->set_pci_memory_regions(hdev);
+       hdev->stream_master_qid_arr =
+                               hdev->asic_funcs->get_stream_master_qid_arr();
+       hdev->stream_master_qid_arr_size = GAUDI_STREAM_MASTER_ARR_SIZE;
 
        return 0;
 
@@ -9352,6 +9366,11 @@ static void gaudi_state_dump_init(struct hl_device *hdev)
        sds->funcs = gaudi_state_dump_funcs;
 }
 
+static u32 *gaudi_get_stream_master_qid_arr(void)
+{
+       return gaudi_stream_master; /* file-scope stream master QID table */
+}
+
 static const struct hl_asic_funcs gaudi_funcs = {
        .early_init = gaudi_early_init,
        .early_fini = gaudi_early_fini,
@@ -9440,7 +9459,8 @@ static const struct hl_asic_funcs gaudi_funcs = {
        .init_cpu_scrambler_dram = gaudi_init_scrambler_hbm,
        .state_dump_init = gaudi_state_dump_init,
        .get_sob_addr = gaudi_get_sob_addr,
-       .set_pci_memory_regions = gaudi_set_pci_memory_regions
+       .set_pci_memory_regions = gaudi_set_pci_memory_regions,
+       .get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr
 };
 
 /**
index eacc5ea..2f0928c 100644 (file)
@@ -36,6 +36,8 @@
 #define NUMBER_OF_INTERRUPTS           (NUMBER_OF_CMPLT_QUEUES + \
                                                NUMBER_OF_CPU_HW_QUEUES)
 
+#define GAUDI_STREAM_MASTER_ARR_SIZE   8
+
 #if (NUMBER_OF_INTERRUPTS > GAUDI_MSI_ENTRIES)
 #error "Number of MSI interrupts must be smaller or equal to GAUDI_MSI_ENTRIES"
 #endif
index d956088..89f8a05 100644 (file)
@@ -5588,6 +5588,11 @@ static u32 goya_get_sob_addr(struct hl_device *hdev, u32 sob_id)
        return 0;
 }
 
+static u32 *goya_get_stream_master_qid_arr(void)
+{
+       return NULL; /* no stream master QID array is returned for Goya */
+}
+
 static const struct hl_asic_funcs goya_funcs = {
        .early_init = goya_early_init,
        .early_fini = goya_early_fini,
@@ -5677,6 +5682,7 @@ static const struct hl_asic_funcs goya_funcs = {
        .state_dump_init = goya_state_dump_init,
        .get_sob_addr = &goya_get_sob_addr,
        .set_pci_memory_regions = goya_set_pci_memory_regions,
+       .get_stream_master_qid_arr = goya_get_stream_master_qid_arr,
 };
 
 /*