From 79b1894c419468607ce547855e8636d42b456149 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 24 Jun 2020 14:49:43 +0300 Subject: [PATCH] habanalabs: use queue pi/ci in order to determine queue occupancy Instead of using the free slots amount on the compute CQ to determine whether we can submit work to queues, use the queues pi/ci. This is needed in future ASICs where we don't have CQ per queue. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/device.c | 17 +++++--- drivers/misc/habanalabs/habanalabs.h | 2 +- drivers/misc/habanalabs/hw_queue.c | 82 ++++++++++++------------------------ drivers/misc/habanalabs/irq.c | 7 +-- 4 files changed, 39 insertions(+), 69 deletions(-) diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c index 2b38a1197..65a5a5c 100644 --- a/drivers/misc/habanalabs/device.c +++ b/drivers/misc/habanalabs/device.c @@ -1144,14 +1144,17 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) * because there the addresses of the completion queues are being * passed as arguments to request_irq */ - hdev->completion_queue = kcalloc(cq_cnt, - sizeof(*hdev->completion_queue), - GFP_KERNEL); + if (cq_cnt) { + hdev->completion_queue = kcalloc(cq_cnt, + sizeof(*hdev->completion_queue), + GFP_KERNEL); - if (!hdev->completion_queue) { - dev_err(hdev->dev, "failed to allocate completion queues\n"); - rc = -ENOMEM; - goto hw_queues_destroy; + if (!hdev->completion_queue) { + dev_err(hdev->dev, + "failed to allocate completion queues\n"); + rc = -ENOMEM; + goto hw_queues_destroy; + } } for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) { diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 9213d10..a61aab0 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -461,7 +461,7 @@ struct hl_hw_queue { u64 kernel_address; dma_addr_t bus_address; u32 pi; - u32 ci; + atomic_t ci; u32 hw_queue_id; u32 cq_id; u32 msi_vec; diff --git a/drivers/misc/habanalabs/hw_queue.c b/drivers/misc/habanalabs/hw_queue.c index 7965551..474a0e8 100644 --- a/drivers/misc/habanalabs/hw_queue.c +++ b/drivers/misc/habanalabs/hw_queue.c @@ -23,10 +23,14 @@ inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val) ptr &= ((HL_QUEUE_LENGTH << 1) - 1); return ptr; } +static inline int queue_ci_get(atomic_t *ci, u32 queue_len) +{ + return atomic_read(ci) & ((queue_len << 1) - 1); +} static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len) { - int delta = (q->pi - q->ci); + int delta = (q->pi - queue_ci_get(&q->ci, queue_len)); if (delta >= 0) return (queue_len - delta); @@ -40,21 +44,14 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs) struct hl_hw_queue *q; int i; - hdev->asic_funcs->hw_queues_lock(hdev); - if (hdev->disabled) - goto out; + return; q = &hdev->kernel_queues[0]; for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) { - if (q->queue_type == QUEUE_TYPE_INT) { - q->ci += cs->jobs_in_queue_cnt[i]; - q->ci &= ((q->int_queue_len << 1) - 1); - } + if (q->queue_type == QUEUE_TYPE_INT) + atomic_add(cs->jobs_in_queue_cnt[i], &q->ci); } - -out: - hdev->asic_funcs->hw_queues_unlock(hdev); } /* @@ -174,38 +171,26 @@ static int int_queue_sanity_checks(struct hl_device *hdev, } /* - * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue. + * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue * @hdev: Pointer to hl_device structure. * @q: Pointer to hl_hw_queue structure. * @num_of_entries: How many entries to check for space. * - * Perform the following: - * - Make sure we have enough space in the completion queue. - * This check also ensures that there is enough space in the h/w queue, as - * both queues are of the same size. - * - Reserve space in the completion queue (needs to be reversed if there - * is a failure down the road before the actual submission of work). + * Notice: We do not reserve queue entries so this function mustn't be called + * more than once per CS for the same queue * - * Both operations are done using the "free_slots_cnt" field of the completion - * queue. The CI counters of the queue and the completion queue are not - * needed/used for the H/W queue type. */ static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q, int num_of_entries) { - atomic_t *free_slots = - &hdev->completion_queue[q->cq_id].free_slots_cnt; + int free_slots_cnt; - /* - * Check we have enough space in the completion queue. - * Add -1 to counter (decrement) unless counter was already 0. - * In that case, CQ is full so we can't submit a new CB. - * atomic_add_unless will return 0 if counter was already 0. - */ - if (atomic_add_negative(num_of_entries * -1, free_slots)) { - dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n", - num_of_entries, q->hw_queue_id); - atomic_add(num_of_entries, free_slots); + /* Check we have enough space in the queue */ + free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH); + + if (free_slots_cnt < num_of_entries) { + dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n", + q->hw_queue_id, num_of_entries); return -EAGAIN; } @@ -366,7 +351,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) { struct hl_device *hdev = job->cs->ctx->hdev; struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id]; - struct hl_cq *cq; u64 ptr; u32 offset, ctl, len; @@ -395,17 +379,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job) else ptr = (u64) (uintptr_t) job->user_cb; - /* - * No need to protect pi_offset because scheduling to the - * H/W queues is done under the scheduler mutex - * - * No need to check if CQ is full because it was already - * checked in hw_queue_sanity_checks - */ - cq = &hdev->completion_queue[q->cq_id]; - - cq->pi = hl_cq_inc_ptr(cq->pi); - ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr); } @@ -552,8 +525,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) goto unroll_cq_resv; } - if (q->queue_type == QUEUE_TYPE_EXT || - q->queue_type == QUEUE_TYPE_HW) + if (q->queue_type == QUEUE_TYPE_EXT) cq_cnt++; } } @@ -605,9 +577,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs) unroll_cq_resv: q = &hdev->kernel_queues[0]; for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) { - if ((q->queue_type == QUEUE_TYPE_EXT || - q->queue_type == QUEUE_TYPE_HW) && - cs->jobs_in_queue_cnt[i]) { + if ((q->queue_type == QUEUE_TYPE_EXT) && + (cs->jobs_in_queue_cnt[i])) { atomic_t *free_slots = &hdev->completion_queue[i].free_slots_cnt; atomic_add(cs->jobs_in_queue_cnt[i], free_slots); @@ -631,7 +602,7 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id) { struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id]; - q->ci = hl_queue_inc_ptr(q->ci); + atomic_inc(&q->ci); } static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, @@ -666,7 +637,7 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q, } /* Make sure read/write pointers are initialized to start of queue */ - q->ci = 0; + atomic_set(&q->ci, 0); q->pi = 0; return 0; @@ -700,7 +671,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) q->kernel_address = (u64) (uintptr_t) p; q->pi = 0; - q->ci = 0; + atomic_set(&q->ci, 0); return 0; } @@ -729,7 +700,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q) q->kernel_address = (u64) (uintptr_t) p; /* Make sure read/write pointers are initialized to start of queue */ - q->ci = 0; + atomic_set(&q->ci, 0); q->pi = 0; return 0; @@ -931,7 +902,8 @@ void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset) if ((!q->valid) || ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU))) continue; - q->pi = q->ci = 0; + q->pi = 0; + atomic_set(&q->ci, 0); if (q->supports_sync_stream) sync_stream_queue_reset(hdev, q->hw_queue_id); diff --git a/drivers/misc/habanalabs/irq.c b/drivers/misc/habanalabs/irq.c index 7a4878ed..195a5ec 100644 --- a/drivers/misc/habanalabs/irq.c +++ b/drivers/misc/habanalabs/irq.c @@ -122,12 +122,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg) queue_work(hdev->cq_wq, &job->finish_work); } - /* Update ci of the context's queue. There is no - * need to protect it with spinlock because this update is - * done only inside IRQ and there is a different IRQ per - * queue - */ - queue->ci = hl_queue_inc_ptr(queue->ci); + atomic_inc(&queue->ci); /* Clear CQ entry ready bit */ cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) & -- 2.7.4