habanalabs: fix race between wait and irq
author: Oded Gabbay <ogabbay@kernel.org>
Tue, 18 Jan 2022 22:10:43 +0000 (00:10 +0200)
committer: Oded Gabbay <ogabbay@kernel.org>
Mon, 28 Feb 2022 12:22:03 +0000 (14:22 +0200)
There is a race in the user interrupts code: between checking the
target value and adding the new pending node to the wait list, the
interrupt may already have happened.

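In that case, no one will complete the node, and we will get a timeout
on it.

Fix this by taking the wait list lock before checking the target value
and holding it until the node has been added to the list, releasing it
on the early-exit paths.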

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/command_submission.c

index 8dd2f399d1c4984c30621eb6ace78c12a5a029d4..307a95a039e0b9eeb349ef80ebef0bf5ffeffdad 100644 (file)
@@ -2892,16 +2892,21 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
        pend->cq_kernel_addr = (u64 *) cb->kernel_address + cq_counters_offset;
        pend->cq_target_value = target_value;
 
+       spin_lock_irqsave(&interrupt->wait_list_lock, flags);
+
        /* We check for completion value as interrupt could have been received
         * before we added the node to the wait list
         */
        if (*pend->cq_kernel_addr >= target_value) {
+               spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
+
                *status = HL_WAIT_CS_STATUS_COMPLETED;
                /* There was no interrupt, we assume the completion is now. */
                pend->fence.timestamp = ktime_get();
                goto set_timestamp;
 
        } else if (!timeout_us) {
+               spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
                *status = HL_WAIT_CS_STATUS_BUSY;
                pend->fence.timestamp = ktime_get();
                goto set_timestamp;
@@ -2910,7 +2915,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
        /* Add pending user interrupt to relevant list for the interrupt
         * handler to monitor
         */
-       spin_lock_irqsave(&interrupt->wait_list_lock, flags);
        list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
        spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);