habanalabs/gaudi: add page fault notify event
author Dani Liberman <dliberman@habana.ai>
Mon, 31 Oct 2022 09:44:45 +0000 (11:44 +0200)
committer Oded Gabbay <ogabbay@kernel.org>
Wed, 23 Nov 2022 14:13:46 +0000 (16:13 +0200)
Each time a page fault happens, in addition to capturing its data, also
notify the user about it.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/device.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi/gaudi.c
include/uapi/misc/habanalabs.h

index 65bb40f..3181812 100644 (file)
@@ -2490,3 +2490,12 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is
        hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id;
        hl_capture_user_mappings(hdev, is_pmmu);
 }
+
+void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
+                               u64 *event_mask)
+{
+       hl_capture_page_fault(hdev, addr, eng_id, is_pmmu);
+
+       if (event_mask)
+               *event_mask |=  HL_NOTIFIER_EVENT_PAGE_FAULT;
+}
index d9335f3..0781b86 100644 (file)
@@ -3815,6 +3815,8 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_
 void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
                        u8 flags, u64 *event_mask);
 void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu);
+void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu,
+                               u64 *event_mask);
 
 #ifdef CONFIG_DEBUG_FS
 
index 035865c..cbe1daf 100644 (file)
@@ -6740,7 +6740,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_i
        }
 }
 
-static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr)
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u64 *event_mask)
 {
        struct gaudi_device *gaudi = hdev->asic_specific;
        u32 val;
@@ -6755,7 +6755,7 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr
                *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
 
                dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
-               hl_capture_page_fault(hdev, *addr, 0, true);
+               hl_handle_page_fault(hdev, *addr, 0, true, event_mask);
 
                WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
        }
@@ -7323,7 +7323,7 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
        if (razwi) {
                gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
                                                &is_write);
-               gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr);
+               gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask);
 
                if (is_read)
                        razwi_flags |= HL_RAZWI_READ;
index 7747e19..e50cb71 100644 (file)
@@ -722,6 +722,7 @@ enum hl_server_type {
  * HL_NOTIFIER_EVENT_USER_ENGINE_ERR   - Indicates device engine in error state
  * HL_NOTIFIER_EVENT_GENERAL_HW_ERR     - Indicates device HW error
  * HL_NOTIFIER_EVENT_RAZWI              - Indicates razwi happened
+ * HL_NOTIFIER_EVENT_PAGE_FAULT         - Indicates page fault happened
  */
 #define HL_NOTIFIER_EVENT_TPC_ASSERT           (1ULL << 0)
 #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE     (1ULL << 1)
@@ -731,6 +732,7 @@ enum hl_server_type {
 #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR      (1ULL << 5)
 #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR       (1ULL << 6)
 #define HL_NOTIFIER_EVENT_RAZWI                        (1ULL << 7)
+#define HL_NOTIFIER_EVENT_PAGE_FAULT           (1ULL << 8)
 
 /* Opcode for management ioctl
  *