habanalabs: add support for new cpucp return codes
authorOfir Bitton <obitton@habana.ai>
Tue, 23 Aug 2022 13:58:38 +0000 (16:58 +0300)
committerOded Gabbay <ogabbay@kernel.org>
Mon, 19 Sep 2022 12:08:38 +0000 (15:08 +0300)
Firmware now responds with more detailed cpucp return codes.
The driver can now distinguish between error and debug return codes.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/gaudi2/gaudi2.c
drivers/misc/habanalabs/include/common/cpucp_if.h

index 8bfb459..c237591 100644 (file)
@@ -252,7 +252,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
        struct cpucp_packet *pkt;
        dma_addr_t pkt_dma_addr;
        struct hl_bd *sent_bd;
-       u32 tmp, expected_ack_val, pi;
+       u32 tmp, expected_ack_val, pi, opcode;
        int rc;
 
        pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr);
@@ -319,8 +319,35 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
        rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
        if (rc) {
-               dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
-                       rc, (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT);
+               opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT;
+
+               if (!prop->supports_advanced_cpucp_rc) {
+                       dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode);
+                       goto scrub_descriptor;
+               }
+
+               switch (rc) {
+               case cpucp_packet_invalid:
+                       dev_err(hdev->dev,
+                               "CPU packet %d is not supported by F/W\n", opcode);
+                       break;
+               case cpucp_packet_fault:
+                       dev_err(hdev->dev,
+                               "F/W failed processing CPU packet %d\n", opcode);
+                       break;
+               case cpucp_packet_invalid_pkt:
+                       dev_dbg(hdev->dev,
+                               "CPU packet %d is not supported by F/W\n", opcode);
+                       break;
+               case cpucp_packet_invalid_params:
+                       dev_err(hdev->dev,
+                               "F/W reports invalid parameters for CPU packet %d\n", opcode);
+                       break;
+
+               default:
+                       dev_err(hdev->dev,
+                               "Unknown F/W ERROR %d for CPU packet %d\n", rc, opcode);
+               }
 
                /* propagate the return code from the f/w to the callers who want to check it */
                if (result)
@@ -332,6 +359,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
                *result = le64_to_cpu(pkt->result);
        }
 
+scrub_descriptor:
        /* Scrub previous buffer descriptor 'ctl' field which contains the
         * previous PI value written during packet submission.
         * We must do this or else F/W can read an old value upon queue wraparound.
index b7e0165..959e361 100644 (file)
@@ -678,6 +678,7 @@ struct hl_hints_range {
  * @set_max_power_on_device_init: true if need to set max power in F/W on device init.
  * @supports_user_set_page_size: true if user can set the allocation page size.
  * @dma_mask: the dma mask to be set for this device
+ * @supports_advanced_cpucp_rc: true if new cpucp return codes are supported.
  */
 struct asic_fixed_properties {
        struct hw_queue_properties      *hw_queues_props;
@@ -785,6 +786,7 @@ struct asic_fixed_properties {
        u8                              set_max_power_on_device_init;
        u8                              supports_user_set_page_size;
        u8                              dma_mask;
+       u8                              supports_advanced_cpucp_rc;
 };
 
 /**
index a0b15b2..db18e06 100644 (file)
@@ -2721,6 +2721,8 @@ static int gaudi2_late_init(struct hl_device *hdev)
        struct gaudi2_device *gaudi2 = hdev->asic_specific;
        int rc;
 
+       hdev->asic_prop.supports_advanced_cpucp_rc = true;
+
        rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS,
                                        gaudi2->virt_msix_db_dma_addr);
        if (rc) {
index b837bb1..9593d1a 100644 (file)
@@ -824,10 +824,25 @@ enum cpucp_led_index {
        CPUCP_LED2_INDEX
 };
 
+/*
+ * enum cpucp_packet_rc - Error return code
+ * @cpucp_packet_success       -> in case of success.
+ * @cpucp_packet_invalid       -> this is to support Goya and Gaudi platform.
+ * @cpucp_packet_fault         -> in case of processing error like failing to
+ *                                 get device binding or semaphore etc.
+ * @cpucp_packet_invalid_pkt   -> when the cpucp packet is unsupported.
+ *                                 Supported from Greco onwards.
+ * @cpucp_packet_invalid_params        -> when a parameter check fails, e.g. length of
+ *                                buffer or attribute value. Supported from Greco onwards.
+ * @cpucp_packet_rc_max                -> indicates the size of the enum, so it must be last.
+ */
 enum cpucp_packet_rc {
        cpucp_packet_success,
        cpucp_packet_invalid,
-       cpucp_packet_fault
+       cpucp_packet_fault,
+       cpucp_packet_invalid_pkt,
+       cpucp_packet_invalid_params,
+       cpucp_packet_rc_max
 };
 
 /*