};
/**
+ * struct last_error_session_info - info about last session in which CS timeout or
+ * razwi error occurred.
+ * @open_dev_timestamp: device open timestamp.
+ * @cs_timeout_timestamp: CS timeout timestamp.
+ * @razwi_timestamp: razwi timestamp.
+ * @cs_write_disable: if set writing to CS parameters in the structure is disabled so the
+ * first (root cause) CS timeout will not be overwritten.
+ * @razwi_write_disable: if set writing to razwi parameters in the structure is disabled so the
+ * first (root cause) razwi will not be overwritten.
+ * @cs_timeout_seq: CS timeout sequence number.
+ * @razwi_addr: address that caused razwi.
+ * @razwi_engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does
+ * not have engine id it will be set to U16_MAX.
+ * @razwi_engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
+ * engines which one them caused the razwi. In that case, it will contain the
+ * second possible engine id, otherwise it will be set to U16_MAX.
+ * @razwi_non_engine_initiator: in case the initiator of the razwi does not have engine id.
+ * @razwi_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ */
+struct last_error_session_info {
+ ktime_t open_dev_timestamp;
+ ktime_t cs_timeout_timestamp;
+ ktime_t razwi_timestamp;
+ atomic_t cs_write_disable;
+ atomic_t razwi_write_disable;
+ u64 cs_timeout_seq;
+ u64 razwi_addr;
+ u16 razwi_engine_id_1;
+ u16 razwi_engine_id_2;
+ u8 razwi_non_engine_initiator;
+ u8 razwi_type;
+};
+
+/**
* struct hl_device - habanalabs device structure.
* @pdev: pointer to PCI device, can be NULL in case of simulator device.
* @pcie_bar_phys: array of available PCIe bars physical addresses.
* device initialization. Mainly used to debug and
* workaround firmware bugs
* @dram_pci_bar_start: start bus address of PCIe bar towards DRAM.
+ * @last_successful_open_ktime: timestamp (ktime) of the last successful device open.
* @last_successful_open_jif: timestamp (jiffies) of the last successful
* device open.
* @last_open_session_duration_jif: duration (jiffies) of the last device open
struct multi_cs_completion multi_cs_completion[
MULTI_CS_MAX_USER_CTX];
struct hl_clk_throttle clk_throttling;
+ struct last_error_session_info last_error;
u32 *stream_master_qid_arr;
atomic64_t dram_used_mem;
u64 open_counter;
u64 fw_poll_interval_usec;
atomic_t in_reset;
+ ktime_t last_successful_open_ktime;
enum hl_pll_frequency curr_pll_profile;
enum cpucp_card_types card_type;
u32 major;
return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
}
+static int last_err_open_dev_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_info_last_err_open_dev_time info = {0};
+ struct hl_device *hdev = hpriv->hdev;
+ u32 max_size = args->return_size;
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ info.timestamp = ktime_to_ns(hdev->last_error.open_dev_timestamp);
+
+ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
+static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_info_cs_timeout_event info = {0};
+ struct hl_device *hdev = hpriv->hdev;
+ u32 max_size = args->return_size;
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ info.seq = hdev->last_error.cs_timeout_seq;
+ info.timestamp = ktime_to_ns(hdev->last_error.cs_timeout_timestamp);
+
+ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
+static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+ struct hl_device *hdev = hpriv->hdev;
+ u32 max_size = args->return_size;
+ struct hl_info_razwi_event info = {0};
+ void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+ if ((!max_size) || (!out))
+ return -EINVAL;
+
+ info.timestamp = ktime_to_ns(hdev->last_error.razwi_timestamp);
+ info.addr = hdev->last_error.razwi_addr;
+ info.engine_id_1 = hdev->last_error.razwi_engine_id_1;
+ info.engine_id_2 = hdev->last_error.razwi_engine_id_2;
+ info.no_engine_id = hdev->last_error.razwi_non_engine_initiator;
+ info.error_type = hdev->last_error.razwi_type;
+
+ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
case HL_INFO_DRAM_PENDING_ROWS:
return dram_pending_rows_info(hpriv, args);
+ case HL_INFO_LAST_ERR_OPEN_DEV_TIME:
+ return last_err_open_dev_info(hpriv, args);
+
+ case HL_INFO_CS_TIMEOUT_EVENT:
+ return cs_timeout_info(hpriv, args);
+
+ case HL_INFO_RAZWI_EVENT:
+ return razwi_info(hpriv, args);
+
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -ENOTTY;
snprintf(desc, size, "N/A");
}
-static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev,
- u32 x_y, bool is_write)
+static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y,
+ bool is_write, s32 *engine_id_1,
+ s32 *engine_id_2)
{
u32 dma_id[2], dma_offset, err_cause[2], mask, i;
switch (x_y) {
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_0;
return "DMA0";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_2;
return "DMA2";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_0;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_2;
return "DMA0 or DMA2";
+ }
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_1;
return "DMA1";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_3;
return "DMA3";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_1;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_3;
return "DMA1 or DMA3";
+ }
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_4;
return "DMA4";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_6;
return "DMA6";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_4;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_6;
return "DMA4 or DMA6";
+ }
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_5;
return "DMA5";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_7;
return "DMA7";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_5;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_7;
return "DMA5 or DMA7";
+ }
}
unknown_initiator:
return "unknown initiator";
}
-static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
- bool is_write)
+static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write,
+ u32 *engine_id_1, u32 *engine_id_2)
{
u32 val, x_y, axi_id;
switch (x_y) {
case RAZWI_INITIATOR_ID_X_Y_TPC0_NIC0:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_0;
return "TPC0";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_0;
return "NIC0";
+ }
break;
case RAZWI_INITIATOR_ID_X_Y_TPC1:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_1;
return "TPC1";
case RAZWI_INITIATOR_ID_X_Y_MME0_0:
case RAZWI_INITIATOR_ID_X_Y_MME0_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_0;
return "MME0";
case RAZWI_INITIATOR_ID_X_Y_MME1_0:
case RAZWI_INITIATOR_ID_X_Y_MME1_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_1;
return "MME1";
case RAZWI_INITIATOR_ID_X_Y_TPC2:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_2;
return "TPC2";
case RAZWI_INITIATOR_ID_X_Y_TPC3_PCI_CPU_PSOC:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_3;
return "TPC3";
+ }
+ /* PCI, CPU or PSOC does not have engine id*/
if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PCI))
return "PCI";
if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_CPU))
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1:
- return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write);
+ return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write,
+ engine_id_1, engine_id_2);
case RAZWI_INITIATOR_ID_X_Y_TPC4_NIC1_NIC2:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_4;
return "TPC4";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_1;
return "NIC1";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_2;
return "NIC2";
+ }
break;
case RAZWI_INITIATOR_ID_X_Y_TPC5:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_5;
return "TPC5";
case RAZWI_INITIATOR_ID_X_Y_MME2_0:
case RAZWI_INITIATOR_ID_X_Y_MME2_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_2;
return "MME2";
case RAZWI_INITIATOR_ID_X_Y_MME3_0:
case RAZWI_INITIATOR_ID_X_Y_MME3_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_3;
return "MME3";
case RAZWI_INITIATOR_ID_X_Y_TPC6:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_6;
return "TPC6";
case RAZWI_INITIATOR_ID_X_Y_TPC7_NIC4_NIC5:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_7;
return "TPC7";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_4;
return "NIC4";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_5;
return "NIC5";
+ }
break;
default:
break;
return "unknown initiator";
}
-static void gaudi_print_razwi_info(struct hl_device *hdev)
+static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1,
+ u32 *engine_id_2)
{
+
if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) {
dev_err_ratelimited(hdev->dev,
"RAZWI event caused by illegal write of %s\n",
- gaudi_get_razwi_initiator_name(hdev, true));
+ gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0);
}
if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) {
dev_err_ratelimited(hdev->dev,
"RAZWI event caused by illegal read of %s\n",
- gaudi_get_razwi_initiator_name(hdev, false));
+ gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_READ_VLD, 0);
}
}
-static void gaudi_print_mmu_error_info(struct hl_device *hdev)
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- u64 addr;
u32 val;
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU))
val = RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE);
if (val & MMU_UP_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) {
- addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
- addr <<= 32;
- addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
+ *addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
+ *addr <<= 32;
+ *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
- dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n",
- addr);
+ dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
+ *type = HL_RAZWI_PAGE_FAULT;
WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
}
val = RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE);
if (val & MMU_UP_ACCESS_ERROR_CAPTURE_ENTRY_VALID_MASK) {
- addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK;
- addr <<= 32;
- addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
+ *addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK;
+ *addr <<= 32;
+ *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
- dev_err_ratelimited(hdev->dev,
- "MMU access error on va 0x%llx\n", addr);
+ dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr);
+ *type = HL_RAZWI_MMU_ACCESS_ERROR;
WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0);
}
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
bool razwi)
{
+ u32 engine_id_1, engine_id_2;
char desc[64] = "";
+ u64 razwi_addr = 0;
+ u8 razwi_type;
+ int rc;
+
+ /*
+ * Init engine id by default as not valid and only if razwi initiated from engine with
+ * engine id it will get valid value.
+ * Init razwi type to default, will be changed only if razwi caused by page fault of
+ * MMU access error
+ */
+ engine_id_1 = U16_MAX;
+ engine_id_2 = U16_MAX;
+ razwi_type = U8_MAX;
gaudi_get_event_desc(event_type, desc, sizeof(desc));
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
if (razwi) {
- gaudi_print_razwi_info(hdev);
- gaudi_print_mmu_error_info(hdev);
+ gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2);
+ gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type);
+
+ /* In case it's the first razwi, save its parameters*/
+ rc = atomic_cmpxchg(&hdev->last_error.razwi_write_disable, 0, 1);
+ if (!rc) {
+ hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime;
+ hdev->last_error.razwi_timestamp = ktime_get();
+ hdev->last_error.razwi_addr = razwi_addr;
+ hdev->last_error.razwi_engine_id_1 = engine_id_1;
+ hdev->last_error.razwi_engine_id_2 = engine_id_2;
+ /*
+ * If first engine id holds non valid value the razwi initiator
+ * does not have engine id
+ */
+ hdev->last_error.razwi_non_engine_initiator = (engine_id_1 == U16_MAX);
+ hdev->last_error.razwi_type = razwi_type;
+
+ }
}
}
* HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls
* HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info
* HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num
+ * HL_INFO_LAST_ERR_OPEN_DEV_TIME - Retrieve timestamp of the last time the device was opened
+ * and CS timeout or razwi error occurred.
+ * HL_INFO_CS_TIMEOUT_EVENT - Retrieve CS timeout timestamp and its related CS sequence number.
+ * HL_INFO_RAZWI_EVENT - Retrieve parameters of razwi:
+ * Timestamp of razwi.
+ * The address which accessing it caused the razwi.
+ * Razwi initiator.
+ * Razwi cause, was it a page fault or MMU access error.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
#define HL_INFO_OPEN_STATS 18
#define HL_INFO_DRAM_REPLACED_ROWS 21
#define HL_INFO_DRAM_PENDING_ROWS 22
+#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23
+#define HL_INFO_CS_TIMEOUT_EVENT 24
+#define HL_INFO_RAZWI_EVENT 25
-#define HL_INFO_VERSION_MAX_LEN 128
+#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
/**
__u64 ctx_validation_drop_cnt;
};
+/**
+ * struct hl_info_last_err_open_dev_time - last error boot information.
+ * @timestamp: timestamp of last time the device was opened and error occurred.
+ */
+struct hl_info_last_err_open_dev_time {
+ __s64 timestamp;
+};
+
+/**
+ * struct hl_info_cs_timeout_event - last CS timeout information.
+ * @timestamp: timestamp when last CS timeout event occurred.
+ * @seq: sequence number of last CS timeout event.
+ */
+struct hl_info_cs_timeout_event {
+ __s64 timestamp;
+ __u64 seq;
+};
+
+#define HL_RAZWI_PAGE_FAULT 0
+#define HL_RAZWI_MMU_ACCESS_ERROR 1
+
+/**
+ * struct hl_info_razwi_event - razwi information.
+ * @timestamp: timestamp of razwi.
+ * @addr: address which accessing it caused razwi.
+ * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not
+ * have engine id it will be set to U16_MAX.
+ * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
+ * engines which one them caused the razwi. In that case, it will contain the
+ * second possible engine id, otherwise it will be set to U16_MAX.
+ * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
+ * otherwise 0.
+ * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ * @pad: padding to 64 bit.
+ */
+struct hl_info_razwi_event {
+ __s64 timestamp;
+ __u64 addr;
+ __u16 engine_id_1;
+ __u16 engine_id_2;
+ __u8 no_engine_id;
+ __u8 error_type;
+ __u8 pad[2];
+};
+
enum gaudi_dcores {
HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE,