{
writel(val, hdev->rmmio + reg);
}
+
+/**
+ * hl_capture_razwi() - record the parameters of a razwi, if none was recorded yet.
+ * @hdev: pointer to the habanalabs device structure.
+ * @addr: address which accessing it caused the razwi.
+ * @engine_id: array holding the ids of the engines that possibly initiated the razwi.
+ * @num_of_engines: number of valid entries in @engine_id. May be zero when there is no
+ *                  indication of a specific engine.
+ * @flags: bitmask of HL_RAZWI_* flags with additional data about the razwi.
+ *
+ * Only the first call after razwi_info_recorded was cleared stores the data. Any later call
+ * returns without touching it, so the first (root cause) razwi is not overwritten.
+ * A call with more than HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR engines is rejected with an
+ * error print and records nothing.
+ */
+void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
+			u8 flags)
+{
+	struct hl_info_razwi_event *razwi = &hdev->captured_err_info.razwi;
+	u16 i;
+
+	if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) {
+		dev_err(hdev->dev,
+			"Number of possible razwi initiators (%u) exceeded limit (%u)\n",
+			num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR);
+		return;
+	}
+
+	/*
+	 * atomic_cmpxchg() returns the previous value: non-zero means a razwi was already
+	 * recorded, so only the first razwi since the flag was cleared is captured.
+	 */
+	if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info_recorded, 0, 1))
+		return;
+
+	razwi->timestamp = ktime_to_ns(ktime_get());
+	razwi->addr = addr;
+	razwi->num_of_possible_engines = num_of_engines;
+	memcpy(&razwi->engine_id[0], &engine_id[0], num_of_engines * sizeof(u16));
+
+	/*
+	 * Mark the unused slots as "no engine id". Nothing else in this function clears
+	 * them, so without this they would keep whatever an earlier capture stored there,
+	 * and engine_id[0] would not read HL_RAZWI_NA_ENG_ID when no engine was identified.
+	 */
+	for (i = num_of_engines ; i < HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR ; i++)
+		razwi->engine_id[i] = HL_RAZWI_NA_ENG_ID;
+
+	razwi->flags = flags;
+}
u64 seq;
};
-/**
- * struct razwi_info - info about last razwi error occurred.
- * @timestamp: razwi timestamp.
- * @write_enable: if set writing to razwi parameters in the structure is enabled.
- * otherwise - disabled, so the first (root cause) razwi will not be overwritten.
- * @addr: address that caused razwi.
- * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does
- * not have engine id it will be set to U16_MAX.
- * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
- * engines which one them caused the razwi. In that case, it will contain the
- * second possible engine id, otherwise it will be set to U16_MAX.
- * @non_engine_initiator: in case the initiator of the razwi does not have engine id.
- * @type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
- */
-struct razwi_info {
- ktime_t timestamp;
- atomic_t write_enable;
- u64 addr;
- u16 engine_id_1;
- u16 engine_id_2;
- u8 non_engine_initiator;
- u8 type;
-};
-
#define MAX_QMAN_STREAMS_INFO 4
#define OPCODE_INFO_MAX_ADDR_SIZE 8
/**
* struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information.
* @razwi: razwi information.
+ * @razwi_info_recorded: set once a razwi was recorded. While it is set, writing to razwi
+ * information is disabled, so the first (root cause) razwi will not be overwritten.
* @undef_opcode: undefined opcode information
*/
struct hl_error_info {
struct cs_timeout_info cs_timeout;
- struct razwi_info razwi;
+ struct hl_info_razwi_event razwi;
+ atomic_t razwi_info_recorded;
struct undefined_opcode_info undef_opcode;
};
struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp,
void *args);
__printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
+void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
+ u8 flags);
#ifdef CONFIG_DEBUG_FS
hl_debugfs_add_file(hpriv);
atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
- atomic_set(&hdev->captured_err_info.razwi.write_enable, 1);
+ atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0);
hdev->captured_err_info.undef_opcode.write_enable = true;
hdev->open_counter++;
{
struct hl_device *hdev = hpriv->hdev;
u32 max_size = args->return_size;
- struct hl_info_razwi_event info = {0};
+ struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
- info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp);
- info.addr = hdev->captured_err_info.razwi.addr;
- info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1;
- info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2;
- info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator;
- info.error_type = hdev->captured_err_info.razwi.type;
-
- return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+ return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event)))
+ ? -EFAULT : 0;
}
static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
}
static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y,
- bool is_write, s32 *engine_id_1,
- s32 *engine_id_2)
+ bool is_write, u16 *engine_id_1,
+ u16 *engine_id_2)
{
u32 dma_id[2], dma_offset, err_cause[2], mask, i;
}
static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write,
- u32 *engine_id_1, u32 *engine_id_2)
+ u16 *engine_id_1, u16 *engine_id_2)
{
u32 val, x_y, axi_id;
return "unknown initiator";
}
-static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1,
- u32 *engine_id_2)
+static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_id_1,
+ u16 *engine_id_2, bool *is_read, bool *is_write)
{
if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) {
"RAZWI event caused by illegal write of %s\n",
gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0);
+ *is_write = true;
}
if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) {
"RAZWI event caused by illegal read of %s\n",
gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_READ_VLD, 0);
+ *is_read = true;
}
}
-static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type)
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr)
{
struct gaudi_device *gaudi = hdev->asic_specific;
u32 val;
*addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
- *type = HL_RAZWI_PAGE_FAULT;
-
WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
}
*addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr);
- *type = HL_RAZWI_MMU_ACCESS_ERROR;
WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0);
}
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
bool razwi)
{
- u32 engine_id_1, engine_id_2;
+ bool is_read = false, is_write = false;
+ u16 engine_id[2], num_of_razwi_eng = 0;
char desc[64] = "";
u64 razwi_addr = 0;
- u8 razwi_type;
- int rc;
+ u8 razwi_flags = 0;
/*
* Init engine id by default as not valid and only if razwi initiated from engine with
* engine id it will get valid value.
- * Init razwi type to default, will be changed only if razwi caused by page fault of
- * MMU access error
*/
- engine_id_1 = U16_MAX;
- engine_id_2 = U16_MAX;
- razwi_type = U8_MAX;
+ engine_id[0] = HL_RAZWI_NA_ENG_ID;
+ engine_id[1] = HL_RAZWI_NA_ENG_ID;
gaudi_get_event_desc(event_type, desc, sizeof(desc));
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
if (razwi) {
- gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2);
- gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type);
-
- /* In case it's the first razwi, save its parameters*/
- rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0);
- if (rc) {
- hdev->captured_err_info.razwi.timestamp = ktime_get();
- hdev->captured_err_info.razwi.addr = razwi_addr;
- hdev->captured_err_info.razwi.engine_id_1 = engine_id_1;
- hdev->captured_err_info.razwi.engine_id_2 = engine_id_2;
- /*
- * If first engine id holds non valid value the razwi initiator
- * does not have engine id
- */
- hdev->captured_err_info.razwi.non_engine_initiator =
- (engine_id_1 == U16_MAX);
- hdev->captured_err_info.razwi.type = razwi_type;
-
+ gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
+ &is_write);
+ gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr);
+
+ if (is_read)
+ razwi_flags |= HL_RAZWI_READ;
+ if (is_write)
+ razwi_flags |= HL_RAZWI_WRITE;
+
+ if (engine_id[0] != HL_RAZWI_NA_ENG_ID) {
+ if (engine_id[1] != HL_RAZWI_NA_ENG_ID)
+ num_of_razwi_eng = 2;
+ else
+ num_of_razwi_eng = 1;
}
+
+ hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags);
}
}
__u64 seq;
};
-#define HL_RAZWI_PAGE_FAULT 0
-#define HL_RAZWI_MMU_ACCESS_ERROR 1
+/*
+ * Razwi definitions used by struct hl_info_razwi_event.
+ * The values are spelled out as plain constant expressions rather than with BIT() and
+ * U16_MAX, because those helper macros are kernel-internal and are not available to
+ * user-space code that includes this header.
+ */
+
+/* Engine id value meaning "no engine id" (all 16 bits set) */
+#define HL_RAZWI_NA_ENG_ID 0xFFFF
+/* Number of entries in the engine_id array of struct hl_info_razwi_event */
+#define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128
+/* Bits of the flags field of struct hl_info_razwi_event */
+#define HL_RAZWI_READ (1UL << 0)
+#define HL_RAZWI_WRITE (1UL << 1)
+#define HL_RAZWI_LBW (1UL << 2)
+#define HL_RAZWI_HBW (1UL << 3)
+#define HL_RAZWI_RR (1UL << 4)
+#define HL_RAZWI_ADDR_DEC (1UL << 5)
/**
* struct hl_info_razwi_event - razwi information.
* @timestamp: timestamp of razwi.
* @addr: address which accessing it caused razwi.
- * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not
- * have engine id it will be set to U16_MAX.
- * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
- * engines which one them caused the razwi. In that case, it will contain the
- * second possible engine id, otherwise it will be set to U16_MAX.
- * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
- * otherwise 0.
- * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
- * @pad: padding to 64 bit.
+ * @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not
+ * have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible
+ * engines which caused the razwi, it will hold all of them.
+ * @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication
+ * might be common for several engines and there is no way to get the
+ * exact engine. In that case, engine_id array will be filled with all
+ * the engines that possibly caused this razwi. Also, in gaudi there
+ * might be no indication of a specific engine; in that case
+ * the value of this parameter will be zero.
+ * @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation
+ * HL_RAZWI_WRITE - razwi caused by write operation
+ * HL_RAZWI_LBW - razwi caused by lbw fabric transaction
+ * HL_RAZWI_HBW - razwi caused by hbw fabric transaction
+ * HL_RAZWI_RR - razwi caused by range register
+ * HL_RAZWI_ADDR_DEC - razwi caused by address decode error
+ * Note: this data is not supported by all asics, in that case the relevant bits will not
+ * be set.
*/
struct hl_info_razwi_event {
__s64 timestamp;
__u64 addr;
- __u16 engine_id_1;
- __u16 engine_id_2;
- __u8 no_engine_id;
- __u8 error_type;
- __u8 pad[2];
+ __u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR];
+ __u16 num_of_possible_engines;
+ __u8 flags;
+ __u8 pad[5];
};
#define MAX_QMAN_STREAMS_INFO 4