From 2de949abd6a539fac4b2c89a560e4ae505b6fb52 Mon Sep 17 00:00:00 2001
From: Haoyue Xu
Date: Thu, 14 Jul 2022 21:43:53 +0800
Subject: [PATCH] RDMA/hns: Recover 1bit-ECC error of RAM on chip

Since ECC memory keeps a memory system immune to single-bit errors,
add support for correcting 1bit-ECC errors, which prevents a 1bit-ECC
error from deteriorating into an uncorrectable multi-bit error. When a
1bit-ECC error happens in the internal RAM of the ROCE engine, such as
in the QPC table, the error is only detected when the memory is read,
so the ROCE engine corrects such errors by writing the corrected data
back.

Link: https://lore.kernel.org/r/20220714134353.16700-6-liangwenpeng@huawei.com
Signed-off-by: Haoyue Xu
Signed-off-by: Wenpeng Liang
Signed-off-by: Leon Romanovsky
---
 drivers/infiniband/hw/hns/hns_roce_device.h |   1 +
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 182 +++++++++++++++++++++++++++-
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h  |  12 ++
 3 files changed, 193 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 2855e9a..f848eed 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -959,6 +959,7 @@ struct hns_roce_dev {
 	const struct hns_roce_hw *hw;
 	void *priv;
 	struct workqueue_struct *irq_workq;
+	struct work_struct ecc_work;
 	const struct hns_roce_dfx_hw *dfx;
 	u32 func_num;
 	u32 is_vf;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 782f09a..cbdafaa 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -55,6 +55,42 @@ enum {
 	CMD_RST_PRC_EBUSY,
 };
 
+enum ecc_resource_type {
+	ECC_RESOURCE_QPC,
+	ECC_RESOURCE_CQC,
+	ECC_RESOURCE_MPT,
+	ECC_RESOURCE_SRQC,
+	ECC_RESOURCE_GMV,
+	ECC_RESOURCE_QPC_TIMER,
+	ECC_RESOURCE_CQC_TIMER,
+	ECC_RESOURCE_SCCC,
+	ECC_RESOURCE_COUNT,
+};
+
+static const struct {
+	const char *name;
+	u8 read_bt0_op;
+	u8 write_bt0_op;
+} fmea_ram_res[] = {
+	{ "ECC_RESOURCE_QPC",
+	  HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 },
+	{ "ECC_RESOURCE_CQC",
+	  HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 },
+	{ "ECC_RESOURCE_MPT",
+	  HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 },
+	{ "ECC_RESOURCE_SRQC",
+	  HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 },
+	/* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */
+	{ "ECC_RESOURCE_GMV",
+	  0, 0 },
+	{ "ECC_RESOURCE_QPC_TIMER",
+	  HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 },
+	{ "ECC_RESOURCE_CQC_TIMER",
+	  HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 },
+	{ "ECC_RESOURCE_SCCC",
+	  HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 },
+};
+
 static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
 				   struct ib_sge *sg)
 {
@@ -6017,6 +6053,142 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev,
 	return IRQ_RETVAL(int_work);
 }
 
+static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev,
+			      struct fmea_ram_ecc *ecc_info)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true);
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret)
+		return ret;
+
+	ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR);
+	ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE);
+	ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG);
+
+	return 0;
+}
+
+static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
+	u32 addr_upper;
+	u32 addr_low;
+	int ret;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true);
+	hr_reg_write(req, CFG_GMV_BT_IDX, idx);
+
+	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to execute cmd to read gmv, ret = %d.\n", ret);
+		return ret;
+	}
+
+	addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L);
+	addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H);
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false);
+	hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low);
+	hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper);
+	hr_reg_write(req, CFG_GMV_BT_IDX, idx);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
+static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data)
+{
+	if (res_type == ECC_RESOURCE_QPC_TIMER ||
+	    res_type == ECC_RESOURCE_CQC_TIMER ||
+	    res_type == ECC_RESOURCE_SCCC)
+		return le64_to_cpu(*data) << PAGE_SHIFT;
+
+	return le64_to_cpu(*data);
+}
+
+static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type,
+			       u32 index)
+{
+	u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op;
+	u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op;
+	struct hns_roce_cmd_mailbox *mailbox;
+	u64 addr;
+	int ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to execute cmd to read fmea ram, ret = %d.\n",
+			ret);
+		goto out;
+	}
+
+	addr = fmea_get_ram_res_addr(res_type, mailbox->buf);
+
+	ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index);
+	if (ret)
+		dev_err(hr_dev->dev,
+			"failed to execute cmd to write fmea ram, ret = %d.\n",
+			ret);
+
+out:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	return ret;
+}
+
+static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev,
+				 struct fmea_ram_ecc *ecc_info)
+{
+	u32 res_type = ecc_info->res_type;
+	u32 index = ecc_info->index;
+	int ret;
+
+	BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT);
+
+	if (res_type >= ECC_RESOURCE_COUNT) {
+		dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n",
+			res_type);
+		return;
+	}
+
+	if (res_type == ECC_RESOURCE_GMV)
+		ret = fmea_recover_gmv(hr_dev, index);
+	else
+		ret = fmea_recover_others(hr_dev, res_type, index);
+	if (ret)
+		dev_err(hr_dev->dev,
+			"failed to recover %s, index = %u, ret = %d.\n",
+			fmea_ram_res[res_type].name, index, ret);
+}
+
+static void fmea_ram_ecc_work(struct work_struct *ecc_work)
+{
+	struct hns_roce_dev *hr_dev =
+		container_of(ecc_work, struct hns_roce_dev, ecc_work);
+	struct fmea_ram_ecc ecc_info = {};
+
+	if (fmea_ram_ecc_query(hr_dev, &ecc_info)) {
+		dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n");
+		return;
+	}
+
+	if (!ecc_info.is_ecc_err) {
+		dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n");
+		return;
+	}
+
+	fmea_ram_ecc_recover(hr_dev, &ecc_info);
+}
+
 static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
 {
 	struct hns_roce_dev *hr_dev = dev_id;
@@ -6025,10 +6197,14 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
 
 	int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
 
-	if (int_st)
+	if (int_st) {
 		int_work = abnormal_interrupt_basic(hr_dev, int_st);
-	else
+	} else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
+		queue_work(hr_dev->irq_workq, &hr_dev->ecc_work);
+		int_work = IRQ_HANDLED;
+	} else {
 		dev_err(hr_dev->dev, "there is no abnormal irq found.\n");
+	}
 
 	return IRQ_RETVAL(int_work);
 }
@@ -6344,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
 		}
 	}
 
+	INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);
+
 	hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "failed to create irq workqueue.\n");
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index e618614..f96deba 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -250,6 +250,7 @@ enum hns_roce_opcode_type {
 	HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f,
 	HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
 	HNS_ROCE_OPC_EXT_CFG = 0x8512,
+	HNS_ROCE_QUERY_RAM_ECC = 0x8513,
 	HNS_SWITCH_PARAMETER_CFG = 0x1033,
 };
 
@@ -1107,6 +1108,11 @@ enum {
 #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
 #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
 
+/* Fields of HNS_ROCE_QUERY_RAM_ECC */
+#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0)
+#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32)
+#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64)
+
 struct hns_roce_cfg_sgid_tb {
 	__le32 table_idx_rsv;
 	__le32 vf_sgid_l;
@@ -1343,6 +1349,12 @@ struct hns_roce_dip {
 	struct list_head node; /* all dips are on a list */
 };
 
+struct fmea_ram_ecc {
+	u32 is_ecc_err;
+	u32 res_type;
+	u32 index;
+};
+
 /* only for RNR timeout issue of HIP08 */
 #define HNS_ROCE_CLOCK_ADJUST 1000
 #define HNS_ROCE_MAX_CQ_PERIOD 65
-- 
2.7.4
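
The recovery scheme above relies on a general property of ECC-protected
RAM: a read returns corrected data while the stored copy may still hold
the flipped bit, so writing the corrected data back "scrubs" the error.
That is why each recovery path is a read-BT0/write-BT0 (or CMQ
read/write) pair. A minimal user-space C model of the idea, purely
illustrative and not driver code, with the check bits faked by a golden
copy:

#include <stdint.h>
#include <stdio.h>

struct ecc_word {
	uint32_t data;    /* stored payload, possibly with one flipped bit */
	uint32_t golden;  /* stand-in for the real ECC check bits */
};

/* "Read": detect and correct a single-bit error in the returned value
 * only; the stored copy keeps the flipped bit, as real ECC RAM does. */
static uint32_t ecc_read(const struct ecc_word *w)
{
	uint32_t diff = w->data ^ w->golden;

	if (diff && !(diff & (diff - 1)))  /* exactly one bit differs */
		return w->data ^ diff;     /* corrected on the fly */
	return w->data;
}

/* "Scrub": write the corrected value back so the stored copy is clean. */
static void ecc_scrub(struct ecc_word *w)
{
	w->data = ecc_read(w);
}

int main(void)
{
	struct ecc_word w = { 0xabcd1234 ^ (1u << 7), 0xabcd1234 };

	printf("stored before scrub: 0x%08x\n", w.data);
	printf("read (corrected)   : 0x%08x\n", ecc_read(&w));
	ecc_scrub(&w);
	printf("stored after scrub : 0x%08x\n", w.data);
	return 0;
}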
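
For reference, the HNS_ROCE_QUERY_RAM_ECC response packs its three
fields into the first little-endian 32-bit words of the CMQ descriptor
data, matching the CMQ_REQ_FIELD_LOC(31, 0), (63, 32) and (95, 64)
definitions added by the patch. A self-contained sketch of that
decoding, where the struct name and the sample payload are made up for
illustration:

#include <stdint.h>
#include <stdio.h>

struct fmea_ram_ecc_demo {
	uint32_t is_ecc_err;  /* bits 31..0 of the payload */
	uint32_t res_type;    /* bits 63..32 */
	uint32_t index;       /* bits 95..64 */
};

static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | (uint32_t)p[1] << 8 |
	       (uint32_t)p[2] << 16 | (uint32_t)p[3] << 24;
}

static void decode(const uint8_t *desc_data, struct fmea_ram_ecc_demo *out)
{
	out->is_ecc_err = get_le32(desc_data);
	out->res_type = get_le32(desc_data + 4);
	out->index = get_le32(desc_data + 8);
}

int main(void)
{
	/* sample payload: error present, res_type 0 (QPC), index 42 */
	const uint8_t raw[12] = { 1, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0 };
	struct fmea_ram_ecc_demo e;

	decode(raw, &e);
	printf("err=%u type=%u index=%u\n", e.is_ecc_err, e.res_type, e.index);
	return 0;
}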
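
The BUILD_BUG_ON() in fmea_ram_ecc_recover() ties fmea_ram_res[] to
enum ecc_resource_type at compile time, while the res_type bounds check
rejects values reported by the hardware that the driver does not know
about. The same table-covers-enum pattern in standalone form, as a
sketch that uses C11 _Static_assert in place of the kernel macro and
invented names throughout:

#include <stdio.h>

enum res_type { RES_A, RES_B, RES_C, RES_COUNT };

static const char * const res_name[] = { "A", "B", "C" };

/* Compile-time equivalent of the kernel's BUILD_BUG_ON(): adding an
 * enum value without a matching table entry breaks the build. */
_Static_assert(sizeof(res_name) / sizeof(res_name[0]) == RES_COUNT,
	       "res_name[] must cover every enum res_type value");

int main(void)
{
	unsigned int t = RES_B;

	if (t >= RES_COUNT) {  /* runtime guard, as in the driver */
		fprintf(stderr, "unsupported type %u\n", t);
		return 1;
	}
	printf("recovering %s\n", res_name[t]);
	return 0;
}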