drm/amdgpu: add instance mask for RAS inject
author Tao Zhou <tao.zhou1@amd.com>
Mon, 27 Feb 2023 10:25:23 +0000 (18:25 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 14:37:10 +0000 (10:37 -0400)
Let the user select which instances to inject errors into through an
instance mask. For backward compatibility, the mask value is folded into
the sub block index, so the RAS TA interface does not change.
The user supplies a logical mask; the driver converts it to the physical
value before sending it to the RAS TA.

v2: update parameter name.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Stanley.Yang <Stanley.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c

index ec79a5c2f500c127d943ca6567d14e096dc9906d..59b8b26e2caf2790be7f02913883ea0cd563b638 100644 (file)
@@ -1672,14 +1672,33 @@ int psp_ras_initialize(struct psp_context *psp)
 }
 
 int psp_ras_trigger_error(struct psp_context *psp,
-                         struct ta_ras_trigger_error_input *info)
+                         struct ta_ras_trigger_error_input *info, uint32_t instance_mask)
 {
        struct ta_ras_shared_memory *ras_cmd;
+       struct amdgpu_device *adev = psp->adev;
        int ret;
+       uint32_t dev_mask;
 
        if (!psp->ras_context.context.initialized)
                return -EINVAL;
 
+       switch (info->block_id) {
+       case TA_RAS_BLOCK__GFX:
+               dev_mask = GET_MASK(GC, instance_mask);
+               break;
+       case TA_RAS_BLOCK__SDMA:
+               dev_mask = GET_MASK(SDMA0, instance_mask);
+               break;
+       default:
+               dev_mask = instance_mask;
+               break;
+       }
+
+       /* reuse sub_block_index for backward compatibility */
+       dev_mask <<= AMDGPU_RAS_INST_SHIFT;
+       dev_mask &= AMDGPU_RAS_INST_MASK;
+       info->sub_block_index |= dev_mask;
+
        ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf;
        memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
 
index 0a409da749d1095cb47cfd7ac577817c5e67af92..d84323923a3fce68edc744d017d0bb7278fa8e4a 100644 (file)
@@ -486,7 +486,7 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
 int psp_ras_enable_features(struct psp_context *psp,
                union ta_ras_cmd_input *info, bool enable);
 int psp_ras_trigger_error(struct psp_context *psp,
-                         struct ta_ras_trigger_error_input *info);
+                         struct ta_ras_trigger_error_input *info, uint32_t instance_mask);
 int psp_ras_terminate(struct psp_context *psp);
 
 int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
index 64f80e8cbd638e8c425482a403a3694d572ba515..7ae08f168f9903509c23b1d051a6f94477984766 100644 (file)
@@ -256,6 +256,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
        int block_id;
        uint32_t sub_block;
        u64 address, value;
+       /* default value is 0 if the mask is not set by user */
+       u32 instance_mask = 0;
 
        if (*pos)
                return -EINVAL;
@@ -306,7 +308,11 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                data->op = op;
 
                if (op == 2) {
-                       if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
+                       if (sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx 0x%x",
+                                  &sub_block, &address, &value, &instance_mask) != 4 &&
+                           sscanf(str, "%*s %*s %*s %u %llu %llu %u",
+                                  &sub_block, &address, &value, &instance_mask) != 4 &&
+                               sscanf(str, "%*s %*s %*s 0x%x 0x%llx 0x%llx",
                                   &sub_block, &address, &value) != 3 &&
                            sscanf(str, "%*s %*s %*s %u %llu %llu",
                                   &sub_block, &address, &value) != 3)
@@ -314,6 +320,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
                        data->head.sub_block_index = sub_block;
                        data->inject.address = address;
                        data->inject.value = value;
+                       data->inject.instance_mask = instance_mask;
                }
        } else {
                if (size < sizeof(*data))
@@ -341,7 +348,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
  * sub_block_index: some IPs have subcomponents, e.g. GFX, sDMA.
  * name: the name of IP.
  *
- * inject has two more members than head, they are address, value.
+ * inject has three more members than head, they are address, value and mask.
  * As their names indicate, inject operation will write the
  * value to the address.
  *
@@ -365,7 +372,7 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
  *
  *     echo "disable <block>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
  *     echo "enable  <block> <error>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
- *     echo "inject  <block> <error> <sub-block> <address> <value> > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
+ *     echo "inject  <block> <error> <sub-block> <address> <value> <mask>" > /sys/kernel/debug/dri/<N>/ras/ras_ctrl
  *
  * Where N, is the card which you want to affect.
  *
@@ -382,13 +389,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
  *
  * The sub-block is the sub-block index; pass 0 if there is no sub-block.
  * The address and value are hexadecimal numbers, leading 0x is optional.
+ * The mask is the instance mask; it is optional and defaults to 0 when omitted.
  *
  * For instance,
  *
  * .. code-block:: bash
  *
  *     echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
- *     echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
+ *     echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
  *     echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
  *
  * How to check the result of the operation?
@@ -1117,13 +1125,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 
        if (info->head.block == AMDGPU_RAS_BLOCK__GFX) {
                if (block_obj->hw_ops->ras_error_inject)
-                       ret = block_obj->hw_ops->ras_error_inject(adev, info);
+                       ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask);
        } else {
                /* If defined special ras_error_inject(e.g: xgmi), implement special ras_error_inject */
                if (block_obj->hw_ops->ras_error_inject)
-                       ret = block_obj->hw_ops->ras_error_inject(adev, &block_info);
+                       ret = block_obj->hw_ops->ras_error_inject(adev, &block_info,
+                                               info->instance_mask);
                else  /*If not defined .ras_error_inject, use default ras_error_inject*/
-                       ret = psp_ras_trigger_error(&adev->psp, &block_info);
+                       ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask);
        }
 
        if (ret)
index e96333d0c26948697b414b3b328fba0063af0b5b..bc43f7db17ccf1ccf680363be1f981bbc590c001 100644 (file)
 struct amdgpu_iv_entry;
 
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS          (0x1 << 0)
+/* position of instance value in sub_block_index of
+ * ta_ras_trigger_error_input, the sub block uses lower 12 bits
+ */
+#define AMDGPU_RAS_INST_MASK 0xfffff000
+#define AMDGPU_RAS_INST_SHIFT 0xc
 
 enum amdgpu_ras_block {
        AMDGPU_RAS_BLOCK__UMC = 0,
@@ -508,6 +513,7 @@ struct ras_inject_if {
        struct ras_common_if head;
        uint64_t address;
        uint64_t value;
+       uint32_t instance_mask;
 };
 
 struct ras_cure_if {
@@ -545,7 +551,8 @@ struct amdgpu_ras_block_object {
 };
 
 struct amdgpu_ras_block_hw_ops {
-       int  (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
+       int  (*ras_error_inject)(struct amdgpu_device *adev,
+                       void *inject_if, uint32_t instance_mask);
        void (*query_ras_error_count)(struct amdgpu_device *adev, void *ras_error_status);
        void (*query_ras_error_status)(struct amdgpu_device *adev);
        void (*query_ras_error_address)(struct amdgpu_device *adev, void *ras_error_status);
index 439925477fb89f0e6ca7f1a499322fba85838b27..85ee1af963dde85480f9002b64aeb9d89177ab29 100644 (file)
@@ -1014,7 +1014,8 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 }
 
 /* Trigger XGMI/WAFL error */
-static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,  void *inject_if)
+static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
+                       void *inject_if, uint32_t instance_mask)
 {
        int ret = 0;
        struct ta_ras_trigger_error_input *block_info =
@@ -1026,7 +1027,7 @@ static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,  void *injec
        if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
                dev_warn(adev->dev, "Failed to disallow XGMI power down");
 
-       ret = psp_ras_trigger_error(&adev->psp, block_info);
+       ret = psp_ras_trigger_error(&adev->psp, block_info, instance_mask);
 
        if (amdgpu_ras_intr_triggered())
                return ret;
index cc005e3bcd400fa401e60048f876d8068f68be58..de8e70b3db75c9c2d431941f2c9b6c136792e8b3 100644 (file)
@@ -770,7 +770,7 @@ static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
                                          void *ras_error_status);
 static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
-                                    void *inject_if);
+                                    void *inject_if, uint32_t instance_mask);
 static void gfx_v9_0_reset_ras_error_count(struct amdgpu_device *adev);
 
 static void gfx_v9_0_kiq_set_resources(struct amdgpu_ring *kiq_ring,
@@ -6335,7 +6335,7 @@ static const struct soc15_ras_field_entry gfx_v9_0_ras_fields[] = {
 };
 
 static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
-                                    void *inject_if)
+                                    void *inject_if, uint32_t instance_mask)
 {
        struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
        int ret;
@@ -6374,7 +6374,7 @@ static int gfx_v9_0_ras_error_inject(struct amdgpu_device *adev,
        block_info.value = info->value;
 
        mutex_lock(&adev->grbm_idx_mutex);
-       ret = psp_ras_trigger_error(&adev->psp, &block_info);
+       ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
        mutex_unlock(&adev->grbm_idx_mutex);
 
        return ret;
index c67e387a97f5c1c2c4949bca08affe6a3002bae9..59abe162bbaf97fcd0d1f72469e90eba55e513f7 100644 (file)
@@ -971,7 +971,7 @@ static void gfx_v9_4_reset_ras_error_count(struct amdgpu_device *adev)
 }
 
 static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
-                                    void *inject_if)
+                                    void *inject_if, uint32_t instance_mask)
 {
        struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
        int ret;
@@ -987,7 +987,7 @@ static int gfx_v9_4_ras_error_inject(struct amdgpu_device *adev,
        block_info.value = info->value;
 
        mutex_lock(&adev->grbm_idx_mutex);
-       ret = psp_ras_trigger_error(&adev->psp, &block_info);
+       ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
        mutex_unlock(&adev->grbm_idx_mutex);
 
        return ret;
index ec7c049c5952158709ed8a317b50d77a828153ab..4906affa6f8cbb81c7ac1eb0f8ed87aeb1ac0909 100644 (file)
@@ -1699,7 +1699,8 @@ static void gfx_v9_4_2_reset_ras_error_count(struct amdgpu_device *adev)
        gfx_v9_4_2_query_utc_edc_count(adev, NULL, NULL);
 }
 
-static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if)
+static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev,
+                       void *inject_if, uint32_t instance_mask)
 {
        struct ras_inject_if *info = (struct ras_inject_if *)inject_if;
        int ret;
@@ -1715,7 +1716,7 @@ static int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_
        block_info.value = info->value;
 
        mutex_lock(&adev->grbm_idx_mutex);
-       ret = psp_ras_trigger_error(&adev->psp, &block_info);
+       ret = psp_ras_trigger_error(&adev->psp, &block_info, instance_mask);
        mutex_unlock(&adev->grbm_idx_mutex);
 
        return ret;