drm/amdgpu: Support passing amdgpu critical error to host via GPU Mailbox.
authorGavin Wan <Gavin.Wan@amd.com>
Fri, 23 Jun 2017 17:55:15 +0000 (13:55 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 14 Jul 2017 15:05:52 +0000 (11:05 -0400)
This feature works for SRIOV enviroment. For non-SRIOV enviroment, the
trans_error function does nothing.

The error information includes error_code (16bit), error_flags(16bit)
and error_data(64bit). Since there are not many errors, we keep the
errors in an array and transfer all errors to Host before amdgpu
initialization function (amdgpu_device_init) exit.

Signed-off-by: Gavin Wan <Gavin.Wan@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/Makefile
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c [new file with mode: 0644]
drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h [new file with mode: 0644]
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c
drivers/gpu/drm/amd/amdgpu/mxgpu_vi.h

index faea634..658bac0 100644 (file)
@@ -25,7 +25,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
        amdgpu_prime.o amdgpu_vm.o amdgpu_ib.o amdgpu_pll.o \
        amdgpu_ucode.o amdgpu_bo_list.o amdgpu_ctx.o amdgpu_sync.o \
        amdgpu_gtt_mgr.o amdgpu_vram_mgr.o amdgpu_virt.o amdgpu_atomfirmware.o \
-       amdgpu_queue_mgr.o
+       amdgpu_queue_mgr.o amdgpu_vf_error.o
 
 # add asic specific block
 amdgpu-$(CONFIG_DRM_AMDGPU_CIK)+= cik.o cik_ih.o kv_smc.o kv_dpm.o \
index ae4387f..88e45c6 100644 (file)
@@ -53,6 +53,7 @@
 #include "bif/bif_4_1_d.h"
 #include <linux/pci.h>
 #include <linux/firmware.h>
+#include "amdgpu_vf_error.h"
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
@@ -2134,6 +2135,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        r = amdgpu_atombios_init(adev);
        if (r) {
                dev_err(adev->dev, "amdgpu_atombios_init failed\n");
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
                goto failed;
        }
 
@@ -2144,6 +2146,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        if (amdgpu_vpost_needed(adev)) {
                if (!adev->bios) {
                        dev_err(adev->dev, "no vBIOS found\n");
+                       amdgpu_vf_error_put(AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
                        r = -EINVAL;
                        goto failed;
                }
@@ -2151,6 +2154,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
                if (r) {
                        dev_err(adev->dev, "gpu post error!\n");
+                       amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
                        goto failed;
                }
        } else {
@@ -2162,7 +2166,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                r = amdgpu_atombios_get_clock_info(adev);
                if (r) {
                        dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
-                       return r;
+                       amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
+                       goto failed;
                }
                /* init i2c buses */
                amdgpu_atombios_i2c_init(adev);
@@ -2172,6 +2177,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        r = amdgpu_fence_driver_init(adev);
        if (r) {
                dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
                goto failed;
        }
 
@@ -2181,6 +2187,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        r = amdgpu_init(adev);
        if (r) {
                dev_err(adev->dev, "amdgpu_init failed\n");
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
                amdgpu_fini(adev);
                goto failed;
        }
@@ -2200,6 +2207,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        r = amdgpu_ib_pool_init(adev);
        if (r) {
                dev_err(adev->dev, "IB initialization failed (%d).\n", r);
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
                goto failed;
        }
 
@@ -2244,12 +2252,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        r = amdgpu_late_init(adev);
        if (r) {
                dev_err(adev->dev, "amdgpu_late_init failed\n");
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
                goto failed;
        }
 
        return 0;
 
 failed:
+       amdgpu_vf_error_trans_all(adev);
        if (runtime)
                vga_switcheroo_fini_domain_pm_ops(adev->dev);
        return r;
@@ -2937,6 +2947,7 @@ out:
                }
        } else {
                dev_err(adev->dev, "asic resume failed (%d).\n", r);
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
                for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                        if (adev->rings[i] && adev->rings[i]->sched.thread) {
                                kthread_unpark(adev->rings[i]->sched.thread);
@@ -2947,12 +2958,16 @@ out:
        drm_helper_resume_force_mode(adev->ddev);
 
        ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
-       if (r)
+       if (r) {
                /* bad news, how to tell it to userspace ? */
                dev_info(adev->dev, "GPU reset failed\n");
-       else
+               amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+       }
+       else {
                dev_info(adev->dev, "GPU reset successed!\n");
+       }
 
+       amdgpu_vf_error_trans_all(adev);
        return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c
new file mode 100644 (file)
index 0000000..45ac918
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_vf_error.h"
+#include "mxgpu_ai.h"
+
+#define AMDGPU_VF_ERROR_ENTRY_SIZE    16 
+
+/* struct error_entry - amdgpu VF error information. */
+struct amdgpu_vf_error_buffer {
+       int read_count;
+       int write_count;
+       uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
+       uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
+       uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
+};
+
+struct amdgpu_vf_error_buffer admgpu_vf_errors;
+
+
+void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data)
+{
+       int index;
+       uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);
+
+       index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
+       admgpu_vf_errors.code [index] = error_code;
+       admgpu_vf_errors.flags [index] = error_flags;
+       admgpu_vf_errors.data [index] = error_data;
+       admgpu_vf_errors.write_count ++;
+}
+
+
+void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
+{
+       /* u32 pf2vf_flags = 0; */
+       u32 data1, data2, data3;
+       int index;
+
+       if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
+               return;
+       }
+/*
+       TODO: Enable these code when pv2vf_info is merged
+       AMDGPU_FW_VRAM_PF2VF_READ (adev, feature_flags, &pf2vf_flags);
+       if (!(pf2vf_flags & AMDGIM_FEATURE_ERROR_LOG_COLLECT)) {
+               return;
+       }
+*/
+       /* The errors are overlay of array, correct read_count as full. */
+       if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
+               admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
+       }
+
+       while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) {
+               index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
+               data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]);
+               data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF;
+               data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF;
+
+               adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
+               admgpu_vf_errors.read_count ++;
+       }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.h
new file mode 100644 (file)
index 0000000..2a3278e
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __VF_ERROR_H__
+#define __VF_ERROR_H__
+
+#define AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(c,f)    (((c & 0xFFFF) << 16) | (f & 0xFFFF))
+#define AMDGIM_ERROR_CODE(t,c)       (((t&0xF)<<12)|(c&0xFFF))
+
+/* Please keep enum same as AMD GIM driver */
+enum AMDGIM_ERROR_VF {
+       AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL = 0,
+       AMDGIM_ERROR_VF_NO_VBIOS,
+       AMDGIM_ERROR_VF_GPU_POST_ERROR,
+       AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL,
+       AMDGIM_ERROR_VF_FENCE_INIT_FAIL,
+
+       AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL,
+       AMDGIM_ERROR_VF_IB_INIT_FAIL,
+       AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL,
+       AMDGIM_ERROR_VF_ASIC_RESUME_FAIL,
+       AMDGIM_ERROR_VF_GPU_RESET_FAIL,
+
+       AMDGIM_ERROR_VF_TEST,
+       AMDGIM_ERROR_VF_MAX
+};
+
+enum AMDGIM_ERROR_CATEGORY {
+       AMDGIM_ERROR_CATEGORY_NON_USED = 0,
+       AMDGIM_ERROR_CATEGORY_GIM,
+       AMDGIM_ERROR_CATEGORY_PF,
+       AMDGIM_ERROR_CATEGORY_VF,
+       AMDGIM_ERROR_CATEGORY_VBIOS,
+       AMDGIM_ERROR_CATEGORY_MONITOR,
+
+       AMDGIM_ERROR_CATEGORY_MAX
+};
+
+void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data);
+void amdgpu_vf_error_trans_all (struct amdgpu_device *adev);
+
+#endif /* __VF_ERROR_H__ */
index 9e1062e..e5b1baf 100644 (file)
@@ -43,6 +43,7 @@ struct amdgpu_virt_ops {
        int (*req_full_gpu)(struct amdgpu_device *adev, bool init);
        int (*rel_full_gpu)(struct amdgpu_device *adev, bool init);
        int (*reset_gpu)(struct amdgpu_device *adev);
+       void (*trans_msg)(struct amdgpu_device *adev, u32 req, u32 data1, u32 data2, u32 data3);
 };
 
 /* GPU virtualization */
index bde3ca3..2812d88 100644 (file)
@@ -72,21 +72,6 @@ static void xgpu_ai_mailbox_set_valid(struct amdgpu_device *adev, bool val)
                      reg);
 }
 
-static void xgpu_ai_mailbox_trans_msg(struct amdgpu_device *adev,
-                                     enum idh_request req)
-{
-       u32 reg;
-
-       reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
-                                            mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
-       reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0,
-                           MSGBUF_DATA, req);
-       WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0),
-                     reg);
-
-       xgpu_ai_mailbox_set_valid(adev, true);
-}
-
 static int xgpu_ai_mailbox_rcv_msg(struct amdgpu_device *adev,
                                   enum idh_event event)
 {
@@ -154,13 +139,25 @@ static int xgpu_ai_poll_msg(struct amdgpu_device *adev, enum idh_event event)
        return r;
 }
 
-
-static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
-                                       enum idh_request req)
-{
+static void xgpu_ai_mailbox_trans_msg (struct amdgpu_device *adev,
+             enum idh_request req, u32 data1, u32 data2, u32 data3) {
+       u32 reg;
        int r;
 
-       xgpu_ai_mailbox_trans_msg(adev, req);
+       reg = RREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0,
+                                            mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0));
+       reg = REG_SET_FIELD(reg, BIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0,
+                           MSGBUF_DATA, req);
+       WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW0),
+                     reg);
+       WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW1),
+                               data1);
+       WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW2),
+                               data2);
+       WREG32_NO_KIQ(SOC15_REG_OFFSET(NBIO, 0, mmBIF_BX_PF0_MAILBOX_MSGBUF_TRN_DW3),
+                               data3);
+
+       xgpu_ai_mailbox_set_valid(adev, true);
 
        /* start to poll ack */
        r = xgpu_ai_poll_ack(adev);
@@ -168,6 +165,14 @@ static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
                pr_err("Doesn't get ack from pf, continue\n");
 
        xgpu_ai_mailbox_set_valid(adev, false);
+}
+
+static int xgpu_ai_send_access_requests(struct amdgpu_device *adev,
+                                       enum idh_request req)
+{
+       int r;
+
+       xgpu_ai_mailbox_trans_msg(adev, req, 0, 0, 0);
 
        /* start to check msg if request is idh_req_gpu_init_access */
        if (req == IDH_REQ_GPU_INIT_ACCESS ||
@@ -342,4 +347,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = {
        .req_full_gpu   = xgpu_ai_request_full_gpu_access,
        .rel_full_gpu   = xgpu_ai_release_full_gpu_access,
        .reset_gpu = xgpu_ai_request_reset,
+       .trans_msg = xgpu_ai_mailbox_trans_msg,
 };
index 9aefc44..1e91b9a 100644 (file)
@@ -31,7 +31,9 @@ enum idh_request {
        IDH_REL_GPU_INIT_ACCESS,
        IDH_REQ_GPU_FINI_ACCESS,
        IDH_REL_GPU_FINI_ACCESS,
-       IDH_REQ_GPU_RESET_ACCESS
+       IDH_REQ_GPU_RESET_ACCESS,
+
+       IDH_LOG_VF_ERROR       = 200,
 };
 
 enum idh_event {
index 171a658..c25a831 100644 (file)
@@ -613,4 +613,5 @@ const struct amdgpu_virt_ops xgpu_vi_virt_ops = {
        .req_full_gpu           = xgpu_vi_request_full_gpu_access,
        .rel_full_gpu           = xgpu_vi_release_full_gpu_access,
        .reset_gpu              = xgpu_vi_request_reset,
+       .trans_msg              = NULL, /* Does not need to trans VF errors to host. */
 };
index 2db7411..c791d73 100644 (file)
@@ -32,7 +32,9 @@ enum idh_request {
        IDH_REL_GPU_INIT_ACCESS,
        IDH_REQ_GPU_FINI_ACCESS,
        IDH_REL_GPU_FINI_ACCESS,
-       IDH_REQ_GPU_RESET_ACCESS
+       IDH_REQ_GPU_RESET_ACCESS,
+
+       IDH_LOG_VF_ERROR       = 200,
 };
 
 /* VI mailbox messages data */