#define AMDGPU_RESET_MAGIC_NUM 64
#define AMDGPU_MAX_DF_PERFMONS 4
#define AMDGPU_PRODUCT_NAME_LEN 64
-struct amdgpu_reset_domain {
- struct workqueue_struct *wq;
-};
+struct amdgpu_reset_domain;
struct amdgpu_device {
struct device *dev;
uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
bool ram_is_direct_mapped;
- struct amdgpu_reset_domain reset_domain;
+ struct amdgpu_reset_domain *reset_domain;
};
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
ring->num_hw_submission, amdgpu_job_hang_limit,
- timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+ timeout, adev->reset_domain->wq, ring->sched_score, ring->name);
if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n",
ring->name);
if (r)
goto init_failed;
+ /*
+ * In case of XGMI, grab an extra reference on the hive's reset domain
+ * for this device.
+ */
if (adev->gmc.xgmi.num_physical_nodes > 1) {
- struct amdgpu_hive_info *hive;
+ if (amdgpu_xgmi_add_device(adev) == 0) {
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
- amdgpu_xgmi_add_device(adev);
+ if (!hive) {
+ r = -ENOENT;
+ goto init_failed;
+ }
+ if (!hive->reset_domain ||
+ !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+ r = -ENOENT;
+ amdgpu_put_xgmi_hive(hive);
+ goto init_failed;
+ }
- hive = amdgpu_get_xgmi_hive(adev);
- if (!hive || !hive->reset_domain.wq) {
- DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
- r = -EINVAL;
- goto init_failed;
- }
-
- adev->reset_domain.wq = hive->reset_domain.wq;
- } else {
- adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
- if (!adev->reset_domain.wq) {
- r = -ENOMEM;
- goto init_failed;
+ /* Drop the early temporary reset domain we created for the device */
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = hive->reset_domain;
+ /* Release the hive reference taken by amdgpu_get_xgmi_hive() above */
+ amdgpu_put_xgmi_hive(hive);
}
}
return r;
}
+ /*
+ * The reset domain needs to be present early, before the XGMI hive is
+ * discovered (if any) and initialized, so that the reset semaphore and
+ * the in_gpu_reset flag can be used early on during init.
+ */
+ adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
+ if (!adev->reset_domain)
+ return -ENOMEM;
+
/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
if (adev->mman.discovery_bin)
amdgpu_discovery_fini(adev);
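+ /* Drop this device's reference on its reset domain */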
+ amdgpu_reset_put_reset_domain(adev->reset_domain);
+ adev->reset_domain = NULL;
+
kfree(adev->pci_state);
}
INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
- if (!queue_work(adev->reset_domain.wq, &work.base))
+ if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
return -EAGAIN;
flush_work(&work.base);
return reset_handler->restore_hwcontext(adev->reset_cntl,
reset_context);
}
+
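+/*
+ * kref release callback, invoked once the last reference to the reset
+ * domain is dropped via amdgpu_reset_put_reset_domain().
+ */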
+void amdgpu_reset_destroy_reset_domain(struct kref *ref)
+{
+ struct amdgpu_reset_domain *reset_domain = container_of(ref,
+ struct amdgpu_reset_domain,
+ refcount);
+ if (reset_domain->wq)
+ destroy_workqueue(reset_domain->wq);
+
+ kvfree(reset_domain);
+}
+
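+/*
+ * Create a refcounted reset domain backed by an ordered, single-threaded
+ * workqueue, so reset work items for the domain run strictly one at a
+ * time. The caller owns the initial reference and releases it with
+ * amdgpu_reset_put_reset_domain().
+ */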
+struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
+ char *wq_name)
+{
+ struct amdgpu_reset_domain *reset_domain;
+
+ reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL);
+ if (!reset_domain) {
+ DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
+ return NULL;
+ }
+
+ reset_domain->type = type;
+ kref_init(&reset_domain->refcount);
+
+ reset_domain->wq = create_singlethread_workqueue(wq_name);
+ if (!reset_domain->wq) {
+ DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
+ amdgpu_reset_put_reset_domain(reset_domain);
+ return NULL;
+ }
+
+ return reset_domain;
+}
+
void (*async_reset)(struct work_struct *work);
};
+
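+/*
+ * A reset domain groups devices that must be reset together: either a
+ * single device, or all devices in an XGMI hive.
+ */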
+enum amdgpu_reset_domain_type {
+ SINGLE_DEVICE,
+ XGMI_HIVE
+};
+
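+/*
+ * Refcounted container for shared reset state; wq is the ordered
+ * workqueue on which all reset work for the domain is serialized.
+ */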
+struct amdgpu_reset_domain {
+ struct kref refcount;
+ struct workqueue_struct *wq;
+ enum amdgpu_reset_domain_type type;
+};
+
int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev);
int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
struct amdgpu_reset_handler *handler);
+struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
+ char *wq_name);
+
+void amdgpu_reset_destroy_reset_domain(struct kref *ref);
+
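+/*
+ * Try to take a reference on the domain; returns false if the refcount
+ * has already dropped to zero (i.e. the domain is being destroyed).
+ */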
+static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain)
+{
+ return kref_get_unless_zero(&domain->refcount) != 0;
+}
+
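+/* Drop a reference; the final put frees the domain via the kref release */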
+static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain)
+{
+ kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain);
+}
+
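+/*
+ * Queue reset work on the domain's ordered workqueue; like queue_work(),
+ * returns false if the work item was already pending.
+ */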
+static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
+ struct work_struct *work)
+{
+ return queue_work(domain->wq, work);
+}
+
#endif
#include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h"
+#include "amdgpu_reset.h"
+
#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
struct amdgpu_hive_info *hive = container_of(
kobj, struct amdgpu_hive_info, kobj);
+ if (hive->reset_domain)
+ amdgpu_reset_put_reset_domain(hive->reset_domain);
+ hive->reset_domain = NULL;
+
mutex_destroy(&hive->hive_lock);
kfree(hive);
}
goto pro_end;
}
- hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0);
- if (!hive->reset_domain.wq) {
- dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n");
- kfree(hive);
- hive = NULL;
- goto pro_end;
+ /*
+ * Avoid recreating the reset domain when the hive is reconstructed for
+ * the case of resetting the devices in the XGMI hive during probe for
+ * SRIOV.
+ * See https://www.spinics.net/lists/amd-gfx/msg58836.html
+ */
+ if (adev->reset_domain->type != XGMI_HIVE) {
+ hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
+ if (!hive->reset_domain) {
+ dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
+ ret = -ENOMEM;
+ /* The hive is freed by its kobject release; don't kfree() it again */
+ kobject_put(&hive->kobj);
+ hive = NULL;
+ goto pro_end;
+ }
+ } else {
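+ /*
+ * The device already carries a hive-type reset domain from an earlier
+ * probe pass; share it with the reconstructed hive.
+ */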
+ amdgpu_reset_get_reset_domain(adev->reset_domain);
+ hive->reset_domain = adev->reset_domain;
}
hive->hive_id = adev->gmc.xgmi.hive_id;
AMDGPU_XGMI_PSTATE_UNKNOWN
} pstate;
- struct amdgpu_reset_domain reset_domain;
+ struct amdgpu_reset_domain *reset_domain;
};
struct amdgpu_pcs_ras_field {
#include "soc15_common.h"
#include "mxgpu_ai.h"
+#include "amdgpu_reset.h"
+
static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
{
WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
switch (event) {
case IDH_FLR_NOTIFICATION:
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
- WARN_ONCE(!queue_work(adev->reset_domain.wq,
- &adev->virt.flr_work),
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
"Failed to queue work! at %s",
__func__);
break;
#include "soc15_common.h"
#include "mxgpu_nv.h"
+#include "amdgpu_reset.h"
+
static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev)
{
WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
switch (event) {
case IDH_FLR_NOTIFICATION:
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
- WARN_ONCE(!queue_work(adev->reset_domain.wq,
- &adev->virt.flr_work),
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
"Failed to queue work! at %s",
__func__);
break;
#include "smu/smu_7_1_3_d.h"
#include "mxgpu_vi.h"
+#include "amdgpu_reset.h"
+
/* VI golden setting */
static const u32 xgpu_fiji_mgcg_cgcg_init[] = {
mmRLC_CGTT_MGCG_OVERRIDE, 0xffffffff, 0xffffffff,
/* only handle FLR_NOTIFY now */
if (!r && !amdgpu_in_reset(adev))
- WARN_ONCE(!queue_work(adev->reset_domain.wq,
- &adev->virt.flr_work),
+ WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->virt.flr_work),
"Failed to queue work! at %s",
__func__);
}