drm/amdgpu: use scheduler fault instead of reset work
author Christian König <christian.koenig@amd.com>
Tue, 16 Oct 2018 11:08:21 +0000 (13:08 +0200)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 5 Nov 2018 19:21:03 +0000 (14:21 -0500)
Signal a fault to the scheduler on an illegal instruction or register
access violation instead of kicking off the reset handler directly.

Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
drivers/gpu/drm/amd/amdgpu/cik_sdma.c
drivers/gpu/drm/amd/amdgpu/gfx_v6_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

index fb922a8..9348eb5 100644 (file)
@@ -830,7 +830,6 @@ struct amdgpu_device {
        bool                            need_dma32;
        bool                            need_swiotlb;
        bool                            accel_working;
-       struct work_struct              reset_work;
        struct notifier_block           acpi_nb;
        struct amdgpu_i2c_chan          *i2c_bus[AMDGPU_MAX_I2C_BUS];
        struct amdgpu_debugfs           debugfs[AMDGPU_DEBUGFS_MAX_COMPONENTS];
index 52c17f6..6b6524f 100644 (file)
@@ -94,23 +94,6 @@ static void amdgpu_hotplug_work_func(struct work_struct *work)
 }
 
 /**
- * amdgpu_irq_reset_work_func - execute GPU reset
- *
- * @work: work struct pointer
- *
- * Execute scheduled GPU reset (Cayman+).
- * This function is called when the IRQ handler thinks we need a GPU reset.
- */
-static void amdgpu_irq_reset_work_func(struct work_struct *work)
-{
-       struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
-                                                 reset_work);
-
-       if (!amdgpu_sriov_vf(adev) && amdgpu_device_should_recover_gpu(adev))
-               amdgpu_device_gpu_recover(adev, NULL);
-}
-
-/**
  * amdgpu_irq_disable_all - disable *all* interrupts
  *
  * @adev: amdgpu device pointer
@@ -262,15 +245,12 @@ int amdgpu_irq_init(struct amdgpu_device *adev)
                                amdgpu_hotplug_work_func);
        }
 
-       INIT_WORK(&adev->reset_work, amdgpu_irq_reset_work_func);
-
        adev->irq.installed = true;
        r = drm_irq_install(adev->ddev, adev->ddev->pdev->irq);
        if (r) {
                adev->irq.installed = false;
                if (!amdgpu_device_has_dc_support(adev))
                        flush_work(&adev->hotplug_work);
-               cancel_work_sync(&adev->reset_work);
                return r;
        }
        adev->ddev->max_vblank_count = 0x00ffffff;
@@ -299,7 +279,6 @@ void amdgpu_irq_fini(struct amdgpu_device *adev)
                        pci_disable_msi(adev->pdev);
                if (!amdgpu_device_has_dc_support(adev))
                        flush_work(&adev->hotplug_work);
-               cancel_work_sync(&adev->reset_work);
        }
 
        for (i = 0; i < AMDGPU_IRQ_CLIENTID_MAX; ++i) {
index b918c88..32eb43d 100644 (file)
@@ -1214,8 +1214,11 @@ static int cik_sdma_process_illegal_inst_irq(struct amdgpu_device *adev,
                                             struct amdgpu_irq_src *source,
                                             struct amdgpu_iv_entry *entry)
 {
+       u8 instance_id;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+       instance_id = (entry->ring_id & 0x3) >> 0;
+       drm_sched_fault(&adev->sdma.instance[instance_id].ring.sched);
        return 0;
 }
 
index d76eb27..622dd70 100644 (file)
@@ -3393,12 +3393,31 @@ static int gfx_v6_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v6_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       struct amdgpu_ring *ring;
+       /* Pick the ring named by entry->ring_id, then signal a fault to that ring's scheduler. */
+       switch (entry->ring_id) {
+       case 0: /* ring_id 0 selects gfx ring 0 */
+               ring = &adev->gfx.gfx_ring[0];
+               break;
+       case 1:
+       case 2: /* ring_id 1 and 2 select compute_ring[0] and compute_ring[1] */
+               ring = &adev->gfx.compute_ring[entry->ring_id - 1];
+               break;
+       default: /* any other ring_id: return without signaling anything */
+               return;
+       }
+       drm_sched_fault(&ring->sched);
+}
+
 static int gfx_v6_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v6_0_fault(adev, entry);
        return 0;
 }
 
@@ -3407,7 +3426,7 @@ static int gfx_v6_0_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal instruction in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v6_0_fault(adev, entry);
        return 0;
 }
 
index 0e72bc0..9fadb32 100644 (file)
@@ -4959,12 +4959,36 @@ static int gfx_v7_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v7_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       struct amdgpu_ring *ring;
+       u8 me_id, pipe_id;
+       int i;
+       /* Decode entry->ring_id and signal a fault to the scheduler of each matching ring. */
+       me_id = (entry->ring_id & 0x0c) >> 2; /* bits 3:2 of ring_id */
+       pipe_id = (entry->ring_id & 0x03) >> 0; /* bits 1:0 of ring_id */
+       switch (me_id) { /* no default case: me_id 3 signals nothing */
+       case 0: /* me_id 0 always maps to gfx ring 0; pipe_id is not consulted */
+               drm_sched_fault(&adev->gfx.gfx_ring[0].sched);
+               break;
+       case 1:
+       case 2: /* me_id 1 and 2: scan the compute rings */
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       ring = &adev->gfx.compute_ring[i];
+                       if ((ring->me == me_id) && (ring->pipe == pipe_id)) /* queue is not compared; loop keeps going, so every ring on this me/pipe is signaled */
+                               drm_sched_fault(&ring->sched);
+               }
+               break;
+       }
+}
+
 static int gfx_v7_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v7_0_fault(adev, entry);
        return 0;
 }
 
@@ -4974,7 +4998,7 @@ static int gfx_v7_0_priv_inst_irq(struct amdgpu_device *adev,
 {
        DRM_ERROR("Illegal instruction in command stream\n");
        // XXX soft reset the gfx block only
-       schedule_work(&adev->reset_work);
+       gfx_v7_0_fault(adev, entry);
        return 0;
 }
 
index 617b0c8..ba614f2 100644 (file)
@@ -6738,12 +6738,39 @@ static int gfx_v8_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v8_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       u8 me_id, pipe_id, queue_id;
+       struct amdgpu_ring *ring;
+       int i;
+
+       me_id = (entry->ring_id & 0x0c) >> 2; /* bits 3:2 of ring_id */
+       pipe_id = (entry->ring_id & 0x03) >> 0; /* bits 1:0 of ring_id */
+       queue_id = (entry->ring_id & 0x70) >> 4; /* bits 6:4 of ring_id */
+       /* Signal a fault to the scheduler of the ring(s) identified by the decoded ids. */
+       switch (me_id) { /* no default case: me_id 3 signals nothing */
+       case 0: /* me_id 0 always maps to gfx ring 0; pipe_id/queue_id are not consulted */
+               drm_sched_fault(&adev->gfx.gfx_ring[0].sched);
+               break;
+       case 1:
+       case 2: /* me_id 1 and 2: scan the compute rings */
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       ring = &adev->gfx.compute_ring[i];
+                       if (ring->me == me_id && ring->pipe == pipe_id &&
+                           ring->queue == queue_id) /* me, pipe and queue must all match */
+                               drm_sched_fault(&ring->sched);
+               }
+               break;
+       }
+}
+
 static int gfx_v8_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v8_0_fault(adev, entry);
        return 0;
 }
 
@@ -6752,7 +6779,7 @@ static int gfx_v8_0_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal instruction in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v8_0_fault(adev, entry);
        return 0;
 }
 
index 6d7baf5..0ce1e14 100644 (file)
@@ -4695,12 +4695,39 @@ static int gfx_v9_0_eop_irq(struct amdgpu_device *adev,
        return 0;
 }
 
+static void gfx_v9_0_fault(struct amdgpu_device *adev,
+                          struct amdgpu_iv_entry *entry)
+{
+       u8 me_id, pipe_id, queue_id;
+       struct amdgpu_ring *ring;
+       int i;
+
+       me_id = (entry->ring_id & 0x0c) >> 2; /* bits 3:2 of ring_id */
+       pipe_id = (entry->ring_id & 0x03) >> 0; /* bits 1:0 of ring_id */
+       queue_id = (entry->ring_id & 0x70) >> 4; /* bits 6:4 of ring_id */
+       /* Signal a fault to the scheduler of the ring(s) identified by the decoded ids. */
+       switch (me_id) { /* no default case: me_id 3 signals nothing */
+       case 0: /* me_id 0 always maps to gfx ring 0; pipe_id/queue_id are not consulted */
+               drm_sched_fault(&adev->gfx.gfx_ring[0].sched);
+               break;
+       case 1:
+       case 2: /* me_id 1 and 2: scan the compute rings */
+               for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+                       ring = &adev->gfx.compute_ring[i];
+                       if (ring->me == me_id && ring->pipe == pipe_id &&
+                           ring->queue == queue_id) /* me, pipe and queue must all match */
+                               drm_sched_fault(&ring->sched);
+               }
+               break;
+       }
+}
+
 static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev,
                                 struct amdgpu_irq_src *source,
                                 struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal register access in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v9_0_fault(adev, entry);
        return 0;
 }
 
@@ -4709,7 +4736,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry)
 {
        DRM_ERROR("Illegal instruction in command stream\n");
-       schedule_work(&adev->reset_work);
+       gfx_v9_0_fault(adev, entry);
        return 0;
 }
 
index 2d4770e..bedbd5f 100644 (file)
@@ -1105,8 +1105,14 @@ static int sdma_v2_4_process_illegal_inst_irq(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              struct amdgpu_iv_entry *entry)
 {
+       u8 instance_id, queue_id;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+       instance_id = (entry->ring_id & 0x3) >> 0;
+       queue_id = (entry->ring_id & 0xc) >> 2;
+
+       if (instance_id <= 1 && queue_id == 0)
+               drm_sched_fault(&adev->sdma.instance[instance_id].ring.sched);
        return 0;
 }
 
index 6fb3eda..415968d 100644 (file)
@@ -1440,8 +1440,14 @@ static int sdma_v3_0_process_illegal_inst_irq(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              struct amdgpu_iv_entry *entry)
 {
+       u8 instance_id, queue_id;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+       instance_id = (entry->ring_id & 0x3) >> 0;
+       queue_id = (entry->ring_id & 0xc) >> 2;
+
+       if (instance_id <= 1 && queue_id == 0)
+               drm_sched_fault(&adev->sdma.instance[instance_id].ring.sched);
        return 0;
 }
 
index c0d1650..88d9343 100644 (file)
@@ -1717,12 +1717,29 @@ static int sdma_v4_0_process_illegal_inst_irq(struct amdgpu_device *adev,
                                              struct amdgpu_irq_src *source,
                                              struct amdgpu_iv_entry *entry)
 {
+       int instance;
+
        DRM_ERROR("Illegal instruction in SDMA command stream\n");
-       schedule_work(&adev->reset_work);
+
+       switch (entry->client_id) {
+       case SOC15_IH_CLIENTID_SDMA0:
+               instance = 0;
+               break;
+       case SOC15_IH_CLIENTID_SDMA1:
+               instance = 1;
+               break;
+       default:
+               return 0;
+       }
+
+       switch (entry->ring_id) {
+       case 0:
+               drm_sched_fault(&adev->sdma.instance[instance].ring.sched);
+               break;
+       }
        return 0;
 }
 
-
 static void sdma_v4_0_update_medium_grain_clock_gating(
                struct amdgpu_device *adev,
                bool enable)