drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33 #include <linux/iommu.h>
  34 #include <linux/pci.h>
  35 #include <linux/devcoredump.h>
  36 #include <generated/utsrelease.h>
  37 #include <linux/pci-p2pdma.h>
  38
  39 #include <drm/drm_aperture.h>
  40 #include <drm/drm_atomic_helper.h>
  41 #include <drm/drm_probe_helper.h>
  42 #include <drm/amdgpu_drm.h>
  43 #include <linux/vgaarb.h>
  44 #include <linux/vga_switcheroo.h>
  45 #include <linux/efi.h>
  46 #include "amdgpu.h"
  47 #include "amdgpu_trace.h"
  48 #include "amdgpu_i2c.h"
  49 #include "atom.h"
  50 #include "amdgpu_atombios.h"
  51 #include "amdgpu_atomfirmware.h"
  52 #include "amd_pcie.h"
  53 #ifdef CONFIG_DRM_AMDGPU_SI
  54 #include "si.h"
  55 #endif
  56 #ifdef CONFIG_DRM_AMDGPU_CIK
  57 #include "cik.h"
  58 #endif
  59 #include "vi.h"
  60 #include "soc15.h"
  61 #include "nv.h"
  62 #include "bif/bif_4_1_d.h"
  63 #include <linux/firmware.h>
  64 #include "amdgpu_vf_error.h"
  65
  66 #include "amdgpu_amdkfd.h"
  67 #include "amdgpu_pm.h"
  68
  69 #include "amdgpu_xgmi.h"
  70 #include "amdgpu_ras.h"
  71 #include "amdgpu_pmu.h"
  72 #include "amdgpu_fru_eeprom.h"
  73 #include "amdgpu_reset.h"
  74
  75 #include <linux/suspend.h>
  76 #include <drm/task_barrier.h>
  77 #include <linux/pm_runtime.h>
  78
  79 #include <drm/drm_drv.h>
  80
  81 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  84 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  85 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  86 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  87 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  88
  89 #define AMDGPU_RESUME_MS                2000
  90 #define AMDGPU_MAX_RETRY_LIMIT          2
  91 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
  92
  93 static const struct drm_driver amdgpu_kms_driver;
  94
  95 const char *amdgpu_asic_name[] = {
  96         "TAHITI",
  97         "PITCAIRN",
  98         "VERDE",
  99         "OLAND",
 100         "HAINAN",
 101         "BONAIRE",
 102         "KAVERI",
 103         "KABINI",
 104         "HAWAII",
 105         "MULLINS",
 106         "TOPAZ",
 107         "TONGA",
 108         "FIJI",
 109         "CARRIZO",
 110         "STONEY",
 111         "POLARIS10",
 112         "POLARIS11",
 113         "POLARIS12",
 114         "VEGAM",
 115         "VEGA10",
 116         "VEGA12",
 117         "VEGA20",
 118         "RAVEN",
 119         "ARCTURUS",
 120         "RENOIR",
 121         "ALDEBARAN",
 122         "NAVI10",
 123         "CYAN_SKILLFISH",
 124         "NAVI14",
 125         "NAVI12",
 126         "SIENNA_CICHLID",
 127         "NAVY_FLOUNDER",
 128         "VANGOGH",
 129         "DIMGREY_CAVEFISH",
 130         "BEIGE_GOBY",
 131         "YELLOW_CARP",
 132         "IP DISCOVERY",
 133         "LAST",
 134 };
 135
 136 /**
 137  * DOC: pcie_replay_count
 138  *
 139  * The amdgpu driver provides a sysfs API for reporting the total number
 140  * of PCIe replays (NAKs)
 141  * The file pcie_replay_count is used for this and returns the total
 142  * number of replays as a sum of the NAKs generated and NAKs received
 143  */
 144
 145 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 146                 struct device_attribute *attr, char *buf)
 147 {
 148         struct drm_device *ddev = dev_get_drvdata(dev);
 149         struct amdgpu_device *adev = drm_to_adev(ddev);
 150         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 151
 152         return sysfs_emit(buf, "%llu\n", cnt);
 153 }
 154
 155 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 156                 amdgpu_device_get_pcie_replay_count, NULL);
 157
 158 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 159
 160 /**
 161  * DOC: product_name
 162  *
 163  * The amdgpu driver provides a sysfs API for reporting the product name
 164  * for the device
 165  * The file serial_number is used for this and returns the product name
 166  * as returned from the FRU.
 167  * NOTE: This is only available for certain server cards
 168  */
 169
 170 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 171                 struct device_attribute *attr, char *buf)
 172 {
 173         struct drm_device *ddev = dev_get_drvdata(dev);
 174         struct amdgpu_device *adev = drm_to_adev(ddev);
 175
 176         return sysfs_emit(buf, "%s\n", adev->product_name);
 177 }
 178
 179 static DEVICE_ATTR(product_name, S_IRUGO,
 180                 amdgpu_device_get_product_name, NULL);
 181
 182 /**
 183  * DOC: product_number
 184  *
 185  * The amdgpu driver provides a sysfs API for reporting the part number
 186  * for the device
 187  * The file serial_number is used for this and returns the part number
 188  * as returned from the FRU.
 189  * NOTE: This is only available for certain server cards
 190  */
 191
 192 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 193                 struct device_attribute *attr, char *buf)
 194 {
 195         struct drm_device *ddev = dev_get_drvdata(dev);
 196         struct amdgpu_device *adev = drm_to_adev(ddev);
 197
 198         return sysfs_emit(buf, "%s\n", adev->product_number);
 199 }
 200
 201 static DEVICE_ATTR(product_number, S_IRUGO,
 202                 amdgpu_device_get_product_number, NULL);
 203
 204 /**
 205  * DOC: serial_number
 206  *
 207  * The amdgpu driver provides a sysfs API for reporting the serial number
 208  * for the device
 209  * The file serial_number is used for this and returns the serial number
 210  * as returned from the FRU.
 211  * NOTE: This is only available for certain server cards
 212  */
 213
 214 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 215                 struct device_attribute *attr, char *buf)
 216 {
 217         struct drm_device *ddev = dev_get_drvdata(dev);
 218         struct amdgpu_device *adev = drm_to_adev(ddev);
 219
 220         return sysfs_emit(buf, "%s\n", adev->serial);
 221 }
 222
 223 static DEVICE_ATTR(serial_number, S_IRUGO,
 224                 amdgpu_device_get_serial_number, NULL);
 225
 226 /**
 227  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 228  *
 229  * @dev: drm_device pointer
 230  *
 231  * Returns true if the device is a dGPU with ATPX power control,
 232  * otherwise return false.
 233  */
 234 bool amdgpu_device_supports_px(struct drm_device *dev)
 235 {
 236         struct amdgpu_device *adev = drm_to_adev(dev);
 237
 238         if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
 239                 return true;
 240         return false;
 241 }
 242
 243 /**
 244  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 245  *
 246  * @dev: drm_device pointer
 247  *
 248  * Returns true if the device is a dGPU with ACPI power control,
 249  * otherwise return false.
 250  */
 251 bool amdgpu_device_supports_boco(struct drm_device *dev)
 252 {
 253         struct amdgpu_device *adev = drm_to_adev(dev);
 254
 255         if (adev->has_pr3 ||
 256             ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
 257                 return true;
 258         return false;
 259 }
 260
 261 /**
 262  * amdgpu_device_supports_baco - Does the device support BACO
 263  *
 264  * @dev: drm_device pointer
 265  *
 266  * Returns true if the device supporte BACO,
 267  * otherwise return false.
 268  */
 269 bool amdgpu_device_supports_baco(struct drm_device *dev)
 270 {
 271         struct amdgpu_device *adev = drm_to_adev(dev);
 272
 273         return amdgpu_asic_supports_baco(adev);
 274 }
 275
 276 /**
 277  * amdgpu_device_supports_smart_shift - Is the device dGPU with
 278  * smart shift support
 279  *
 280  * @dev: drm_device pointer
 281  *
 282  * Returns true if the device is a dGPU with Smart Shift support,
 283  * otherwise returns false.
 284  */
 285 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
 286 {
 287         return (amdgpu_device_supports_boco(dev) &&
 288                 amdgpu_acpi_is_power_shift_control_supported());
 289 }
 290
 291 /*
 292  * VRAM access helper functions
 293  */
 294
 295 /**
 296  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 297  *
 298  * @adev: amdgpu_device pointer
 299  * @pos: offset of the buffer in vram
 300  * @buf: virtual address of the buffer in system memory
 301  * @size: read/write size, sizeof(@buf) must > @size
 302  * @write: true - write to vram, otherwise - read from vram
 303  */
 304 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
 305                              void *buf, size_t size, bool write)
 306 {
 307         unsigned long flags;
 308         uint32_t hi = ~0, tmp = 0;
 309         uint32_t *data = buf;
 310         uint64_t last;
 311         int idx;
 312
 313         if (!drm_dev_enter(adev_to_drm(adev), &idx))
 314                 return;
 315
 316         BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
 317
 318         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 319         for (last = pos + size; pos < last; pos += 4) {
 320                 tmp = pos >> 31;
 321
 322                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 323                 if (tmp != hi) {
 324                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 325                         hi = tmp;
 326                 }
 327                 if (write)
 328                         WREG32_NO_KIQ(mmMM_DATA, *data++);
 329                 else
 330                         *data++ = RREG32_NO_KIQ(mmMM_DATA);
 331         }
 332
 333         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 334         drm_dev_exit(idx);
 335 }
 336
 337 /**
 338  * amdgpu_device_aper_access - access vram by vram aperature
 339  *
 340  * @adev: amdgpu_device pointer
 341  * @pos: offset of the buffer in vram
 342  * @buf: virtual address of the buffer in system memory
 343  * @size: read/write size, sizeof(@buf) must > @size
 344  * @write: true - write to vram, otherwise - read from vram
 345  *
 346  * The return value means how many bytes have been transferred.
 347  */
 348 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
 349                                  void *buf, size_t size, bool write)
 350 {
 351 #ifdef CONFIG_64BIT
 352         void __iomem *addr;
 353         size_t count = 0;
 354         uint64_t last;
 355
 356         if (!adev->mman.aper_base_kaddr)
 357                 return 0;
 358
 359         last = min(pos + size, adev->gmc.visible_vram_size);
 360         if (last > pos) {
 361                 addr = adev->mman.aper_base_kaddr + pos;
 362                 count = last - pos;
 363
 364                 if (write) {
 365                         memcpy_toio(addr, buf, count);
 366                         mb();
 367                         amdgpu_device_flush_hdp(adev, NULL);
 368                 } else {
 369                         amdgpu_device_invalidate_hdp(adev, NULL);
 370                         mb();
 371                         memcpy_fromio(buf, addr, count);
 372                 }
 373
 374         }
 375
 376         return count;
 377 #else
 378         return 0;
 379 #endif
 380 }
 381
 382 /**
 383  * amdgpu_device_vram_access - read/write a buffer in vram
 384  *
 385  * @adev: amdgpu_device pointer
 386  * @pos: offset of the buffer in vram
 387  * @buf: virtual address of the buffer in system memory
 388  * @size: read/write size, sizeof(@buf) must > @size
 389  * @write: true - write to vram, otherwise - read from vram
 390  */
 391 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 392                                void *buf, size_t size, bool write)
 393 {
 394         size_t count;
 395
 396         /* try to using vram apreature to access vram first */
 397         count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 398         size -= count;
 399         if (size) {
 400                 /* using MM to access rest vram */
 401                 pos += count;
 402                 buf += count;
 403                 amdgpu_device_mm_access(adev, pos, buf, size, write);
 404         }
 405 }
 406
 407 /*
 408  * register access helper functions.
 409  */
 410
 411 /* Check if hw access should be skipped because of hotplug or device error */
 412 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
 413 {
 414         if (adev->no_hw_access)
 415                 return true;
 416
 417 #ifdef CONFIG_LOCKDEP
 418         /*
 419          * This is a bit complicated to understand, so worth a comment. What we assert
 420          * here is that the GPU reset is not running on another thread in parallel.
 421          *
 422          * For this we trylock the read side of the reset semaphore, if that succeeds
 423          * we know that the reset is not running in paralell.
 424          *
 425          * If the trylock fails we assert that we are either already holding the read
 426          * side of the lock or are the reset thread itself and hold the write side of
 427          * the lock.
 428          */
 429         if (in_task()) {
 430                 if (down_read_trylock(&adev->reset_domain->sem))
 431                         up_read(&adev->reset_domain->sem);
 432                 else
 433                         lockdep_assert_held(&adev->reset_domain->sem);
 434         }
 435 #endif
 436         return false;
 437 }
 438
 439 /**
 440  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 441  *
 442  * @adev: amdgpu_device pointer
 443  * @reg: dword aligned register offset
 444  * @acc_flags: access flags which require special behavior
 445  *
 446  * Returns the 32 bit value from the offset specified.
 447  */
 448 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 449                             uint32_t reg, uint32_t acc_flags)
 450 {
 451         uint32_t ret;
 452
 453         if (amdgpu_device_skip_hw_access(adev))
 454                 return 0;
 455
 456         if ((reg * 4) < adev->rmmio_size) {
 457                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 458                     amdgpu_sriov_runtime(adev) &&
 459                     down_read_trylock(&adev->reset_domain->sem)) {
 460                         ret = amdgpu_kiq_rreg(adev, reg);
 461                         up_read(&adev->reset_domain->sem);
 462                 } else {
 463                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 464                 }
 465         } else {
 466                 ret = adev->pcie_rreg(adev, reg * 4);
 467         }
 468
 469         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 470
 471         return ret;
 472 }
 473
 474 /*
 475  * MMIO register read with bytes helper functions
 476  * @offset:bytes offset from MMIO start
 477  *
 478 */
 479
 480 /**
 481  * amdgpu_mm_rreg8 - read a memory mapped IO register
 482  *
 483  * @adev: amdgpu_device pointer
 484  * @offset: byte aligned register offset
 485  *
 486  * Returns the 8 bit value from the offset specified.
 487  */
 488 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 489 {
 490         if (amdgpu_device_skip_hw_access(adev))
 491                 return 0;
 492
 493         if (offset < adev->rmmio_size)
 494                 return (readb(adev->rmmio + offset));
 495         BUG();
 496 }
 497
 498 /*
 499  * MMIO register write with bytes helper functions
 500  * @offset:bytes offset from MMIO start
 501  * @value: the value want to be written to the register
 502  *
 503 */
 504 /**
 505  * amdgpu_mm_wreg8 - read a memory mapped IO register
 506  *
 507  * @adev: amdgpu_device pointer
 508  * @offset: byte aligned register offset
 509  * @value: 8 bit value to write
 510  *
 511  * Writes the value specified to the offset specified.
 512  */
 513 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 514 {
 515         if (amdgpu_device_skip_hw_access(adev))
 516                 return;
 517
 518         if (offset < adev->rmmio_size)
 519                 writeb(value, adev->rmmio + offset);
 520         else
 521                 BUG();
 522 }
 523
 524 /**
 525  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 526  *
 527  * @adev: amdgpu_device pointer
 528  * @reg: dword aligned register offset
 529  * @v: 32 bit value to write to the register
 530  * @acc_flags: access flags which require special behavior
 531  *
 532  * Writes the value specified to the offset specified.
 533  */
 534 void amdgpu_device_wreg(struct amdgpu_device *adev,
 535                         uint32_t reg, uint32_t v,
 536                         uint32_t acc_flags)
 537 {
 538         if (amdgpu_device_skip_hw_access(adev))
 539                 return;
 540
 541         if ((reg * 4) < adev->rmmio_size) {
 542                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 543                     amdgpu_sriov_runtime(adev) &&
 544                     down_read_trylock(&adev->reset_domain->sem)) {
 545                         amdgpu_kiq_wreg(adev, reg, v);
 546                         up_read(&adev->reset_domain->sem);
 547                 } else {
 548                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 549                 }
 550         } else {
 551                 adev->pcie_wreg(adev, reg * 4, v);
 552         }
 553
 554         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 555 }
 556
 557 /**
 558  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
 559  *
 560  * @adev: amdgpu_device pointer
 561  * @reg: mmio/rlc register
 562  * @v: value to write
 563  *
 564  * this function is invoked only for the debugfs register access
 565  */
 566 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 567                              uint32_t reg, uint32_t v)
 568 {
 569         if (amdgpu_device_skip_hw_access(adev))
 570                 return;
 571
 572         if (amdgpu_sriov_fullaccess(adev) &&
 573             adev->gfx.rlc.funcs &&
 574             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 575                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 576                         return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
 577         } else if ((reg * 4) >= adev->rmmio_size) {
 578                 adev->pcie_wreg(adev, reg * 4, v);
 579         } else {
 580                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 581         }
 582 }
 583
 584 /**
 585  * amdgpu_mm_rdoorbell - read a doorbell dword
 586  *
 587  * @adev: amdgpu_device pointer
 588  * @index: doorbell index
 589  *
 590  * Returns the value in the doorbell aperture at the
 591  * requested doorbell index (CIK).
 592  */
 593 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 594 {
 595         if (amdgpu_device_skip_hw_access(adev))
 596                 return 0;
 597
 598         if (index < adev->doorbell.num_doorbells) {
 599                 return readl(adev->doorbell.ptr + index);
 600         } else {
 601                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 602                 return 0;
 603         }
 604 }
 605
 606 /**
 607  * amdgpu_mm_wdoorbell - write a doorbell dword
 608  *
 609  * @adev: amdgpu_device pointer
 610  * @index: doorbell index
 611  * @v: value to write
 612  *
 613  * Writes @v to the doorbell aperture at the
 614  * requested doorbell index (CIK).
 615  */
 616 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 617 {
 618         if (amdgpu_device_skip_hw_access(adev))
 619                 return;
 620
 621         if (index < adev->doorbell.num_doorbells) {
 622                 writel(v, adev->doorbell.ptr + index);
 623         } else {
 624                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 625         }
 626 }
 627
 628 /**
 629  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 630  *
 631  * @adev: amdgpu_device pointer
 632  * @index: doorbell index
 633  *
 634  * Returns the value in the doorbell aperture at the
 635  * requested doorbell index (VEGA10+).
 636  */
 637 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 638 {
 639         if (amdgpu_device_skip_hw_access(adev))
 640                 return 0;
 641
 642         if (index < adev->doorbell.num_doorbells) {
 643                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 644         } else {
 645                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 646                 return 0;
 647         }
 648 }
 649
 650 /**
 651  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 652  *
 653  * @adev: amdgpu_device pointer
 654  * @index: doorbell index
 655  * @v: value to write
 656  *
 657  * Writes @v to the doorbell aperture at the
 658  * requested doorbell index (VEGA10+).
 659  */
 660 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 661 {
 662         if (amdgpu_device_skip_hw_access(adev))
 663                 return;
 664
 665         if (index < adev->doorbell.num_doorbells) {
 666                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 667         } else {
 668                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 669         }
 670 }
 671
 672 /**
 673  * amdgpu_device_indirect_rreg - read an indirect register
 674  *
 675  * @adev: amdgpu_device pointer
 676  * @pcie_index: mmio register offset
 677  * @pcie_data: mmio register offset
 678  * @reg_addr: indirect register address to read from
 679  *
 680  * Returns the value of indirect register @reg_addr
 681  */
 682 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 683                                 u32 pcie_index, u32 pcie_data,
 684                                 u32 reg_addr)
 685 {
 686         unsigned long flags;
 687         u32 r;
 688         void __iomem *pcie_index_offset;
 689         void __iomem *pcie_data_offset;
 690
 691         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 692         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 693         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 694
 695         writel(reg_addr, pcie_index_offset);
 696         readl(pcie_index_offset);
 697         r = readl(pcie_data_offset);
 698         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 699
 700         return r;
 701 }
 702
 703 /**
 704  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 705  *
 706  * @adev: amdgpu_device pointer
 707  * @pcie_index: mmio register offset
 708  * @pcie_data: mmio register offset
 709  * @reg_addr: indirect register address to read from
 710  *
 711  * Returns the value of indirect register @reg_addr
 712  */
 713 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 714                                   u32 pcie_index, u32 pcie_data,
 715                                   u32 reg_addr)
 716 {
 717         unsigned long flags;
 718         u64 r;
 719         void __iomem *pcie_index_offset;
 720         void __iomem *pcie_data_offset;
 721
 722         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 723         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 724         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 725
 726         /* read low 32 bits */
 727         writel(reg_addr, pcie_index_offset);
 728         readl(pcie_index_offset);
 729         r = readl(pcie_data_offset);
 730         /* read high 32 bits */
 731         writel(reg_addr + 4, pcie_index_offset);
 732         readl(pcie_index_offset);
 733         r |= ((u64)readl(pcie_data_offset) << 32);
 734         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 735
 736         return r;
 737 }
 738
 739 /**
 740  * amdgpu_device_indirect_wreg - write an indirect register address
 741  *
 742  * @adev: amdgpu_device pointer
 743  * @pcie_index: mmio register offset
 744  * @pcie_data: mmio register offset
 745  * @reg_addr: indirect register offset
 746  * @reg_data: indirect register data
 747  *
 748  */
 749 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 750                                  u32 pcie_index, u32 pcie_data,
 751                                  u32 reg_addr, u32 reg_data)
 752 {
 753         unsigned long flags;
 754         void __iomem *pcie_index_offset;
 755         void __iomem *pcie_data_offset;
 756
 757         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 758         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 759         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 760
 761         writel(reg_addr, pcie_index_offset);
 762         readl(pcie_index_offset);
 763         writel(reg_data, pcie_data_offset);
 764         readl(pcie_data_offset);
 765         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 766 }
 767
 768 /**
 769  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 770  *
 771  * @adev: amdgpu_device pointer
 772  * @pcie_index: mmio register offset
 773  * @pcie_data: mmio register offset
 774  * @reg_addr: indirect register offset
 775  * @reg_data: indirect register data
 776  *
 777  */
 778 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 779                                    u32 pcie_index, u32 pcie_data,
 780                                    u32 reg_addr, u64 reg_data)
 781 {
 782         unsigned long flags;
 783         void __iomem *pcie_index_offset;
 784         void __iomem *pcie_data_offset;
 785
 786         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 787         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 788         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 789
 790         /* write low 32 bits */
 791         writel(reg_addr, pcie_index_offset);
 792         readl(pcie_index_offset);
 793         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 794         readl(pcie_data_offset);
 795         /* write high 32 bits */
 796         writel(reg_addr + 4, pcie_index_offset);
 797         readl(pcie_index_offset);
 798         writel((u32)(reg_data >> 32), pcie_data_offset);
 799         readl(pcie_data_offset);
 800         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 801 }
 802
 803 /**
 804  * amdgpu_invalid_rreg - dummy reg read function
 805  *
 806  * @adev: amdgpu_device pointer
 807  * @reg: offset of register
 808  *
 809  * Dummy register read function.  Used for register blocks
 810  * that certain asics don't have (all asics).
 811  * Returns the value in the register.
 812  */
 813 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 814 {
 815         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 816         BUG();
 817         return 0;
 818 }
 819
 820 /**
 821  * amdgpu_invalid_wreg - dummy reg write function
 822  *
 823  * @adev: amdgpu_device pointer
 824  * @reg: offset of register
 825  * @v: value to write to the register
 826  *
 827  * Dummy register read function.  Used for register blocks
 828  * that certain asics don't have (all asics).
 829  */
 830 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 831 {
 832         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 833                   reg, v);
 834         BUG();
 835 }
 836
 837 /**
 838  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 839  *
 840  * @adev: amdgpu_device pointer
 841  * @reg: offset of register
 842  *
 843  * Dummy register read function.  Used for register blocks
 844  * that certain asics don't have (all asics).
 845  * Returns the value in the register.
 846  */
 847 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 848 {
 849         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 850         BUG();
 851         return 0;
 852 }
 853
 854 /**
 855  * amdgpu_invalid_wreg64 - dummy reg write function
 856  *
 857  * @adev: amdgpu_device pointer
 858  * @reg: offset of register
 859  * @v: value to write to the register
 860  *
 861  * Dummy register read function.  Used for register blocks
 862  * that certain asics don't have (all asics).
 863  */
 864 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 865 {
 866         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 867                   reg, v);
 868         BUG();
 869 }
 870
 871 /**
 872  * amdgpu_block_invalid_rreg - dummy reg read function
 873  *
 874  * @adev: amdgpu_device pointer
 875  * @block: offset of instance
 876  * @reg: offset of register
 877  *
 878  * Dummy register read function.  Used for register blocks
 879  * that certain asics don't have (all asics).
 880  * Returns the value in the register.
 881  */
 882 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 883                                           uint32_t block, uint32_t reg)
 884 {
 885         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 886                   reg, block);
 887         BUG();
 888         return 0;
 889 }
 890
 891 /**
 892  * amdgpu_block_invalid_wreg - dummy reg write function
 893  *
 894  * @adev: amdgpu_device pointer
 895  * @block: offset of instance
 896  * @reg: offset of register
 897  * @v: value to write to the register
 898  *
 899  * Dummy register read function.  Used for register blocks
 900  * that certain asics don't have (all asics).
 901  */
 902 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 903                                       uint32_t block,
 904                                       uint32_t reg, uint32_t v)
 905 {
 906         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 907                   reg, block, v);
 908         BUG();
 909 }
 910
 911 /**
 912  * amdgpu_device_asic_init - Wrapper for atom asic_init
 913  *
 914  * @adev: amdgpu_device pointer
 915  *
 916  * Does any asic specific work and then calls atom asic init.
 917  */
 918 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 919 {
 920         amdgpu_asic_pre_asic_init(adev);
 921
 922         if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
 923                 return amdgpu_atomfirmware_asic_init(adev, true);
 924         else
 925                 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 926 }
 927
 928 /**
 929  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 930  *
 931  * @adev: amdgpu_device pointer
 932  *
 933  * Allocates a scratch page of VRAM for use by various things in the
 934  * driver.
 935  */
 936 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 937 {
 938         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 939                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 940                                        &adev->vram_scratch.robj,
 941                                        &adev->vram_scratch.gpu_addr,
 942                                        (void **)&adev->vram_scratch.ptr);
 943 }
 944
 945 /**
 946  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 947  *
 948  * @adev: amdgpu_device pointer
 949  *
 950  * Frees the VRAM scratch page.
 951  */
 952 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 953 {
 954         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 955 }
 956
 957 /**
 958  * amdgpu_device_program_register_sequence - program an array of registers.
 959  *
 960  * @adev: amdgpu_device pointer
 961  * @registers: pointer to the register array
 962  * @array_size: size of the register array
 963  *
 964  * Programs an array or registers with and and or masks.
 965  * This is a helper for setting golden registers.
 966  */
 967 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 968                                              const u32 *registers,
 969                                              const u32 array_size)
 970 {
 971         u32 tmp, reg, and_mask, or_mask;
 972         int i;
 973
 974         if (array_size % 3)
 975                 return;
 976
 977         for (i = 0; i < array_size; i +=3) {
 978                 reg = registers[i + 0];
 979                 and_mask = registers[i + 1];
 980                 or_mask = registers[i + 2];
 981
 982                 if (and_mask == 0xffffffff) {
 983                         tmp = or_mask;
 984                 } else {
 985                         tmp = RREG32(reg);
 986                         tmp &= ~and_mask;
 987                         if (adev->family >= AMDGPU_FAMILY_AI)
 988                                 tmp |= (or_mask & and_mask);
 989                         else
 990                                 tmp |= or_mask;
 991                 }
 992                 WREG32(reg, tmp);
 993         }
 994 }
 995
 996 /**
 997  * amdgpu_device_pci_config_reset - reset the GPU
 998  *
 999  * @adev: amdgpu_device pointer
1000  *
1001  * Resets the GPU using the pci config reset sequence.
1002  * Only applicable to asics prior to vega10.
1003  */
1004 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1005 {
1006         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1007 }
1008
1009 /**
1010  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1011  *
1012  * @adev: amdgpu_device pointer
1013  *
1014  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1015  */
1016 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1017 {
1018         return pci_reset_function(adev->pdev);
1019 }
1020
1021 /*
1022  * GPU doorbell aperture helpers function.
1023  */
1024 /**
1025  * amdgpu_device_doorbell_init - Init doorbell driver information.
1026  *
1027  * @adev: amdgpu_device pointer
1028  *
1029  * Init doorbell driver information (CIK)
1030  * Returns 0 on success, error on failure.
1031  */
1032 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1033 {
1034
1035         /* No doorbell on SI hardware generation */
1036         if (adev->asic_type < CHIP_BONAIRE) {
1037                 adev->doorbell.base = 0;
1038                 adev->doorbell.size = 0;
1039                 adev->doorbell.num_doorbells = 0;
1040                 adev->doorbell.ptr = NULL;
1041                 return 0;
1042         }
1043
1044         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1045                 return -EINVAL;
1046
1047         amdgpu_asic_init_doorbell_index(adev);
1048
1049         /* doorbell bar mapping */
1050         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1051         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1052
1053         if (adev->enable_mes) {
1054                 adev->doorbell.num_doorbells =
1055                         adev->doorbell.size / sizeof(u32);
1056         } else {
1057                 adev->doorbell.num_doorbells =
1058                         min_t(u32, adev->doorbell.size / sizeof(u32),
1059                               adev->doorbell_index.max_assignment+1);
1060                 if (adev->doorbell.num_doorbells == 0)
1061                         return -EINVAL;
1062
1063                 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1064                  * paging queue doorbell use the second page. The
1065                  * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1066                  * doorbells are in the first page. So with paging queue enabled,
1067                  * the max num_doorbells should + 1 page (0x400 in dword)
1068                  */
1069                 if (adev->asic_type >= CHIP_VEGA10)
1070                         adev->doorbell.num_doorbells += 0x400;
1071         }
1072
1073         adev->doorbell.ptr = ioremap(adev->doorbell.base,
1074                                      adev->doorbell.num_doorbells *
1075                                      sizeof(u32));
1076         if (adev->doorbell.ptr == NULL)
1077                 return -ENOMEM;
1078
1079         return 0;
1080 }
1081
1082 /**
1083  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1084  *
1085  * @adev: amdgpu_device pointer
1086  *
1087  * Tear down doorbell driver information (CIK)
1088  */
1089 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1090 {
1091         iounmap(adev->doorbell.ptr);
1092         adev->doorbell.ptr = NULL;
1093 }
1094
1095
1096
1097 /*
1098  * amdgpu_device_wb_*()
1099  * Writeback is the method by which the GPU updates special pages in memory
1100  * with the status of certain GPU events (fences, ring pointers,etc.).
1101  */
1102
1103 /**
1104  * amdgpu_device_wb_fini - Disable Writeback and free memory
1105  *
1106  * @adev: amdgpu_device pointer
1107  *
1108  * Disables Writeback and frees the Writeback memory (all asics).
1109  * Used at driver shutdown.
1110  */
1111 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1112 {
1113         if (adev->wb.wb_obj) {
1114                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1115                                       &adev->wb.gpu_addr,
1116                                       (void **)&adev->wb.wb);
1117                 adev->wb.wb_obj = NULL;
1118         }
1119 }
1120
1121 /**
1122  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1123  *
1124  * @adev: amdgpu_device pointer
1125  *
1126  * Initializes writeback and allocates writeback memory (all asics).
1127  * Used at driver startup.
1128  * Returns 0 on success or an -error on failure.
1129  */
1130 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1131 {
1132         int r;
1133
1134         if (adev->wb.wb_obj == NULL) {
1135                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1136                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1137                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1138                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1139                                             (void **)&adev->wb.wb);
1140                 if (r) {
1141                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1142                         return r;
1143                 }
1144
1145                 adev->wb.num_wb = AMDGPU_MAX_WB;
1146                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1147
1148                 /* clear wb memory */
1149                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1150         }
1151
1152         return 0;
1153 }
1154
1155 /**
1156  * amdgpu_device_wb_get - Allocate a wb entry
1157  *
1158  * @adev: amdgpu_device pointer
1159  * @wb: wb index
1160  *
1161  * Allocate a wb slot for use by the driver (all asics).
1162  * Returns 0 on success or -EINVAL on failure.
1163  */
1164 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1165 {
1166         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1167
1168         if (offset < adev->wb.num_wb) {
1169                 __set_bit(offset, adev->wb.used);
1170                 *wb = offset << 3; /* convert to dw offset */
1171                 return 0;
1172         } else {
1173                 return -EINVAL;
1174         }
1175 }
1176
1177 /**
1178  * amdgpu_device_wb_free - Free a wb entry
1179  *
1180  * @adev: amdgpu_device pointer
1181  * @wb: wb index
1182  *
1183  * Free a wb slot allocated for use by the driver (all asics)
1184  */
1185 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1186 {
1187         wb >>= 3;
1188         if (wb < adev->wb.num_wb)
1189                 __clear_bit(wb, adev->wb.used);
1190 }
1191
1192 /**
1193  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1194  *
1195  * @adev: amdgpu_device pointer
1196  *
1197  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1198  * to fail, but if any of the BARs is not accessible after the size we abort
1199  * driver loading by returning -ENODEV.
1200  */
1201 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1202 {
1203         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1204         struct pci_bus *root;
1205         struct resource *res;
1206         unsigned i;
1207         u16 cmd;
1208         int r;
1209
1210         /* Bypass for VF */
1211         if (amdgpu_sriov_vf(adev))
1212                 return 0;
1213
1214         /* skip if the bios has already enabled large BAR */
1215         if (adev->gmc.real_vram_size &&
1216             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1217                 return 0;
1218
1219         /* Check if the root BUS has 64bit memory resources */
1220         root = adev->pdev->bus;
1221         while (root->parent)
1222                 root = root->parent;
1223
1224         pci_bus_for_each_resource(root, res, i) {
1225                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1226                     res->start > 0x100000000ull)
1227                         break;
1228         }
1229
1230         /* Trying to resize is pointless without a root hub window above 4GB */
1231         if (!res)
1232                 return 0;
1233
1234         /* Limit the BAR size to what is available */
1235         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1236                         rbar_size);
1237
1238         /* Disable memory decoding while we change the BAR addresses and size */
1239         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1240         pci_write_config_word(adev->pdev, PCI_COMMAND,
1241                               cmd & ~PCI_COMMAND_MEMORY);
1242
1243         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1244         amdgpu_device_doorbell_fini(adev);
1245         if (adev->asic_type >= CHIP_BONAIRE)
1246                 pci_release_resource(adev->pdev, 2);
1247
1248         pci_release_resource(adev->pdev, 0);
1249
1250         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1251         if (r == -ENOSPC)
1252                 DRM_INFO("Not enough PCI address space for a large BAR.");
1253         else if (r && r != -ENOTSUPP)
1254                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1255
1256         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1257
1258         /* When the doorbell or fb BAR isn't available we have no chance of
1259          * using the device.
1260          */
1261         r = amdgpu_device_doorbell_init(adev);
1262         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1263                 return -ENODEV;
1264
1265         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1266
1267         return 0;
1268 }
1269
1270 /*
1271  * GPU helpers function.
1272  */
1273 /**
1274  * amdgpu_device_need_post - check if the hw need post or not
1275  *
1276  * @adev: amdgpu_device pointer
1277  *
1278  * Check if the asic has been initialized (all asics) at driver startup
1279  * or post is needed if  hw reset is performed.
1280  * Returns true if need or false if not.
1281  */
1282 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1283 {
1284         uint32_t reg;
1285
1286         if (amdgpu_sriov_vf(adev))
1287                 return false;
1288
1289         if (amdgpu_passthrough(adev)) {
1290                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1291                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1292                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1293                  * vpost executed for smc version below 22.15
1294                  */
1295                 if (adev->asic_type == CHIP_FIJI) {
1296                         int err;
1297                         uint32_t fw_ver;
1298                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1299                         /* force vPost if error occured */
1300                         if (err)
1301                                 return true;
1302
1303                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1304                         if (fw_ver < 0x00160e00)
1305                                 return true;
1306                 }
1307         }
1308
1309         /* Don't post if we need to reset whole hive on init */
1310         if (adev->gmc.xgmi.pending_reset)
1311                 return false;
1312
1313         if (adev->has_hw_reset) {
1314                 adev->has_hw_reset = false;
1315                 return true;
1316         }
1317
1318         /* bios scratch used on CIK+ */
1319         if (adev->asic_type >= CHIP_BONAIRE)
1320                 return amdgpu_atombios_scratch_need_asic_init(adev);
1321
1322         /* check MEM_SIZE for older asics */
1323         reg = amdgpu_asic_get_config_memsize(adev);
1324
1325         if ((reg != 0) && (reg != 0xffffffff))
1326                 return false;
1327
1328         return true;
1329 }
1330
1331 /**
1332  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1333  *
1334  * @adev: amdgpu_device pointer
1335  *
1336  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1337  * be set for this device.
1338  *
1339  * Returns true if it should be used or false if not.
1340  */
1341 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1342 {
1343         switch (amdgpu_aspm) {
1344         case -1:
1345                 break;
1346         case 0:
1347                 return false;
1348         case 1:
1349                 return true;
1350         default:
1351                 return false;
1352         }
1353         return pcie_aspm_enabled(adev->pdev);
1354 }
1355
1356 /* if we get transitioned to only one device, take VGA back */
1357 /**
1358  * amdgpu_device_vga_set_decode - enable/disable vga decode
1359  *
1360  * @pdev: PCI device pointer
1361  * @state: enable/disable vga decode
1362  *
1363  * Enable/disable vga decode (all asics).
1364  * Returns VGA resource flags.
1365  */
1366 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1367                 bool state)
1368 {
1369         struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1370         amdgpu_asic_set_vga_state(adev, state);
1371         if (state)
1372                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1373                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1374         else
1375                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1376 }
1377
1378 /**
1379  * amdgpu_device_check_block_size - validate the vm block size
1380  *
1381  * @adev: amdgpu_device pointer
1382  *
1383  * Validates the vm block size specified via module parameter.
1384  * The vm block size defines number of bits in page table versus page directory,
1385  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1386  * page table and the remaining bits are in the page directory.
1387  */
1388 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1389 {
1390         /* defines number of bits in page table versus page directory,
1391          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1392          * page table and the remaining bits are in the page directory */
1393         if (amdgpu_vm_block_size == -1)
1394                 return;
1395
1396         if (amdgpu_vm_block_size < 9) {
1397                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1398                          amdgpu_vm_block_size);
1399                 amdgpu_vm_block_size = -1;
1400         }
1401 }
1402
1403 /**
1404  * amdgpu_device_check_vm_size - validate the vm size
1405  *
1406  * @adev: amdgpu_device pointer
1407  *
1408  * Validates the vm size in GB specified via module parameter.
1409  * The VM size is the size of the GPU virtual memory space in GB.
1410  */
1411 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1412 {
1413         /* no need to check the default value */
1414         if (amdgpu_vm_size == -1)
1415                 return;
1416
1417         if (amdgpu_vm_size < 1) {
1418                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1419                          amdgpu_vm_size);
1420                 amdgpu_vm_size = -1;
1421         }
1422 }
1423
1424 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1425 {
1426         struct sysinfo si;
1427         bool is_os_64 = (sizeof(void *) == 8);
1428         uint64_t total_memory;
1429         uint64_t dram_size_seven_GB = 0x1B8000000;
1430         uint64_t dram_size_three_GB = 0xB8000000;
1431
1432         if (amdgpu_smu_memory_pool_size == 0)
1433                 return;
1434
1435         if (!is_os_64) {
1436                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1437                 goto def_value;
1438         }
1439         si_meminfo(&si);
1440         total_memory = (uint64_t)si.totalram * si.mem_unit;
1441
1442         if ((amdgpu_smu_memory_pool_size == 1) ||
1443                 (amdgpu_smu_memory_pool_size == 2)) {
1444                 if (total_memory < dram_size_three_GB)
1445                         goto def_value1;
1446         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1447                 (amdgpu_smu_memory_pool_size == 8)) {
1448                 if (total_memory < dram_size_seven_GB)
1449                         goto def_value1;
1450         } else {
1451                 DRM_WARN("Smu memory pool size not supported\n");
1452                 goto def_value;
1453         }
1454         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1455
1456         return;
1457
1458 def_value1:
1459         DRM_WARN("No enough system memory\n");
1460 def_value:
1461         adev->pm.smu_prv_buffer_size = 0;
1462 }
1463
1464 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1465 {
1466         if (!(adev->flags & AMD_IS_APU) ||
1467             adev->asic_type < CHIP_RAVEN)
1468                 return 0;
1469
1470         switch (adev->asic_type) {
1471         case CHIP_RAVEN:
1472                 if (adev->pdev->device == 0x15dd)
1473                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1474                 if (adev->pdev->device == 0x15d8)
1475                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1476                 break;
1477         case CHIP_RENOIR:
1478                 if ((adev->pdev->device == 0x1636) ||
1479                     (adev->pdev->device == 0x164c))
1480                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1481                 else
1482                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1483                 break;
1484         case CHIP_VANGOGH:
1485                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1486                 break;
1487         case CHIP_YELLOW_CARP:
1488                 break;
1489         case CHIP_CYAN_SKILLFISH:
1490                 if ((adev->pdev->device == 0x13FE) ||
1491                     (adev->pdev->device == 0x143F))
1492                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1493                 break;
1494         default:
1495                 break;
1496         }
1497
1498         return 0;
1499 }
1500
1501 /**
1502  * amdgpu_device_check_arguments - validate module params
1503  *
1504  * @adev: amdgpu_device pointer
1505  *
1506  * Validates certain module parameters and updates
1507  * the associated values used by the driver (all asics).
1508  */
1509 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1510 {
1511         if (amdgpu_sched_jobs < 4) {
1512                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1513                          amdgpu_sched_jobs);
1514                 amdgpu_sched_jobs = 4;
1515         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1516                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1517                          amdgpu_sched_jobs);
1518                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1519         }
1520
1521         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1522                 /* gart size must be greater or equal to 32M */
1523                 dev_warn(adev->dev, "gart size (%d) too small\n",
1524                          amdgpu_gart_size);
1525                 amdgpu_gart_size = -1;
1526         }
1527
1528         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1529                 /* gtt size must be greater or equal to 32M */
1530                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1531                                  amdgpu_gtt_size);
1532                 amdgpu_gtt_size = -1;
1533         }
1534
1535         /* valid range is between 4 and 9 inclusive */
1536         if (amdgpu_vm_fragment_size != -1 &&
1537             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1538                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1539                 amdgpu_vm_fragment_size = -1;
1540         }
1541
1542         if (amdgpu_sched_hw_submission < 2) {
1543                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1544                          amdgpu_sched_hw_submission);
1545                 amdgpu_sched_hw_submission = 2;
1546         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1547                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1548                          amdgpu_sched_hw_submission);
1549                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1550         }
1551
1552         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1553                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1554                 amdgpu_reset_method = -1;
1555         }
1556
1557         amdgpu_device_check_smu_prv_buffer_size(adev);
1558
1559         amdgpu_device_check_vm_size(adev);
1560
1561         amdgpu_device_check_block_size(adev);
1562
1563         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1564
1565         return 0;
1566 }
1567
1568 /**
1569  * amdgpu_switcheroo_set_state - set switcheroo state
1570  *
1571  * @pdev: pci dev pointer
1572  * @state: vga_switcheroo state
1573  *
1574  * Callback for the switcheroo driver.  Suspends or resumes the
1575  * the asics before or after it is powered up using ACPI methods.
1576  */
1577 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1578                                         enum vga_switcheroo_state state)
1579 {
1580         struct drm_device *dev = pci_get_drvdata(pdev);
1581         int r;
1582
1583         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1584                 return;
1585
1586         if (state == VGA_SWITCHEROO_ON) {
1587                 pr_info("switched on\n");
1588                 /* don't suspend or resume card normally */
1589                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1590
1591                 pci_set_power_state(pdev, PCI_D0);
1592                 amdgpu_device_load_pci_state(pdev);
1593                 r = pci_enable_device(pdev);
1594                 if (r)
1595                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1596                 amdgpu_device_resume(dev, true);
1597
1598                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1599         } else {
1600                 pr_info("switched off\n");
1601                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1602                 amdgpu_device_suspend(dev, true);
1603                 amdgpu_device_cache_pci_state(pdev);
1604                 /* Shut down the device */
1605                 pci_disable_device(pdev);
1606                 pci_set_power_state(pdev, PCI_D3cold);
1607                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1608         }
1609 }
1610
1611 /**
1612  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1613  *
1614  * @pdev: pci dev pointer
1615  *
1616  * Callback for the switcheroo driver.  Check of the switcheroo
1617  * state can be changed.
1618  * Returns true if the state can be changed, false if not.
1619  */
1620 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1621 {
1622         struct drm_device *dev = pci_get_drvdata(pdev);
1623
1624         /*
1625         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1626         * locking inversion with the driver load path. And the access here is
1627         * completely racy anyway. So don't bother with locking for now.
1628         */
1629         return atomic_read(&dev->open_count) == 0;
1630 }
1631
1632 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1633         .set_gpu_state = amdgpu_switcheroo_set_state,
1634         .reprobe = NULL,
1635         .can_switch = amdgpu_switcheroo_can_switch,
1636 };
1637
1638 /**
1639  * amdgpu_device_ip_set_clockgating_state - set the CG state
1640  *
1641  * @dev: amdgpu_device pointer
1642  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1643  * @state: clockgating state (gate or ungate)
1644  *
1645  * Sets the requested clockgating state for all instances of
1646  * the hardware IP specified.
1647  * Returns the error code from the last instance.
1648  */
1649 int amdgpu_device_ip_set_clockgating_state(void *dev,
1650                                            enum amd_ip_block_type block_type,
1651                                            enum amd_clockgating_state state)
1652 {
1653         struct amdgpu_device *adev = dev;
1654         int i, r = 0;
1655
1656         for (i = 0; i < adev->num_ip_blocks; i++) {
1657                 if (!adev->ip_blocks[i].status.valid)
1658                         continue;
1659                 if (adev->ip_blocks[i].version->type != block_type)
1660                         continue;
1661                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1662                         continue;
1663                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1664                         (void *)adev, state);
1665                 if (r)
1666                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1667                                   adev->ip_blocks[i].version->funcs->name, r);
1668         }
1669         return r;
1670 }
1671
1672 /**
1673  * amdgpu_device_ip_set_powergating_state - set the PG state
1674  *
1675  * @dev: amdgpu_device pointer
1676  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1677  * @state: powergating state (gate or ungate)
1678  *
1679  * Sets the requested powergating state for all instances of
1680  * the hardware IP specified.
1681  * Returns the error code from the last instance.
1682  */
1683 int amdgpu_device_ip_set_powergating_state(void *dev,
1684                                            enum amd_ip_block_type block_type,
1685                                            enum amd_powergating_state state)
1686 {
1687         struct amdgpu_device *adev = dev;
1688         int i, r = 0;
1689
1690         for (i = 0; i < adev->num_ip_blocks; i++) {
1691                 if (!adev->ip_blocks[i].status.valid)
1692                         continue;
1693                 if (adev->ip_blocks[i].version->type != block_type)
1694                         continue;
1695                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1696                         continue;
1697                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1698                         (void *)adev, state);
1699                 if (r)
1700                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1701                                   adev->ip_blocks[i].version->funcs->name, r);
1702         }
1703         return r;
1704 }
1705
1706 /**
1707  * amdgpu_device_ip_get_clockgating_state - get the CG state
1708  *
1709  * @adev: amdgpu_device pointer
1710  * @flags: clockgating feature flags
1711  *
1712  * Walks the list of IPs on the device and updates the clockgating
1713  * flags for each IP.
1714  * Updates @flags with the feature flags for each hardware IP where
1715  * clockgating is enabled.
1716  */
1717 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1718                                             u64 *flags)
1719 {
1720         int i;
1721
1722         for (i = 0; i < adev->num_ip_blocks; i++) {
1723                 if (!adev->ip_blocks[i].status.valid)
1724                         continue;
1725                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1726                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1727         }
1728 }
1729
1730 /**
1731  * amdgpu_device_ip_wait_for_idle - wait for idle
1732  *
1733  * @adev: amdgpu_device pointer
1734  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1735  *
1736  * Waits for the request hardware IP to be idle.
1737  * Returns 0 for success or a negative error code on failure.
1738  */
1739 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1740                                    enum amd_ip_block_type block_type)
1741 {
1742         int i, r;
1743
1744         for (i = 0; i < adev->num_ip_blocks; i++) {
1745                 if (!adev->ip_blocks[i].status.valid)
1746                         continue;
1747                 if (adev->ip_blocks[i].version->type == block_type) {
1748                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1749                         if (r)
1750                                 return r;
1751                         break;
1752                 }
1753         }
1754         return 0;
1755
1756 }
1757
1758 /**
1759  * amdgpu_device_ip_is_idle - is the hardware IP idle
1760  *
1761  * @adev: amdgpu_device pointer
1762  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1763  *
1764  * Check if the hardware IP is idle or not.
1765  * Returns true if it the IP is idle, false if not.
1766  */
1767 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1768                               enum amd_ip_block_type block_type)
1769 {
1770         int i;
1771
1772         for (i = 0; i < adev->num_ip_blocks; i++) {
1773                 if (!adev->ip_blocks[i].status.valid)
1774                         continue;
1775                 if (adev->ip_blocks[i].version->type == block_type)
1776                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1777         }
1778         return true;
1779
1780 }
1781
1782 /**
1783  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1784  *
1785  * @adev: amdgpu_device pointer
1786  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1787  *
1788  * Returns a pointer to the hardware IP block structure
1789  * if it exists for the asic, otherwise NULL.
1790  */
1791 struct amdgpu_ip_block *
1792 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1793                               enum amd_ip_block_type type)
1794 {
1795         int i;
1796
1797         for (i = 0; i < adev->num_ip_blocks; i++)
1798                 if (adev->ip_blocks[i].version->type == type)
1799                         return &adev->ip_blocks[i];
1800
1801         return NULL;
1802 }
1803
1804 /**
1805  * amdgpu_device_ip_block_version_cmp
1806  *
1807  * @adev: amdgpu_device pointer
1808  * @type: enum amd_ip_block_type
1809  * @major: major version
1810  * @minor: minor version
1811  *
1812  * return 0 if equal or greater
1813  * return 1 if smaller or the ip_block doesn't exist
1814  */
1815 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1816                                        enum amd_ip_block_type type,
1817                                        u32 major, u32 minor)
1818 {
1819         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1820
1821         if (ip_block && ((ip_block->version->major > major) ||
1822                         ((ip_block->version->major == major) &&
1823                         (ip_block->version->minor >= minor))))
1824                 return 0;
1825
1826         return 1;
1827 }
1828
1829 /**
1830  * amdgpu_device_ip_block_add
1831  *
1832  * @adev: amdgpu_device pointer
1833  * @ip_block_version: pointer to the IP to add
1834  *
1835  * Adds the IP block driver information to the collection of IPs
1836  * on the asic.
1837  */
1838 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1839                                const struct amdgpu_ip_block_version *ip_block_version)
1840 {
1841         if (!ip_block_version)
1842                 return -EINVAL;
1843
1844         switch (ip_block_version->type) {
1845         case AMD_IP_BLOCK_TYPE_VCN:
1846                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1847                         return 0;
1848                 break;
1849         case AMD_IP_BLOCK_TYPE_JPEG:
1850                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1851                         return 0;
1852                 break;
1853         default:
1854                 break;
1855         }
1856
1857         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1858                   ip_block_version->funcs->name);
1859
1860         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1861
1862         return 0;
1863 }
1864
1865 /**
1866  * amdgpu_device_enable_virtual_display - enable virtual display feature
1867  *
1868  * @adev: amdgpu_device pointer
1869  *
1870  * Enabled the virtual display feature if the user has enabled it via
1871  * the module parameter virtual_display.  This feature provides a virtual
1872  * display hardware on headless boards or in virtualized environments.
1873  * This function parses and validates the configuration string specified by
1874  * the user and configues the virtual display configuration (number of
1875  * virtual connectors, crtcs, etc.) specified.
1876  */
1877 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1878 {
1879         adev->enable_virtual_display = false;
1880
1881         if (amdgpu_virtual_display) {
1882                 const char *pci_address_name = pci_name(adev->pdev);
1883                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1884
1885                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1886                 pciaddstr_tmp = pciaddstr;
1887                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1888                         pciaddname = strsep(&pciaddname_tmp, ",");
1889                         if (!strcmp("all", pciaddname)
1890                             || !strcmp(pci_address_name, pciaddname)) {
1891                                 long num_crtc;
1892                                 int res = -1;
1893
1894                                 adev->enable_virtual_display = true;
1895
1896                                 if (pciaddname_tmp)
1897                                         res = kstrtol(pciaddname_tmp, 10,
1898                                                       &num_crtc);
1899
1900                                 if (!res) {
1901                                         if (num_crtc < 1)
1902                                                 num_crtc = 1;
1903                                         if (num_crtc > 6)
1904                                                 num_crtc = 6;
1905                                         adev->mode_info.num_crtc = num_crtc;
1906                                 } else {
1907                                         adev->mode_info.num_crtc = 1;
1908                                 }
1909                                 break;
1910                         }
1911                 }
1912
1913                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1914                          amdgpu_virtual_display, pci_address_name,
1915                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1916
1917                 kfree(pciaddstr);
1918         }
1919 }
1920
1921 /**
1922  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1923  *
1924  * @adev: amdgpu_device pointer
1925  *
1926  * Parses the asic configuration parameters specified in the gpu info
1927  * firmware and makes them availale to the driver for use in configuring
1928  * the asic.
1929  * Returns 0 on success, -EINVAL on failure.
1930  */
1931 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1932 {
1933         const char *chip_name;
1934         char fw_name[40];
1935         int err;
1936         const struct gpu_info_firmware_header_v1_0 *hdr;
1937
1938         adev->firmware.gpu_info_fw = NULL;
1939
1940         if (adev->mman.discovery_bin) {
1941                 /*
1942                  * FIXME: The bounding box is still needed by Navi12, so
1943                  * temporarily read it from gpu_info firmware. Should be dropped
1944                  * when DAL no longer needs it.
1945                  */
1946                 if (adev->asic_type != CHIP_NAVI12)
1947                         return 0;
1948         }
1949
1950         switch (adev->asic_type) {
1951         default:
1952                 return 0;
1953         case CHIP_VEGA10:
1954                 chip_name = "vega10";
1955                 break;
1956         case CHIP_VEGA12:
1957                 chip_name = "vega12";
1958                 break;
1959         case CHIP_RAVEN:
1960                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1961                         chip_name = "raven2";
1962                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1963                         chip_name = "picasso";
1964                 else
1965                         chip_name = "raven";
1966                 break;
1967         case CHIP_ARCTURUS:
1968                 chip_name = "arcturus";
1969                 break;
1970         case CHIP_NAVI12:
1971                 chip_name = "navi12";
1972                 break;
1973         }
1974
1975         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1976         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1977         if (err) {
1978                 dev_err(adev->dev,
1979                         "Failed to load gpu_info firmware \"%s\"\n",
1980                         fw_name);
1981                 goto out;
1982         }
1983         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1984         if (err) {
1985                 dev_err(adev->dev,
1986                         "Failed to validate gpu_info firmware \"%s\"\n",
1987                         fw_name);
1988                 goto out;
1989         }
1990
1991         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1992         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1993
1994         switch (hdr->version_major) {
1995         case 1:
1996         {
1997                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1998                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1999                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2000
2001                 /*
2002                  * Should be droped when DAL no longer needs it.
2003                  */
2004                 if (adev->asic_type == CHIP_NAVI12)
2005                         goto parse_soc_bounding_box;
2006
2007                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2008                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2009                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2010                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2011                 adev->gfx.config.max_texture_channel_caches =
2012                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
2013                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2014                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2015                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2016                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2017                 adev->gfx.config.double_offchip_lds_buf =
2018                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2019                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2020                 adev->gfx.cu_info.max_waves_per_simd =
2021                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2022                 adev->gfx.cu_info.max_scratch_slots_per_cu =
2023                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2024                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2025                 if (hdr->version_minor >= 1) {
2026                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2027                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2028                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2029                         adev->gfx.config.num_sc_per_sh =
2030                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2031                         adev->gfx.config.num_packer_per_sc =
2032                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2033                 }
2034
2035 parse_soc_bounding_box:
2036                 /*
2037                  * soc bounding box info is not integrated in disocovery table,
2038                  * we always need to parse it from gpu info firmware if needed.
2039                  */
2040                 if (hdr->version_minor == 2) {
2041                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2042                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2043                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2044                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2045                 }
2046                 break;
2047         }
2048         default:
2049                 dev_err(adev->dev,
2050                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2051                 err = -EINVAL;
2052                 goto out;
2053         }
2054 out:
2055         return err;
2056 }
2057
2058 /**
2059  * amdgpu_device_ip_early_init - run early init for hardware IPs
2060  *
2061  * @adev: amdgpu_device pointer
2062  *
2063  * Early initialization pass for hardware IPs.  The hardware IPs that make
2064  * up each asic are discovered each IP's early_init callback is run.  This
2065  * is the first stage in initializing the asic.
2066  * Returns 0 on success, negative error code on failure.
2067  */
2068 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2069 {
2070         struct drm_device *dev = adev_to_drm(adev);
2071         struct pci_dev *parent;
2072         int i, r;
2073
2074         amdgpu_device_enable_virtual_display(adev);
2075
2076         if (amdgpu_sriov_vf(adev)) {
2077                 r = amdgpu_virt_request_full_gpu(adev, true);
2078                 if (r)
2079                         return r;
2080         }
2081
2082         switch (adev->asic_type) {
2083 #ifdef CONFIG_DRM_AMDGPU_SI
2084         case CHIP_VERDE:
2085         case CHIP_TAHITI:
2086         case CHIP_PITCAIRN:
2087         case CHIP_OLAND:
2088         case CHIP_HAINAN:
2089                 adev->family = AMDGPU_FAMILY_SI;
2090                 r = si_set_ip_blocks(adev);
2091                 if (r)
2092                         return r;
2093                 break;
2094 #endif
2095 #ifdef CONFIG_DRM_AMDGPU_CIK
2096         case CHIP_BONAIRE:
2097         case CHIP_HAWAII:
2098         case CHIP_KAVERI:
2099         case CHIP_KABINI:
2100         case CHIP_MULLINS:
2101                 if (adev->flags & AMD_IS_APU)
2102                         adev->family = AMDGPU_FAMILY_KV;
2103                 else
2104                         adev->family = AMDGPU_FAMILY_CI;
2105
2106                 r = cik_set_ip_blocks(adev);
2107                 if (r)
2108                         return r;
2109                 break;
2110 #endif
2111         case CHIP_TOPAZ:
2112         case CHIP_TONGA:
2113         case CHIP_FIJI:
2114         case CHIP_POLARIS10:
2115         case CHIP_POLARIS11:
2116         case CHIP_POLARIS12:
2117         case CHIP_VEGAM:
2118         case CHIP_CARRIZO:
2119         case CHIP_STONEY:
2120                 if (adev->flags & AMD_IS_APU)
2121                         adev->family = AMDGPU_FAMILY_CZ;
2122                 else
2123                         adev->family = AMDGPU_FAMILY_VI;
2124
2125                 r = vi_set_ip_blocks(adev);
2126                 if (r)
2127                         return r;
2128                 break;
2129         default:
2130                 r = amdgpu_discovery_set_ip_blocks(adev);
2131                 if (r)
2132                         return r;
2133                 break;
2134         }
2135
2136         if (amdgpu_has_atpx() &&
2137             (amdgpu_is_atpx_hybrid() ||
2138              amdgpu_has_atpx_dgpu_power_cntl()) &&
2139             ((adev->flags & AMD_IS_APU) == 0) &&
2140             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2141                 adev->flags |= AMD_IS_PX;
2142
2143         if (!(adev->flags & AMD_IS_APU)) {
2144                 parent = pci_upstream_bridge(adev->pdev);
2145                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2146         }
2147
2148         amdgpu_amdkfd_device_probe(adev);
2149
2150         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2151         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2152                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2153         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2154                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2155
2156         for (i = 0; i < adev->num_ip_blocks; i++) {
2157                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2158                         DRM_ERROR("disabled ip block: %d <%s>\n",
2159                                   i, adev->ip_blocks[i].version->funcs->name);
2160                         adev->ip_blocks[i].status.valid = false;
2161                 } else {
2162                         if (adev->ip_blocks[i].version->funcs->early_init) {
2163                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2164                                 if (r == -ENOENT) {
2165                                         adev->ip_blocks[i].status.valid = false;
2166                                 } else if (r) {
2167                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2168                                                   adev->ip_blocks[i].version->funcs->name, r);
2169                                         return r;
2170                                 } else {
2171                                         adev->ip_blocks[i].status.valid = true;
2172                                 }
2173                         } else {
2174                                 adev->ip_blocks[i].status.valid = true;
2175                         }
2176                 }
2177                 /* get the vbios after the asic_funcs are set up */
2178                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2179                         r = amdgpu_device_parse_gpu_info_fw(adev);
2180                         if (r)
2181                                 return r;
2182
2183                         /* Read BIOS */
2184                         if (!amdgpu_get_bios(adev))
2185                                 return -EINVAL;
2186
2187                         r = amdgpu_atombios_init(adev);
2188                         if (r) {
2189                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2190                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2191                                 return r;
2192                         }
2193
2194                         /*get pf2vf msg info at it's earliest time*/
2195                         if (amdgpu_sriov_vf(adev))
2196                                 amdgpu_virt_init_data_exchange(adev);
2197
2198                 }
2199         }
2200
2201         adev->cg_flags &= amdgpu_cg_mask;
2202         adev->pg_flags &= amdgpu_pg_mask;
2203
2204         return 0;
2205 }
2206
2207 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2208 {
2209         int i, r;
2210
2211         for (i = 0; i < adev->num_ip_blocks; i++) {
2212                 if (!adev->ip_blocks[i].status.sw)
2213                         continue;
2214                 if (adev->ip_blocks[i].status.hw)
2215                         continue;
2216                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2217                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2218                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2219                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2220                         if (r) {
2221                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2222                                           adev->ip_blocks[i].version->funcs->name, r);
2223                                 return r;
2224                         }
2225                         adev->ip_blocks[i].status.hw = true;
2226                 }
2227         }
2228
2229         return 0;
2230 }
2231
2232 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2233 {
2234         int i, r;
2235
2236         for (i = 0; i < adev->num_ip_blocks; i++) {
2237                 if (!adev->ip_blocks[i].status.sw)
2238                         continue;
2239                 if (adev->ip_blocks[i].status.hw)
2240                         continue;
2241                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2242                 if (r) {
2243                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2244                                   adev->ip_blocks[i].version->funcs->name, r);
2245                         return r;
2246                 }
2247                 adev->ip_blocks[i].status.hw = true;
2248         }
2249
2250         return 0;
2251 }
2252
2253 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2254 {
2255         int r = 0;
2256         int i;
2257         uint32_t smu_version;
2258
2259         if (adev->asic_type >= CHIP_VEGA10) {
2260                 for (i = 0; i < adev->num_ip_blocks; i++) {
2261                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2262                                 continue;
2263
2264                         if (!adev->ip_blocks[i].status.sw)
2265                                 continue;
2266
2267                         /* no need to do the fw loading again if already done*/
2268                         if (adev->ip_blocks[i].status.hw == true)
2269                                 break;
2270
2271                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2272                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2273                                 if (r) {
2274                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2275                                                           adev->ip_blocks[i].version->funcs->name, r);
2276                                         return r;
2277                                 }
2278                         } else {
2279                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2280                                 if (r) {
2281                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2282                                                           adev->ip_blocks[i].version->funcs->name, r);
2283                                         return r;
2284                                 }
2285                         }
2286
2287                         adev->ip_blocks[i].status.hw = true;
2288                         break;
2289                 }
2290         }
2291
2292         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2293                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2294
2295         return r;
2296 }
2297
2298 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2299 {
2300         long timeout;
2301         int r, i;
2302
2303         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2304                 struct amdgpu_ring *ring = adev->rings[i];
2305
2306                 /* No need to setup the GPU scheduler for rings that don't need it */
2307                 if (!ring || ring->no_scheduler)
2308                         continue;
2309
2310                 switch (ring->funcs->type) {
2311                 case AMDGPU_RING_TYPE_GFX:
2312                         timeout = adev->gfx_timeout;
2313                         break;
2314                 case AMDGPU_RING_TYPE_COMPUTE:
2315                         timeout = adev->compute_timeout;
2316                         break;
2317                 case AMDGPU_RING_TYPE_SDMA:
2318                         timeout = adev->sdma_timeout;
2319                         break;
2320                 default:
2321                         timeout = adev->video_timeout;
2322                         break;
2323                 }
2324
2325                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2326                                    ring->num_hw_submission, amdgpu_job_hang_limit,
2327                                    timeout, adev->reset_domain->wq,
2328                                    ring->sched_score, ring->name,
2329                                    adev->dev);
2330                 if (r) {
2331                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2332                                   ring->name);
2333                         return r;
2334                 }
2335         }
2336
2337         return 0;
2338 }
2339
2340
2341 /**
2342  * amdgpu_device_ip_init - run init for hardware IPs
2343  *
2344  * @adev: amdgpu_device pointer
2345  *
2346  * Main initialization pass for hardware IPs.  The list of all the hardware
2347  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2348  * are run.  sw_init initializes the software state associated with each IP
2349  * and hw_init initializes the hardware associated with each IP.
2350  * Returns 0 on success, negative error code on failure.
2351  */
2352 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2353 {
2354         int i, r;
2355
2356         r = amdgpu_ras_init(adev);
2357         if (r)
2358                 return r;
2359
2360         for (i = 0; i < adev->num_ip_blocks; i++) {
2361                 if (!adev->ip_blocks[i].status.valid)
2362                         continue;
2363                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2364                 if (r) {
2365                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2366                                   adev->ip_blocks[i].version->funcs->name, r);
2367                         goto init_failed;
2368                 }
2369                 adev->ip_blocks[i].status.sw = true;
2370
2371                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2372                         /* need to do common hw init early so everything is set up for gmc */
2373                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2374                         if (r) {
2375                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2376                                 goto init_failed;
2377                         }
2378                         adev->ip_blocks[i].status.hw = true;
2379                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2380                         /* need to do gmc hw init early so we can allocate gpu mem */
2381                         /* Try to reserve bad pages early */
2382                         if (amdgpu_sriov_vf(adev))
2383                                 amdgpu_virt_exchange_data(adev);
2384
2385                         r = amdgpu_device_vram_scratch_init(adev);
2386                         if (r) {
2387                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2388                                 goto init_failed;
2389                         }
2390                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2391                         if (r) {
2392                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2393                                 goto init_failed;
2394                         }
2395                         r = amdgpu_device_wb_init(adev);
2396                         if (r) {
2397                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2398                                 goto init_failed;
2399                         }
2400                         adev->ip_blocks[i].status.hw = true;
2401
2402                         /* right after GMC hw init, we create CSA */
2403                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2404                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2405                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2406                                                                 AMDGPU_CSA_SIZE);
2407                                 if (r) {
2408                                         DRM_ERROR("allocate CSA failed %d\n", r);
2409                                         goto init_failed;
2410                                 }
2411                         }
2412                 }
2413         }
2414
2415         if (amdgpu_sriov_vf(adev))
2416                 amdgpu_virt_init_data_exchange(adev);
2417
2418         r = amdgpu_ib_pool_init(adev);
2419         if (r) {
2420                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2421                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2422                 goto init_failed;
2423         }
2424
2425         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2426         if (r)
2427                 goto init_failed;
2428
2429         r = amdgpu_device_ip_hw_init_phase1(adev);
2430         if (r)
2431                 goto init_failed;
2432
2433         r = amdgpu_device_fw_loading(adev);
2434         if (r)
2435                 goto init_failed;
2436
2437         r = amdgpu_device_ip_hw_init_phase2(adev);
2438         if (r)
2439                 goto init_failed;
2440
2441         /*
2442          * retired pages will be loaded from eeprom and reserved here,
2443          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2444          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2445          * for I2C communication which only true at this point.
2446          *
2447          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2448          * failure from bad gpu situation and stop amdgpu init process
2449          * accordingly. For other failed cases, it will still release all
2450          * the resource and print error message, rather than returning one
2451          * negative value to upper level.
2452          *
2453          * Note: theoretically, this should be called before all vram allocations
2454          * to protect retired page from abusing
2455          */
2456         r = amdgpu_ras_recovery_init(adev);
2457         if (r)
2458                 goto init_failed;
2459
2460         /**
2461          * In case of XGMI grab extra reference for reset domain for this device
2462          */
2463         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2464                 if (amdgpu_xgmi_add_device(adev) == 0) {
2465                         if (!amdgpu_sriov_vf(adev)) {
2466                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2467
2468                                 if (WARN_ON(!hive)) {
2469                                         r = -ENOENT;
2470                                         goto init_failed;
2471                                 }
2472
2473                                 if (!hive->reset_domain ||
2474                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2475                                         r = -ENOENT;
2476                                         amdgpu_put_xgmi_hive(hive);
2477                                         goto init_failed;
2478                                 }
2479
2480                                 /* Drop the early temporary reset domain we created for device */
2481                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2482                                 adev->reset_domain = hive->reset_domain;
2483                                 amdgpu_put_xgmi_hive(hive);
2484                         }
2485                 }
2486         }
2487
2488         r = amdgpu_device_init_schedulers(adev);
2489         if (r)
2490                 goto init_failed;
2491
2492         /* Don't init kfd if whole hive need to be reset during init */
2493         if (!adev->gmc.xgmi.pending_reset)
2494                 amdgpu_amdkfd_device_init(adev);
2495
2496         amdgpu_fru_get_product_info(adev);
2497
2498 init_failed:
2499         if (amdgpu_sriov_vf(adev))
2500                 amdgpu_virt_release_full_gpu(adev, true);
2501
2502         return r;
2503 }
2504
2505 /**
2506  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2507  *
2508  * @adev: amdgpu_device pointer
2509  *
2510  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2511  * this function before a GPU reset.  If the value is retained after a
2512  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2513  */
2514 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2515 {
2516         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2517 }
2518
2519 /**
2520  * amdgpu_device_check_vram_lost - check if vram is valid
2521  *
2522  * @adev: amdgpu_device pointer
2523  *
2524  * Checks the reset magic value written to the gart pointer in VRAM.
2525  * The driver calls this after a GPU reset to see if the contents of
2526  * VRAM is lost or now.
2527  * returns true if vram is lost, false if not.
2528  */
2529 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2530 {
2531         if (memcmp(adev->gart.ptr, adev->reset_magic,
2532                         AMDGPU_RESET_MAGIC_NUM))
2533                 return true;
2534
2535         if (!amdgpu_in_reset(adev))
2536                 return false;
2537
2538         /*
2539          * For all ASICs with baco/mode1 reset, the VRAM is
2540          * always assumed to be lost.
2541          */
2542         switch (amdgpu_asic_reset_method(adev)) {
2543         case AMD_RESET_METHOD_BACO:
2544         case AMD_RESET_METHOD_MODE1:
2545                 return true;
2546         default:
2547                 return false;
2548         }
2549 }
2550
2551 /**
2552  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2553  *
2554  * @adev: amdgpu_device pointer
2555  * @state: clockgating state (gate or ungate)
2556  *
2557  * The list of all the hardware IPs that make up the asic is walked and the
2558  * set_clockgating_state callbacks are run.
2559  * Late initialization pass enabling clockgating for hardware IPs.
2560  * Fini or suspend, pass disabling clockgating for hardware IPs.
2561  * Returns 0 on success, negative error code on failure.
2562  */
2563
2564 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2565                                enum amd_clockgating_state state)
2566 {
2567         int i, j, r;
2568
2569         if (amdgpu_emu_mode == 1)
2570                 return 0;
2571
2572         for (j = 0; j < adev->num_ip_blocks; j++) {
2573                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2574                 if (!adev->ip_blocks[i].status.late_initialized)
2575                         continue;
2576                 /* skip CG for GFX on S0ix */
2577                 if (adev->in_s0ix &&
2578                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2579                         continue;
2580                 /* skip CG for VCE/UVD, it's handled specially */
2581                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2582                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2583                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2584                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2585                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2586                         /* enable clockgating to save power */
2587                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2588                                                                                      state);
2589                         if (r) {
2590                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2591                                           adev->ip_blocks[i].version->funcs->name, r);
2592                                 return r;
2593                         }
2594                 }
2595         }
2596
2597         return 0;
2598 }
2599
2600 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2601                                enum amd_powergating_state state)
2602 {
2603         int i, j, r;
2604
2605         if (amdgpu_emu_mode == 1)
2606                 return 0;
2607
2608         for (j = 0; j < adev->num_ip_blocks; j++) {
2609                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2610                 if (!adev->ip_blocks[i].status.late_initialized)
2611                         continue;
2612                 /* skip PG for GFX on S0ix */
2613                 if (adev->in_s0ix &&
2614                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2615                         continue;
2616                 /* skip CG for VCE/UVD, it's handled specially */
2617                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2618                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2619                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2620                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2621                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2622                         /* enable powergating to save power */
2623                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2624                                                                                         state);
2625                         if (r) {
2626                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2627                                           adev->ip_blocks[i].version->funcs->name, r);
2628                                 return r;
2629                         }
2630                 }
2631         }
2632         return 0;
2633 }
2634
2635 static int amdgpu_device_enable_mgpu_fan_boost(void)
2636 {
2637         struct amdgpu_gpu_instance *gpu_ins;
2638         struct amdgpu_device *adev;
2639         int i, ret = 0;
2640
2641         mutex_lock(&mgpu_info.mutex);
2642
2643         /*
2644          * MGPU fan boost feature should be enabled
2645          * only when there are two or more dGPUs in
2646          * the system
2647          */
2648         if (mgpu_info.num_dgpu < 2)
2649                 goto out;
2650
2651         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2652                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2653                 adev = gpu_ins->adev;
2654                 if (!(adev->flags & AMD_IS_APU) &&
2655                     !gpu_ins->mgpu_fan_enabled) {
2656                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2657                         if (ret)
2658                                 break;
2659
2660                         gpu_ins->mgpu_fan_enabled = 1;
2661                 }
2662         }
2663
2664 out:
2665         mutex_unlock(&mgpu_info.mutex);
2666
2667         return ret;
2668 }
2669
2670 /**
2671  * amdgpu_device_ip_late_init - run late init for hardware IPs
2672  *
2673  * @adev: amdgpu_device pointer
2674  *
2675  * Late initialization pass for hardware IPs.  The list of all the hardware
2676  * IPs that make up the asic is walked and the late_init callbacks are run.
2677  * late_init covers any special initialization that an IP requires
2678  * after all of the have been initialized or something that needs to happen
2679  * late in the init process.
2680  * Returns 0 on success, negative error code on failure.
2681  */
2682 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2683 {
2684         struct amdgpu_gpu_instance *gpu_instance;
2685         int i = 0, r;
2686
2687         for (i = 0; i < adev->num_ip_blocks; i++) {
2688                 if (!adev->ip_blocks[i].status.hw)
2689                         continue;
2690                 if (adev->ip_blocks[i].version->funcs->late_init) {
2691                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2692                         if (r) {
2693                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2694                                           adev->ip_blocks[i].version->funcs->name, r);
2695                                 return r;
2696                         }
2697                 }
2698                 adev->ip_blocks[i].status.late_initialized = true;
2699         }
2700
2701         r = amdgpu_ras_late_init(adev);
2702         if (r) {
2703                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2704                 return r;
2705         }
2706
2707         amdgpu_ras_set_error_query_ready(adev, true);
2708
2709         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2710         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2711
2712         amdgpu_device_fill_reset_magic(adev);
2713
2714         r = amdgpu_device_enable_mgpu_fan_boost();
2715         if (r)
2716                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2717
2718         /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2719         if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)||
2720                                adev->asic_type == CHIP_ALDEBARAN ))
2721                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2722
2723         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2724                 mutex_lock(&mgpu_info.mutex);
2725
2726                 /*
2727                  * Reset device p-state to low as this was booted with high.
2728                  *
2729                  * This should be performed only after all devices from the same
2730                  * hive get initialized.
2731                  *
2732                  * However, it's unknown how many device in the hive in advance.
2733                  * As this is counted one by one during devices initializations.
2734                  *
2735                  * So, we wait for all XGMI interlinked devices initialized.
2736                  * This may bring some delays as those devices may come from
2737                  * different hives. But that should be OK.
2738                  */
2739                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2740                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2741                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2742                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2743                                         continue;
2744
2745                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2746                                                 AMDGPU_XGMI_PSTATE_MIN);
2747                                 if (r) {
2748                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2749                                         break;
2750                                 }
2751                         }
2752                 }
2753
2754                 mutex_unlock(&mgpu_info.mutex);
2755         }
2756
2757         return 0;
2758 }
2759
2760 /**
2761  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2762  *
2763  * @adev: amdgpu_device pointer
2764  *
2765  * For ASICs need to disable SMC first
2766  */
2767 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2768 {
2769         int i, r;
2770
2771         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2772                 return;
2773
2774         for (i = 0; i < adev->num_ip_blocks; i++) {
2775                 if (!adev->ip_blocks[i].status.hw)
2776                         continue;
2777                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2778                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2779                         /* XXX handle errors */
2780                         if (r) {
2781                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2782                                           adev->ip_blocks[i].version->funcs->name, r);
2783                         }
2784                         adev->ip_blocks[i].status.hw = false;
2785                         break;
2786                 }
2787         }
2788 }
2789
2790 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2791 {
2792         int i, r;
2793
2794         for (i = 0; i < adev->num_ip_blocks; i++) {
2795                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2796                         continue;
2797
2798                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2799                 if (r) {
2800                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2801                                   adev->ip_blocks[i].version->funcs->name, r);
2802                 }
2803         }
2804
2805         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2806         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2807
2808         amdgpu_amdkfd_suspend(adev, false);
2809
2810         /* Workaroud for ASICs need to disable SMC first */
2811         amdgpu_device_smu_fini_early(adev);
2812
2813         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2814                 if (!adev->ip_blocks[i].status.hw)
2815                         continue;
2816
2817                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2818                 /* XXX handle errors */
2819                 if (r) {
2820                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2821                                   adev->ip_blocks[i].version->funcs->name, r);
2822                 }
2823
2824                 adev->ip_blocks[i].status.hw = false;
2825         }
2826
2827         if (amdgpu_sriov_vf(adev)) {
2828                 if (amdgpu_virt_release_full_gpu(adev, false))
2829                         DRM_ERROR("failed to release exclusive mode on fini\n");
2830         }
2831
2832         return 0;
2833 }
2834
2835 /**
2836  * amdgpu_device_ip_fini - run fini for hardware IPs
2837  *
2838  * @adev: amdgpu_device pointer
2839  *
2840  * Main teardown pass for hardware IPs.  The list of all the hardware
2841  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2842  * are run.  hw_fini tears down the hardware associated with each IP
2843  * and sw_fini tears down any software state associated with each IP.
2844  * Returns 0 on success, negative error code on failure.
2845  */
2846 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2847 {
2848         int i, r;
2849
2850         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2851                 amdgpu_virt_release_ras_err_handler_data(adev);
2852
2853         if (adev->gmc.xgmi.num_physical_nodes > 1)
2854                 amdgpu_xgmi_remove_device(adev);
2855
2856         amdgpu_amdkfd_device_fini_sw(adev);
2857
2858         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2859                 if (!adev->ip_blocks[i].status.sw)
2860                         continue;
2861
2862                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2863                         amdgpu_ucode_free_bo(adev);
2864                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2865                         amdgpu_device_wb_fini(adev);
2866                         amdgpu_device_vram_scratch_fini(adev);
2867                         amdgpu_ib_pool_fini(adev);
2868                 }
2869
2870                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2871                 /* XXX handle errors */
2872                 if (r) {
2873                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2874                                   adev->ip_blocks[i].version->funcs->name, r);
2875                 }
2876                 adev->ip_blocks[i].status.sw = false;
2877                 adev->ip_blocks[i].status.valid = false;
2878         }
2879
2880         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2881                 if (!adev->ip_blocks[i].status.late_initialized)
2882                         continue;
2883                 if (adev->ip_blocks[i].version->funcs->late_fini)
2884                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2885                 adev->ip_blocks[i].status.late_initialized = false;
2886         }
2887
2888         amdgpu_ras_fini(adev);
2889
2890         return 0;
2891 }
2892
2893 /**
2894  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2895  *
2896  * @work: work_struct.
2897  */
2898 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2899 {
2900         struct amdgpu_device *adev =
2901                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2902         int r;
2903
2904         r = amdgpu_ib_ring_tests(adev);
2905         if (r)
2906                 DRM_ERROR("ib ring test failed (%d).\n", r);
2907 }
2908
2909 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2910 {
2911         struct amdgpu_device *adev =
2912                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2913
2914         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2915         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2916
2917         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2918                 adev->gfx.gfx_off_state = true;
2919 }
2920
2921 /**
2922  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2923  *
2924  * @adev: amdgpu_device pointer
2925  *
2926  * Main suspend function for hardware IPs.  The list of all the hardware
2927  * IPs that make up the asic is walked, clockgating is disabled and the
2928  * suspend callbacks are run.  suspend puts the hardware and software state
2929  * in each IP into a state suitable for suspend.
2930  * Returns 0 on success, negative error code on failure.
2931  */
2932 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2933 {
2934         int i, r;
2935
2936         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2937         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2938
2939         /*
2940          * Per PMFW team's suggestion, driver needs to handle gfxoff
2941          * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2942          * scenario. Add the missing df cstate disablement here.
2943          */
2944         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2945                 dev_warn(adev->dev, "Failed to disallow df cstate");
2946
2947         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2948                 if (!adev->ip_blocks[i].status.valid)
2949                         continue;
2950
2951                 /* displays are handled separately */
2952                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2953                         continue;
2954
2955                 /* XXX handle errors */
2956                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2957                 /* XXX handle errors */
2958                 if (r) {
2959                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2960                                   adev->ip_blocks[i].version->funcs->name, r);
2961                         return r;
2962                 }
2963
2964                 adev->ip_blocks[i].status.hw = false;
2965         }
2966
2967         return 0;
2968 }
2969
2970 /**
2971  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2972  *
2973  * @adev: amdgpu_device pointer
2974  *
2975  * Main suspend function for hardware IPs.  The list of all the hardware
2976  * IPs that make up the asic is walked, clockgating is disabled and the
2977  * suspend callbacks are run.  suspend puts the hardware and software state
2978  * in each IP into a state suitable for suspend.
2979  * Returns 0 on success, negative error code on failure.
2980  */
2981 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2982 {
2983         int i, r;
2984
2985         if (adev->in_s0ix)
2986                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2987
2988         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2989                 if (!adev->ip_blocks[i].status.valid)
2990                         continue;
2991                 /* displays are handled in phase1 */
2992                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2993                         continue;
2994                 /* PSP lost connection when err_event_athub occurs */
2995                 if (amdgpu_ras_intr_triggered() &&
2996                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2997                         adev->ip_blocks[i].status.hw = false;
2998                         continue;
2999                 }
3000
3001                 /* skip unnecessary suspend if we do not initialize them yet */
3002                 if (adev->gmc.xgmi.pending_reset &&
3003                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3004                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3005                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3006                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3007                         adev->ip_blocks[i].status.hw = false;
3008                         continue;
3009                 }
3010
3011                 /* skip suspend of gfx/mes and psp for S0ix
3012                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
3013                  * like at runtime. PSP is also part of the always on hardware
3014                  * so no need to suspend it.
3015                  */
3016                 if (adev->in_s0ix &&
3017                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3018                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3019                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3020                         continue;
3021
3022                 /* XXX handle errors */
3023                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3024                 /* XXX handle errors */
3025                 if (r) {
3026                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3027                                   adev->ip_blocks[i].version->funcs->name, r);
3028                 }
3029                 adev->ip_blocks[i].status.hw = false;
3030                 /* handle putting the SMC in the appropriate state */
3031                 if(!amdgpu_sriov_vf(adev)){
3032                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3033                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3034                                 if (r) {
3035                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3036                                                         adev->mp1_state, r);
3037                                         return r;
3038                                 }
3039                         }
3040                 }
3041         }
3042
3043         return 0;
3044 }
3045
3046 /**
3047  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3048  *
3049  * @adev: amdgpu_device pointer
3050  *
3051  * Main suspend function for hardware IPs.  The list of all the hardware
3052  * IPs that make up the asic is walked, clockgating is disabled and the
3053  * suspend callbacks are run.  suspend puts the hardware and software state
3054  * in each IP into a state suitable for suspend.
3055  * Returns 0 on success, negative error code on failure.
3056  */
3057 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3058 {
3059         int r;
3060
3061         if (amdgpu_sriov_vf(adev)) {
3062                 amdgpu_virt_fini_data_exchange(adev);
3063                 amdgpu_virt_request_full_gpu(adev, false);
3064         }
3065
3066         r = amdgpu_device_ip_suspend_phase1(adev);
3067         if (r)
3068                 return r;
3069         r = amdgpu_device_ip_suspend_phase2(adev);
3070
3071         if (amdgpu_sriov_vf(adev))
3072                 amdgpu_virt_release_full_gpu(adev, false);
3073
3074         return r;
3075 }
3076
3077 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3078 {
3079         int i, r;
3080
3081         static enum amd_ip_block_type ip_order[] = {
3082                 AMD_IP_BLOCK_TYPE_COMMON,
3083                 AMD_IP_BLOCK_TYPE_GMC,
3084                 AMD_IP_BLOCK_TYPE_PSP,
3085                 AMD_IP_BLOCK_TYPE_IH,
3086         };
3087
3088         for (i = 0; i < adev->num_ip_blocks; i++) {
3089                 int j;
3090                 struct amdgpu_ip_block *block;
3091
3092                 block = &adev->ip_blocks[i];
3093                 block->status.hw = false;
3094
3095                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3096
3097                         if (block->version->type != ip_order[j] ||
3098                                 !block->status.valid)
3099                                 continue;
3100
3101                         r = block->version->funcs->hw_init(adev);
3102                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3103                         if (r)
3104                                 return r;
3105                         block->status.hw = true;
3106                 }
3107         }
3108
3109         return 0;
3110 }
3111
3112 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3113 {
3114         int i, r;
3115
3116         static enum amd_ip_block_type ip_order[] = {
3117                 AMD_IP_BLOCK_TYPE_SMC,
3118                 AMD_IP_BLOCK_TYPE_DCE,
3119                 AMD_IP_BLOCK_TYPE_GFX,
3120                 AMD_IP_BLOCK_TYPE_SDMA,
3121                 AMD_IP_BLOCK_TYPE_UVD,
3122                 AMD_IP_BLOCK_TYPE_VCE,
3123                 AMD_IP_BLOCK_TYPE_VCN
3124         };
3125
3126         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3127                 int j;
3128                 struct amdgpu_ip_block *block;
3129
3130                 for (j = 0; j < adev->num_ip_blocks; j++) {
3131                         block = &adev->ip_blocks[j];
3132
3133                         if (block->version->type != ip_order[i] ||
3134                                 !block->status.valid ||
3135                                 block->status.hw)
3136                                 continue;
3137
3138                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3139                                 r = block->version->funcs->resume(adev);
3140                         else
3141                                 r = block->version->funcs->hw_init(adev);
3142
3143                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3144                         if (r)
3145                                 return r;
3146                         block->status.hw = true;
3147                 }
3148         }
3149
3150         return 0;
3151 }
3152
3153 /**
3154  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3155  *
3156  * @adev: amdgpu_device pointer
3157  *
3158  * First resume function for hardware IPs.  The list of all the hardware
3159  * IPs that make up the asic is walked and the resume callbacks are run for
3160  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3161  * after a suspend and updates the software state as necessary.  This
3162  * function is also used for restoring the GPU after a GPU reset.
3163  * Returns 0 on success, negative error code on failure.
3164  */
3165 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3166 {
3167         int i, r;
3168
3169         for (i = 0; i < adev->num_ip_blocks; i++) {
3170                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3171                         continue;
3172                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3173                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3174                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3175                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3176
3177                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3178                         if (r) {
3179                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3180                                           adev->ip_blocks[i].version->funcs->name, r);
3181                                 return r;
3182                         }
3183                         adev->ip_blocks[i].status.hw = true;
3184                 }
3185         }
3186
3187         return 0;
3188 }
3189
3190 /**
3191  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3192  *
3193  * @adev: amdgpu_device pointer
3194  *
3195  * First resume function for hardware IPs.  The list of all the hardware
3196  * IPs that make up the asic is walked and the resume callbacks are run for
3197  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3198  * functional state after a suspend and updates the software state as
3199  * necessary.  This function is also used for restoring the GPU after a GPU
3200  * reset.
3201  * Returns 0 on success, negative error code on failure.
3202  */
3203 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3204 {
3205         int i, r;
3206
3207         for (i = 0; i < adev->num_ip_blocks; i++) {
3208                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3209                         continue;
3210                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3211                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3212                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3213                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3214                         continue;
3215                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3216                 if (r) {
3217                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3218                                   adev->ip_blocks[i].version->funcs->name, r);
3219                         return r;
3220                 }
3221                 adev->ip_blocks[i].status.hw = true;
3222
3223                 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3224                         /* disable gfxoff for IP resume. The gfxoff will be re-enabled in
3225                          * amdgpu_device_resume() after IP resume.
3226                          */
3227                         amdgpu_gfx_off_ctrl(adev, false);
3228                         DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
3229                 }
3230
3231         }
3232
3233         return 0;
3234 }
3235
3236 /**
3237  * amdgpu_device_ip_resume - run resume for hardware IPs
3238  *
3239  * @adev: amdgpu_device pointer
3240  *
3241  * Main resume function for hardware IPs.  The hardware IPs
3242  * are split into two resume functions because they are
3243  * are also used in in recovering from a GPU reset and some additional
3244  * steps need to be take between them.  In this case (S3/S4) they are
3245  * run sequentially.
3246  * Returns 0 on success, negative error code on failure.
3247  */
3248 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3249 {
3250         int r;
3251
3252         r = amdgpu_amdkfd_resume_iommu(adev);
3253         if (r)
3254                 return r;
3255
3256         r = amdgpu_device_ip_resume_phase1(adev);
3257         if (r)
3258                 return r;
3259
3260         r = amdgpu_device_fw_loading(adev);
3261         if (r)
3262                 return r;
3263
3264         r = amdgpu_device_ip_resume_phase2(adev);
3265
3266         return r;
3267 }
3268
3269 /**
3270  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3271  *
3272  * @adev: amdgpu_device pointer
3273  *
3274  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3275  */
3276 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3277 {
3278         if (amdgpu_sriov_vf(adev)) {
3279                 if (adev->is_atom_fw) {
3280                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3281                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3282                 } else {
3283                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3284                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3285                 }
3286
3287                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3288                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3289         }
3290 }
3291
3292 /**
3293  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3294  *
3295  * @asic_type: AMD asic type
3296  *
3297  * Check if there is DC (new modesetting infrastructre) support for an asic.
3298  * returns true if DC has support, false if not.
3299  */
3300 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3301 {
3302         switch (asic_type) {
3303 #ifdef CONFIG_DRM_AMDGPU_SI
3304         case CHIP_HAINAN:
3305 #endif
3306         case CHIP_TOPAZ:
3307                 /* chips with no display hardware */
3308                 return false;
3309 #if defined(CONFIG_DRM_AMD_DC)
3310         case CHIP_TAHITI:
3311         case CHIP_PITCAIRN:
3312         case CHIP_VERDE:
3313         case CHIP_OLAND:
3314                 /*
3315                  * We have systems in the wild with these ASICs that require
3316                  * LVDS and VGA support which is not supported with DC.
3317                  *
3318                  * Fallback to the non-DC driver here by default so as not to
3319                  * cause regressions.
3320                  */
3321 #if defined(CONFIG_DRM_AMD_DC_SI)
3322                 return amdgpu_dc > 0;
3323 #else
3324                 return false;
3325 #endif
3326         case CHIP_BONAIRE:
3327         case CHIP_KAVERI:
3328         case CHIP_KABINI:
3329         case CHIP_MULLINS:
3330                 /*
3331                  * We have systems in the wild with these ASICs that require
3332                  * VGA support which is not supported with DC.
3333                  *
3334                  * Fallback to the non-DC driver here by default so as not to
3335                  * cause regressions.
3336                  */
3337                 return amdgpu_dc > 0;
3338         default:
3339                 return amdgpu_dc != 0;
3340 #else
3341         default:
3342                 if (amdgpu_dc > 0)
3343                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3344                                          "but isn't supported by ASIC, ignoring\n");
3345                 return false;
3346 #endif
3347         }
3348 }
3349
3350 /**
3351  * amdgpu_device_has_dc_support - check if dc is supported
3352  *
3353  * @adev: amdgpu_device pointer
3354  *
3355  * Returns true for supported, false for not supported
3356  */
3357 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3358 {
3359         if (amdgpu_sriov_vf(adev) ||
3360             adev->enable_virtual_display ||
3361             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3362                 return false;
3363
3364         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3365 }
3366
3367 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3368 {
3369         struct amdgpu_device *adev =
3370                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3371         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3372
3373         /* It's a bug to not have a hive within this function */
3374         if (WARN_ON(!hive))
3375                 return;
3376
3377         /*
3378          * Use task barrier to synchronize all xgmi reset works across the
3379          * hive. task_barrier_enter and task_barrier_exit will block
3380          * until all the threads running the xgmi reset works reach
3381          * those points. task_barrier_full will do both blocks.
3382          */
3383         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3384
3385                 task_barrier_enter(&hive->tb);
3386                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3387
3388                 if (adev->asic_reset_res)
3389                         goto fail;
3390
3391                 task_barrier_exit(&hive->tb);
3392                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3393
3394                 if (adev->asic_reset_res)
3395                         goto fail;
3396
3397                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3398                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3399                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3400         } else {
3401
3402                 task_barrier_full(&hive->tb);
3403                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3404         }
3405
3406 fail:
3407         if (adev->asic_reset_res)
3408                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3409                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3410         amdgpu_put_xgmi_hive(hive);
3411 }
3412
3413 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3414 {
3415         char *input = amdgpu_lockup_timeout;
3416         char *timeout_setting = NULL;
3417         int index = 0;
3418         long timeout;
3419         int ret = 0;
3420
3421         /*
3422          * By default timeout for non compute jobs is 10000
3423          * and 60000 for compute jobs.
3424          * In SR-IOV or passthrough mode, timeout for compute
3425          * jobs are 60000 by default.
3426          */
3427         adev->gfx_timeout = msecs_to_jiffies(10000);
3428         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3429         if (amdgpu_sriov_vf(adev))
3430                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3431                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3432         else
3433                 adev->compute_timeout =  msecs_to_jiffies(60000);
3434
3435         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3436                 while ((timeout_setting = strsep(&input, ",")) &&
3437                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3438                         ret = kstrtol(timeout_setting, 0, &timeout);
3439                         if (ret)
3440                                 return ret;
3441
3442                         if (timeout == 0) {
3443                                 index++;
3444                                 continue;
3445                         } else if (timeout < 0) {
3446                                 timeout = MAX_SCHEDULE_TIMEOUT;
3447                                 dev_warn(adev->dev, "lockup timeout disabled");
3448                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3449                         } else {
3450                                 timeout = msecs_to_jiffies(timeout);
3451                         }
3452
3453                         switch (index++) {
3454                         case 0:
3455                                 adev->gfx_timeout = timeout;
3456                                 break;
3457                         case 1:
3458                                 adev->compute_timeout = timeout;
3459                                 break;
3460                         case 2:
3461                                 adev->sdma_timeout = timeout;
3462                                 break;
3463                         case 3:
3464                                 adev->video_timeout = timeout;
3465                                 break;
3466                         default:
3467                                 break;
3468                         }
3469                 }
3470                 /*
3471                  * There is only one value specified and
3472                  * it should apply to all non-compute jobs.
3473                  */
3474                 if (index == 1) {
3475                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3476                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3477                                 adev->compute_timeout = adev->gfx_timeout;
3478                 }
3479         }
3480
3481         return ret;
3482 }
3483
3484 /**
3485  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3486  *
3487  * @adev: amdgpu_device pointer
3488  *
3489  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3490  */
3491 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3492 {
3493         struct iommu_domain *domain;
3494
3495         domain = iommu_get_domain_for_dev(adev->dev);
3496         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3497                 adev->ram_is_direct_mapped = true;
3498 }
3499
3500 static const struct attribute *amdgpu_dev_attributes[] = {
3501         &dev_attr_product_name.attr,
3502         &dev_attr_product_number.attr,
3503         &dev_attr_serial_number.attr,
3504         &dev_attr_pcie_replay_count.attr,
3505         NULL
3506 };
3507
3508 /**
3509  * amdgpu_device_init - initialize the driver
3510  *
3511  * @adev: amdgpu_device pointer
3512  * @flags: driver flags
3513  *
3514  * Initializes the driver info and hw (all asics).
3515  * Returns 0 for success or an error on failure.
3516  * Called at driver startup.
3517  */
3518 int amdgpu_device_init(struct amdgpu_device *adev,
3519                        uint32_t flags)
3520 {
3521         struct drm_device *ddev = adev_to_drm(adev);
3522         struct pci_dev *pdev = adev->pdev;
3523         int r, i;
3524         bool px = false;
3525         u32 max_MBps;
3526
3527         adev->shutdown = false;
3528         adev->flags = flags;
3529
3530         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3531                 adev->asic_type = amdgpu_force_asic_type;
3532         else
3533                 adev->asic_type = flags & AMD_ASIC_MASK;
3534
3535         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3536         if (amdgpu_emu_mode == 1)
3537                 adev->usec_timeout *= 10;
3538         adev->gmc.gart_size = 512 * 1024 * 1024;
3539         adev->accel_working = false;
3540         adev->num_rings = 0;
3541         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3542         adev->mman.buffer_funcs = NULL;
3543         adev->mman.buffer_funcs_ring = NULL;
3544         adev->vm_manager.vm_pte_funcs = NULL;
3545         adev->vm_manager.vm_pte_num_scheds = 0;
3546         adev->gmc.gmc_funcs = NULL;
3547         adev->harvest_ip_mask = 0x0;
3548         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3549         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3550
3551         adev->smc_rreg = &amdgpu_invalid_rreg;
3552         adev->smc_wreg = &amdgpu_invalid_wreg;
3553         adev->pcie_rreg = &amdgpu_invalid_rreg;
3554         adev->pcie_wreg = &amdgpu_invalid_wreg;
3555         adev->pciep_rreg = &amdgpu_invalid_rreg;
3556         adev->pciep_wreg = &amdgpu_invalid_wreg;
3557         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3558         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3559         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3560         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3561         adev->didt_rreg = &amdgpu_invalid_rreg;
3562         adev->didt_wreg = &amdgpu_invalid_wreg;
3563         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3564         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3565         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3566         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3567
3568         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3569                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3570                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3571
3572         /* mutex initialization are all done here so we
3573          * can recall function without having locking issues */
3574         mutex_init(&adev->firmware.mutex);
3575         mutex_init(&adev->pm.mutex);
3576         mutex_init(&adev->gfx.gpu_clock_mutex);
3577         mutex_init(&adev->srbm_mutex);
3578         mutex_init(&adev->gfx.pipe_reserve_mutex);
3579         mutex_init(&adev->gfx.gfx_off_mutex);
3580         mutex_init(&adev->grbm_idx_mutex);
3581         mutex_init(&adev->mn_lock);
3582         mutex_init(&adev->virt.vf_errors.lock);
3583         hash_init(adev->mn_hash);
3584         mutex_init(&adev->psp.mutex);
3585         mutex_init(&adev->notifier_lock);
3586         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3587         mutex_init(&adev->benchmark_mutex);
3588
3589         amdgpu_device_init_apu_flags(adev);
3590
3591         r = amdgpu_device_check_arguments(adev);
3592         if (r)
3593                 return r;
3594
3595         spin_lock_init(&adev->mmio_idx_lock);
3596         spin_lock_init(&adev->smc_idx_lock);
3597         spin_lock_init(&adev->pcie_idx_lock);
3598         spin_lock_init(&adev->uvd_ctx_idx_lock);
3599         spin_lock_init(&adev->didt_idx_lock);
3600         spin_lock_init(&adev->gc_cac_idx_lock);
3601         spin_lock_init(&adev->se_cac_idx_lock);
3602         spin_lock_init(&adev->audio_endpt_idx_lock);
3603         spin_lock_init(&adev->mm_stats.lock);
3604
3605         INIT_LIST_HEAD(&adev->shadow_list);
3606         mutex_init(&adev->shadow_list_lock);
3607
3608         INIT_LIST_HEAD(&adev->reset_list);
3609
3610         INIT_LIST_HEAD(&adev->ras_list);
3611
3612         INIT_DELAYED_WORK(&adev->delayed_init_work,
3613                           amdgpu_device_delayed_init_work_handler);
3614         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3615                           amdgpu_device_delay_enable_gfx_off);
3616
3617         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3618
3619         adev->gfx.gfx_off_req_count = 1;
3620         adev->gfx.gfx_off_residency = 0;
3621         adev->gfx.gfx_off_entrycount = 0;
3622         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3623
3624         atomic_set(&adev->throttling_logging_enabled, 1);
3625         /*
3626          * If throttling continues, logging will be performed every minute
3627          * to avoid log flooding. "-1" is subtracted since the thermal
3628          * throttling interrupt comes every second. Thus, the total logging
3629          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3630          * for throttling interrupt) = 60 seconds.
3631          */
3632         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3633         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3634
3635         /* Registers mapping */
3636         /* TODO: block userspace mapping of io register */
3637         if (adev->asic_type >= CHIP_BONAIRE) {
3638                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3639                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3640         } else {
3641                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3642                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3643         }
3644
3645         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3646                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3647
3648         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3649         if (adev->rmmio == NULL) {
3650                 return -ENOMEM;
3651         }
3652         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3653         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3654
3655         amdgpu_device_get_pcie_info(adev);
3656
3657         if (amdgpu_mcbp)
3658                 DRM_INFO("MCBP is enabled\n");
3659
3660         /*
3661          * Reset domain needs to be present early, before XGMI hive discovered
3662          * (if any) and intitialized to use reset sem and in_gpu reset flag
3663          * early on during init and before calling to RREG32.
3664          */
3665         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3666         if (!adev->reset_domain)
3667                 return -ENOMEM;
3668
3669         /* detect hw virtualization here */
3670         amdgpu_detect_virtualization(adev);
3671
3672         r = amdgpu_device_get_job_timeout_settings(adev);
3673         if (r) {
3674                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3675                 return r;
3676         }
3677
3678         /* early init functions */
3679         r = amdgpu_device_ip_early_init(adev);
3680         if (r)
3681                 return r;
3682
3683         /* Get rid of things like offb */
3684         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3685         if (r)
3686                 return r;
3687
3688         /* Enable TMZ based on IP_VERSION */
3689         amdgpu_gmc_tmz_set(adev);
3690
3691         amdgpu_gmc_noretry_set(adev);
3692         /* Need to get xgmi info early to decide the reset behavior*/
3693         if (adev->gmc.xgmi.supported) {
3694                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3695                 if (r)
3696                         return r;
3697         }
3698
3699         /* enable PCIE atomic ops */
3700         if (amdgpu_sriov_vf(adev))
3701                 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3702                         adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3703                         (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3704         else
3705                 adev->have_atomics_support =
3706                         !pci_enable_atomic_ops_to_root(adev->pdev,
3707                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3708                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3709         if (!adev->have_atomics_support)
3710                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3711
3712         /* doorbell bar mapping and doorbell index init*/
3713         amdgpu_device_doorbell_init(adev);
3714
3715         if (amdgpu_emu_mode == 1) {
3716                 /* post the asic on emulation mode */
3717                 emu_soc_asic_init(adev);
3718                 goto fence_driver_init;
3719         }
3720
3721         amdgpu_reset_init(adev);
3722
3723         /* detect if we are with an SRIOV vbios */
3724         amdgpu_device_detect_sriov_bios(adev);
3725
3726         /* check if we need to reset the asic
3727          *  E.g., driver was not cleanly unloaded previously, etc.
3728          */
3729         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3730                 if (adev->gmc.xgmi.num_physical_nodes) {
3731                         dev_info(adev->dev, "Pending hive reset.\n");
3732                         adev->gmc.xgmi.pending_reset = true;
3733                         /* Only need to init necessary block for SMU to handle the reset */
3734                         for (i = 0; i < adev->num_ip_blocks; i++) {
3735                                 if (!adev->ip_blocks[i].status.valid)
3736                                         continue;
3737                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3738                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3739                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3740                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3741                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3742                                                 adev->ip_blocks[i].version->funcs->name);
3743                                         adev->ip_blocks[i].status.hw = true;
3744                                 }
3745                         }
3746                 } else {
3747                         r = amdgpu_asic_reset(adev);
3748                         if (r) {
3749                                 dev_err(adev->dev, "asic reset on init failed\n");
3750                                 goto failed;
3751                         }
3752                 }
3753         }
3754
3755         pci_enable_pcie_error_reporting(adev->pdev);
3756
3757         /* Post card if necessary */
3758         if (amdgpu_device_need_post(adev)) {
3759                 if (!adev->bios) {
3760                         dev_err(adev->dev, "no vBIOS found\n");
3761                         r = -EINVAL;
3762                         goto failed;
3763                 }
3764                 DRM_INFO("GPU posting now...\n");
3765                 r = amdgpu_device_asic_init(adev);
3766                 if (r) {
3767                         dev_err(adev->dev, "gpu post error!\n");
3768                         goto failed;
3769                 }
3770         }
3771
3772         if (adev->is_atom_fw) {
3773                 /* Initialize clocks */
3774                 r = amdgpu_atomfirmware_get_clock_info(adev);
3775                 if (r) {
3776                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3777                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3778                         goto failed;
3779                 }
3780         } else {
3781                 /* Initialize clocks */
3782                 r = amdgpu_atombios_get_clock_info(adev);
3783                 if (r) {
3784                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3785                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3786                         goto failed;
3787                 }
3788                 /* init i2c buses */
3789                 if (!amdgpu_device_has_dc_support(adev))
3790                         amdgpu_atombios_i2c_init(adev);
3791         }
3792
3793 fence_driver_init:
3794         /* Fence driver */
3795         r = amdgpu_fence_driver_sw_init(adev);
3796         if (r) {
3797                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3798                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3799                 goto failed;
3800         }
3801
3802         /* init the mode config */
3803         drm_mode_config_init(adev_to_drm(adev));
3804
3805         r = amdgpu_device_ip_init(adev);
3806         if (r) {
3807                 /* failed in exclusive mode due to timeout */
3808                 if (amdgpu_sriov_vf(adev) &&
3809                     !amdgpu_sriov_runtime(adev) &&
3810                     amdgpu_virt_mmio_blocked(adev) &&
3811                     !amdgpu_virt_wait_reset(adev)) {
3812                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3813                         /* Don't send request since VF is inactive. */
3814                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3815                         adev->virt.ops = NULL;
3816                         r = -EAGAIN;
3817                         goto release_ras_con;
3818                 }
3819                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3820                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3821                 goto release_ras_con;
3822         }
3823
3824         amdgpu_fence_driver_hw_init(adev);
3825
3826         dev_info(adev->dev,
3827                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3828                         adev->gfx.config.max_shader_engines,
3829                         adev->gfx.config.max_sh_per_se,
3830                         adev->gfx.config.max_cu_per_sh,
3831                         adev->gfx.cu_info.number);
3832
3833         adev->accel_working = true;
3834
3835         amdgpu_vm_check_compute_bug(adev);
3836
3837         /* Initialize the buffer migration limit. */
3838         if (amdgpu_moverate >= 0)
3839                 max_MBps = amdgpu_moverate;
3840         else
3841                 max_MBps = 8; /* Allow 8 MB/s. */
3842         /* Get a log2 for easy divisions. */
3843         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3844
3845         r = amdgpu_pm_sysfs_init(adev);
3846         if (r) {
3847                 adev->pm_sysfs_en = false;
3848                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3849         } else
3850                 adev->pm_sysfs_en = true;
3851
3852         r = amdgpu_ucode_sysfs_init(adev);
3853         if (r) {
3854                 adev->ucode_sysfs_en = false;
3855                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3856         } else
3857                 adev->ucode_sysfs_en = true;
3858
3859         r = amdgpu_psp_sysfs_init(adev);
3860         if (r) {
3861                 adev->psp_sysfs_en = false;
3862                 if (!amdgpu_sriov_vf(adev))
3863                         DRM_ERROR("Creating psp sysfs failed\n");
3864         } else
3865                 adev->psp_sysfs_en = true;
3866
3867         /*
3868          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3869          * Otherwise the mgpu fan boost feature will be skipped due to the
3870          * gpu instance is counted less.
3871          */
3872         amdgpu_register_gpu_instance(adev);
3873
3874         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3875          * explicit gating rather than handling it automatically.
3876          */
3877         if (!adev->gmc.xgmi.pending_reset) {
3878                 r = amdgpu_device_ip_late_init(adev);
3879                 if (r) {
3880                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3881                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3882                         goto release_ras_con;
3883                 }
3884                 /* must succeed. */
3885                 amdgpu_ras_resume(adev);
3886                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3887                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3888         }
3889
3890         if (amdgpu_sriov_vf(adev))
3891                 flush_delayed_work(&adev->delayed_init_work);
3892
3893         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3894         if (r)
3895                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3896
3897         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3898                 r = amdgpu_pmu_init(adev);
3899         if (r)
3900                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3901
3902         /* Have stored pci confspace at hand for restore in sudden PCI error */
3903         if (amdgpu_device_cache_pci_state(adev->pdev))
3904                 pci_restore_state(pdev);
3905
3906         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3907         /* this will fail for cards that aren't VGA class devices, just
3908          * ignore it */
3909         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3910                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3911
3912         if (amdgpu_device_supports_px(ddev)) {
3913                 px = true;
3914                 vga_switcheroo_register_client(adev->pdev,
3915                                                &amdgpu_switcheroo_ops, px);
3916                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3917         }
3918
3919         if (adev->gmc.xgmi.pending_reset)
3920                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3921                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3922
3923         amdgpu_device_check_iommu_direct_map(adev);
3924
3925         return 0;
3926
3927 release_ras_con:
3928         amdgpu_release_ras_context(adev);
3929
3930 failed:
3931         amdgpu_vf_error_trans_all(adev);
3932
3933         return r;
3934 }
3935
3936 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3937 {
3938
3939         /* Clear all CPU mappings pointing to this device */
3940         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3941
3942         /* Unmap all mapped bars - Doorbell, registers and VRAM */
3943         amdgpu_device_doorbell_fini(adev);
3944
3945         iounmap(adev->rmmio);
3946         adev->rmmio = NULL;
3947         if (adev->mman.aper_base_kaddr)
3948                 iounmap(adev->mman.aper_base_kaddr);
3949         adev->mman.aper_base_kaddr = NULL;
3950
3951         /* Memory manager related */
3952         if (!adev->gmc.xgmi.connected_to_cpu) {
3953                 arch_phys_wc_del(adev->gmc.vram_mtrr);
3954                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3955         }
3956 }
3957
3958 /**
3959  * amdgpu_device_fini_hw - tear down the driver
3960  *
3961  * @adev: amdgpu_device pointer
3962  *
3963  * Tear down the driver info (all asics).
3964  * Called at driver shutdown.
3965  */
3966 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3967 {
3968         dev_info(adev->dev, "amdgpu: finishing device.\n");
3969         flush_delayed_work(&adev->delayed_init_work);
3970         adev->shutdown = true;
3971
3972         /* make sure IB test finished before entering exclusive mode
3973          * to avoid preemption on IB test
3974          * */
3975         if (amdgpu_sriov_vf(adev)) {
3976                 amdgpu_virt_request_full_gpu(adev, false);
3977                 amdgpu_virt_fini_data_exchange(adev);
3978         }
3979
3980         /* disable all interrupts */
3981         amdgpu_irq_disable_all(adev);
3982         if (adev->mode_info.mode_config_initialized){
3983                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3984                         drm_helper_force_disable_all(adev_to_drm(adev));
3985                 else
3986                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3987         }
3988         amdgpu_fence_driver_hw_fini(adev);
3989
3990         if (adev->mman.initialized) {
3991                 flush_delayed_work(&adev->mman.bdev.wq);
3992                 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3993         }
3994
3995         if (adev->pm_sysfs_en)
3996                 amdgpu_pm_sysfs_fini(adev);
3997         if (adev->ucode_sysfs_en)
3998                 amdgpu_ucode_sysfs_fini(adev);
3999         if (adev->psp_sysfs_en)
4000                 amdgpu_psp_sysfs_fini(adev);
4001         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4002
4003         /* disable ras feature must before hw fini */
4004         amdgpu_ras_pre_fini(adev);
4005
4006         amdgpu_device_ip_fini_early(adev);
4007
4008         amdgpu_irq_fini_hw(adev);
4009
4010         if (adev->mman.initialized)
4011                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4012
4013         amdgpu_gart_dummy_page_fini(adev);
4014
4015         amdgpu_device_unmap_mmio(adev);
4016
4017 }
4018
4019 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4020 {
4021         int idx;
4022
4023         amdgpu_fence_driver_sw_fini(adev);
4024         amdgpu_device_ip_fini(adev);
4025         release_firmware(adev->firmware.gpu_info_fw);
4026         adev->firmware.gpu_info_fw = NULL;
4027         adev->accel_working = false;
4028         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4029
4030         amdgpu_reset_fini(adev);
4031
4032         /* free i2c buses */
4033         if (!amdgpu_device_has_dc_support(adev))
4034                 amdgpu_i2c_fini(adev);
4035
4036         if (amdgpu_emu_mode != 1)
4037                 amdgpu_atombios_fini(adev);
4038
4039         kfree(adev->bios);
4040         adev->bios = NULL;
4041         if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4042                 vga_switcheroo_unregister_client(adev->pdev);
4043                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4044         }
4045         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4046                 vga_client_unregister(adev->pdev);
4047
4048         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4049
4050                 iounmap(adev->rmmio);
4051                 adev->rmmio = NULL;
4052                 amdgpu_device_doorbell_fini(adev);
4053                 drm_dev_exit(idx);
4054         }
4055
4056         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4057                 amdgpu_pmu_fini(adev);
4058         if (adev->mman.discovery_bin)
4059                 amdgpu_discovery_fini(adev);
4060
4061         amdgpu_reset_put_reset_domain(adev->reset_domain);
4062         adev->reset_domain = NULL;
4063
4064         kfree(adev->pci_state);
4065
4066 }
4067
4068 /**
4069  * amdgpu_device_evict_resources - evict device resources
4070  * @adev: amdgpu device object
4071  *
4072  * Evicts all ttm device resources(vram BOs, gart table) from the lru list
4073  * of the vram memory type. Mainly used for evicting device resources
4074  * at suspend time.
4075  *
4076  */
4077 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4078 {
4079         int ret;
4080
4081         /* No need to evict vram on APUs for suspend to ram or s2idle */
4082         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4083                 return 0;
4084
4085         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4086         if (ret)
4087                 DRM_WARN("evicting device resources failed\n");
4088         return ret;
4089 }
4090
4091 /*
4092  * Suspend & resume.
4093  */
4094 /**
4095  * amdgpu_device_suspend - initiate device suspend
4096  *
4097  * @dev: drm dev pointer
4098  * @fbcon : notify the fbdev of suspend
4099  *
4100  * Puts the hw in the suspend state (all asics).
4101  * Returns 0 for success or an error on failure.
4102  * Called at driver suspend.
4103  */
4104 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4105 {
4106         struct amdgpu_device *adev = drm_to_adev(dev);
4107         int r = 0;
4108
4109         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4110                 return 0;
4111
4112         adev->in_suspend = true;
4113
4114         if (amdgpu_sriov_vf(adev)) {
4115                 amdgpu_virt_fini_data_exchange(adev);
4116                 r = amdgpu_virt_request_full_gpu(adev, false);
4117                 if (r)
4118                         return r;
4119         }
4120
4121         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4122                 DRM_WARN("smart shift update failed\n");
4123
4124         drm_kms_helper_poll_disable(dev);
4125
4126         if (fbcon)
4127                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4128
4129         cancel_delayed_work_sync(&adev->delayed_init_work);
4130
4131         amdgpu_ras_suspend(adev);
4132
4133         amdgpu_device_ip_suspend_phase1(adev);
4134
4135         if (!adev->in_s0ix)
4136                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4137
4138         r = amdgpu_device_evict_resources(adev);
4139         if (r)
4140                 return r;
4141
4142         amdgpu_fence_driver_hw_fini(adev);
4143
4144         amdgpu_device_ip_suspend_phase2(adev);
4145
4146         if (amdgpu_sriov_vf(adev))
4147                 amdgpu_virt_release_full_gpu(adev, false);
4148
4149         return 0;
4150 }
4151
4152 /**
4153  * amdgpu_device_resume - initiate device resume
4154  *
4155  * @dev: drm dev pointer
4156  * @fbcon : notify the fbdev of resume
4157  *
4158  * Bring the hw back to operating state (all asics).
4159  * Returns 0 for success or an error on failure.
4160  * Called at driver resume.
4161  */
4162 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4163 {
4164         struct amdgpu_device *adev = drm_to_adev(dev);
4165         int r = 0;
4166
4167         if (amdgpu_sriov_vf(adev)) {
4168                 r = amdgpu_virt_request_full_gpu(adev, true);
4169                 if (r)
4170                         return r;
4171         }
4172
4173         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4174                 return 0;
4175
4176         if (adev->in_s0ix)
4177                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4178
4179         /* post card */
4180         if (amdgpu_device_need_post(adev)) {
4181                 r = amdgpu_device_asic_init(adev);
4182                 if (r)
4183                         dev_err(adev->dev, "amdgpu asic init failed\n");
4184         }
4185
4186         r = amdgpu_device_ip_resume(adev);
4187
4188         /* no matter what r is, always need to properly release full GPU */
4189         if (amdgpu_sriov_vf(adev)) {
4190                 amdgpu_virt_init_data_exchange(adev);
4191                 amdgpu_virt_release_full_gpu(adev, true);
4192         }
4193
4194         if (r) {
4195                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4196                 return r;
4197         }
4198         amdgpu_fence_driver_hw_init(adev);
4199
4200         r = amdgpu_device_ip_late_init(adev);
4201         if (r)
4202                 return r;
4203
4204         queue_delayed_work(system_wq, &adev->delayed_init_work,
4205                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4206
4207         if (!adev->in_s0ix) {
4208                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4209                 if (r)
4210                         return r;
4211         }
4212
4213         /* Make sure IB tests flushed */
4214         flush_delayed_work(&adev->delayed_init_work);
4215
4216         if (adev->in_s0ix) {
4217                 /* re-enable gfxoff after IP resume. This re-enables gfxoff after
4218                  * it was disabled for IP resume in amdgpu_device_ip_resume_phase2().
4219                  */
4220                 amdgpu_gfx_off_ctrl(adev, true);
4221                 DRM_DEBUG("will enable gfxoff for the mission mode\n");
4222         }
4223         if (fbcon)
4224                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4225
4226         drm_kms_helper_poll_enable(dev);
4227
4228         amdgpu_ras_resume(adev);
4229
4230         /*
4231          * Most of the connector probing functions try to acquire runtime pm
4232          * refs to ensure that the GPU is powered on when connector polling is
4233          * performed. Since we're calling this from a runtime PM callback,
4234          * trying to acquire rpm refs will cause us to deadlock.
4235          *
4236          * Since we're guaranteed to be holding the rpm lock, it's safe to
4237          * temporarily disable the rpm helpers so this doesn't deadlock us.
4238          */
4239 #ifdef CONFIG_PM
4240         dev->dev->power.disable_depth++;
4241 #endif
4242         if (!amdgpu_device_has_dc_support(adev))
4243                 drm_helper_hpd_irq_event(dev);
4244         else
4245                 drm_kms_helper_hotplug_event(dev);
4246 #ifdef CONFIG_PM
4247         dev->dev->power.disable_depth--;
4248 #endif
4249         adev->in_suspend = false;
4250
4251         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4252                 DRM_WARN("smart shift update failed\n");
4253
4254         return 0;
4255 }
4256
4257 /**
4258  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4259  *
4260  * @adev: amdgpu_device pointer
4261  *
4262  * The list of all the hardware IPs that make up the asic is walked and
4263  * the check_soft_reset callbacks are run.  check_soft_reset determines
4264  * if the asic is still hung or not.
4265  * Returns true if any of the IPs are still in a hung state, false if not.
4266  */
4267 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4268 {
4269         int i;
4270         bool asic_hang = false;
4271
4272         if (amdgpu_sriov_vf(adev))
4273                 return true;
4274
4275         if (amdgpu_asic_need_full_reset(adev))
4276                 return true;
4277
4278         for (i = 0; i < adev->num_ip_blocks; i++) {
4279                 if (!adev->ip_blocks[i].status.valid)
4280                         continue;
4281                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4282                         adev->ip_blocks[i].status.hang =
4283                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4284                 if (adev->ip_blocks[i].status.hang) {
4285                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4286                         asic_hang = true;
4287                 }
4288         }
4289         return asic_hang;
4290 }
4291
4292 /**
4293  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4294  *
4295  * @adev: amdgpu_device pointer
4296  *
4297  * The list of all the hardware IPs that make up the asic is walked and the
4298  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4299  * handles any IP specific hardware or software state changes that are
4300  * necessary for a soft reset to succeed.
4301  * Returns 0 on success, negative error code on failure.
4302  */
4303 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4304 {
4305         int i, r = 0;
4306
4307         for (i = 0; i < adev->num_ip_blocks; i++) {
4308                 if (!adev->ip_blocks[i].status.valid)
4309                         continue;
4310                 if (adev->ip_blocks[i].status.hang &&
4311                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4312                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4313                         if (r)
4314                                 return r;
4315                 }
4316         }
4317
4318         return 0;
4319 }
4320
4321 /**
4322  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4323  *
4324  * @adev: amdgpu_device pointer
4325  *
4326  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4327  * reset is necessary to recover.
4328  * Returns true if a full asic reset is required, false if not.
4329  */
4330 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4331 {
4332         int i;
4333
4334         if (amdgpu_asic_need_full_reset(adev))
4335                 return true;
4336
4337         for (i = 0; i < adev->num_ip_blocks; i++) {
4338                 if (!adev->ip_blocks[i].status.valid)
4339                         continue;
4340                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4341                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4342                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4343                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4344                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4345                         if (adev->ip_blocks[i].status.hang) {
4346                                 dev_info(adev->dev, "Some block need full reset!\n");
4347                                 return true;
4348                         }
4349                 }
4350         }
4351         return false;
4352 }
4353
4354 /**
4355  * amdgpu_device_ip_soft_reset - do a soft reset
4356  *
4357  * @adev: amdgpu_device pointer
4358  *
4359  * The list of all the hardware IPs that make up the asic is walked and the
4360  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4361  * IP specific hardware or software state changes that are necessary to soft
4362  * reset the IP.
4363  * Returns 0 on success, negative error code on failure.
4364  */
4365 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4366 {
4367         int i, r = 0;
4368
4369         for (i = 0; i < adev->num_ip_blocks; i++) {
4370                 if (!adev->ip_blocks[i].status.valid)
4371                         continue;
4372                 if (adev->ip_blocks[i].status.hang &&
4373                     adev->ip_blocks[i].version->funcs->soft_reset) {
4374                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4375                         if (r)
4376                                 return r;
4377                 }
4378         }
4379
4380         return 0;
4381 }
4382
4383 /**
4384  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4385  *
4386  * @adev: amdgpu_device pointer
4387  *
4388  * The list of all the hardware IPs that make up the asic is walked and the
4389  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4390  * handles any IP specific hardware or software state changes that are
4391  * necessary after the IP has been soft reset.
4392  * Returns 0 on success, negative error code on failure.
4393  */
4394 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4395 {
4396         int i, r = 0;
4397
4398         for (i = 0; i < adev->num_ip_blocks; i++) {
4399                 if (!adev->ip_blocks[i].status.valid)
4400                         continue;
4401                 if (adev->ip_blocks[i].status.hang &&
4402                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4403                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4404                 if (r)
4405                         return r;
4406         }
4407
4408         return 0;
4409 }
4410
4411 /**
4412  * amdgpu_device_recover_vram - Recover some VRAM contents
4413  *
4414  * @adev: amdgpu_device pointer
4415  *
4416  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4417  * restore things like GPUVM page tables after a GPU reset where
4418  * the contents of VRAM might be lost.
4419  *
4420  * Returns:
4421  * 0 on success, negative error code on failure.
4422  */
4423 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4424 {
4425         struct dma_fence *fence = NULL, *next = NULL;
4426         struct amdgpu_bo *shadow;
4427         struct amdgpu_bo_vm *vmbo;
4428         long r = 1, tmo;
4429
4430         if (amdgpu_sriov_runtime(adev))
4431                 tmo = msecs_to_jiffies(8000);
4432         else
4433                 tmo = msecs_to_jiffies(100);
4434
4435         dev_info(adev->dev, "recover vram bo from shadow start\n");
4436         mutex_lock(&adev->shadow_list_lock);
4437         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4438                 shadow = &vmbo->bo;
4439                 /* No need to recover an evicted BO */
4440                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4441                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4442                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4443                         continue;
4444
4445                 r = amdgpu_bo_restore_shadow(shadow, &next);
4446                 if (r)
4447                         break;
4448
4449                 if (fence) {
4450                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4451                         dma_fence_put(fence);
4452                         fence = next;
4453                         if (tmo == 0) {
4454                                 r = -ETIMEDOUT;
4455                                 break;
4456                         } else if (tmo < 0) {
4457                                 r = tmo;
4458                                 break;
4459                         }
4460                 } else {
4461                         fence = next;
4462                 }
4463         }
4464         mutex_unlock(&adev->shadow_list_lock);
4465
4466         if (fence)
4467                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4468         dma_fence_put(fence);
4469
4470         if (r < 0 || tmo <= 0) {
4471                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4472                 return -EIO;
4473         }
4474
4475         dev_info(adev->dev, "recover vram bo from shadow done\n");
4476         return 0;
4477 }
4478
4479
4480 /**
4481  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4482  *
4483  * @adev: amdgpu_device pointer
4484  * @from_hypervisor: request from hypervisor
4485  *
4486  * do VF FLR and reinitialize Asic
4487  * return 0 means succeeded otherwise failed
4488  */
4489 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4490                                      bool from_hypervisor)
4491 {
4492         int r;
4493         struct amdgpu_hive_info *hive = NULL;
4494         int retry_limit = 0;
4495
4496 retry:
4497         amdgpu_amdkfd_pre_reset(adev);
4498
4499         if (from_hypervisor)
4500                 r = amdgpu_virt_request_full_gpu(adev, true);
4501         else
4502                 r = amdgpu_virt_reset_gpu(adev);
4503         if (r)
4504                 return r;
4505
4506         /* Resume IP prior to SMC */
4507         r = amdgpu_device_ip_reinit_early_sriov(adev);
4508         if (r)
4509                 goto error;
4510
4511         amdgpu_virt_init_data_exchange(adev);
4512
4513         r = amdgpu_device_fw_loading(adev);
4514         if (r)
4515                 return r;
4516
4517         /* now we are okay to resume SMC/CP/SDMA */
4518         r = amdgpu_device_ip_reinit_late_sriov(adev);
4519         if (r)
4520                 goto error;
4521
4522         hive = amdgpu_get_xgmi_hive(adev);
4523         /* Update PSP FW topology after reset */
4524         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4525                 r = amdgpu_xgmi_update_topology(hive, adev);
4526
4527         if (hive)
4528                 amdgpu_put_xgmi_hive(hive);
4529
4530         if (!r) {
4531                 amdgpu_irq_gpu_reset_resume_helper(adev);
4532                 r = amdgpu_ib_ring_tests(adev);
4533
4534                 amdgpu_amdkfd_post_reset(adev);
4535         }
4536
4537 error:
4538         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4539                 amdgpu_inc_vram_lost(adev);
4540                 r = amdgpu_device_recover_vram(adev);
4541         }
4542         amdgpu_virt_release_full_gpu(adev, true);
4543
4544         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4545                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4546                         retry_limit++;
4547                         goto retry;
4548                 } else
4549                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4550         }
4551
4552         return r;
4553 }
4554
4555 /**
4556  * amdgpu_device_has_job_running - check if there is any job in mirror list
4557  *
4558  * @adev: amdgpu_device pointer
4559  *
4560  * check if there is any job in mirror list
4561  */
4562 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4563 {
4564         int i;
4565         struct drm_sched_job *job;
4566
4567         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4568                 struct amdgpu_ring *ring = adev->rings[i];
4569
4570                 if (!ring || !ring->sched.thread)
4571                         continue;
4572
4573                 spin_lock(&ring->sched.job_list_lock);
4574                 job = list_first_entry_or_null(&ring->sched.pending_list,
4575                                                struct drm_sched_job, list);
4576                 spin_unlock(&ring->sched.job_list_lock);
4577                 if (job)
4578                         return true;
4579         }
4580         return false;
4581 }
4582
4583 /**
4584  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4585  *
4586  * @adev: amdgpu_device pointer
4587  *
4588  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4589  * a hung GPU.
4590  */
4591 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4592 {
4593
4594         if (amdgpu_gpu_recovery == 0)
4595                 goto disabled;
4596
4597         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4598                 dev_info(adev->dev,"Timeout, but no hardware hang detected.\n");
4599                 return false;
4600         }
4601
4602         if (amdgpu_sriov_vf(adev))
4603                 return true;
4604
4605         if (amdgpu_gpu_recovery == -1) {
4606                 switch (adev->asic_type) {
4607 #ifdef CONFIG_DRM_AMDGPU_SI
4608                 case CHIP_VERDE:
4609                 case CHIP_TAHITI:
4610                 case CHIP_PITCAIRN:
4611                 case CHIP_OLAND:
4612                 case CHIP_HAINAN:
4613 #endif
4614 #ifdef CONFIG_DRM_AMDGPU_CIK
4615                 case CHIP_KAVERI:
4616                 case CHIP_KABINI:
4617                 case CHIP_MULLINS:
4618 #endif
4619                 case CHIP_CARRIZO:
4620                 case CHIP_STONEY:
4621                 case CHIP_CYAN_SKILLFISH:
4622                         goto disabled;
4623                 default:
4624                         break;
4625                 }
4626         }
4627
4628         return true;
4629
4630 disabled:
4631                 dev_info(adev->dev, "GPU recovery disabled.\n");
4632                 return false;
4633 }
4634
4635 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4636 {
4637         u32 i;
4638         int ret = 0;
4639
4640         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4641
4642         dev_info(adev->dev, "GPU mode1 reset\n");
4643
4644         /* disable BM */
4645         pci_clear_master(adev->pdev);
4646
4647         amdgpu_device_cache_pci_state(adev->pdev);
4648
4649         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4650                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4651                 ret = amdgpu_dpm_mode1_reset(adev);
4652         } else {
4653                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4654                 ret = psp_gpu_reset(adev);
4655         }
4656
4657         if (ret)
4658                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4659
4660         amdgpu_device_load_pci_state(adev->pdev);
4661
4662         /* wait for asic to come out of reset */
4663         for (i = 0; i < adev->usec_timeout; i++) {
4664                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4665
4666                 if (memsize != 0xffffffff)
4667                         break;
4668                 udelay(1);
4669         }
4670
4671         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4672         return ret;
4673 }
4674
4675 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4676                                  struct amdgpu_reset_context *reset_context)
4677 {
4678         int i, r = 0;
4679         struct amdgpu_job *job = NULL;
4680         bool need_full_reset =
4681                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4682
4683         if (reset_context->reset_req_dev == adev)
4684                 job = reset_context->job;
4685
4686         if (amdgpu_sriov_vf(adev)) {
4687                 /* stop the data exchange thread */
4688                 amdgpu_virt_fini_data_exchange(adev);
4689         }
4690
4691         amdgpu_fence_driver_isr_toggle(adev, true);
4692
4693         /* block all schedulers and reset given job's ring */
4694         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4695                 struct amdgpu_ring *ring = adev->rings[i];
4696
4697                 if (!ring || !ring->sched.thread)
4698                         continue;
4699
4700                 /*clear job fence from fence drv to avoid force_completion
4701                  *leave NULL and vm flush fence in fence drv */
4702                 amdgpu_fence_driver_clear_job_fences(ring);
4703
4704                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4705                 amdgpu_fence_driver_force_completion(ring);
4706         }
4707
4708         amdgpu_fence_driver_isr_toggle(adev, false);
4709
4710         if (job && job->vm)
4711                 drm_sched_increase_karma(&job->base);
4712
4713         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4714         /* If reset handler not implemented, continue; otherwise return */
4715         if (r == -ENOSYS)
4716                 r = 0;
4717         else
4718                 return r;
4719
4720         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4721         if (!amdgpu_sriov_vf(adev)) {
4722
4723                 if (!need_full_reset)
4724                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4725
4726                 if (!need_full_reset && amdgpu_gpu_recovery) {
4727                         amdgpu_device_ip_pre_soft_reset(adev);
4728                         r = amdgpu_device_ip_soft_reset(adev);
4729                         amdgpu_device_ip_post_soft_reset(adev);
4730                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4731                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4732                                 need_full_reset = true;
4733                         }
4734                 }
4735
4736                 if (need_full_reset)
4737                         r = amdgpu_device_ip_suspend(adev);
4738                 if (need_full_reset)
4739                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4740                 else
4741                         clear_bit(AMDGPU_NEED_FULL_RESET,
4742                                   &reset_context->flags);
4743         }
4744
4745         return r;
4746 }
4747
4748 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4749 {
4750         int i;
4751
4752         lockdep_assert_held(&adev->reset_domain->sem);
4753
4754         for (i = 0; i < adev->num_regs; i++) {
4755                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4756                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4757                                              adev->reset_dump_reg_value[i]);
4758         }
4759
4760         return 0;
4761 }
4762
4763 #ifdef CONFIG_DEV_COREDUMP
4764 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4765                 size_t count, void *data, size_t datalen)
4766 {
4767         struct drm_printer p;
4768         struct amdgpu_device *adev = data;
4769         struct drm_print_iterator iter;
4770         int i;
4771
4772         iter.data = buffer;
4773         iter.offset = 0;
4774         iter.start = offset;
4775         iter.remain = count;
4776
4777         p = drm_coredump_printer(&iter);
4778
4779         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4780         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4781         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4782         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4783         if (adev->reset_task_info.pid)
4784                 drm_printf(&p, "process_name: %s PID: %d\n",
4785                            adev->reset_task_info.process_name,
4786                            adev->reset_task_info.pid);
4787
4788         if (adev->reset_vram_lost)
4789                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4790         if (adev->num_regs) {
4791                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4792
4793                 for (i = 0; i < adev->num_regs; i++)
4794                         drm_printf(&p, "0x%08x: 0x%08x\n",
4795                                    adev->reset_dump_reg_list[i],
4796                                    adev->reset_dump_reg_value[i]);
4797         }
4798
4799         return count - iter.remain;
4800 }
4801
4802 static void amdgpu_devcoredump_free(void *data)
4803 {
4804 }
4805
4806 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4807 {
4808         struct drm_device *dev = adev_to_drm(adev);
4809
4810         ktime_get_ts64(&adev->reset_time);
4811         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4812                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4813 }
4814 #endif
4815
4816 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4817                          struct amdgpu_reset_context *reset_context)
4818 {
4819         struct amdgpu_device *tmp_adev = NULL;
4820         bool need_full_reset, skip_hw_reset, vram_lost = false;
4821         int r = 0;
4822         bool gpu_reset_for_dev_remove = 0;
4823
4824         /* Try reset handler method first */
4825         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4826                                     reset_list);
4827         amdgpu_reset_reg_dumps(tmp_adev);
4828
4829         reset_context->reset_device_list = device_list_handle;
4830         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4831         /* If reset handler not implemented, continue; otherwise return */
4832         if (r == -ENOSYS)
4833                 r = 0;
4834         else
4835                 return r;
4836
4837         /* Reset handler not implemented, use the default method */
4838         need_full_reset =
4839                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4840         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4841
4842         gpu_reset_for_dev_remove =
4843                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4844                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4845
4846         /*
4847          * ASIC reset has to be done on all XGMI hive nodes ASAP
4848          * to allow proper links negotiation in FW (within 1 sec)
4849          */
4850         if (!skip_hw_reset && need_full_reset) {
4851                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4852                         /* For XGMI run all resets in parallel to speed up the process */
4853                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4854                                 tmp_adev->gmc.xgmi.pending_reset = false;
4855                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4856                                         r = -EALREADY;
4857                         } else
4858                                 r = amdgpu_asic_reset(tmp_adev);
4859
4860                         if (r) {
4861                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4862                                          r, adev_to_drm(tmp_adev)->unique);
4863                                 break;
4864                         }
4865                 }
4866
4867                 /* For XGMI wait for all resets to complete before proceed */
4868                 if (!r) {
4869                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4870                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4871                                         flush_work(&tmp_adev->xgmi_reset_work);
4872                                         r = tmp_adev->asic_reset_res;
4873                                         if (r)
4874                                                 break;
4875                                 }
4876                         }
4877                 }
4878         }
4879
4880         if (!r && amdgpu_ras_intr_triggered()) {
4881                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4882                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4883                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4884                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4885                 }
4886
4887                 amdgpu_ras_intr_cleared();
4888         }
4889
4890         /* Since the mode1 reset affects base ip blocks, the
4891          * phase1 ip blocks need to be resumed. Otherwise there
4892          * will be a BIOS signature error and the psp bootloader
4893          * can't load kdb on the next amdgpu install.
4894          */
4895         if (gpu_reset_for_dev_remove) {
4896                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4897                         amdgpu_device_ip_resume_phase1(tmp_adev);
4898
4899                 goto end;
4900         }
4901
4902         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4903                 if (need_full_reset) {
4904                         /* post card */
4905                         r = amdgpu_device_asic_init(tmp_adev);
4906                         if (r) {
4907                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4908                         } else {
4909                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4910                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4911                                 if (r)
4912                                         goto out;
4913
4914                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4915                                 if (r)
4916                                         goto out;
4917
4918                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4919 #ifdef CONFIG_DEV_COREDUMP
4920                                 tmp_adev->reset_vram_lost = vram_lost;
4921                                 memset(&tmp_adev->reset_task_info, 0,
4922                                                 sizeof(tmp_adev->reset_task_info));
4923                                 if (reset_context->job && reset_context->job->vm)
4924                                         tmp_adev->reset_task_info =
4925                                                 reset_context->job->vm->task_info;
4926                                 amdgpu_reset_capture_coredumpm(tmp_adev);
4927 #endif
4928                                 if (vram_lost) {
4929                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4930                                         amdgpu_inc_vram_lost(tmp_adev);
4931                                 }
4932
4933                                 r = amdgpu_device_fw_loading(tmp_adev);
4934                                 if (r)
4935                                         return r;
4936
4937                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4938                                 if (r)
4939                                         goto out;
4940
4941                                 if (vram_lost)
4942                                         amdgpu_device_fill_reset_magic(tmp_adev);
4943
4944                                 /*
4945                                  * Add this ASIC as tracked as reset was already
4946                                  * complete successfully.
4947                                  */
4948                                 amdgpu_register_gpu_instance(tmp_adev);
4949
4950                                 if (!reset_context->hive &&
4951                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4952                                         amdgpu_xgmi_add_device(tmp_adev);
4953
4954                                 r = amdgpu_device_ip_late_init(tmp_adev);
4955                                 if (r)
4956                                         goto out;
4957
4958                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4959
4960                                 /*
4961                                  * The GPU enters bad state once faulty pages
4962                                  * by ECC has reached the threshold, and ras
4963                                  * recovery is scheduled next. So add one check
4964                                  * here to break recovery if it indeed exceeds
4965                                  * bad page threshold, and remind user to
4966                                  * retire this GPU or setting one bigger
4967                                  * bad_page_threshold value to fix this once
4968                                  * probing driver again.
4969                                  */
4970                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4971                                         /* must succeed. */
4972                                         amdgpu_ras_resume(tmp_adev);
4973                                 } else {
4974                                         r = -EINVAL;
4975                                         goto out;
4976                                 }
4977
4978                                 /* Update PSP FW topology after reset */
4979                                 if (reset_context->hive &&
4980                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4981                                         r = amdgpu_xgmi_update_topology(
4982                                                 reset_context->hive, tmp_adev);
4983                         }
4984                 }
4985
4986 out:
4987                 if (!r) {
4988                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4989                         r = amdgpu_ib_ring_tests(tmp_adev);
4990                         if (r) {
4991                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4992                                 need_full_reset = true;
4993                                 r = -EAGAIN;
4994                                 goto end;
4995                         }
4996                 }
4997
4998                 if (!r)
4999                         r = amdgpu_device_recover_vram(tmp_adev);
5000                 else
5001                         tmp_adev->asic_reset_res = r;
5002         }
5003
5004 end:
5005         if (need_full_reset)
5006                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5007         else
5008                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5009         return r;
5010 }
5011
5012 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5013 {
5014
5015         switch (amdgpu_asic_reset_method(adev)) {
5016         case AMD_RESET_METHOD_MODE1:
5017                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5018                 break;
5019         case AMD_RESET_METHOD_MODE2:
5020                 adev->mp1_state = PP_MP1_STATE_RESET;
5021                 break;
5022         default:
5023                 adev->mp1_state = PP_MP1_STATE_NONE;
5024                 break;
5025         }
5026 }
5027
5028 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5029 {
5030         amdgpu_vf_error_trans_all(adev);
5031         adev->mp1_state = PP_MP1_STATE_NONE;
5032 }
5033
5034 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5035 {
5036         struct pci_dev *p = NULL;
5037
5038         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5039                         adev->pdev->bus->number, 1);
5040         if (p) {
5041                 pm_runtime_enable(&(p->dev));
5042                 pm_runtime_resume(&(p->dev));
5043         }
5044
5045         pci_dev_put(p);
5046 }
5047
5048 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5049 {
5050         enum amd_reset_method reset_method;
5051         struct pci_dev *p = NULL;
5052         u64 expires;
5053
5054         /*
5055          * For now, only BACO and mode1 reset are confirmed
5056          * to suffer the audio issue without proper suspended.
5057          */
5058         reset_method = amdgpu_asic_reset_method(adev);
5059         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5060              (reset_method != AMD_RESET_METHOD_MODE1))
5061                 return -EINVAL;
5062
5063         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5064                         adev->pdev->bus->number, 1);
5065         if (!p)
5066                 return -ENODEV;
5067
5068         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5069         if (!expires)
5070                 /*
5071                  * If we cannot get the audio device autosuspend delay,
5072                  * a fixed 4S interval will be used. Considering 3S is
5073                  * the audio controller default autosuspend delay setting.
5074                  * 4S used here is guaranteed to cover that.
5075                  */
5076                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5077
5078         while (!pm_runtime_status_suspended(&(p->dev))) {
5079                 if (!pm_runtime_suspend(&(p->dev)))
5080                         break;
5081
5082                 if (expires < ktime_get_mono_fast_ns()) {
5083                         dev_warn(adev->dev, "failed to suspend display audio\n");
5084                         pci_dev_put(p);
5085                         /* TODO: abort the succeeding gpu reset? */
5086                         return -ETIMEDOUT;
5087                 }
5088         }
5089
5090         pm_runtime_disable(&(p->dev));
5091
5092         pci_dev_put(p);
5093         return 0;
5094 }
5095
5096 static void amdgpu_device_recheck_guilty_jobs(
5097         struct amdgpu_device *adev, struct list_head *device_list_handle,
5098         struct amdgpu_reset_context *reset_context)
5099 {
5100         int i, r = 0;
5101
5102         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5103                 struct amdgpu_ring *ring = adev->rings[i];
5104                 int ret = 0;
5105                 struct drm_sched_job *s_job;
5106
5107                 if (!ring || !ring->sched.thread)
5108                         continue;
5109
5110                 s_job = list_first_entry_or_null(&ring->sched.pending_list,
5111                                 struct drm_sched_job, list);
5112                 if (s_job == NULL)
5113                         continue;
5114
5115                 /* clear job's guilty and depend the folowing step to decide the real one */
5116                 drm_sched_reset_karma(s_job);
5117                 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5118
5119                 if (!s_job->s_fence->parent) {
5120                         DRM_WARN("Failed to get a HW fence for job!");
5121                         continue;
5122                 }
5123
5124                 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5125                 if (ret == 0) { /* timeout */
5126                         DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5127                                                 ring->sched.name, s_job->id);
5128
5129
5130                         amdgpu_fence_driver_isr_toggle(adev, true);
5131
5132                         /* Clear this failed job from fence array */
5133                         amdgpu_fence_driver_clear_job_fences(ring);
5134
5135                         amdgpu_fence_driver_isr_toggle(adev, false);
5136
5137                         /* Since the job won't signal and we go for
5138                          * another resubmit drop this parent pointer
5139                          */
5140                         dma_fence_put(s_job->s_fence->parent);
5141                         s_job->s_fence->parent = NULL;
5142
5143                         /* set guilty */
5144                         drm_sched_increase_karma(s_job);
5145                         amdgpu_reset_prepare_hwcontext(adev, reset_context);
5146 retry:
5147                         /* do hw reset */
5148                         if (amdgpu_sriov_vf(adev)) {
5149                                 amdgpu_virt_fini_data_exchange(adev);
5150                                 r = amdgpu_device_reset_sriov(adev, false);
5151                                 if (r)
5152                                         adev->asic_reset_res = r;
5153                         } else {
5154                                 clear_bit(AMDGPU_SKIP_HW_RESET,
5155                                           &reset_context->flags);
5156                                 r = amdgpu_do_asic_reset(device_list_handle,
5157                                                          reset_context);
5158                                 if (r && r == -EAGAIN)
5159                                         goto retry;
5160                         }
5161
5162                         /*
5163                          * add reset counter so that the following
5164                          * resubmitted job could flush vmid
5165                          */
5166                         atomic_inc(&adev->gpu_reset_counter);
5167                         continue;
5168                 }
5169
5170                 /* got the hw fence, signal finished fence */
5171                 atomic_dec(ring->sched.score);
5172                 dma_fence_get(&s_job->s_fence->finished);
5173                 dma_fence_signal(&s_job->s_fence->finished);
5174                 dma_fence_put(&s_job->s_fence->finished);
5175
5176                 /* remove node from list and free the job */
5177                 spin_lock(&ring->sched.job_list_lock);
5178                 list_del_init(&s_job->list);
5179                 spin_unlock(&ring->sched.job_list_lock);
5180                 ring->sched.ops->free_job(s_job);
5181         }
5182 }
5183
5184 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5185 {
5186         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5187
5188 #if defined(CONFIG_DEBUG_FS)
5189         if (!amdgpu_sriov_vf(adev))
5190                 cancel_work(&adev->reset_work);
5191 #endif
5192
5193         if (adev->kfd.dev)
5194                 cancel_work(&adev->kfd.reset_work);
5195
5196         if (amdgpu_sriov_vf(adev))
5197                 cancel_work(&adev->virt.flr_work);
5198
5199         if (con && adev->ras_enabled)
5200                 cancel_work(&con->recovery_work);
5201
5202 }
5203
5204
5205 /**
5206  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5207  *
5208  * @adev: amdgpu_device pointer
5209  * @job: which job trigger hang
5210  *
5211  * Attempt to reset the GPU if it has hung (all asics).
5212  * Attempt to do soft-reset or full-reset and reinitialize Asic
5213  * Returns 0 for success or an error on failure.
5214  */
5215
5216 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5217                               struct amdgpu_job *job,
5218                               struct amdgpu_reset_context *reset_context)
5219 {
5220         struct list_head device_list, *device_list_handle =  NULL;
5221         bool job_signaled = false;
5222         struct amdgpu_hive_info *hive = NULL;
5223         struct amdgpu_device *tmp_adev = NULL;
5224         int i, r = 0;
5225         bool need_emergency_restart = false;
5226         bool audio_suspended = false;
5227         int tmp_vram_lost_counter;
5228         bool gpu_reset_for_dev_remove = false;
5229
5230         gpu_reset_for_dev_remove =
5231                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5232                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5233
5234         /*
5235          * Special case: RAS triggered and full reset isn't supported
5236          */
5237         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5238
5239         /*
5240          * Flush RAM to disk so that after reboot
5241          * the user can read log and see why the system rebooted.
5242          */
5243         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5244                 DRM_WARN("Emergency reboot.");
5245
5246                 ksys_sync_helper();
5247                 emergency_restart();
5248         }
5249
5250         dev_info(adev->dev, "GPU %s begin!\n",
5251                 need_emergency_restart ? "jobs stop":"reset");
5252
5253         if (!amdgpu_sriov_vf(adev))
5254                 hive = amdgpu_get_xgmi_hive(adev);
5255         if (hive)
5256                 mutex_lock(&hive->hive_lock);
5257
5258         reset_context->job = job;
5259         reset_context->hive = hive;
5260         /*
5261          * Build list of devices to reset.
5262          * In case we are in XGMI hive mode, resort the device list
5263          * to put adev in the 1st position.
5264          */
5265         INIT_LIST_HEAD(&device_list);
5266         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5267                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5268                         list_add_tail(&tmp_adev->reset_list, &device_list);
5269                         if (gpu_reset_for_dev_remove && adev->shutdown)
5270                                 tmp_adev->shutdown = true;
5271                 }
5272                 if (!list_is_first(&adev->reset_list, &device_list))
5273                         list_rotate_to_front(&adev->reset_list, &device_list);
5274                 device_list_handle = &device_list;
5275         } else {
5276                 list_add_tail(&adev->reset_list, &device_list);
5277                 device_list_handle = &device_list;
5278         }
5279
5280         /* We need to lock reset domain only once both for XGMI and single device */
5281         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5282                                     reset_list);
5283         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5284
5285         /* block all schedulers and reset given job's ring */
5286         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5287
5288                 amdgpu_device_set_mp1_state(tmp_adev);
5289
5290                 /*
5291                  * Try to put the audio codec into suspend state
5292                  * before gpu reset started.
5293                  *
5294                  * Due to the power domain of the graphics device
5295                  * is shared with AZ power domain. Without this,
5296                  * we may change the audio hardware from behind
5297                  * the audio driver's back. That will trigger
5298                  * some audio codec errors.
5299                  */
5300                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5301                         audio_suspended = true;
5302
5303                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5304
5305                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5306
5307                 if (!amdgpu_sriov_vf(tmp_adev))
5308                         amdgpu_amdkfd_pre_reset(tmp_adev);
5309
5310                 /*
5311                  * Mark these ASICs to be reseted as untracked first
5312                  * And add them back after reset completed
5313                  */
5314                 amdgpu_unregister_gpu_instance(tmp_adev);
5315
5316                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5317
5318                 /* disable ras on ALL IPs */
5319                 if (!need_emergency_restart &&
5320                       amdgpu_device_ip_need_full_reset(tmp_adev))
5321                         amdgpu_ras_suspend(tmp_adev);
5322
5323                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5324                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5325
5326                         if (!ring || !ring->sched.thread)
5327                                 continue;
5328
5329                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5330
5331                         if (need_emergency_restart)
5332                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5333                 }
5334                 atomic_inc(&tmp_adev->gpu_reset_counter);
5335         }
5336
5337         if (need_emergency_restart)
5338                 goto skip_sched_resume;
5339
5340         /*
5341          * Must check guilty signal here since after this point all old
5342          * HW fences are force signaled.
5343          *
5344          * job->base holds a reference to parent fence
5345          */
5346         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5347                 job_signaled = true;
5348                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5349                 goto skip_hw_reset;
5350         }
5351
5352 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5353         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5354                 if (gpu_reset_for_dev_remove) {
5355                         /* Workaroud for ASICs need to disable SMC first */
5356                         amdgpu_device_smu_fini_early(tmp_adev);
5357                 }
5358                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5359                 /*TODO Should we stop ?*/
5360                 if (r) {
5361                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5362                                   r, adev_to_drm(tmp_adev)->unique);
5363                         tmp_adev->asic_reset_res = r;
5364                 }
5365
5366                 /*
5367                  * Drop all pending non scheduler resets. Scheduler resets
5368                  * were already dropped during drm_sched_stop
5369                  */
5370                 amdgpu_device_stop_pending_resets(tmp_adev);
5371         }
5372
5373         tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5374         /* Actual ASIC resets if needed.*/
5375         /* Host driver will handle XGMI hive reset for SRIOV */
5376         if (amdgpu_sriov_vf(adev)) {
5377                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5378                 if (r)
5379                         adev->asic_reset_res = r;
5380
5381                 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */
5382                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5383                         amdgpu_ras_resume(adev);
5384         } else {
5385                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5386                 if (r && r == -EAGAIN)
5387                         goto retry;
5388
5389                 if (!r && gpu_reset_for_dev_remove)
5390                         goto recover_end;
5391         }
5392
5393 skip_hw_reset:
5394
5395         /* Post ASIC reset for all devs .*/
5396         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5397
5398                 /*
5399                  * Sometimes a later bad compute job can block a good gfx job as gfx
5400                  * and compute ring share internal GC HW mutually. We add an additional
5401                  * guilty jobs recheck step to find the real guilty job, it synchronously
5402                  * submits and pends for the first job being signaled. If it gets timeout,
5403                  * we identify it as a real guilty job.
5404                  */
5405                 if (amdgpu_gpu_recovery == 2 &&
5406                         !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5407                         amdgpu_device_recheck_guilty_jobs(
5408                                 tmp_adev, device_list_handle, reset_context);
5409
5410                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5411                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5412
5413                         if (!ring || !ring->sched.thread)
5414                                 continue;
5415
5416                         /* No point to resubmit jobs if we didn't HW reset*/
5417                         if (!tmp_adev->asic_reset_res && !job_signaled)
5418                                 drm_sched_resubmit_jobs(&ring->sched);
5419
5420                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5421                 }
5422
5423                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5424                         amdgpu_mes_self_test(tmp_adev);
5425
5426                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5427                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5428                 }
5429
5430                 if (tmp_adev->asic_reset_res)
5431                         r = tmp_adev->asic_reset_res;
5432
5433                 tmp_adev->asic_reset_res = 0;
5434
5435                 if (r) {
5436                         /* bad news, how to tell it to userspace ? */
5437                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5438                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5439                 } else {
5440                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5441                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5442                                 DRM_WARN("smart shift update failed\n");
5443                 }
5444         }
5445
5446 skip_sched_resume:
5447         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5448                 /* unlock kfd: SRIOV would do it separately */
5449                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5450                         amdgpu_amdkfd_post_reset(tmp_adev);
5451
5452                 /* kfd_post_reset will do nothing if kfd device is not initialized,
5453                  * need to bring up kfd here if it's not be initialized before
5454                  */
5455                 if (!adev->kfd.init_complete)
5456                         amdgpu_amdkfd_device_init(adev);
5457
5458                 if (audio_suspended)
5459                         amdgpu_device_resume_display_audio(tmp_adev);
5460
5461                 amdgpu_device_unset_mp1_state(tmp_adev);
5462         }
5463
5464 recover_end:
5465         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5466                                             reset_list);
5467         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5468
5469         if (hive) {
5470                 mutex_unlock(&hive->hive_lock);
5471                 amdgpu_put_xgmi_hive(hive);
5472         }
5473
5474         if (r)
5475                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5476
5477         atomic_set(&adev->reset_domain->reset_res, r);
5478         return r;
5479 }
5480
5481 /**
5482  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
5483  *
5484  * @adev: amdgpu_device pointer
5485  *
5486  * Fetchs and stores in the driver the PCIE capabilities (gen speed
5487  * and lanes) of the slot the device is in. Handles APUs and
5488  * virtualized environments where PCIE config space may not be available.
5489  */
5490 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5491 {
5492         struct pci_dev *pdev;
5493         enum pci_bus_speed speed_cap, platform_speed_cap;
5494         enum pcie_link_width platform_link_width;
5495
5496         if (amdgpu_pcie_gen_cap)
5497                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5498
5499         if (amdgpu_pcie_lane_cap)
5500                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5501
5502         /* covers APUs as well */
5503         if (pci_is_root_bus(adev->pdev->bus)) {
5504                 if (adev->pm.pcie_gen_mask == 0)
5505                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5506                 if (adev->pm.pcie_mlw_mask == 0)
5507                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5508                 return;
5509         }
5510
5511         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5512                 return;
5513
5514         pcie_bandwidth_available(adev->pdev, NULL,
5515                                  &platform_speed_cap, &platform_link_width);
5516
5517         if (adev->pm.pcie_gen_mask == 0) {
5518                 /* asic caps */
5519                 pdev = adev->pdev;
5520                 speed_cap = pcie_get_speed_cap(pdev);
5521                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5522                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5523                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5524                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5525                 } else {
5526                         if (speed_cap == PCIE_SPEED_32_0GT)
5527                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5528                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5529                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5530                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5531                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5532                         else if (speed_cap == PCIE_SPEED_16_0GT)
5533                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5534                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5535                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5536                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5537                         else if (speed_cap == PCIE_SPEED_8_0GT)
5538                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5539                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5540                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5541                         else if (speed_cap == PCIE_SPEED_5_0GT)
5542                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5543                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5544                         else
5545                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5546                 }
5547                 /* platform caps */
5548                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5549                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5550                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5551                 } else {
5552                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5553                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5554                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5555                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5556                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5557                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5558                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5559                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5560                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5561                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5562                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5563                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5564                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5565                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5566                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5567                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5568                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5569                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5570                         else
5571                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5572
5573                 }
5574         }
5575         if (adev->pm.pcie_mlw_mask == 0) {
5576                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5577                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5578                 } else {
5579                         switch (platform_link_width) {
5580                         case PCIE_LNK_X32:
5581                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5582                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5583                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5584                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5585                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5586                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5587                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5588                                 break;
5589                         case PCIE_LNK_X16:
5590                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5591                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5592                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5593                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5594                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5595                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5596                                 break;
5597                         case PCIE_LNK_X12:
5598                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5599                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5600                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5601                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5602                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5603                                 break;
5604                         case PCIE_LNK_X8:
5605                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5606                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5607                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5608                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5609                                 break;
5610                         case PCIE_LNK_X4:
5611                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5612                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5613                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5614                                 break;
5615                         case PCIE_LNK_X2:
5616                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5617                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5618                                 break;
5619                         case PCIE_LNK_X1:
5620                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5621                                 break;
5622                         default:
5623                                 break;
5624                         }
5625                 }
5626         }
5627 }
5628
5629 /**
5630  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5631  *
5632  * @adev: amdgpu_device pointer
5633  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5634  *
5635  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5636  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5637  * @peer_adev.
5638  */
5639 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5640                                       struct amdgpu_device *peer_adev)
5641 {
5642 #ifdef CONFIG_HSA_AMD_P2P
5643         uint64_t address_mask = peer_adev->dev->dma_mask ?
5644                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5645         resource_size_t aper_limit =
5646                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5647         bool p2p_access =
5648                 !adev->gmc.xgmi.connected_to_cpu &&
5649                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5650
5651         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5652                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5653                 !(adev->gmc.aper_base & address_mask ||
5654                   aper_limit & address_mask));
5655 #else
5656         return false;
5657 #endif
5658 }
5659
5660 int amdgpu_device_baco_enter(struct drm_device *dev)
5661 {
5662         struct amdgpu_device *adev = drm_to_adev(dev);
5663         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5664
5665         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5666                 return -ENOTSUPP;
5667
5668         if (ras && adev->ras_enabled &&
5669             adev->nbio.funcs->enable_doorbell_interrupt)
5670                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5671
5672         return amdgpu_dpm_baco_enter(adev);
5673 }
5674
5675 int amdgpu_device_baco_exit(struct drm_device *dev)
5676 {
5677         struct amdgpu_device *adev = drm_to_adev(dev);
5678         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5679         int ret = 0;
5680
5681         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5682                 return -ENOTSUPP;
5683
5684         ret = amdgpu_dpm_baco_exit(adev);
5685         if (ret)
5686                 return ret;
5687
5688         if (ras && adev->ras_enabled &&
5689             adev->nbio.funcs->enable_doorbell_interrupt)
5690                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5691
5692         if (amdgpu_passthrough(adev) &&
5693             adev->nbio.funcs->clear_doorbell_interrupt)
5694                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5695
5696         return 0;
5697 }
5698
5699 /**
5700  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5701  * @pdev: PCI device struct
5702  * @state: PCI channel state
5703  *
5704  * Description: Called when a PCI error is detected.
5705  *
5706  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5707  */
5708 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5709 {
5710         struct drm_device *dev = pci_get_drvdata(pdev);
5711         struct amdgpu_device *adev = drm_to_adev(dev);
5712         int i;
5713
5714         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5715
5716         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5717                 DRM_WARN("No support for XGMI hive yet...");
5718                 return PCI_ERS_RESULT_DISCONNECT;
5719         }
5720
5721         adev->pci_channel_state = state;
5722
5723         switch (state) {
5724         case pci_channel_io_normal:
5725                 return PCI_ERS_RESULT_CAN_RECOVER;
5726         /* Fatal error, prepare for slot reset */
5727         case pci_channel_io_frozen:
5728                 /*
5729                  * Locking adev->reset_domain->sem will prevent any external access
5730                  * to GPU during PCI error recovery
5731                  */
5732                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5733                 amdgpu_device_set_mp1_state(adev);
5734
5735                 /*
5736                  * Block any work scheduling as we do for regular GPU reset
5737                  * for the duration of the recovery
5738                  */
5739                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5740                         struct amdgpu_ring *ring = adev->rings[i];
5741
5742                         if (!ring || !ring->sched.thread)
5743                                 continue;
5744
5745                         drm_sched_stop(&ring->sched, NULL);
5746                 }
5747                 atomic_inc(&adev->gpu_reset_counter);
5748                 return PCI_ERS_RESULT_NEED_RESET;
5749         case pci_channel_io_perm_failure:
5750                 /* Permanent error, prepare for device removal */
5751                 return PCI_ERS_RESULT_DISCONNECT;
5752         }
5753
5754         return PCI_ERS_RESULT_NEED_RESET;
5755 }
5756
5757 /**
5758  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5759  * @pdev: pointer to PCI device
5760  */
5761 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5762 {
5763
5764         DRM_INFO("PCI error: mmio enabled callback!!\n");
5765
5766         /* TODO - dump whatever for debugging purposes */
5767
5768         /* This called only if amdgpu_pci_error_detected returns
5769          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5770          * works, no need to reset slot.
5771          */
5772
5773         return PCI_ERS_RESULT_RECOVERED;
5774 }
5775
5776 /**
5777  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5778  * @pdev: PCI device struct
5779  *
5780  * Description: This routine is called by the pci error recovery
5781  * code after the PCI slot has been reset, just before we
5782  * should resume normal operations.
5783  */
5784 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5785 {
5786         struct drm_device *dev = pci_get_drvdata(pdev);
5787         struct amdgpu_device *adev = drm_to_adev(dev);
5788         int r, i;
5789         struct amdgpu_reset_context reset_context;
5790         u32 memsize;
5791         struct list_head device_list;
5792
5793         DRM_INFO("PCI error: slot reset callback!!\n");
5794
5795         memset(&reset_context, 0, sizeof(reset_context));
5796
5797         INIT_LIST_HEAD(&device_list);
5798         list_add_tail(&adev->reset_list, &device_list);
5799
5800         /* wait for asic to come out of reset */
5801         msleep(500);
5802
5803         /* Restore PCI confspace */
5804         amdgpu_device_load_pci_state(pdev);
5805
5806         /* confirm  ASIC came out of reset */
5807         for (i = 0; i < adev->usec_timeout; i++) {
5808                 memsize = amdgpu_asic_get_config_memsize(adev);
5809
5810                 if (memsize != 0xffffffff)
5811                         break;
5812                 udelay(1);
5813         }
5814         if (memsize == 0xffffffff) {
5815                 r = -ETIME;
5816                 goto out;
5817         }
5818
5819         reset_context.method = AMD_RESET_METHOD_NONE;
5820         reset_context.reset_req_dev = adev;
5821         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5822         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5823
5824         adev->no_hw_access = true;
5825         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5826         adev->no_hw_access = false;
5827         if (r)
5828                 goto out;
5829
5830         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5831
5832 out:
5833         if (!r) {
5834                 if (amdgpu_device_cache_pci_state(adev->pdev))
5835                         pci_restore_state(adev->pdev);
5836
5837                 DRM_INFO("PCIe error recovery succeeded\n");
5838         } else {
5839                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5840                 amdgpu_device_unset_mp1_state(adev);
5841                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5842         }
5843
5844         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5845 }
5846
5847 /**
5848  * amdgpu_pci_resume() - resume normal ops after PCI reset
5849  * @pdev: pointer to PCI device
5850  *
5851  * Called when the error recovery driver tells us that its
5852  * OK to resume normal operation.
5853  */
5854 void amdgpu_pci_resume(struct pci_dev *pdev)
5855 {
5856         struct drm_device *dev = pci_get_drvdata(pdev);
5857         struct amdgpu_device *adev = drm_to_adev(dev);
5858         int i;
5859
5860
5861         DRM_INFO("PCI error: resume callback!!\n");
5862
5863         /* Only continue execution for the case of pci_channel_io_frozen */
5864         if (adev->pci_channel_state != pci_channel_io_frozen)
5865                 return;
5866
5867         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5868                 struct amdgpu_ring *ring = adev->rings[i];
5869
5870                 if (!ring || !ring->sched.thread)
5871                         continue;
5872
5873
5874                 drm_sched_resubmit_jobs(&ring->sched);
5875                 drm_sched_start(&ring->sched, true);
5876         }
5877
5878         amdgpu_device_unset_mp1_state(adev);
5879         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5880 }
5881
5882 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5883 {
5884         struct drm_device *dev = pci_get_drvdata(pdev);
5885         struct amdgpu_device *adev = drm_to_adev(dev);
5886         int r;
5887
5888         r = pci_save_state(pdev);
5889         if (!r) {
5890                 kfree(adev->pci_state);
5891
5892                 adev->pci_state = pci_store_saved_state(pdev);
5893
5894                 if (!adev->pci_state) {
5895                         DRM_ERROR("Failed to store PCI saved state");
5896                         return false;
5897                 }
5898         } else {
5899                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5900                 return false;
5901         }
5902
5903         return true;
5904 }
5905
5906 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5907 {
5908         struct drm_device *dev = pci_get_drvdata(pdev);
5909         struct amdgpu_device *adev = drm_to_adev(dev);
5910         int r;
5911
5912         if (!adev->pci_state)
5913                 return false;
5914
5915         r = pci_load_saved_state(pdev, adev->pci_state);
5916
5917         if (!r) {
5918                 pci_restore_state(pdev);
5919         } else {
5920                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5921                 return false;
5922         }
5923
5924         return true;
5925 }
5926
5927 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5928                 struct amdgpu_ring *ring)
5929 {
5930 #ifdef CONFIG_X86_64
5931         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5932                 return;
5933 #endif
5934         if (adev->gmc.xgmi.connected_to_cpu)
5935                 return;
5936
5937         if (ring && ring->funcs->emit_hdp_flush)
5938                 amdgpu_ring_emit_hdp_flush(ring);
5939         else
5940                 amdgpu_asic_flush_hdp(adev, ring);
5941 }
5942
5943 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5944                 struct amdgpu_ring *ring)
5945 {
5946 #ifdef CONFIG_X86_64
5947         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5948                 return;
5949 #endif
5950         if (adev->gmc.xgmi.connected_to_cpu)
5951                 return;
5952
5953         amdgpu_asic_invalidate_hdp(adev, ring);
5954 }
5955
5956 int amdgpu_in_reset(struct amdgpu_device *adev)
5957 {
5958         return atomic_read(&adev->reset_domain->in_gpu_reset);
5959         }
5960
5961 /**
5962  * amdgpu_device_halt() - bring hardware to some kind of halt state
5963  *
5964  * @adev: amdgpu_device pointer
5965  *
5966  * Bring hardware to some kind of halt state so that no one can touch it
5967  * any more. It will help to maintain error context when error occurred.
5968  * Compare to a simple hang, the system will keep stable at least for SSH
5969  * access. Then it should be trivial to inspect the hardware state and
5970  * see what's going on. Implemented as following:
5971  *
5972  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5973  *    clears all CPU mappings to device, disallows remappings through page faults
5974  * 2. amdgpu_irq_disable_all() disables all interrupts
5975  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5976  * 4. set adev->no_hw_access to avoid potential crashes after setp 5
5977  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5978  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5979  *    flush any in flight DMA operations
5980  */
5981 void amdgpu_device_halt(struct amdgpu_device *adev)
5982 {
5983         struct pci_dev *pdev = adev->pdev;
5984         struct drm_device *ddev = adev_to_drm(adev);
5985
5986         drm_dev_unplug(ddev);
5987
5988         amdgpu_irq_disable_all(adev);
5989
5990         amdgpu_fence_driver_hw_fini(adev);
5991
5992         adev->no_hw_access = true;
5993
5994         amdgpu_device_unmap_mmio(adev);
5995
5996         pci_disable_device(pdev);
5997         pci_wait_for_pending_transaction(pdev);
5998 }
5999
6000 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6001                                 u32 reg)
6002 {
6003         unsigned long flags, address, data;
6004         u32 r;
6005
6006         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6007         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6008
6009         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6010         WREG32(address, reg * 4);
6011         (void)RREG32(address);
6012         r = RREG32(data);
6013         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6014         return r;
6015 }
6016
6017 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6018                                 u32 reg, u32 v)
6019 {
6020         unsigned long flags, address, data;
6021
6022         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6023         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6024
6025         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6026         WREG32(address, reg * 4);
6027         (void)RREG32(address);
6028         WREG32(data, v);
6029         (void)RREG32(data);
6030         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6031 }
6032
6033 /**
6034  * amdgpu_device_switch_gang - switch to a new gang
6035  * @adev: amdgpu_device pointer
6036  * @gang: the gang to switch to
6037  *
6038  * Try to switch to a new gang.
6039  * Returns: NULL if we switched to the new gang or a reference to the current
6040  * gang leader.
6041  */
6042 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6043                                             struct dma_fence *gang)
6044 {
6045         struct dma_fence *old = NULL;
6046
6047         do {
6048                 dma_fence_put(old);
6049                 rcu_read_lock();
6050                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6051                 rcu_read_unlock();
6052
6053                 if (old == gang)
6054                         break;
6055
6056                 if (!dma_fence_is_signaled(old))
6057                         return old;
6058
6059         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6060                          old, gang) != old);
6061
6062         dma_fence_put(old);
6063         return NULL;
6064 }
6065
6066 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6067 {
6068         switch (adev->asic_type) {
6069 #ifdef CONFIG_DRM_AMDGPU_SI
6070         case CHIP_HAINAN:
6071 #endif
6072         case CHIP_TOPAZ:
6073                 /* chips with no display hardware */
6074                 return false;
6075 #ifdef CONFIG_DRM_AMDGPU_SI
6076         case CHIP_TAHITI:
6077         case CHIP_PITCAIRN:
6078         case CHIP_VERDE:
6079         case CHIP_OLAND:
6080 #endif
6081 #ifdef CONFIG_DRM_AMDGPU_CIK
6082         case CHIP_BONAIRE:
6083         case CHIP_HAWAII:
6084         case CHIP_KAVERI:
6085         case CHIP_KABINI:
6086         case CHIP_MULLINS:
6087 #endif
6088         case CHIP_TONGA:
6089         case CHIP_FIJI:
6090         case CHIP_POLARIS10:
6091         case CHIP_POLARIS11:
6092         case CHIP_POLARIS12:
6093         case CHIP_VEGAM:
6094         case CHIP_CARRIZO:
6095         case CHIP_STONEY:
6096                 /* chips with display hardware */
6097                 return true;
6098         default:
6099                 /* IP discovery */
6100                 if (!adev->ip_versions[DCE_HWIP][0] ||
6101                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6102                         return false;
6103                 return true;
6104         }
6105 }