drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33 #include <linux/iommu.h>
  34 #include <linux/pci.h>
  35 #include <linux/devcoredump.h>
  36 #include <generated/utsrelease.h>
  37 #include <linux/pci-p2pdma.h>
  38 #include <linux/apple-gmux.h>
  39
  40 #include <drm/drm_aperture.h>
  41 #include <drm/drm_atomic_helper.h>
  42 #include <drm/drm_crtc_helper.h>
  43 #include <drm/drm_fb_helper.h>
  44 #include <drm/drm_probe_helper.h>
  45 #include <drm/amdgpu_drm.h>
  46 #include <linux/vgaarb.h>
  47 #include <linux/vga_switcheroo.h>
  48 #include <linux/efi.h>
  49 #include "amdgpu.h"
  50 #include "amdgpu_trace.h"
  51 #include "amdgpu_i2c.h"
  52 #include "atom.h"
  53 #include "amdgpu_atombios.h"
  54 #include "amdgpu_atomfirmware.h"
  55 #include "amd_pcie.h"
  56 #ifdef CONFIG_DRM_AMDGPU_SI
  57 #include "si.h"
  58 #endif
  59 #ifdef CONFIG_DRM_AMDGPU_CIK
  60 #include "cik.h"
  61 #endif
  62 #include "vi.h"
  63 #include "soc15.h"
  64 #include "nv.h"
  65 #include "bif/bif_4_1_d.h"
  66 #include <linux/firmware.h>
  67 #include "amdgpu_vf_error.h"
  68
  69 #include "amdgpu_amdkfd.h"
  70 #include "amdgpu_pm.h"
  71
  72 #include "amdgpu_xgmi.h"
  73 #include "amdgpu_ras.h"
  74 #include "amdgpu_pmu.h"
  75 #include "amdgpu_fru_eeprom.h"
  76 #include "amdgpu_reset.h"
  77
  78 #include <linux/suspend.h>
  79 #include <drm/task_barrier.h>
  80 #include <linux/pm_runtime.h>
  81
  82 #include <drm/drm_drv.h>
  83
  84 #if IS_ENABLED(CONFIG_X86)
  85 #include <asm/intel-family.h>
  86 #endif
  87
  88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  95
  96 #define AMDGPU_RESUME_MS                2000
  97 #define AMDGPU_MAX_RETRY_LIMIT          2
  98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
  99
 100 static const struct drm_driver amdgpu_kms_driver;
 101
 102 const char *amdgpu_asic_name[] = {
 103         "TAHITI",
 104         "PITCAIRN",
 105         "VERDE",
 106         "OLAND",
 107         "HAINAN",
 108         "BONAIRE",
 109         "KAVERI",
 110         "KABINI",
 111         "HAWAII",
 112         "MULLINS",
 113         "TOPAZ",
 114         "TONGA",
 115         "FIJI",
 116         "CARRIZO",
 117         "STONEY",
 118         "POLARIS10",
 119         "POLARIS11",
 120         "POLARIS12",
 121         "VEGAM",
 122         "VEGA10",
 123         "VEGA12",
 124         "VEGA20",
 125         "RAVEN",
 126         "ARCTURUS",
 127         "RENOIR",
 128         "ALDEBARAN",
 129         "NAVI10",
 130         "CYAN_SKILLFISH",
 131         "NAVI14",
 132         "NAVI12",
 133         "SIENNA_CICHLID",
 134         "NAVY_FLOUNDER",
 135         "VANGOGH",
 136         "DIMGREY_CAVEFISH",
 137         "BEIGE_GOBY",
 138         "YELLOW_CARP",
 139         "IP DISCOVERY",
 140         "LAST",
 141 };
 142
 143 /**
 144  * DOC: pcie_replay_count
 145  *
 146  * The amdgpu driver provides a sysfs API for reporting the total number
 147  * of PCIe replays (NAKs)
 148  * The file pcie_replay_count is used for this and returns the total
 149  * number of replays as a sum of the NAKs generated and NAKs received
 150  */
 151
 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 153                 struct device_attribute *attr, char *buf)
 154 {
 155         struct drm_device *ddev = dev_get_drvdata(dev);
 156         struct amdgpu_device *adev = drm_to_adev(ddev);
 157         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 158
 159         return sysfs_emit(buf, "%llu\n", cnt);
 160 }
 161
 162 static DEVICE_ATTR(pcie_replay_count, 0444,
 163                 amdgpu_device_get_pcie_replay_count, NULL);
 164
 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 166
 167 /**
 168  * DOC: product_name
 169  *
 170  * The amdgpu driver provides a sysfs API for reporting the product name
 171  * for the device
 172  * The file product_name is used for this and returns the product name
 173  * as returned from the FRU.
 174  * NOTE: This is only available for certain server cards
 175  */
 176
 177 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 178                 struct device_attribute *attr, char *buf)
 179 {
 180         struct drm_device *ddev = dev_get_drvdata(dev);
 181         struct amdgpu_device *adev = drm_to_adev(ddev);
 182
 183         return sysfs_emit(buf, "%s\n", adev->product_name);
 184 }
 185
 186 static DEVICE_ATTR(product_name, 0444,
 187                 amdgpu_device_get_product_name, NULL);
 188
 189 /**
 190  * DOC: product_number
 191  *
 192  * The amdgpu driver provides a sysfs API for reporting the part number
 193  * for the device
 194  * The file product_number is used for this and returns the part number
 195  * as returned from the FRU.
 196  * NOTE: This is only available for certain server cards
 197  */
 198
 199 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 200                 struct device_attribute *attr, char *buf)
 201 {
 202         struct drm_device *ddev = dev_get_drvdata(dev);
 203         struct amdgpu_device *adev = drm_to_adev(ddev);
 204
 205         return sysfs_emit(buf, "%s\n", adev->product_number);
 206 }
 207
 208 static DEVICE_ATTR(product_number, 0444,
 209                 amdgpu_device_get_product_number, NULL);
 210
 211 /**
 212  * DOC: serial_number
 213  *
 214  * The amdgpu driver provides a sysfs API for reporting the serial number
 215  * for the device
 216  * The file serial_number is used for this and returns the serial number
 217  * as returned from the FRU.
 218  * NOTE: This is only available for certain server cards
 219  */
 220
 221 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 222                 struct device_attribute *attr, char *buf)
 223 {
 224         struct drm_device *ddev = dev_get_drvdata(dev);
 225         struct amdgpu_device *adev = drm_to_adev(ddev);
 226
 227         return sysfs_emit(buf, "%s\n", adev->serial);
 228 }
 229
 230 static DEVICE_ATTR(serial_number, 0444,
 231                 amdgpu_device_get_serial_number, NULL);
 232
 233 /**
 234  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 235  *
 236  * @dev: drm_device pointer
 237  *
 238  * Returns true if the device is a dGPU with ATPX power control,
 239  * otherwise return false.
 240  */
 241 bool amdgpu_device_supports_px(struct drm_device *dev)
 242 {
 243         struct amdgpu_device *adev = drm_to_adev(dev);
 244
 245         if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
 246                 return true;
 247         return false;
 248 }
 249
 250 /**
 251  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 252  *
 253  * @dev: drm_device pointer
 254  *
 255  * Returns true if the device is a dGPU with ACPI power control,
 256  * otherwise return false.
 257  */
 258 bool amdgpu_device_supports_boco(struct drm_device *dev)
 259 {
 260         struct amdgpu_device *adev = drm_to_adev(dev);
 261
 262         if (adev->has_pr3 ||
 263             ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
 264                 return true;
 265         return false;
 266 }
 267
 268 /**
 269  * amdgpu_device_supports_baco - Does the device support BACO
 270  *
 271  * @dev: drm_device pointer
 272  *
 273  * Returns true if the device supporte BACO,
 274  * otherwise return false.
 275  */
 276 bool amdgpu_device_supports_baco(struct drm_device *dev)
 277 {
 278         struct amdgpu_device *adev = drm_to_adev(dev);
 279
 280         return amdgpu_asic_supports_baco(adev);
 281 }
 282
 283 /**
 284  * amdgpu_device_supports_smart_shift - Is the device dGPU with
 285  * smart shift support
 286  *
 287  * @dev: drm_device pointer
 288  *
 289  * Returns true if the device is a dGPU with Smart Shift support,
 290  * otherwise returns false.
 291  */
 292 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
 293 {
 294         return (amdgpu_device_supports_boco(dev) &&
 295                 amdgpu_acpi_is_power_shift_control_supported());
 296 }
 297
 298 /*
 299  * VRAM access helper functions
 300  */
 301
 302 /**
 303  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 304  *
 305  * @adev: amdgpu_device pointer
 306  * @pos: offset of the buffer in vram
 307  * @buf: virtual address of the buffer in system memory
 308  * @size: read/write size, sizeof(@buf) must > @size
 309  * @write: true - write to vram, otherwise - read from vram
 310  */
 311 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
 312                              void *buf, size_t size, bool write)
 313 {
 314         unsigned long flags;
 315         uint32_t hi = ~0, tmp = 0;
 316         uint32_t *data = buf;
 317         uint64_t last;
 318         int idx;
 319
 320         if (!drm_dev_enter(adev_to_drm(adev), &idx))
 321                 return;
 322
 323         BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
 324
 325         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 326         for (last = pos + size; pos < last; pos += 4) {
 327                 tmp = pos >> 31;
 328
 329                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 330                 if (tmp != hi) {
 331                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 332                         hi = tmp;
 333                 }
 334                 if (write)
 335                         WREG32_NO_KIQ(mmMM_DATA, *data++);
 336                 else
 337                         *data++ = RREG32_NO_KIQ(mmMM_DATA);
 338         }
 339
 340         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 341         drm_dev_exit(idx);
 342 }
 343
 344 /**
 345  * amdgpu_device_aper_access - access vram by vram aperature
 346  *
 347  * @adev: amdgpu_device pointer
 348  * @pos: offset of the buffer in vram
 349  * @buf: virtual address of the buffer in system memory
 350  * @size: read/write size, sizeof(@buf) must > @size
 351  * @write: true - write to vram, otherwise - read from vram
 352  *
 353  * The return value means how many bytes have been transferred.
 354  */
 355 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
 356                                  void *buf, size_t size, bool write)
 357 {
 358 #ifdef CONFIG_64BIT
 359         void __iomem *addr;
 360         size_t count = 0;
 361         uint64_t last;
 362
 363         if (!adev->mman.aper_base_kaddr)
 364                 return 0;
 365
 366         last = min(pos + size, adev->gmc.visible_vram_size);
 367         if (last > pos) {
 368                 addr = adev->mman.aper_base_kaddr + pos;
 369                 count = last - pos;
 370
 371                 if (write) {
 372                         memcpy_toio(addr, buf, count);
 373                         mb();
 374                         amdgpu_device_flush_hdp(adev, NULL);
 375                 } else {
 376                         amdgpu_device_invalidate_hdp(adev, NULL);
 377                         mb();
 378                         memcpy_fromio(buf, addr, count);
 379                 }
 380
 381         }
 382
 383         return count;
 384 #else
 385         return 0;
 386 #endif
 387 }
 388
 389 /**
 390  * amdgpu_device_vram_access - read/write a buffer in vram
 391  *
 392  * @adev: amdgpu_device pointer
 393  * @pos: offset of the buffer in vram
 394  * @buf: virtual address of the buffer in system memory
 395  * @size: read/write size, sizeof(@buf) must > @size
 396  * @write: true - write to vram, otherwise - read from vram
 397  */
 398 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 399                                void *buf, size_t size, bool write)
 400 {
 401         size_t count;
 402
 403         /* try to using vram apreature to access vram first */
 404         count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 405         size -= count;
 406         if (size) {
 407                 /* using MM to access rest vram */
 408                 pos += count;
 409                 buf += count;
 410                 amdgpu_device_mm_access(adev, pos, buf, size, write);
 411         }
 412 }
 413
 414 /*
 415  * register access helper functions.
 416  */
 417
 418 /* Check if hw access should be skipped because of hotplug or device error */
 419 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
 420 {
 421         if (adev->no_hw_access)
 422                 return true;
 423
 424 #ifdef CONFIG_LOCKDEP
 425         /*
 426          * This is a bit complicated to understand, so worth a comment. What we assert
 427          * here is that the GPU reset is not running on another thread in parallel.
 428          *
 429          * For this we trylock the read side of the reset semaphore, if that succeeds
 430          * we know that the reset is not running in paralell.
 431          *
 432          * If the trylock fails we assert that we are either already holding the read
 433          * side of the lock or are the reset thread itself and hold the write side of
 434          * the lock.
 435          */
 436         if (in_task()) {
 437                 if (down_read_trylock(&adev->reset_domain->sem))
 438                         up_read(&adev->reset_domain->sem);
 439                 else
 440                         lockdep_assert_held(&adev->reset_domain->sem);
 441         }
 442 #endif
 443         return false;
 444 }
 445
 446 /**
 447  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 448  *
 449  * @adev: amdgpu_device pointer
 450  * @reg: dword aligned register offset
 451  * @acc_flags: access flags which require special behavior
 452  *
 453  * Returns the 32 bit value from the offset specified.
 454  */
 455 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 456                             uint32_t reg, uint32_t acc_flags)
 457 {
 458         uint32_t ret;
 459
 460         if (amdgpu_device_skip_hw_access(adev))
 461                 return 0;
 462
 463         if ((reg * 4) < adev->rmmio_size) {
 464                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 465                     amdgpu_sriov_runtime(adev) &&
 466                     down_read_trylock(&adev->reset_domain->sem)) {
 467                         ret = amdgpu_kiq_rreg(adev, reg);
 468                         up_read(&adev->reset_domain->sem);
 469                 } else {
 470                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 471                 }
 472         } else {
 473                 ret = adev->pcie_rreg(adev, reg * 4);
 474         }
 475
 476         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 477
 478         return ret;
 479 }
 480
 481 /*
 482  * MMIO register read with bytes helper functions
 483  * @offset:bytes offset from MMIO start
 484  */
 485
 486 /**
 487  * amdgpu_mm_rreg8 - read a memory mapped IO register
 488  *
 489  * @adev: amdgpu_device pointer
 490  * @offset: byte aligned register offset
 491  *
 492  * Returns the 8 bit value from the offset specified.
 493  */
 494 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 495 {
 496         if (amdgpu_device_skip_hw_access(adev))
 497                 return 0;
 498
 499         if (offset < adev->rmmio_size)
 500                 return (readb(adev->rmmio + offset));
 501         BUG();
 502 }
 503
 504 /*
 505  * MMIO register write with bytes helper functions
 506  * @offset:bytes offset from MMIO start
 507  * @value: the value want to be written to the register
 508  */
 509
 510 /**
 511  * amdgpu_mm_wreg8 - read a memory mapped IO register
 512  *
 513  * @adev: amdgpu_device pointer
 514  * @offset: byte aligned register offset
 515  * @value: 8 bit value to write
 516  *
 517  * Writes the value specified to the offset specified.
 518  */
 519 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 520 {
 521         if (amdgpu_device_skip_hw_access(adev))
 522                 return;
 523
 524         if (offset < adev->rmmio_size)
 525                 writeb(value, adev->rmmio + offset);
 526         else
 527                 BUG();
 528 }
 529
 530 /**
 531  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 532  *
 533  * @adev: amdgpu_device pointer
 534  * @reg: dword aligned register offset
 535  * @v: 32 bit value to write to the register
 536  * @acc_flags: access flags which require special behavior
 537  *
 538  * Writes the value specified to the offset specified.
 539  */
 540 void amdgpu_device_wreg(struct amdgpu_device *adev,
 541                         uint32_t reg, uint32_t v,
 542                         uint32_t acc_flags)
 543 {
 544         if (amdgpu_device_skip_hw_access(adev))
 545                 return;
 546
 547         if ((reg * 4) < adev->rmmio_size) {
 548                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 549                     amdgpu_sriov_runtime(adev) &&
 550                     down_read_trylock(&adev->reset_domain->sem)) {
 551                         amdgpu_kiq_wreg(adev, reg, v);
 552                         up_read(&adev->reset_domain->sem);
 553                 } else {
 554                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 555                 }
 556         } else {
 557                 adev->pcie_wreg(adev, reg * 4, v);
 558         }
 559
 560         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 561 }
 562
 563 /**
 564  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
 565  *
 566  * @adev: amdgpu_device pointer
 567  * @reg: mmio/rlc register
 568  * @v: value to write
 569  *
 570  * this function is invoked only for the debugfs register access
 571  */
 572 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 573                              uint32_t reg, uint32_t v,
 574                              uint32_t xcc_id)
 575 {
 576         if (amdgpu_device_skip_hw_access(adev))
 577                 return;
 578
 579         if (amdgpu_sriov_fullaccess(adev) &&
 580             adev->gfx.rlc.funcs &&
 581             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 582                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 583                         return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
 584         } else if ((reg * 4) >= adev->rmmio_size) {
 585                 adev->pcie_wreg(adev, reg * 4, v);
 586         } else {
 587                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 588         }
 589 }
 590
 591 /**
 592  * amdgpu_device_indirect_rreg - read an indirect register
 593  *
 594  * @adev: amdgpu_device pointer
 595  * @reg_addr: indirect register address to read from
 596  *
 597  * Returns the value of indirect register @reg_addr
 598  */
 599 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 600                                 u32 reg_addr)
 601 {
 602         unsigned long flags, pcie_index, pcie_data;
 603         void __iomem *pcie_index_offset;
 604         void __iomem *pcie_data_offset;
 605         u32 r;
 606
 607         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 608         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 609
 610         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 611         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 612         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 613
 614         writel(reg_addr, pcie_index_offset);
 615         readl(pcie_index_offset);
 616         r = readl(pcie_data_offset);
 617         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 618
 619         return r;
 620 }
 621
 622 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
 623                                     u64 reg_addr)
 624 {
 625         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 626         u32 r;
 627         void __iomem *pcie_index_offset;
 628         void __iomem *pcie_index_hi_offset;
 629         void __iomem *pcie_data_offset;
 630
 631         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 632         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 633         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 634                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 635         else
 636                 pcie_index_hi = 0;
 637
 638         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 639         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 640         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 641         if (pcie_index_hi != 0)
 642                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 643                                 pcie_index_hi * 4;
 644
 645         writel(reg_addr, pcie_index_offset);
 646         readl(pcie_index_offset);
 647         if (pcie_index_hi != 0) {
 648                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 649                 readl(pcie_index_hi_offset);
 650         }
 651         r = readl(pcie_data_offset);
 652
 653         /* clear the high bits */
 654         if (pcie_index_hi != 0) {
 655                 writel(0, pcie_index_hi_offset);
 656                 readl(pcie_index_hi_offset);
 657         }
 658
 659         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 660
 661         return r;
 662 }
 663
 664 /**
 665  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 666  *
 667  * @adev: amdgpu_device pointer
 668  * @reg_addr: indirect register address to read from
 669  *
 670  * Returns the value of indirect register @reg_addr
 671  */
 672 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 673                                   u32 reg_addr)
 674 {
 675         unsigned long flags, pcie_index, pcie_data;
 676         void __iomem *pcie_index_offset;
 677         void __iomem *pcie_data_offset;
 678         u64 r;
 679
 680         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 681         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 682
 683         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 684         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 685         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 686
 687         /* read low 32 bits */
 688         writel(reg_addr, pcie_index_offset);
 689         readl(pcie_index_offset);
 690         r = readl(pcie_data_offset);
 691         /* read high 32 bits */
 692         writel(reg_addr + 4, pcie_index_offset);
 693         readl(pcie_index_offset);
 694         r |= ((u64)readl(pcie_data_offset) << 32);
 695         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 696
 697         return r;
 698 }
 699
 700 /**
 701  * amdgpu_device_indirect_wreg - write an indirect register address
 702  *
 703  * @adev: amdgpu_device pointer
 704  * @reg_addr: indirect register offset
 705  * @reg_data: indirect register data
 706  *
 707  */
 708 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 709                                  u32 reg_addr, u32 reg_data)
 710 {
 711         unsigned long flags, pcie_index, pcie_data;
 712         void __iomem *pcie_index_offset;
 713         void __iomem *pcie_data_offset;
 714
 715         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 716         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 717
 718         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 719         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 720         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 721
 722         writel(reg_addr, pcie_index_offset);
 723         readl(pcie_index_offset);
 724         writel(reg_data, pcie_data_offset);
 725         readl(pcie_data_offset);
 726         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 727 }
 728
 729 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
 730                                      u64 reg_addr, u32 reg_data)
 731 {
 732         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 733         void __iomem *pcie_index_offset;
 734         void __iomem *pcie_index_hi_offset;
 735         void __iomem *pcie_data_offset;
 736
 737         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 738         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 739         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 740                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 741         else
 742                 pcie_index_hi = 0;
 743
 744         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 745         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 746         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 747         if (pcie_index_hi != 0)
 748                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 749                                 pcie_index_hi * 4;
 750
 751         writel(reg_addr, pcie_index_offset);
 752         readl(pcie_index_offset);
 753         if (pcie_index_hi != 0) {
 754                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 755                 readl(pcie_index_hi_offset);
 756         }
 757         writel(reg_data, pcie_data_offset);
 758         readl(pcie_data_offset);
 759
 760         /* clear the high bits */
 761         if (pcie_index_hi != 0) {
 762                 writel(0, pcie_index_hi_offset);
 763                 readl(pcie_index_hi_offset);
 764         }
 765
 766         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 767 }
 768
 769 /**
 770  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 771  *
 772  * @adev: amdgpu_device pointer
 773  * @reg_addr: indirect register offset
 774  * @reg_data: indirect register data
 775  *
 776  */
 777 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 778                                    u32 reg_addr, u64 reg_data)
 779 {
 780         unsigned long flags, pcie_index, pcie_data;
 781         void __iomem *pcie_index_offset;
 782         void __iomem *pcie_data_offset;
 783
 784         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 785         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 786
 787         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 788         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 789         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 790
 791         /* write low 32 bits */
 792         writel(reg_addr, pcie_index_offset);
 793         readl(pcie_index_offset);
 794         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 795         readl(pcie_data_offset);
 796         /* write high 32 bits */
 797         writel(reg_addr + 4, pcie_index_offset);
 798         readl(pcie_index_offset);
 799         writel((u32)(reg_data >> 32), pcie_data_offset);
 800         readl(pcie_data_offset);
 801         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 802 }
 803
 804 /**
 805  * amdgpu_device_get_rev_id - query device rev_id
 806  *
 807  * @adev: amdgpu_device pointer
 808  *
 809  * Return device rev_id
 810  */
 811 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
 812 {
 813         return adev->nbio.funcs->get_rev_id(adev);
 814 }
 815
 816 /**
 817  * amdgpu_invalid_rreg - dummy reg read function
 818  *
 819  * @adev: amdgpu_device pointer
 820  * @reg: offset of register
 821  *
 822  * Dummy register read function.  Used for register blocks
 823  * that certain asics don't have (all asics).
 824  * Returns the value in the register.
 825  */
 826 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 827 {
 828         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 829         BUG();
 830         return 0;
 831 }
 832
 833 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
 834 {
 835         DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
 836         BUG();
 837         return 0;
 838 }
 839
 840 /**
 841  * amdgpu_invalid_wreg - dummy reg write function
 842  *
 843  * @adev: amdgpu_device pointer
 844  * @reg: offset of register
 845  * @v: value to write to the register
 846  *
 847  * Dummy register read function.  Used for register blocks
 848  * that certain asics don't have (all asics).
 849  */
 850 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 851 {
 852         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 853                   reg, v);
 854         BUG();
 855 }
 856
 857 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
 858 {
 859         DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
 860                   reg, v);
 861         BUG();
 862 }
 863
 864 /**
 865  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 866  *
 867  * @adev: amdgpu_device pointer
 868  * @reg: offset of register
 869  *
 870  * Dummy register read function.  Used for register blocks
 871  * that certain asics don't have (all asics).
 872  * Returns the value in the register.
 873  */
 874 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 875 {
 876         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 877         BUG();
 878         return 0;
 879 }
 880
 881 /**
 882  * amdgpu_invalid_wreg64 - dummy reg write function
 883  *
 884  * @adev: amdgpu_device pointer
 885  * @reg: offset of register
 886  * @v: value to write to the register
 887  *
 888  * Dummy register read function.  Used for register blocks
 889  * that certain asics don't have (all asics).
 890  */
 891 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 892 {
 893         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 894                   reg, v);
 895         BUG();
 896 }
 897
 898 /**
 899  * amdgpu_block_invalid_rreg - dummy reg read function
 900  *
 901  * @adev: amdgpu_device pointer
 902  * @block: offset of instance
 903  * @reg: offset of register
 904  *
 905  * Dummy register read function.  Used for register blocks
 906  * that certain asics don't have (all asics).
 907  * Returns the value in the register.
 908  */
 909 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 910                                           uint32_t block, uint32_t reg)
 911 {
 912         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 913                   reg, block);
 914         BUG();
 915         return 0;
 916 }
 917
 918 /**
 919  * amdgpu_block_invalid_wreg - dummy reg write function
 920  *
 921  * @adev: amdgpu_device pointer
 922  * @block: offset of instance
 923  * @reg: offset of register
 924  * @v: value to write to the register
 925  *
 926  * Dummy register read function.  Used for register blocks
 927  * that certain asics don't have (all asics).
 928  */
 929 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 930                                       uint32_t block,
 931                                       uint32_t reg, uint32_t v)
 932 {
 933         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 934                   reg, block, v);
 935         BUG();
 936 }
 937
 938 /**
 939  * amdgpu_device_asic_init - Wrapper for atom asic_init
 940  *
 941  * @adev: amdgpu_device pointer
 942  *
 943  * Does any asic specific work and then calls atom asic init.
 944  */
 945 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 946 {
 947         amdgpu_asic_pre_asic_init(adev);
 948
 949         if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
 950             adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
 951                 return amdgpu_atomfirmware_asic_init(adev, true);
 952         else
 953                 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 954 }
 955
 956 /**
 957  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 958  *
 959  * @adev: amdgpu_device pointer
 960  *
 961  * Allocates a scratch page of VRAM for use by various things in the
 962  * driver.
 963  */
 964 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
 965 {
 966         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
 967                                        AMDGPU_GEM_DOMAIN_VRAM |
 968                                        AMDGPU_GEM_DOMAIN_GTT,
 969                                        &adev->mem_scratch.robj,
 970                                        &adev->mem_scratch.gpu_addr,
 971                                        (void **)&adev->mem_scratch.ptr);
 972 }
 973
 974 /**
 975  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 976  *
 977  * @adev: amdgpu_device pointer
 978  *
 979  * Frees the VRAM scratch page.
 980  */
 981 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
 982 {
 983         amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
 984 }
 985
 986 /**
 987  * amdgpu_device_program_register_sequence - program an array of registers.
 988  *
 989  * @adev: amdgpu_device pointer
 990  * @registers: pointer to the register array
 991  * @array_size: size of the register array
 992  *
 993  * Programs an array or registers with and or masks.
 994  * This is a helper for setting golden registers.
 995  */
 996 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 997                                              const u32 *registers,
 998                                              const u32 array_size)
 999 {
1000         u32 tmp, reg, and_mask, or_mask;
1001         int i;
1002
1003         if (array_size % 3)
1004                 return;
1005
1006         for (i = 0; i < array_size; i += 3) {
1007                 reg = registers[i + 0];
1008                 and_mask = registers[i + 1];
1009                 or_mask = registers[i + 2];
1010
1011                 if (and_mask == 0xffffffff) {
1012                         tmp = or_mask;
1013                 } else {
1014                         tmp = RREG32(reg);
1015                         tmp &= ~and_mask;
1016                         if (adev->family >= AMDGPU_FAMILY_AI)
1017                                 tmp |= (or_mask & and_mask);
1018                         else
1019                                 tmp |= or_mask;
1020                 }
1021                 WREG32(reg, tmp);
1022         }
1023 }
1024
1025 /**
1026  * amdgpu_device_pci_config_reset - reset the GPU
1027  *
1028  * @adev: amdgpu_device pointer
1029  *
1030  * Resets the GPU using the pci config reset sequence.
1031  * Only applicable to asics prior to vega10.
1032  */
1033 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1034 {
1035         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1036 }
1037
1038 /**
1039  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1040  *
1041  * @adev: amdgpu_device pointer
1042  *
1043  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1044  */
1045 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1046 {
1047         return pci_reset_function(adev->pdev);
1048 }
1049
1050 /*
1051  * amdgpu_device_wb_*()
1052  * Writeback is the method by which the GPU updates special pages in memory
1053  * with the status of certain GPU events (fences, ring pointers,etc.).
1054  */
1055
1056 /**
1057  * amdgpu_device_wb_fini - Disable Writeback and free memory
1058  *
1059  * @adev: amdgpu_device pointer
1060  *
1061  * Disables Writeback and frees the Writeback memory (all asics).
1062  * Used at driver shutdown.
1063  */
1064 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1065 {
1066         if (adev->wb.wb_obj) {
1067                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1068                                       &adev->wb.gpu_addr,
1069                                       (void **)&adev->wb.wb);
1070                 adev->wb.wb_obj = NULL;
1071         }
1072 }
1073
1074 /**
1075  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1076  *
1077  * @adev: amdgpu_device pointer
1078  *
1079  * Initializes writeback and allocates writeback memory (all asics).
1080  * Used at driver startup.
1081  * Returns 0 on success or an -error on failure.
1082  */
1083 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1084 {
1085         int r;
1086
1087         if (adev->wb.wb_obj == NULL) {
1088                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1089                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1090                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1091                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1092                                             (void **)&adev->wb.wb);
1093                 if (r) {
1094                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1095                         return r;
1096                 }
1097
1098                 adev->wb.num_wb = AMDGPU_MAX_WB;
1099                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1100
1101                 /* clear wb memory */
1102                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1103         }
1104
1105         return 0;
1106 }
1107
1108 /**
1109  * amdgpu_device_wb_get - Allocate a wb entry
1110  *
1111  * @adev: amdgpu_device pointer
1112  * @wb: wb index
1113  *
1114  * Allocate a wb slot for use by the driver (all asics).
1115  * Returns 0 on success or -EINVAL on failure.
1116  */
1117 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1118 {
1119         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1120
1121         if (offset < adev->wb.num_wb) {
1122                 __set_bit(offset, adev->wb.used);
1123                 *wb = offset << 3; /* convert to dw offset */
1124                 return 0;
1125         } else {
1126                 return -EINVAL;
1127         }
1128 }
1129
1130 /**
1131  * amdgpu_device_wb_free - Free a wb entry
1132  *
1133  * @adev: amdgpu_device pointer
1134  * @wb: wb index
1135  *
1136  * Free a wb slot allocated for use by the driver (all asics)
1137  */
1138 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1139 {
1140         wb >>= 3;
1141         if (wb < adev->wb.num_wb)
1142                 __clear_bit(wb, adev->wb.used);
1143 }
1144
1145 /**
1146  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1147  *
1148  * @adev: amdgpu_device pointer
1149  *
1150  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1151  * to fail, but if any of the BARs is not accessible after the size we abort
1152  * driver loading by returning -ENODEV.
1153  */
1154 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1155 {
1156         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1157         struct pci_bus *root;
1158         struct resource *res;
1159         unsigned int i;
1160         u16 cmd;
1161         int r;
1162
1163         if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1164                 return 0;
1165
1166         /* Bypass for VF */
1167         if (amdgpu_sriov_vf(adev))
1168                 return 0;
1169
1170         /* skip if the bios has already enabled large BAR */
1171         if (adev->gmc.real_vram_size &&
1172             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1173                 return 0;
1174
1175         /* Check if the root BUS has 64bit memory resources */
1176         root = adev->pdev->bus;
1177         while (root->parent)
1178                 root = root->parent;
1179
1180         pci_bus_for_each_resource(root, res, i) {
1181                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1182                     res->start > 0x100000000ull)
1183                         break;
1184         }
1185
1186         /* Trying to resize is pointless without a root hub window above 4GB */
1187         if (!res)
1188                 return 0;
1189
1190         /* Limit the BAR size to what is available */
1191         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1192                         rbar_size);
1193
1194         /* Disable memory decoding while we change the BAR addresses and size */
1195         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1196         pci_write_config_word(adev->pdev, PCI_COMMAND,
1197                               cmd & ~PCI_COMMAND_MEMORY);
1198
1199         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1200         amdgpu_doorbell_fini(adev);
1201         if (adev->asic_type >= CHIP_BONAIRE)
1202                 pci_release_resource(adev->pdev, 2);
1203
1204         pci_release_resource(adev->pdev, 0);
1205
1206         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1207         if (r == -ENOSPC)
1208                 DRM_INFO("Not enough PCI address space for a large BAR.");
1209         else if (r && r != -ENOTSUPP)
1210                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1211
1212         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1213
1214         /* When the doorbell or fb BAR isn't available we have no chance of
1215          * using the device.
1216          */
1217         r = amdgpu_doorbell_init(adev);
1218         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1219                 return -ENODEV;
1220
1221         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1222
1223         return 0;
1224 }
1225
1226 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1227 {
1228         if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1229                 return false;
1230
1231         return true;
1232 }
1233
1234 /*
1235  * GPU helpers function.
1236  */
1237 /**
1238  * amdgpu_device_need_post - check if the hw need post or not
1239  *
1240  * @adev: amdgpu_device pointer
1241  *
1242  * Check if the asic has been initialized (all asics) at driver startup
1243  * or post is needed if  hw reset is performed.
1244  * Returns true if need or false if not.
1245  */
1246 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1247 {
1248         uint32_t reg;
1249
1250         if (amdgpu_sriov_vf(adev))
1251                 return false;
1252
1253         if (!amdgpu_device_read_bios(adev))
1254                 return false;
1255
1256         if (amdgpu_passthrough(adev)) {
1257                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1258                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1259                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1260                  * vpost executed for smc version below 22.15
1261                  */
1262                 if (adev->asic_type == CHIP_FIJI) {
1263                         int err;
1264                         uint32_t fw_ver;
1265
1266                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1267                         /* force vPost if error occured */
1268                         if (err)
1269                                 return true;
1270
1271                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1272                         if (fw_ver < 0x00160e00)
1273                                 return true;
1274                 }
1275         }
1276
1277         /* Don't post if we need to reset whole hive on init */
1278         if (adev->gmc.xgmi.pending_reset)
1279                 return false;
1280
1281         if (adev->has_hw_reset) {
1282                 adev->has_hw_reset = false;
1283                 return true;
1284         }
1285
1286         /* bios scratch used on CIK+ */
1287         if (adev->asic_type >= CHIP_BONAIRE)
1288                 return amdgpu_atombios_scratch_need_asic_init(adev);
1289
1290         /* check MEM_SIZE for older asics */
1291         reg = amdgpu_asic_get_config_memsize(adev);
1292
1293         if ((reg != 0) && (reg != 0xffffffff))
1294                 return false;
1295
1296         return true;
1297 }
1298
1299 /*
1300  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1301  * speed switching. Until we have confirmation from Intel that a specific host
1302  * supports it, it's safer that we keep it disabled for all.
1303  *
1304  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1305  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1306  */
1307 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1308 {
1309 #if IS_ENABLED(CONFIG_X86)
1310         struct cpuinfo_x86 *c = &cpu_data(0);
1311
1312         if (c->x86_vendor == X86_VENDOR_INTEL)
1313                 return false;
1314 #endif
1315         return true;
1316 }
1317
1318 /**
1319  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1320  *
1321  * @adev: amdgpu_device pointer
1322  *
1323  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1324  * be set for this device.
1325  *
1326  * Returns true if it should be used or false if not.
1327  */
1328 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1329 {
1330         switch (amdgpu_aspm) {
1331         case -1:
1332                 break;
1333         case 0:
1334                 return false;
1335         case 1:
1336                 return true;
1337         default:
1338                 return false;
1339         }
1340         return pcie_aspm_enabled(adev->pdev);
1341 }
1342
1343 bool amdgpu_device_aspm_support_quirk(void)
1344 {
1345 #if IS_ENABLED(CONFIG_X86)
1346         struct cpuinfo_x86 *c = &cpu_data(0);
1347
1348         return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1349 #else
1350         return true;
1351 #endif
1352 }
1353
1354 /* if we get transitioned to only one device, take VGA back */
1355 /**
1356  * amdgpu_device_vga_set_decode - enable/disable vga decode
1357  *
1358  * @pdev: PCI device pointer
1359  * @state: enable/disable vga decode
1360  *
1361  * Enable/disable vga decode (all asics).
1362  * Returns VGA resource flags.
1363  */
1364 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1365                 bool state)
1366 {
1367         struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1368
1369         amdgpu_asic_set_vga_state(adev, state);
1370         if (state)
1371                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1372                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1373         else
1374                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1375 }
1376
1377 /**
1378  * amdgpu_device_check_block_size - validate the vm block size
1379  *
1380  * @adev: amdgpu_device pointer
1381  *
1382  * Validates the vm block size specified via module parameter.
1383  * The vm block size defines number of bits in page table versus page directory,
1384  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1385  * page table and the remaining bits are in the page directory.
1386  */
1387 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1388 {
1389         /* defines number of bits in page table versus page directory,
1390          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1391          * page table and the remaining bits are in the page directory
1392          */
1393         if (amdgpu_vm_block_size == -1)
1394                 return;
1395
1396         if (amdgpu_vm_block_size < 9) {
1397                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1398                          amdgpu_vm_block_size);
1399                 amdgpu_vm_block_size = -1;
1400         }
1401 }
1402
1403 /**
1404  * amdgpu_device_check_vm_size - validate the vm size
1405  *
1406  * @adev: amdgpu_device pointer
1407  *
1408  * Validates the vm size in GB specified via module parameter.
1409  * The VM size is the size of the GPU virtual memory space in GB.
1410  */
1411 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1412 {
1413         /* no need to check the default value */
1414         if (amdgpu_vm_size == -1)
1415                 return;
1416
1417         if (amdgpu_vm_size < 1) {
1418                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1419                          amdgpu_vm_size);
1420                 amdgpu_vm_size = -1;
1421         }
1422 }
1423
1424 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1425 {
1426         struct sysinfo si;
1427         bool is_os_64 = (sizeof(void *) == 8);
1428         uint64_t total_memory;
1429         uint64_t dram_size_seven_GB = 0x1B8000000;
1430         uint64_t dram_size_three_GB = 0xB8000000;
1431
1432         if (amdgpu_smu_memory_pool_size == 0)
1433                 return;
1434
1435         if (!is_os_64) {
1436                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1437                 goto def_value;
1438         }
1439         si_meminfo(&si);
1440         total_memory = (uint64_t)si.totalram * si.mem_unit;
1441
1442         if ((amdgpu_smu_memory_pool_size == 1) ||
1443                 (amdgpu_smu_memory_pool_size == 2)) {
1444                 if (total_memory < dram_size_three_GB)
1445                         goto def_value1;
1446         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1447                 (amdgpu_smu_memory_pool_size == 8)) {
1448                 if (total_memory < dram_size_seven_GB)
1449                         goto def_value1;
1450         } else {
1451                 DRM_WARN("Smu memory pool size not supported\n");
1452                 goto def_value;
1453         }
1454         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1455
1456         return;
1457
1458 def_value1:
1459         DRM_WARN("No enough system memory\n");
1460 def_value:
1461         adev->pm.smu_prv_buffer_size = 0;
1462 }
1463
1464 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1465 {
1466         if (!(adev->flags & AMD_IS_APU) ||
1467             adev->asic_type < CHIP_RAVEN)
1468                 return 0;
1469
1470         switch (adev->asic_type) {
1471         case CHIP_RAVEN:
1472                 if (adev->pdev->device == 0x15dd)
1473                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1474                 if (adev->pdev->device == 0x15d8)
1475                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1476                 break;
1477         case CHIP_RENOIR:
1478                 if ((adev->pdev->device == 0x1636) ||
1479                     (adev->pdev->device == 0x164c))
1480                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1481                 else
1482                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1483                 break;
1484         case CHIP_VANGOGH:
1485                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1486                 break;
1487         case CHIP_YELLOW_CARP:
1488                 break;
1489         case CHIP_CYAN_SKILLFISH:
1490                 if ((adev->pdev->device == 0x13FE) ||
1491                     (adev->pdev->device == 0x143F))
1492                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1493                 break;
1494         default:
1495                 break;
1496         }
1497
1498         return 0;
1499 }
1500
1501 /**
1502  * amdgpu_device_check_arguments - validate module params
1503  *
1504  * @adev: amdgpu_device pointer
1505  *
1506  * Validates certain module parameters and updates
1507  * the associated values used by the driver (all asics).
1508  */
1509 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1510 {
1511         if (amdgpu_sched_jobs < 4) {
1512                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1513                          amdgpu_sched_jobs);
1514                 amdgpu_sched_jobs = 4;
1515         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1516                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1517                          amdgpu_sched_jobs);
1518                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1519         }
1520
1521         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1522                 /* gart size must be greater or equal to 32M */
1523                 dev_warn(adev->dev, "gart size (%d) too small\n",
1524                          amdgpu_gart_size);
1525                 amdgpu_gart_size = -1;
1526         }
1527
1528         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1529                 /* gtt size must be greater or equal to 32M */
1530                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1531                                  amdgpu_gtt_size);
1532                 amdgpu_gtt_size = -1;
1533         }
1534
1535         /* valid range is between 4 and 9 inclusive */
1536         if (amdgpu_vm_fragment_size != -1 &&
1537             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1538                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1539                 amdgpu_vm_fragment_size = -1;
1540         }
1541
1542         if (amdgpu_sched_hw_submission < 2) {
1543                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1544                          amdgpu_sched_hw_submission);
1545                 amdgpu_sched_hw_submission = 2;
1546         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1547                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1548                          amdgpu_sched_hw_submission);
1549                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1550         }
1551
1552         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1553                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1554                 amdgpu_reset_method = -1;
1555         }
1556
1557         amdgpu_device_check_smu_prv_buffer_size(adev);
1558
1559         amdgpu_device_check_vm_size(adev);
1560
1561         amdgpu_device_check_block_size(adev);
1562
1563         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1564
1565         return 0;
1566 }
1567
1568 /**
1569  * amdgpu_switcheroo_set_state - set switcheroo state
1570  *
1571  * @pdev: pci dev pointer
1572  * @state: vga_switcheroo state
1573  *
1574  * Callback for the switcheroo driver.  Suspends or resumes
1575  * the asics before or after it is powered up using ACPI methods.
1576  */
1577 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1578                                         enum vga_switcheroo_state state)
1579 {
1580         struct drm_device *dev = pci_get_drvdata(pdev);
1581         int r;
1582
1583         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1584                 return;
1585
1586         if (state == VGA_SWITCHEROO_ON) {
1587                 pr_info("switched on\n");
1588                 /* don't suspend or resume card normally */
1589                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1590
1591                 pci_set_power_state(pdev, PCI_D0);
1592                 amdgpu_device_load_pci_state(pdev);
1593                 r = pci_enable_device(pdev);
1594                 if (r)
1595                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1596                 amdgpu_device_resume(dev, true);
1597
1598                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1599         } else {
1600                 pr_info("switched off\n");
1601                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1602                 amdgpu_device_suspend(dev, true);
1603                 amdgpu_device_cache_pci_state(pdev);
1604                 /* Shut down the device */
1605                 pci_disable_device(pdev);
1606                 pci_set_power_state(pdev, PCI_D3cold);
1607                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1608         }
1609 }
1610
1611 /**
1612  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1613  *
1614  * @pdev: pci dev pointer
1615  *
1616  * Callback for the switcheroo driver.  Check of the switcheroo
1617  * state can be changed.
1618  * Returns true if the state can be changed, false if not.
1619  */
1620 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1621 {
1622         struct drm_device *dev = pci_get_drvdata(pdev);
1623
1624        /*
1625         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1626         * locking inversion with the driver load path. And the access here is
1627         * completely racy anyway. So don't bother with locking for now.
1628         */
1629         return atomic_read(&dev->open_count) == 0;
1630 }
1631
1632 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1633         .set_gpu_state = amdgpu_switcheroo_set_state,
1634         .reprobe = NULL,
1635         .can_switch = amdgpu_switcheroo_can_switch,
1636 };
1637
1638 /**
1639  * amdgpu_device_ip_set_clockgating_state - set the CG state
1640  *
1641  * @dev: amdgpu_device pointer
1642  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1643  * @state: clockgating state (gate or ungate)
1644  *
1645  * Sets the requested clockgating state for all instances of
1646  * the hardware IP specified.
1647  * Returns the error code from the last instance.
1648  */
1649 int amdgpu_device_ip_set_clockgating_state(void *dev,
1650                                            enum amd_ip_block_type block_type,
1651                                            enum amd_clockgating_state state)
1652 {
1653         struct amdgpu_device *adev = dev;
1654         int i, r = 0;
1655
1656         for (i = 0; i < adev->num_ip_blocks; i++) {
1657                 if (!adev->ip_blocks[i].status.valid)
1658                         continue;
1659                 if (adev->ip_blocks[i].version->type != block_type)
1660                         continue;
1661                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1662                         continue;
1663                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1664                         (void *)adev, state);
1665                 if (r)
1666                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1667                                   adev->ip_blocks[i].version->funcs->name, r);
1668         }
1669         return r;
1670 }
1671
1672 /**
1673  * amdgpu_device_ip_set_powergating_state - set the PG state
1674  *
1675  * @dev: amdgpu_device pointer
1676  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1677  * @state: powergating state (gate or ungate)
1678  *
1679  * Sets the requested powergating state for all instances of
1680  * the hardware IP specified.
1681  * Returns the error code from the last instance.
1682  */
1683 int amdgpu_device_ip_set_powergating_state(void *dev,
1684                                            enum amd_ip_block_type block_type,
1685                                            enum amd_powergating_state state)
1686 {
1687         struct amdgpu_device *adev = dev;
1688         int i, r = 0;
1689
1690         for (i = 0; i < adev->num_ip_blocks; i++) {
1691                 if (!adev->ip_blocks[i].status.valid)
1692                         continue;
1693                 if (adev->ip_blocks[i].version->type != block_type)
1694                         continue;
1695                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1696                         continue;
1697                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1698                         (void *)adev, state);
1699                 if (r)
1700                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1701                                   adev->ip_blocks[i].version->funcs->name, r);
1702         }
1703         return r;
1704 }
1705
1706 /**
1707  * amdgpu_device_ip_get_clockgating_state - get the CG state
1708  *
1709  * @adev: amdgpu_device pointer
1710  * @flags: clockgating feature flags
1711  *
1712  * Walks the list of IPs on the device and updates the clockgating
1713  * flags for each IP.
1714  * Updates @flags with the feature flags for each hardware IP where
1715  * clockgating is enabled.
1716  */
1717 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1718                                             u64 *flags)
1719 {
1720         int i;
1721
1722         for (i = 0; i < adev->num_ip_blocks; i++) {
1723                 if (!adev->ip_blocks[i].status.valid)
1724                         continue;
1725                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1726                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1727         }
1728 }
1729
1730 /**
1731  * amdgpu_device_ip_wait_for_idle - wait for idle
1732  *
1733  * @adev: amdgpu_device pointer
1734  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1735  *
1736  * Waits for the request hardware IP to be idle.
1737  * Returns 0 for success or a negative error code on failure.
1738  */
1739 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1740                                    enum amd_ip_block_type block_type)
1741 {
1742         int i, r;
1743
1744         for (i = 0; i < adev->num_ip_blocks; i++) {
1745                 if (!adev->ip_blocks[i].status.valid)
1746                         continue;
1747                 if (adev->ip_blocks[i].version->type == block_type) {
1748                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1749                         if (r)
1750                                 return r;
1751                         break;
1752                 }
1753         }
1754         return 0;
1755
1756 }
1757
1758 /**
1759  * amdgpu_device_ip_is_idle - is the hardware IP idle
1760  *
1761  * @adev: amdgpu_device pointer
1762  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1763  *
1764  * Check if the hardware IP is idle or not.
1765  * Returns true if it the IP is idle, false if not.
1766  */
1767 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1768                               enum amd_ip_block_type block_type)
1769 {
1770         int i;
1771
1772         for (i = 0; i < adev->num_ip_blocks; i++) {
1773                 if (!adev->ip_blocks[i].status.valid)
1774                         continue;
1775                 if (adev->ip_blocks[i].version->type == block_type)
1776                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1777         }
1778         return true;
1779
1780 }
1781
1782 /**
1783  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1784  *
1785  * @adev: amdgpu_device pointer
1786  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1787  *
1788  * Returns a pointer to the hardware IP block structure
1789  * if it exists for the asic, otherwise NULL.
1790  */
1791 struct amdgpu_ip_block *
1792 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1793                               enum amd_ip_block_type type)
1794 {
1795         int i;
1796
1797         for (i = 0; i < adev->num_ip_blocks; i++)
1798                 if (adev->ip_blocks[i].version->type == type)
1799                         return &adev->ip_blocks[i];
1800
1801         return NULL;
1802 }
1803
1804 /**
1805  * amdgpu_device_ip_block_version_cmp
1806  *
1807  * @adev: amdgpu_device pointer
1808  * @type: enum amd_ip_block_type
1809  * @major: major version
1810  * @minor: minor version
1811  *
1812  * return 0 if equal or greater
1813  * return 1 if smaller or the ip_block doesn't exist
1814  */
1815 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1816                                        enum amd_ip_block_type type,
1817                                        u32 major, u32 minor)
1818 {
1819         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1820
1821         if (ip_block && ((ip_block->version->major > major) ||
1822                         ((ip_block->version->major == major) &&
1823                         (ip_block->version->minor >= minor))))
1824                 return 0;
1825
1826         return 1;
1827 }
1828
1829 /**
1830  * amdgpu_device_ip_block_add
1831  *
1832  * @adev: amdgpu_device pointer
1833  * @ip_block_version: pointer to the IP to add
1834  *
1835  * Adds the IP block driver information to the collection of IPs
1836  * on the asic.
1837  */
1838 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1839                                const struct amdgpu_ip_block_version *ip_block_version)
1840 {
1841         if (!ip_block_version)
1842                 return -EINVAL;
1843
1844         switch (ip_block_version->type) {
1845         case AMD_IP_BLOCK_TYPE_VCN:
1846                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1847                         return 0;
1848                 break;
1849         case AMD_IP_BLOCK_TYPE_JPEG:
1850                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1851                         return 0;
1852                 break;
1853         default:
1854                 break;
1855         }
1856
1857         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1858                   ip_block_version->funcs->name);
1859
1860         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1861
1862         return 0;
1863 }
1864
1865 /**
1866  * amdgpu_device_enable_virtual_display - enable virtual display feature
1867  *
1868  * @adev: amdgpu_device pointer
1869  *
1870  * Enabled the virtual display feature if the user has enabled it via
1871  * the module parameter virtual_display.  This feature provides a virtual
1872  * display hardware on headless boards or in virtualized environments.
1873  * This function parses and validates the configuration string specified by
1874  * the user and configues the virtual display configuration (number of
1875  * virtual connectors, crtcs, etc.) specified.
1876  */
1877 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1878 {
1879         adev->enable_virtual_display = false;
1880
1881         if (amdgpu_virtual_display) {
1882                 const char *pci_address_name = pci_name(adev->pdev);
1883                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1884
1885                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1886                 pciaddstr_tmp = pciaddstr;
1887                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1888                         pciaddname = strsep(&pciaddname_tmp, ",");
1889                         if (!strcmp("all", pciaddname)
1890                             || !strcmp(pci_address_name, pciaddname)) {
1891                                 long num_crtc;
1892                                 int res = -1;
1893
1894                                 adev->enable_virtual_display = true;
1895
1896                                 if (pciaddname_tmp)
1897                                         res = kstrtol(pciaddname_tmp, 10,
1898                                                       &num_crtc);
1899
1900                                 if (!res) {
1901                                         if (num_crtc < 1)
1902                                                 num_crtc = 1;
1903                                         if (num_crtc > 6)
1904                                                 num_crtc = 6;
1905                                         adev->mode_info.num_crtc = num_crtc;
1906                                 } else {
1907                                         adev->mode_info.num_crtc = 1;
1908                                 }
1909                                 break;
1910                         }
1911                 }
1912
1913                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1914                          amdgpu_virtual_display, pci_address_name,
1915                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1916
1917                 kfree(pciaddstr);
1918         }
1919 }
1920
1921 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1922 {
1923         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1924                 adev->mode_info.num_crtc = 1;
1925                 adev->enable_virtual_display = true;
1926                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1927                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1928         }
1929 }
1930
1931 /**
1932  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1933  *
1934  * @adev: amdgpu_device pointer
1935  *
1936  * Parses the asic configuration parameters specified in the gpu info
1937  * firmware and makes them availale to the driver for use in configuring
1938  * the asic.
1939  * Returns 0 on success, -EINVAL on failure.
1940  */
1941 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1942 {
1943         const char *chip_name;
1944         char fw_name[40];
1945         int err;
1946         const struct gpu_info_firmware_header_v1_0 *hdr;
1947
1948         adev->firmware.gpu_info_fw = NULL;
1949
1950         if (adev->mman.discovery_bin) {
1951                 /*
1952                  * FIXME: The bounding box is still needed by Navi12, so
1953                  * temporarily read it from gpu_info firmware. Should be dropped
1954                  * when DAL no longer needs it.
1955                  */
1956                 if (adev->asic_type != CHIP_NAVI12)
1957                         return 0;
1958         }
1959
1960         switch (adev->asic_type) {
1961         default:
1962                 return 0;
1963         case CHIP_VEGA10:
1964                 chip_name = "vega10";
1965                 break;
1966         case CHIP_VEGA12:
1967                 chip_name = "vega12";
1968                 break;
1969         case CHIP_RAVEN:
1970                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1971                         chip_name = "raven2";
1972                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1973                         chip_name = "picasso";
1974                 else
1975                         chip_name = "raven";
1976                 break;
1977         case CHIP_ARCTURUS:
1978                 chip_name = "arcturus";
1979                 break;
1980         case CHIP_NAVI12:
1981                 chip_name = "navi12";
1982                 break;
1983         }
1984
1985         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1986         err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1987         if (err) {
1988                 dev_err(adev->dev,
1989                         "Failed to get gpu_info firmware \"%s\"\n",
1990                         fw_name);
1991                 goto out;
1992         }
1993
1994         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1995         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1996
1997         switch (hdr->version_major) {
1998         case 1:
1999         {
2000                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2001                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2002                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2003
2004                 /*
2005                  * Should be droped when DAL no longer needs it.
2006                  */
2007                 if (adev->asic_type == CHIP_NAVI12)
2008                         goto parse_soc_bounding_box;
2009
2010                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2011                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2012                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2013                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2014                 adev->gfx.config.max_texture_channel_caches =
2015                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
2016                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2017                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2018                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2019                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2020                 adev->gfx.config.double_offchip_lds_buf =
2021                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2022                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2023                 adev->gfx.cu_info.max_waves_per_simd =
2024                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2025                 adev->gfx.cu_info.max_scratch_slots_per_cu =
2026                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2027                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2028                 if (hdr->version_minor >= 1) {
2029                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2030                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2031                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2032                         adev->gfx.config.num_sc_per_sh =
2033                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2034                         adev->gfx.config.num_packer_per_sc =
2035                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2036                 }
2037
2038 parse_soc_bounding_box:
2039                 /*
2040                  * soc bounding box info is not integrated in disocovery table,
2041                  * we always need to parse it from gpu info firmware if needed.
2042                  */
2043                 if (hdr->version_minor == 2) {
2044                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2045                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2046                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2047                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2048                 }
2049                 break;
2050         }
2051         default:
2052                 dev_err(adev->dev,
2053                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2054                 err = -EINVAL;
2055                 goto out;
2056         }
2057 out:
2058         return err;
2059 }
2060
2061 /**
2062  * amdgpu_device_ip_early_init - run early init for hardware IPs
2063  *
2064  * @adev: amdgpu_device pointer
2065  *
2066  * Early initialization pass for hardware IPs.  The hardware IPs that make
2067  * up each asic are discovered each IP's early_init callback is run.  This
2068  * is the first stage in initializing the asic.
2069  * Returns 0 on success, negative error code on failure.
2070  */
2071 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2072 {
2073         struct drm_device *dev = adev_to_drm(adev);
2074         struct pci_dev *parent;
2075         int i, r;
2076         bool total;
2077
2078         amdgpu_device_enable_virtual_display(adev);
2079
2080         if (amdgpu_sriov_vf(adev)) {
2081                 r = amdgpu_virt_request_full_gpu(adev, true);
2082                 if (r)
2083                         return r;
2084         }
2085
2086         switch (adev->asic_type) {
2087 #ifdef CONFIG_DRM_AMDGPU_SI
2088         case CHIP_VERDE:
2089         case CHIP_TAHITI:
2090         case CHIP_PITCAIRN:
2091         case CHIP_OLAND:
2092         case CHIP_HAINAN:
2093                 adev->family = AMDGPU_FAMILY_SI;
2094                 r = si_set_ip_blocks(adev);
2095                 if (r)
2096                         return r;
2097                 break;
2098 #endif
2099 #ifdef CONFIG_DRM_AMDGPU_CIK
2100         case CHIP_BONAIRE:
2101         case CHIP_HAWAII:
2102         case CHIP_KAVERI:
2103         case CHIP_KABINI:
2104         case CHIP_MULLINS:
2105                 if (adev->flags & AMD_IS_APU)
2106                         adev->family = AMDGPU_FAMILY_KV;
2107                 else
2108                         adev->family = AMDGPU_FAMILY_CI;
2109
2110                 r = cik_set_ip_blocks(adev);
2111                 if (r)
2112                         return r;
2113                 break;
2114 #endif
2115         case CHIP_TOPAZ:
2116         case CHIP_TONGA:
2117         case CHIP_FIJI:
2118         case CHIP_POLARIS10:
2119         case CHIP_POLARIS11:
2120         case CHIP_POLARIS12:
2121         case CHIP_VEGAM:
2122         case CHIP_CARRIZO:
2123         case CHIP_STONEY:
2124                 if (adev->flags & AMD_IS_APU)
2125                         adev->family = AMDGPU_FAMILY_CZ;
2126                 else
2127                         adev->family = AMDGPU_FAMILY_VI;
2128
2129                 r = vi_set_ip_blocks(adev);
2130                 if (r)
2131                         return r;
2132                 break;
2133         default:
2134                 r = amdgpu_discovery_set_ip_blocks(adev);
2135                 if (r)
2136                         return r;
2137                 break;
2138         }
2139
2140         if (amdgpu_has_atpx() &&
2141             (amdgpu_is_atpx_hybrid() ||
2142              amdgpu_has_atpx_dgpu_power_cntl()) &&
2143             ((adev->flags & AMD_IS_APU) == 0) &&
2144             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2145                 adev->flags |= AMD_IS_PX;
2146
2147         if (!(adev->flags & AMD_IS_APU)) {
2148                 parent = pci_upstream_bridge(adev->pdev);
2149                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2150         }
2151
2152
2153         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2154         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2155                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2156         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2157                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2158
2159         total = true;
2160         for (i = 0; i < adev->num_ip_blocks; i++) {
2161                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2162                         DRM_WARN("disabled ip block: %d <%s>\n",
2163                                   i, adev->ip_blocks[i].version->funcs->name);
2164                         adev->ip_blocks[i].status.valid = false;
2165                 } else {
2166                         if (adev->ip_blocks[i].version->funcs->early_init) {
2167                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2168                                 if (r == -ENOENT) {
2169                                         adev->ip_blocks[i].status.valid = false;
2170                                 } else if (r) {
2171                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2172                                                   adev->ip_blocks[i].version->funcs->name, r);
2173                                         total = false;
2174                                 } else {
2175                                         adev->ip_blocks[i].status.valid = true;
2176                                 }
2177                         } else {
2178                                 adev->ip_blocks[i].status.valid = true;
2179                         }
2180                 }
2181                 /* get the vbios after the asic_funcs are set up */
2182                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2183                         r = amdgpu_device_parse_gpu_info_fw(adev);
2184                         if (r)
2185                                 return r;
2186
2187                         /* Read BIOS */
2188                         if (amdgpu_device_read_bios(adev)) {
2189                                 if (!amdgpu_get_bios(adev))
2190                                         return -EINVAL;
2191
2192                                 r = amdgpu_atombios_init(adev);
2193                                 if (r) {
2194                                         dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2195                                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2196                                         return r;
2197                                 }
2198                         }
2199
2200                         /*get pf2vf msg info at it's earliest time*/
2201                         if (amdgpu_sriov_vf(adev))
2202                                 amdgpu_virt_init_data_exchange(adev);
2203
2204                 }
2205         }
2206         if (!total)
2207                 return -ENODEV;
2208
2209         amdgpu_amdkfd_device_probe(adev);
2210         adev->cg_flags &= amdgpu_cg_mask;
2211         adev->pg_flags &= amdgpu_pg_mask;
2212
2213         return 0;
2214 }
2215
2216 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2217 {
2218         int i, r;
2219
2220         for (i = 0; i < adev->num_ip_blocks; i++) {
2221                 if (!adev->ip_blocks[i].status.sw)
2222                         continue;
2223                 if (adev->ip_blocks[i].status.hw)
2224                         continue;
2225                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2226                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2227                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2228                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2229                         if (r) {
2230                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2231                                           adev->ip_blocks[i].version->funcs->name, r);
2232                                 return r;
2233                         }
2234                         adev->ip_blocks[i].status.hw = true;
2235                 }
2236         }
2237
2238         return 0;
2239 }
2240
2241 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2242 {
2243         int i, r;
2244
2245         for (i = 0; i < adev->num_ip_blocks; i++) {
2246                 if (!adev->ip_blocks[i].status.sw)
2247                         continue;
2248                 if (adev->ip_blocks[i].status.hw)
2249                         continue;
2250                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2251                 if (r) {
2252                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2253                                   adev->ip_blocks[i].version->funcs->name, r);
2254                         return r;
2255                 }
2256                 adev->ip_blocks[i].status.hw = true;
2257         }
2258
2259         return 0;
2260 }
2261
2262 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2263 {
2264         int r = 0;
2265         int i;
2266         uint32_t smu_version;
2267
2268         if (adev->asic_type >= CHIP_VEGA10) {
2269                 for (i = 0; i < adev->num_ip_blocks; i++) {
2270                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2271                                 continue;
2272
2273                         if (!adev->ip_blocks[i].status.sw)
2274                                 continue;
2275
2276                         /* no need to do the fw loading again if already done*/
2277                         if (adev->ip_blocks[i].status.hw == true)
2278                                 break;
2279
2280                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2281                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2282                                 if (r) {
2283                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2284                                                           adev->ip_blocks[i].version->funcs->name, r);
2285                                         return r;
2286                                 }
2287                         } else {
2288                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2289                                 if (r) {
2290                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2291                                                           adev->ip_blocks[i].version->funcs->name, r);
2292                                         return r;
2293                                 }
2294                         }
2295
2296                         adev->ip_blocks[i].status.hw = true;
2297                         break;
2298                 }
2299         }
2300
2301         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2302                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2303
2304         return r;
2305 }
2306
2307 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2308 {
2309         long timeout;
2310         int r, i;
2311
2312         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2313                 struct amdgpu_ring *ring = adev->rings[i];
2314
2315                 /* No need to setup the GPU scheduler for rings that don't need it */
2316                 if (!ring || ring->no_scheduler)
2317                         continue;
2318
2319                 switch (ring->funcs->type) {
2320                 case AMDGPU_RING_TYPE_GFX:
2321                         timeout = adev->gfx_timeout;
2322                         break;
2323                 case AMDGPU_RING_TYPE_COMPUTE:
2324                         timeout = adev->compute_timeout;
2325                         break;
2326                 case AMDGPU_RING_TYPE_SDMA:
2327                         timeout = adev->sdma_timeout;
2328                         break;
2329                 default:
2330                         timeout = adev->video_timeout;
2331                         break;
2332                 }
2333
2334                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2335                                    ring->num_hw_submission, 0,
2336                                    timeout, adev->reset_domain->wq,
2337                                    ring->sched_score, ring->name,
2338                                    adev->dev);
2339                 if (r) {
2340                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2341                                   ring->name);
2342                         return r;
2343                 }
2344         }
2345
2346         amdgpu_xcp_update_partition_sched_list(adev);
2347
2348         return 0;
2349 }
2350
2351
2352 /**
2353  * amdgpu_device_ip_init - run init for hardware IPs
2354  *
2355  * @adev: amdgpu_device pointer
2356  *
2357  * Main initialization pass for hardware IPs.  The list of all the hardware
2358  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2359  * are run.  sw_init initializes the software state associated with each IP
2360  * and hw_init initializes the hardware associated with each IP.
2361  * Returns 0 on success, negative error code on failure.
2362  */
2363 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2364 {
2365         int i, r;
2366
2367         r = amdgpu_ras_init(adev);
2368         if (r)
2369                 return r;
2370
2371         for (i = 0; i < adev->num_ip_blocks; i++) {
2372                 if (!adev->ip_blocks[i].status.valid)
2373                         continue;
2374                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2375                 if (r) {
2376                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2377                                   adev->ip_blocks[i].version->funcs->name, r);
2378                         goto init_failed;
2379                 }
2380                 adev->ip_blocks[i].status.sw = true;
2381
2382                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2383                         /* need to do common hw init early so everything is set up for gmc */
2384                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2385                         if (r) {
2386                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2387                                 goto init_failed;
2388                         }
2389                         adev->ip_blocks[i].status.hw = true;
2390                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2391                         /* need to do gmc hw init early so we can allocate gpu mem */
2392                         /* Try to reserve bad pages early */
2393                         if (amdgpu_sriov_vf(adev))
2394                                 amdgpu_virt_exchange_data(adev);
2395
2396                         r = amdgpu_device_mem_scratch_init(adev);
2397                         if (r) {
2398                                 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2399                                 goto init_failed;
2400                         }
2401                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2402                         if (r) {
2403                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2404                                 goto init_failed;
2405                         }
2406                         r = amdgpu_device_wb_init(adev);
2407                         if (r) {
2408                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2409                                 goto init_failed;
2410                         }
2411                         adev->ip_blocks[i].status.hw = true;
2412
2413                         /* right after GMC hw init, we create CSA */
2414                         if (adev->gfx.mcbp) {
2415                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2416                                                                AMDGPU_GEM_DOMAIN_VRAM |
2417                                                                AMDGPU_GEM_DOMAIN_GTT,
2418                                                                AMDGPU_CSA_SIZE);
2419                                 if (r) {
2420                                         DRM_ERROR("allocate CSA failed %d\n", r);
2421                                         goto init_failed;
2422                                 }
2423                         }
2424                 }
2425         }
2426
2427         if (amdgpu_sriov_vf(adev))
2428                 amdgpu_virt_init_data_exchange(adev);
2429
2430         r = amdgpu_ib_pool_init(adev);
2431         if (r) {
2432                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2433                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2434                 goto init_failed;
2435         }
2436
2437         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2438         if (r)
2439                 goto init_failed;
2440
2441         r = amdgpu_device_ip_hw_init_phase1(adev);
2442         if (r)
2443                 goto init_failed;
2444
2445         r = amdgpu_device_fw_loading(adev);
2446         if (r)
2447                 goto init_failed;
2448
2449         r = amdgpu_device_ip_hw_init_phase2(adev);
2450         if (r)
2451                 goto init_failed;
2452
2453         /*
2454          * retired pages will be loaded from eeprom and reserved here,
2455          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2456          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2457          * for I2C communication which only true at this point.
2458          *
2459          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2460          * failure from bad gpu situation and stop amdgpu init process
2461          * accordingly. For other failed cases, it will still release all
2462          * the resource and print error message, rather than returning one
2463          * negative value to upper level.
2464          *
2465          * Note: theoretically, this should be called before all vram allocations
2466          * to protect retired page from abusing
2467          */
2468         r = amdgpu_ras_recovery_init(adev);
2469         if (r)
2470                 goto init_failed;
2471
2472         /**
2473          * In case of XGMI grab extra reference for reset domain for this device
2474          */
2475         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2476                 if (amdgpu_xgmi_add_device(adev) == 0) {
2477                         if (!amdgpu_sriov_vf(adev)) {
2478                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2479
2480                                 if (WARN_ON(!hive)) {
2481                                         r = -ENOENT;
2482                                         goto init_failed;
2483                                 }
2484
2485                                 if (!hive->reset_domain ||
2486                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2487                                         r = -ENOENT;
2488                                         amdgpu_put_xgmi_hive(hive);
2489                                         goto init_failed;
2490                                 }
2491
2492                                 /* Drop the early temporary reset domain we created for device */
2493                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2494                                 adev->reset_domain = hive->reset_domain;
2495                                 amdgpu_put_xgmi_hive(hive);
2496                         }
2497                 }
2498         }
2499
2500         r = amdgpu_device_init_schedulers(adev);
2501         if (r)
2502                 goto init_failed;
2503
2504         /* Don't init kfd if whole hive need to be reset during init */
2505         if (!adev->gmc.xgmi.pending_reset) {
2506                 kgd2kfd_init_zone_device(adev);
2507                 amdgpu_amdkfd_device_init(adev);
2508         }
2509
2510         amdgpu_fru_get_product_info(adev);
2511
2512 init_failed:
2513
2514         return r;
2515 }
2516
2517 /**
2518  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2519  *
2520  * @adev: amdgpu_device pointer
2521  *
2522  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2523  * this function before a GPU reset.  If the value is retained after a
2524  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2525  */
2526 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2527 {
2528         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2529 }
2530
2531 /**
2532  * amdgpu_device_check_vram_lost - check if vram is valid
2533  *
2534  * @adev: amdgpu_device pointer
2535  *
2536  * Checks the reset magic value written to the gart pointer in VRAM.
2537  * The driver calls this after a GPU reset to see if the contents of
2538  * VRAM is lost or now.
2539  * returns true if vram is lost, false if not.
2540  */
2541 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2542 {
2543         if (memcmp(adev->gart.ptr, adev->reset_magic,
2544                         AMDGPU_RESET_MAGIC_NUM))
2545                 return true;
2546
2547         if (!amdgpu_in_reset(adev))
2548                 return false;
2549
2550         /*
2551          * For all ASICs with baco/mode1 reset, the VRAM is
2552          * always assumed to be lost.
2553          */
2554         switch (amdgpu_asic_reset_method(adev)) {
2555         case AMD_RESET_METHOD_BACO:
2556         case AMD_RESET_METHOD_MODE1:
2557                 return true;
2558         default:
2559                 return false;
2560         }
2561 }
2562
2563 /**
2564  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2565  *
2566  * @adev: amdgpu_device pointer
2567  * @state: clockgating state (gate or ungate)
2568  *
2569  * The list of all the hardware IPs that make up the asic is walked and the
2570  * set_clockgating_state callbacks are run.
2571  * Late initialization pass enabling clockgating for hardware IPs.
2572  * Fini or suspend, pass disabling clockgating for hardware IPs.
2573  * Returns 0 on success, negative error code on failure.
2574  */
2575
2576 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2577                                enum amd_clockgating_state state)
2578 {
2579         int i, j, r;
2580
2581         if (amdgpu_emu_mode == 1)
2582                 return 0;
2583
2584         for (j = 0; j < adev->num_ip_blocks; j++) {
2585                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2586                 if (!adev->ip_blocks[i].status.late_initialized)
2587                         continue;
2588                 /* skip CG for GFX, SDMA on S0ix */
2589                 if (adev->in_s0ix &&
2590                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2591                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2592                         continue;
2593                 /* skip CG for VCE/UVD, it's handled specially */
2594                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2595                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2596                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2597                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2598                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2599                         /* enable clockgating to save power */
2600                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2601                                                                                      state);
2602                         if (r) {
2603                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2604                                           adev->ip_blocks[i].version->funcs->name, r);
2605                                 return r;
2606                         }
2607                 }
2608         }
2609
2610         return 0;
2611 }
2612
2613 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2614                                enum amd_powergating_state state)
2615 {
2616         int i, j, r;
2617
2618         if (amdgpu_emu_mode == 1)
2619                 return 0;
2620
2621         for (j = 0; j < adev->num_ip_blocks; j++) {
2622                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2623                 if (!adev->ip_blocks[i].status.late_initialized)
2624                         continue;
2625                 /* skip PG for GFX, SDMA on S0ix */
2626                 if (adev->in_s0ix &&
2627                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2628                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2629                         continue;
2630                 /* skip CG for VCE/UVD, it's handled specially */
2631                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2632                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2633                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2634                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2635                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2636                         /* enable powergating to save power */
2637                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2638                                                                                         state);
2639                         if (r) {
2640                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2641                                           adev->ip_blocks[i].version->funcs->name, r);
2642                                 return r;
2643                         }
2644                 }
2645         }
2646         return 0;
2647 }
2648
2649 static int amdgpu_device_enable_mgpu_fan_boost(void)
2650 {
2651         struct amdgpu_gpu_instance *gpu_ins;
2652         struct amdgpu_device *adev;
2653         int i, ret = 0;
2654
2655         mutex_lock(&mgpu_info.mutex);
2656
2657         /*
2658          * MGPU fan boost feature should be enabled
2659          * only when there are two or more dGPUs in
2660          * the system
2661          */
2662         if (mgpu_info.num_dgpu < 2)
2663                 goto out;
2664
2665         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2666                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2667                 adev = gpu_ins->adev;
2668                 if (!(adev->flags & AMD_IS_APU) &&
2669                     !gpu_ins->mgpu_fan_enabled) {
2670                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2671                         if (ret)
2672                                 break;
2673
2674                         gpu_ins->mgpu_fan_enabled = 1;
2675                 }
2676         }
2677
2678 out:
2679         mutex_unlock(&mgpu_info.mutex);
2680
2681         return ret;
2682 }
2683
2684 /**
2685  * amdgpu_device_ip_late_init - run late init for hardware IPs
2686  *
2687  * @adev: amdgpu_device pointer
2688  *
2689  * Late initialization pass for hardware IPs.  The list of all the hardware
2690  * IPs that make up the asic is walked and the late_init callbacks are run.
2691  * late_init covers any special initialization that an IP requires
2692  * after all of the have been initialized or something that needs to happen
2693  * late in the init process.
2694  * Returns 0 on success, negative error code on failure.
2695  */
2696 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2697 {
2698         struct amdgpu_gpu_instance *gpu_instance;
2699         int i = 0, r;
2700
2701         for (i = 0; i < adev->num_ip_blocks; i++) {
2702                 if (!adev->ip_blocks[i].status.hw)
2703                         continue;
2704                 if (adev->ip_blocks[i].version->funcs->late_init) {
2705                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2706                         if (r) {
2707                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2708                                           adev->ip_blocks[i].version->funcs->name, r);
2709                                 return r;
2710                         }
2711                 }
2712                 adev->ip_blocks[i].status.late_initialized = true;
2713         }
2714
2715         r = amdgpu_ras_late_init(adev);
2716         if (r) {
2717                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2718                 return r;
2719         }
2720
2721         amdgpu_ras_set_error_query_ready(adev, true);
2722
2723         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2724         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2725
2726         amdgpu_device_fill_reset_magic(adev);
2727
2728         r = amdgpu_device_enable_mgpu_fan_boost();
2729         if (r)
2730                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2731
2732         /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2733         if (amdgpu_passthrough(adev) &&
2734             ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2735              adev->asic_type == CHIP_ALDEBARAN))
2736                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2737
2738         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2739                 mutex_lock(&mgpu_info.mutex);
2740
2741                 /*
2742                  * Reset device p-state to low as this was booted with high.
2743                  *
2744                  * This should be performed only after all devices from the same
2745                  * hive get initialized.
2746                  *
2747                  * However, it's unknown how many device in the hive in advance.
2748                  * As this is counted one by one during devices initializations.
2749                  *
2750                  * So, we wait for all XGMI interlinked devices initialized.
2751                  * This may bring some delays as those devices may come from
2752                  * different hives. But that should be OK.
2753                  */
2754                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2755                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2756                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2757                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2758                                         continue;
2759
2760                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2761                                                 AMDGPU_XGMI_PSTATE_MIN);
2762                                 if (r) {
2763                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2764                                         break;
2765                                 }
2766                         }
2767                 }
2768
2769                 mutex_unlock(&mgpu_info.mutex);
2770         }
2771
2772         return 0;
2773 }
2774
2775 /**
2776  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2777  *
2778  * @adev: amdgpu_device pointer
2779  *
2780  * For ASICs need to disable SMC first
2781  */
2782 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2783 {
2784         int i, r;
2785
2786         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2787                 return;
2788
2789         for (i = 0; i < adev->num_ip_blocks; i++) {
2790                 if (!adev->ip_blocks[i].status.hw)
2791                         continue;
2792                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2793                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2794                         /* XXX handle errors */
2795                         if (r) {
2796                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2797                                           adev->ip_blocks[i].version->funcs->name, r);
2798                         }
2799                         adev->ip_blocks[i].status.hw = false;
2800                         break;
2801                 }
2802         }
2803 }
2804
2805 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2806 {
2807         int i, r;
2808
2809         for (i = 0; i < adev->num_ip_blocks; i++) {
2810                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2811                         continue;
2812
2813                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2814                 if (r) {
2815                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2816                                   adev->ip_blocks[i].version->funcs->name, r);
2817                 }
2818         }
2819
2820         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2821         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2822
2823         amdgpu_amdkfd_suspend(adev, false);
2824
2825         /* Workaroud for ASICs need to disable SMC first */
2826         amdgpu_device_smu_fini_early(adev);
2827
2828         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2829                 if (!adev->ip_blocks[i].status.hw)
2830                         continue;
2831
2832                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2833                 /* XXX handle errors */
2834                 if (r) {
2835                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2836                                   adev->ip_blocks[i].version->funcs->name, r);
2837                 }
2838
2839                 adev->ip_blocks[i].status.hw = false;
2840         }
2841
2842         if (amdgpu_sriov_vf(adev)) {
2843                 if (amdgpu_virt_release_full_gpu(adev, false))
2844                         DRM_ERROR("failed to release exclusive mode on fini\n");
2845         }
2846
2847         return 0;
2848 }
2849
2850 /**
2851  * amdgpu_device_ip_fini - run fini for hardware IPs
2852  *
2853  * @adev: amdgpu_device pointer
2854  *
2855  * Main teardown pass for hardware IPs.  The list of all the hardware
2856  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2857  * are run.  hw_fini tears down the hardware associated with each IP
2858  * and sw_fini tears down any software state associated with each IP.
2859  * Returns 0 on success, negative error code on failure.
2860  */
2861 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2862 {
2863         int i, r;
2864
2865         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2866                 amdgpu_virt_release_ras_err_handler_data(adev);
2867
2868         if (adev->gmc.xgmi.num_physical_nodes > 1)
2869                 amdgpu_xgmi_remove_device(adev);
2870
2871         amdgpu_amdkfd_device_fini_sw(adev);
2872
2873         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2874                 if (!adev->ip_blocks[i].status.sw)
2875                         continue;
2876
2877                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2878                         amdgpu_ucode_free_bo(adev);
2879                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2880                         amdgpu_device_wb_fini(adev);
2881                         amdgpu_device_mem_scratch_fini(adev);
2882                         amdgpu_ib_pool_fini(adev);
2883                 }
2884
2885                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2886                 /* XXX handle errors */
2887                 if (r) {
2888                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2889                                   adev->ip_blocks[i].version->funcs->name, r);
2890                 }
2891                 adev->ip_blocks[i].status.sw = false;
2892                 adev->ip_blocks[i].status.valid = false;
2893         }
2894
2895         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2896                 if (!adev->ip_blocks[i].status.late_initialized)
2897                         continue;
2898                 if (adev->ip_blocks[i].version->funcs->late_fini)
2899                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2900                 adev->ip_blocks[i].status.late_initialized = false;
2901         }
2902
2903         amdgpu_ras_fini(adev);
2904
2905         return 0;
2906 }
2907
2908 /**
2909  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2910  *
2911  * @work: work_struct.
2912  */
2913 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2914 {
2915         struct amdgpu_device *adev =
2916                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2917         int r;
2918
2919         r = amdgpu_ib_ring_tests(adev);
2920         if (r)
2921                 DRM_ERROR("ib ring test failed (%d).\n", r);
2922 }
2923
2924 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2925 {
2926         struct amdgpu_device *adev =
2927                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2928
2929         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2930         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2931
2932         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2933                 adev->gfx.gfx_off_state = true;
2934 }
2935
2936 /**
2937  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2938  *
2939  * @adev: amdgpu_device pointer
2940  *
2941  * Main suspend function for hardware IPs.  The list of all the hardware
2942  * IPs that make up the asic is walked, clockgating is disabled and the
2943  * suspend callbacks are run.  suspend puts the hardware and software state
2944  * in each IP into a state suitable for suspend.
2945  * Returns 0 on success, negative error code on failure.
2946  */
2947 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2948 {
2949         int i, r;
2950
2951         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2952         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2953
2954         /*
2955          * Per PMFW team's suggestion, driver needs to handle gfxoff
2956          * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2957          * scenario. Add the missing df cstate disablement here.
2958          */
2959         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2960                 dev_warn(adev->dev, "Failed to disallow df cstate");
2961
2962         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2963                 if (!adev->ip_blocks[i].status.valid)
2964                         continue;
2965
2966                 /* displays are handled separately */
2967                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2968                         continue;
2969
2970                 /* XXX handle errors */
2971                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2972                 /* XXX handle errors */
2973                 if (r) {
2974                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2975                                   adev->ip_blocks[i].version->funcs->name, r);
2976                         return r;
2977                 }
2978
2979                 adev->ip_blocks[i].status.hw = false;
2980         }
2981
2982         return 0;
2983 }
2984
2985 /**
2986  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2987  *
2988  * @adev: amdgpu_device pointer
2989  *
2990  * Main suspend function for hardware IPs.  The list of all the hardware
2991  * IPs that make up the asic is walked, clockgating is disabled and the
2992  * suspend callbacks are run.  suspend puts the hardware and software state
2993  * in each IP into a state suitable for suspend.
2994  * Returns 0 on success, negative error code on failure.
2995  */
2996 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2997 {
2998         int i, r;
2999
3000         if (adev->in_s0ix)
3001                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3002
3003         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3004                 if (!adev->ip_blocks[i].status.valid)
3005                         continue;
3006                 /* displays are handled in phase1 */
3007                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3008                         continue;
3009                 /* PSP lost connection when err_event_athub occurs */
3010                 if (amdgpu_ras_intr_triggered() &&
3011                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3012                         adev->ip_blocks[i].status.hw = false;
3013                         continue;
3014                 }
3015
3016                 /* skip unnecessary suspend if we do not initialize them yet */
3017                 if (adev->gmc.xgmi.pending_reset &&
3018                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3019                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3020                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3021                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3022                         adev->ip_blocks[i].status.hw = false;
3023                         continue;
3024                 }
3025
3026                 /* skip suspend of gfx/mes and psp for S0ix
3027                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
3028                  * like at runtime. PSP is also part of the always on hardware
3029                  * so no need to suspend it.
3030                  */
3031                 if (adev->in_s0ix &&
3032                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3033                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3034                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3035                         continue;
3036
3037                 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3038                 if (adev->in_s0ix &&
3039                     (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3040                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3041                         continue;
3042
3043                 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3044                  * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3045                  * from this location and RLC Autoload automatically also gets loaded
3046                  * from here based on PMFW -> PSP message during re-init sequence.
3047                  * Therefore, the psp suspend & resume should be skipped to avoid destroy
3048                  * the TMR and reload FWs again for IMU enabled APU ASICs.
3049                  */
3050                 if (amdgpu_in_reset(adev) &&
3051                     (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3052                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3053                         continue;
3054
3055                 /* XXX handle errors */
3056                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3057                 /* XXX handle errors */
3058                 if (r) {
3059                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3060                                   adev->ip_blocks[i].version->funcs->name, r);
3061                 }
3062                 adev->ip_blocks[i].status.hw = false;
3063                 /* handle putting the SMC in the appropriate state */
3064                 if (!amdgpu_sriov_vf(adev)) {
3065                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3066                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3067                                 if (r) {
3068                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3069                                                         adev->mp1_state, r);
3070                                         return r;
3071                                 }
3072                         }
3073                 }
3074         }
3075
3076         return 0;
3077 }
3078
3079 /**
3080  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3081  *
3082  * @adev: amdgpu_device pointer
3083  *
3084  * Main suspend function for hardware IPs.  The list of all the hardware
3085  * IPs that make up the asic is walked, clockgating is disabled and the
3086  * suspend callbacks are run.  suspend puts the hardware and software state
3087  * in each IP into a state suitable for suspend.
3088  * Returns 0 on success, negative error code on failure.
3089  */
3090 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3091 {
3092         int r;
3093
3094         if (amdgpu_sriov_vf(adev)) {
3095                 amdgpu_virt_fini_data_exchange(adev);
3096                 amdgpu_virt_request_full_gpu(adev, false);
3097         }
3098
3099         r = amdgpu_device_ip_suspend_phase1(adev);
3100         if (r)
3101                 return r;
3102         r = amdgpu_device_ip_suspend_phase2(adev);
3103
3104         if (amdgpu_sriov_vf(adev))
3105                 amdgpu_virt_release_full_gpu(adev, false);
3106
3107         return r;
3108 }
3109
3110 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3111 {
3112         int i, r;
3113
3114         static enum amd_ip_block_type ip_order[] = {
3115                 AMD_IP_BLOCK_TYPE_COMMON,
3116                 AMD_IP_BLOCK_TYPE_GMC,
3117                 AMD_IP_BLOCK_TYPE_PSP,
3118                 AMD_IP_BLOCK_TYPE_IH,
3119         };
3120
3121         for (i = 0; i < adev->num_ip_blocks; i++) {
3122                 int j;
3123                 struct amdgpu_ip_block *block;
3124
3125                 block = &adev->ip_blocks[i];
3126                 block->status.hw = false;
3127
3128                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3129
3130                         if (block->version->type != ip_order[j] ||
3131                                 !block->status.valid)
3132                                 continue;
3133
3134                         r = block->version->funcs->hw_init(adev);
3135                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3136                         if (r)
3137                                 return r;
3138                         block->status.hw = true;
3139                 }
3140         }
3141
3142         return 0;
3143 }
3144
3145 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3146 {
3147         int i, r;
3148
3149         static enum amd_ip_block_type ip_order[] = {
3150                 AMD_IP_BLOCK_TYPE_SMC,
3151                 AMD_IP_BLOCK_TYPE_DCE,
3152                 AMD_IP_BLOCK_TYPE_GFX,
3153                 AMD_IP_BLOCK_TYPE_SDMA,
3154                 AMD_IP_BLOCK_TYPE_MES,
3155                 AMD_IP_BLOCK_TYPE_UVD,
3156                 AMD_IP_BLOCK_TYPE_VCE,
3157                 AMD_IP_BLOCK_TYPE_VCN,
3158                 AMD_IP_BLOCK_TYPE_JPEG
3159         };
3160
3161         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3162                 int j;
3163                 struct amdgpu_ip_block *block;
3164
3165                 for (j = 0; j < adev->num_ip_blocks; j++) {
3166                         block = &adev->ip_blocks[j];
3167
3168                         if (block->version->type != ip_order[i] ||
3169                                 !block->status.valid ||
3170                                 block->status.hw)
3171                                 continue;
3172
3173                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3174                                 r = block->version->funcs->resume(adev);
3175                         else
3176                                 r = block->version->funcs->hw_init(adev);
3177
3178                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3179                         if (r)
3180                                 return r;
3181                         block->status.hw = true;
3182                 }
3183         }
3184
3185         return 0;
3186 }
3187
3188 /**
3189  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3190  *
3191  * @adev: amdgpu_device pointer
3192  *
3193  * First resume function for hardware IPs.  The list of all the hardware
3194  * IPs that make up the asic is walked and the resume callbacks are run for
3195  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3196  * after a suspend and updates the software state as necessary.  This
3197  * function is also used for restoring the GPU after a GPU reset.
3198  * Returns 0 on success, negative error code on failure.
3199  */
3200 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3201 {
3202         int i, r;
3203
3204         for (i = 0; i < adev->num_ip_blocks; i++) {
3205                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3206                         continue;
3207                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3208                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3209                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3210                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3211
3212                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3213                         if (r) {
3214                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3215                                           adev->ip_blocks[i].version->funcs->name, r);
3216                                 return r;
3217                         }
3218                         adev->ip_blocks[i].status.hw = true;
3219                 }
3220         }
3221
3222         return 0;
3223 }
3224
3225 /**
3226  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3227  *
3228  * @adev: amdgpu_device pointer
3229  *
3230  * First resume function for hardware IPs.  The list of all the hardware
3231  * IPs that make up the asic is walked and the resume callbacks are run for
3232  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3233  * functional state after a suspend and updates the software state as
3234  * necessary.  This function is also used for restoring the GPU after a GPU
3235  * reset.
3236  * Returns 0 on success, negative error code on failure.
3237  */
3238 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3239 {
3240         int i, r;
3241
3242         for (i = 0; i < adev->num_ip_blocks; i++) {
3243                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3244                         continue;
3245                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3246                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3247                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3248                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3249                         continue;
3250                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3251                 if (r) {
3252                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3253                                   adev->ip_blocks[i].version->funcs->name, r);
3254                         return r;
3255                 }
3256                 adev->ip_blocks[i].status.hw = true;
3257         }
3258
3259         return 0;
3260 }
3261
3262 /**
3263  * amdgpu_device_ip_resume - run resume for hardware IPs
3264  *
3265  * @adev: amdgpu_device pointer
3266  *
3267  * Main resume function for hardware IPs.  The hardware IPs
3268  * are split into two resume functions because they are
3269  * also used in recovering from a GPU reset and some additional
3270  * steps need to be take between them.  In this case (S3/S4) they are
3271  * run sequentially.
3272  * Returns 0 on success, negative error code on failure.
3273  */
3274 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3275 {
3276         int r;
3277
3278         if (!adev->in_s0ix) {
3279                 r = amdgpu_amdkfd_resume_iommu(adev);
3280                 if (r)
3281                         return r;
3282         }
3283
3284         r = amdgpu_device_ip_resume_phase1(adev);
3285         if (r)
3286                 return r;
3287
3288         r = amdgpu_device_fw_loading(adev);
3289         if (r)
3290                 return r;
3291
3292         r = amdgpu_device_ip_resume_phase2(adev);
3293
3294         return r;
3295 }
3296
3297 /**
3298  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3299  *
3300  * @adev: amdgpu_device pointer
3301  *
3302  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3303  */
3304 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3305 {
3306         if (amdgpu_sriov_vf(adev)) {
3307                 if (adev->is_atom_fw) {
3308                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3309                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3310                 } else {
3311                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3312                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3313                 }
3314
3315                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3316                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3317         }
3318 }
3319
3320 /**
3321  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3322  *
3323  * @asic_type: AMD asic type
3324  *
3325  * Check if there is DC (new modesetting infrastructre) support for an asic.
3326  * returns true if DC has support, false if not.
3327  */
3328 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3329 {
3330         switch (asic_type) {
3331 #ifdef CONFIG_DRM_AMDGPU_SI
3332         case CHIP_HAINAN:
3333 #endif
3334         case CHIP_TOPAZ:
3335                 /* chips with no display hardware */
3336                 return false;
3337 #if defined(CONFIG_DRM_AMD_DC)
3338         case CHIP_TAHITI:
3339         case CHIP_PITCAIRN:
3340         case CHIP_VERDE:
3341         case CHIP_OLAND:
3342                 /*
3343                  * We have systems in the wild with these ASICs that require
3344                  * LVDS and VGA support which is not supported with DC.
3345                  *
3346                  * Fallback to the non-DC driver here by default so as not to
3347                  * cause regressions.
3348                  */
3349 #if defined(CONFIG_DRM_AMD_DC_SI)
3350                 return amdgpu_dc > 0;
3351 #else
3352                 return false;
3353 #endif
3354         case CHIP_BONAIRE:
3355         case CHIP_KAVERI:
3356         case CHIP_KABINI:
3357         case CHIP_MULLINS:
3358                 /*
3359                  * We have systems in the wild with these ASICs that require
3360                  * VGA support which is not supported with DC.
3361                  *
3362                  * Fallback to the non-DC driver here by default so as not to
3363                  * cause regressions.
3364                  */
3365                 return amdgpu_dc > 0;
3366         default:
3367                 return amdgpu_dc != 0;
3368 #else
3369         default:
3370                 if (amdgpu_dc > 0)
3371                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3372                 return false;
3373 #endif
3374         }
3375 }
3376
3377 /**
3378  * amdgpu_device_has_dc_support - check if dc is supported
3379  *
3380  * @adev: amdgpu_device pointer
3381  *
3382  * Returns true for supported, false for not supported
3383  */
3384 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3385 {
3386         if (adev->enable_virtual_display ||
3387             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3388                 return false;
3389
3390         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3391 }
3392
3393 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3394 {
3395         struct amdgpu_device *adev =
3396                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3397         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3398
3399         /* It's a bug to not have a hive within this function */
3400         if (WARN_ON(!hive))
3401                 return;
3402
3403         /*
3404          * Use task barrier to synchronize all xgmi reset works across the
3405          * hive. task_barrier_enter and task_barrier_exit will block
3406          * until all the threads running the xgmi reset works reach
3407          * those points. task_barrier_full will do both blocks.
3408          */
3409         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3410
3411                 task_barrier_enter(&hive->tb);
3412                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3413
3414                 if (adev->asic_reset_res)
3415                         goto fail;
3416
3417                 task_barrier_exit(&hive->tb);
3418                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3419
3420                 if (adev->asic_reset_res)
3421                         goto fail;
3422
3423                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3424                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3425                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3426         } else {
3427
3428                 task_barrier_full(&hive->tb);
3429                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3430         }
3431
3432 fail:
3433         if (adev->asic_reset_res)
3434                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3435                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3436         amdgpu_put_xgmi_hive(hive);
3437 }
3438
3439 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3440 {
3441         char *input = amdgpu_lockup_timeout;
3442         char *timeout_setting = NULL;
3443         int index = 0;
3444         long timeout;
3445         int ret = 0;
3446
3447         /*
3448          * By default timeout for non compute jobs is 10000
3449          * and 60000 for compute jobs.
3450          * In SR-IOV or passthrough mode, timeout for compute
3451          * jobs are 60000 by default.
3452          */
3453         adev->gfx_timeout = msecs_to_jiffies(10000);
3454         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3455         if (amdgpu_sriov_vf(adev))
3456                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3457                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3458         else
3459                 adev->compute_timeout =  msecs_to_jiffies(60000);
3460
3461         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3462                 while ((timeout_setting = strsep(&input, ",")) &&
3463                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3464                         ret = kstrtol(timeout_setting, 0, &timeout);
3465                         if (ret)
3466                                 return ret;
3467
3468                         if (timeout == 0) {
3469                                 index++;
3470                                 continue;
3471                         } else if (timeout < 0) {
3472                                 timeout = MAX_SCHEDULE_TIMEOUT;
3473                                 dev_warn(adev->dev, "lockup timeout disabled");
3474                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3475                         } else {
3476                                 timeout = msecs_to_jiffies(timeout);
3477                         }
3478
3479                         switch (index++) {
3480                         case 0:
3481                                 adev->gfx_timeout = timeout;
3482                                 break;
3483                         case 1:
3484                                 adev->compute_timeout = timeout;
3485                                 break;
3486                         case 2:
3487                                 adev->sdma_timeout = timeout;
3488                                 break;
3489                         case 3:
3490                                 adev->video_timeout = timeout;
3491                                 break;
3492                         default:
3493                                 break;
3494                         }
3495                 }
3496                 /*
3497                  * There is only one value specified and
3498                  * it should apply to all non-compute jobs.
3499                  */
3500                 if (index == 1) {
3501                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3502                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3503                                 adev->compute_timeout = adev->gfx_timeout;
3504                 }
3505         }
3506
3507         return ret;
3508 }
3509
3510 /**
3511  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3512  *
3513  * @adev: amdgpu_device pointer
3514  *
3515  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3516  */
3517 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3518 {
3519         struct iommu_domain *domain;
3520
3521         domain = iommu_get_domain_for_dev(adev->dev);
3522         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3523                 adev->ram_is_direct_mapped = true;
3524 }
3525
3526 static const struct attribute *amdgpu_dev_attributes[] = {
3527         &dev_attr_product_name.attr,
3528         &dev_attr_product_number.attr,
3529         &dev_attr_serial_number.attr,
3530         &dev_attr_pcie_replay_count.attr,
3531         NULL
3532 };
3533
3534 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3535 {
3536         if (amdgpu_mcbp == 1)
3537                 adev->gfx.mcbp = true;
3538
3539         if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3540             (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3541             adev->gfx.num_gfx_rings)
3542                 adev->gfx.mcbp = true;
3543
3544         if (amdgpu_sriov_vf(adev))
3545                 adev->gfx.mcbp = true;
3546
3547         if (adev->gfx.mcbp)
3548                 DRM_INFO("MCBP is enabled\n");
3549 }
3550
3551 /**
3552  * amdgpu_device_init - initialize the driver
3553  *
3554  * @adev: amdgpu_device pointer
3555  * @flags: driver flags
3556  *
3557  * Initializes the driver info and hw (all asics).
3558  * Returns 0 for success or an error on failure.
3559  * Called at driver startup.
3560  */
3561 int amdgpu_device_init(struct amdgpu_device *adev,
3562                        uint32_t flags)
3563 {
3564         struct drm_device *ddev = adev_to_drm(adev);
3565         struct pci_dev *pdev = adev->pdev;
3566         int r, i;
3567         bool px = false;
3568         u32 max_MBps;
3569         int tmp;
3570
3571         adev->shutdown = false;
3572         adev->flags = flags;
3573
3574         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3575                 adev->asic_type = amdgpu_force_asic_type;
3576         else
3577                 adev->asic_type = flags & AMD_ASIC_MASK;
3578
3579         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3580         if (amdgpu_emu_mode == 1)
3581                 adev->usec_timeout *= 10;
3582         adev->gmc.gart_size = 512 * 1024 * 1024;
3583         adev->accel_working = false;
3584         adev->num_rings = 0;
3585         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3586         adev->mman.buffer_funcs = NULL;
3587         adev->mman.buffer_funcs_ring = NULL;
3588         adev->vm_manager.vm_pte_funcs = NULL;
3589         adev->vm_manager.vm_pte_num_scheds = 0;
3590         adev->gmc.gmc_funcs = NULL;
3591         adev->harvest_ip_mask = 0x0;
3592         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3593         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3594
3595         adev->smc_rreg = &amdgpu_invalid_rreg;
3596         adev->smc_wreg = &amdgpu_invalid_wreg;
3597         adev->pcie_rreg = &amdgpu_invalid_rreg;
3598         adev->pcie_wreg = &amdgpu_invalid_wreg;
3599         adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3600         adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3601         adev->pciep_rreg = &amdgpu_invalid_rreg;
3602         adev->pciep_wreg = &amdgpu_invalid_wreg;
3603         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3604         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3605         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3606         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3607         adev->didt_rreg = &amdgpu_invalid_rreg;
3608         adev->didt_wreg = &amdgpu_invalid_wreg;
3609         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3610         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3611         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3612         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3613
3614         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3615                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3616                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3617
3618         /* mutex initialization are all done here so we
3619          * can recall function without having locking issues
3620          */
3621         mutex_init(&adev->firmware.mutex);
3622         mutex_init(&adev->pm.mutex);
3623         mutex_init(&adev->gfx.gpu_clock_mutex);
3624         mutex_init(&adev->srbm_mutex);
3625         mutex_init(&adev->gfx.pipe_reserve_mutex);
3626         mutex_init(&adev->gfx.gfx_off_mutex);
3627         mutex_init(&adev->gfx.partition_mutex);
3628         mutex_init(&adev->grbm_idx_mutex);
3629         mutex_init(&adev->mn_lock);
3630         mutex_init(&adev->virt.vf_errors.lock);
3631         hash_init(adev->mn_hash);
3632         mutex_init(&adev->psp.mutex);
3633         mutex_init(&adev->notifier_lock);
3634         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3635         mutex_init(&adev->benchmark_mutex);
3636
3637         amdgpu_device_init_apu_flags(adev);
3638
3639         r = amdgpu_device_check_arguments(adev);
3640         if (r)
3641                 return r;
3642
3643         spin_lock_init(&adev->mmio_idx_lock);
3644         spin_lock_init(&adev->smc_idx_lock);
3645         spin_lock_init(&adev->pcie_idx_lock);
3646         spin_lock_init(&adev->uvd_ctx_idx_lock);
3647         spin_lock_init(&adev->didt_idx_lock);
3648         spin_lock_init(&adev->gc_cac_idx_lock);
3649         spin_lock_init(&adev->se_cac_idx_lock);
3650         spin_lock_init(&adev->audio_endpt_idx_lock);
3651         spin_lock_init(&adev->mm_stats.lock);
3652
3653         INIT_LIST_HEAD(&adev->shadow_list);
3654         mutex_init(&adev->shadow_list_lock);
3655
3656         INIT_LIST_HEAD(&adev->reset_list);
3657
3658         INIT_LIST_HEAD(&adev->ras_list);
3659
3660         INIT_DELAYED_WORK(&adev->delayed_init_work,
3661                           amdgpu_device_delayed_init_work_handler);
3662         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3663                           amdgpu_device_delay_enable_gfx_off);
3664
3665         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3666
3667         adev->gfx.gfx_off_req_count = 1;
3668         adev->gfx.gfx_off_residency = 0;
3669         adev->gfx.gfx_off_entrycount = 0;
3670         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3671
3672         atomic_set(&adev->throttling_logging_enabled, 1);
3673         /*
3674          * If throttling continues, logging will be performed every minute
3675          * to avoid log flooding. "-1" is subtracted since the thermal
3676          * throttling interrupt comes every second. Thus, the total logging
3677          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3678          * for throttling interrupt) = 60 seconds.
3679          */
3680         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3681         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3682
3683         /* Registers mapping */
3684         /* TODO: block userspace mapping of io register */
3685         if (adev->asic_type >= CHIP_BONAIRE) {
3686                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3687                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3688         } else {
3689                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3690                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3691         }
3692
3693         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3694                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3695
3696         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3697         if (!adev->rmmio)
3698                 return -ENOMEM;
3699
3700         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3701         DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
3702
3703         /*
3704          * Reset domain needs to be present early, before XGMI hive discovered
3705          * (if any) and intitialized to use reset sem and in_gpu reset flag
3706          * early on during init and before calling to RREG32.
3707          */
3708         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3709         if (!adev->reset_domain)
3710                 return -ENOMEM;
3711
3712         /* detect hw virtualization here */
3713         amdgpu_detect_virtualization(adev);
3714
3715         amdgpu_device_get_pcie_info(adev);
3716
3717         r = amdgpu_device_get_job_timeout_settings(adev);
3718         if (r) {
3719                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3720                 return r;
3721         }
3722
3723         /* early init functions */
3724         r = amdgpu_device_ip_early_init(adev);
3725         if (r)
3726                 return r;
3727
3728         amdgpu_device_set_mcbp(adev);
3729
3730         /* Get rid of things like offb */
3731         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3732         if (r)
3733                 return r;
3734
3735         /* Enable TMZ based on IP_VERSION */
3736         amdgpu_gmc_tmz_set(adev);
3737
3738         amdgpu_gmc_noretry_set(adev);
3739         /* Need to get xgmi info early to decide the reset behavior*/
3740         if (adev->gmc.xgmi.supported) {
3741                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3742                 if (r)
3743                         return r;
3744         }
3745
3746         /* enable PCIE atomic ops */
3747         if (amdgpu_sriov_vf(adev)) {
3748                 if (adev->virt.fw_reserve.p_pf2vf)
3749                         adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3750                                                       adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3751                                 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3752         /* APUs w/ gfx9 onwards doesn't reply on PCIe atomics, rather it is a
3753          * internal path natively support atomics, set have_atomics_support to true.
3754          */
3755         } else if ((adev->flags & AMD_IS_APU) &&
3756                    (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3757                 adev->have_atomics_support = true;
3758         } else {
3759                 adev->have_atomics_support =
3760                         !pci_enable_atomic_ops_to_root(adev->pdev,
3761                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3762                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3763         }
3764
3765         if (!adev->have_atomics_support)
3766                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3767
3768         /* doorbell bar mapping and doorbell index init*/
3769         amdgpu_doorbell_init(adev);
3770
3771         if (amdgpu_emu_mode == 1) {
3772                 /* post the asic on emulation mode */
3773                 emu_soc_asic_init(adev);
3774                 goto fence_driver_init;
3775         }
3776
3777         amdgpu_reset_init(adev);
3778
3779         /* detect if we are with an SRIOV vbios */
3780         if (adev->bios)
3781                 amdgpu_device_detect_sriov_bios(adev);
3782
3783         /* check if we need to reset the asic
3784          *  E.g., driver was not cleanly unloaded previously, etc.
3785          */
3786         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3787                 if (adev->gmc.xgmi.num_physical_nodes) {
3788                         dev_info(adev->dev, "Pending hive reset.\n");
3789                         adev->gmc.xgmi.pending_reset = true;
3790                         /* Only need to init necessary block for SMU to handle the reset */
3791                         for (i = 0; i < adev->num_ip_blocks; i++) {
3792                                 if (!adev->ip_blocks[i].status.valid)
3793                                         continue;
3794                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3795                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3796                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3797                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3798                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3799                                                 adev->ip_blocks[i].version->funcs->name);
3800                                         adev->ip_blocks[i].status.hw = true;
3801                                 }
3802                         }
3803                 } else {
3804                         tmp = amdgpu_reset_method;
3805                         /* It should do a default reset when loading or reloading the driver,
3806                          * regardless of the module parameter reset_method.
3807                          */
3808                         amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3809                         r = amdgpu_asic_reset(adev);
3810                         amdgpu_reset_method = tmp;
3811                         if (r) {
3812                                 dev_err(adev->dev, "asic reset on init failed\n");
3813                                 goto failed;
3814                         }
3815                 }
3816         }
3817
3818         /* Post card if necessary */
3819         if (amdgpu_device_need_post(adev)) {
3820                 if (!adev->bios) {
3821                         dev_err(adev->dev, "no vBIOS found\n");
3822                         r = -EINVAL;
3823                         goto failed;
3824                 }
3825                 DRM_INFO("GPU posting now...\n");
3826                 r = amdgpu_device_asic_init(adev);
3827                 if (r) {
3828                         dev_err(adev->dev, "gpu post error!\n");
3829                         goto failed;
3830                 }
3831         }
3832
3833         if (adev->bios) {
3834                 if (adev->is_atom_fw) {
3835                         /* Initialize clocks */
3836                         r = amdgpu_atomfirmware_get_clock_info(adev);
3837                         if (r) {
3838                                 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3839                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3840                                 goto failed;
3841                         }
3842                 } else {
3843                         /* Initialize clocks */
3844                         r = amdgpu_atombios_get_clock_info(adev);
3845                         if (r) {
3846                                 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3847                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3848                                 goto failed;
3849                         }
3850                         /* init i2c buses */
3851                         if (!amdgpu_device_has_dc_support(adev))
3852                                 amdgpu_atombios_i2c_init(adev);
3853                 }
3854         }
3855
3856 fence_driver_init:
3857         /* Fence driver */
3858         r = amdgpu_fence_driver_sw_init(adev);
3859         if (r) {
3860                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3861                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3862                 goto failed;
3863         }
3864
3865         /* init the mode config */
3866         drm_mode_config_init(adev_to_drm(adev));
3867
3868         r = amdgpu_device_ip_init(adev);
3869         if (r) {
3870                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3871                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3872                 goto release_ras_con;
3873         }
3874
3875         amdgpu_fence_driver_hw_init(adev);
3876
3877         dev_info(adev->dev,
3878                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3879                         adev->gfx.config.max_shader_engines,
3880                         adev->gfx.config.max_sh_per_se,
3881                         adev->gfx.config.max_cu_per_sh,
3882                         adev->gfx.cu_info.number);
3883
3884         adev->accel_working = true;
3885
3886         amdgpu_vm_check_compute_bug(adev);
3887
3888         /* Initialize the buffer migration limit. */
3889         if (amdgpu_moverate >= 0)
3890                 max_MBps = amdgpu_moverate;
3891         else
3892                 max_MBps = 8; /* Allow 8 MB/s. */
3893         /* Get a log2 for easy divisions. */
3894         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3895
3896         r = amdgpu_atombios_sysfs_init(adev);
3897         if (r)
3898                 drm_err(&adev->ddev,
3899                         "registering atombios sysfs failed (%d).\n", r);
3900
3901         r = amdgpu_pm_sysfs_init(adev);
3902         if (r)
3903                 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3904
3905         r = amdgpu_ucode_sysfs_init(adev);
3906         if (r) {
3907                 adev->ucode_sysfs_en = false;
3908                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3909         } else
3910                 adev->ucode_sysfs_en = true;
3911
3912         /*
3913          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3914          * Otherwise the mgpu fan boost feature will be skipped due to the
3915          * gpu instance is counted less.
3916          */
3917         amdgpu_register_gpu_instance(adev);
3918
3919         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3920          * explicit gating rather than handling it automatically.
3921          */
3922         if (!adev->gmc.xgmi.pending_reset) {
3923                 r = amdgpu_device_ip_late_init(adev);
3924                 if (r) {
3925                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3926                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3927                         goto release_ras_con;
3928                 }
3929                 /* must succeed. */
3930                 amdgpu_ras_resume(adev);
3931                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3932                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3933         }
3934
3935         if (amdgpu_sriov_vf(adev)) {
3936                 amdgpu_virt_release_full_gpu(adev, true);
3937                 flush_delayed_work(&adev->delayed_init_work);
3938         }
3939
3940         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3941         if (r)
3942                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3943
3944         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3945                 r = amdgpu_pmu_init(adev);
3946         if (r)
3947                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3948
3949         /* Have stored pci confspace at hand for restore in sudden PCI error */
3950         if (amdgpu_device_cache_pci_state(adev->pdev))
3951                 pci_restore_state(pdev);
3952
3953         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3954         /* this will fail for cards that aren't VGA class devices, just
3955          * ignore it
3956          */
3957         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3958                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3959
3960         px = amdgpu_device_supports_px(ddev);
3961
3962         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3963                                 apple_gmux_detect(NULL, NULL)))
3964                 vga_switcheroo_register_client(adev->pdev,
3965                                                &amdgpu_switcheroo_ops, px);
3966
3967         if (px)
3968                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3969
3970         if (adev->gmc.xgmi.pending_reset)
3971                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3972                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3973
3974         amdgpu_device_check_iommu_direct_map(adev);
3975
3976         return 0;
3977
3978 release_ras_con:
3979         if (amdgpu_sriov_vf(adev))
3980                 amdgpu_virt_release_full_gpu(adev, true);
3981
3982         /* failed in exclusive mode due to timeout */
3983         if (amdgpu_sriov_vf(adev) &&
3984                 !amdgpu_sriov_runtime(adev) &&
3985                 amdgpu_virt_mmio_blocked(adev) &&
3986                 !amdgpu_virt_wait_reset(adev)) {
3987                 dev_err(adev->dev, "VF exclusive mode timeout\n");
3988                 /* Don't send request since VF is inactive. */
3989                 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3990                 adev->virt.ops = NULL;
3991                 r = -EAGAIN;
3992         }
3993         amdgpu_release_ras_context(adev);
3994
3995 failed:
3996         amdgpu_vf_error_trans_all(adev);
3997
3998         return r;
3999 }
4000
4001 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4002 {
4003
4004         /* Clear all CPU mappings pointing to this device */
4005         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4006
4007         /* Unmap all mapped bars - Doorbell, registers and VRAM */
4008         amdgpu_doorbell_fini(adev);
4009
4010         iounmap(adev->rmmio);
4011         adev->rmmio = NULL;
4012         if (adev->mman.aper_base_kaddr)
4013                 iounmap(adev->mman.aper_base_kaddr);
4014         adev->mman.aper_base_kaddr = NULL;
4015
4016         /* Memory manager related */
4017         if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4018                 arch_phys_wc_del(adev->gmc.vram_mtrr);
4019                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4020         }
4021 }
4022
4023 /**
4024  * amdgpu_device_fini_hw - tear down the driver
4025  *
4026  * @adev: amdgpu_device pointer
4027  *
4028  * Tear down the driver info (all asics).
4029  * Called at driver shutdown.
4030  */
4031 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4032 {
4033         dev_info(adev->dev, "amdgpu: finishing device.\n");
4034         flush_delayed_work(&adev->delayed_init_work);
4035         adev->shutdown = true;
4036
4037         /* make sure IB test finished before entering exclusive mode
4038          * to avoid preemption on IB test
4039          */
4040         if (amdgpu_sriov_vf(adev)) {
4041                 amdgpu_virt_request_full_gpu(adev, false);
4042                 amdgpu_virt_fini_data_exchange(adev);
4043         }
4044
4045         /* disable all interrupts */
4046         amdgpu_irq_disable_all(adev);
4047         if (adev->mode_info.mode_config_initialized) {
4048                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4049                         drm_helper_force_disable_all(adev_to_drm(adev));
4050                 else
4051                         drm_atomic_helper_shutdown(adev_to_drm(adev));
4052         }
4053         amdgpu_fence_driver_hw_fini(adev);
4054
4055         if (adev->mman.initialized)
4056                 drain_workqueue(adev->mman.bdev.wq);
4057
4058         if (adev->pm.sysfs_initialized)
4059                 amdgpu_pm_sysfs_fini(adev);
4060         if (adev->ucode_sysfs_en)
4061                 amdgpu_ucode_sysfs_fini(adev);
4062         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4063
4064         /* disable ras feature must before hw fini */
4065         amdgpu_ras_pre_fini(adev);
4066
4067         amdgpu_device_ip_fini_early(adev);
4068
4069         amdgpu_irq_fini_hw(adev);
4070
4071         if (adev->mman.initialized)
4072                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4073
4074         amdgpu_gart_dummy_page_fini(adev);
4075
4076         if (drm_dev_is_unplugged(adev_to_drm(adev)))
4077                 amdgpu_device_unmap_mmio(adev);
4078
4079 }
4080
4081 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4082 {
4083         int idx;
4084         bool px;
4085
4086         amdgpu_fence_driver_sw_fini(adev);
4087         amdgpu_device_ip_fini(adev);
4088         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4089         adev->accel_working = false;
4090         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4091
4092         amdgpu_reset_fini(adev);
4093
4094         /* free i2c buses */
4095         if (!amdgpu_device_has_dc_support(adev))
4096                 amdgpu_i2c_fini(adev);
4097
4098         if (amdgpu_emu_mode != 1)
4099                 amdgpu_atombios_fini(adev);
4100
4101         kfree(adev->bios);
4102         adev->bios = NULL;
4103
4104         px = amdgpu_device_supports_px(adev_to_drm(adev));
4105
4106         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4107                                 apple_gmux_detect(NULL, NULL)))
4108                 vga_switcheroo_unregister_client(adev->pdev);
4109
4110         if (px)
4111                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4112
4113         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4114                 vga_client_unregister(adev->pdev);
4115
4116         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4117
4118                 iounmap(adev->rmmio);
4119                 adev->rmmio = NULL;
4120                 amdgpu_doorbell_fini(adev);
4121                 drm_dev_exit(idx);
4122         }
4123
4124         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4125                 amdgpu_pmu_fini(adev);
4126         if (adev->mman.discovery_bin)
4127                 amdgpu_discovery_fini(adev);
4128
4129         amdgpu_reset_put_reset_domain(adev->reset_domain);
4130         adev->reset_domain = NULL;
4131
4132         kfree(adev->pci_state);
4133
4134 }
4135
4136 /**
4137  * amdgpu_device_evict_resources - evict device resources
4138  * @adev: amdgpu device object
4139  *
4140  * Evicts all ttm device resources(vram BOs, gart table) from the lru list
4141  * of the vram memory type. Mainly used for evicting device resources
4142  * at suspend time.
4143  *
4144  */
4145 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4146 {
4147         int ret;
4148
4149         /* No need to evict vram on APUs for suspend to ram or s2idle */
4150         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4151                 return 0;
4152
4153         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4154         if (ret)
4155                 DRM_WARN("evicting device resources failed\n");
4156         return ret;
4157 }
4158
4159 /*
4160  * Suspend & resume.
4161  */
4162 /**
4163  * amdgpu_device_suspend - initiate device suspend
4164  *
4165  * @dev: drm dev pointer
4166  * @fbcon : notify the fbdev of suspend
4167  *
4168  * Puts the hw in the suspend state (all asics).
4169  * Returns 0 for success or an error on failure.
4170  * Called at driver suspend.
4171  */
4172 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4173 {
4174         struct amdgpu_device *adev = drm_to_adev(dev);
4175         int r = 0;
4176
4177         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4178                 return 0;
4179
4180         adev->in_suspend = true;
4181
4182         /* Evict the majority of BOs before grabbing the full access */
4183         r = amdgpu_device_evict_resources(adev);
4184         if (r)
4185                 return r;
4186
4187         if (amdgpu_sriov_vf(adev)) {
4188                 amdgpu_virt_fini_data_exchange(adev);
4189                 r = amdgpu_virt_request_full_gpu(adev, false);
4190                 if (r)
4191                         return r;
4192         }
4193
4194         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4195                 DRM_WARN("smart shift update failed\n");
4196
4197         if (fbcon)
4198                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4199
4200         cancel_delayed_work_sync(&adev->delayed_init_work);
4201
4202         amdgpu_ras_suspend(adev);
4203
4204         amdgpu_device_ip_suspend_phase1(adev);
4205
4206         if (!adev->in_s0ix)
4207                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4208
4209         r = amdgpu_device_evict_resources(adev);
4210         if (r)
4211                 return r;
4212
4213         amdgpu_fence_driver_hw_fini(adev);
4214
4215         amdgpu_device_ip_suspend_phase2(adev);
4216
4217         if (amdgpu_sriov_vf(adev))
4218                 amdgpu_virt_release_full_gpu(adev, false);
4219
4220         return 0;
4221 }
4222
4223 /**
4224  * amdgpu_device_resume - initiate device resume
4225  *
4226  * @dev: drm dev pointer
4227  * @fbcon : notify the fbdev of resume
4228  *
4229  * Bring the hw back to operating state (all asics).
4230  * Returns 0 for success or an error on failure.
4231  * Called at driver resume.
4232  */
4233 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4234 {
4235         struct amdgpu_device *adev = drm_to_adev(dev);
4236         int r = 0;
4237
4238         if (amdgpu_sriov_vf(adev)) {
4239                 r = amdgpu_virt_request_full_gpu(adev, true);
4240                 if (r)
4241                         return r;
4242         }
4243
4244         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4245                 return 0;
4246
4247         if (adev->in_s0ix)
4248                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4249
4250         /* post card */
4251         if (amdgpu_device_need_post(adev)) {
4252                 r = amdgpu_device_asic_init(adev);
4253                 if (r)
4254                         dev_err(adev->dev, "amdgpu asic init failed\n");
4255         }
4256
4257         r = amdgpu_device_ip_resume(adev);
4258
4259         if (r) {
4260                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4261                 goto exit;
4262         }
4263         amdgpu_fence_driver_hw_init(adev);
4264
4265         r = amdgpu_device_ip_late_init(adev);
4266         if (r)
4267                 goto exit;
4268
4269         queue_delayed_work(system_wq, &adev->delayed_init_work,
4270                            msecs_to_jiffies(AMDGPU_RESUME_MS));
4271
4272         if (!adev->in_s0ix) {
4273                 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4274                 if (r)
4275                         goto exit;
4276         }
4277
4278 exit:
4279         if (amdgpu_sriov_vf(adev)) {
4280                 amdgpu_virt_init_data_exchange(adev);
4281                 amdgpu_virt_release_full_gpu(adev, true);
4282         }
4283
4284         if (r)
4285                 return r;
4286
4287         /* Make sure IB tests flushed */
4288         flush_delayed_work(&adev->delayed_init_work);
4289
4290         if (fbcon)
4291                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4292
4293         amdgpu_ras_resume(adev);
4294
4295         if (adev->mode_info.num_crtc) {
4296                 /*
4297                  * Most of the connector probing functions try to acquire runtime pm
4298                  * refs to ensure that the GPU is powered on when connector polling is
4299                  * performed. Since we're calling this from a runtime PM callback,
4300                  * trying to acquire rpm refs will cause us to deadlock.
4301                  *
4302                  * Since we're guaranteed to be holding the rpm lock, it's safe to
4303                  * temporarily disable the rpm helpers so this doesn't deadlock us.
4304                  */
4305 #ifdef CONFIG_PM
4306                 dev->dev->power.disable_depth++;
4307 #endif
4308                 if (!adev->dc_enabled)
4309                         drm_helper_hpd_irq_event(dev);
4310                 else
4311                         drm_kms_helper_hotplug_event(dev);
4312 #ifdef CONFIG_PM
4313                 dev->dev->power.disable_depth--;
4314 #endif
4315         }
4316         adev->in_suspend = false;
4317
4318         if (adev->enable_mes)
4319                 amdgpu_mes_self_test(adev);
4320
4321         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4322                 DRM_WARN("smart shift update failed\n");
4323
4324         return 0;
4325 }
4326
4327 /**
4328  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4329  *
4330  * @adev: amdgpu_device pointer
4331  *
4332  * The list of all the hardware IPs that make up the asic is walked and
4333  * the check_soft_reset callbacks are run.  check_soft_reset determines
4334  * if the asic is still hung or not.
4335  * Returns true if any of the IPs are still in a hung state, false if not.
4336  */
4337 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4338 {
4339         int i;
4340         bool asic_hang = false;
4341
4342         if (amdgpu_sriov_vf(adev))
4343                 return true;
4344
4345         if (amdgpu_asic_need_full_reset(adev))
4346                 return true;
4347
4348         for (i = 0; i < adev->num_ip_blocks; i++) {
4349                 if (!adev->ip_blocks[i].status.valid)
4350                         continue;
4351                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4352                         adev->ip_blocks[i].status.hang =
4353                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4354                 if (adev->ip_blocks[i].status.hang) {
4355                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4356                         asic_hang = true;
4357                 }
4358         }
4359         return asic_hang;
4360 }
4361
4362 /**
4363  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4364  *
4365  * @adev: amdgpu_device pointer
4366  *
4367  * The list of all the hardware IPs that make up the asic is walked and the
4368  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4369  * handles any IP specific hardware or software state changes that are
4370  * necessary for a soft reset to succeed.
4371  * Returns 0 on success, negative error code on failure.
4372  */
4373 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4374 {
4375         int i, r = 0;
4376
4377         for (i = 0; i < adev->num_ip_blocks; i++) {
4378                 if (!adev->ip_blocks[i].status.valid)
4379                         continue;
4380                 if (adev->ip_blocks[i].status.hang &&
4381                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4382                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4383                         if (r)
4384                                 return r;
4385                 }
4386         }
4387
4388         return 0;
4389 }
4390
4391 /**
4392  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4393  *
4394  * @adev: amdgpu_device pointer
4395  *
4396  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4397  * reset is necessary to recover.
4398  * Returns true if a full asic reset is required, false if not.
4399  */
4400 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4401 {
4402         int i;
4403
4404         if (amdgpu_asic_need_full_reset(adev))
4405                 return true;
4406
4407         for (i = 0; i < adev->num_ip_blocks; i++) {
4408                 if (!adev->ip_blocks[i].status.valid)
4409                         continue;
4410                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4411                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4412                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4413                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4414                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4415                         if (adev->ip_blocks[i].status.hang) {
4416                                 dev_info(adev->dev, "Some block need full reset!\n");
4417                                 return true;
4418                         }
4419                 }
4420         }
4421         return false;
4422 }
4423
4424 /**
4425  * amdgpu_device_ip_soft_reset - do a soft reset
4426  *
4427  * @adev: amdgpu_device pointer
4428  *
4429  * The list of all the hardware IPs that make up the asic is walked and the
4430  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4431  * IP specific hardware or software state changes that are necessary to soft
4432  * reset the IP.
4433  * Returns 0 on success, negative error code on failure.
4434  */
4435 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4436 {
4437         int i, r = 0;
4438
4439         for (i = 0; i < adev->num_ip_blocks; i++) {
4440                 if (!adev->ip_blocks[i].status.valid)
4441                         continue;
4442                 if (adev->ip_blocks[i].status.hang &&
4443                     adev->ip_blocks[i].version->funcs->soft_reset) {
4444                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4445                         if (r)
4446                                 return r;
4447                 }
4448         }
4449
4450         return 0;
4451 }
4452
4453 /**
4454  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4455  *
4456  * @adev: amdgpu_device pointer
4457  *
4458  * The list of all the hardware IPs that make up the asic is walked and the
4459  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4460  * handles any IP specific hardware or software state changes that are
4461  * necessary after the IP has been soft reset.
4462  * Returns 0 on success, negative error code on failure.
4463  */
4464 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4465 {
4466         int i, r = 0;
4467
4468         for (i = 0; i < adev->num_ip_blocks; i++) {
4469                 if (!adev->ip_blocks[i].status.valid)
4470                         continue;
4471                 if (adev->ip_blocks[i].status.hang &&
4472                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4473                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4474                 if (r)
4475                         return r;
4476         }
4477
4478         return 0;
4479 }
4480
4481 /**
4482  * amdgpu_device_recover_vram - Recover some VRAM contents
4483  *
4484  * @adev: amdgpu_device pointer
4485  *
4486  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4487  * restore things like GPUVM page tables after a GPU reset where
4488  * the contents of VRAM might be lost.
4489  *
4490  * Returns:
4491  * 0 on success, negative error code on failure.
4492  */
4493 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4494 {
4495         struct dma_fence *fence = NULL, *next = NULL;
4496         struct amdgpu_bo *shadow;
4497         struct amdgpu_bo_vm *vmbo;
4498         long r = 1, tmo;
4499
4500         if (amdgpu_sriov_runtime(adev))
4501                 tmo = msecs_to_jiffies(8000);
4502         else
4503                 tmo = msecs_to_jiffies(100);
4504
4505         dev_info(adev->dev, "recover vram bo from shadow start\n");
4506         mutex_lock(&adev->shadow_list_lock);
4507         list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4508                 /* If vm is compute context or adev is APU, shadow will be NULL */
4509                 if (!vmbo->shadow)
4510                         continue;
4511                 shadow = vmbo->shadow;
4512
4513                 /* No need to recover an evicted BO */
4514                 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4515                     shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4516                     shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4517                         continue;
4518
4519                 r = amdgpu_bo_restore_shadow(shadow, &next);
4520                 if (r)
4521                         break;
4522
4523                 if (fence) {
4524                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4525                         dma_fence_put(fence);
4526                         fence = next;
4527                         if (tmo == 0) {
4528                                 r = -ETIMEDOUT;
4529                                 break;
4530                         } else if (tmo < 0) {
4531                                 r = tmo;
4532                                 break;
4533                         }
4534                 } else {
4535                         fence = next;
4536                 }
4537         }
4538         mutex_unlock(&adev->shadow_list_lock);
4539
4540         if (fence)
4541                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4542         dma_fence_put(fence);
4543
4544         if (r < 0 || tmo <= 0) {
4545                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4546                 return -EIO;
4547         }
4548
4549         dev_info(adev->dev, "recover vram bo from shadow done\n");
4550         return 0;
4551 }
4552
4553
4554 /**
4555  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4556  *
4557  * @adev: amdgpu_device pointer
4558  * @from_hypervisor: request from hypervisor
4559  *
4560  * do VF FLR and reinitialize Asic
4561  * return 0 means succeeded otherwise failed
4562  */
4563 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4564                                      bool from_hypervisor)
4565 {
4566         int r;
4567         struct amdgpu_hive_info *hive = NULL;
4568         int retry_limit = 0;
4569
4570 retry:
4571         amdgpu_amdkfd_pre_reset(adev);
4572
4573         if (from_hypervisor)
4574                 r = amdgpu_virt_request_full_gpu(adev, true);
4575         else
4576                 r = amdgpu_virt_reset_gpu(adev);
4577         if (r)
4578                 return r;
4579
4580         /* some sw clean up VF needs to do before recover */
4581         amdgpu_virt_post_reset(adev);
4582
4583         /* Resume IP prior to SMC */
4584         r = amdgpu_device_ip_reinit_early_sriov(adev);
4585         if (r)
4586                 goto error;
4587
4588         amdgpu_virt_init_data_exchange(adev);
4589
4590         r = amdgpu_device_fw_loading(adev);
4591         if (r)
4592                 return r;
4593
4594         /* now we are okay to resume SMC/CP/SDMA */
4595         r = amdgpu_device_ip_reinit_late_sriov(adev);
4596         if (r)
4597                 goto error;
4598
4599         hive = amdgpu_get_xgmi_hive(adev);
4600         /* Update PSP FW topology after reset */
4601         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4602                 r = amdgpu_xgmi_update_topology(hive, adev);
4603
4604         if (hive)
4605                 amdgpu_put_xgmi_hive(hive);
4606
4607         if (!r) {
4608                 amdgpu_irq_gpu_reset_resume_helper(adev);
4609                 r = amdgpu_ib_ring_tests(adev);
4610
4611                 amdgpu_amdkfd_post_reset(adev);
4612         }
4613
4614 error:
4615         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4616                 amdgpu_inc_vram_lost(adev);
4617                 r = amdgpu_device_recover_vram(adev);
4618         }
4619         amdgpu_virt_release_full_gpu(adev, true);
4620
4621         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4622                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4623                         retry_limit++;
4624                         goto retry;
4625                 } else
4626                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4627         }
4628
4629         return r;
4630 }
4631
4632 /**
4633  * amdgpu_device_has_job_running - check if there is any job in mirror list
4634  *
4635  * @adev: amdgpu_device pointer
4636  *
4637  * check if there is any job in mirror list
4638  */
4639 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4640 {
4641         int i;
4642         struct drm_sched_job *job;
4643
4644         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4645                 struct amdgpu_ring *ring = adev->rings[i];
4646
4647                 if (!ring || !ring->sched.thread)
4648                         continue;
4649
4650                 spin_lock(&ring->sched.job_list_lock);
4651                 job = list_first_entry_or_null(&ring->sched.pending_list,
4652                                                struct drm_sched_job, list);
4653                 spin_unlock(&ring->sched.job_list_lock);
4654                 if (job)
4655                         return true;
4656         }
4657         return false;
4658 }
4659
4660 /**
4661  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4662  *
4663  * @adev: amdgpu_device pointer
4664  *
4665  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4666  * a hung GPU.
4667  */
4668 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4669 {
4670
4671         if (amdgpu_gpu_recovery == 0)
4672                 goto disabled;
4673
4674         /* Skip soft reset check in fatal error mode */
4675         if (!amdgpu_ras_is_poison_mode_supported(adev))
4676                 return true;
4677
4678         if (amdgpu_sriov_vf(adev))
4679                 return true;
4680
4681         if (amdgpu_gpu_recovery == -1) {
4682                 switch (adev->asic_type) {
4683 #ifdef CONFIG_DRM_AMDGPU_SI
4684                 case CHIP_VERDE:
4685                 case CHIP_TAHITI:
4686                 case CHIP_PITCAIRN:
4687                 case CHIP_OLAND:
4688                 case CHIP_HAINAN:
4689 #endif
4690 #ifdef CONFIG_DRM_AMDGPU_CIK
4691                 case CHIP_KAVERI:
4692                 case CHIP_KABINI:
4693                 case CHIP_MULLINS:
4694 #endif
4695                 case CHIP_CARRIZO:
4696                 case CHIP_STONEY:
4697                 case CHIP_CYAN_SKILLFISH:
4698                         goto disabled;
4699                 default:
4700                         break;
4701                 }
4702         }
4703
4704         return true;
4705
4706 disabled:
4707                 dev_info(adev->dev, "GPU recovery disabled.\n");
4708                 return false;
4709 }
4710
4711 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4712 {
4713         u32 i;
4714         int ret = 0;
4715
4716         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4717
4718         dev_info(adev->dev, "GPU mode1 reset\n");
4719
4720         /* disable BM */
4721         pci_clear_master(adev->pdev);
4722
4723         amdgpu_device_cache_pci_state(adev->pdev);
4724
4725         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4726                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4727                 ret = amdgpu_dpm_mode1_reset(adev);
4728         } else {
4729                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4730                 ret = psp_gpu_reset(adev);
4731         }
4732
4733         if (ret)
4734                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4735
4736         amdgpu_device_load_pci_state(adev->pdev);
4737
4738         /* wait for asic to come out of reset */
4739         for (i = 0; i < adev->usec_timeout; i++) {
4740                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4741
4742                 if (memsize != 0xffffffff)
4743                         break;
4744                 udelay(1);
4745         }
4746
4747         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4748         return ret;
4749 }
4750
4751 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4752                                  struct amdgpu_reset_context *reset_context)
4753 {
4754         int i, r = 0;
4755         struct amdgpu_job *job = NULL;
4756         bool need_full_reset =
4757                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4758
4759         if (reset_context->reset_req_dev == adev)
4760                 job = reset_context->job;
4761
4762         if (amdgpu_sriov_vf(adev)) {
4763                 /* stop the data exchange thread */
4764                 amdgpu_virt_fini_data_exchange(adev);
4765         }
4766
4767         amdgpu_fence_driver_isr_toggle(adev, true);
4768
4769         /* block all schedulers and reset given job's ring */
4770         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4771                 struct amdgpu_ring *ring = adev->rings[i];
4772
4773                 if (!ring || !ring->sched.thread)
4774                         continue;
4775
4776                 /* Clear job fence from fence drv to avoid force_completion
4777                  * leave NULL and vm flush fence in fence drv
4778                  */
4779                 amdgpu_fence_driver_clear_job_fences(ring);
4780
4781                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4782                 amdgpu_fence_driver_force_completion(ring);
4783         }
4784
4785         amdgpu_fence_driver_isr_toggle(adev, false);
4786
4787         if (job && job->vm)
4788                 drm_sched_increase_karma(&job->base);
4789
4790         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4791         /* If reset handler not implemented, continue; otherwise return */
4792         if (r == -EOPNOTSUPP)
4793                 r = 0;
4794         else
4795                 return r;
4796
4797         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4798         if (!amdgpu_sriov_vf(adev)) {
4799
4800                 if (!need_full_reset)
4801                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4802
4803                 if (!need_full_reset && amdgpu_gpu_recovery &&
4804                     amdgpu_device_ip_check_soft_reset(adev)) {
4805                         amdgpu_device_ip_pre_soft_reset(adev);
4806                         r = amdgpu_device_ip_soft_reset(adev);
4807                         amdgpu_device_ip_post_soft_reset(adev);
4808                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4809                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4810                                 need_full_reset = true;
4811                         }
4812                 }
4813
4814                 if (need_full_reset)
4815                         r = amdgpu_device_ip_suspend(adev);
4816                 if (need_full_reset)
4817                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4818                 else
4819                         clear_bit(AMDGPU_NEED_FULL_RESET,
4820                                   &reset_context->flags);
4821         }
4822
4823         return r;
4824 }
4825
4826 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4827 {
4828         int i;
4829
4830         lockdep_assert_held(&adev->reset_domain->sem);
4831
4832         for (i = 0; i < adev->num_regs; i++) {
4833                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4834                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4835                                              adev->reset_dump_reg_value[i]);
4836         }
4837
4838         return 0;
4839 }
4840
4841 #ifdef CONFIG_DEV_COREDUMP
4842 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4843                 size_t count, void *data, size_t datalen)
4844 {
4845         struct drm_printer p;
4846         struct amdgpu_device *adev = data;
4847         struct drm_print_iterator iter;
4848         int i;
4849
4850         iter.data = buffer;
4851         iter.offset = 0;
4852         iter.start = offset;
4853         iter.remain = count;
4854
4855         p = drm_coredump_printer(&iter);
4856
4857         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4858         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4859         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4860         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4861         if (adev->reset_task_info.pid)
4862                 drm_printf(&p, "process_name: %s PID: %d\n",
4863                            adev->reset_task_info.process_name,
4864                            adev->reset_task_info.pid);
4865
4866         if (adev->reset_vram_lost)
4867                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4868         if (adev->num_regs) {
4869                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4870
4871                 for (i = 0; i < adev->num_regs; i++)
4872                         drm_printf(&p, "0x%08x: 0x%08x\n",
4873                                    adev->reset_dump_reg_list[i],
4874                                    adev->reset_dump_reg_value[i]);
4875         }
4876
4877         return count - iter.remain;
4878 }
4879
4880 static void amdgpu_devcoredump_free(void *data)
4881 {
4882 }
4883
4884 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4885 {
4886         struct drm_device *dev = adev_to_drm(adev);
4887
4888         ktime_get_ts64(&adev->reset_time);
4889         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4890                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4891 }
4892 #endif
4893
4894 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4895                          struct amdgpu_reset_context *reset_context)
4896 {
4897         struct amdgpu_device *tmp_adev = NULL;
4898         bool need_full_reset, skip_hw_reset, vram_lost = false;
4899         int r = 0;
4900         bool gpu_reset_for_dev_remove = 0;
4901
4902         /* Try reset handler method first */
4903         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4904                                     reset_list);
4905         amdgpu_reset_reg_dumps(tmp_adev);
4906
4907         reset_context->reset_device_list = device_list_handle;
4908         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4909         /* If reset handler not implemented, continue; otherwise return */
4910         if (r == -EOPNOTSUPP)
4911                 r = 0;
4912         else
4913                 return r;
4914
4915         /* Reset handler not implemented, use the default method */
4916         need_full_reset =
4917                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4918         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4919
4920         gpu_reset_for_dev_remove =
4921                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4922                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4923
4924         /*
4925          * ASIC reset has to be done on all XGMI hive nodes ASAP
4926          * to allow proper links negotiation in FW (within 1 sec)
4927          */
4928         if (!skip_hw_reset && need_full_reset) {
4929                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4930                         /* For XGMI run all resets in parallel to speed up the process */
4931                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4932                                 tmp_adev->gmc.xgmi.pending_reset = false;
4933                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4934                                         r = -EALREADY;
4935                         } else
4936                                 r = amdgpu_asic_reset(tmp_adev);
4937
4938                         if (r) {
4939                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4940                                          r, adev_to_drm(tmp_adev)->unique);
4941                                 break;
4942                         }
4943                 }
4944
4945                 /* For XGMI wait for all resets to complete before proceed */
4946                 if (!r) {
4947                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4948                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4949                                         flush_work(&tmp_adev->xgmi_reset_work);
4950                                         r = tmp_adev->asic_reset_res;
4951                                         if (r)
4952                                                 break;
4953                                 }
4954                         }
4955                 }
4956         }
4957
4958         if (!r && amdgpu_ras_intr_triggered()) {
4959                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4960                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4961                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4962                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4963                 }
4964
4965                 amdgpu_ras_intr_cleared();
4966         }
4967
4968         /* Since the mode1 reset affects base ip blocks, the
4969          * phase1 ip blocks need to be resumed. Otherwise there
4970          * will be a BIOS signature error and the psp bootloader
4971          * can't load kdb on the next amdgpu install.
4972          */
4973         if (gpu_reset_for_dev_remove) {
4974                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4975                         amdgpu_device_ip_resume_phase1(tmp_adev);
4976
4977                 goto end;
4978         }
4979
4980         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4981                 if (need_full_reset) {
4982                         /* post card */
4983                         r = amdgpu_device_asic_init(tmp_adev);
4984                         if (r) {
4985                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4986                         } else {
4987                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4988                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4989                                 if (r)
4990                                         goto out;
4991
4992                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4993                                 if (r)
4994                                         goto out;
4995
4996                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4997 #ifdef CONFIG_DEV_COREDUMP
4998                                 tmp_adev->reset_vram_lost = vram_lost;
4999                                 memset(&tmp_adev->reset_task_info, 0,
5000                                                 sizeof(tmp_adev->reset_task_info));
5001                                 if (reset_context->job && reset_context->job->vm)
5002                                         tmp_adev->reset_task_info =
5003                                                 reset_context->job->vm->task_info;
5004                                 amdgpu_reset_capture_coredumpm(tmp_adev);
5005 #endif
5006                                 if (vram_lost) {
5007                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
5008                                         amdgpu_inc_vram_lost(tmp_adev);
5009                                 }
5010
5011                                 r = amdgpu_device_fw_loading(tmp_adev);
5012                                 if (r)
5013                                         return r;
5014
5015                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5016                                 if (r)
5017                                         goto out;
5018
5019                                 if (vram_lost)
5020                                         amdgpu_device_fill_reset_magic(tmp_adev);
5021
5022                                 /*
5023                                  * Add this ASIC as tracked as reset was already
5024                                  * complete successfully.
5025                                  */
5026                                 amdgpu_register_gpu_instance(tmp_adev);
5027
5028                                 if (!reset_context->hive &&
5029                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5030                                         amdgpu_xgmi_add_device(tmp_adev);
5031
5032                                 r = amdgpu_device_ip_late_init(tmp_adev);
5033                                 if (r)
5034                                         goto out;
5035
5036                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5037
5038                                 /*
5039                                  * The GPU enters bad state once faulty pages
5040                                  * by ECC has reached the threshold, and ras
5041                                  * recovery is scheduled next. So add one check
5042                                  * here to break recovery if it indeed exceeds
5043                                  * bad page threshold, and remind user to
5044                                  * retire this GPU or setting one bigger
5045                                  * bad_page_threshold value to fix this once
5046                                  * probing driver again.
5047                                  */
5048                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5049                                         /* must succeed. */
5050                                         amdgpu_ras_resume(tmp_adev);
5051                                 } else {
5052                                         r = -EINVAL;
5053                                         goto out;
5054                                 }
5055
5056                                 /* Update PSP FW topology after reset */
5057                                 if (reset_context->hive &&
5058                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5059                                         r = amdgpu_xgmi_update_topology(
5060                                                 reset_context->hive, tmp_adev);
5061                         }
5062                 }
5063
5064 out:
5065                 if (!r) {
5066                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5067                         r = amdgpu_ib_ring_tests(tmp_adev);
5068                         if (r) {
5069                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5070                                 need_full_reset = true;
5071                                 r = -EAGAIN;
5072                                 goto end;
5073                         }
5074                 }
5075
5076                 if (!r)
5077                         r = amdgpu_device_recover_vram(tmp_adev);
5078                 else
5079                         tmp_adev->asic_reset_res = r;
5080         }
5081
5082 end:
5083         if (need_full_reset)
5084                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5085         else
5086                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5087         return r;
5088 }
5089
5090 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5091 {
5092
5093         switch (amdgpu_asic_reset_method(adev)) {
5094         case AMD_RESET_METHOD_MODE1:
5095                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5096                 break;
5097         case AMD_RESET_METHOD_MODE2:
5098                 adev->mp1_state = PP_MP1_STATE_RESET;
5099                 break;
5100         default:
5101                 adev->mp1_state = PP_MP1_STATE_NONE;
5102                 break;
5103         }
5104 }
5105
5106 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5107 {
5108         amdgpu_vf_error_trans_all(adev);
5109         adev->mp1_state = PP_MP1_STATE_NONE;
5110 }
5111
5112 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5113 {
5114         struct pci_dev *p = NULL;
5115
5116         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5117                         adev->pdev->bus->number, 1);
5118         if (p) {
5119                 pm_runtime_enable(&(p->dev));
5120                 pm_runtime_resume(&(p->dev));
5121         }
5122
5123         pci_dev_put(p);
5124 }
5125
5126 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5127 {
5128         enum amd_reset_method reset_method;
5129         struct pci_dev *p = NULL;
5130         u64 expires;
5131
5132         /*
5133          * For now, only BACO and mode1 reset are confirmed
5134          * to suffer the audio issue without proper suspended.
5135          */
5136         reset_method = amdgpu_asic_reset_method(adev);
5137         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5138              (reset_method != AMD_RESET_METHOD_MODE1))
5139                 return -EINVAL;
5140
5141         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5142                         adev->pdev->bus->number, 1);
5143         if (!p)
5144                 return -ENODEV;
5145
5146         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5147         if (!expires)
5148                 /*
5149                  * If we cannot get the audio device autosuspend delay,
5150                  * a fixed 4S interval will be used. Considering 3S is
5151                  * the audio controller default autosuspend delay setting.
5152                  * 4S used here is guaranteed to cover that.
5153                  */
5154                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5155
5156         while (!pm_runtime_status_suspended(&(p->dev))) {
5157                 if (!pm_runtime_suspend(&(p->dev)))
5158                         break;
5159
5160                 if (expires < ktime_get_mono_fast_ns()) {
5161                         dev_warn(adev->dev, "failed to suspend display audio\n");
5162                         pci_dev_put(p);
5163                         /* TODO: abort the succeeding gpu reset? */
5164                         return -ETIMEDOUT;
5165                 }
5166         }
5167
5168         pm_runtime_disable(&(p->dev));
5169
5170         pci_dev_put(p);
5171         return 0;
5172 }
5173
5174 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5175 {
5176         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5177
5178 #if defined(CONFIG_DEBUG_FS)
5179         if (!amdgpu_sriov_vf(adev))
5180                 cancel_work(&adev->reset_work);
5181 #endif
5182
5183         if (adev->kfd.dev)
5184                 cancel_work(&adev->kfd.reset_work);
5185
5186         if (amdgpu_sriov_vf(adev))
5187                 cancel_work(&adev->virt.flr_work);
5188
5189         if (con && adev->ras_enabled)
5190                 cancel_work(&con->recovery_work);
5191
5192 }
5193
5194 /**
5195  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5196  *
5197  * @adev: amdgpu_device pointer
5198  * @job: which job trigger hang
5199  * @reset_context: amdgpu reset context pointer
5200  *
5201  * Attempt to reset the GPU if it has hung (all asics).
5202  * Attempt to do soft-reset or full-reset and reinitialize Asic
5203  * Returns 0 for success or an error on failure.
5204  */
5205
5206 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5207                               struct amdgpu_job *job,
5208                               struct amdgpu_reset_context *reset_context)
5209 {
5210         struct list_head device_list, *device_list_handle =  NULL;
5211         bool job_signaled = false;
5212         struct amdgpu_hive_info *hive = NULL;
5213         struct amdgpu_device *tmp_adev = NULL;
5214         int i, r = 0;
5215         bool need_emergency_restart = false;
5216         bool audio_suspended = false;
5217         bool gpu_reset_for_dev_remove = false;
5218
5219         gpu_reset_for_dev_remove =
5220                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5221                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5222
5223         /*
5224          * Special case: RAS triggered and full reset isn't supported
5225          */
5226         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5227
5228         /*
5229          * Flush RAM to disk so that after reboot
5230          * the user can read log and see why the system rebooted.
5231          */
5232         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5233                 DRM_WARN("Emergency reboot.");
5234
5235                 ksys_sync_helper();
5236                 emergency_restart();
5237         }
5238
5239         dev_info(adev->dev, "GPU %s begin!\n",
5240                 need_emergency_restart ? "jobs stop":"reset");
5241
5242         if (!amdgpu_sriov_vf(adev))
5243                 hive = amdgpu_get_xgmi_hive(adev);
5244         if (hive)
5245                 mutex_lock(&hive->hive_lock);
5246
5247         reset_context->job = job;
5248         reset_context->hive = hive;
5249         /*
5250          * Build list of devices to reset.
5251          * In case we are in XGMI hive mode, resort the device list
5252          * to put adev in the 1st position.
5253          */
5254         INIT_LIST_HEAD(&device_list);
5255         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5256                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5257                         list_add_tail(&tmp_adev->reset_list, &device_list);
5258                         if (gpu_reset_for_dev_remove && adev->shutdown)
5259                                 tmp_adev->shutdown = true;
5260                 }
5261                 if (!list_is_first(&adev->reset_list, &device_list))
5262                         list_rotate_to_front(&adev->reset_list, &device_list);
5263                 device_list_handle = &device_list;
5264         } else {
5265                 list_add_tail(&adev->reset_list, &device_list);
5266                 device_list_handle = &device_list;
5267         }
5268
5269         /* We need to lock reset domain only once both for XGMI and single device */
5270         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5271                                     reset_list);
5272         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5273
5274         /* block all schedulers and reset given job's ring */
5275         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5276
5277                 amdgpu_device_set_mp1_state(tmp_adev);
5278
5279                 /*
5280                  * Try to put the audio codec into suspend state
5281                  * before gpu reset started.
5282                  *
5283                  * Due to the power domain of the graphics device
5284                  * is shared with AZ power domain. Without this,
5285                  * we may change the audio hardware from behind
5286                  * the audio driver's back. That will trigger
5287                  * some audio codec errors.
5288                  */
5289                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5290                         audio_suspended = true;
5291
5292                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5293
5294                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5295
5296                 if (!amdgpu_sriov_vf(tmp_adev))
5297                         amdgpu_amdkfd_pre_reset(tmp_adev);
5298
5299                 /*
5300                  * Mark these ASICs to be reseted as untracked first
5301                  * And add them back after reset completed
5302                  */
5303                 amdgpu_unregister_gpu_instance(tmp_adev);
5304
5305                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5306
5307                 /* disable ras on ALL IPs */
5308                 if (!need_emergency_restart &&
5309                       amdgpu_device_ip_need_full_reset(tmp_adev))
5310                         amdgpu_ras_suspend(tmp_adev);
5311
5312                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5313                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5314
5315                         if (!ring || !ring->sched.thread)
5316                                 continue;
5317
5318                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5319
5320                         if (need_emergency_restart)
5321                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5322                 }
5323                 atomic_inc(&tmp_adev->gpu_reset_counter);
5324         }
5325
5326         if (need_emergency_restart)
5327                 goto skip_sched_resume;
5328
5329         /*
5330          * Must check guilty signal here since after this point all old
5331          * HW fences are force signaled.
5332          *
5333          * job->base holds a reference to parent fence
5334          */
5335         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5336                 job_signaled = true;
5337                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5338                 goto skip_hw_reset;
5339         }
5340
5341 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5342         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5343                 if (gpu_reset_for_dev_remove) {
5344                         /* Workaroud for ASICs need to disable SMC first */
5345                         amdgpu_device_smu_fini_early(tmp_adev);
5346                 }
5347                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5348                 /*TODO Should we stop ?*/
5349                 if (r) {
5350                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5351                                   r, adev_to_drm(tmp_adev)->unique);
5352                         tmp_adev->asic_reset_res = r;
5353                 }
5354
5355                 /*
5356                  * Drop all pending non scheduler resets. Scheduler resets
5357                  * were already dropped during drm_sched_stop
5358                  */
5359                 amdgpu_device_stop_pending_resets(tmp_adev);
5360         }
5361
5362         /* Actual ASIC resets if needed.*/
5363         /* Host driver will handle XGMI hive reset for SRIOV */
5364         if (amdgpu_sriov_vf(adev)) {
5365                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5366                 if (r)
5367                         adev->asic_reset_res = r;
5368
5369                 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5370                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5371                     adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5372                         amdgpu_ras_resume(adev);
5373         } else {
5374                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5375                 if (r && r == -EAGAIN)
5376                         goto retry;
5377
5378                 if (!r && gpu_reset_for_dev_remove)
5379                         goto recover_end;
5380         }
5381
5382 skip_hw_reset:
5383
5384         /* Post ASIC reset for all devs .*/
5385         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5386
5387                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5388                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5389
5390                         if (!ring || !ring->sched.thread)
5391                                 continue;
5392
5393                         drm_sched_start(&ring->sched, true);
5394                 }
5395
5396                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5397                         amdgpu_mes_self_test(tmp_adev);
5398
5399                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5400                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5401
5402                 if (tmp_adev->asic_reset_res)
5403                         r = tmp_adev->asic_reset_res;
5404
5405                 tmp_adev->asic_reset_res = 0;
5406
5407                 if (r) {
5408                         /* bad news, how to tell it to userspace ? */
5409                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5410                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5411                 } else {
5412                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5413                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5414                                 DRM_WARN("smart shift update failed\n");
5415                 }
5416         }
5417
5418 skip_sched_resume:
5419         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5420                 /* unlock kfd: SRIOV would do it separately */
5421                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5422                         amdgpu_amdkfd_post_reset(tmp_adev);
5423
5424                 /* kfd_post_reset will do nothing if kfd device is not initialized,
5425                  * need to bring up kfd here if it's not be initialized before
5426                  */
5427                 if (!adev->kfd.init_complete)
5428                         amdgpu_amdkfd_device_init(adev);
5429
5430                 if (audio_suspended)
5431                         amdgpu_device_resume_display_audio(tmp_adev);
5432
5433                 amdgpu_device_unset_mp1_state(tmp_adev);
5434
5435                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5436         }
5437
5438 recover_end:
5439         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5440                                             reset_list);
5441         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5442
5443         if (hive) {
5444                 mutex_unlock(&hive->hive_lock);
5445                 amdgpu_put_xgmi_hive(hive);
5446         }
5447
5448         if (r)
5449                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5450
5451         atomic_set(&adev->reset_domain->reset_res, r);
5452         return r;
5453 }
5454
5455 /**
5456  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
5457  *
5458  * @adev: amdgpu_device pointer
5459  *
5460  * Fetchs and stores in the driver the PCIE capabilities (gen speed
5461  * and lanes) of the slot the device is in. Handles APUs and
5462  * virtualized environments where PCIE config space may not be available.
5463  */
5464 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5465 {
5466         struct pci_dev *pdev;
5467         enum pci_bus_speed speed_cap, platform_speed_cap;
5468         enum pcie_link_width platform_link_width;
5469
5470         if (amdgpu_pcie_gen_cap)
5471                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5472
5473         if (amdgpu_pcie_lane_cap)
5474                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5475
5476         /* covers APUs as well */
5477         if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5478                 if (adev->pm.pcie_gen_mask == 0)
5479                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5480                 if (adev->pm.pcie_mlw_mask == 0)
5481                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5482                 return;
5483         }
5484
5485         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5486                 return;
5487
5488         pcie_bandwidth_available(adev->pdev, NULL,
5489                                  &platform_speed_cap, &platform_link_width);
5490
5491         if (adev->pm.pcie_gen_mask == 0) {
5492                 /* asic caps */
5493                 pdev = adev->pdev;
5494                 speed_cap = pcie_get_speed_cap(pdev);
5495                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5496                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5497                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5498                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5499                 } else {
5500                         if (speed_cap == PCIE_SPEED_32_0GT)
5501                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5502                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5503                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5504                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5505                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5506                         else if (speed_cap == PCIE_SPEED_16_0GT)
5507                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5508                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5509                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5510                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5511                         else if (speed_cap == PCIE_SPEED_8_0GT)
5512                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5513                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5514                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5515                         else if (speed_cap == PCIE_SPEED_5_0GT)
5516                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5517                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5518                         else
5519                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5520                 }
5521                 /* platform caps */
5522                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5523                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5524                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5525                 } else {
5526                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5527                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5528                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5529                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5530                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5531                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5532                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5533                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5534                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5535                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5536                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5537                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5538                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5539                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5540                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5541                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5542                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5543                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5544                         else
5545                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5546
5547                 }
5548         }
5549         if (adev->pm.pcie_mlw_mask == 0) {
5550                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5551                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5552                 } else {
5553                         switch (platform_link_width) {
5554                         case PCIE_LNK_X32:
5555                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5556                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5557                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5558                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5559                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5560                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5561                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5562                                 break;
5563                         case PCIE_LNK_X16:
5564                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5565                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5566                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5567                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5568                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5569                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5570                                 break;
5571                         case PCIE_LNK_X12:
5572                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5573                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5574                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5575                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5576                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5577                                 break;
5578                         case PCIE_LNK_X8:
5579                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5580                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5581                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5582                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5583                                 break;
5584                         case PCIE_LNK_X4:
5585                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5586                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5587                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5588                                 break;
5589                         case PCIE_LNK_X2:
5590                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5591                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5592                                 break;
5593                         case PCIE_LNK_X1:
5594                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5595                                 break;
5596                         default:
5597                                 break;
5598                         }
5599                 }
5600         }
5601 }
5602
5603 /**
5604  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
5605  *
5606  * @adev: amdgpu_device pointer
5607  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
5608  *
5609  * Return true if @peer_adev can access (DMA) @adev through the PCIe
5610  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
5611  * @peer_adev.
5612  */
5613 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
5614                                       struct amdgpu_device *peer_adev)
5615 {
5616 #ifdef CONFIG_HSA_AMD_P2P
5617         uint64_t address_mask = peer_adev->dev->dma_mask ?
5618                 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
5619         resource_size_t aper_limit =
5620                 adev->gmc.aper_base + adev->gmc.aper_size - 1;
5621         bool p2p_access =
5622                 !adev->gmc.xgmi.connected_to_cpu &&
5623                 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
5624
5625         return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
5626                 adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
5627                 !(adev->gmc.aper_base & address_mask ||
5628                   aper_limit & address_mask));
5629 #else
5630         return false;
5631 #endif
5632 }
5633
5634 int amdgpu_device_baco_enter(struct drm_device *dev)
5635 {
5636         struct amdgpu_device *adev = drm_to_adev(dev);
5637         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5638
5639         if (!amdgpu_device_supports_baco(dev))
5640                 return -ENOTSUPP;
5641
5642         if (ras && adev->ras_enabled &&
5643             adev->nbio.funcs->enable_doorbell_interrupt)
5644                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5645
5646         return amdgpu_dpm_baco_enter(adev);
5647 }
5648
5649 int amdgpu_device_baco_exit(struct drm_device *dev)
5650 {
5651         struct amdgpu_device *adev = drm_to_adev(dev);
5652         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5653         int ret = 0;
5654
5655         if (!amdgpu_device_supports_baco(dev))
5656                 return -ENOTSUPP;
5657
5658         ret = amdgpu_dpm_baco_exit(adev);
5659         if (ret)
5660                 return ret;
5661
5662         if (ras && adev->ras_enabled &&
5663             adev->nbio.funcs->enable_doorbell_interrupt)
5664                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5665
5666         if (amdgpu_passthrough(adev) &&
5667             adev->nbio.funcs->clear_doorbell_interrupt)
5668                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5669
5670         return 0;
5671 }
5672
5673 /**
5674  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5675  * @pdev: PCI device struct
5676  * @state: PCI channel state
5677  *
5678  * Description: Called when a PCI error is detected.
5679  *
5680  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5681  */
5682 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5683 {
5684         struct drm_device *dev = pci_get_drvdata(pdev);
5685         struct amdgpu_device *adev = drm_to_adev(dev);
5686         int i;
5687
5688         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5689
5690         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5691                 DRM_WARN("No support for XGMI hive yet...");
5692                 return PCI_ERS_RESULT_DISCONNECT;
5693         }
5694
5695         adev->pci_channel_state = state;
5696
5697         switch (state) {
5698         case pci_channel_io_normal:
5699                 return PCI_ERS_RESULT_CAN_RECOVER;
5700         /* Fatal error, prepare for slot reset */
5701         case pci_channel_io_frozen:
5702                 /*
5703                  * Locking adev->reset_domain->sem will prevent any external access
5704                  * to GPU during PCI error recovery
5705                  */
5706                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5707                 amdgpu_device_set_mp1_state(adev);
5708
5709                 /*
5710                  * Block any work scheduling as we do for regular GPU reset
5711                  * for the duration of the recovery
5712                  */
5713                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5714                         struct amdgpu_ring *ring = adev->rings[i];
5715
5716                         if (!ring || !ring->sched.thread)
5717                                 continue;
5718
5719                         drm_sched_stop(&ring->sched, NULL);
5720                 }
5721                 atomic_inc(&adev->gpu_reset_counter);
5722                 return PCI_ERS_RESULT_NEED_RESET;
5723         case pci_channel_io_perm_failure:
5724                 /* Permanent error, prepare for device removal */
5725                 return PCI_ERS_RESULT_DISCONNECT;
5726         }
5727
5728         return PCI_ERS_RESULT_NEED_RESET;
5729 }
5730
5731 /**
5732  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5733  * @pdev: pointer to PCI device
5734  */
5735 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5736 {
5737
5738         DRM_INFO("PCI error: mmio enabled callback!!\n");
5739
5740         /* TODO - dump whatever for debugging purposes */
5741
5742         /* This called only if amdgpu_pci_error_detected returns
5743          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5744          * works, no need to reset slot.
5745          */
5746
5747         return PCI_ERS_RESULT_RECOVERED;
5748 }
5749
5750 /**
5751  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5752  * @pdev: PCI device struct
5753  *
5754  * Description: This routine is called by the pci error recovery
5755  * code after the PCI slot has been reset, just before we
5756  * should resume normal operations.
5757  */
5758 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5759 {
5760         struct drm_device *dev = pci_get_drvdata(pdev);
5761         struct amdgpu_device *adev = drm_to_adev(dev);
5762         int r, i;
5763         struct amdgpu_reset_context reset_context;
5764         u32 memsize;
5765         struct list_head device_list;
5766
5767         DRM_INFO("PCI error: slot reset callback!!\n");
5768
5769         memset(&reset_context, 0, sizeof(reset_context));
5770
5771         INIT_LIST_HEAD(&device_list);
5772         list_add_tail(&adev->reset_list, &device_list);
5773
5774         /* wait for asic to come out of reset */
5775         msleep(500);
5776
5777         /* Restore PCI confspace */
5778         amdgpu_device_load_pci_state(pdev);
5779
5780         /* confirm  ASIC came out of reset */
5781         for (i = 0; i < adev->usec_timeout; i++) {
5782                 memsize = amdgpu_asic_get_config_memsize(adev);
5783
5784                 if (memsize != 0xffffffff)
5785                         break;
5786                 udelay(1);
5787         }
5788         if (memsize == 0xffffffff) {
5789                 r = -ETIME;
5790                 goto out;
5791         }
5792
5793         reset_context.method = AMD_RESET_METHOD_NONE;
5794         reset_context.reset_req_dev = adev;
5795         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5796         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5797
5798         adev->no_hw_access = true;
5799         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5800         adev->no_hw_access = false;
5801         if (r)
5802                 goto out;
5803
5804         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5805
5806 out:
5807         if (!r) {
5808                 if (amdgpu_device_cache_pci_state(adev->pdev))
5809                         pci_restore_state(adev->pdev);
5810
5811                 DRM_INFO("PCIe error recovery succeeded\n");
5812         } else {
5813                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5814                 amdgpu_device_unset_mp1_state(adev);
5815                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5816         }
5817
5818         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5819 }
5820
5821 /**
5822  * amdgpu_pci_resume() - resume normal ops after PCI reset
5823  * @pdev: pointer to PCI device
5824  *
5825  * Called when the error recovery driver tells us that its
5826  * OK to resume normal operation.
5827  */
5828 void amdgpu_pci_resume(struct pci_dev *pdev)
5829 {
5830         struct drm_device *dev = pci_get_drvdata(pdev);
5831         struct amdgpu_device *adev = drm_to_adev(dev);
5832         int i;
5833
5834
5835         DRM_INFO("PCI error: resume callback!!\n");
5836
5837         /* Only continue execution for the case of pci_channel_io_frozen */
5838         if (adev->pci_channel_state != pci_channel_io_frozen)
5839                 return;
5840
5841         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5842                 struct amdgpu_ring *ring = adev->rings[i];
5843
5844                 if (!ring || !ring->sched.thread)
5845                         continue;
5846
5847                 drm_sched_start(&ring->sched, true);
5848         }
5849
5850         amdgpu_device_unset_mp1_state(adev);
5851         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5852 }
5853
5854 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5855 {
5856         struct drm_device *dev = pci_get_drvdata(pdev);
5857         struct amdgpu_device *adev = drm_to_adev(dev);
5858         int r;
5859
5860         r = pci_save_state(pdev);
5861         if (!r) {
5862                 kfree(adev->pci_state);
5863
5864                 adev->pci_state = pci_store_saved_state(pdev);
5865
5866                 if (!adev->pci_state) {
5867                         DRM_ERROR("Failed to store PCI saved state");
5868                         return false;
5869                 }
5870         } else {
5871                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5872                 return false;
5873         }
5874
5875         return true;
5876 }
5877
5878 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5879 {
5880         struct drm_device *dev = pci_get_drvdata(pdev);
5881         struct amdgpu_device *adev = drm_to_adev(dev);
5882         int r;
5883
5884         if (!adev->pci_state)
5885                 return false;
5886
5887         r = pci_load_saved_state(pdev, adev->pci_state);
5888
5889         if (!r) {
5890                 pci_restore_state(pdev);
5891         } else {
5892                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5893                 return false;
5894         }
5895
5896         return true;
5897 }
5898
5899 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5900                 struct amdgpu_ring *ring)
5901 {
5902 #ifdef CONFIG_X86_64
5903         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5904                 return;
5905 #endif
5906         if (adev->gmc.xgmi.connected_to_cpu)
5907                 return;
5908
5909         if (ring && ring->funcs->emit_hdp_flush)
5910                 amdgpu_ring_emit_hdp_flush(ring);
5911         else
5912                 amdgpu_asic_flush_hdp(adev, ring);
5913 }
5914
5915 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5916                 struct amdgpu_ring *ring)
5917 {
5918 #ifdef CONFIG_X86_64
5919         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5920                 return;
5921 #endif
5922         if (adev->gmc.xgmi.connected_to_cpu)
5923                 return;
5924
5925         amdgpu_asic_invalidate_hdp(adev, ring);
5926 }
5927
5928 int amdgpu_in_reset(struct amdgpu_device *adev)
5929 {
5930         return atomic_read(&adev->reset_domain->in_gpu_reset);
5931 }
5932
5933 /**
5934  * amdgpu_device_halt() - bring hardware to some kind of halt state
5935  *
5936  * @adev: amdgpu_device pointer
5937  *
5938  * Bring hardware to some kind of halt state so that no one can touch it
5939  * any more. It will help to maintain error context when error occurred.
5940  * Compare to a simple hang, the system will keep stable at least for SSH
5941  * access. Then it should be trivial to inspect the hardware state and
5942  * see what's going on. Implemented as following:
5943  *
5944  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5945  *    clears all CPU mappings to device, disallows remappings through page faults
5946  * 2. amdgpu_irq_disable_all() disables all interrupts
5947  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5948  * 4. set adev->no_hw_access to avoid potential crashes after setp 5
5949  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5950  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5951  *    flush any in flight DMA operations
5952  */
5953 void amdgpu_device_halt(struct amdgpu_device *adev)
5954 {
5955         struct pci_dev *pdev = adev->pdev;
5956         struct drm_device *ddev = adev_to_drm(adev);
5957
5958         amdgpu_xcp_dev_unplug(adev);
5959         drm_dev_unplug(ddev);
5960
5961         amdgpu_irq_disable_all(adev);
5962
5963         amdgpu_fence_driver_hw_fini(adev);
5964
5965         adev->no_hw_access = true;
5966
5967         amdgpu_device_unmap_mmio(adev);
5968
5969         pci_disable_device(pdev);
5970         pci_wait_for_pending_transaction(pdev);
5971 }
5972
5973 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5974                                 u32 reg)
5975 {
5976         unsigned long flags, address, data;
5977         u32 r;
5978
5979         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5980         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5981
5982         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5983         WREG32(address, reg * 4);
5984         (void)RREG32(address);
5985         r = RREG32(data);
5986         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5987         return r;
5988 }
5989
5990 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5991                                 u32 reg, u32 v)
5992 {
5993         unsigned long flags, address, data;
5994
5995         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5996         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5997
5998         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5999         WREG32(address, reg * 4);
6000         (void)RREG32(address);
6001         WREG32(data, v);
6002         (void)RREG32(data);
6003         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6004 }
6005
6006 /**
6007  * amdgpu_device_switch_gang - switch to a new gang
6008  * @adev: amdgpu_device pointer
6009  * @gang: the gang to switch to
6010  *
6011  * Try to switch to a new gang.
6012  * Returns: NULL if we switched to the new gang or a reference to the current
6013  * gang leader.
6014  */
6015 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6016                                             struct dma_fence *gang)
6017 {
6018         struct dma_fence *old = NULL;
6019
6020         do {
6021                 dma_fence_put(old);
6022                 rcu_read_lock();
6023                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6024                 rcu_read_unlock();
6025
6026                 if (old == gang)
6027                         break;
6028
6029                 if (!dma_fence_is_signaled(old))
6030                         return old;
6031
6032         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6033                          old, gang) != old);
6034
6035         dma_fence_put(old);
6036         return NULL;
6037 }
6038
6039 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6040 {
6041         switch (adev->asic_type) {
6042 #ifdef CONFIG_DRM_AMDGPU_SI
6043         case CHIP_HAINAN:
6044 #endif
6045         case CHIP_TOPAZ:
6046                 /* chips with no display hardware */
6047                 return false;
6048 #ifdef CONFIG_DRM_AMDGPU_SI
6049         case CHIP_TAHITI:
6050         case CHIP_PITCAIRN:
6051         case CHIP_VERDE:
6052         case CHIP_OLAND:
6053 #endif
6054 #ifdef CONFIG_DRM_AMDGPU_CIK
6055         case CHIP_BONAIRE:
6056         case CHIP_HAWAII:
6057         case CHIP_KAVERI:
6058         case CHIP_KABINI:
6059         case CHIP_MULLINS:
6060 #endif
6061         case CHIP_TONGA:
6062         case CHIP_FIJI:
6063         case CHIP_POLARIS10:
6064         case CHIP_POLARIS11:
6065         case CHIP_POLARIS12:
6066         case CHIP_VEGAM:
6067         case CHIP_CARRIZO:
6068         case CHIP_STONEY:
6069                 /* chips with display hardware */
6070                 return true;
6071         default:
6072                 /* IP discovery */
6073                 if (!adev->ip_versions[DCE_HWIP][0] ||
6074                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6075                         return false;
6076                 return true;
6077         }
6078 }
6079
6080 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6081                 uint32_t inst, uint32_t reg_addr, char reg_name[],
6082                 uint32_t expected_value, uint32_t mask)
6083 {
6084         uint32_t ret = 0;
6085         uint32_t old_ = 0;
6086         uint32_t tmp_ = RREG32(reg_addr);
6087         uint32_t loop = adev->usec_timeout;
6088
6089         while ((tmp_ & (mask)) != (expected_value)) {
6090                 if (old_ != tmp_) {
6091                         loop = adev->usec_timeout;
6092                         old_ = tmp_;
6093                 } else
6094                         udelay(1);
6095                 tmp_ = RREG32(reg_addr);
6096                 loop--;
6097                 if (!loop) {
6098                         DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6099                                   inst, reg_name, (uint32_t)expected_value,
6100                                   (uint32_t)(tmp_ & (mask)));
6101                         ret = -ETIMEDOUT;
6102                         break;
6103                 }
6104         }
6105         return ret;
6106 }