drivers/iommu/intel/iommu.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright © 2006-2014 Intel Corporation.
   4  *
   5  * Authors: David Woodhouse <dwmw2@infradead.org>,
   6  *          Ashok Raj <ashok.raj@intel.com>,
   7  *          Shaohua Li <shaohua.li@intel.com>,
   8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
   9  *          Fenghua Yu <fenghua.yu@intel.com>
  10  *          Joerg Roedel <jroedel@suse.de>
  11  */
  12
  13 #define pr_fmt(fmt)     "DMAR: " fmt
  14 #define dev_fmt(fmt)    pr_fmt(fmt)
  15
  16 #include <linux/crash_dump.h>
  17 #include <linux/dma-direct.h>
  18 #include <linux/dmi.h>
  19 #include <linux/memory.h>
  20 #include <linux/pci.h>
  21 #include <linux/pci-ats.h>
  22 #include <linux/spinlock.h>
  23 #include <linux/syscore_ops.h>
  24 #include <linux/tboot.h>
  25 #include <uapi/linux/iommufd.h>
  26
  27 #include "iommu.h"
  28 #include "../dma-iommu.h"
  29 #include "../irq_remapping.h"
  30 #include "../iommu-sva.h"
  31 #include "pasid.h"
  32 #include "cap_audit.h"
  33 #include "perfmon.h"
  34
  35 #define ROOT_SIZE               VTD_PAGE_SIZE
  36 #define CONTEXT_SIZE            VTD_PAGE_SIZE
  37
  38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  42
  43 #define IOAPIC_RANGE_START      (0xfee00000)
  44 #define IOAPIC_RANGE_END        (0xfeefffff)
  45 #define IOVA_START_ADDR         (0x1000)
  46
  47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  48
  49 #define MAX_AGAW_WIDTH 64
  50 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  51
  52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
  53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
  54
  55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  57 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
  58                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  59 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  60
  61 /* IO virtual address start page frame number */
  62 #define IOVA_START_PFN          (1)
  63
  64 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
  65
  66 /* page table handling */
  67 #define LEVEL_STRIDE            (9)
  68 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
  69
  70 static inline int agaw_to_level(int agaw)
  71 {
  72         return agaw + 2;
  73 }
  74
  75 static inline int agaw_to_width(int agaw)
  76 {
  77         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
  78 }
  79
  80 static inline int width_to_agaw(int width)
  81 {
  82         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
  83 }
  84
  85 static inline unsigned int level_to_offset_bits(int level)
  86 {
  87         return (level - 1) * LEVEL_STRIDE;
  88 }
  89
  90 static inline int pfn_level_offset(u64 pfn, int level)
  91 {
  92         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
  93 }
  94
  95 static inline u64 level_mask(int level)
  96 {
  97         return -1ULL << level_to_offset_bits(level);
  98 }
  99
 100 static inline u64 level_size(int level)
 101 {
 102         return 1ULL << level_to_offset_bits(level);
 103 }
 104
 105 static inline u64 align_to_level(u64 pfn, int level)
 106 {
 107         return (pfn + level_size(level) - 1) & level_mask(level);
 108 }
 109
 110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 111 {
 112         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 113 }
 114
 115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 116    are never going to work. */
 117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
 118 {
 119         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 120 }
 121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
 122 {
 123         return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
 124 }
 125 static inline unsigned long page_to_dma_pfn(struct page *pg)
 126 {
 127         return mm_to_dma_pfn_start(page_to_pfn(pg));
 128 }
 129 static inline unsigned long virt_to_dma_pfn(void *p)
 130 {
 131         return page_to_dma_pfn(virt_to_page(p));
 132 }
 133
 134 static void __init check_tylersburg_isoch(void);
 135 static int rwbf_quirk;
 136
 137 /*
 138  * set to 1 to panic kernel if can't successfully enable VT-d
 139  * (used when kernel is launched w/ TXT)
 140  */
 141 static int force_on = 0;
 142 static int intel_iommu_tboot_noforce;
 143 static int no_platform_optin;
 144
 145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 146
 147 /*
 148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 149  * if marked present.
 150  */
 151 static phys_addr_t root_entry_lctp(struct root_entry *re)
 152 {
 153         if (!(re->lo & 1))
 154                 return 0;
 155
 156         return re->lo & VTD_PAGE_MASK;
 157 }
 158
 159 /*
 160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 161  * if marked present.
 162  */
 163 static phys_addr_t root_entry_uctp(struct root_entry *re)
 164 {
 165         if (!(re->hi & 1))
 166                 return 0;
 167
 168         return re->hi & VTD_PAGE_MASK;
 169 }
 170
 171 static inline void context_set_present(struct context_entry *context)
 172 {
 173         context->lo |= 1;
 174 }
 175
 176 static inline void context_set_fault_enable(struct context_entry *context)
 177 {
 178         context->lo &= (((u64)-1) << 2) | 1;
 179 }
 180
 181 static inline void context_set_translation_type(struct context_entry *context,
 182                                                 unsigned long value)
 183 {
 184         context->lo &= (((u64)-1) << 4) | 3;
 185         context->lo |= (value & 3) << 2;
 186 }
 187
 188 static inline void context_set_address_root(struct context_entry *context,
 189                                             unsigned long value)
 190 {
 191         context->lo &= ~VTD_PAGE_MASK;
 192         context->lo |= value & VTD_PAGE_MASK;
 193 }
 194
 195 static inline void context_set_address_width(struct context_entry *context,
 196                                              unsigned long value)
 197 {
 198         context->hi |= value & 7;
 199 }
 200
 201 static inline void context_set_domain_id(struct context_entry *context,
 202                                          unsigned long value)
 203 {
 204         context->hi |= (value & ((1 << 16) - 1)) << 8;
 205 }
 206
 207 static inline void context_set_pasid(struct context_entry *context)
 208 {
 209         context->lo |= CONTEXT_PASIDE;
 210 }
 211
 212 static inline int context_domain_id(struct context_entry *c)
 213 {
 214         return((c->hi >> 8) & 0xffff);
 215 }
 216
 217 static inline void context_clear_entry(struct context_entry *context)
 218 {
 219         context->lo = 0;
 220         context->hi = 0;
 221 }
 222
 223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 224 {
 225         if (!iommu->copied_tables)
 226                 return false;
 227
 228         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 229 }
 230
 231 static inline void
 232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 233 {
 234         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 235 }
 236
 237 static inline void
 238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
 239 {
 240         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
 241 }
 242
 243 /*
 244  * This domain is a statically identity mapping domain.
 245  *      1. This domain creats a static 1:1 mapping to all usable memory.
 246  *      2. It maps to each iommu if successful.
 247  *      3. Each iommu mapps to this domain if successful.
 248  */
 249 static struct dmar_domain *si_domain;
 250 static int hw_pass_through = 1;
 251
 252 struct dmar_rmrr_unit {
 253         struct list_head list;          /* list of rmrr units   */
 254         struct acpi_dmar_header *hdr;   /* ACPI header          */
 255         u64     base_address;           /* reserved base address*/
 256         u64     end_address;            /* reserved end address */
 257         struct dmar_dev_scope *devices; /* target devices */
 258         int     devices_cnt;            /* target device count */
 259 };
 260
 261 struct dmar_atsr_unit {
 262         struct list_head list;          /* list of ATSR units */
 263         struct acpi_dmar_header *hdr;   /* ACPI header */
 264         struct dmar_dev_scope *devices; /* target devices */
 265         int devices_cnt;                /* target device count */
 266         u8 include_all:1;               /* include all ports */
 267 };
 268
 269 struct dmar_satc_unit {
 270         struct list_head list;          /* list of SATC units */
 271         struct acpi_dmar_header *hdr;   /* ACPI header */
 272         struct dmar_dev_scope *devices; /* target devices */
 273         struct intel_iommu *iommu;      /* the corresponding iommu */
 274         int devices_cnt;                /* target device count */
 275         u8 atc_required:1;              /* ATS is required */
 276 };
 277
 278 static LIST_HEAD(dmar_atsr_units);
 279 static LIST_HEAD(dmar_rmrr_units);
 280 static LIST_HEAD(dmar_satc_units);
 281
 282 #define for_each_rmrr_units(rmrr) \
 283         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 284
 285 static void device_block_translation(struct device *dev);
 286 static void intel_iommu_domain_free(struct iommu_domain *domain);
 287
 288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
 289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
 290
 291 int intel_iommu_enabled = 0;
 292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 293
 294 static int dmar_map_gfx = 1;
 295 static int intel_iommu_superpage = 1;
 296 static int iommu_identity_mapping;
 297 static int iommu_skip_te_disable;
 298
 299 #define IDENTMAP_GFX            2
 300 #define IDENTMAP_AZALIA         4
 301
 302 const struct iommu_ops intel_iommu_ops;
 303
 304 static bool translation_pre_enabled(struct intel_iommu *iommu)
 305 {
 306         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 307 }
 308
 309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 310 {
 311         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 312 }
 313
 314 static void init_translation_status(struct intel_iommu *iommu)
 315 {
 316         u32 gsts;
 317
 318         gsts = readl(iommu->reg + DMAR_GSTS_REG);
 319         if (gsts & DMA_GSTS_TES)
 320                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 321 }
 322
 323 static int __init intel_iommu_setup(char *str)
 324 {
 325         if (!str)
 326                 return -EINVAL;
 327
 328         while (*str) {
 329                 if (!strncmp(str, "on", 2)) {
 330                         dmar_disabled = 0;
 331                         pr_info("IOMMU enabled\n");
 332                 } else if (!strncmp(str, "off", 3)) {
 333                         dmar_disabled = 1;
 334                         no_platform_optin = 1;
 335                         pr_info("IOMMU disabled\n");
 336                 } else if (!strncmp(str, "igfx_off", 8)) {
 337                         dmar_map_gfx = 0;
 338                         pr_info("Disable GFX device mapping\n");
 339                 } else if (!strncmp(str, "forcedac", 8)) {
 340                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
 341                         iommu_dma_forcedac = true;
 342                 } else if (!strncmp(str, "strict", 6)) {
 343                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
 344                         iommu_set_dma_strict();
 345                 } else if (!strncmp(str, "sp_off", 6)) {
 346                         pr_info("Disable supported super page\n");
 347                         intel_iommu_superpage = 0;
 348                 } else if (!strncmp(str, "sm_on", 5)) {
 349                         pr_info("Enable scalable mode if hardware supports\n");
 350                         intel_iommu_sm = 1;
 351                 } else if (!strncmp(str, "sm_off", 6)) {
 352                         pr_info("Scalable mode is disallowed\n");
 353                         intel_iommu_sm = 0;
 354                 } else if (!strncmp(str, "tboot_noforce", 13)) {
 355                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 356                         intel_iommu_tboot_noforce = 1;
 357                 } else {
 358                         pr_notice("Unknown option - '%s'\n", str);
 359                 }
 360
 361                 str += strcspn(str, ",");
 362                 while (*str == ',')
 363                         str++;
 364         }
 365
 366         return 1;
 367 }
 368 __setup("intel_iommu=", intel_iommu_setup);
 369
 370 void *alloc_pgtable_page(int node, gfp_t gfp)
 371 {
 372         struct page *page;
 373         void *vaddr = NULL;
 374
 375         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
 376         if (page)
 377                 vaddr = page_address(page);
 378         return vaddr;
 379 }
 380
 381 void free_pgtable_page(void *vaddr)
 382 {
 383         free_page((unsigned long)vaddr);
 384 }
 385
 386 static inline int domain_type_is_si(struct dmar_domain *domain)
 387 {
 388         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
 389 }
 390
 391 static inline int domain_pfn_supported(struct dmar_domain *domain,
 392                                        unsigned long pfn)
 393 {
 394         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 395
 396         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 397 }
 398
 399 /*
 400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
 401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
 402  * the returned SAGAW.
 403  */
 404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
 405 {
 406         unsigned long fl_sagaw, sl_sagaw;
 407
 408         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
 409         sl_sagaw = cap_sagaw(iommu->cap);
 410
 411         /* Second level only. */
 412         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 413                 return sl_sagaw;
 414
 415         /* First level only. */
 416         if (!ecap_slts(iommu->ecap))
 417                 return fl_sagaw;
 418
 419         return fl_sagaw & sl_sagaw;
 420 }
 421
 422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 423 {
 424         unsigned long sagaw;
 425         int agaw;
 426
 427         sagaw = __iommu_calculate_sagaw(iommu);
 428         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
 429                 if (test_bit(agaw, &sagaw))
 430                         break;
 431         }
 432
 433         return agaw;
 434 }
 435
 436 /*
 437  * Calculate max SAGAW for each iommu.
 438  */
 439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 440 {
 441         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 442 }
 443
 444 /*
 445  * calculate agaw for each iommu.
 446  * "SAGAW" may be different across iommus, use a default agaw, and
 447  * get a supported less agaw for iommus that don't support the default agaw.
 448  */
 449 int iommu_calculate_agaw(struct intel_iommu *iommu)
 450 {
 451         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 452 }
 453
 454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
 455 {
 456         return sm_supported(iommu) ?
 457                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
 458 }
 459
 460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
 461 {
 462         struct iommu_domain_info *info;
 463         struct dmar_drhd_unit *drhd;
 464         struct intel_iommu *iommu;
 465         bool found = false;
 466         unsigned long i;
 467
 468         domain->iommu_coherency = true;
 469         xa_for_each(&domain->iommu_array, i, info) {
 470                 found = true;
 471                 if (!iommu_paging_structure_coherency(info->iommu)) {
 472                         domain->iommu_coherency = false;
 473                         break;
 474                 }
 475         }
 476         if (found)
 477                 return;
 478
 479         /* No hardware attached; use lowest common denominator */
 480         rcu_read_lock();
 481         for_each_active_iommu(iommu, drhd) {
 482                 if (!iommu_paging_structure_coherency(iommu)) {
 483                         domain->iommu_coherency = false;
 484                         break;
 485                 }
 486         }
 487         rcu_read_unlock();
 488 }
 489
 490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
 491                                          struct intel_iommu *skip)
 492 {
 493         struct dmar_drhd_unit *drhd;
 494         struct intel_iommu *iommu;
 495         int mask = 0x3;
 496
 497         if (!intel_iommu_superpage)
 498                 return 0;
 499
 500         /* set iommu_superpage to the smallest common denominator */
 501         rcu_read_lock();
 502         for_each_active_iommu(iommu, drhd) {
 503                 if (iommu != skip) {
 504                         if (domain && domain->use_first_level) {
 505                                 if (!cap_fl1gp_support(iommu->cap))
 506                                         mask = 0x1;
 507                         } else {
 508                                 mask &= cap_super_page_val(iommu->cap);
 509                         }
 510
 511                         if (!mask)
 512                                 break;
 513                 }
 514         }
 515         rcu_read_unlock();
 516
 517         return fls(mask);
 518 }
 519
 520 static int domain_update_device_node(struct dmar_domain *domain)
 521 {
 522         struct device_domain_info *info;
 523         int nid = NUMA_NO_NODE;
 524         unsigned long flags;
 525
 526         spin_lock_irqsave(&domain->lock, flags);
 527         list_for_each_entry(info, &domain->devices, link) {
 528                 /*
 529                  * There could possibly be multiple device numa nodes as devices
 530                  * within the same domain may sit behind different IOMMUs. There
 531                  * isn't perfect answer in such situation, so we select first
 532                  * come first served policy.
 533                  */
 534                 nid = dev_to_node(info->dev);
 535                 if (nid != NUMA_NO_NODE)
 536                         break;
 537         }
 538         spin_unlock_irqrestore(&domain->lock, flags);
 539
 540         return nid;
 541 }
 542
 543 static void domain_update_iotlb(struct dmar_domain *domain);
 544
 545 /* Return the super pagesize bitmap if supported. */
 546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
 547 {
 548         unsigned long bitmap = 0;
 549
 550         /*
 551          * 1-level super page supports page size of 2MiB, 2-level super page
 552          * supports page size of both 2MiB and 1GiB.
 553          */
 554         if (domain->iommu_superpage == 1)
 555                 bitmap |= SZ_2M;
 556         else if (domain->iommu_superpage == 2)
 557                 bitmap |= SZ_2M | SZ_1G;
 558
 559         return bitmap;
 560 }
 561
 562 /* Some capabilities may be different across iommus */
 563 static void domain_update_iommu_cap(struct dmar_domain *domain)
 564 {
 565         domain_update_iommu_coherency(domain);
 566         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
 567
 568         /*
 569          * If RHSA is missing, we should default to the device numa domain
 570          * as fall back.
 571          */
 572         if (domain->nid == NUMA_NO_NODE)
 573                 domain->nid = domain_update_device_node(domain);
 574
 575         /*
 576          * First-level translation restricts the input-address to a
 577          * canonical address (i.e., address bits 63:N have the same
 578          * value as address bit [N-1], where N is 48-bits with 4-level
 579          * paging and 57-bits with 5-level paging). Hence, skip bit
 580          * [N-1].
 581          */
 582         if (domain->use_first_level)
 583                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
 584         else
 585                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
 586
 587         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
 588         domain_update_iotlb(domain);
 589 }
 590
 591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
 592                                          u8 devfn, int alloc)
 593 {
 594         struct root_entry *root = &iommu->root_entry[bus];
 595         struct context_entry *context;
 596         u64 *entry;
 597
 598         /*
 599          * Except that the caller requested to allocate a new entry,
 600          * returning a copied context entry makes no sense.
 601          */
 602         if (!alloc && context_copied(iommu, bus, devfn))
 603                 return NULL;
 604
 605         entry = &root->lo;
 606         if (sm_supported(iommu)) {
 607                 if (devfn >= 0x80) {
 608                         devfn -= 0x80;
 609                         entry = &root->hi;
 610                 }
 611                 devfn *= 2;
 612         }
 613         if (*entry & 1)
 614                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
 615         else {
 616                 unsigned long phy_addr;
 617                 if (!alloc)
 618                         return NULL;
 619
 620                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
 621                 if (!context)
 622                         return NULL;
 623
 624                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 625                 phy_addr = virt_to_phys((void *)context);
 626                 *entry = phy_addr | 1;
 627                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
 628         }
 629         return &context[devfn];
 630 }
 631
 632 /**
 633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 634  *                               sub-hierarchy of a candidate PCI-PCI bridge
 635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 636  * @bridge: the candidate PCI-PCI bridge
 637  *
 638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 639  */
 640 static bool
 641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
 642 {
 643         struct pci_dev *pdev, *pbridge;
 644
 645         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
 646                 return false;
 647
 648         pdev = to_pci_dev(dev);
 649         pbridge = to_pci_dev(bridge);
 650
 651         if (pbridge->subordinate &&
 652             pbridge->subordinate->number <= pdev->bus->number &&
 653             pbridge->subordinate->busn_res.end >= pdev->bus->number)
 654                 return true;
 655
 656         return false;
 657 }
 658
 659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
 660 {
 661         struct dmar_drhd_unit *drhd;
 662         u32 vtbar;
 663         int rc;
 664
 665         /* We know that this device on this chipset has its own IOMMU.
 666          * If we find it under a different IOMMU, then the BIOS is lying
 667          * to us. Hope that the IOMMU for this device is actually
 668          * disabled, and it needs no translation...
 669          */
 670         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
 671         if (rc) {
 672                 /* "can't" happen */
 673                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
 674                 return false;
 675         }
 676         vtbar &= 0xffff0000;
 677
 678         /* we know that the this iommu should be at offset 0xa000 from vtbar */
 679         drhd = dmar_find_matched_drhd_unit(pdev);
 680         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
 681                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
 682                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 683                 return true;
 684         }
 685
 686         return false;
 687 }
 688
 689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
 690 {
 691         if (!iommu || iommu->drhd->ignored)
 692                 return true;
 693
 694         if (dev_is_pci(dev)) {
 695                 struct pci_dev *pdev = to_pci_dev(dev);
 696
 697                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
 698                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
 699                     quirk_ioat_snb_local_iommu(pdev))
 700                         return true;
 701         }
 702
 703         return false;
 704 }
 705
 706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 707 {
 708         struct dmar_drhd_unit *drhd = NULL;
 709         struct pci_dev *pdev = NULL;
 710         struct intel_iommu *iommu;
 711         struct device *tmp;
 712         u16 segment = 0;
 713         int i;
 714
 715         if (!dev)
 716                 return NULL;
 717
 718         if (dev_is_pci(dev)) {
 719                 struct pci_dev *pf_pdev;
 720
 721                 pdev = pci_real_dma_dev(to_pci_dev(dev));
 722
 723                 /* VFs aren't listed in scope tables; we need to look up
 724                  * the PF instead to find the IOMMU. */
 725                 pf_pdev = pci_physfn(pdev);
 726                 dev = &pf_pdev->dev;
 727                 segment = pci_domain_nr(pdev->bus);
 728         } else if (has_acpi_companion(dev))
 729                 dev = &ACPI_COMPANION(dev)->dev;
 730
 731         rcu_read_lock();
 732         for_each_iommu(iommu, drhd) {
 733                 if (pdev && segment != drhd->segment)
 734                         continue;
 735
 736                 for_each_active_dev_scope(drhd->devices,
 737                                           drhd->devices_cnt, i, tmp) {
 738                         if (tmp == dev) {
 739                                 /* For a VF use its original BDF# not that of the PF
 740                                  * which we used for the IOMMU lookup. Strictly speaking
 741                                  * we could do this for all PCI devices; we only need to
 742                                  * get the BDF# from the scope table for ACPI matches. */
 743                                 if (pdev && pdev->is_virtfn)
 744                                         goto got_pdev;
 745
 746                                 if (bus && devfn) {
 747                                         *bus = drhd->devices[i].bus;
 748                                         *devfn = drhd->devices[i].devfn;
 749                                 }
 750                                 goto out;
 751                         }
 752
 753                         if (is_downstream_to_pci_bridge(dev, tmp))
 754                                 goto got_pdev;
 755                 }
 756
 757                 if (pdev && drhd->include_all) {
 758 got_pdev:
 759                         if (bus && devfn) {
 760                                 *bus = pdev->bus->number;
 761                                 *devfn = pdev->devfn;
 762                         }
 763                         goto out;
 764                 }
 765         }
 766         iommu = NULL;
 767 out:
 768         if (iommu_is_dummy(iommu, dev))
 769                 iommu = NULL;
 770
 771         rcu_read_unlock();
 772
 773         return iommu;
 774 }
 775
 776 static void domain_flush_cache(struct dmar_domain *domain,
 777                                void *addr, int size)
 778 {
 779         if (!domain->iommu_coherency)
 780                 clflush_cache_range(addr, size);
 781 }
 782
 783 static void free_context_table(struct intel_iommu *iommu)
 784 {
 785         struct context_entry *context;
 786         int i;
 787
 788         if (!iommu->root_entry)
 789                 return;
 790
 791         for (i = 0; i < ROOT_ENTRY_NR; i++) {
 792                 context = iommu_context_addr(iommu, i, 0, 0);
 793                 if (context)
 794                         free_pgtable_page(context);
 795
 796                 if (!sm_supported(iommu))
 797                         continue;
 798
 799                 context = iommu_context_addr(iommu, i, 0x80, 0);
 800                 if (context)
 801                         free_pgtable_page(context);
 802         }
 803
 804         free_pgtable_page(iommu->root_entry);
 805         iommu->root_entry = NULL;
 806 }
 807
 808 #ifdef CONFIG_DMAR_DEBUG
 809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
 810                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
 811 {
 812         struct dma_pte *pte;
 813         int offset;
 814
 815         while (1) {
 816                 offset = pfn_level_offset(pfn, level);
 817                 pte = &parent[offset];
 818                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
 819                         pr_info("PTE not present at level %d\n", level);
 820                         break;
 821                 }
 822
 823                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
 824
 825                 if (level == 1)
 826                         break;
 827
 828                 parent = phys_to_virt(dma_pte_addr(pte));
 829                 level--;
 830         }
 831 }
 832
 833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
 834                           unsigned long long addr, u32 pasid)
 835 {
 836         struct pasid_dir_entry *dir, *pde;
 837         struct pasid_entry *entries, *pte;
 838         struct context_entry *ctx_entry;
 839         struct root_entry *rt_entry;
 840         int i, dir_index, index, level;
 841         u8 devfn = source_id & 0xff;
 842         u8 bus = source_id >> 8;
 843         struct dma_pte *pgtable;
 844
 845         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
 846
 847         /* root entry dump */
 848         rt_entry = &iommu->root_entry[bus];
 849         if (!rt_entry) {
 850                 pr_info("root table entry is not present\n");
 851                 return;
 852         }
 853
 854         if (sm_supported(iommu))
 855                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
 856                         rt_entry->hi, rt_entry->lo);
 857         else
 858                 pr_info("root entry: 0x%016llx", rt_entry->lo);
 859
 860         /* context entry dump */
 861         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
 862         if (!ctx_entry) {
 863                 pr_info("context table entry is not present\n");
 864                 return;
 865         }
 866
 867         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
 868                 ctx_entry->hi, ctx_entry->lo);
 869
 870         /* legacy mode does not require PASID entries */
 871         if (!sm_supported(iommu)) {
 872                 level = agaw_to_level(ctx_entry->hi & 7);
 873                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 874                 goto pgtable_walk;
 875         }
 876
 877         /* get the pointer to pasid directory entry */
 878         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
 879         if (!dir) {
 880                 pr_info("pasid directory entry is not present\n");
 881                 return;
 882         }
 883         /* For request-without-pasid, get the pasid from context entry */
 884         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
 885                 pasid = IOMMU_NO_PASID;
 886
 887         dir_index = pasid >> PASID_PDE_SHIFT;
 888         pde = &dir[dir_index];
 889         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
 890
 891         /* get the pointer to the pasid table entry */
 892         entries = get_pasid_table_from_pde(pde);
 893         if (!entries) {
 894                 pr_info("pasid table entry is not present\n");
 895                 return;
 896         }
 897         index = pasid & PASID_PTE_MASK;
 898         pte = &entries[index];
 899         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
 900                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
 901
 902         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
 903                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
 904                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
 905         } else {
 906                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
 907                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
 908         }
 909
 910 pgtable_walk:
 911         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 912 }
 913 #endif
 914
 915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 916                                       unsigned long pfn, int *target_level,
 917                                       gfp_t gfp)
 918 {
 919         struct dma_pte *parent, *pte;
 920         int level = agaw_to_level(domain->agaw);
 921         int offset;
 922
 923         if (!domain_pfn_supported(domain, pfn))
 924                 /* Address beyond IOMMU's addressing capabilities. */
 925                 return NULL;
 926
 927         parent = domain->pgd;
 928
 929         while (1) {
 930                 void *tmp_page;
 931
 932                 offset = pfn_level_offset(pfn, level);
 933                 pte = &parent[offset];
 934                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 935                         break;
 936                 if (level == *target_level)
 937                         break;
 938
 939                 if (!dma_pte_present(pte)) {
 940                         uint64_t pteval;
 941
 942                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
 943
 944                         if (!tmp_page)
 945                                 return NULL;
 946
 947                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 948                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 949                         if (domain->use_first_level)
 950                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
 951
 952                         if (cmpxchg64(&pte->val, 0ULL, pteval))
 953                                 /* Someone else set it while we were thinking; use theirs. */
 954                                 free_pgtable_page(tmp_page);
 955                         else
 956                                 domain_flush_cache(domain, pte, sizeof(*pte));
 957                 }
 958                 if (level == 1)
 959                         break;
 960
 961                 parent = phys_to_virt(dma_pte_addr(pte));
 962                 level--;
 963         }
 964
 965         if (!*target_level)
 966                 *target_level = level;
 967
 968         return pte;
 969 }
 970
 971 /* return address's pte at specific level */
 972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 973                                          unsigned long pfn,
 974                                          int level, int *large_page)
 975 {
 976         struct dma_pte *parent, *pte;
 977         int total = agaw_to_level(domain->agaw);
 978         int offset;
 979
 980         parent = domain->pgd;
 981         while (level <= total) {
 982                 offset = pfn_level_offset(pfn, total);
 983                 pte = &parent[offset];
 984                 if (level == total)
 985                         return pte;
 986
 987                 if (!dma_pte_present(pte)) {
 988                         *large_page = total;
 989                         break;
 990                 }
 991
 992                 if (dma_pte_superpage(pte)) {
 993                         *large_page = total;
 994                         return pte;
 995                 }
 996
 997                 parent = phys_to_virt(dma_pte_addr(pte));
 998                 total--;
 999         }
1000         return NULL;
1001 }
1002
1003 /* clear last level pte, a tlb flush should be followed */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005                                 unsigned long start_pfn,
1006                                 unsigned long last_pfn)
1007 {
1008         unsigned int large_page;
1009         struct dma_pte *first_pte, *pte;
1010
1011         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012             WARN_ON(start_pfn > last_pfn))
1013                 return;
1014
1015         /* we don't need lock here; nobody else touches the iova range */
1016         do {
1017                 large_page = 1;
1018                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019                 if (!pte) {
1020                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021                         continue;
1022                 }
1023                 do {
1024                         dma_clear_pte(pte);
1025                         start_pfn += lvl_to_nr_pages(large_page);
1026                         pte++;
1027                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029                 domain_flush_cache(domain, first_pte,
1030                                    (void *)pte - (void *)first_pte);
1031
1032         } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036                                int retain_level, struct dma_pte *pte,
1037                                unsigned long pfn, unsigned long start_pfn,
1038                                unsigned long last_pfn)
1039 {
1040         pfn = max(start_pfn, pfn);
1041         pte = &pte[pfn_level_offset(pfn, level)];
1042
1043         do {
1044                 unsigned long level_pfn;
1045                 struct dma_pte *level_pte;
1046
1047                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048                         goto next;
1049
1050                 level_pfn = pfn & level_mask(level);
1051                 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053                 if (level > 2) {
1054                         dma_pte_free_level(domain, level - 1, retain_level,
1055                                            level_pte, level_pfn, start_pfn,
1056                                            last_pfn);
1057                 }
1058
1059                 /*
1060                  * Free the page table if we're below the level we want to
1061                  * retain and the range covers the entire table.
1062                  */
1063                 if (level < retain_level && !(start_pfn > level_pfn ||
1064                       last_pfn < level_pfn + level_size(level) - 1)) {
1065                         dma_clear_pte(pte);
1066                         domain_flush_cache(domain, pte, sizeof(*pte));
1067                         free_pgtable_page(level_pte);
1068                 }
1069 next:
1070                 pfn += level_size(level);
1071         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079                                    unsigned long start_pfn,
1080                                    unsigned long last_pfn,
1081                                    int retain_level)
1082 {
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                     int level, struct dma_pte *pte,
1104                                     struct list_head *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         list_add_tail(&pg->lru, freelist);
1110
1111         if (level == 1)
1112                 return;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118                 pte++;
1119         } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123                                 struct dma_pte *pte, unsigned long pfn,
1124                                 unsigned long start_pfn, unsigned long last_pfn,
1125                                 struct list_head *freelist)
1126 {
1127         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129         pfn = max(start_pfn, pfn);
1130         pte = &pte[pfn_level_offset(pfn, level)];
1131
1132         do {
1133                 unsigned long level_pfn = pfn & level_mask(level);
1134
1135                 if (!dma_pte_present(pte))
1136                         goto next;
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These suborbinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         dma_pte_clear_level(domain, level - 1,
1153                                             phys_to_virt(dma_pte_addr(pte)),
1154                                             level_pfn, start_pfn, last_pfn,
1155                                             freelist);
1156                 }
1157 next:
1158                 pfn = level_pfn + level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170                          unsigned long last_pfn, struct list_head *freelist)
1171 {
1172         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173             WARN_ON(start_pfn > last_pfn))
1174                 return;
1175
1176         /* we don't need lock here; nobody else touches the iova range */
1177         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 list_add_tail(&pgd_page->lru, freelist);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191         struct root_entry *root;
1192
1193         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194         if (!root) {
1195                 pr_err("Allocating root entry for %s failed\n",
1196                         iommu->name);
1197                 return -ENOMEM;
1198         }
1199
1200         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201         iommu->root_entry = root;
1202
1203         return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208         u64 addr;
1209         u32 sts;
1210         unsigned long flag;
1211
1212         addr = virt_to_phys(iommu->root_entry);
1213         if (sm_supported(iommu))
1214                 addr |= DMA_RTADDR_SMT;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware complete it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227         /*
1228          * Hardware invalidates all DMA remapping hardware translation
1229          * caches as part of SRTP flow.
1230          */
1231         if (cap_esrtps(iommu->cap))
1232                 return;
1233
1234         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235         if (sm_supported(iommu))
1236                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242         u32 val;
1243         unsigned long flag;
1244
1245         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246                 return;
1247
1248         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure hardware complete it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* return value determine if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260                                   u16 did, u16 source_id, u8 function_mask,
1261                                   u64 type)
1262 {
1263         u64 val = 0;
1264         unsigned long flag;
1265
1266         switch (type) {
1267         case DMA_CCMD_GLOBAL_INVL:
1268                 val = DMA_CCMD_GLOBAL_INVL;
1269                 break;
1270         case DMA_CCMD_DOMAIN_INVL:
1271                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272                 break;
1273         case DMA_CCMD_DEVICE_INVL:
1274                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276                 break;
1277         default:
1278                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279                         iommu->name, type);
1280                 return;
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure hardware complete it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determine if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317                         iommu->name, type);
1318                 return;
1319         }
1320
1321         if (cap_write_drain(iommu->cap))
1322                 val |= DMA_TLB_WRITE_DRAIN;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         /* Note: Only uses first TLB reg currently */
1326         if (val_iva)
1327                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330         /* Make sure hardware complete it */
1331         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336         /* check IOTLB invalidation granularity */
1337         if (DMA_TLB_IAIG(val) == 0)
1338                 pr_err("Flush IOTLB failed\n");
1339         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341                         (unsigned long long)DMA_TLB_IIRG(type),
1342                         (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349         struct device_domain_info *info;
1350         unsigned long flags;
1351
1352         spin_lock_irqsave(&domain->lock, flags);
1353         list_for_each_entry(info, &domain->devices, link) {
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         spin_unlock_irqrestore(&domain->lock, flags);
1357                         return info;
1358                 }
1359         }
1360         spin_unlock_irqrestore(&domain->lock, flags);
1361
1362         return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367         struct dev_pasid_info *dev_pasid;
1368         struct device_domain_info *info;
1369         bool has_iotlb_device = false;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&domain->lock, flags);
1373         list_for_each_entry(info, &domain->devices, link) {
1374                 if (info->ats_enabled) {
1375                         has_iotlb_device = true;
1376                         break;
1377                 }
1378         }
1379
1380         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381                 info = dev_iommu_priv_get(dev_pasid->dev);
1382                 if (info->ats_enabled) {
1383                         has_iotlb_device = true;
1384                         break;
1385                 }
1386         }
1387         domain->has_iotlb_device = has_iotlb_device;
1388         spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401                 return false;
1402
1403         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404                 return false;
1405
1406         return true;
1407 }
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411         struct pci_dev *pdev;
1412
1413         if (!dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428                 info->ats_enabled = 1;
1429                 domain_update_iotlb(info->domain);
1430         }
1431 }
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435         struct pci_dev *pdev;
1436
1437         if (!dev_is_pci(info->dev))
1438                 return;
1439
1440         pdev = to_pci_dev(info->dev);
1441
1442         if (info->ats_enabled) {
1443                 pci_disable_ats(pdev);
1444                 info->ats_enabled = 0;
1445                 domain_update_iotlb(info->domain);
1446         }
1447
1448         if (info->pasid_enabled) {
1449                 pci_disable_pasid(pdev);
1450                 info->pasid_enabled = 0;
1451         }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455                                     u64 addr, unsigned int mask)
1456 {
1457         u16 sid, qdep;
1458
1459         if (!info || !info->ats_enabled)
1460                 return;
1461
1462         sid = info->bus << 8 | info->devfn;
1463         qdep = info->ats_qdep;
1464         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465                            qdep, addr, mask);
1466         quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         struct dev_pasid_info *dev_pasid;
1473         struct device_domain_info *info;
1474         unsigned long flags;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&domain->lock, flags);
1480         list_for_each_entry(info, &domain->devices, link)
1481                 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484                 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486                 if (!info->ats_enabled)
1487                         continue;
1488
1489                 qi_flush_dev_iotlb_pasid(info->iommu,
1490                                          PCI_DEVID(info->bus, info->devfn),
1491                                          info->pfsid, dev_pasid->pasid,
1492                                          info->ats_qdep, addr,
1493                                          mask);
1494         }
1495         spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499                                      struct dmar_domain *domain, u64 addr,
1500                                      unsigned long npages, bool ih)
1501 {
1502         u16 did = domain_id_iommu(domain, iommu);
1503         struct dev_pasid_info *dev_pasid;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&domain->lock, flags);
1507         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508                 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512         spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516                                   struct dmar_domain *domain,
1517                                   unsigned long pfn, unsigned int pages,
1518                                   int ih, int map)
1519 {
1520         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521         unsigned int mask = ilog2(aligned_pages);
1522         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523         u16 did = domain_id_iommu(domain, iommu);
1524
1525         if (WARN_ON(!pages))
1526                 return;
1527
1528         if (ih)
1529                 ih = 1 << 6;
1530
1531         if (domain->use_first_level) {
1532                 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533         } else {
1534                 unsigned long bitmask = aligned_pages - 1;
1535
1536                 /*
1537                  * PSI masks the low order bits of the base address. If the
1538                  * address isn't aligned to the mask, then compute a mask value
1539                  * needed to ensure the target range is flushed.
1540                  */
1541                 if (unlikely(bitmask & pfn)) {
1542                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544                         /*
1545                          * Since end_pfn <= pfn + bitmask, the only way bits
1546                          * higher than bitmask can differ in pfn and end_pfn is
1547                          * by carrying. This means after masking out bitmask,
1548                          * high bits starting with the first set bit in
1549                          * shared_bits are all equal in both pfn and end_pfn.
1550                          */
1551                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553                 }
1554
1555                 /*
1556                  * Fallback to domain selective flush if no PSI support or
1557                  * the size is too big.
1558                  */
1559                 if (!cap_pgsel_inv(iommu->cap) ||
1560                     mask > cap_max_amask_val(iommu->cap))
1561                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562                                                         DMA_TLB_DSI_FLUSH);
1563                 else
1564                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565                                                         DMA_TLB_PSI_FLUSH);
1566         }
1567
1568         /*
1569          * In caching mode, changes of pages from non-present to present require
1570          * flush. However, device IOTLB doesn't need to be flushed in this case.
1571          */
1572         if (!cap_caching_mode(iommu->cap) || !map)
1573                 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578                                         struct dmar_domain *domain,
1579                                         unsigned long pfn, unsigned int pages)
1580 {
1581         /*
1582          * It's a non-present to present mapping. Only flush if caching mode
1583          * and second level.
1584          */
1585         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587         else
1588                 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594         struct iommu_domain_info *info;
1595         unsigned long idx;
1596
1597         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598                 struct intel_iommu *iommu = info->iommu;
1599                 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601                 if (dmar_domain->use_first_level)
1602                         domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603                 else
1604                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                  DMA_TLB_DSI_FLUSH);
1606
1607                 if (!cap_caching_mode(iommu->cap))
1608                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618                 return;
1619
1620         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622         pmen &= ~DMA_PMEN_EPM;
1623         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625         /* wait for the protected region status bit to clear */
1626         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flags;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         iommu->gcmd |= DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure hardware complete it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (sts & DMA_GSTS_TES), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650         u32 sts;
1651         unsigned long flag;
1652
1653         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655                 return;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware complete it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670         u32 ndomains;
1671
1672         ndomains = cap_ndoms(iommu->cap);
1673         pr_debug("%s: Number of Domains supported <%d>\n",
1674                  iommu->name, ndomains);
1675
1676         spin_lock_init(&iommu->lock);
1677
1678         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679         if (!iommu->domain_ids)
1680                 return -ENOMEM;
1681
1682         /*
1683          * If Caching mode is set, then invalid translations are tagged
1684          * with domain-id 0, hence we need to pre-allocate it. We also
1685          * use domain-id 0 as a marker for non-allocated domain-id, so
1686          * make sure it is not used for a real domain.
1687          */
1688         set_bit(0, iommu->domain_ids);
1689
1690         /*
1691          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1692          * entry for first-level or pass-through translation modes should
1693          * be programmed with a domain id different from those used for
1694          * second-level or nested translation. We reserve a domain id for
1695          * this purpose.
1696          */
1697         if (sm_supported(iommu))
1698                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700         return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705         if (!iommu->domain_ids)
1706                 return;
1707
1708         /*
1709          * All iommu domains must have been detached from the devices,
1710          * hence there should be no domain IDs in use.
1711          */
1712         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713                     > NUM_RESERVED_DID))
1714                 return;
1715
1716         if (iommu->gcmd & DMA_GCMD_TE)
1717                 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         if (iommu->domain_ids) {
1723                 bitmap_free(iommu->domain_ids);
1724                 iommu->domain_ids = NULL;
1725         }
1726
1727         if (iommu->copied_tables) {
1728                 bitmap_free(iommu->copied_tables);
1729                 iommu->copied_tables = NULL;
1730         }
1731
1732         /* free context mapping */
1733         free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736         if (pasid_supported(iommu)) {
1737                 if (ecap_prs(iommu->ecap))
1738                         intel_svm_finish_prq(iommu);
1739         }
1740 #endif
1741 }
1742
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749         /* Only SL is available in legacy mode */
1750         if (!scalable_mode_support())
1751                 return false;
1752
1753         /* Only level (either FL or SL) is available, just use it */
1754         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755                 return intel_cap_flts_sanity();
1756
1757         /* Both levels are available, decide it based on domain type */
1758         return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763         struct dmar_domain *domain;
1764
1765         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766         if (!domain)
1767                 return NULL;
1768
1769         domain->nid = NUMA_NO_NODE;
1770         if (first_level_by_default(type))
1771                 domain->use_first_level = true;
1772         domain->has_iotlb_device = false;
1773         INIT_LIST_HEAD(&domain->devices);
1774         INIT_LIST_HEAD(&domain->dev_pasids);
1775         spin_lock_init(&domain->lock);
1776         xa_init(&domain->iommu_array);
1777
1778         return domain;
1779 }
1780
1781 static int domain_attach_iommu(struct dmar_domain *domain,
1782                                struct intel_iommu *iommu)
1783 {
1784         struct iommu_domain_info *info, *curr;
1785         unsigned long ndomains;
1786         int num, ret = -ENOSPC;
1787
1788         info = kzalloc(sizeof(*info), GFP_KERNEL);
1789         if (!info)
1790                 return -ENOMEM;
1791
1792         spin_lock(&iommu->lock);
1793         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794         if (curr) {
1795                 curr->refcnt++;
1796                 spin_unlock(&iommu->lock);
1797                 kfree(info);
1798                 return 0;
1799         }
1800
1801         ndomains = cap_ndoms(iommu->cap);
1802         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803         if (num >= ndomains) {
1804                 pr_err("%s: No free domain ids\n", iommu->name);
1805                 goto err_unlock;
1806         }
1807
1808         set_bit(num, iommu->domain_ids);
1809         info->refcnt    = 1;
1810         info->did       = num;
1811         info->iommu     = iommu;
1812         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813                           NULL, info, GFP_ATOMIC);
1814         if (curr) {
1815                 ret = xa_err(curr) ? : -EBUSY;
1816                 goto err_clear;
1817         }
1818         domain_update_iommu_cap(domain);
1819
1820         spin_unlock(&iommu->lock);
1821         return 0;
1822
1823 err_clear:
1824         clear_bit(info->did, iommu->domain_ids);
1825 err_unlock:
1826         spin_unlock(&iommu->lock);
1827         kfree(info);
1828         return ret;
1829 }
1830
1831 static void domain_detach_iommu(struct dmar_domain *domain,
1832                                 struct intel_iommu *iommu)
1833 {
1834         struct iommu_domain_info *info;
1835
1836         spin_lock(&iommu->lock);
1837         info = xa_load(&domain->iommu_array, iommu->seq_id);
1838         if (--info->refcnt == 0) {
1839                 clear_bit(info->did, iommu->domain_ids);
1840                 xa_erase(&domain->iommu_array, iommu->seq_id);
1841                 domain->nid = NUMA_NO_NODE;
1842                 domain_update_iommu_cap(domain);
1843                 kfree(info);
1844         }
1845         spin_unlock(&iommu->lock);
1846 }
1847
1848 static inline int guestwidth_to_adjustwidth(int gaw)
1849 {
1850         int agaw;
1851         int r = (gaw - 12) % 9;
1852
1853         if (r == 0)
1854                 agaw = gaw;
1855         else
1856                 agaw = gaw + 9 - r;
1857         if (agaw > 64)
1858                 agaw = 64;
1859         return agaw;
1860 }
1861
1862 static void domain_exit(struct dmar_domain *domain)
1863 {
1864         if (domain->pgd) {
1865                 LIST_HEAD(freelist);
1866
1867                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868                 put_pages_list(&freelist);
1869         }
1870
1871         if (WARN_ON(!list_empty(&domain->devices)))
1872                 return;
1873
1874         kfree(domain);
1875 }
1876
1877 /*
1878  * Get the PASID directory size for scalable mode context entry.
1879  * Value of X in the PDTS field of a scalable mode context entry
1880  * indicates PASID directory with 2^(X + 7) entries.
1881  */
1882 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883 {
1884         unsigned long pds, max_pde;
1885
1886         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1887         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1888         if (pds < 7)
1889                 return 0;
1890
1891         return pds - 7;
1892 }
1893
1894 /*
1895  * Set the RID_PASID field of a scalable mode context entry. The
1896  * IOMMU hardware will use the PASID value set in this field for
1897  * DMA translations of DMA requests without PASID.
1898  */
1899 static inline void
1900 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901 {
1902         context->hi |= pasid & ((1 << 20) - 1);
1903 }
1904
1905 /*
1906  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907  * entry.
1908  */
1909 static inline void context_set_sm_dte(struct context_entry *context)
1910 {
1911         context->lo |= BIT_ULL(2);
1912 }
1913
1914 /*
1915  * Set the PRE(Page Request Enable) field of a scalable mode context
1916  * entry.
1917  */
1918 static inline void context_set_sm_pre(struct context_entry *context)
1919 {
1920         context->lo |= BIT_ULL(4);
1921 }
1922
1923 /* Convert value to context PASID directory size field coding. */
1924 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1925
1926 static int domain_context_mapping_one(struct dmar_domain *domain,
1927                                       struct intel_iommu *iommu,
1928                                       struct pasid_table *table,
1929                                       u8 bus, u8 devfn)
1930 {
1931         struct device_domain_info *info =
1932                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1933         u16 did = domain_id_iommu(domain, iommu);
1934         int translation = CONTEXT_TT_MULTI_LEVEL;
1935         struct context_entry *context;
1936         int ret;
1937
1938         if (hw_pass_through && domain_type_is_si(domain))
1939                 translation = CONTEXT_TT_PASS_THROUGH;
1940
1941         pr_debug("Set context mapping for %02x:%02x.%d\n",
1942                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1943
1944         spin_lock(&iommu->lock);
1945         ret = -ENOMEM;
1946         context = iommu_context_addr(iommu, bus, devfn, 1);
1947         if (!context)
1948                 goto out_unlock;
1949
1950         ret = 0;
1951         if (context_present(context) && !context_copied(iommu, bus, devfn))
1952                 goto out_unlock;
1953
1954         /*
1955          * For kdump cases, old valid entries may be cached due to the
1956          * in-flight DMA and copied pgtable, but there is no unmapping
1957          * behaviour for them, thus we need an explicit cache flush for
1958          * the newly-mapped device. For kdump, at this point, the device
1959          * is supposed to finish reset at its driver probe stage, so no
1960          * in-flight DMA will exist, and we don't need to worry anymore
1961          * hereafter.
1962          */
1963         if (context_copied(iommu, bus, devfn)) {
1964                 u16 did_old = context_domain_id(context);
1965
1966                 if (did_old < cap_ndoms(iommu->cap)) {
1967                         iommu->flush.flush_context(iommu, did_old,
1968                                                    (((u16)bus) << 8) | devfn,
1969                                                    DMA_CCMD_MASK_NOBIT,
1970                                                    DMA_CCMD_DEVICE_INVL);
1971                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1972                                                  DMA_TLB_DSI_FLUSH);
1973                 }
1974
1975                 clear_context_copied(iommu, bus, devfn);
1976         }
1977
1978         context_clear_entry(context);
1979
1980         if (sm_supported(iommu)) {
1981                 unsigned long pds;
1982
1983                 /* Setup the PASID DIR pointer: */
1984                 pds = context_get_sm_pds(table);
1985                 context->lo = (u64)virt_to_phys(table->table) |
1986                                 context_pdts(pds);
1987
1988                 /* Setup the RID_PASID field: */
1989                 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1990
1991                 /*
1992                  * Setup the Device-TLB enable bit and Page request
1993                  * Enable bit:
1994                  */
1995                 if (info && info->ats_supported)
1996                         context_set_sm_dte(context);
1997                 if (info && info->pri_supported)
1998                         context_set_sm_pre(context);
1999                 if (info && info->pasid_supported)
2000                         context_set_pasid(context);
2001         } else {
2002                 struct dma_pte *pgd = domain->pgd;
2003                 int agaw;
2004
2005                 context_set_domain_id(context, did);
2006
2007                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2008                         /*
2009                          * Skip top levels of page tables for iommu which has
2010                          * less agaw than default. Unnecessary for PT mode.
2011                          */
2012                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2013                                 ret = -ENOMEM;
2014                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2015                                 if (!dma_pte_present(pgd))
2016                                         goto out_unlock;
2017                         }
2018
2019                         if (info && info->ats_supported)
2020                                 translation = CONTEXT_TT_DEV_IOTLB;
2021                         else
2022                                 translation = CONTEXT_TT_MULTI_LEVEL;
2023
2024                         context_set_address_root(context, virt_to_phys(pgd));
2025                         context_set_address_width(context, agaw);
2026                 } else {
2027                         /*
2028                          * In pass through mode, AW must be programmed to
2029                          * indicate the largest AGAW value supported by
2030                          * hardware. And ASR is ignored by hardware.
2031                          */
2032                         context_set_address_width(context, iommu->msagaw);
2033                 }
2034
2035                 context_set_translation_type(context, translation);
2036         }
2037
2038         context_set_fault_enable(context);
2039         context_set_present(context);
2040         if (!ecap_coherent(iommu->ecap))
2041                 clflush_cache_range(context, sizeof(*context));
2042
2043         /*
2044          * It's a non-present to present mapping. If hardware doesn't cache
2045          * non-present entry we only need to flush the write-buffer. If the
2046          * _does_ cache non-present entries, then it does so in the special
2047          * domain #0, which we have to flush:
2048          */
2049         if (cap_caching_mode(iommu->cap)) {
2050                 iommu->flush.flush_context(iommu, 0,
2051                                            (((u16)bus) << 8) | devfn,
2052                                            DMA_CCMD_MASK_NOBIT,
2053                                            DMA_CCMD_DEVICE_INVL);
2054                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2055         } else {
2056                 iommu_flush_write_buffer(iommu);
2057         }
2058
2059         ret = 0;
2060
2061 out_unlock:
2062         spin_unlock(&iommu->lock);
2063
2064         return ret;
2065 }
2066
2067 struct domain_context_mapping_data {
2068         struct dmar_domain *domain;
2069         struct intel_iommu *iommu;
2070         struct pasid_table *table;
2071 };
2072
2073 static int domain_context_mapping_cb(struct pci_dev *pdev,
2074                                      u16 alias, void *opaque)
2075 {
2076         struct domain_context_mapping_data *data = opaque;
2077
2078         return domain_context_mapping_one(data->domain, data->iommu,
2079                                           data->table, PCI_BUS_NUM(alias),
2080                                           alias & 0xff);
2081 }
2082
2083 static int
2084 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2085 {
2086         struct domain_context_mapping_data data;
2087         struct pasid_table *table;
2088         struct intel_iommu *iommu;
2089         u8 bus, devfn;
2090
2091         iommu = device_to_iommu(dev, &bus, &devfn);
2092         if (!iommu)
2093                 return -ENODEV;
2094
2095         table = intel_pasid_get_table(dev);
2096
2097         if (!dev_is_pci(dev))
2098                 return domain_context_mapping_one(domain, iommu, table,
2099                                                   bus, devfn);
2100
2101         data.domain = domain;
2102         data.iommu = iommu;
2103         data.table = table;
2104
2105         return pci_for_each_dma_alias(to_pci_dev(dev),
2106                                       &domain_context_mapping_cb, &data);
2107 }
2108
2109 /* Returns a number of VTD pages, but aligned to MM page size */
2110 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111                                             size_t size)
2112 {
2113         host_addr &= ~PAGE_MASK;
2114         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115 }
2116
2117 /* Return largest possible superpage level for a given mapping */
2118 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119                                           unsigned long iov_pfn,
2120                                           unsigned long phy_pfn,
2121                                           unsigned long pages)
2122 {
2123         int support, level = 1;
2124         unsigned long pfnmerge;
2125
2126         support = domain->iommu_superpage;
2127
2128         /* To use a large page, the virtual *and* physical addresses
2129            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130            of them will mean we have to use smaller pages. So just
2131            merge them and check both at once. */
2132         pfnmerge = iov_pfn | phy_pfn;
2133
2134         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135                 pages >>= VTD_STRIDE_SHIFT;
2136                 if (!pages)
2137                         break;
2138                 pfnmerge >>= VTD_STRIDE_SHIFT;
2139                 level++;
2140                 support--;
2141         }
2142         return level;
2143 }
2144
2145 /*
2146  * Ensure that old small page tables are removed to make room for superpage(s).
2147  * We're going to add new large pages, so make sure we don't remove their parent
2148  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149  */
2150 static void switch_to_super_page(struct dmar_domain *domain,
2151                                  unsigned long start_pfn,
2152                                  unsigned long end_pfn, int level)
2153 {
2154         unsigned long lvl_pages = lvl_to_nr_pages(level);
2155         struct iommu_domain_info *info;
2156         struct dma_pte *pte = NULL;
2157         unsigned long i;
2158
2159         while (start_pfn <= end_pfn) {
2160                 if (!pte)
2161                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162                                              GFP_ATOMIC);
2163
2164                 if (dma_pte_present(pte)) {
2165                         dma_pte_free_pagetable(domain, start_pfn,
2166                                                start_pfn + lvl_pages - 1,
2167                                                level + 1);
2168
2169                         xa_for_each(&domain->iommu_array, i, info)
2170                                 iommu_flush_iotlb_psi(info->iommu, domain,
2171                                                       start_pfn, lvl_pages,
2172                                                       0, 0);
2173                 }
2174
2175                 pte++;
2176                 start_pfn += lvl_pages;
2177                 if (first_pte_in_page(pte))
2178                         pte = NULL;
2179         }
2180 }
2181
2182 static int
2183 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184                  unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185                  gfp_t gfp)
2186 {
2187         struct dma_pte *first_pte = NULL, *pte = NULL;
2188         unsigned int largepage_lvl = 0;
2189         unsigned long lvl_pages = 0;
2190         phys_addr_t pteval;
2191         u64 attr;
2192
2193         if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194                 return -EINVAL;
2195
2196         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197                 return -EINVAL;
2198
2199         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2200         attr |= DMA_FL_PTE_PRESENT;
2201         if (domain->use_first_level) {
2202                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2203                 if (prot & DMA_PTE_WRITE)
2204                         attr |= DMA_FL_PTE_DIRTY;
2205         }
2206
2207         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2208
2209         while (nr_pages > 0) {
2210                 uint64_t tmp;
2211
2212                 if (!pte) {
2213                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2214                                         phys_pfn, nr_pages);
2215
2216                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2217                                              gfp);
2218                         if (!pte)
2219                                 return -ENOMEM;
2220                         first_pte = pte;
2221
2222                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2223
2224                         /* It is large page*/
2225                         if (largepage_lvl > 1) {
2226                                 unsigned long end_pfn;
2227                                 unsigned long pages_to_remove;
2228
2229                                 pteval |= DMA_PTE_LARGE_PAGE;
2230                                 pages_to_remove = min_t(unsigned long, nr_pages,
2231                                                         nr_pte_to_next_page(pte) * lvl_pages);
2232                                 end_pfn = iov_pfn + pages_to_remove - 1;
2233                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2234                         } else {
2235                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2236                         }
2237
2238                 }
2239                 /* We don't need lock here, nobody else
2240                  * touches the iova range
2241                  */
2242                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2243                 if (tmp) {
2244                         static int dumps = 5;
2245                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2246                                 iov_pfn, tmp, (unsigned long long)pteval);
2247                         if (dumps) {
2248                                 dumps--;
2249                                 debug_dma_dump_mappings(NULL);
2250                         }
2251                         WARN_ON(1);
2252                 }
2253
2254                 nr_pages -= lvl_pages;
2255                 iov_pfn += lvl_pages;
2256                 phys_pfn += lvl_pages;
2257                 pteval += lvl_pages * VTD_PAGE_SIZE;
2258
2259                 /* If the next PTE would be the first in a new page, then we
2260                  * need to flush the cache on the entries we've just written.
2261                  * And then we'll need to recalculate 'pte', so clear it and
2262                  * let it get set again in the if (!pte) block above.
2263                  *
2264                  * If we're done (!nr_pages) we need to flush the cache too.
2265                  *
2266                  * Also if we've been setting superpages, we may need to
2267                  * recalculate 'pte' and switch back to smaller pages for the
2268                  * end of the mapping, if the trailing size is not enough to
2269                  * use another superpage (i.e. nr_pages < lvl_pages).
2270                  */
2271                 pte++;
2272                 if (!nr_pages || first_pte_in_page(pte) ||
2273                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2274                         domain_flush_cache(domain, first_pte,
2275                                            (void *)pte - (void *)first_pte);
2276                         pte = NULL;
2277                 }
2278         }
2279
2280         return 0;
2281 }
2282
2283 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2284 {
2285         struct intel_iommu *iommu = info->iommu;
2286         struct context_entry *context;
2287         u16 did_old;
2288
2289         if (!iommu)
2290                 return;
2291
2292         spin_lock(&iommu->lock);
2293         context = iommu_context_addr(iommu, bus, devfn, 0);
2294         if (!context) {
2295                 spin_unlock(&iommu->lock);
2296                 return;
2297         }
2298
2299         if (sm_supported(iommu)) {
2300                 if (hw_pass_through && domain_type_is_si(info->domain))
2301                         did_old = FLPT_DEFAULT_DID;
2302                 else
2303                         did_old = domain_id_iommu(info->domain, iommu);
2304         } else {
2305                 did_old = context_domain_id(context);
2306         }
2307
2308         context_clear_entry(context);
2309         __iommu_flush_cache(iommu, context, sizeof(*context));
2310         spin_unlock(&iommu->lock);
2311         iommu->flush.flush_context(iommu,
2312                                    did_old,
2313                                    (((u16)bus) << 8) | devfn,
2314                                    DMA_CCMD_MASK_NOBIT,
2315                                    DMA_CCMD_DEVICE_INVL);
2316
2317         if (sm_supported(iommu))
2318                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2319
2320         iommu->flush.flush_iotlb(iommu,
2321                                  did_old,
2322                                  0,
2323                                  0,
2324                                  DMA_TLB_DSI_FLUSH);
2325
2326         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2327 }
2328
2329 static int domain_setup_first_level(struct intel_iommu *iommu,
2330                                     struct dmar_domain *domain,
2331                                     struct device *dev,
2332                                     u32 pasid)
2333 {
2334         struct dma_pte *pgd = domain->pgd;
2335         int agaw, level;
2336         int flags = 0;
2337
2338         /*
2339          * Skip top levels of page tables for iommu which has
2340          * less agaw than default. Unnecessary for PT mode.
2341          */
2342         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2343                 pgd = phys_to_virt(dma_pte_addr(pgd));
2344                 if (!dma_pte_present(pgd))
2345                         return -ENOMEM;
2346         }
2347
2348         level = agaw_to_level(agaw);
2349         if (level != 4 && level != 5)
2350                 return -EINVAL;
2351
2352         if (level == 5)
2353                 flags |= PASID_FLAG_FL5LP;
2354
2355         if (domain->force_snooping)
2356                 flags |= PASID_FLAG_PAGE_SNOOP;
2357
2358         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2359                                              domain_id_iommu(domain, iommu),
2360                                              flags);
2361 }
2362
2363 static bool dev_is_real_dma_subdevice(struct device *dev)
2364 {
2365         return dev && dev_is_pci(dev) &&
2366                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2367 }
2368
2369 static int iommu_domain_identity_map(struct dmar_domain *domain,
2370                                      unsigned long first_vpfn,
2371                                      unsigned long last_vpfn)
2372 {
2373         /*
2374          * RMRR range might have overlap with physical memory range,
2375          * clear it first
2376          */
2377         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2378
2379         return __domain_mapping(domain, first_vpfn,
2380                                 first_vpfn, last_vpfn - first_vpfn + 1,
2381                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2382 }
2383
2384 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2385
2386 static int __init si_domain_init(int hw)
2387 {
2388         struct dmar_rmrr_unit *rmrr;
2389         struct device *dev;
2390         int i, nid, ret;
2391
2392         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2393         if (!si_domain)
2394                 return -EFAULT;
2395
2396         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2397                 domain_exit(si_domain);
2398                 si_domain = NULL;
2399                 return -EFAULT;
2400         }
2401
2402         if (hw)
2403                 return 0;
2404
2405         for_each_online_node(nid) {
2406                 unsigned long start_pfn, end_pfn;
2407                 int i;
2408
2409                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2410                         ret = iommu_domain_identity_map(si_domain,
2411                                         mm_to_dma_pfn_start(start_pfn),
2412                                         mm_to_dma_pfn_end(end_pfn));
2413                         if (ret)
2414                                 return ret;
2415                 }
2416         }
2417
2418         /*
2419          * Identity map the RMRRs so that devices with RMRRs could also use
2420          * the si_domain.
2421          */
2422         for_each_rmrr_units(rmrr) {
2423                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2424                                           i, dev) {
2425                         unsigned long long start = rmrr->base_address;
2426                         unsigned long long end = rmrr->end_address;
2427
2428                         if (WARN_ON(end < start ||
2429                                     end >> agaw_to_width(si_domain->agaw)))
2430                                 continue;
2431
2432                         ret = iommu_domain_identity_map(si_domain,
2433                                         mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2434                                         mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2435                         if (ret)
2436                                 return ret;
2437                 }
2438         }
2439
2440         return 0;
2441 }
2442
2443 static int dmar_domain_attach_device(struct dmar_domain *domain,
2444                                      struct device *dev)
2445 {
2446         struct device_domain_info *info = dev_iommu_priv_get(dev);
2447         struct intel_iommu *iommu;
2448         unsigned long flags;
2449         u8 bus, devfn;
2450         int ret;
2451
2452         iommu = device_to_iommu(dev, &bus, &devfn);
2453         if (!iommu)
2454                 return -ENODEV;
2455
2456         ret = domain_attach_iommu(domain, iommu);
2457         if (ret)
2458                 return ret;
2459         info->domain = domain;
2460         spin_lock_irqsave(&domain->lock, flags);
2461         list_add(&info->link, &domain->devices);
2462         spin_unlock_irqrestore(&domain->lock, flags);
2463
2464         /* PASID table is mandatory for a PCI device in scalable mode. */
2465         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2466                 /* Setup the PASID entry for requests without PASID: */
2467                 if (hw_pass_through && domain_type_is_si(domain))
2468                         ret = intel_pasid_setup_pass_through(iommu, domain,
2469                                         dev, IOMMU_NO_PASID);
2470                 else if (domain->use_first_level)
2471                         ret = domain_setup_first_level(iommu, domain, dev,
2472                                         IOMMU_NO_PASID);
2473                 else
2474                         ret = intel_pasid_setup_second_level(iommu, domain,
2475                                         dev, IOMMU_NO_PASID);
2476                 if (ret) {
2477                         dev_err(dev, "Setup RID2PASID failed\n");
2478                         device_block_translation(dev);
2479                         return ret;
2480                 }
2481         }
2482
2483         ret = domain_context_mapping(domain, dev);
2484         if (ret) {
2485                 dev_err(dev, "Domain context map failed\n");
2486                 device_block_translation(dev);
2487                 return ret;
2488         }
2489
2490         iommu_enable_pci_caps(info);
2491
2492         return 0;
2493 }
2494
2495 /**
2496  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2497  * is relaxable (ie. is allowed to be not enforced under some conditions)
2498  * @dev: device handle
2499  *
2500  * We assume that PCI USB devices with RMRRs have them largely
2501  * for historical reasons and that the RMRR space is not actively used post
2502  * boot.  This exclusion may change if vendors begin to abuse it.
2503  *
2504  * The same exception is made for graphics devices, with the requirement that
2505  * any use of the RMRR regions will be torn down before assigning the device
2506  * to a guest.
2507  *
2508  * Return: true if the RMRR is relaxable, false otherwise
2509  */
2510 static bool device_rmrr_is_relaxable(struct device *dev)
2511 {
2512         struct pci_dev *pdev;
2513
2514         if (!dev_is_pci(dev))
2515                 return false;
2516
2517         pdev = to_pci_dev(dev);
2518         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2519                 return true;
2520         else
2521                 return false;
2522 }
2523
2524 /*
2525  * Return the required default domain type for a specific device.
2526  *
2527  * @dev: the device in query
2528  * @startup: true if this is during early boot
2529  *
2530  * Returns:
2531  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2532  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2533  *  - 0: both identity and dynamic domains work for this device
2534  */
2535 static int device_def_domain_type(struct device *dev)
2536 {
2537         if (dev_is_pci(dev)) {
2538                 struct pci_dev *pdev = to_pci_dev(dev);
2539
2540                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2541                         return IOMMU_DOMAIN_IDENTITY;
2542
2543                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2544                         return IOMMU_DOMAIN_IDENTITY;
2545         }
2546
2547         return 0;
2548 }
2549
2550 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2551 {
2552         /*
2553          * Start from the sane iommu hardware state.
2554          * If the queued invalidation is already initialized by us
2555          * (for example, while enabling interrupt-remapping) then
2556          * we got the things already rolling from a sane state.
2557          */
2558         if (!iommu->qi) {
2559                 /*
2560                  * Clear any previous faults.
2561                  */
2562                 dmar_fault(-1, iommu);
2563                 /*
2564                  * Disable queued invalidation if supported and already enabled
2565                  * before OS handover.
2566                  */
2567                 dmar_disable_qi(iommu);
2568         }
2569
2570         if (dmar_enable_qi(iommu)) {
2571                 /*
2572                  * Queued Invalidate not enabled, use Register Based Invalidate
2573                  */
2574                 iommu->flush.flush_context = __iommu_flush_context;
2575                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2576                 pr_info("%s: Using Register based invalidation\n",
2577                         iommu->name);
2578         } else {
2579                 iommu->flush.flush_context = qi_flush_context;
2580                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2581                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2582         }
2583 }
2584
2585 static int copy_context_table(struct intel_iommu *iommu,
2586                               struct root_entry *old_re,
2587                               struct context_entry **tbl,
2588                               int bus, bool ext)
2589 {
2590         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2591         struct context_entry *new_ce = NULL, ce;
2592         struct context_entry *old_ce = NULL;
2593         struct root_entry re;
2594         phys_addr_t old_ce_phys;
2595
2596         tbl_idx = ext ? bus * 2 : bus;
2597         memcpy(&re, old_re, sizeof(re));
2598
2599         for (devfn = 0; devfn < 256; devfn++) {
2600                 /* First calculate the correct index */
2601                 idx = (ext ? devfn * 2 : devfn) % 256;
2602
2603                 if (idx == 0) {
2604                         /* First save what we may have and clean up */
2605                         if (new_ce) {
2606                                 tbl[tbl_idx] = new_ce;
2607                                 __iommu_flush_cache(iommu, new_ce,
2608                                                     VTD_PAGE_SIZE);
2609                                 pos = 1;
2610                         }
2611
2612                         if (old_ce)
2613                                 memunmap(old_ce);
2614
2615                         ret = 0;
2616                         if (devfn < 0x80)
2617                                 old_ce_phys = root_entry_lctp(&re);
2618                         else
2619                                 old_ce_phys = root_entry_uctp(&re);
2620
2621                         if (!old_ce_phys) {
2622                                 if (ext && devfn == 0) {
2623                                         /* No LCTP, try UCTP */
2624                                         devfn = 0x7f;
2625                                         continue;
2626                                 } else {
2627                                         goto out;
2628                                 }
2629                         }
2630
2631                         ret = -ENOMEM;
2632                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2633                                         MEMREMAP_WB);
2634                         if (!old_ce)
2635                                 goto out;
2636
2637                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2638                         if (!new_ce)
2639                                 goto out_unmap;
2640
2641                         ret = 0;
2642                 }
2643
2644                 /* Now copy the context entry */
2645                 memcpy(&ce, old_ce + idx, sizeof(ce));
2646
2647                 if (!context_present(&ce))
2648                         continue;
2649
2650                 did = context_domain_id(&ce);
2651                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2652                         set_bit(did, iommu->domain_ids);
2653
2654                 set_context_copied(iommu, bus, devfn);
2655                 new_ce[idx] = ce;
2656         }
2657
2658         tbl[tbl_idx + pos] = new_ce;
2659
2660         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2661
2662 out_unmap:
2663         memunmap(old_ce);
2664
2665 out:
2666         return ret;
2667 }
2668
2669 static int copy_translation_tables(struct intel_iommu *iommu)
2670 {
2671         struct context_entry **ctxt_tbls;
2672         struct root_entry *old_rt;
2673         phys_addr_t old_rt_phys;
2674         int ctxt_table_entries;
2675         u64 rtaddr_reg;
2676         int bus, ret;
2677         bool new_ext, ext;
2678
2679         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2680         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2681         new_ext    = !!sm_supported(iommu);
2682
2683         /*
2684          * The RTT bit can only be changed when translation is disabled,
2685          * but disabling translation means to open a window for data
2686          * corruption. So bail out and don't copy anything if we would
2687          * have to change the bit.
2688          */
2689         if (new_ext != ext)
2690                 return -EINVAL;
2691
2692         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2693         if (!iommu->copied_tables)
2694                 return -ENOMEM;
2695
2696         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2697         if (!old_rt_phys)
2698                 return -EINVAL;
2699
2700         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2701         if (!old_rt)
2702                 return -ENOMEM;
2703
2704         /* This is too big for the stack - allocate it from slab */
2705         ctxt_table_entries = ext ? 512 : 256;
2706         ret = -ENOMEM;
2707         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2708         if (!ctxt_tbls)
2709                 goto out_unmap;
2710
2711         for (bus = 0; bus < 256; bus++) {
2712                 ret = copy_context_table(iommu, &old_rt[bus],
2713                                          ctxt_tbls, bus, ext);
2714                 if (ret) {
2715                         pr_err("%s: Failed to copy context table for bus %d\n",
2716                                 iommu->name, bus);
2717                         continue;
2718                 }
2719         }
2720
2721         spin_lock(&iommu->lock);
2722
2723         /* Context tables are copied, now write them to the root_entry table */
2724         for (bus = 0; bus < 256; bus++) {
2725                 int idx = ext ? bus * 2 : bus;
2726                 u64 val;
2727
2728                 if (ctxt_tbls[idx]) {
2729                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2730                         iommu->root_entry[bus].lo = val;
2731                 }
2732
2733                 if (!ext || !ctxt_tbls[idx + 1])
2734                         continue;
2735
2736                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2737                 iommu->root_entry[bus].hi = val;
2738         }
2739
2740         spin_unlock(&iommu->lock);
2741
2742         kfree(ctxt_tbls);
2743
2744         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2745
2746         ret = 0;
2747
2748 out_unmap:
2749         memunmap(old_rt);
2750
2751         return ret;
2752 }
2753
2754 static int __init init_dmars(void)
2755 {
2756         struct dmar_drhd_unit *drhd;
2757         struct intel_iommu *iommu;
2758         int ret;
2759
2760         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2761         if (ret)
2762                 goto free_iommu;
2763
2764         for_each_iommu(iommu, drhd) {
2765                 if (drhd->ignored) {
2766                         iommu_disable_translation(iommu);
2767                         continue;
2768                 }
2769
2770                 /*
2771                  * Find the max pasid size of all IOMMU's in the system.
2772                  * We need to ensure the system pasid table is no bigger
2773                  * than the smallest supported.
2774                  */
2775                 if (pasid_supported(iommu)) {
2776                         u32 temp = 2 << ecap_pss(iommu->ecap);
2777
2778                         intel_pasid_max_id = min_t(u32, temp,
2779                                                    intel_pasid_max_id);
2780                 }
2781
2782                 intel_iommu_init_qi(iommu);
2783
2784                 ret = iommu_init_domains(iommu);
2785                 if (ret)
2786                         goto free_iommu;
2787
2788                 init_translation_status(iommu);
2789
2790                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2791                         iommu_disable_translation(iommu);
2792                         clear_translation_pre_enabled(iommu);
2793                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2794                                 iommu->name);
2795                 }
2796
2797                 /*
2798                  * TBD:
2799                  * we could share the same root & context tables
2800                  * among all IOMMU's. Need to Split it later.
2801                  */
2802                 ret = iommu_alloc_root_entry(iommu);
2803                 if (ret)
2804                         goto free_iommu;
2805
2806                 if (translation_pre_enabled(iommu)) {
2807                         pr_info("Translation already enabled - trying to copy translation structures\n");
2808
2809                         ret = copy_translation_tables(iommu);
2810                         if (ret) {
2811                                 /*
2812                                  * We found the IOMMU with translation
2813                                  * enabled - but failed to copy over the
2814                                  * old root-entry table. Try to proceed
2815                                  * by disabling translation now and
2816                                  * allocating a clean root-entry table.
2817                                  * This might cause DMAR faults, but
2818                                  * probably the dump will still succeed.
2819                                  */
2820                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2821                                        iommu->name);
2822                                 iommu_disable_translation(iommu);
2823                                 clear_translation_pre_enabled(iommu);
2824                         } else {
2825                                 pr_info("Copied translation tables from previous kernel for %s\n",
2826                                         iommu->name);
2827                         }
2828                 }
2829
2830                 if (!ecap_pass_through(iommu->ecap))
2831                         hw_pass_through = 0;
2832                 intel_svm_check(iommu);
2833         }
2834
2835         /*
2836          * Now that qi is enabled on all iommus, set the root entry and flush
2837          * caches. This is required on some Intel X58 chipsets, otherwise the
2838          * flush_context function will loop forever and the boot hangs.
2839          */
2840         for_each_active_iommu(iommu, drhd) {
2841                 iommu_flush_write_buffer(iommu);
2842                 iommu_set_root_entry(iommu);
2843         }
2844
2845 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2846         dmar_map_gfx = 0;
2847 #endif
2848
2849         if (!dmar_map_gfx)
2850                 iommu_identity_mapping |= IDENTMAP_GFX;
2851
2852         check_tylersburg_isoch();
2853
2854         ret = si_domain_init(hw_pass_through);
2855         if (ret)
2856                 goto free_iommu;
2857
2858         /*
2859          * for each drhd
2860          *   enable fault log
2861          *   global invalidate context cache
2862          *   global invalidate iotlb
2863          *   enable translation
2864          */
2865         for_each_iommu(iommu, drhd) {
2866                 if (drhd->ignored) {
2867                         /*
2868                          * we always have to disable PMRs or DMA may fail on
2869                          * this device
2870                          */
2871                         if (force_on)
2872                                 iommu_disable_protect_mem_regions(iommu);
2873                         continue;
2874                 }
2875
2876                 iommu_flush_write_buffer(iommu);
2877
2878 #ifdef CONFIG_INTEL_IOMMU_SVM
2879                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2880                         /*
2881                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
2882                          * could cause possible lock race condition.
2883                          */
2884                         up_write(&dmar_global_lock);
2885                         ret = intel_svm_enable_prq(iommu);
2886                         down_write(&dmar_global_lock);
2887                         if (ret)
2888                                 goto free_iommu;
2889                 }
2890 #endif
2891                 ret = dmar_set_interrupt(iommu);
2892                 if (ret)
2893                         goto free_iommu;
2894         }
2895
2896         return 0;
2897
2898 free_iommu:
2899         for_each_active_iommu(iommu, drhd) {
2900                 disable_dmar_iommu(iommu);
2901                 free_dmar_iommu(iommu);
2902         }
2903         if (si_domain) {
2904                 domain_exit(si_domain);
2905                 si_domain = NULL;
2906         }
2907
2908         return ret;
2909 }
2910
2911 static void __init init_no_remapping_devices(void)
2912 {
2913         struct dmar_drhd_unit *drhd;
2914         struct device *dev;
2915         int i;
2916
2917         for_each_drhd_unit(drhd) {
2918                 if (!drhd->include_all) {
2919                         for_each_active_dev_scope(drhd->devices,
2920                                                   drhd->devices_cnt, i, dev)
2921                                 break;
2922                         /* ignore DMAR unit if no devices exist */
2923                         if (i == drhd->devices_cnt)
2924                                 drhd->ignored = 1;
2925                 }
2926         }
2927
2928         for_each_active_drhd_unit(drhd) {
2929                 if (drhd->include_all)
2930                         continue;
2931
2932                 for_each_active_dev_scope(drhd->devices,
2933                                           drhd->devices_cnt, i, dev)
2934                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2935                                 break;
2936                 if (i < drhd->devices_cnt)
2937                         continue;
2938
2939                 /* This IOMMU has *only* gfx devices. Either bypass it or
2940                    set the gfx_mapped flag, as appropriate */
2941                 drhd->gfx_dedicated = 1;
2942                 if (!dmar_map_gfx)
2943                         drhd->ignored = 1;
2944         }
2945 }
2946
2947 #ifdef CONFIG_SUSPEND
2948 static int init_iommu_hw(void)
2949 {
2950         struct dmar_drhd_unit *drhd;
2951         struct intel_iommu *iommu = NULL;
2952         int ret;
2953
2954         for_each_active_iommu(iommu, drhd) {
2955                 if (iommu->qi) {
2956                         ret = dmar_reenable_qi(iommu);
2957                         if (ret)
2958                                 return ret;
2959                 }
2960         }
2961
2962         for_each_iommu(iommu, drhd) {
2963                 if (drhd->ignored) {
2964                         /*
2965                          * we always have to disable PMRs or DMA may fail on
2966                          * this device
2967                          */
2968                         if (force_on)
2969                                 iommu_disable_protect_mem_regions(iommu);
2970                         continue;
2971                 }
2972
2973                 iommu_flush_write_buffer(iommu);
2974                 iommu_set_root_entry(iommu);
2975                 iommu_enable_translation(iommu);
2976                 iommu_disable_protect_mem_regions(iommu);
2977         }
2978
2979         return 0;
2980 }
2981
2982 static void iommu_flush_all(void)
2983 {
2984         struct dmar_drhd_unit *drhd;
2985         struct intel_iommu *iommu;
2986
2987         for_each_active_iommu(iommu, drhd) {
2988                 iommu->flush.flush_context(iommu, 0, 0, 0,
2989                                            DMA_CCMD_GLOBAL_INVL);
2990                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2991                                          DMA_TLB_GLOBAL_FLUSH);
2992         }
2993 }
2994
2995 static int iommu_suspend(void)
2996 {
2997         struct dmar_drhd_unit *drhd;
2998         struct intel_iommu *iommu = NULL;
2999         unsigned long flag;
3000
3001         for_each_active_iommu(iommu, drhd) {
3002                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3003                                              GFP_KERNEL);
3004                 if (!iommu->iommu_state)
3005                         goto nomem;
3006         }
3007
3008         iommu_flush_all();
3009
3010         for_each_active_iommu(iommu, drhd) {
3011                 iommu_disable_translation(iommu);
3012
3013                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3014
3015                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3016                         readl(iommu->reg + DMAR_FECTL_REG);
3017                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3018                         readl(iommu->reg + DMAR_FEDATA_REG);
3019                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3020                         readl(iommu->reg + DMAR_FEADDR_REG);
3021                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3022                         readl(iommu->reg + DMAR_FEUADDR_REG);
3023
3024                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3025         }
3026         return 0;
3027
3028 nomem:
3029         for_each_active_iommu(iommu, drhd)
3030                 kfree(iommu->iommu_state);
3031
3032         return -ENOMEM;
3033 }
3034
3035 static void iommu_resume(void)
3036 {
3037         struct dmar_drhd_unit *drhd;
3038         struct intel_iommu *iommu = NULL;
3039         unsigned long flag;
3040
3041         if (init_iommu_hw()) {
3042                 if (force_on)
3043                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3044                 else
3045                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3046                 return;
3047         }
3048
3049         for_each_active_iommu(iommu, drhd) {
3050
3051                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3052
3053                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3054                         iommu->reg + DMAR_FECTL_REG);
3055                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3056                         iommu->reg + DMAR_FEDATA_REG);
3057                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3058                         iommu->reg + DMAR_FEADDR_REG);
3059                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3060                         iommu->reg + DMAR_FEUADDR_REG);
3061
3062                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3063         }
3064
3065         for_each_active_iommu(iommu, drhd)
3066                 kfree(iommu->iommu_state);
3067 }
3068
3069 static struct syscore_ops iommu_syscore_ops = {
3070         .resume         = iommu_resume,
3071         .suspend        = iommu_suspend,
3072 };
3073
3074 static void __init init_iommu_pm_ops(void)
3075 {
3076         register_syscore_ops(&iommu_syscore_ops);
3077 }
3078
3079 #else
3080 static inline void init_iommu_pm_ops(void) {}
3081 #endif  /* CONFIG_PM */
3082
3083 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3084 {
3085         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3086             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3087             rmrr->end_address <= rmrr->base_address ||
3088             arch_rmrr_sanity_check(rmrr))
3089                 return -EINVAL;
3090
3091         return 0;
3092 }
3093
3094 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3095 {
3096         struct acpi_dmar_reserved_memory *rmrr;
3097         struct dmar_rmrr_unit *rmrru;
3098
3099         rmrr = (struct acpi_dmar_reserved_memory *)header;
3100         if (rmrr_sanity_check(rmrr)) {
3101                 pr_warn(FW_BUG
3102                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3103                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3104                            rmrr->base_address, rmrr->end_address,
3105                            dmi_get_system_info(DMI_BIOS_VENDOR),
3106                            dmi_get_system_info(DMI_BIOS_VERSION),
3107                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3108                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3109         }
3110
3111         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3112         if (!rmrru)
3113                 goto out;
3114
3115         rmrru->hdr = header;
3116
3117         rmrru->base_address = rmrr->base_address;
3118         rmrru->end_address = rmrr->end_address;
3119
3120         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3121                                 ((void *)rmrr) + rmrr->header.length,
3122                                 &rmrru->devices_cnt);
3123         if (rmrru->devices_cnt && rmrru->devices == NULL)
3124                 goto free_rmrru;
3125
3126         list_add(&rmrru->list, &dmar_rmrr_units);
3127
3128         return 0;
3129 free_rmrru:
3130         kfree(rmrru);
3131 out:
3132         return -ENOMEM;
3133 }
3134
3135 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3136 {
3137         struct dmar_atsr_unit *atsru;
3138         struct acpi_dmar_atsr *tmp;
3139
3140         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3141                                 dmar_rcu_check()) {
3142                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3143                 if (atsr->segment != tmp->segment)
3144                         continue;
3145                 if (atsr->header.length != tmp->header.length)
3146                         continue;
3147                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3148                         return atsru;
3149         }
3150
3151         return NULL;
3152 }
3153
3154 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3155 {
3156         struct acpi_dmar_atsr *atsr;
3157         struct dmar_atsr_unit *atsru;
3158
3159         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3160                 return 0;
3161
3162         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3163         atsru = dmar_find_atsr(atsr);
3164         if (atsru)
3165                 return 0;
3166
3167         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3168         if (!atsru)
3169                 return -ENOMEM;
3170
3171         /*
3172          * If memory is allocated from slab by ACPI _DSM method, we need to
3173          * copy the memory content because the memory buffer will be freed
3174          * on return.
3175          */
3176         atsru->hdr = (void *)(atsru + 1);
3177         memcpy(atsru->hdr, hdr, hdr->length);
3178         atsru->include_all = atsr->flags & 0x1;
3179         if (!atsru->include_all) {
3180                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3181                                 (void *)atsr + atsr->header.length,
3182                                 &atsru->devices_cnt);
3183                 if (atsru->devices_cnt && atsru->devices == NULL) {
3184                         kfree(atsru);
3185                         return -ENOMEM;
3186                 }
3187         }
3188
3189         list_add_rcu(&atsru->list, &dmar_atsr_units);
3190
3191         return 0;
3192 }
3193
3194 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3195 {
3196         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3197         kfree(atsru);
3198 }
3199
3200 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3201 {
3202         struct acpi_dmar_atsr *atsr;
3203         struct dmar_atsr_unit *atsru;
3204
3205         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3206         atsru = dmar_find_atsr(atsr);
3207         if (atsru) {
3208                 list_del_rcu(&atsru->list);
3209                 synchronize_rcu();
3210                 intel_iommu_free_atsr(atsru);
3211         }
3212
3213         return 0;
3214 }
3215
3216 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3217 {
3218         int i;
3219         struct device *dev;
3220         struct acpi_dmar_atsr *atsr;
3221         struct dmar_atsr_unit *atsru;
3222
3223         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3224         atsru = dmar_find_atsr(atsr);
3225         if (!atsru)
3226                 return 0;
3227
3228         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3229                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3230                                           i, dev)
3231                         return -EBUSY;
3232         }
3233
3234         return 0;
3235 }
3236
3237 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3238 {
3239         struct dmar_satc_unit *satcu;
3240         struct acpi_dmar_satc *tmp;
3241
3242         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3243                                 dmar_rcu_check()) {
3244                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3245                 if (satc->segment != tmp->segment)
3246                         continue;
3247                 if (satc->header.length != tmp->header.length)
3248                         continue;
3249                 if (memcmp(satc, tmp, satc->header.length) == 0)
3250                         return satcu;
3251         }
3252
3253         return NULL;
3254 }
3255
3256 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3257 {
3258         struct acpi_dmar_satc *satc;
3259         struct dmar_satc_unit *satcu;
3260
3261         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3262                 return 0;
3263
3264         satc = container_of(hdr, struct acpi_dmar_satc, header);
3265         satcu = dmar_find_satc(satc);
3266         if (satcu)
3267                 return 0;
3268
3269         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3270         if (!satcu)
3271                 return -ENOMEM;
3272
3273         satcu->hdr = (void *)(satcu + 1);
3274         memcpy(satcu->hdr, hdr, hdr->length);
3275         satcu->atc_required = satc->flags & 0x1;
3276         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3277                                               (void *)satc + satc->header.length,
3278                                               &satcu->devices_cnt);
3279         if (satcu->devices_cnt && !satcu->devices) {
3280                 kfree(satcu);
3281                 return -ENOMEM;
3282         }
3283         list_add_rcu(&satcu->list, &dmar_satc_units);
3284
3285         return 0;
3286 }
3287
3288 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3289 {
3290         int sp, ret;
3291         struct intel_iommu *iommu = dmaru->iommu;
3292
3293         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3294         if (ret)
3295                 goto out;
3296
3297         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3298                 pr_warn("%s: Doesn't support hardware pass through.\n",
3299                         iommu->name);
3300                 return -ENXIO;
3301         }
3302
3303         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3304         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3305                 pr_warn("%s: Doesn't support large page.\n",
3306                         iommu->name);
3307                 return -ENXIO;
3308         }
3309
3310         /*
3311          * Disable translation if already enabled prior to OS handover.
3312          */
3313         if (iommu->gcmd & DMA_GCMD_TE)
3314                 iommu_disable_translation(iommu);
3315
3316         ret = iommu_init_domains(iommu);
3317         if (ret == 0)
3318                 ret = iommu_alloc_root_entry(iommu);
3319         if (ret)
3320                 goto out;
3321
3322         intel_svm_check(iommu);
3323
3324         if (dmaru->ignored) {
3325                 /*
3326                  * we always have to disable PMRs or DMA may fail on this device
3327                  */
3328                 if (force_on)
3329                         iommu_disable_protect_mem_regions(iommu);
3330                 return 0;
3331         }
3332
3333         intel_iommu_init_qi(iommu);
3334         iommu_flush_write_buffer(iommu);
3335
3336 #ifdef CONFIG_INTEL_IOMMU_SVM
3337         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3338                 ret = intel_svm_enable_prq(iommu);
3339                 if (ret)
3340                         goto disable_iommu;
3341         }
3342 #endif
3343         ret = dmar_set_interrupt(iommu);
3344         if (ret)
3345                 goto disable_iommu;
3346
3347         iommu_set_root_entry(iommu);
3348         iommu_enable_translation(iommu);
3349
3350         iommu_disable_protect_mem_regions(iommu);
3351         return 0;
3352
3353 disable_iommu:
3354         disable_dmar_iommu(iommu);
3355 out:
3356         free_dmar_iommu(iommu);
3357         return ret;
3358 }
3359
3360 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3361 {
3362         int ret = 0;
3363         struct intel_iommu *iommu = dmaru->iommu;
3364
3365         if (!intel_iommu_enabled)
3366                 return 0;
3367         if (iommu == NULL)
3368                 return -EINVAL;
3369
3370         if (insert) {
3371                 ret = intel_iommu_add(dmaru);
3372         } else {
3373                 disable_dmar_iommu(iommu);
3374                 free_dmar_iommu(iommu);
3375         }
3376
3377         return ret;
3378 }
3379
3380 static void intel_iommu_free_dmars(void)
3381 {
3382         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3383         struct dmar_atsr_unit *atsru, *atsr_n;
3384         struct dmar_satc_unit *satcu, *satc_n;
3385
3386         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3387                 list_del(&rmrru->list);
3388                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3389                 kfree(rmrru);
3390         }
3391
3392         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3393                 list_del(&atsru->list);
3394                 intel_iommu_free_atsr(atsru);
3395         }
3396         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3397                 list_del(&satcu->list);
3398                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3399                 kfree(satcu);
3400         }
3401 }
3402
3403 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3404 {
3405         struct dmar_satc_unit *satcu;
3406         struct acpi_dmar_satc *satc;
3407         struct device *tmp;
3408         int i;
3409
3410         dev = pci_physfn(dev);
3411         rcu_read_lock();
3412
3413         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3414                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3415                 if (satc->segment != pci_domain_nr(dev->bus))
3416                         continue;
3417                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3418                         if (to_pci_dev(tmp) == dev)
3419                                 goto out;
3420         }
3421         satcu = NULL;
3422 out:
3423         rcu_read_unlock();
3424         return satcu;
3425 }
3426
3427 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3428 {
3429         int i, ret = 1;
3430         struct pci_bus *bus;
3431         struct pci_dev *bridge = NULL;
3432         struct device *tmp;
3433         struct acpi_dmar_atsr *atsr;
3434         struct dmar_atsr_unit *atsru;
3435         struct dmar_satc_unit *satcu;
3436
3437         dev = pci_physfn(dev);
3438         satcu = dmar_find_matched_satc_unit(dev);
3439         if (satcu)
3440                 /*
3441                  * This device supports ATS as it is in SATC table.
3442                  * When IOMMU is in legacy mode, enabling ATS is done
3443                  * automatically by HW for the device that requires
3444                  * ATS, hence OS should not enable this device ATS
3445                  * to avoid duplicated TLB invalidation.
3446                  */
3447                 return !(satcu->atc_required && !sm_supported(iommu));
3448
3449         for (bus = dev->bus; bus; bus = bus->parent) {
3450                 bridge = bus->self;
3451                 /* If it's an integrated device, allow ATS */
3452                 if (!bridge)
3453                         return 1;
3454                 /* Connected via non-PCIe: no ATS */
3455                 if (!pci_is_pcie(bridge) ||
3456                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3457                         return 0;
3458                 /* If we found the root port, look it up in the ATSR */
3459                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3460                         break;
3461         }
3462
3463         rcu_read_lock();
3464         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3465                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3466                 if (atsr->segment != pci_domain_nr(dev->bus))
3467                         continue;
3468
3469                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3470                         if (tmp == &bridge->dev)
3471                                 goto out;
3472
3473                 if (atsru->include_all)
3474                         goto out;
3475         }
3476         ret = 0;
3477 out:
3478         rcu_read_unlock();
3479
3480         return ret;
3481 }
3482
3483 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3484 {
3485         int ret;
3486         struct dmar_rmrr_unit *rmrru;
3487         struct dmar_atsr_unit *atsru;
3488         struct dmar_satc_unit *satcu;
3489         struct acpi_dmar_atsr *atsr;
3490         struct acpi_dmar_reserved_memory *rmrr;
3491         struct acpi_dmar_satc *satc;
3492
3493         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3494                 return 0;
3495
3496         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3497                 rmrr = container_of(rmrru->hdr,
3498                                     struct acpi_dmar_reserved_memory, header);
3499                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3500                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3501                                 ((void *)rmrr) + rmrr->header.length,
3502                                 rmrr->segment, rmrru->devices,
3503                                 rmrru->devices_cnt);
3504                         if (ret < 0)
3505                                 return ret;
3506                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3507                         dmar_remove_dev_scope(info, rmrr->segment,
3508                                 rmrru->devices, rmrru->devices_cnt);
3509                 }
3510         }
3511
3512         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3513                 if (atsru->include_all)
3514                         continue;
3515
3516                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3517                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3518                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3519                                         (void *)atsr + atsr->header.length,
3520                                         atsr->segment, atsru->devices,
3521                                         atsru->devices_cnt);
3522                         if (ret > 0)
3523                                 break;
3524                         else if (ret < 0)
3525                                 return ret;
3526                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3527                         if (dmar_remove_dev_scope(info, atsr->segment,
3528                                         atsru->devices, atsru->devices_cnt))
3529                                 break;
3530                 }
3531         }
3532         list_for_each_entry(satcu, &dmar_satc_units, list) {
3533                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3535                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3536                                         (void *)satc + satc->header.length,
3537                                         satc->segment, satcu->devices,
3538                                         satcu->devices_cnt);
3539                         if (ret > 0)
3540                                 break;
3541                         else if (ret < 0)
3542                                 return ret;
3543                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3544                         if (dmar_remove_dev_scope(info, satc->segment,
3545                                         satcu->devices, satcu->devices_cnt))
3546                                 break;
3547                 }
3548         }
3549
3550         return 0;
3551 }
3552
3553 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3554                                        unsigned long val, void *v)
3555 {
3556         struct memory_notify *mhp = v;
3557         unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3558         unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3559                         mhp->nr_pages - 1);
3560
3561         switch (val) {
3562         case MEM_GOING_ONLINE:
3563                 if (iommu_domain_identity_map(si_domain,
3564                                               start_vpfn, last_vpfn)) {
3565                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3566                                 start_vpfn, last_vpfn);
3567                         return NOTIFY_BAD;
3568                 }
3569                 break;
3570
3571         case MEM_OFFLINE:
3572         case MEM_CANCEL_ONLINE:
3573                 {
3574                         struct dmar_drhd_unit *drhd;
3575                         struct intel_iommu *iommu;
3576                         LIST_HEAD(freelist);
3577
3578                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3579
3580                         rcu_read_lock();
3581                         for_each_active_iommu(iommu, drhd)
3582                                 iommu_flush_iotlb_psi(iommu, si_domain,
3583                                         start_vpfn, mhp->nr_pages,
3584                                         list_empty(&freelist), 0);
3585                         rcu_read_unlock();
3586                         put_pages_list(&freelist);
3587                 }
3588                 break;
3589         }
3590
3591         return NOTIFY_OK;
3592 }
3593
3594 static struct notifier_block intel_iommu_memory_nb = {
3595         .notifier_call = intel_iommu_memory_notifier,
3596         .priority = 0
3597 };
3598
3599 static void intel_disable_iommus(void)
3600 {
3601         struct intel_iommu *iommu = NULL;
3602         struct dmar_drhd_unit *drhd;
3603
3604         for_each_iommu(iommu, drhd)
3605                 iommu_disable_translation(iommu);
3606 }
3607
3608 void intel_iommu_shutdown(void)
3609 {
3610         struct dmar_drhd_unit *drhd;
3611         struct intel_iommu *iommu = NULL;
3612
3613         if (no_iommu || dmar_disabled)
3614                 return;
3615
3616         down_write(&dmar_global_lock);
3617
3618         /* Disable PMRs explicitly here. */
3619         for_each_iommu(iommu, drhd)
3620                 iommu_disable_protect_mem_regions(iommu);
3621
3622         /* Make sure the IOMMUs are switched off */
3623         intel_disable_iommus();
3624
3625         up_write(&dmar_global_lock);
3626 }
3627
3628 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3629 {
3630         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3631
3632         return container_of(iommu_dev, struct intel_iommu, iommu);
3633 }
3634
3635 static ssize_t version_show(struct device *dev,
3636                             struct device_attribute *attr, char *buf)
3637 {
3638         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3639         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3640         return sysfs_emit(buf, "%d:%d\n",
3641                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3642 }
3643 static DEVICE_ATTR_RO(version);
3644
3645 static ssize_t address_show(struct device *dev,
3646                             struct device_attribute *attr, char *buf)
3647 {
3648         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3649         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3650 }
3651 static DEVICE_ATTR_RO(address);
3652
3653 static ssize_t cap_show(struct device *dev,
3654                         struct device_attribute *attr, char *buf)
3655 {
3656         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3657         return sysfs_emit(buf, "%llx\n", iommu->cap);
3658 }
3659 static DEVICE_ATTR_RO(cap);
3660
3661 static ssize_t ecap_show(struct device *dev,
3662                          struct device_attribute *attr, char *buf)
3663 {
3664         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3665         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3666 }
3667 static DEVICE_ATTR_RO(ecap);
3668
3669 static ssize_t domains_supported_show(struct device *dev,
3670                                       struct device_attribute *attr, char *buf)
3671 {
3672         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3673         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3674 }
3675 static DEVICE_ATTR_RO(domains_supported);
3676
3677 static ssize_t domains_used_show(struct device *dev,
3678                                  struct device_attribute *attr, char *buf)
3679 {
3680         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3681         return sysfs_emit(buf, "%d\n",
3682                           bitmap_weight(iommu->domain_ids,
3683                                         cap_ndoms(iommu->cap)));
3684 }
3685 static DEVICE_ATTR_RO(domains_used);
3686
3687 static struct attribute *intel_iommu_attrs[] = {
3688         &dev_attr_version.attr,
3689         &dev_attr_address.attr,
3690         &dev_attr_cap.attr,
3691         &dev_attr_ecap.attr,
3692         &dev_attr_domains_supported.attr,
3693         &dev_attr_domains_used.attr,
3694         NULL,
3695 };
3696
3697 static struct attribute_group intel_iommu_group = {
3698         .name = "intel-iommu",
3699         .attrs = intel_iommu_attrs,
3700 };
3701
3702 const struct attribute_group *intel_iommu_groups[] = {
3703         &intel_iommu_group,
3704         NULL,
3705 };
3706
3707 static inline bool has_external_pci(void)
3708 {
3709         struct pci_dev *pdev = NULL;
3710
3711         for_each_pci_dev(pdev)
3712                 if (pdev->external_facing) {
3713                         pci_dev_put(pdev);
3714                         return true;
3715                 }
3716
3717         return false;
3718 }
3719
3720 static int __init platform_optin_force_iommu(void)
3721 {
3722         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3723                 return 0;
3724
3725         if (no_iommu || dmar_disabled)
3726                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3727
3728         /*
3729          * If Intel-IOMMU is disabled by default, we will apply identity
3730          * map for all devices except those marked as being untrusted.
3731          */
3732         if (dmar_disabled)
3733                 iommu_set_default_passthrough(false);
3734
3735         dmar_disabled = 0;
3736         no_iommu = 0;
3737
3738         return 1;
3739 }
3740
3741 static int __init probe_acpi_namespace_devices(void)
3742 {
3743         struct dmar_drhd_unit *drhd;
3744         /* To avoid a -Wunused-but-set-variable warning. */
3745         struct intel_iommu *iommu __maybe_unused;
3746         struct device *dev;
3747         int i, ret = 0;
3748
3749         for_each_active_iommu(iommu, drhd) {
3750                 for_each_active_dev_scope(drhd->devices,
3751                                           drhd->devices_cnt, i, dev) {
3752                         struct acpi_device_physical_node *pn;
3753                         struct acpi_device *adev;
3754
3755                         if (dev->bus != &acpi_bus_type)
3756                                 continue;
3757
3758                         adev = to_acpi_device(dev);
3759                         mutex_lock(&adev->physical_node_lock);
3760                         list_for_each_entry(pn,
3761                                             &adev->physical_node_list, node) {
3762                                 ret = iommu_probe_device(pn->dev);
3763                                 if (ret)
3764                                         break;
3765                         }
3766                         mutex_unlock(&adev->physical_node_lock);
3767
3768                         if (ret)
3769                                 return ret;
3770                 }
3771         }
3772
3773         return 0;
3774 }
3775
3776 static __init int tboot_force_iommu(void)
3777 {
3778         if (!tboot_enabled())
3779                 return 0;
3780
3781         if (no_iommu || dmar_disabled)
3782                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3783
3784         dmar_disabled = 0;
3785         no_iommu = 0;
3786
3787         return 1;
3788 }
3789
3790 int __init intel_iommu_init(void)
3791 {
3792         int ret = -ENODEV;
3793         struct dmar_drhd_unit *drhd;
3794         struct intel_iommu *iommu;
3795
3796         /*
3797          * Intel IOMMU is required for a TXT/tboot launch or platform
3798          * opt in, so enforce that.
3799          */
3800         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3801                     platform_optin_force_iommu();
3802
3803         down_write(&dmar_global_lock);
3804         if (dmar_table_init()) {
3805                 if (force_on)
3806                         panic("tboot: Failed to initialize DMAR table\n");
3807                 goto out_free_dmar;
3808         }
3809
3810         if (dmar_dev_scope_init() < 0) {
3811                 if (force_on)
3812                         panic("tboot: Failed to initialize DMAR device scope\n");
3813                 goto out_free_dmar;
3814         }
3815
3816         up_write(&dmar_global_lock);
3817
3818         /*
3819          * The bus notifier takes the dmar_global_lock, so lockdep will
3820          * complain later when we register it under the lock.
3821          */
3822         dmar_register_bus_notifier();
3823
3824         down_write(&dmar_global_lock);
3825
3826         if (!no_iommu)
3827                 intel_iommu_debugfs_init();
3828
3829         if (no_iommu || dmar_disabled) {
3830                 /*
3831                  * We exit the function here to ensure IOMMU's remapping and
3832                  * mempool aren't setup, which means that the IOMMU's PMRs
3833                  * won't be disabled via the call to init_dmars(). So disable
3834                  * it explicitly here. The PMRs were setup by tboot prior to
3835                  * calling SENTER, but the kernel is expected to reset/tear
3836                  * down the PMRs.
3837                  */
3838                 if (intel_iommu_tboot_noforce) {
3839                         for_each_iommu(iommu, drhd)
3840                                 iommu_disable_protect_mem_regions(iommu);
3841                 }
3842
3843                 /*
3844                  * Make sure the IOMMUs are switched off, even when we
3845                  * boot into a kexec kernel and the previous kernel left
3846                  * them enabled
3847                  */
3848                 intel_disable_iommus();
3849                 goto out_free_dmar;
3850         }
3851
3852         if (list_empty(&dmar_rmrr_units))
3853                 pr_info("No RMRR found\n");
3854
3855         if (list_empty(&dmar_atsr_units))
3856                 pr_info("No ATSR found\n");
3857
3858         if (list_empty(&dmar_satc_units))
3859                 pr_info("No SATC found\n");
3860
3861         init_no_remapping_devices();
3862
3863         ret = init_dmars();
3864         if (ret) {
3865                 if (force_on)
3866                         panic("tboot: Failed to initialize DMARs\n");
3867                 pr_err("Initialization failed\n");
3868                 goto out_free_dmar;
3869         }
3870         up_write(&dmar_global_lock);
3871
3872         init_iommu_pm_ops();
3873
3874         down_read(&dmar_global_lock);
3875         for_each_active_iommu(iommu, drhd) {
3876                 /*
3877                  * The flush queue implementation does not perform
3878                  * page-selective invalidations that are required for efficient
3879                  * TLB flushes in virtual environments.  The benefit of batching
3880                  * is likely to be much lower than the overhead of synchronizing
3881                  * the virtual and physical IOMMU page-tables.
3882                  */
3883                 if (cap_caching_mode(iommu->cap) &&
3884                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3885                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3886                         iommu_set_dma_strict();
3887                 }
3888                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3889                                        intel_iommu_groups,
3890                                        "%s", iommu->name);
3891                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3892
3893                 iommu_pmu_register(iommu);
3894         }
3895         up_read(&dmar_global_lock);
3896
3897         if (si_domain && !hw_pass_through)
3898                 register_memory_notifier(&intel_iommu_memory_nb);
3899
3900         down_read(&dmar_global_lock);
3901         if (probe_acpi_namespace_devices())
3902                 pr_warn("ACPI name space devices didn't probe correctly\n");
3903
3904         /* Finally, we enable the DMA remapping hardware. */
3905         for_each_iommu(iommu, drhd) {
3906                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3907                         iommu_enable_translation(iommu);
3908
3909                 iommu_disable_protect_mem_regions(iommu);
3910         }
3911         up_read(&dmar_global_lock);
3912
3913         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3914
3915         intel_iommu_enabled = 1;
3916
3917         return 0;
3918
3919 out_free_dmar:
3920         intel_iommu_free_dmars();
3921         up_write(&dmar_global_lock);
3922         return ret;
3923 }
3924
3925 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3926 {
3927         struct device_domain_info *info = opaque;
3928
3929         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3930         return 0;
3931 }
3932
3933 /*
3934  * NB - intel-iommu lacks any sort of reference counting for the users of
3935  * dependent devices.  If multiple endpoints have intersecting dependent
3936  * devices, unbinding the driver from any one of them will possibly leave
3937  * the others unable to operate.
3938  */
3939 static void domain_context_clear(struct device_domain_info *info)
3940 {
3941         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
3942                 return;
3943
3944         pci_for_each_dma_alias(to_pci_dev(info->dev),
3945                                &domain_context_clear_one_cb, info);
3946 }
3947
3948 static void dmar_remove_one_dev_info(struct device *dev)
3949 {
3950         struct device_domain_info *info = dev_iommu_priv_get(dev);
3951         struct dmar_domain *domain = info->domain;
3952         struct intel_iommu *iommu = info->iommu;
3953         unsigned long flags;
3954
3955         if (!dev_is_real_dma_subdevice(info->dev)) {
3956                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3957                         intel_pasid_tear_down_entry(iommu, info->dev,
3958                                         IOMMU_NO_PASID, false);
3959
3960                 iommu_disable_pci_caps(info);
3961                 domain_context_clear(info);
3962         }
3963
3964         spin_lock_irqsave(&domain->lock, flags);
3965         list_del(&info->link);
3966         spin_unlock_irqrestore(&domain->lock, flags);
3967
3968         domain_detach_iommu(domain, iommu);
3969         info->domain = NULL;
3970 }
3971
3972 /*
3973  * Clear the page table pointer in context or pasid table entries so that
3974  * all DMA requests without PASID from the device are blocked. If the page
3975  * table has been set, clean up the data structures.
3976  */
3977 static void device_block_translation(struct device *dev)
3978 {
3979         struct device_domain_info *info = dev_iommu_priv_get(dev);
3980         struct intel_iommu *iommu = info->iommu;
3981         unsigned long flags;
3982
3983         iommu_disable_pci_caps(info);
3984         if (!dev_is_real_dma_subdevice(dev)) {
3985                 if (sm_supported(iommu))
3986                         intel_pasid_tear_down_entry(iommu, dev,
3987                                                     IOMMU_NO_PASID, false);
3988                 else
3989                         domain_context_clear(info);
3990         }
3991
3992         if (!info->domain)
3993                 return;
3994
3995         spin_lock_irqsave(&info->domain->lock, flags);
3996         list_del(&info->link);
3997         spin_unlock_irqrestore(&info->domain->lock, flags);
3998
3999         domain_detach_iommu(info->domain, iommu);
4000         info->domain = NULL;
4001 }
4002
4003 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4004 {
4005         int adjust_width;
4006
4007         /* calculate AGAW */
4008         domain->gaw = guest_width;
4009         adjust_width = guestwidth_to_adjustwidth(guest_width);
4010         domain->agaw = width_to_agaw(adjust_width);
4011
4012         domain->iommu_coherency = false;
4013         domain->iommu_superpage = 0;
4014         domain->max_addr = 0;
4015
4016         /* always allocate the top pgd */
4017         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4018         if (!domain->pgd)
4019                 return -ENOMEM;
4020         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4021         return 0;
4022 }
4023
4024 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4025                                       struct device *dev)
4026 {
4027         device_block_translation(dev);
4028         return 0;
4029 }
4030
4031 static struct iommu_domain blocking_domain = {
4032         .ops = &(const struct iommu_domain_ops) {
4033                 .attach_dev     = blocking_domain_attach_dev,
4034                 .free           = intel_iommu_domain_free
4035         }
4036 };
4037
4038 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4039 {
4040         struct dmar_domain *dmar_domain;
4041         struct iommu_domain *domain;
4042
4043         switch (type) {
4044         case IOMMU_DOMAIN_BLOCKED:
4045                 return &blocking_domain;
4046         case IOMMU_DOMAIN_DMA:
4047         case IOMMU_DOMAIN_UNMANAGED:
4048                 dmar_domain = alloc_domain(type);
4049                 if (!dmar_domain) {
4050                         pr_err("Can't allocate dmar_domain\n");
4051                         return NULL;
4052                 }
4053                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4054                         pr_err("Domain initialization failed\n");
4055                         domain_exit(dmar_domain);
4056                         return NULL;
4057                 }
4058
4059                 domain = &dmar_domain->domain;
4060                 domain->geometry.aperture_start = 0;
4061                 domain->geometry.aperture_end   =
4062                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4063                 domain->geometry.force_aperture = true;
4064
4065                 return domain;
4066         case IOMMU_DOMAIN_IDENTITY:
4067                 return &si_domain->domain;
4068         case IOMMU_DOMAIN_SVA:
4069                 return intel_svm_domain_alloc();
4070         default:
4071                 return NULL;
4072         }
4073
4074         return NULL;
4075 }
4076
4077 static void intel_iommu_domain_free(struct iommu_domain *domain)
4078 {
4079         if (domain != &si_domain->domain && domain != &blocking_domain)
4080                 domain_exit(to_dmar_domain(domain));
4081 }
4082
4083 static int prepare_domain_attach_device(struct iommu_domain *domain,
4084                                         struct device *dev)
4085 {
4086         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4087         struct intel_iommu *iommu;
4088         int addr_width;
4089
4090         iommu = device_to_iommu(dev, NULL, NULL);
4091         if (!iommu)
4092                 return -ENODEV;
4093
4094         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4095                 return -EINVAL;
4096
4097         /* check if this iommu agaw is sufficient for max mapped address */
4098         addr_width = agaw_to_width(iommu->agaw);
4099         if (addr_width > cap_mgaw(iommu->cap))
4100                 addr_width = cap_mgaw(iommu->cap);
4101
4102         if (dmar_domain->max_addr > (1LL << addr_width))
4103                 return -EINVAL;
4104         dmar_domain->gaw = addr_width;
4105
4106         /*
4107          * Knock out extra levels of page tables if necessary
4108          */
4109         while (iommu->agaw < dmar_domain->agaw) {
4110                 struct dma_pte *pte;
4111
4112                 pte = dmar_domain->pgd;
4113                 if (dma_pte_present(pte)) {
4114                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4115                         free_pgtable_page(pte);
4116                 }
4117                 dmar_domain->agaw--;
4118         }
4119
4120         return 0;
4121 }
4122
4123 static int intel_iommu_attach_device(struct iommu_domain *domain,
4124                                      struct device *dev)
4125 {
4126         struct device_domain_info *info = dev_iommu_priv_get(dev);
4127         int ret;
4128
4129         if (info->domain)
4130                 device_block_translation(dev);
4131
4132         ret = prepare_domain_attach_device(domain, dev);
4133         if (ret)
4134                 return ret;
4135
4136         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4137 }
4138
4139 static int intel_iommu_map(struct iommu_domain *domain,
4140                            unsigned long iova, phys_addr_t hpa,
4141                            size_t size, int iommu_prot, gfp_t gfp)
4142 {
4143         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4144         u64 max_addr;
4145         int prot = 0;
4146
4147         if (iommu_prot & IOMMU_READ)
4148                 prot |= DMA_PTE_READ;
4149         if (iommu_prot & IOMMU_WRITE)
4150                 prot |= DMA_PTE_WRITE;
4151         if (dmar_domain->set_pte_snp)
4152                 prot |= DMA_PTE_SNP;
4153
4154         max_addr = iova + size;
4155         if (dmar_domain->max_addr < max_addr) {
4156                 u64 end;
4157
4158                 /* check if minimum agaw is sufficient for mapped address */
4159                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4160                 if (end < max_addr) {
4161                         pr_err("%s: iommu width (%d) is not "
4162                                "sufficient for the mapped address (%llx)\n",
4163                                __func__, dmar_domain->gaw, max_addr);
4164                         return -EFAULT;
4165                 }
4166                 dmar_domain->max_addr = max_addr;
4167         }
4168         /* Round up size to next multiple of PAGE_SIZE, if it and
4169            the low bits of hpa would take us onto the next page */
4170         size = aligned_nrpages(hpa, size);
4171         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4172                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4173 }
4174
4175 static int intel_iommu_map_pages(struct iommu_domain *domain,
4176                                  unsigned long iova, phys_addr_t paddr,
4177                                  size_t pgsize, size_t pgcount,
4178                                  int prot, gfp_t gfp, size_t *mapped)
4179 {
4180         unsigned long pgshift = __ffs(pgsize);
4181         size_t size = pgcount << pgshift;
4182         int ret;
4183
4184         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4185                 return -EINVAL;
4186
4187         if (!IS_ALIGNED(iova | paddr, pgsize))
4188                 return -EINVAL;
4189
4190         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4191         if (!ret && mapped)
4192                 *mapped = size;
4193
4194         return ret;
4195 }
4196
4197 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4198                                 unsigned long iova, size_t size,
4199                                 struct iommu_iotlb_gather *gather)
4200 {
4201         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4202         unsigned long start_pfn, last_pfn;
4203         int level = 0;
4204
4205         /* Cope with horrid API which requires us to unmap more than the
4206            size argument if it happens to be a large-page mapping. */
4207         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4208                                      &level, GFP_ATOMIC)))
4209                 return 0;
4210
4211         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4212                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4213
4214         start_pfn = iova >> VTD_PAGE_SHIFT;
4215         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4216
4217         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4218
4219         if (dmar_domain->max_addr == iova + size)
4220                 dmar_domain->max_addr = iova;
4221
4222         /*
4223          * We do not use page-selective IOTLB invalidation in flush queue,
4224          * so there is no need to track page and sync iotlb.
4225          */
4226         if (!iommu_iotlb_gather_queued(gather))
4227                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4228
4229         return size;
4230 }
4231
4232 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4233                                       unsigned long iova,
4234                                       size_t pgsize, size_t pgcount,
4235                                       struct iommu_iotlb_gather *gather)
4236 {
4237         unsigned long pgshift = __ffs(pgsize);
4238         size_t size = pgcount << pgshift;
4239
4240         return intel_iommu_unmap(domain, iova, size, gather);
4241 }
4242
4243 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4244                                  struct iommu_iotlb_gather *gather)
4245 {
4246         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4247         unsigned long iova_pfn = IOVA_PFN(gather->start);
4248         size_t size = gather->end - gather->start;
4249         struct iommu_domain_info *info;
4250         unsigned long start_pfn;
4251         unsigned long nrpages;
4252         unsigned long i;
4253
4254         nrpages = aligned_nrpages(gather->start, size);
4255         start_pfn = mm_to_dma_pfn_start(iova_pfn);
4256
4257         xa_for_each(&dmar_domain->iommu_array, i, info)
4258                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4259                                       start_pfn, nrpages,
4260                                       list_empty(&gather->freelist), 0);
4261
4262         put_pages_list(&gather->freelist);
4263 }
4264
4265 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4266                                             dma_addr_t iova)
4267 {
4268         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4269         struct dma_pte *pte;
4270         int level = 0;
4271         u64 phys = 0;
4272
4273         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4274                              GFP_ATOMIC);
4275         if (pte && dma_pte_present(pte))
4276                 phys = dma_pte_addr(pte) +
4277                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4278                                                 VTD_PAGE_SHIFT) - 1));
4279
4280         return phys;
4281 }
4282
4283 static bool domain_support_force_snooping(struct dmar_domain *domain)
4284 {
4285         struct device_domain_info *info;
4286         bool support = true;
4287
4288         assert_spin_locked(&domain->lock);
4289         list_for_each_entry(info, &domain->devices, link) {
4290                 if (!ecap_sc_support(info->iommu->ecap)) {
4291                         support = false;
4292                         break;
4293                 }
4294         }
4295
4296         return support;
4297 }
4298
4299 static void domain_set_force_snooping(struct dmar_domain *domain)
4300 {
4301         struct device_domain_info *info;
4302
4303         assert_spin_locked(&domain->lock);
4304         /*
4305          * Second level page table supports per-PTE snoop control. The
4306          * iommu_map() interface will handle this by setting SNP bit.
4307          */
4308         if (!domain->use_first_level) {
4309                 domain->set_pte_snp = true;
4310                 return;
4311         }
4312
4313         list_for_each_entry(info, &domain->devices, link)
4314                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4315                                                      IOMMU_NO_PASID);
4316 }
4317
4318 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4319 {
4320         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4321         unsigned long flags;
4322
4323         if (dmar_domain->force_snooping)
4324                 return true;
4325
4326         spin_lock_irqsave(&dmar_domain->lock, flags);
4327         if (!domain_support_force_snooping(dmar_domain)) {
4328                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4329                 return false;
4330         }
4331
4332         domain_set_force_snooping(dmar_domain);
4333         dmar_domain->force_snooping = true;
4334         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4335
4336         return true;
4337 }
4338
4339 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4340 {
4341         struct device_domain_info *info = dev_iommu_priv_get(dev);
4342
4343         switch (cap) {
4344         case IOMMU_CAP_CACHE_COHERENCY:
4345         case IOMMU_CAP_DEFERRED_FLUSH:
4346                 return true;
4347         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4348                 return dmar_platform_optin();
4349         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4350                 return ecap_sc_support(info->iommu->ecap);
4351         default:
4352                 return false;
4353         }
4354 }
4355
4356 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4357 {
4358         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4359         struct device_domain_info *info;
4360         struct intel_iommu *iommu;
4361         u8 bus, devfn;
4362         int ret;
4363
4364         iommu = device_to_iommu(dev, &bus, &devfn);
4365         if (!iommu || !iommu->iommu.ops)
4366                 return ERR_PTR(-ENODEV);
4367
4368         info = kzalloc(sizeof(*info), GFP_KERNEL);
4369         if (!info)
4370                 return ERR_PTR(-ENOMEM);
4371
4372         if (dev_is_real_dma_subdevice(dev)) {
4373                 info->bus = pdev->bus->number;
4374                 info->devfn = pdev->devfn;
4375                 info->segment = pci_domain_nr(pdev->bus);
4376         } else {
4377                 info->bus = bus;
4378                 info->devfn = devfn;
4379                 info->segment = iommu->segment;
4380         }
4381
4382         info->dev = dev;
4383         info->iommu = iommu;
4384         if (dev_is_pci(dev)) {
4385                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4386                     pci_ats_supported(pdev) &&
4387                     dmar_ats_supported(pdev, iommu)) {
4388                         info->ats_supported = 1;
4389                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4390
4391                         /*
4392                          * For IOMMU that supports device IOTLB throttling
4393                          * (DIT), we assign PFSID to the invalidation desc
4394                          * of a VF such that IOMMU HW can gauge queue depth
4395                          * at PF level. If DIT is not set, PFSID will be
4396                          * treated as reserved, which should be set to 0.
4397                          */
4398                         if (ecap_dit(iommu->ecap))
4399                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4400                         info->ats_qdep = pci_ats_queue_depth(pdev);
4401                 }
4402                 if (sm_supported(iommu)) {
4403                         if (pasid_supported(iommu)) {
4404                                 int features = pci_pasid_features(pdev);
4405
4406                                 if (features >= 0)
4407                                         info->pasid_supported = features | 1;
4408                         }
4409
4410                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4411                             pci_pri_supported(pdev))
4412                                 info->pri_supported = 1;
4413                 }
4414         }
4415
4416         dev_iommu_priv_set(dev, info);
4417
4418         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4419                 ret = intel_pasid_alloc_table(dev);
4420                 if (ret) {
4421                         dev_err(dev, "PASID table allocation failed\n");
4422                         dev_iommu_priv_set(dev, NULL);
4423                         kfree(info);
4424                         return ERR_PTR(ret);
4425                 }
4426         }
4427
4428         return &iommu->iommu;
4429 }
4430
4431 static void intel_iommu_release_device(struct device *dev)
4432 {
4433         struct device_domain_info *info = dev_iommu_priv_get(dev);
4434
4435         dmar_remove_one_dev_info(dev);
4436         intel_pasid_free_table(dev);
4437         dev_iommu_priv_set(dev, NULL);
4438         kfree(info);
4439         set_dma_ops(dev, NULL);
4440 }
4441
4442 static void intel_iommu_probe_finalize(struct device *dev)
4443 {
4444         set_dma_ops(dev, NULL);
4445         iommu_setup_dma_ops(dev, 0, U64_MAX);
4446 }
4447
4448 static void intel_iommu_get_resv_regions(struct device *device,
4449                                          struct list_head *head)
4450 {
4451         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4452         struct iommu_resv_region *reg;
4453         struct dmar_rmrr_unit *rmrr;
4454         struct device *i_dev;
4455         int i;
4456
4457         rcu_read_lock();
4458         for_each_rmrr_units(rmrr) {
4459                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4460                                           i, i_dev) {
4461                         struct iommu_resv_region *resv;
4462                         enum iommu_resv_type type;
4463                         size_t length;
4464
4465                         if (i_dev != device &&
4466                             !is_downstream_to_pci_bridge(device, i_dev))
4467                                 continue;
4468
4469                         length = rmrr->end_address - rmrr->base_address + 1;
4470
4471                         type = device_rmrr_is_relaxable(device) ?
4472                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4473
4474                         resv = iommu_alloc_resv_region(rmrr->base_address,
4475                                                        length, prot, type,
4476                                                        GFP_ATOMIC);
4477                         if (!resv)
4478                                 break;
4479
4480                         list_add_tail(&resv->list, head);
4481                 }
4482         }
4483         rcu_read_unlock();
4484
4485 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4486         if (dev_is_pci(device)) {
4487                 struct pci_dev *pdev = to_pci_dev(device);
4488
4489                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4490                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4491                                         IOMMU_RESV_DIRECT_RELAXABLE,
4492                                         GFP_KERNEL);
4493                         if (reg)
4494                                 list_add_tail(&reg->list, head);
4495                 }
4496         }
4497 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4498
4499         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4500                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4501                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4502         if (!reg)
4503                 return;
4504         list_add_tail(&reg->list, head);
4505 }
4506
4507 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4508 {
4509         if (dev_is_pci(dev))
4510                 return pci_device_group(dev);
4511         return generic_device_group(dev);
4512 }
4513
4514 static int intel_iommu_enable_sva(struct device *dev)
4515 {
4516         struct device_domain_info *info = dev_iommu_priv_get(dev);
4517         struct intel_iommu *iommu;
4518
4519         if (!info || dmar_disabled)
4520                 return -EINVAL;
4521
4522         iommu = info->iommu;
4523         if (!iommu)
4524                 return -EINVAL;
4525
4526         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4527                 return -ENODEV;
4528
4529         if (!info->pasid_enabled || !info->ats_enabled)
4530                 return -EINVAL;
4531
4532         /*
4533          * Devices having device-specific I/O fault handling should not
4534          * support PCI/PRI. The IOMMU side has no means to check the
4535          * capability of device-specific IOPF.  Therefore, IOMMU can only
4536          * default that if the device driver enables SVA on a non-PRI
4537          * device, it will handle IOPF in its own way.
4538          */
4539         if (!info->pri_supported)
4540                 return 0;
4541
4542         /* Devices supporting PRI should have it enabled. */
4543         if (!info->pri_enabled)
4544                 return -EINVAL;
4545
4546         return 0;
4547 }
4548
4549 static int intel_iommu_enable_iopf(struct device *dev)
4550 {
4551         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4552         struct device_domain_info *info = dev_iommu_priv_get(dev);
4553         struct intel_iommu *iommu;
4554         int ret;
4555
4556         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4557                 return -ENODEV;
4558
4559         if (info->pri_enabled)
4560                 return -EBUSY;
4561
4562         iommu = info->iommu;
4563         if (!iommu)
4564                 return -EINVAL;
4565
4566         /* PASID is required in PRG Response Message. */
4567         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4568                 return -EINVAL;
4569
4570         ret = pci_reset_pri(pdev);
4571         if (ret)
4572                 return ret;
4573
4574         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4575         if (ret)
4576                 return ret;
4577
4578         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4579         if (ret)
4580                 goto iopf_remove_device;
4581
4582         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4583         if (ret)
4584                 goto iopf_unregister_handler;
4585         info->pri_enabled = 1;
4586
4587         return 0;
4588
4589 iopf_unregister_handler:
4590         iommu_unregister_device_fault_handler(dev);
4591 iopf_remove_device:
4592         iopf_queue_remove_device(iommu->iopf_queue, dev);
4593
4594         return ret;
4595 }
4596
4597 static int intel_iommu_disable_iopf(struct device *dev)
4598 {
4599         struct device_domain_info *info = dev_iommu_priv_get(dev);
4600         struct intel_iommu *iommu = info->iommu;
4601
4602         if (!info->pri_enabled)
4603                 return -EINVAL;
4604
4605         /*
4606          * PCIe spec states that by clearing PRI enable bit, the Page
4607          * Request Interface will not issue new page requests, but has
4608          * outstanding page requests that have been transmitted or are
4609          * queued for transmission. This is supposed to be called after
4610          * the device driver has stopped DMA, all PASIDs have been
4611          * unbound and the outstanding PRQs have been drained.
4612          */
4613         pci_disable_pri(to_pci_dev(dev));
4614         info->pri_enabled = 0;
4615
4616         /*
4617          * With PRI disabled and outstanding PRQs drained, unregistering
4618          * fault handler and removing device from iopf queue should never
4619          * fail.
4620          */
4621         WARN_ON(iommu_unregister_device_fault_handler(dev));
4622         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4623
4624         return 0;
4625 }
4626
4627 static int
4628 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4629 {
4630         switch (feat) {
4631         case IOMMU_DEV_FEAT_IOPF:
4632                 return intel_iommu_enable_iopf(dev);
4633
4634         case IOMMU_DEV_FEAT_SVA:
4635                 return intel_iommu_enable_sva(dev);
4636
4637         default:
4638                 return -ENODEV;
4639         }
4640 }
4641
4642 static int
4643 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4644 {
4645         switch (feat) {
4646         case IOMMU_DEV_FEAT_IOPF:
4647                 return intel_iommu_disable_iopf(dev);
4648
4649         case IOMMU_DEV_FEAT_SVA:
4650                 return 0;
4651
4652         default:
4653                 return -ENODEV;
4654         }
4655 }
4656
4657 static bool intel_iommu_is_attach_deferred(struct device *dev)
4658 {
4659         struct device_domain_info *info = dev_iommu_priv_get(dev);
4660
4661         return translation_pre_enabled(info->iommu) && !info->domain;
4662 }
4663
4664 /*
4665  * Check that the device does not live on an external facing PCI port that is
4666  * marked as untrusted. Such devices should not be able to apply quirks and
4667  * thus not be able to bypass the IOMMU restrictions.
4668  */
4669 static bool risky_device(struct pci_dev *pdev)
4670 {
4671         if (pdev->untrusted) {
4672                 pci_info(pdev,
4673                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4674                          pdev->vendor, pdev->device);
4675                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4676                 return true;
4677         }
4678         return false;
4679 }
4680
4681 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4682                                        unsigned long iova, size_t size)
4683 {
4684         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4685         unsigned long pages = aligned_nrpages(iova, size);
4686         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4687         struct iommu_domain_info *info;
4688         unsigned long i;
4689
4690         xa_for_each(&dmar_domain->iommu_array, i, info)
4691                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4692 }
4693
4694 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4695 {
4696         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4697         struct dev_pasid_info *curr, *dev_pasid = NULL;
4698         struct dmar_domain *dmar_domain;
4699         struct iommu_domain *domain;
4700         unsigned long flags;
4701
4702         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4703         if (WARN_ON_ONCE(!domain))
4704                 goto out_tear_down;
4705
4706         /*
4707          * The SVA implementation needs to handle its own stuffs like the mm
4708          * notification. Before consolidating that code into iommu core, let
4709          * the intel sva code handle it.
4710          */
4711         if (domain->type == IOMMU_DOMAIN_SVA) {
4712                 intel_svm_remove_dev_pasid(dev, pasid);
4713                 goto out_tear_down;
4714         }
4715
4716         dmar_domain = to_dmar_domain(domain);
4717         spin_lock_irqsave(&dmar_domain->lock, flags);
4718         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4719                 if (curr->dev == dev && curr->pasid == pasid) {
4720                         list_del(&curr->link_domain);
4721                         dev_pasid = curr;
4722                         break;
4723                 }
4724         }
4725         WARN_ON_ONCE(!dev_pasid);
4726         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4727
4728         domain_detach_iommu(dmar_domain, iommu);
4729         kfree(dev_pasid);
4730 out_tear_down:
4731         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4732         intel_drain_pasid_prq(dev, pasid);
4733 }
4734
4735 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4736                                      struct device *dev, ioasid_t pasid)
4737 {
4738         struct device_domain_info *info = dev_iommu_priv_get(dev);
4739         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4740         struct intel_iommu *iommu = info->iommu;
4741         struct dev_pasid_info *dev_pasid;
4742         unsigned long flags;
4743         int ret;
4744
4745         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4746                 return -EOPNOTSUPP;
4747
4748         if (context_copied(iommu, info->bus, info->devfn))
4749                 return -EBUSY;
4750
4751         ret = prepare_domain_attach_device(domain, dev);
4752         if (ret)
4753                 return ret;
4754
4755         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4756         if (!dev_pasid)
4757                 return -ENOMEM;
4758
4759         ret = domain_attach_iommu(dmar_domain, iommu);
4760         if (ret)
4761                 goto out_free;
4762
4763         if (domain_type_is_si(dmar_domain))
4764                 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4765                                                      dev, pasid);
4766         else if (dmar_domain->use_first_level)
4767                 ret = domain_setup_first_level(iommu, dmar_domain,
4768                                                dev, pasid);
4769         else
4770                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4771                                                      dev, pasid);
4772         if (ret)
4773                 goto out_detach_iommu;
4774
4775         dev_pasid->dev = dev;
4776         dev_pasid->pasid = pasid;
4777         spin_lock_irqsave(&dmar_domain->lock, flags);
4778         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4779         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4780
4781         return 0;
4782 out_detach_iommu:
4783         domain_detach_iommu(dmar_domain, iommu);
4784 out_free:
4785         kfree(dev_pasid);
4786         return ret;
4787 }
4788
4789 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4790 {
4791         struct device_domain_info *info = dev_iommu_priv_get(dev);
4792         struct intel_iommu *iommu = info->iommu;
4793         struct iommu_hw_info_vtd *vtd;
4794
4795         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4796         if (!vtd)
4797                 return ERR_PTR(-ENOMEM);
4798
4799         vtd->cap_reg = iommu->cap;
4800         vtd->ecap_reg = iommu->ecap;
4801         *length = sizeof(*vtd);
4802         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4803         return vtd;
4804 }
4805
4806 const struct iommu_ops intel_iommu_ops = {
4807         .capable                = intel_iommu_capable,
4808         .hw_info                = intel_iommu_hw_info,
4809         .domain_alloc           = intel_iommu_domain_alloc,
4810         .probe_device           = intel_iommu_probe_device,
4811         .probe_finalize         = intel_iommu_probe_finalize,
4812         .release_device         = intel_iommu_release_device,
4813         .get_resv_regions       = intel_iommu_get_resv_regions,
4814         .device_group           = intel_iommu_device_group,
4815         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4816         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4817         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4818         .def_domain_type        = device_def_domain_type,
4819         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4820         .pgsize_bitmap          = SZ_4K,
4821 #ifdef CONFIG_INTEL_IOMMU_SVM
4822         .page_response          = intel_svm_page_response,
4823 #endif
4824         .default_domain_ops = &(const struct iommu_domain_ops) {
4825                 .attach_dev             = intel_iommu_attach_device,
4826                 .set_dev_pasid          = intel_iommu_set_dev_pasid,
4827                 .map_pages              = intel_iommu_map_pages,
4828                 .unmap_pages            = intel_iommu_unmap_pages,
4829                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4830                 .flush_iotlb_all        = intel_flush_iotlb_all,
4831                 .iotlb_sync             = intel_iommu_tlb_sync,
4832                 .iova_to_phys           = intel_iommu_iova_to_phys,
4833                 .free                   = intel_iommu_domain_free,
4834                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4835         }
4836 };
4837
4838 static void quirk_iommu_igfx(struct pci_dev *dev)
4839 {
4840         if (risky_device(dev))
4841                 return;
4842
4843         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4844         dmar_map_gfx = 0;
4845 }
4846
4847 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4855
4856 /* Broadwell igfx malfunctions with dmar */
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4869 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4870 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4871 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4872 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4873 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4874 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4875 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4880 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4881
4882 static void quirk_iommu_rwbf(struct pci_dev *dev)
4883 {
4884         if (risky_device(dev))
4885                 return;
4886
4887         /*
4888          * Mobile 4 Series Chipset neglects to set RWBF capability,
4889          * but needs it. Same seems to hold for the desktop versions.
4890          */
4891         pci_info(dev, "Forcing write-buffer flush capability\n");
4892         rwbf_quirk = 1;
4893 }
4894
4895 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4896 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4897 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4898 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4899 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4901 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4902
4903 #define GGC 0x52
4904 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4905 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4906 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4907 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4908 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4909 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4910 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4911 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4912
4913 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4914 {
4915         unsigned short ggc;
4916
4917         if (risky_device(dev))
4918                 return;
4919
4920         if (pci_read_config_word(dev, GGC, &ggc))
4921                 return;
4922
4923         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4924                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4925                 dmar_map_gfx = 0;
4926         } else if (dmar_map_gfx) {
4927                 /* we have to ensure the gfx device is idle before we flush */
4928                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4929                 iommu_set_dma_strict();
4930         }
4931 }
4932 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4933 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4934 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4935 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4936
4937 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4938 {
4939         unsigned short ver;
4940
4941         if (!IS_GFX_DEVICE(dev))
4942                 return;
4943
4944         ver = (dev->device >> 8) & 0xff;
4945         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4946             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4947             ver != 0x9a && ver != 0xa7)
4948                 return;
4949
4950         if (risky_device(dev))
4951                 return;
4952
4953         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4954         iommu_skip_te_disable = 1;
4955 }
4956 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4957
4958 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4959    ISOCH DMAR unit for the Azalia sound device, but not give it any
4960    TLB entries, which causes it to deadlock. Check for that.  We do
4961    this in a function called from init_dmars(), instead of in a PCI
4962    quirk, because we don't want to print the obnoxious "BIOS broken"
4963    message if VT-d is actually disabled.
4964 */
4965 static void __init check_tylersburg_isoch(void)
4966 {
4967         struct pci_dev *pdev;
4968         uint32_t vtisochctrl;
4969
4970         /* If there's no Azalia in the system anyway, forget it. */
4971         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4972         if (!pdev)
4973                 return;
4974
4975         if (risky_device(pdev)) {
4976                 pci_dev_put(pdev);
4977                 return;
4978         }
4979
4980         pci_dev_put(pdev);
4981
4982         /* System Management Registers. Might be hidden, in which case
4983            we can't do the sanity check. But that's OK, because the
4984            known-broken BIOSes _don't_ actually hide it, so far. */
4985         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4986         if (!pdev)
4987                 return;
4988
4989         if (risky_device(pdev)) {
4990                 pci_dev_put(pdev);
4991                 return;
4992         }
4993
4994         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4995                 pci_dev_put(pdev);
4996                 return;
4997         }
4998
4999         pci_dev_put(pdev);
5000
5001         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5002         if (vtisochctrl & 1)
5003                 return;
5004
5005         /* Drop all bits other than the number of TLB entries */
5006         vtisochctrl &= 0x1c;
5007
5008         /* If we have the recommended number of TLB entries (16), fine. */
5009         if (vtisochctrl == 0x10)
5010                 return;
5011
5012         /* Zero TLB entries? You get to ride the short bus to school. */
5013         if (!vtisochctrl) {
5014                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5015                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5016                      dmi_get_system_info(DMI_BIOS_VENDOR),
5017                      dmi_get_system_info(DMI_BIOS_VERSION),
5018                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5019                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5020                 return;
5021         }
5022
5023         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5024                vtisochctrl);
5025 }
5026
5027 /*
5028  * Here we deal with a device TLB defect where device may inadvertently issue ATS
5029  * invalidation completion before posted writes initiated with translated address
5030  * that utilized translations matching the invalidation address range, violating
5031  * the invalidation completion ordering.
5032  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
5033  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5034  * under the control of the trusted/privileged host device driver must use this
5035  * quirk.
5036  * Device TLBs are invalidated under the following six conditions:
5037  * 1. Device driver does DMA API unmap IOVA
5038  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5039  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5040  *    exit_mmap() due to crash
5041  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5042  *    VM has to free pages that were unmapped
5043  * 5. Userspace driver unmaps a DMA buffer
5044  * 6. Cache invalidation in vSVA usage (upcoming)
5045  *
5046  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5047  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5048  * invalidate TLB the same way as normal user unmap which will use this quirk.
5049  * The dTLB invalidation after PASID cache flush does not need this quirk.
5050  *
5051  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5052  */
5053 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5054                                unsigned long address, unsigned long mask,
5055                                u32 pasid, u16 qdep)
5056 {
5057         u16 sid;
5058
5059         if (likely(!info->dtlb_extra_inval))
5060                 return;
5061
5062         sid = PCI_DEVID(info->bus, info->devfn);
5063         if (pasid == IOMMU_NO_PASID) {
5064                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5065                                    qdep, address, mask);
5066         } else {
5067                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5068                                          pasid, qdep, address, mask);
5069         }
5070 }
5071
5072 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
5073
5074 /*
5075  * Function to submit a command to the enhanced command interface. The
5076  * valid enhanced command descriptions are defined in Table 47 of the
5077  * VT-d spec. The VT-d hardware implementation may support some but not
5078  * all commands, which can be determined by checking the Enhanced
5079  * Command Capability Register.
5080  *
5081  * Return values:
5082  *  - 0: Command successful without any error;
5083  *  - Negative: software error value;
5084  *  - Nonzero positive: failure status code defined in Table 48.
5085  */
5086 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5087 {
5088         unsigned long flags;
5089         u64 res;
5090         int ret;
5091
5092         if (!cap_ecmds(iommu->cap))
5093                 return -ENODEV;
5094
5095         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5096
5097         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5098         if (res & DMA_ECMD_ECRSP_IP) {
5099                 ret = -EBUSY;
5100                 goto err;
5101         }
5102
5103         /*
5104          * Unconditionally write the operand B, because
5105          * - There is no side effect if an ecmd doesn't require an
5106          *   operand B, but we set the register to some value.
5107          * - It's not invoked in any critical path. The extra MMIO
5108          *   write doesn't bring any performance concerns.
5109          */
5110         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5111         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5112
5113         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5114                       !(res & DMA_ECMD_ECRSP_IP), res);
5115
5116         if (res & DMA_ECMD_ECRSP_IP) {
5117                 ret = -ETIMEDOUT;
5118                 goto err;
5119         }
5120
5121         ret = ecmd_get_status_code(res);
5122 err:
5123         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5124
5125         return ret;
5126 }