iommu/vt-d: Support enforce_cache_coherency only for empty domains
[platform/kernel/linux-rpi.git] drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/memory.h>
20 #include <linux/pci.h>
21 #include <linux/pci-ats.h>
22 #include <linux/spinlock.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/tboot.h>
25 #include <uapi/linux/iommufd.h>
26
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 #include "perfmon.h"
34
35 #define ROOT_SIZE               VTD_PAGE_SIZE
36 #define CONTEXT_SIZE            VTD_PAGE_SIZE
37
38 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43 #define IOAPIC_RANGE_START      (0xfee00000)
44 #define IOAPIC_RANGE_END        (0xfeefffff)
45 #define IOVA_START_ADDR         (0x1000)
46
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49 #define MAX_AGAW_WIDTH 64
50 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
51
52 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
53 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
54
55 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
56    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
57 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
58                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
59 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
60
61 /* IO virtual address start page frame number */
62 #define IOVA_START_PFN          (1)
63
64 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
65
66 /* page table handling */
67 #define LEVEL_STRIDE            (9)
68 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
69
70 static inline int agaw_to_level(int agaw)
71 {
72         return agaw + 2;
73 }
74
75 static inline int agaw_to_width(int agaw)
76 {
77         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
78 }
79
80 static inline int width_to_agaw(int width)
81 {
82         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
83 }
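
/*
 * Reference note (not used by the code): with LEVEL_STRIDE of 9 bits on a
 * 30-bit base, AGAW 1 corresponds to a 39-bit address width and a 3-level
 * page table, AGAW 2 to 48 bits / 4 levels, and AGAW 3 to 57 bits / 5
 * levels, which is why agaw_to_level() simply returns agaw + 2.
 */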
84
85 static inline unsigned int level_to_offset_bits(int level)
86 {
87         return (level - 1) * LEVEL_STRIDE;
88 }
89
90 static inline int pfn_level_offset(u64 pfn, int level)
91 {
92         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
93 }
94
95 static inline u64 level_mask(int level)
96 {
97         return -1ULL << level_to_offset_bits(level);
98 }
99
100 static inline u64 level_size(int level)
101 {
102         return 1ULL << level_to_offset_bits(level);
103 }
104
105 static inline u64 align_to_level(u64 pfn, int level)
106 {
107         return (pfn + level_size(level) - 1) & level_mask(level);
108 }
109
110 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
111 {
112         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
113 }
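
/*
 * Worked example for the level helpers above: a level-1 PTE maps a single
 * 4KiB VT-d page, a level-2 entry covers 512 pages (2MiB) and a level-3
 * entry covers 512 * 512 pages (1GiB), since each level contributes
 * LEVEL_STRIDE (9) bits of the page frame number.
 */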
114
115 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
116    are never going to work. */
117 static inline unsigned long mm_to_dma_pfn_start(unsigned long mm_pfn)
118 {
119         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
120 }
121 static inline unsigned long mm_to_dma_pfn_end(unsigned long mm_pfn)
122 {
123         return ((mm_pfn + 1) << (PAGE_SHIFT - VTD_PAGE_SHIFT)) - 1;
124 }
125 static inline unsigned long page_to_dma_pfn(struct page *pg)
126 {
127         return mm_to_dma_pfn_start(page_to_pfn(pg));
128 }
129 static inline unsigned long virt_to_dma_pfn(void *p)
130 {
131         return page_to_dma_pfn(virt_to_page(p));
132 }
133
134 static void __init check_tylersburg_isoch(void);
135 static int rwbf_quirk;
136
137 /*
138  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
139  * (used when the kernel is launched with TXT)
140  */
141 static int force_on = 0;
142 static int intel_iommu_tboot_noforce;
143 static int no_platform_optin;
144
145 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
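/*
 * Reference note: with a 4KiB root table and 16-byte root entries (struct
 * root_entry holds two u64 words) this works out to 256 entries, one per
 * PCI bus.
 */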
146
147 /*
148  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
149  * if marked present.
150  */
151 static phys_addr_t root_entry_lctp(struct root_entry *re)
152 {
153         if (!(re->lo & 1))
154                 return 0;
155
156         return re->lo & VTD_PAGE_MASK;
157 }
158
159 /*
160  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
161  * if marked present.
162  */
163 static phys_addr_t root_entry_uctp(struct root_entry *re)
164 {
165         if (!(re->hi & 1))
166                 return 0;
167
168         return re->hi & VTD_PAGE_MASK;
169 }
170
171 static inline void context_set_present(struct context_entry *context)
172 {
173         context->lo |= 1;
174 }
175
176 static inline void context_set_fault_enable(struct context_entry *context)
177 {
178         context->lo &= (((u64)-1) << 2) | 1;
179 }
180
181 static inline void context_set_translation_type(struct context_entry *context,
182                                                 unsigned long value)
183 {
184         context->lo &= (((u64)-1) << 4) | 3;
185         context->lo |= (value & 3) << 2;
186 }
187
188 static inline void context_set_address_root(struct context_entry *context,
189                                             unsigned long value)
190 {
191         context->lo &= ~VTD_PAGE_MASK;
192         context->lo |= value & VTD_PAGE_MASK;
193 }
194
195 static inline void context_set_address_width(struct context_entry *context,
196                                              unsigned long value)
197 {
198         context->hi |= value & 7;
199 }
200
201 static inline void context_set_domain_id(struct context_entry *context,
202                                          unsigned long value)
203 {
204         context->hi |= (value & ((1 << 16) - 1)) << 8;
205 }
206
207 static inline void context_set_pasid(struct context_entry *context)
208 {
209         context->lo |= CONTEXT_PASIDE;
210 }
211
212 static inline int context_domain_id(struct context_entry *c)
213 {
214         return((c->hi >> 8) & 0xffff);
215 }
216
217 static inline void context_clear_entry(struct context_entry *context)
218 {
219         context->lo = 0;
220         context->hi = 0;
221 }
222
223 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
224 {
225         if (!iommu->copied_tables)
226                 return false;
227
228         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
229 }
230
231 static inline void
232 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
235 }
236
237 static inline void
238 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
239 {
240         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
241 }
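
/*
 * Note on the helpers above: iommu->copied_tables is a bitmap indexed by the
 * 16-bit source-id ((bus << 8) | devfn).  It tracks context entries that were
 * inherited from a previous kernel (e.g. across a kdump/kexec handover), so
 * that they are not blindly reused as if this kernel had created them.
 */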
242
243 /*
244  * This domain is a static identity mapping domain.
245  *      1. This domain creates a static 1:1 mapping to all usable memory.
246  *      2. It maps to each iommu if successful.
247  *      3. Each iommu maps to this domain if successful.
248  */
249 static struct dmar_domain *si_domain;
250 static int hw_pass_through = 1;
251
252 struct dmar_rmrr_unit {
253         struct list_head list;          /* list of rmrr units   */
254         struct acpi_dmar_header *hdr;   /* ACPI header          */
255         u64     base_address;           /* reserved base address*/
256         u64     end_address;            /* reserved end address */
257         struct dmar_dev_scope *devices; /* target devices */
258         int     devices_cnt;            /* target device count */
259 };
260
261 struct dmar_atsr_unit {
262         struct list_head list;          /* list of ATSR units */
263         struct acpi_dmar_header *hdr;   /* ACPI header */
264         struct dmar_dev_scope *devices; /* target devices */
265         int devices_cnt;                /* target device count */
266         u8 include_all:1;               /* include all ports */
267 };
268
269 struct dmar_satc_unit {
270         struct list_head list;          /* list of SATC units */
271         struct acpi_dmar_header *hdr;   /* ACPI header */
272         struct dmar_dev_scope *devices; /* target devices */
273         struct intel_iommu *iommu;      /* the corresponding iommu */
274         int devices_cnt;                /* target device count */
275         u8 atc_required:1;              /* ATS is required */
276 };
277
278 static LIST_HEAD(dmar_atsr_units);
279 static LIST_HEAD(dmar_rmrr_units);
280 static LIST_HEAD(dmar_satc_units);
281
282 #define for_each_rmrr_units(rmrr) \
283         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
284
285 static void device_block_translation(struct device *dev);
286 static void intel_iommu_domain_free(struct iommu_domain *domain);
287
288 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
289 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
290
291 int intel_iommu_enabled = 0;
292 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
293
294 static int dmar_map_gfx = 1;
295 static int intel_iommu_superpage = 1;
296 static int iommu_identity_mapping;
297 static int iommu_skip_te_disable;
298
299 #define IDENTMAP_GFX            2
300 #define IDENTMAP_AZALIA         4
301
302 const struct iommu_ops intel_iommu_ops;
303
304 static bool translation_pre_enabled(struct intel_iommu *iommu)
305 {
306         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
307 }
308
309 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
310 {
311         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
312 }
313
314 static void init_translation_status(struct intel_iommu *iommu)
315 {
316         u32 gsts;
317
318         gsts = readl(iommu->reg + DMAR_GSTS_REG);
319         if (gsts & DMA_GSTS_TES)
320                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
321 }
322
323 static int __init intel_iommu_setup(char *str)
324 {
325         if (!str)
326                 return -EINVAL;
327
328         while (*str) {
329                 if (!strncmp(str, "on", 2)) {
330                         dmar_disabled = 0;
331                         pr_info("IOMMU enabled\n");
332                 } else if (!strncmp(str, "off", 3)) {
333                         dmar_disabled = 1;
334                         no_platform_optin = 1;
335                         pr_info("IOMMU disabled\n");
336                 } else if (!strncmp(str, "igfx_off", 8)) {
337                         dmar_map_gfx = 0;
338                         pr_info("Disable GFX device mapping\n");
339                 } else if (!strncmp(str, "forcedac", 8)) {
340                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
341                         iommu_dma_forcedac = true;
342                 } else if (!strncmp(str, "strict", 6)) {
343                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
344                         iommu_set_dma_strict();
345                 } else if (!strncmp(str, "sp_off", 6)) {
346                         pr_info("Disable supported super page\n");
347                         intel_iommu_superpage = 0;
348                 } else if (!strncmp(str, "sm_on", 5)) {
349                         pr_info("Enable scalable mode if hardware supports\n");
350                         intel_iommu_sm = 1;
351                 } else if (!strncmp(str, "sm_off", 6)) {
352                         pr_info("Scalable mode is disallowed\n");
353                         intel_iommu_sm = 0;
354                 } else if (!strncmp(str, "tboot_noforce", 13)) {
355                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
356                         intel_iommu_tboot_noforce = 1;
357                 } else {
358                         pr_notice("Unknown option - '%s'\n", str);
359                 }
360
361                 str += strcspn(str, ",");
362                 while (*str == ',')
363                         str++;
364         }
365
366         return 1;
367 }
368 __setup("intel_iommu=", intel_iommu_setup);
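
/*
 * Illustrative example: booting with "intel_iommu=on,sm_on" walks the
 * comma-separated option list above, enabling DMA remapping and scalable
 * mode; unrecognized tokens are reported via pr_notice() and skipped.
 */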
369
370 void *alloc_pgtable_page(int node, gfp_t gfp)
371 {
372         struct page *page;
373         void *vaddr = NULL;
374
375         page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
376         if (page)
377                 vaddr = page_address(page);
378         return vaddr;
379 }
380
381 void free_pgtable_page(void *vaddr)
382 {
383         free_page((unsigned long)vaddr);
384 }
385
386 static inline int domain_type_is_si(struct dmar_domain *domain)
387 {
388         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
389 }
390
391 static inline int domain_pfn_supported(struct dmar_domain *domain,
392                                        unsigned long pfn)
393 {
394         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
395
396         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
397 }
398
399 /*
400  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
401  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
402  * the returned SAGAW.
403  */
404 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
405 {
406         unsigned long fl_sagaw, sl_sagaw;
407
408         fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
409         sl_sagaw = cap_sagaw(iommu->cap);
410
411         /* Second level only. */
412         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
413                 return sl_sagaw;
414
415         /* First level only. */
416         if (!ecap_slts(iommu->ecap))
417                 return fl_sagaw;
418
419         return fl_sagaw & sl_sagaw;
420 }
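
/*
 * Reference note: in the SAGAW encoding used above, bit 1 indicates 3-level
 * (39-bit) support, bit 2 indicates 4-level (48-bit) support and bit 3
 * indicates 5-level (57-bit) support; first-level translation always offers
 * 4-level paging and adds 5-level only when cap_fl5lp_support() is set.
 */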
421
422 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
423 {
424         unsigned long sagaw;
425         int agaw;
426
427         sagaw = __iommu_calculate_sagaw(iommu);
428         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
429                 if (test_bit(agaw, &sagaw))
430                         break;
431         }
432
433         return agaw;
434 }
435
436 /*
437  * Calculate max SAGAW for each iommu.
438  */
439 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
440 {
441         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
442 }
443
444 /*
445  * Calculate the agaw for each iommu.
446  * "SAGAW" may differ across iommus, so use a default agaw and fall back
447  * to a smaller supported agaw for iommus that don't support the default.
448  */
449 int iommu_calculate_agaw(struct intel_iommu *iommu)
450 {
451         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
452 }
453
454 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
455 {
456         return sm_supported(iommu) ?
457                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
458 }
459
460 static void domain_update_iommu_coherency(struct dmar_domain *domain)
461 {
462         struct iommu_domain_info *info;
463         struct dmar_drhd_unit *drhd;
464         struct intel_iommu *iommu;
465         bool found = false;
466         unsigned long i;
467
468         domain->iommu_coherency = true;
469         xa_for_each(&domain->iommu_array, i, info) {
470                 found = true;
471                 if (!iommu_paging_structure_coherency(info->iommu)) {
472                         domain->iommu_coherency = false;
473                         break;
474                 }
475         }
476         if (found)
477                 return;
478
479         /* No hardware attached; use lowest common denominator */
480         rcu_read_lock();
481         for_each_active_iommu(iommu, drhd) {
482                 if (!iommu_paging_structure_coherency(iommu)) {
483                         domain->iommu_coherency = false;
484                         break;
485                 }
486         }
487         rcu_read_unlock();
488 }
489
490 static int domain_update_iommu_superpage(struct dmar_domain *domain,
491                                          struct intel_iommu *skip)
492 {
493         struct dmar_drhd_unit *drhd;
494         struct intel_iommu *iommu;
495         int mask = 0x3;
496
497         if (!intel_iommu_superpage)
498                 return 0;
499
500         /* set iommu_superpage to the smallest common denominator */
501         rcu_read_lock();
502         for_each_active_iommu(iommu, drhd) {
503                 if (iommu != skip) {
504                         if (domain && domain->use_first_level) {
505                                 if (!cap_fl1gp_support(iommu->cap))
506                                         mask = 0x1;
507                         } else {
508                                 mask &= cap_super_page_val(iommu->cap);
509                         }
510
511                         if (!mask)
512                                 break;
513                 }
514         }
515         rcu_read_unlock();
516
517         return fls(mask);
518 }
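
/*
 * Reference note: the mask bits above mirror cap_super_page_val(), where
 * bit 0 means 2MiB and bit 1 means 1GiB super pages.  fls(mask) therefore
 * yields 0 (none), 1 (2MiB only) or 2 (2MiB and 1GiB), which is the value
 * later consumed by domain_super_pgsize_bitmap().
 */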
519
520 static int domain_update_device_node(struct dmar_domain *domain)
521 {
522         struct device_domain_info *info;
523         int nid = NUMA_NO_NODE;
524         unsigned long flags;
525
526         spin_lock_irqsave(&domain->lock, flags);
527         list_for_each_entry(info, &domain->devices, link) {
528                 /*
529                  * There could be multiple device NUMA nodes, as devices
530                  * within the same domain may sit behind different IOMMUs.
531                  * There is no perfect answer in such a situation, so we
532                  * use a first-come, first-served policy.
533                  */
534                 nid = dev_to_node(info->dev);
535                 if (nid != NUMA_NO_NODE)
536                         break;
537         }
538         spin_unlock_irqrestore(&domain->lock, flags);
539
540         return nid;
541 }
542
543 static void domain_update_iotlb(struct dmar_domain *domain);
544
545 /* Return the super pagesize bitmap if supported. */
546 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
547 {
548         unsigned long bitmap = 0;
549
550         /*
551          * 1-level super page supports page size of 2MiB, 2-level super page
552          * supports page size of both 2MiB and 1GiB.
553          */
554         if (domain->iommu_superpage == 1)
555                 bitmap |= SZ_2M;
556         else if (domain->iommu_superpage == 2)
557                 bitmap |= SZ_2M | SZ_1G;
558
559         return bitmap;
560 }
561
562 /* Some capabilities may be different across iommus */
563 static void domain_update_iommu_cap(struct dmar_domain *domain)
564 {
565         domain_update_iommu_coherency(domain);
566         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
567
568         /*
569          * If RHSA is missing, we should default to the device numa domain
570          * as a fallback.
571          */
572         if (domain->nid == NUMA_NO_NODE)
573                 domain->nid = domain_update_device_node(domain);
574
575         /*
576          * First-level translation restricts the input-address to a
577          * canonical address (i.e., address bits 63:N have the same
578          * value as address bit [N-1], where N is 48-bits with 4-level
579          * paging and 57-bits with 5-level paging). Hence, skip bit
580          * [N-1].
581          */
582         if (domain->use_first_level)
583                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
584         else
585                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
586
587         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
588         domain_update_iotlb(domain);
589 }
590
591 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
592                                          u8 devfn, int alloc)
593 {
594         struct root_entry *root = &iommu->root_entry[bus];
595         struct context_entry *context;
596         u64 *entry;
597
598         /*
599          * Unless the caller requested allocation of a new entry,
600          * returning a copied context entry makes no sense.
601          */
602         if (!alloc && context_copied(iommu, bus, devfn))
603                 return NULL;
604
605         entry = &root->lo;
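        /*
         * In scalable mode a root entry carries two context-table pointers:
         * the lower table covers devfn 0-127 and the upper table covers
         * devfn 128-255.  Scalable-mode context entries are 32 bytes, twice
         * the size of struct context_entry, hence devfn is doubled to index
         * the table in 16-byte units.
         */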
606         if (sm_supported(iommu)) {
607                 if (devfn >= 0x80) {
608                         devfn -= 0x80;
609                         entry = &root->hi;
610                 }
611                 devfn *= 2;
612         }
613         if (*entry & 1)
614                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
615         else {
616                 unsigned long phy_addr;
617                 if (!alloc)
618                         return NULL;
619
620                 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
621                 if (!context)
622                         return NULL;
623
624                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
625                 phy_addr = virt_to_phys((void *)context);
626                 *entry = phy_addr | 1;
627                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
628         }
629         return &context[devfn];
630 }
631
632 /**
633  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
634  *                               sub-hierarchy of a candidate PCI-PCI bridge
635  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
636  * @bridge: the candidate PCI-PCI bridge
637  *
638  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
639  */
640 static bool
641 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
642 {
643         struct pci_dev *pdev, *pbridge;
644
645         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
646                 return false;
647
648         pdev = to_pci_dev(dev);
649         pbridge = to_pci_dev(bridge);
650
651         if (pbridge->subordinate &&
652             pbridge->subordinate->number <= pdev->bus->number &&
653             pbridge->subordinate->busn_res.end >= pdev->bus->number)
654                 return true;
655
656         return false;
657 }
658
659 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
660 {
661         struct dmar_drhd_unit *drhd;
662         u32 vtbar;
663         int rc;
664
665         /* We know that this device on this chipset has its own IOMMU.
666          * If we find it under a different IOMMU, then the BIOS is lying
667          * to us. Hope that the IOMMU for this device is actually
668          * disabled, and it needs no translation...
669          */
670         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
671         if (rc) {
672                 /* "can't" happen */
673                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
674                 return false;
675         }
676         vtbar &= 0xffff0000;
677
678         /* we know that this iommu should be at offset 0xa000 from vtbar */
679         drhd = dmar_find_matched_drhd_unit(pdev);
680         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
681                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
682                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
683                 return true;
684         }
685
686         return false;
687 }
688
689 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
690 {
691         if (!iommu || iommu->drhd->ignored)
692                 return true;
693
694         if (dev_is_pci(dev)) {
695                 struct pci_dev *pdev = to_pci_dev(dev);
696
697                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
698                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
699                     quirk_ioat_snb_local_iommu(pdev))
700                         return true;
701         }
702
703         return false;
704 }
705
706 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
707 {
708         struct dmar_drhd_unit *drhd = NULL;
709         struct pci_dev *pdev = NULL;
710         struct intel_iommu *iommu;
711         struct device *tmp;
712         u16 segment = 0;
713         int i;
714
715         if (!dev)
716                 return NULL;
717
718         if (dev_is_pci(dev)) {
719                 struct pci_dev *pf_pdev;
720
721                 pdev = pci_real_dma_dev(to_pci_dev(dev));
722
723                 /* VFs aren't listed in scope tables; we need to look up
724                  * the PF instead to find the IOMMU. */
725                 pf_pdev = pci_physfn(pdev);
726                 dev = &pf_pdev->dev;
727                 segment = pci_domain_nr(pdev->bus);
728         } else if (has_acpi_companion(dev))
729                 dev = &ACPI_COMPANION(dev)->dev;
730
731         rcu_read_lock();
732         for_each_iommu(iommu, drhd) {
733                 if (pdev && segment != drhd->segment)
734                         continue;
735
736                 for_each_active_dev_scope(drhd->devices,
737                                           drhd->devices_cnt, i, tmp) {
738                         if (tmp == dev) {
739                                 /* For a VF use its original BDF# not that of the PF
740                                  * which we used for the IOMMU lookup. Strictly speaking
741                                  * we could do this for all PCI devices; we only need to
742                                  * get the BDF# from the scope table for ACPI matches. */
743                                 if (pdev && pdev->is_virtfn)
744                                         goto got_pdev;
745
746                                 if (bus && devfn) {
747                                         *bus = drhd->devices[i].bus;
748                                         *devfn = drhd->devices[i].devfn;
749                                 }
750                                 goto out;
751                         }
752
753                         if (is_downstream_to_pci_bridge(dev, tmp))
754                                 goto got_pdev;
755                 }
756
757                 if (pdev && drhd->include_all) {
758 got_pdev:
759                         if (bus && devfn) {
760                                 *bus = pdev->bus->number;
761                                 *devfn = pdev->devfn;
762                         }
763                         goto out;
764                 }
765         }
766         iommu = NULL;
767 out:
768         if (iommu_is_dummy(iommu, dev))
769                 iommu = NULL;
770
771         rcu_read_unlock();
772
773         return iommu;
774 }
775
776 static void domain_flush_cache(struct dmar_domain *domain,
777                                void *addr, int size)
778 {
779         if (!domain->iommu_coherency)
780                 clflush_cache_range(addr, size);
781 }
782
783 static void free_context_table(struct intel_iommu *iommu)
784 {
785         struct context_entry *context;
786         int i;
787
788         if (!iommu->root_entry)
789                 return;
790
791         for (i = 0; i < ROOT_ENTRY_NR; i++) {
792                 context = iommu_context_addr(iommu, i, 0, 0);
793                 if (context)
794                         free_pgtable_page(context);
795
796                 if (!sm_supported(iommu))
797                         continue;
798
799                 context = iommu_context_addr(iommu, i, 0x80, 0);
800                 if (context)
801                         free_pgtable_page(context);
802         }
803
804         free_pgtable_page(iommu->root_entry);
805         iommu->root_entry = NULL;
806 }
807
808 #ifdef CONFIG_DMAR_DEBUG
809 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
810                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
811 {
812         struct dma_pte *pte;
813         int offset;
814
815         while (1) {
816                 offset = pfn_level_offset(pfn, level);
817                 pte = &parent[offset];
818                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
819                         pr_info("PTE not present at level %d\n", level);
820                         break;
821                 }
822
823                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
824
825                 if (level == 1)
826                         break;
827
828                 parent = phys_to_virt(dma_pte_addr(pte));
829                 level--;
830         }
831 }
832
833 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
834                           unsigned long long addr, u32 pasid)
835 {
836         struct pasid_dir_entry *dir, *pde;
837         struct pasid_entry *entries, *pte;
838         struct context_entry *ctx_entry;
839         struct root_entry *rt_entry;
840         int i, dir_index, index, level;
841         u8 devfn = source_id & 0xff;
842         u8 bus = source_id >> 8;
843         struct dma_pte *pgtable;
844
845         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
846
847         /* root entry dump */
848         rt_entry = &iommu->root_entry[bus];
849         if (!rt_entry) {
850                 pr_info("root table entry is not present\n");
851                 return;
852         }
853
854         if (sm_supported(iommu))
855                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
856                         rt_entry->hi, rt_entry->lo);
857         else
858                 pr_info("root entry: 0x%016llx", rt_entry->lo);
859
860         /* context entry dump */
861         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
862         if (!ctx_entry) {
863                 pr_info("context table entry is not present\n");
864                 return;
865         }
866
867         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
868                 ctx_entry->hi, ctx_entry->lo);
869
870         /* legacy mode does not require PASID entries */
871         if (!sm_supported(iommu)) {
872                 level = agaw_to_level(ctx_entry->hi & 7);
873                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874                 goto pgtable_walk;
875         }
876
877         /* get the pointer to pasid directory entry */
878         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
879         if (!dir) {
880                 pr_info("pasid directory entry is not present\n");
881                 return;
882         }
883         /* For request-without-pasid, get the pasid from context entry */
884         if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
885                 pasid = IOMMU_NO_PASID;
886
887         dir_index = pasid >> PASID_PDE_SHIFT;
888         pde = &dir[dir_index];
889         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
890
891         /* get the pointer to the pasid table entry */
892         entries = get_pasid_table_from_pde(pde);
893         if (!entries) {
894                 pr_info("pasid table entry is not present\n");
895                 return;
896         }
897         index = pasid & PASID_PTE_MASK;
898         pte = &entries[index];
899         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
900                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
901
902         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
903                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
904                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
905         } else {
906                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
907                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
908         }
909
910 pgtable_walk:
911         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
912 }
913 #endif
914
915 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
916                                       unsigned long pfn, int *target_level,
917                                       gfp_t gfp)
918 {
919         struct dma_pte *parent, *pte;
920         int level = agaw_to_level(domain->agaw);
921         int offset;
922
923         if (!domain_pfn_supported(domain, pfn))
924                 /* Address beyond IOMMU's addressing capabilities. */
925                 return NULL;
926
927         parent = domain->pgd;
928
929         while (1) {
930                 void *tmp_page;
931
932                 offset = pfn_level_offset(pfn, level);
933                 pte = &parent[offset];
934                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
935                         break;
936                 if (level == *target_level)
937                         break;
938
939                 if (!dma_pte_present(pte)) {
940                         uint64_t pteval;
941
942                         tmp_page = alloc_pgtable_page(domain->nid, gfp);
943
944                         if (!tmp_page)
945                                 return NULL;
946
947                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
948                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
949                         if (domain->use_first_level)
950                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
951
952                         if (cmpxchg64(&pte->val, 0ULL, pteval))
953                                 /* Someone else set it while we were thinking; use theirs. */
954                                 free_pgtable_page(tmp_page);
955                         else
956                                 domain_flush_cache(domain, pte, sizeof(*pte));
957                 }
958                 if (level == 1)
959                         break;
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 level--;
963         }
964
965         if (!*target_level)
966                 *target_level = level;
967
968         return pte;
969 }
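
/*
 * Usage note: a *target_level of 0 asks for the existing leaf entry at
 * whatever level it lives (stopping early at super pages or non-present
 * entries), while a non-zero *target_level requests a PTE at exactly that
 * level, allocating intermediate page-table pages on the way down.
 */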
970
971 /* return address's pte at specific level */
972 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
973                                          unsigned long pfn,
974                                          int level, int *large_page)
975 {
976         struct dma_pte *parent, *pte;
977         int total = agaw_to_level(domain->agaw);
978         int offset;
979
980         parent = domain->pgd;
981         while (level <= total) {
982                 offset = pfn_level_offset(pfn, total);
983                 pte = &parent[offset];
984                 if (level == total)
985                         return pte;
986
987                 if (!dma_pte_present(pte)) {
988                         *large_page = total;
989                         break;
990                 }
991
992                 if (dma_pte_superpage(pte)) {
993                         *large_page = total;
994                         return pte;
995                 }
996
997                 parent = phys_to_virt(dma_pte_addr(pte));
998                 total--;
999         }
1000         return NULL;
1001 }
1002
1003 /* clear last level pte; a tlb flush should follow */
1004 static void dma_pte_clear_range(struct dmar_domain *domain,
1005                                 unsigned long start_pfn,
1006                                 unsigned long last_pfn)
1007 {
1008         unsigned int large_page;
1009         struct dma_pte *first_pte, *pte;
1010
1011         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1012             WARN_ON(start_pfn > last_pfn))
1013                 return;
1014
1015         /* we don't need lock here; nobody else touches the iova range */
1016         do {
1017                 large_page = 1;
1018                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1019                 if (!pte) {
1020                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021                         continue;
1022                 }
1023                 do {
1024                         dma_clear_pte(pte);
1025                         start_pfn += lvl_to_nr_pages(large_page);
1026                         pte++;
1027                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1028
1029                 domain_flush_cache(domain, first_pte,
1030                                    (void *)pte - (void *)first_pte);
1031
1032         } while (start_pfn && start_pfn <= last_pfn);
1033 }
1034
1035 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1036                                int retain_level, struct dma_pte *pte,
1037                                unsigned long pfn, unsigned long start_pfn,
1038                                unsigned long last_pfn)
1039 {
1040         pfn = max(start_pfn, pfn);
1041         pte = &pte[pfn_level_offset(pfn, level)];
1042
1043         do {
1044                 unsigned long level_pfn;
1045                 struct dma_pte *level_pte;
1046
1047                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1048                         goto next;
1049
1050                 level_pfn = pfn & level_mask(level);
1051                 level_pte = phys_to_virt(dma_pte_addr(pte));
1052
1053                 if (level > 2) {
1054                         dma_pte_free_level(domain, level - 1, retain_level,
1055                                            level_pte, level_pfn, start_pfn,
1056                                            last_pfn);
1057                 }
1058
1059                 /*
1060                  * Free the page table if we're below the level we want to
1061                  * retain and the range covers the entire table.
1062                  */
1063                 if (level < retain_level && !(start_pfn > level_pfn ||
1064                       last_pfn < level_pfn + level_size(level) - 1)) {
1065                         dma_clear_pte(pte);
1066                         domain_flush_cache(domain, pte, sizeof(*pte));
1067                         free_pgtable_page(level_pte);
1068                 }
1069 next:
1070                 pfn += level_size(level);
1071         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1072 }
1073
1074 /*
1075  * clear last level (leaf) ptes and free page table pages below the
1076  * level we wish to keep intact.
1077  */
1078 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1079                                    unsigned long start_pfn,
1080                                    unsigned long last_pfn,
1081                                    int retain_level)
1082 {
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                     int level, struct dma_pte *pte,
1104                                     struct list_head *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         list_add_tail(&pg->lru, freelist);
1110
1111         if (level == 1)
1112                 return;
1113
1114         pte = page_address(pg);
1115         do {
1116                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1118                 pte++;
1119         } while (!first_pte_in_page(pte));
1120 }
1121
1122 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1123                                 struct dma_pte *pte, unsigned long pfn,
1124                                 unsigned long start_pfn, unsigned long last_pfn,
1125                                 struct list_head *freelist)
1126 {
1127         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1128
1129         pfn = max(start_pfn, pfn);
1130         pte = &pte[pfn_level_offset(pfn, level)];
1131
1132         do {
1133                 unsigned long level_pfn = pfn & level_mask(level);
1134
1135                 if (!dma_pte_present(pte))
1136                         goto next;
1137
1138                 /* If range covers entire pagetable, free it */
1139                 if (start_pfn <= level_pfn &&
1140                     last_pfn >= level_pfn + level_size(level) - 1) {
1141                         /* These subordinate page tables are going away entirely. Don't
1142                            bother to clear them; we're just going to *free* them. */
1143                         if (level > 1 && !dma_pte_superpage(pte))
1144                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1145
1146                         dma_clear_pte(pte);
1147                         if (!first_pte)
1148                                 first_pte = pte;
1149                         last_pte = pte;
1150                 } else if (level > 1) {
1151                         /* Recurse down into a level that isn't *entirely* obsolete */
1152                         dma_pte_clear_level(domain, level - 1,
1153                                             phys_to_virt(dma_pte_addr(pte)),
1154                                             level_pfn, start_pfn, last_pfn,
1155                                             freelist);
1156                 }
1157 next:
1158                 pfn = level_pfn + level_size(level);
1159         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1160
1161         if (first_pte)
1162                 domain_flush_cache(domain, first_pte,
1163                                    (void *)++last_pte - (void *)first_pte);
1164 }
1165
1166 /* We can't just free the pages because the IOMMU may still be walking
1167    the page tables, and may have cached the intermediate levels. The
1168    pages can only be freed after the IOTLB flush has been done. */
1169 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1170                          unsigned long last_pfn, struct list_head *freelist)
1171 {
1172         if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1173             WARN_ON(start_pfn > last_pfn))
1174                 return;
1175
1176         /* we don't need lock here; nobody else touches the iova range */
1177         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1178                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1179
1180         /* free pgd */
1181         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1182                 struct page *pgd_page = virt_to_page(domain->pgd);
1183                 list_add_tail(&pgd_page->lru, freelist);
1184                 domain->pgd = NULL;
1185         }
1186 }
1187
1188 /* iommu handling */
1189 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1190 {
1191         struct root_entry *root;
1192
1193         root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1194         if (!root) {
1195                 pr_err("Allocating root entry for %s failed\n",
1196                         iommu->name);
1197                 return -ENOMEM;
1198         }
1199
1200         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1201         iommu->root_entry = root;
1202
1203         return 0;
1204 }
1205
1206 static void iommu_set_root_entry(struct intel_iommu *iommu)
1207 {
1208         u64 addr;
1209         u32 sts;
1210         unsigned long flag;
1211
1212         addr = virt_to_phys(iommu->root_entry);
1213         if (sm_supported(iommu))
1214                 addr |= DMA_RTADDR_SMT;
1215
1216         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1217         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1218
1219         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1220
1221         /* Make sure hardware completes it */
1222         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1223                       readl, (sts & DMA_GSTS_RTPS), sts);
1224
1225         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1226
1227         /*
1228          * Hardware invalidates all DMA remapping hardware translation
1229          * caches as part of SRTP flow.
1230          */
1231         if (cap_esrtps(iommu->cap))
1232                 return;
1233
1234         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1235         if (sm_supported(iommu))
1236                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1237         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1238 }
1239
1240 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1241 {
1242         u32 val;
1243         unsigned long flag;
1244
1245         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1246                 return;
1247
1248         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1249         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1250
1251         /* Make sure hardware completes it */
1252         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1253                       readl, (!(val & DMA_GSTS_WBFS)), val);
1254
1255         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1256 }
1257
1258 /* return value determines if we need a write buffer flush */
1259 static void __iommu_flush_context(struct intel_iommu *iommu,
1260                                   u16 did, u16 source_id, u8 function_mask,
1261                                   u64 type)
1262 {
1263         u64 val = 0;
1264         unsigned long flag;
1265
1266         switch (type) {
1267         case DMA_CCMD_GLOBAL_INVL:
1268                 val = DMA_CCMD_GLOBAL_INVL;
1269                 break;
1270         case DMA_CCMD_DOMAIN_INVL:
1271                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1272                 break;
1273         case DMA_CCMD_DEVICE_INVL:
1274                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1275                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1276                 break;
1277         default:
1278                 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1279                         iommu->name, type);
1280                 return;
1281         }
1282         val |= DMA_CCMD_ICC;
1283
1284         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1286
1287         /* Make sure hardware completes it */
1288         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1289                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1290
1291         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292 }
1293
1294 /* return value determines if we need a write buffer flush */
1295 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1296                                 u64 addr, unsigned int size_order, u64 type)
1297 {
1298         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1299         u64 val = 0, val_iva = 0;
1300         unsigned long flag;
1301
1302         switch (type) {
1303         case DMA_TLB_GLOBAL_FLUSH:
1304                 /* global flush doesn't need to set IVA_REG */
1305                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1306                 break;
1307         case DMA_TLB_DSI_FLUSH:
1308                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1309                 break;
1310         case DMA_TLB_PSI_FLUSH:
1311                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1312                 /* IH bit is passed in as part of address */
1313                 val_iva = size_order | addr;
1314                 break;
1315         default:
1316                 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1317                         iommu->name, type);
1318                 return;
1319         }
1320
1321         if (cap_write_drain(iommu->cap))
1322                 val |= DMA_TLB_WRITE_DRAIN;
1323
1324         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325         /* Note: Only uses first TLB reg currently */
1326         if (val_iva)
1327                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330         /* Make sure hardware completes it */
1331         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336         /* check IOTLB invalidation granularity */
1337         if (DMA_TLB_IAIG(val) == 0)
1338                 pr_err("Flush IOTLB failed\n");
1339         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341                         (unsigned long long)DMA_TLB_IIRG(type),
1342                         (unsigned long long)DMA_TLB_IAIG(val));
1343 }
1344
1345 static struct device_domain_info *
1346 domain_lookup_dev_info(struct dmar_domain *domain,
1347                        struct intel_iommu *iommu, u8 bus, u8 devfn)
1348 {
1349         struct device_domain_info *info;
1350         unsigned long flags;
1351
1352         spin_lock_irqsave(&domain->lock, flags);
1353         list_for_each_entry(info, &domain->devices, link) {
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         spin_unlock_irqrestore(&domain->lock, flags);
1357                         return info;
1358                 }
1359         }
1360         spin_unlock_irqrestore(&domain->lock, flags);
1361
1362         return NULL;
1363 }
1364
1365 static void domain_update_iotlb(struct dmar_domain *domain)
1366 {
1367         struct dev_pasid_info *dev_pasid;
1368         struct device_domain_info *info;
1369         bool has_iotlb_device = false;
1370         unsigned long flags;
1371
1372         spin_lock_irqsave(&domain->lock, flags);
1373         list_for_each_entry(info, &domain->devices, link) {
1374                 if (info->ats_enabled) {
1375                         has_iotlb_device = true;
1376                         break;
1377                 }
1378         }
1379
1380         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1381                 info = dev_iommu_priv_get(dev_pasid->dev);
1382                 if (info->ats_enabled) {
1383                         has_iotlb_device = true;
1384                         break;
1385                 }
1386         }
1387         domain->has_iotlb_device = has_iotlb_device;
1388         spin_unlock_irqrestore(&domain->lock, flags);
1389 }
1390
1391 /*
1392  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1393  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1394  * check because it applies only to the built-in QAT devices and it doesn't
1395  * grant additional privileges.
1396  */
1397 #define BUGGY_QAT_DEVID_MASK 0x4940
1398 static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1399 {
1400         if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1401                 return false;
1402
1403         if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1404                 return false;
1405
1406         return true;
1407 }
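
/*
 * Note: device IDs 0x4940-0x4943 differ only in their two low bits, so
 * masking with 0xfffc above maps all four of them onto BUGGY_QAT_DEVID_MASK.
 */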
1408
1409 static void iommu_enable_pci_caps(struct device_domain_info *info)
1410 {
1411         struct pci_dev *pdev;
1412
1413         if (!dev_is_pci(info->dev))
1414                 return;
1415
1416         pdev = to_pci_dev(info->dev);
1417
1418         /* The PCIe spec, in its wisdom, declares that the behaviour of
1419            the device if you enable PASID support after ATS support is
1420            undefined. So always enable PASID support on devices which
1421            have it, even if we can't yet know if we're ever going to
1422            use it. */
1423         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1424                 info->pasid_enabled = 1;
1425
1426         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1427             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1428                 info->ats_enabled = 1;
1429                 domain_update_iotlb(info->domain);
1430         }
1431 }
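
/*
 * Note: ATS is only enabled when the endpoint guarantees page-aligned
 * translation requests (pci_ats_page_aligned()); otherwise it stays off and
 * this device will not cause domain_update_iotlb() to mark the domain as
 * having a device IOTLB.
 */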
1432
1433 static void iommu_disable_pci_caps(struct device_domain_info *info)
1434 {
1435         struct pci_dev *pdev;
1436
1437         if (!dev_is_pci(info->dev))
1438                 return;
1439
1440         pdev = to_pci_dev(info->dev);
1441
1442         if (info->ats_enabled) {
1443                 pci_disable_ats(pdev);
1444                 info->ats_enabled = 0;
1445                 domain_update_iotlb(info->domain);
1446         }
1447
1448         if (info->pasid_enabled) {
1449                 pci_disable_pasid(pdev);
1450                 info->pasid_enabled = 0;
1451         }
1452 }
1453
1454 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1455                                     u64 addr, unsigned int mask)
1456 {
1457         u16 sid, qdep;
1458
1459         if (!info || !info->ats_enabled)
1460                 return;
1461
1462         sid = info->bus << 8 | info->devfn;
1463         qdep = info->ats_qdep;
1464         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465                            qdep, addr, mask);
1466         quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1467 }
1468
1469 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1470                                   u64 addr, unsigned mask)
1471 {
1472         struct dev_pasid_info *dev_pasid;
1473         struct device_domain_info *info;
1474         unsigned long flags;
1475
1476         if (!domain->has_iotlb_device)
1477                 return;
1478
1479         spin_lock_irqsave(&domain->lock, flags);
1480         list_for_each_entry(info, &domain->devices, link)
1481                 __iommu_flush_dev_iotlb(info, addr, mask);
1482
1483         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1484                 info = dev_iommu_priv_get(dev_pasid->dev);
1485
1486                 if (!info->ats_enabled)
1487                         continue;
1488
1489                 qi_flush_dev_iotlb_pasid(info->iommu,
1490                                          PCI_DEVID(info->bus, info->devfn),
1491                                          info->pfsid, dev_pasid->pasid,
1492                                          info->ats_qdep, addr,
1493                                          mask);
1494         }
1495         spin_unlock_irqrestore(&domain->lock, flags);
1496 }
1497
1498 static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1499                                      struct dmar_domain *domain, u64 addr,
1500                                      unsigned long npages, bool ih)
1501 {
1502         u16 did = domain_id_iommu(domain, iommu);
1503         struct dev_pasid_info *dev_pasid;
1504         unsigned long flags;
1505
1506         spin_lock_irqsave(&domain->lock, flags);
1507         list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1508                 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1509
1510         if (!list_empty(&domain->devices))
1511                 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1512         spin_unlock_irqrestore(&domain->lock, flags);
1513 }
1514
1515 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1516                                   struct dmar_domain *domain,
1517                                   unsigned long pfn, unsigned int pages,
1518                                   int ih, int map)
1519 {
1520         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1521         unsigned int mask = ilog2(aligned_pages);
1522         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1523         u16 did = domain_id_iommu(domain, iommu);
1524
1525         if (WARN_ON(!pages))
1526                 return;
1527
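             /*
              * IH is bit 6 of the address operand of a page-selective IOTLB
              * invalidation; when set it hints that the non-leaf paging-structure
              * caches for this range are still valid and need not be flushed.
              */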
1528         if (ih)
1529                 ih = 1 << 6;
1530
1531         if (domain->use_first_level) {
1532                 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1533         } else {
1534                 unsigned long bitmask = aligned_pages - 1;
1535
1536                 /*
1537                  * PSI masks the low order bits of the base address. If the
1538                  * address isn't aligned to the mask, then compute a mask value
1539                  * needed to ensure the target range is flushed.
1540                  */
1541                 if (unlikely(bitmask & pfn)) {
1542                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1543
1544                         /*
1545                          * Since end_pfn <= pfn + bitmask, the only way bits
1546                          * higher than bitmask can differ in pfn and end_pfn is
1547                          * by carrying. This means after masking out bitmask,
1548                          * high bits starting with the first set bit in
1549                          * shared_bits are all equal in both pfn and end_pfn.
1550                          */
1551                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1552                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1553                 }
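                     /*
                      * Illustrative example: pfn = 9, pages = 2 gives bitmask = 1
                      * and end_pfn = 10; pfn ^ end_pfn = 0b11, so shared_bits is
                      * ...11111100 and mask becomes 2.  The PSI then covers pfns
                      * 8-11, which includes the requested 9-10.
                      */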
1554
1555                 /*
1556                  * Fall back to a domain-selective flush if there is no PSI
1557                  * support or the size is too big.
1558                  */
1559                 if (!cap_pgsel_inv(iommu->cap) ||
1560                     mask > cap_max_amask_val(iommu->cap))
1561                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1562                                                         DMA_TLB_DSI_FLUSH);
1563                 else
1564                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1565                                                         DMA_TLB_PSI_FLUSH);
1566         }
1567
1568         /*
1569          * In caching mode, changes of pages from non-present to present require
1570          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1571          */
1572         if (!cap_caching_mode(iommu->cap) || !map)
1573                 iommu_flush_dev_iotlb(domain, addr, mask);
1574 }
1575
1576 /* Notification for newly created mappings */
1577 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1578                                         struct dmar_domain *domain,
1579                                         unsigned long pfn, unsigned int pages)
1580 {
1581         /*
1582          * It's a non-present to present mapping. Only flush if in caching
1583          * mode and using second-level translation.
1584          */
1585         if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1586                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1587         else
1588                 iommu_flush_write_buffer(iommu);
1589 }
1590
1591 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1592 {
1593         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1594         struct iommu_domain_info *info;
1595         unsigned long idx;
1596
1597         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1598                 struct intel_iommu *iommu = info->iommu;
1599                 u16 did = domain_id_iommu(dmar_domain, iommu);
1600
1601                 if (dmar_domain->use_first_level)
1602                         domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1603                 else
1604                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1605                                                  DMA_TLB_DSI_FLUSH);
1606
1607                 if (!cap_caching_mode(iommu->cap))
1608                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1618                 return;
1619
1620         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1621         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1622         pmen &= ~DMA_PMEN_EPM;
1623         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1624
1625         /* wait for the protected region status bit to clear */
1626         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1627                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1628
1629         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630 }
1631
1632 static void iommu_enable_translation(struct intel_iommu *iommu)
1633 {
1634         u32 sts;
1635         unsigned long flags;
1636
1637         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1638         iommu->gcmd |= DMA_GCMD_TE;
1639         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1640
1641         /* Make sure hardware completes it */
1642         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1643                       readl, (sts & DMA_GSTS_TES), sts);
1644
1645         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1646 }
1647
1648 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 {
1650         u32 sts;
1651         unsigned long flag;
1652
1653         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1654             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1655                 return;
1656
1657         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1658         iommu->gcmd &= ~DMA_GCMD_TE;
1659         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1660
1661         /* Make sure hardware completes it */
1662         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1663                       readl, (!(sts & DMA_GSTS_TES)), sts);
1664
1665         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1666 }
1667
1668 static int iommu_init_domains(struct intel_iommu *iommu)
1669 {
1670         u32 ndomains;
1671
1672         ndomains = cap_ndoms(iommu->cap);
1673         pr_debug("%s: Number of Domains supported <%d>\n",
1674                  iommu->name, ndomains);
1675
1676         spin_lock_init(&iommu->lock);
1677
1678         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1679         if (!iommu->domain_ids)
1680                 return -ENOMEM;
1681
1682         /*
1683          * If Caching mode is set, then invalid translations are tagged
1684          * with domain-id 0, hence we need to pre-allocate it. We also
1685          * use domain-id 0 as a marker for non-allocated domain-id, so
1686          * make sure it is not used for a real domain.
1687          */
1688         set_bit(0, iommu->domain_ids);
1689
1690         /*
1691          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1692          * entry for first-level or pass-through translation modes should
1693          * be programmed with a domain id different from those used for
1694          * second-level or nested translation. We reserve a domain id for
1695          * this purpose.
1696          */
1697         if (sm_supported(iommu))
1698                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1699
1700         return 0;
1701 }
1702
1703 static void disable_dmar_iommu(struct intel_iommu *iommu)
1704 {
1705         if (!iommu->domain_ids)
1706                 return;
1707
1708         /*
1709          * All iommu domains must have been detached from the devices,
1710          * hence there should be no domain IDs in use.
1711          */
1712         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1713                     > NUM_RESERVED_DID))
1714                 return;
1715
1716         if (iommu->gcmd & DMA_GCMD_TE)
1717                 iommu_disable_translation(iommu);
1718 }
1719
1720 static void free_dmar_iommu(struct intel_iommu *iommu)
1721 {
1722         if (iommu->domain_ids) {
1723                 bitmap_free(iommu->domain_ids);
1724                 iommu->domain_ids = NULL;
1725         }
1726
1727         if (iommu->copied_tables) {
1728                 bitmap_free(iommu->copied_tables);
1729                 iommu->copied_tables = NULL;
1730         }
1731
1732         /* free context mapping */
1733         free_context_table(iommu);
1734
1735 #ifdef CONFIG_INTEL_IOMMU_SVM
1736         if (pasid_supported(iommu)) {
1737                 if (ecap_prs(iommu->ecap))
1738                         intel_svm_finish_prq(iommu);
1739         }
1740 #endif
1741 }
1742
1743 /*
1744  * Check and return whether first level is used by default for
1745  * DMA translation.
1746  */
1747 static bool first_level_by_default(unsigned int type)
1748 {
1749         /* Only SL is available in legacy mode */
1750         if (!scalable_mode_support())
1751                 return false;
1752
1753         /* Only one level (either FL or SL) is available, just use it */
1754         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1755                 return intel_cap_flts_sanity();
1756
1757         /* Both levels are available, decide it based on domain type */
1758         return type != IOMMU_DOMAIN_UNMANAGED;
1759 }
1760
1761 static struct dmar_domain *alloc_domain(unsigned int type)
1762 {
1763         struct dmar_domain *domain;
1764
1765         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1766         if (!domain)
1767                 return NULL;
1768
1769         domain->nid = NUMA_NO_NODE;
1770         if (first_level_by_default(type))
1771                 domain->use_first_level = true;
1772         domain->has_iotlb_device = false;
1773         INIT_LIST_HEAD(&domain->devices);
1774         INIT_LIST_HEAD(&domain->dev_pasids);
1775         spin_lock_init(&domain->lock);
1776         xa_init(&domain->iommu_array);
1777
1778         return domain;
1779 }
1780
1781 static int domain_attach_iommu(struct dmar_domain *domain,
1782                                struct intel_iommu *iommu)
1783 {
1784         struct iommu_domain_info *info, *curr;
1785         unsigned long ndomains;
1786         int num, ret = -ENOSPC;
1787
1788         info = kzalloc(sizeof(*info), GFP_KERNEL);
1789         if (!info)
1790                 return -ENOMEM;
1791
1792         spin_lock(&iommu->lock);
1793         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1794         if (curr) {
1795                 curr->refcnt++;
1796                 spin_unlock(&iommu->lock);
1797                 kfree(info);
1798                 return 0;
1799         }
1800
1801         ndomains = cap_ndoms(iommu->cap);
1802         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1803         if (num >= ndomains) {
1804                 pr_err("%s: No free domain ids\n", iommu->name);
1805                 goto err_unlock;
1806         }
1807
1808         set_bit(num, iommu->domain_ids);
1809         info->refcnt    = 1;
1810         info->did       = num;
1811         info->iommu     = iommu;
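             /*
              * Publish this IOMMU's info in the domain's iommu_array.  If the
              * store fails (or an entry unexpectedly appeared), release the
              * domain ID reserved above.
              */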
1812         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1813                           NULL, info, GFP_ATOMIC);
1814         if (curr) {
1815                 ret = xa_err(curr) ? : -EBUSY;
1816                 goto err_clear;
1817         }
1818         domain_update_iommu_cap(domain);
1819
1820         spin_unlock(&iommu->lock);
1821         return 0;
1822
1823 err_clear:
1824         clear_bit(info->did, iommu->domain_ids);
1825 err_unlock:
1826         spin_unlock(&iommu->lock);
1827         kfree(info);
1828         return ret;
1829 }
1830
1831 static void domain_detach_iommu(struct dmar_domain *domain,
1832                                 struct intel_iommu *iommu)
1833 {
1834         struct iommu_domain_info *info;
1835
1836         spin_lock(&iommu->lock);
1837         info = xa_load(&domain->iommu_array, iommu->seq_id);
1838         if (--info->refcnt == 0) {
1839                 clear_bit(info->did, iommu->domain_ids);
1840                 xa_erase(&domain->iommu_array, iommu->seq_id);
1841                 domain->nid = NUMA_NO_NODE;
1842                 domain_update_iommu_cap(domain);
1843                 kfree(info);
1844         }
1845         spin_unlock(&iommu->lock);
1846 }
1847
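     /*
      * Round the guest address width up to the next width the page-table levels
      * can express (12 plus a multiple of 9), capped at 64.  For example, 48
      * stays 48, 50 rounds up to 57, and 60 would round to 66 and is capped at 64.
      */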
1848 static inline int guestwidth_to_adjustwidth(int gaw)
1849 {
1850         int agaw;
1851         int r = (gaw - 12) % 9;
1852
1853         if (r == 0)
1854                 agaw = gaw;
1855         else
1856                 agaw = gaw + 9 - r;
1857         if (agaw > 64)
1858                 agaw = 64;
1859         return agaw;
1860 }
1861
1862 static void domain_exit(struct dmar_domain *domain)
1863 {
1864         if (domain->pgd) {
1865                 LIST_HEAD(freelist);
1866
1867                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1868                 put_pages_list(&freelist);
1869         }
1870
1871         if (WARN_ON(!list_empty(&domain->devices)))
1872                 return;
1873
1874         kfree(domain);
1875 }
1876
1877 /*
1878  * Get the PASID directory size for scalable mode context entry.
1879  * Value of X in the PDTS field of a scalable mode context entry
1880  * indicates PASID directory with 2^(X + 7) entries.
1881  */
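     /*
      * Illustrative example, assuming PASID_PDE_SHIFT is 6 (64 PASIDs per
      * directory entry): max_pasid = 1 << 20 gives max_pde = 1 << 14, so
      * pds = 14 - 7 = 7, i.e. a directory of 2^(7 + 7) = 16384 entries
      * covering all 2^20 PASIDs.
      */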
1882 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1883 {
1884         unsigned long pds, max_pde;
1885
1886         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1887         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1888         if (pds < 7)
1889                 return 0;
1890
1891         return pds - 7;
1892 }
1893
1894 /*
1895  * Set the RID_PASID field of a scalable mode context entry. The
1896  * IOMMU hardware will use the PASID value set in this field when
1897  * translating DMA requests that carry no PASID.
1898  */
1899 static inline void
1900 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1901 {
1902         context->hi |= pasid & ((1 << 20) - 1);
1903 }
1904
1905 /*
1906  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1907  * entry.
1908  */
1909 static inline void context_set_sm_dte(struct context_entry *context)
1910 {
1911         context->lo |= BIT_ULL(2);
1912 }
1913
1914 /*
1915  * Set the PRE(Page Request Enable) field of a scalable mode context
1916  * entry.
1917  */
1918 static inline void context_set_sm_pre(struct context_entry *context)
1919 {
1920         context->lo |= BIT_ULL(4);
1921 }
1922
1923 /* Convert value to context PASID directory size field coding. */
1924 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1925
1926 static int domain_context_mapping_one(struct dmar_domain *domain,
1927                                       struct intel_iommu *iommu,
1928                                       struct pasid_table *table,
1929                                       u8 bus, u8 devfn)
1930 {
1931         struct device_domain_info *info =
1932                         domain_lookup_dev_info(domain, iommu, bus, devfn);
1933         u16 did = domain_id_iommu(domain, iommu);
1934         int translation = CONTEXT_TT_MULTI_LEVEL;
1935         struct context_entry *context;
1936         int ret;
1937
1938         if (hw_pass_through && domain_type_is_si(domain))
1939                 translation = CONTEXT_TT_PASS_THROUGH;
1940
1941         pr_debug("Set context mapping for %02x:%02x.%d\n",
1942                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1943
1944         spin_lock(&iommu->lock);
1945         ret = -ENOMEM;
1946         context = iommu_context_addr(iommu, bus, devfn, 1);
1947         if (!context)
1948                 goto out_unlock;
1949
1950         ret = 0;
1951         if (context_present(context) && !context_copied(iommu, bus, devfn))
1952                 goto out_unlock;
1953
1954         /*
1955          * For kdump cases, old valid entries may be cached due to the
1956          * in-flight DMA and copied pgtable, but there is no unmapping
1957          * behaviour for them, thus we need an explicit cache flush for
1958          * the newly-mapped device. For kdump, at this point, the device
1959          * is supposed to finish reset at its driver probe stage, so no
1960          * in-flight DMA will exist, and we don't need to worry about it
1961          * hereafter.
1962          */
1963         if (context_copied(iommu, bus, devfn)) {
1964                 u16 did_old = context_domain_id(context);
1965
1966                 if (did_old < cap_ndoms(iommu->cap)) {
1967                         iommu->flush.flush_context(iommu, did_old,
1968                                                    (((u16)bus) << 8) | devfn,
1969                                                    DMA_CCMD_MASK_NOBIT,
1970                                                    DMA_CCMD_DEVICE_INVL);
1971                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1972                                                  DMA_TLB_DSI_FLUSH);
1973                 }
1974
1975                 clear_context_copied(iommu, bus, devfn);
1976         }
1977
1978         context_clear_entry(context);
1979
1980         if (sm_supported(iommu)) {
1981                 unsigned long pds;
1982
1983                 /* Setup the PASID DIR pointer: */
1984                 pds = context_get_sm_pds(table);
1985                 context->lo = (u64)virt_to_phys(table->table) |
1986                                 context_pdts(pds);
1987
1988                 /* Setup the RID_PASID field: */
1989                 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1990
1991                 /*
1992                  * Setup the Device-TLB enable bit and Page request
1993                  * Enable bit:
1994                  */
1995                 if (info && info->ats_supported)
1996                         context_set_sm_dte(context);
1997                 if (info && info->pri_supported)
1998                         context_set_sm_pre(context);
1999                 if (info && info->pasid_supported)
2000                         context_set_pasid(context);
2001         } else {
2002                 struct dma_pte *pgd = domain->pgd;
2003                 int agaw;
2004
2005                 context_set_domain_id(context, did);
2006
2007                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2008                         /*
2009                          * Skip top levels of page tables for iommu which has
2010                          * less agaw than default. Unnecessary for PT mode.
2011                          */
2012                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2013                                 ret = -ENOMEM;
2014                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2015                                 if (!dma_pte_present(pgd))
2016                                         goto out_unlock;
2017                         }
2018
2019                         if (info && info->ats_supported)
2020                                 translation = CONTEXT_TT_DEV_IOTLB;
2021                         else
2022                                 translation = CONTEXT_TT_MULTI_LEVEL;
2023
2024                         context_set_address_root(context, virt_to_phys(pgd));
2025                         context_set_address_width(context, agaw);
2026                 } else {
2027                         /*
2028                          * In pass-through mode, AW must be programmed to
2029                          * indicate the largest AGAW value supported by
2030                          * hardware, and ASR is ignored by hardware.
2031                          */
2032                         context_set_address_width(context, iommu->msagaw);
2033                 }
2034
2035                 context_set_translation_type(context, translation);
2036         }
2037
2038         context_set_fault_enable(context);
2039         context_set_present(context);
2040         if (!ecap_coherent(iommu->ecap))
2041                 clflush_cache_range(context, sizeof(*context));
2042
2043         /*
2044          * It's a non-present to present mapping. If hardware doesn't cache
2045          * non-present entries we only need to flush the write-buffer. If it
2046          * _does_ cache non-present entries, then it does so in the special
2047          * domain #0, which we have to flush:
2048          */
2049         if (cap_caching_mode(iommu->cap)) {
2050                 iommu->flush.flush_context(iommu, 0,
2051                                            (((u16)bus) << 8) | devfn,
2052                                            DMA_CCMD_MASK_NOBIT,
2053                                            DMA_CCMD_DEVICE_INVL);
2054                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2055         } else {
2056                 iommu_flush_write_buffer(iommu);
2057         }
2058
2059         ret = 0;
2060
2061 out_unlock:
2062         spin_unlock(&iommu->lock);
2063
2064         return ret;
2065 }
2066
2067 struct domain_context_mapping_data {
2068         struct dmar_domain *domain;
2069         struct intel_iommu *iommu;
2070         struct pasid_table *table;
2071 };
2072
2073 static int domain_context_mapping_cb(struct pci_dev *pdev,
2074                                      u16 alias, void *opaque)
2075 {
2076         struct domain_context_mapping_data *data = opaque;
2077
2078         return domain_context_mapping_one(data->domain, data->iommu,
2079                                           data->table, PCI_BUS_NUM(alias),
2080                                           alias & 0xff);
2081 }
2082
2083 static int
2084 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2085 {
2086         struct domain_context_mapping_data data;
2087         struct pasid_table *table;
2088         struct intel_iommu *iommu;
2089         u8 bus, devfn;
2090
2091         iommu = device_to_iommu(dev, &bus, &devfn);
2092         if (!iommu)
2093                 return -ENODEV;
2094
2095         table = intel_pasid_get_table(dev);
2096
2097         if (!dev_is_pci(dev))
2098                 return domain_context_mapping_one(domain, iommu, table,
2099                                                   bus, devfn);
2100
2101         data.domain = domain;
2102         data.iommu = iommu;
2103         data.table = table;
2104
2105         return pci_for_each_dma_alias(to_pci_dev(dev),
2106                                       &domain_context_mapping_cb, &data);
2107 }
2108
2109 /* Returns a number of VTD pages, but aligned to MM page size */
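     /*
      * For example, with 4KiB MM pages: host_addr = 0x1234 and size = 0x2000
      * give an in-page offset of 0x234, PAGE_ALIGN(0x234 + 0x2000) = 0x3000,
      * i.e. 3 VT-d pages.
      */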
2110 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2111                                             size_t size)
2112 {
2113         host_addr &= ~PAGE_MASK;
2114         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2115 }
2116
2117 /* Return largest possible superpage level for a given mapping */
2118 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2119                                           unsigned long iov_pfn,
2120                                           unsigned long phy_pfn,
2121                                           unsigned long pages)
2122 {
2123         int support, level = 1;
2124         unsigned long pfnmerge;
2125
2126         support = domain->iommu_superpage;
2127
2128         /* To use a large page, the virtual *and* physical addresses
2129            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2130            of them will mean we have to use smaller pages. So just
2131            merge them and check both at once. */
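             /*
              * For example, iommu_superpage >= 1 with both PFNs aligned to 512
              * pages and pages >= 512 yields level 2 (a 2MiB superpage).
              */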
2132         pfnmerge = iov_pfn | phy_pfn;
2133
2134         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2135                 pages >>= VTD_STRIDE_SHIFT;
2136                 if (!pages)
2137                         break;
2138                 pfnmerge >>= VTD_STRIDE_SHIFT;
2139                 level++;
2140                 support--;
2141         }
2142         return level;
2143 }
2144
2145 /*
2146  * Ensure that old small page tables are removed to make room for superpage(s).
2147  * We're going to add new large pages, so make sure we don't remove their parent
2148  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2149  */
2150 static void switch_to_super_page(struct dmar_domain *domain,
2151                                  unsigned long start_pfn,
2152                                  unsigned long end_pfn, int level)
2153 {
2154         unsigned long lvl_pages = lvl_to_nr_pages(level);
2155         struct iommu_domain_info *info;
2156         struct dma_pte *pte = NULL;
2157         unsigned long i;
2158
2159         while (start_pfn <= end_pfn) {
2160                 if (!pte)
2161                         pte = pfn_to_dma_pte(domain, start_pfn, &level,
2162                                              GFP_ATOMIC);
2163
2164                 if (dma_pte_present(pte)) {
2165                         dma_pte_free_pagetable(domain, start_pfn,
2166                                                start_pfn + lvl_pages - 1,
2167                                                level + 1);
2168
2169                         xa_for_each(&domain->iommu_array, i, info)
2170                                 iommu_flush_iotlb_psi(info->iommu, domain,
2171                                                       start_pfn, lvl_pages,
2172                                                       0, 0);
2173                 }
2174
2175                 pte++;
2176                 start_pfn += lvl_pages;
2177                 if (first_pte_in_page(pte))
2178                         pte = NULL;
2179         }
2180 }
2181
2182 static int
2183 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184                  unsigned long phys_pfn, unsigned long nr_pages, int prot,
2185                  gfp_t gfp)
2186 {
2187         struct dma_pte *first_pte = NULL, *pte = NULL;
2188         unsigned int largepage_lvl = 0;
2189         unsigned long lvl_pages = 0;
2190         phys_addr_t pteval;
2191         u64 attr;
2192
2193         if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2194                 return -EINVAL;
2195
2196         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2197                 return -EINVAL;
2198
2199         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2200         attr |= DMA_FL_PTE_PRESENT;
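             /*
              * First-level PTEs additionally need the user, accessed and
              * execute-disable bits; the dirty bit is pre-set for writable
              * mappings, presumably to spare hardware the atomic A/D updates.
              */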
2201         if (domain->use_first_level) {
2202                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2203                 if (prot & DMA_PTE_WRITE)
2204                         attr |= DMA_FL_PTE_DIRTY;
2205         }
2206
2207         domain->has_mappings = true;
2208
2209         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2210
2211         while (nr_pages > 0) {
2212                 uint64_t tmp;
2213
2214                 if (!pte) {
2215                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2216                                         phys_pfn, nr_pages);
2217
2218                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2219                                              gfp);
2220                         if (!pte)
2221                                 return -ENOMEM;
2222                         first_pte = pte;
2223
2224                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2225
2226                         /* It is a large page */
2227                         if (largepage_lvl > 1) {
2228                                 unsigned long end_pfn;
2229                                 unsigned long pages_to_remove;
2230
2231                                 pteval |= DMA_PTE_LARGE_PAGE;
2232                                 pages_to_remove = min_t(unsigned long, nr_pages,
2233                                                         nr_pte_to_next_page(pte) * lvl_pages);
2234                                 end_pfn = iov_pfn + pages_to_remove - 1;
2235                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2236                         } else {
2237                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2238                         }
2239
2240                 }
2241                 /* We don't need a lock here; nobody else
2242                  * touches this IOVA range.
2243                  */
2244                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2245                 if (tmp) {
2246                         static int dumps = 5;
2247                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2248                                 iov_pfn, tmp, (unsigned long long)pteval);
2249                         if (dumps) {
2250                                 dumps--;
2251                                 debug_dma_dump_mappings(NULL);
2252                         }
2253                         WARN_ON(1);
2254                 }
2255
2256                 nr_pages -= lvl_pages;
2257                 iov_pfn += lvl_pages;
2258                 phys_pfn += lvl_pages;
2259                 pteval += lvl_pages * VTD_PAGE_SIZE;
2260
2261                 /* If the next PTE would be the first in a new page, then we
2262                  * need to flush the cache on the entries we've just written.
2263                  * And then we'll need to recalculate 'pte', so clear it and
2264                  * let it get set again in the if (!pte) block above.
2265                  *
2266                  * If we're done (!nr_pages) we need to flush the cache too.
2267                  *
2268                  * Also if we've been setting superpages, we may need to
2269                  * recalculate 'pte' and switch back to smaller pages for the
2270                  * end of the mapping, if the trailing size is not enough to
2271                  * use another superpage (i.e. nr_pages < lvl_pages).
2272                  */
2273                 pte++;
2274                 if (!nr_pages || first_pte_in_page(pte) ||
2275                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2276                         domain_flush_cache(domain, first_pte,
2277                                            (void *)pte - (void *)first_pte);
2278                         pte = NULL;
2279                 }
2280         }
2281
2282         return 0;
2283 }
2284
2285 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2286 {
2287         struct intel_iommu *iommu = info->iommu;
2288         struct context_entry *context;
2289         u16 did_old;
2290
2291         if (!iommu)
2292                 return;
2293
2294         spin_lock(&iommu->lock);
2295         context = iommu_context_addr(iommu, bus, devfn, 0);
2296         if (!context) {
2297                 spin_unlock(&iommu->lock);
2298                 return;
2299         }
2300
2301         if (sm_supported(iommu)) {
2302                 if (hw_pass_through && domain_type_is_si(info->domain))
2303                         did_old = FLPT_DEFAULT_DID;
2304                 else
2305                         did_old = domain_id_iommu(info->domain, iommu);
2306         } else {
2307                 did_old = context_domain_id(context);
2308         }
2309
2310         context_clear_entry(context);
2311         __iommu_flush_cache(iommu, context, sizeof(*context));
2312         spin_unlock(&iommu->lock);
2313         iommu->flush.flush_context(iommu,
2314                                    did_old,
2315                                    (((u16)bus) << 8) | devfn,
2316                                    DMA_CCMD_MASK_NOBIT,
2317                                    DMA_CCMD_DEVICE_INVL);
2318
2319         if (sm_supported(iommu))
2320                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2321
2322         iommu->flush.flush_iotlb(iommu,
2323                                  did_old,
2324                                  0,
2325                                  0,
2326                                  DMA_TLB_DSI_FLUSH);
2327
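             /*
              * Invalidate the whole device TLB: address 0 with a mask of
              * MAX_AGAW_PFN_WIDTH covers the entire address space.
              */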
2328         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2329 }
2330
2331 static int domain_setup_first_level(struct intel_iommu *iommu,
2332                                     struct dmar_domain *domain,
2333                                     struct device *dev,
2334                                     u32 pasid)
2335 {
2336         struct dma_pte *pgd = domain->pgd;
2337         int agaw, level;
2338         int flags = 0;
2339
2340         /*
2341          * Skip top levels of page tables for iommu which has
2342          * less agaw than default. Unnecessary for PT mode.
2343          */
2344         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2345                 pgd = phys_to_virt(dma_pte_addr(pgd));
2346                 if (!dma_pte_present(pgd))
2347                         return -ENOMEM;
2348         }
2349
2350         level = agaw_to_level(agaw);
2351         if (level != 4 && level != 5)
2352                 return -EINVAL;
2353
2354         if (level == 5)
2355                 flags |= PASID_FLAG_FL5LP;
2356
2357         if (domain->force_snooping)
2358                 flags |= PASID_FLAG_PAGE_SNOOP;
2359
2360         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2361                                              domain_id_iommu(domain, iommu),
2362                                              flags);
2363 }
2364
2365 static bool dev_is_real_dma_subdevice(struct device *dev)
2366 {
2367         return dev && dev_is_pci(dev) &&
2368                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2369 }
2370
2371 static int iommu_domain_identity_map(struct dmar_domain *domain,
2372                                      unsigned long first_vpfn,
2373                                      unsigned long last_vpfn)
2374 {
2375         /*
2376          * The RMRR range might overlap with the physical memory range,
2377          * so clear it first.
2378          */
2379         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2380
2381         return __domain_mapping(domain, first_vpfn,
2382                                 first_vpfn, last_vpfn - first_vpfn + 1,
2383                                 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2384 }
2385
2386 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2387
2388 static int __init si_domain_init(int hw)
2389 {
2390         struct dmar_rmrr_unit *rmrr;
2391         struct device *dev;
2392         int i, nid, ret;
2393
2394         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2395         if (!si_domain)
2396                 return -EFAULT;
2397
2398         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2399                 domain_exit(si_domain);
2400                 si_domain = NULL;
2401                 return -EFAULT;
2402         }
2403
2404         if (hw)
2405                 return 0;
2406
2407         for_each_online_node(nid) {
2408                 unsigned long start_pfn, end_pfn;
2409                 int i;
2410
2411                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2412                         ret = iommu_domain_identity_map(si_domain,
2413                                         mm_to_dma_pfn_start(start_pfn),
2414                                         mm_to_dma_pfn_end(end_pfn));
2415                         if (ret)
2416                                 return ret;
2417                 }
2418         }
2419
2420         /*
2421          * Identity map the RMRRs so that devices with RMRRs can also use
2422          * the si_domain.
2423          */
2424         for_each_rmrr_units(rmrr) {
2425                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2426                                           i, dev) {
2427                         unsigned long long start = rmrr->base_address;
2428                         unsigned long long end = rmrr->end_address;
2429
2430                         if (WARN_ON(end < start ||
2431                                     end >> agaw_to_width(si_domain->agaw)))
2432                                 continue;
2433
2434                         ret = iommu_domain_identity_map(si_domain,
2435                                         mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2436                                         mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2437                         if (ret)
2438                                 return ret;
2439                 }
2440         }
2441
2442         return 0;
2443 }
2444
2445 static int dmar_domain_attach_device(struct dmar_domain *domain,
2446                                      struct device *dev)
2447 {
2448         struct device_domain_info *info = dev_iommu_priv_get(dev);
2449         struct intel_iommu *iommu;
2450         unsigned long flags;
2451         u8 bus, devfn;
2452         int ret;
2453
2454         iommu = device_to_iommu(dev, &bus, &devfn);
2455         if (!iommu)
2456                 return -ENODEV;
2457
2458         ret = domain_attach_iommu(domain, iommu);
2459         if (ret)
2460                 return ret;
2461         info->domain = domain;
2462         spin_lock_irqsave(&domain->lock, flags);
2463         list_add(&info->link, &domain->devices);
2464         spin_unlock_irqrestore(&domain->lock, flags);
2465
2466         /* PASID table is mandatory for a PCI device in scalable mode. */
2467         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2468                 /* Setup the PASID entry for requests without PASID: */
2469                 if (hw_pass_through && domain_type_is_si(domain))
2470                         ret = intel_pasid_setup_pass_through(iommu, domain,
2471                                         dev, IOMMU_NO_PASID);
2472                 else if (domain->use_first_level)
2473                         ret = domain_setup_first_level(iommu, domain, dev,
2474                                         IOMMU_NO_PASID);
2475                 else
2476                         ret = intel_pasid_setup_second_level(iommu, domain,
2477                                         dev, IOMMU_NO_PASID);
2478                 if (ret) {
2479                         dev_err(dev, "Setup RID2PASID failed\n");
2480                         device_block_translation(dev);
2481                         return ret;
2482                 }
2483         }
2484
2485         ret = domain_context_mapping(domain, dev);
2486         if (ret) {
2487                 dev_err(dev, "Domain context map failed\n");
2488                 device_block_translation(dev);
2489                 return ret;
2490         }
2491
2492         if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2493                 iommu_enable_pci_caps(info);
2494
2495         return 0;
2496 }
2497
2498 /**
2499  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2500  * is relaxable (i.e. is allowed to be not enforced under some conditions)
2501  * @dev: device handle
2502  *
2503  * We assume that PCI USB devices with RMRRs have them largely
2504  * for historical reasons and that the RMRR space is not actively used post
2505  * boot.  This exclusion may change if vendors begin to abuse it.
2506  *
2507  * The same exception is made for graphics devices, with the requirement that
2508  * any use of the RMRR regions will be torn down before assigning the device
2509  * to a guest.
2510  *
2511  * Return: true if the RMRR is relaxable, false otherwise
2512  */
2513 static bool device_rmrr_is_relaxable(struct device *dev)
2514 {
2515         struct pci_dev *pdev;
2516
2517         if (!dev_is_pci(dev))
2518                 return false;
2519
2520         pdev = to_pci_dev(dev);
2521         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2522                 return true;
2523         else
2524                 return false;
2525 }
2526
2527 /*
2528  * Return the required default domain type for a specific device.
2529  *
2530  * @dev: the device in query
2532  *
2533  * Returns:
2534  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2535  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2536  *  - 0: both identity and dynamic domains work for this device
2537  */
2538 static int device_def_domain_type(struct device *dev)
2539 {
2540         if (dev_is_pci(dev)) {
2541                 struct pci_dev *pdev = to_pci_dev(dev);
2542
2543                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2544                         return IOMMU_DOMAIN_IDENTITY;
2545
2546                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2547                         return IOMMU_DOMAIN_IDENTITY;
2548         }
2549
2550         return 0;
2551 }
2552
2553 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2554 {
2555         /*
2556          * Start from a sane IOMMU hardware state.
2557          * If queued invalidation was already initialized by us
2558          * (for example, while enabling interrupt remapping) then
2559          * things are already rolling from a sane state.
2560          */
2561         if (!iommu->qi) {
2562                 /*
2563                  * Clear any previous faults.
2564                  */
2565                 dmar_fault(-1, iommu);
2566                 /*
2567                  * Disable queued invalidation if supported and already enabled
2568                  * before OS handover.
2569                  */
2570                 dmar_disable_qi(iommu);
2571         }
2572
2573         if (dmar_enable_qi(iommu)) {
2574                 /*
2575                  * Queued invalidation is not enabled; use register-based invalidation
2576                  */
2577                 iommu->flush.flush_context = __iommu_flush_context;
2578                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2579                 pr_info("%s: Using Register based invalidation\n",
2580                         iommu->name);
2581         } else {
2582                 iommu->flush.flush_context = qi_flush_context;
2583                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2584                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2585         }
2586 }
2587
2588 static int copy_context_table(struct intel_iommu *iommu,
2589                               struct root_entry *old_re,
2590                               struct context_entry **tbl,
2591                               int bus, bool ext)
2592 {
2593         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2594         struct context_entry *new_ce = NULL, ce;
2595         struct context_entry *old_ce = NULL;
2596         struct root_entry re;
2597         phys_addr_t old_ce_phys;
2598
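             /*
              * In the extended (scalable-mode) layout each bus needs two
              * context tables (devfn 0x00-0x7f via the lower pointer, 0x80-0xff
              * via the upper one) and each entry is 256 bits wide, hence both
              * the bus index and the devfn index are doubled below.
              */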
2599         tbl_idx = ext ? bus * 2 : bus;
2600         memcpy(&re, old_re, sizeof(re));
2601
2602         for (devfn = 0; devfn < 256; devfn++) {
2603                 /* First calculate the correct index */
2604                 idx = (ext ? devfn * 2 : devfn) % 256;
2605
2606                 if (idx == 0) {
2607                         /* First save what we may have and clean up */
2608                         if (new_ce) {
2609                                 tbl[tbl_idx] = new_ce;
2610                                 __iommu_flush_cache(iommu, new_ce,
2611                                                     VTD_PAGE_SIZE);
2612                                 pos = 1;
2613                         }
2614
2615                         if (old_ce)
2616                                 memunmap(old_ce);
2617
2618                         ret = 0;
2619                         if (devfn < 0x80)
2620                                 old_ce_phys = root_entry_lctp(&re);
2621                         else
2622                                 old_ce_phys = root_entry_uctp(&re);
2623
2624                         if (!old_ce_phys) {
2625                                 if (ext && devfn == 0) {
2626                                         /* No LCTP, try UCTP */
2627                                         devfn = 0x7f;
2628                                         continue;
2629                                 } else {
2630                                         goto out;
2631                                 }
2632                         }
2633
2634                         ret = -ENOMEM;
2635                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2636                                         MEMREMAP_WB);
2637                         if (!old_ce)
2638                                 goto out;
2639
2640                         new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2641                         if (!new_ce)
2642                                 goto out_unmap;
2643
2644                         ret = 0;
2645                 }
2646
2647                 /* Now copy the context entry */
2648                 memcpy(&ce, old_ce + idx, sizeof(ce));
2649
2650                 if (!context_present(&ce))
2651                         continue;
2652
2653                 did = context_domain_id(&ce);
2654                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2655                         set_bit(did, iommu->domain_ids);
2656
2657                 set_context_copied(iommu, bus, devfn);
2658                 new_ce[idx] = ce;
2659         }
2660
2661         tbl[tbl_idx + pos] = new_ce;
2662
2663         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2664
2665 out_unmap:
2666         memunmap(old_ce);
2667
2668 out:
2669         return ret;
2670 }
2671
2672 static int copy_translation_tables(struct intel_iommu *iommu)
2673 {
2674         struct context_entry **ctxt_tbls;
2675         struct root_entry *old_rt;
2676         phys_addr_t old_rt_phys;
2677         int ctxt_table_entries;
2678         u64 rtaddr_reg;
2679         int bus, ret;
2680         bool new_ext, ext;
2681
2682         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2683         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2684         new_ext    = !!sm_supported(iommu);
2685
2686         /*
2687          * The RTT bit can only be changed when translation is disabled,
2688          * but disabling translation would open a window for data
2689          * corruption. So bail out and don't copy anything if we would
2690          * have to change the bit.
2691          */
2692         if (new_ext != ext)
2693                 return -EINVAL;
2694
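             /* One bit per possible source-id (bus << 8 | devfn), i.e. 2^16 RIDs. */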
2695         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2696         if (!iommu->copied_tables)
2697                 return -ENOMEM;
2698
2699         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2700         if (!old_rt_phys)
2701                 return -EINVAL;
2702
2703         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2704         if (!old_rt)
2705                 return -ENOMEM;
2706
2707         /* This is too big for the stack - allocate it from slab */
2708         ctxt_table_entries = ext ? 512 : 256;
2709         ret = -ENOMEM;
2710         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2711         if (!ctxt_tbls)
2712                 goto out_unmap;
2713
2714         for (bus = 0; bus < 256; bus++) {
2715                 ret = copy_context_table(iommu, &old_rt[bus],
2716                                          ctxt_tbls, bus, ext);
2717                 if (ret) {
2718                         pr_err("%s: Failed to copy context table for bus %d\n",
2719                                 iommu->name, bus);
2720                         continue;
2721                 }
2722         }
2723
2724         spin_lock(&iommu->lock);
2725
2726         /* Context tables are copied, now write them to the root_entry table */
2727         for (bus = 0; bus < 256; bus++) {
2728                 int idx = ext ? bus * 2 : bus;
2729                 u64 val;
2730
2731                 if (ctxt_tbls[idx]) {
2732                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2733                         iommu->root_entry[bus].lo = val;
2734                 }
2735
2736                 if (!ext || !ctxt_tbls[idx + 1])
2737                         continue;
2738
2739                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2740                 iommu->root_entry[bus].hi = val;
2741         }
2742
2743         spin_unlock(&iommu->lock);
2744
2745         kfree(ctxt_tbls);
2746
2747         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2748
2749         ret = 0;
2750
2751 out_unmap:
2752         memunmap(old_rt);
2753
2754         return ret;
2755 }
2756
2757 static int __init init_dmars(void)
2758 {
2759         struct dmar_drhd_unit *drhd;
2760         struct intel_iommu *iommu;
2761         int ret;
2762
2763         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2764         if (ret)
2765                 goto free_iommu;
2766
2767         for_each_iommu(iommu, drhd) {
2768                 if (drhd->ignored) {
2769                         iommu_disable_translation(iommu);
2770                         continue;
2771                 }
2772
2773                 /*
2774                  * Find the max PASID size of all IOMMUs in the system.
2775                  * We need to ensure the system PASID table is no bigger
2776                  * than the smallest supported size.
2777                  */
2778                 if (pasid_supported(iommu)) {
2779                         u32 temp = 2 << ecap_pss(iommu->ecap);
2780
2781                         intel_pasid_max_id = min_t(u32, temp,
2782                                                    intel_pasid_max_id);
2783                 }
2784
2785                 intel_iommu_init_qi(iommu);
2786
2787                 ret = iommu_init_domains(iommu);
2788                 if (ret)
2789                         goto free_iommu;
2790
2791                 init_translation_status(iommu);
2792
2793                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2794                         iommu_disable_translation(iommu);
2795                         clear_translation_pre_enabled(iommu);
2796                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2797                                 iommu->name);
2798                 }
2799
2800                 /*
2801                  * TBD:
2802                  * we could share the same root & context tables
2803                  * among all IOMMUs. Need to split it later.
2804                  */
2805                 ret = iommu_alloc_root_entry(iommu);
2806                 if (ret)
2807                         goto free_iommu;
2808
2809                 if (translation_pre_enabled(iommu)) {
2810                         pr_info("Translation already enabled - trying to copy translation structures\n");
2811
2812                         ret = copy_translation_tables(iommu);
2813                         if (ret) {
2814                                 /*
2815                                  * We found the IOMMU with translation
2816                                  * enabled - but failed to copy over the
2817                                  * old root-entry table. Try to proceed
2818                                  * by disabling translation now and
2819                                  * allocating a clean root-entry table.
2820                                  * This might cause DMAR faults, but
2821                                  * probably the dump will still succeed.
2822                                  */
2823                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2824                                        iommu->name);
2825                                 iommu_disable_translation(iommu);
2826                                 clear_translation_pre_enabled(iommu);
2827                         } else {
2828                                 pr_info("Copied translation tables from previous kernel for %s\n",
2829                                         iommu->name);
2830                         }
2831                 }
2832
2833                 if (!ecap_pass_through(iommu->ecap))
2834                         hw_pass_through = 0;
2835                 intel_svm_check(iommu);
2836         }
2837
2838         /*
2839          * Now that qi is enabled on all iommus, set the root entry and flush
2840          * caches. This is required on some Intel X58 chipsets, otherwise the
2841          * flush_context function will loop forever and the boot hangs.
2842          */
2843         for_each_active_iommu(iommu, drhd) {
2844                 iommu_flush_write_buffer(iommu);
2845                 iommu_set_root_entry(iommu);
2846         }
2847
2848 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2849         dmar_map_gfx = 0;
2850 #endif
2851
2852         if (!dmar_map_gfx)
2853                 iommu_identity_mapping |= IDENTMAP_GFX;
2854
2855         check_tylersburg_isoch();
2856
2857         ret = si_domain_init(hw_pass_through);
2858         if (ret)
2859                 goto free_iommu;
2860
2861         /*
2862          * for each drhd
2863          *   enable fault log
2864          *   global invalidate context cache
2865          *   global invalidate iotlb
2866          *   enable translation
2867          */
2868         for_each_iommu(iommu, drhd) {
2869                 if (drhd->ignored) {
2870                         /*
2871                          * we always have to disable PMRs or DMA may fail on
2872                          * this device
2873                          */
2874                         if (force_on)
2875                                 iommu_disable_protect_mem_regions(iommu);
2876                         continue;
2877                 }
2878
2879                 iommu_flush_write_buffer(iommu);
2880
2881 #ifdef CONFIG_INTEL_IOMMU_SVM
2882                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2883                         /*
2884                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
2885                          * could cause a lock race condition.
2886                          */
2887                         up_write(&dmar_global_lock);
2888                         ret = intel_svm_enable_prq(iommu);
2889                         down_write(&dmar_global_lock);
2890                         if (ret)
2891                                 goto free_iommu;
2892                 }
2893 #endif
2894                 ret = dmar_set_interrupt(iommu);
2895                 if (ret)
2896                         goto free_iommu;
2897         }
2898
2899         return 0;
2900
2901 free_iommu:
2902         for_each_active_iommu(iommu, drhd) {
2903                 disable_dmar_iommu(iommu);
2904                 free_dmar_iommu(iommu);
2905         }
2906         if (si_domain) {
2907                 domain_exit(si_domain);
2908                 si_domain = NULL;
2909         }
2910
2911         return ret;
2912 }
2913
2914 static void __init init_no_remapping_devices(void)
2915 {
2916         struct dmar_drhd_unit *drhd;
2917         struct device *dev;
2918         int i;
2919
2920         for_each_drhd_unit(drhd) {
2921                 if (!drhd->include_all) {
2922                         for_each_active_dev_scope(drhd->devices,
2923                                                   drhd->devices_cnt, i, dev)
2924                                 break;
2925                         /* ignore DMAR unit if no devices exist */
2926                         if (i == drhd->devices_cnt)
2927                                 drhd->ignored = 1;
2928                 }
2929         }
2930
2931         for_each_active_drhd_unit(drhd) {
2932                 if (drhd->include_all)
2933                         continue;
2934
2935                 for_each_active_dev_scope(drhd->devices,
2936                                           drhd->devices_cnt, i, dev)
2937                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2938                                 break;
2939                 if (i < drhd->devices_cnt)
2940                         continue;
2941
2942                 /* This IOMMU has *only* gfx devices. Either bypass it or
2943                    set the gfx_dedicated flag, as appropriate */
2944                 drhd->gfx_dedicated = 1;
2945                 if (!dmar_map_gfx)
2946                         drhd->ignored = 1;
2947         }
2948 }
2949
2950 #ifdef CONFIG_SUSPEND
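/*
 * Reprogram the IOMMU hardware after suspend: re-enable queued
 * invalidation where it was in use, then restore the root entry and
 * re-enable translation on every unit that is not ignored.
 */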
2951 static int init_iommu_hw(void)
2952 {
2953         struct dmar_drhd_unit *drhd;
2954         struct intel_iommu *iommu = NULL;
2955         int ret;
2956
2957         for_each_active_iommu(iommu, drhd) {
2958                 if (iommu->qi) {
2959                         ret = dmar_reenable_qi(iommu);
2960                         if (ret)
2961                                 return ret;
2962                 }
2963         }
2964
2965         for_each_iommu(iommu, drhd) {
2966                 if (drhd->ignored) {
2967                         /*
2968                          * we always have to disable PMRs or DMA may fail on
2969                          * this device
2970                          */
2971                         if (force_on)
2972                                 iommu_disable_protect_mem_regions(iommu);
2973                         continue;
2974                 }
2975
2976                 iommu_flush_write_buffer(iommu);
2977                 iommu_set_root_entry(iommu);
2978                 iommu_enable_translation(iommu);
2979                 iommu_disable_protect_mem_regions(iommu);
2980         }
2981
2982         return 0;
2983 }
2984
2985 static void iommu_flush_all(void)
2986 {
2987         struct dmar_drhd_unit *drhd;
2988         struct intel_iommu *iommu;
2989
2990         for_each_active_iommu(iommu, drhd) {
2991                 iommu->flush.flush_context(iommu, 0, 0, 0,
2992                                            DMA_CCMD_GLOBAL_INVL);
2993                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2994                                          DMA_TLB_GLOBAL_FLUSH);
2995         }
2996 }
2997
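/*
 * On suspend, flush all caches, disable translation and save the
 * fault-event registers so that iommu_resume() can restore them.
 */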
2998 static int iommu_suspend(void)
2999 {
3000         struct dmar_drhd_unit *drhd;
3001         struct intel_iommu *iommu = NULL;
3002         unsigned long flag;
3003
3004         iommu_flush_all();
3005
3006         for_each_active_iommu(iommu, drhd) {
3007                 iommu_disable_translation(iommu);
3008
3009                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3010
3011                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3012                         readl(iommu->reg + DMAR_FECTL_REG);
3013                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3014                         readl(iommu->reg + DMAR_FEDATA_REG);
3015                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3016                         readl(iommu->reg + DMAR_FEADDR_REG);
3017                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3018                         readl(iommu->reg + DMAR_FEUADDR_REG);
3019
3020                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3021         }
3022         return 0;
3023 }
3024
3025 static void iommu_resume(void)
3026 {
3027         struct dmar_drhd_unit *drhd;
3028         struct intel_iommu *iommu = NULL;
3029         unsigned long flag;
3030
3031         if (init_iommu_hw()) {
3032                 if (force_on)
3033                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3034                 else
3035                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3036                 return;
3037         }
3038
3039         for_each_active_iommu(iommu, drhd) {
3040
3041                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3042
3043                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3044                         iommu->reg + DMAR_FECTL_REG);
3045                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3046                         iommu->reg + DMAR_FEDATA_REG);
3047                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3048                         iommu->reg + DMAR_FEADDR_REG);
3049                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3050                         iommu->reg + DMAR_FEUADDR_REG);
3051
3052                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3053         }
3054 }
3055
3056 static struct syscore_ops iommu_syscore_ops = {
3057         .resume         = iommu_resume,
3058         .suspend        = iommu_suspend,
3059 };
3060
3061 static void __init init_iommu_pm_ops(void)
3062 {
3063         register_syscore_ops(&iommu_syscore_ops);
3064 }
3065
3066 #else
3067 static inline void init_iommu_pm_ops(void) {}
3068 #endif  /* CONFIG_SUSPEND */
3069
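/*
 * RMRR regions reported by firmware must be page aligned, describe a
 * non-empty range and pass the architecture specific check; anything
 * else is rejected and reported as a firmware bug by the caller.
 */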
3070 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3071 {
3072         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3073             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3074             rmrr->end_address <= rmrr->base_address ||
3075             arch_rmrr_sanity_check(rmrr))
3076                 return -EINVAL;
3077
3078         return 0;
3079 }
3080
3081 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3082 {
3083         struct acpi_dmar_reserved_memory *rmrr;
3084         struct dmar_rmrr_unit *rmrru;
3085
3086         rmrr = (struct acpi_dmar_reserved_memory *)header;
3087         if (rmrr_sanity_check(rmrr)) {
3088                 pr_warn(FW_BUG
3089                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3090                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3091                            rmrr->base_address, rmrr->end_address,
3092                            dmi_get_system_info(DMI_BIOS_VENDOR),
3093                            dmi_get_system_info(DMI_BIOS_VERSION),
3094                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3095                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3096         }
3097
3098         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3099         if (!rmrru)
3100                 goto out;
3101
3102         rmrru->hdr = header;
3103
3104         rmrru->base_address = rmrr->base_address;
3105         rmrru->end_address = rmrr->end_address;
3106
3107         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3108                                 ((void *)rmrr) + rmrr->header.length,
3109                                 &rmrru->devices_cnt);
3110         if (rmrru->devices_cnt && rmrru->devices == NULL)
3111                 goto free_rmrru;
3112
3113         list_add(&rmrru->list, &dmar_rmrr_units);
3114
3115         return 0;
3116 free_rmrru:
3117         kfree(rmrru);
3118 out:
3119         return -ENOMEM;
3120 }
3121
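/* Find an already parsed ATSR unit that exactly matches @atsr, if any. */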
3122 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3123 {
3124         struct dmar_atsr_unit *atsru;
3125         struct acpi_dmar_atsr *tmp;
3126
3127         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3128                                 dmar_rcu_check()) {
3129                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3130                 if (atsr->segment != tmp->segment)
3131                         continue;
3132                 if (atsr->header.length != tmp->header.length)
3133                         continue;
3134                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3135                         return atsru;
3136         }
3137
3138         return NULL;
3139 }
3140
3141 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3142 {
3143         struct acpi_dmar_atsr *atsr;
3144         struct dmar_atsr_unit *atsru;
3145
3146         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3147                 return 0;
3148
3149         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3150         atsru = dmar_find_atsr(atsr);
3151         if (atsru)
3152                 return 0;
3153
3154         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3155         if (!atsru)
3156                 return -ENOMEM;
3157
3158         /*
3159          * If memory is allocated from slab by ACPI _DSM method, we need to
3160          * copy the memory content because the memory buffer will be freed
3161          * on return.
3162          */
3163         atsru->hdr = (void *)(atsru + 1);
3164         memcpy(atsru->hdr, hdr, hdr->length);
3165         atsru->include_all = atsr->flags & 0x1;
3166         if (!atsru->include_all) {
3167                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3168                                 (void *)atsr + atsr->header.length,
3169                                 &atsru->devices_cnt);
3170                 if (atsru->devices_cnt && atsru->devices == NULL) {
3171                         kfree(atsru);
3172                         return -ENOMEM;
3173                 }
3174         }
3175
3176         list_add_rcu(&atsru->list, &dmar_atsr_units);
3177
3178         return 0;
3179 }
3180
3181 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3182 {
3183         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3184         kfree(atsru);
3185 }
3186
3187 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3188 {
3189         struct acpi_dmar_atsr *atsr;
3190         struct dmar_atsr_unit *atsru;
3191
3192         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3193         atsru = dmar_find_atsr(atsr);
3194         if (atsru) {
3195                 list_del_rcu(&atsru->list);
3196                 synchronize_rcu();
3197                 intel_iommu_free_atsr(atsru);
3198         }
3199
3200         return 0;
3201 }
3202
3203 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3204 {
3205         int i;
3206         struct device *dev;
3207         struct acpi_dmar_atsr *atsr;
3208         struct dmar_atsr_unit *atsru;
3209
3210         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3211         atsru = dmar_find_atsr(atsr);
3212         if (!atsru)
3213                 return 0;
3214
3215         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3216                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3217                                           i, dev)
3218                         return -EBUSY;
3219         }
3220
3221         return 0;
3222 }
3223
3224 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3225 {
3226         struct dmar_satc_unit *satcu;
3227         struct acpi_dmar_satc *tmp;
3228
3229         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3230                                 dmar_rcu_check()) {
3231                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3232                 if (satc->segment != tmp->segment)
3233                         continue;
3234                 if (satc->header.length != tmp->header.length)
3235                         continue;
3236                 if (memcmp(satc, tmp, satc->header.length) == 0)
3237                         return satcu;
3238         }
3239
3240         return NULL;
3241 }
3242
3243 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3244 {
3245         struct acpi_dmar_satc *satc;
3246         struct dmar_satc_unit *satcu;
3247
3248         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3249                 return 0;
3250
3251         satc = container_of(hdr, struct acpi_dmar_satc, header);
3252         satcu = dmar_find_satc(satc);
3253         if (satcu)
3254                 return 0;
3255
3256         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3257         if (!satcu)
3258                 return -ENOMEM;
3259
3260         satcu->hdr = (void *)(satcu + 1);
3261         memcpy(satcu->hdr, hdr, hdr->length);
3262         satcu->atc_required = satc->flags & 0x1;
3263         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3264                                               (void *)satc + satc->header.length,
3265                                               &satcu->devices_cnt);
3266         if (satcu->devices_cnt && !satcu->devices) {
3267                 kfree(satcu);
3268                 return -ENOMEM;
3269         }
3270         list_add_rcu(&satcu->list, &dmar_satc_units);
3271
3272         return 0;
3273 }
3274
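/*
 * Bring up a DMAR unit added at runtime: audit its capabilities against
 * the running configuration, allocate domain IDs and a root entry, then
 * enable queued invalidation, interrupts and translation as needed.
 */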
3275 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3276 {
3277         int sp, ret;
3278         struct intel_iommu *iommu = dmaru->iommu;
3279
3280         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3281         if (ret)
3282                 goto out;
3283
3284         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3285                 pr_warn("%s: Doesn't support hardware pass through.\n",
3286                         iommu->name);
3287                 return -ENXIO;
3288         }
3289
3290         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3291         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3292                 pr_warn("%s: Doesn't support large page.\n",
3293                         iommu->name);
3294                 return -ENXIO;
3295         }
3296
3297         /*
3298          * Disable translation if already enabled prior to OS handover.
3299          */
3300         if (iommu->gcmd & DMA_GCMD_TE)
3301                 iommu_disable_translation(iommu);
3302
3303         ret = iommu_init_domains(iommu);
3304         if (ret == 0)
3305                 ret = iommu_alloc_root_entry(iommu);
3306         if (ret)
3307                 goto out;
3308
3309         intel_svm_check(iommu);
3310
3311         if (dmaru->ignored) {
3312                 /*
3313                  * we always have to disable PMRs or DMA may fail on this device
3314                  */
3315                 if (force_on)
3316                         iommu_disable_protect_mem_regions(iommu);
3317                 return 0;
3318         }
3319
3320         intel_iommu_init_qi(iommu);
3321         iommu_flush_write_buffer(iommu);
3322
3323 #ifdef CONFIG_INTEL_IOMMU_SVM
3324         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3325                 ret = intel_svm_enable_prq(iommu);
3326                 if (ret)
3327                         goto disable_iommu;
3328         }
3329 #endif
3330         ret = dmar_set_interrupt(iommu);
3331         if (ret)
3332                 goto disable_iommu;
3333
3334         iommu_set_root_entry(iommu);
3335         iommu_enable_translation(iommu);
3336
3337         iommu_disable_protect_mem_regions(iommu);
3338         return 0;
3339
3340 disable_iommu:
3341         disable_dmar_iommu(iommu);
3342 out:
3343         free_dmar_iommu(iommu);
3344         return ret;
3345 }
3346
3347 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3348 {
3349         int ret = 0;
3350         struct intel_iommu *iommu = dmaru->iommu;
3351
3352         if (!intel_iommu_enabled)
3353                 return 0;
3354         if (iommu == NULL)
3355                 return -EINVAL;
3356
3357         if (insert) {
3358                 ret = intel_iommu_add(dmaru);
3359         } else {
3360                 disable_dmar_iommu(iommu);
3361                 free_dmar_iommu(iommu);
3362         }
3363
3364         return ret;
3365 }
3366
3367 static void intel_iommu_free_dmars(void)
3368 {
3369         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3370         struct dmar_atsr_unit *atsru, *atsr_n;
3371         struct dmar_satc_unit *satcu, *satc_n;
3372
3373         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3374                 list_del(&rmrru->list);
3375                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3376                 kfree(rmrru);
3377         }
3378
3379         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3380                 list_del(&atsru->list);
3381                 intel_iommu_free_atsr(atsru);
3382         }
3383         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3384                 list_del(&satcu->list);
3385                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3386                 kfree(satcu);
3387         }
3388 }
3389
3390 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3391 {
3392         struct dmar_satc_unit *satcu;
3393         struct acpi_dmar_satc *satc;
3394         struct device *tmp;
3395         int i;
3396
3397         dev = pci_physfn(dev);
3398         rcu_read_lock();
3399
3400         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3401                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3402                 if (satc->segment != pci_domain_nr(dev->bus))
3403                         continue;
3404                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3405                         if (to_pci_dev(tmp) == dev)
3406                                 goto out;
3407         }
3408         satcu = NULL;
3409 out:
3410         rcu_read_unlock();
3411         return satcu;
3412 }
3413
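/*
 * Decide whether ATS may be used for @dev: a matching SATC entry decides
 * directly, integrated endpoints (no upstream bridge) are allowed,
 * conventional PCI paths are not, and otherwise the device's root port
 * must be covered by an ATSR unit.
 */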
3414 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3415 {
3416         int i, ret = 1;
3417         struct pci_bus *bus;
3418         struct pci_dev *bridge = NULL;
3419         struct device *tmp;
3420         struct acpi_dmar_atsr *atsr;
3421         struct dmar_atsr_unit *atsru;
3422         struct dmar_satc_unit *satcu;
3423
3424         dev = pci_physfn(dev);
3425         satcu = dmar_find_matched_satc_unit(dev);
3426         if (satcu)
3427                 /*
3428                  * This device supports ATS because it is listed in a SATC
3429                  * table. When the IOMMU is in legacy mode, the hardware
3430                  * enables ATS automatically for devices that require it,
3431                  * so the OS must not enable ATS on the device as well, to
3432                  * avoid duplicated TLB invalidations.
3433                  */
3434                 return !(satcu->atc_required && !sm_supported(iommu));
3435
3436         for (bus = dev->bus; bus; bus = bus->parent) {
3437                 bridge = bus->self;
3438                 /* If it's an integrated device, allow ATS */
3439                 if (!bridge)
3440                         return 1;
3441                 /* Connected via non-PCIe: no ATS */
3442                 if (!pci_is_pcie(bridge) ||
3443                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3444                         return 0;
3445                 /* If we found the root port, look it up in the ATSR */
3446                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3447                         break;
3448         }
3449
3450         rcu_read_lock();
3451         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3452                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3453                 if (atsr->segment != pci_domain_nr(dev->bus))
3454                         continue;
3455
3456                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3457                         if (tmp == &bridge->dev)
3458                                 goto out;
3459
3460                 if (atsru->include_all)
3461                         goto out;
3462         }
3463         ret = 0;
3464 out:
3465         rcu_read_unlock();
3466
3467         return ret;
3468 }
3469
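/*
 * PCI bus notifier callback: keep the device scopes of all RMRR, ATSR
 * and SATC units in sync as devices are added to or removed from the bus.
 */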
3470 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3471 {
3472         int ret;
3473         struct dmar_rmrr_unit *rmrru;
3474         struct dmar_atsr_unit *atsru;
3475         struct dmar_satc_unit *satcu;
3476         struct acpi_dmar_atsr *atsr;
3477         struct acpi_dmar_reserved_memory *rmrr;
3478         struct acpi_dmar_satc *satc;
3479
3480         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3481                 return 0;
3482
3483         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3484                 rmrr = container_of(rmrru->hdr,
3485                                     struct acpi_dmar_reserved_memory, header);
3486                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3487                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3488                                 ((void *)rmrr) + rmrr->header.length,
3489                                 rmrr->segment, rmrru->devices,
3490                                 rmrru->devices_cnt);
3491                         if (ret < 0)
3492                                 return ret;
3493                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3494                         dmar_remove_dev_scope(info, rmrr->segment,
3495                                 rmrru->devices, rmrru->devices_cnt);
3496                 }
3497         }
3498
3499         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3500                 if (atsru->include_all)
3501                         continue;
3502
3503                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3504                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3505                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3506                                         (void *)atsr + atsr->header.length,
3507                                         atsr->segment, atsru->devices,
3508                                         atsru->devices_cnt);
3509                         if (ret > 0)
3510                                 break;
3511                         else if (ret < 0)
3512                                 return ret;
3513                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3514                         if (dmar_remove_dev_scope(info, atsr->segment,
3515                                         atsru->devices, atsru->devices_cnt))
3516                                 break;
3517                 }
3518         }
3519         list_for_each_entry(satcu, &dmar_satc_units, list) {
3520                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3521                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3522                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3523                                         (void *)satc + satc->header.length,
3524                                         satc->segment, satcu->devices,
3525                                         satcu->devices_cnt);
3526                         if (ret > 0)
3527                                 break;
3528                         else if (ret < 0)
3529                                 return ret;
3530                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3531                         if (dmar_remove_dev_scope(info, satc->segment,
3532                                         satcu->devices, satcu->devices_cnt))
3533                                 break;
3534                 }
3535         }
3536
3537         return 0;
3538 }
3539
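/*
 * Memory hotplug notifier for the static identity domain: extend the
 * identity map when memory goes online, and unmap and flush the range
 * again when it goes offline or the onlining is cancelled.
 */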
3540 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3541                                        unsigned long val, void *v)
3542 {
3543         struct memory_notify *mhp = v;
3544         unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3545         unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3546                         mhp->nr_pages - 1);
3547
3548         switch (val) {
3549         case MEM_GOING_ONLINE:
3550                 if (iommu_domain_identity_map(si_domain,
3551                                               start_vpfn, last_vpfn)) {
3552                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3553                                 start_vpfn, last_vpfn);
3554                         return NOTIFY_BAD;
3555                 }
3556                 break;
3557
3558         case MEM_OFFLINE:
3559         case MEM_CANCEL_ONLINE:
3560                 {
3561                         struct dmar_drhd_unit *drhd;
3562                         struct intel_iommu *iommu;
3563                         LIST_HEAD(freelist);
3564
3565                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3566
3567                         rcu_read_lock();
3568                         for_each_active_iommu(iommu, drhd)
3569                                 iommu_flush_iotlb_psi(iommu, si_domain,
3570                                         start_vpfn, mhp->nr_pages,
3571                                         list_empty(&freelist), 0);
3572                         rcu_read_unlock();
3573                         put_pages_list(&freelist);
3574                 }
3575                 break;
3576         }
3577
3578         return NOTIFY_OK;
3579 }
3580
3581 static struct notifier_block intel_iommu_memory_nb = {
3582         .notifier_call = intel_iommu_memory_notifier,
3583         .priority = 0
3584 };
3585
3586 static void intel_disable_iommus(void)
3587 {
3588         struct intel_iommu *iommu = NULL;
3589         struct dmar_drhd_unit *drhd;
3590
3591         for_each_iommu(iommu, drhd)
3592                 iommu_disable_translation(iommu);
3593 }
3594
3595 void intel_iommu_shutdown(void)
3596 {
3597         struct dmar_drhd_unit *drhd;
3598         struct intel_iommu *iommu = NULL;
3599
3600         if (no_iommu || dmar_disabled)
3601                 return;
3602
3603         down_write(&dmar_global_lock);
3604
3605         /* Disable PMRs explicitly here. */
3606         for_each_iommu(iommu, drhd)
3607                 iommu_disable_protect_mem_regions(iommu);
3608
3609         /* Make sure the IOMMUs are switched off */
3610         intel_disable_iommus();
3611
3612         up_write(&dmar_global_lock);
3613 }
3614
3615 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3616 {
3617         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3618
3619         return container_of(iommu_dev, struct intel_iommu, iommu);
3620 }
3621
3622 static ssize_t version_show(struct device *dev,
3623                             struct device_attribute *attr, char *buf)
3624 {
3625         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3626         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3627         return sysfs_emit(buf, "%d:%d\n",
3628                           DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3629 }
3630 static DEVICE_ATTR_RO(version);
3631
3632 static ssize_t address_show(struct device *dev,
3633                             struct device_attribute *attr, char *buf)
3634 {
3635         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3636         return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3637 }
3638 static DEVICE_ATTR_RO(address);
3639
3640 static ssize_t cap_show(struct device *dev,
3641                         struct device_attribute *attr, char *buf)
3642 {
3643         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3644         return sysfs_emit(buf, "%llx\n", iommu->cap);
3645 }
3646 static DEVICE_ATTR_RO(cap);
3647
3648 static ssize_t ecap_show(struct device *dev,
3649                          struct device_attribute *attr, char *buf)
3650 {
3651         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3652         return sysfs_emit(buf, "%llx\n", iommu->ecap);
3653 }
3654 static DEVICE_ATTR_RO(ecap);
3655
3656 static ssize_t domains_supported_show(struct device *dev,
3657                                       struct device_attribute *attr, char *buf)
3658 {
3659         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3660         return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3661 }
3662 static DEVICE_ATTR_RO(domains_supported);
3663
3664 static ssize_t domains_used_show(struct device *dev,
3665                                  struct device_attribute *attr, char *buf)
3666 {
3667         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3668         return sysfs_emit(buf, "%d\n",
3669                           bitmap_weight(iommu->domain_ids,
3670                                         cap_ndoms(iommu->cap)));
3671 }
3672 static DEVICE_ATTR_RO(domains_used);
3673
3674 static struct attribute *intel_iommu_attrs[] = {
3675         &dev_attr_version.attr,
3676         &dev_attr_address.attr,
3677         &dev_attr_cap.attr,
3678         &dev_attr_ecap.attr,
3679         &dev_attr_domains_supported.attr,
3680         &dev_attr_domains_used.attr,
3681         NULL,
3682 };
3683
3684 static struct attribute_group intel_iommu_group = {
3685         .name = "intel-iommu",
3686         .attrs = intel_iommu_attrs,
3687 };
3688
3689 const struct attribute_group *intel_iommu_groups[] = {
3690         &intel_iommu_group,
3691         NULL,
3692 };
3693
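/* Return true if any PCI device in the system is marked external facing. */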
3694 static inline bool has_external_pci(void)
3695 {
3696         struct pci_dev *pdev = NULL;
3697
3698         for_each_pci_dev(pdev)
3699                 if (pdev->external_facing) {
3700                         pci_dev_put(pdev);
3701                         return true;
3702                 }
3703
3704         return false;
3705 }
3706
3707 static int __init platform_optin_force_iommu(void)
3708 {
3709         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3710                 return 0;
3711
3712         if (no_iommu || dmar_disabled)
3713                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3714
3715         /*
3716          * If Intel-IOMMU is disabled by default, we will apply identity
3717          * map for all devices except those marked as being untrusted.
3718          */
3719         if (dmar_disabled)
3720                 iommu_set_default_passthrough(false);
3721
3722         dmar_disabled = 0;
3723         no_iommu = 0;
3724
3725         return 1;
3726 }
3727
3728 static int __init probe_acpi_namespace_devices(void)
3729 {
3730         struct dmar_drhd_unit *drhd;
3731         /* To avoid a -Wunused-but-set-variable warning. */
3732         struct intel_iommu *iommu __maybe_unused;
3733         struct device *dev;
3734         int i, ret = 0;
3735
3736         for_each_active_iommu(iommu, drhd) {
3737                 for_each_active_dev_scope(drhd->devices,
3738                                           drhd->devices_cnt, i, dev) {
3739                         struct acpi_device_physical_node *pn;
3740                         struct acpi_device *adev;
3741
3742                         if (dev->bus != &acpi_bus_type)
3743                                 continue;
3744
3745                         adev = to_acpi_device(dev);
3746                         mutex_lock(&adev->physical_node_lock);
3747                         list_for_each_entry(pn,
3748                                             &adev->physical_node_list, node) {
3749                                 ret = iommu_probe_device(pn->dev);
3750                                 if (ret)
3751                                         break;
3752                         }
3753                         mutex_unlock(&adev->physical_node_lock);
3754
3755                         if (ret)
3756                                 return ret;
3757                 }
3758         }
3759
3760         return 0;
3761 }
3762
3763 static __init int tboot_force_iommu(void)
3764 {
3765         if (!tboot_enabled())
3766                 return 0;
3767
3768         if (no_iommu || dmar_disabled)
3769                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3770
3771         dmar_disabled = 0;
3772         no_iommu = 0;
3773
3774         return 1;
3775 }
3776
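/*
 * Main entry point for VT-d initialization: parse the DMAR tables, set up
 * every DMAR unit via init_dmars(), register the IOMMUs with the IOMMU
 * core and sysfs, and finally enable DMA translation.
 */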
3777 int __init intel_iommu_init(void)
3778 {
3779         int ret = -ENODEV;
3780         struct dmar_drhd_unit *drhd;
3781         struct intel_iommu *iommu;
3782
3783         /*
3784          * Intel IOMMU is required for a TXT/tboot launch or platform
3785          * opt in, so enforce that.
3786          */
3787         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3788                     platform_optin_force_iommu();
3789
3790         down_write(&dmar_global_lock);
3791         if (dmar_table_init()) {
3792                 if (force_on)
3793                         panic("tboot: Failed to initialize DMAR table\n");
3794                 goto out_free_dmar;
3795         }
3796
3797         if (dmar_dev_scope_init() < 0) {
3798                 if (force_on)
3799                         panic("tboot: Failed to initialize DMAR device scope\n");
3800                 goto out_free_dmar;
3801         }
3802
3803         up_write(&dmar_global_lock);
3804
3805         /*
3806          * The bus notifier takes the dmar_global_lock, so lockdep will
3807          * complain later when we register it under the lock.
3808          */
3809         dmar_register_bus_notifier();
3810
3811         down_write(&dmar_global_lock);
3812
3813         if (!no_iommu)
3814                 intel_iommu_debugfs_init();
3815
3816         if (no_iommu || dmar_disabled) {
3817                 /*
3818                  * We exit the function here to ensure IOMMU's remapping and
3819                  * mempool aren't set up, which means that the IOMMU's PMRs
3820                  * won't be disabled via the call to init_dmars(). So disable
3821                  * it explicitly here. The PMRs were setup by tboot prior to
3822                  * calling SENTER, but the kernel is expected to reset/tear
3823                  * down the PMRs.
3824                  */
3825                 if (intel_iommu_tboot_noforce) {
3826                         for_each_iommu(iommu, drhd)
3827                                 iommu_disable_protect_mem_regions(iommu);
3828                 }
3829
3830                 /*
3831                  * Make sure the IOMMUs are switched off, even when we
3832                  * boot into a kexec kernel and the previous kernel left
3833                  * them enabled
3834                  */
3835                 intel_disable_iommus();
3836                 goto out_free_dmar;
3837         }
3838
3839         if (list_empty(&dmar_rmrr_units))
3840                 pr_info("No RMRR found\n");
3841
3842         if (list_empty(&dmar_atsr_units))
3843                 pr_info("No ATSR found\n");
3844
3845         if (list_empty(&dmar_satc_units))
3846                 pr_info("No SATC found\n");
3847
3848         init_no_remapping_devices();
3849
3850         ret = init_dmars();
3851         if (ret) {
3852                 if (force_on)
3853                         panic("tboot: Failed to initialize DMARs\n");
3854                 pr_err("Initialization failed\n");
3855                 goto out_free_dmar;
3856         }
3857         up_write(&dmar_global_lock);
3858
3859         init_iommu_pm_ops();
3860
3861         down_read(&dmar_global_lock);
3862         for_each_active_iommu(iommu, drhd) {
3863                 /*
3864                  * The flush queue implementation does not perform
3865                  * page-selective invalidations that are required for efficient
3866                  * TLB flushes in virtual environments.  The benefit of batching
3867                  * is likely to be much lower than the overhead of synchronizing
3868                  * the virtual and physical IOMMU page-tables.
3869                  */
3870                 if (cap_caching_mode(iommu->cap) &&
3871                     !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3872                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
3873                         iommu_set_dma_strict();
3874                 }
3875                 iommu_device_sysfs_add(&iommu->iommu, NULL,
3876                                        intel_iommu_groups,
3877                                        "%s", iommu->name);
3878                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3879
3880                 iommu_pmu_register(iommu);
3881         }
3882         up_read(&dmar_global_lock);
3883
3884         if (si_domain && !hw_pass_through)
3885                 register_memory_notifier(&intel_iommu_memory_nb);
3886
3887         down_read(&dmar_global_lock);
3888         if (probe_acpi_namespace_devices())
3889                 pr_warn("ACPI name space devices didn't probe correctly\n");
3890
3891         /* Finally, we enable the DMA remapping hardware. */
3892         for_each_iommu(iommu, drhd) {
3893                 if (!drhd->ignored && !translation_pre_enabled(iommu))
3894                         iommu_enable_translation(iommu);
3895
3896                 iommu_disable_protect_mem_regions(iommu);
3897         }
3898         up_read(&dmar_global_lock);
3899
3900         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3901
3902         intel_iommu_enabled = 1;
3903
3904         return 0;
3905
3906 out_free_dmar:
3907         intel_iommu_free_dmars();
3908         up_write(&dmar_global_lock);
3909         return ret;
3910 }
3911
3912 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3913 {
3914         struct device_domain_info *info = opaque;
3915
3916         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3917         return 0;
3918 }
3919
3920 /*
3921  * NB - intel-iommu lacks any sort of reference counting for the users of
3922  * dependent devices.  If multiple endpoints have intersecting dependent
3923  * devices, unbinding the driver from any one of them will possibly leave
3924  * the others unable to operate.
3925  */
3926 static void domain_context_clear(struct device_domain_info *info)
3927 {
3928         if (!dev_is_pci(info->dev)) {
3929                 domain_context_clear_one(info, info->bus, info->devfn);
3930                 return;
3931         }
3930
3931         pci_for_each_dma_alias(to_pci_dev(info->dev),
3932                                &domain_context_clear_one_cb, info);
3933 }
3934
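/*
 * Detach @dev from its current domain: tear down its PASID entry (in
 * scalable mode) and context entries, unlink it from the domain's device
 * list and drop the domain's reference on the IOMMU.
 */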
3935 static void dmar_remove_one_dev_info(struct device *dev)
3936 {
3937         struct device_domain_info *info = dev_iommu_priv_get(dev);
3938         struct dmar_domain *domain = info->domain;
3939         struct intel_iommu *iommu = info->iommu;
3940         unsigned long flags;
3941
3942         if (!dev_is_real_dma_subdevice(info->dev)) {
3943                 if (dev_is_pci(info->dev) && sm_supported(iommu))
3944                         intel_pasid_tear_down_entry(iommu, info->dev,
3945                                         IOMMU_NO_PASID, false);
3946
3947                 iommu_disable_pci_caps(info);
3948                 domain_context_clear(info);
3949         }
3950
3951         spin_lock_irqsave(&domain->lock, flags);
3952         list_del(&info->link);
3953         spin_unlock_irqrestore(&domain->lock, flags);
3954
3955         domain_detach_iommu(domain, iommu);
3956         info->domain = NULL;
3957 }
3958
3959 /*
3960  * Clear the page table pointer in context or pasid table entries so that
3961  * all DMA requests without PASID from the device are blocked. If the page
3962  * table has been set, clean up the data structures.
3963  */
3964 static void device_block_translation(struct device *dev)
3965 {
3966         struct device_domain_info *info = dev_iommu_priv_get(dev);
3967         struct intel_iommu *iommu = info->iommu;
3968         unsigned long flags;
3969
3970         iommu_disable_pci_caps(info);
3971         if (!dev_is_real_dma_subdevice(dev)) {
3972                 if (sm_supported(iommu))
3973                         intel_pasid_tear_down_entry(iommu, dev,
3974                                                     IOMMU_NO_PASID, false);
3975                 else
3976                         domain_context_clear(info);
3977         }
3978
3979         if (!info->domain)
3980                 return;
3981
3982         spin_lock_irqsave(&info->domain->lock, flags);
3983         list_del(&info->link);
3984         spin_unlock_irqrestore(&info->domain->lock, flags);
3985
3986         domain_detach_iommu(info->domain, iommu);
3987         info->domain = NULL;
3988 }
3989
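/*
 * Initialize the address width, AGAW and top level page table of a
 * domain allocated through the IOMMU core (DMA or unmanaged type).
 */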
3990 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3991 {
3992         int adjust_width;
3993
3994         /* calculate AGAW */
3995         domain->gaw = guest_width;
3996         adjust_width = guestwidth_to_adjustwidth(guest_width);
3997         domain->agaw = width_to_agaw(adjust_width);
3998
3999         domain->iommu_coherency = false;
4000         domain->iommu_superpage = 0;
4001         domain->max_addr = 0;
4002
4003         /* always allocate the top pgd */
4004         domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
4005         if (!domain->pgd)
4006                 return -ENOMEM;
4007         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4008         return 0;
4009 }
4010
4011 static int blocking_domain_attach_dev(struct iommu_domain *domain,
4012                                       struct device *dev)
4013 {
4014         device_block_translation(dev);
4015         return 0;
4016 }
4017
4018 static struct iommu_domain blocking_domain = {
4019         .ops = &(const struct iommu_domain_ops) {
4020                 .attach_dev     = blocking_domain_attach_dev,
4021                 .free           = intel_iommu_domain_free
4022         }
4023 };
4024
4025 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4026 {
4027         struct dmar_domain *dmar_domain;
4028         struct iommu_domain *domain;
4029
4030         switch (type) {
4031         case IOMMU_DOMAIN_BLOCKED:
4032                 return &blocking_domain;
4033         case IOMMU_DOMAIN_DMA:
4034         case IOMMU_DOMAIN_UNMANAGED:
4035                 dmar_domain = alloc_domain(type);
4036                 if (!dmar_domain) {
4037                         pr_err("Can't allocate dmar_domain\n");
4038                         return NULL;
4039                 }
4040                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4041                         pr_err("Domain initialization failed\n");
4042                         domain_exit(dmar_domain);
4043                         return NULL;
4044                 }
4045
4046                 domain = &dmar_domain->domain;
4047                 domain->geometry.aperture_start = 0;
4048                 domain->geometry.aperture_end   =
4049                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4050                 domain->geometry.force_aperture = true;
4051
4052                 return domain;
4053         case IOMMU_DOMAIN_IDENTITY:
4054                 return &si_domain->domain;
4055         case IOMMU_DOMAIN_SVA:
4056                 return intel_svm_domain_alloc();
4057         default:
4058                 return NULL;
4059         }
4060
4061         return NULL;
4062 }
4063
4064 static void intel_iommu_domain_free(struct iommu_domain *domain)
4065 {
4066         if (domain != &si_domain->domain && domain != &blocking_domain)
4067                 domain_exit(to_dmar_domain(domain));
4068 }
4069
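/*
 * Validate that @dev's IOMMU can handle @domain and, if the domain was
 * built with more page-table levels than this IOMMU supports, strip the
 * unused top levels so both agree on the address width.
 */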
4070 static int prepare_domain_attach_device(struct iommu_domain *domain,
4071                                         struct device *dev)
4072 {
4073         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4074         struct intel_iommu *iommu;
4075         int addr_width;
4076
4077         iommu = device_to_iommu(dev, NULL, NULL);
4078         if (!iommu)
4079                 return -ENODEV;
4080
4081         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4082                 return -EINVAL;
4083
4084         /* check if this iommu agaw is sufficient for max mapped address */
4085         addr_width = agaw_to_width(iommu->agaw);
4086         if (addr_width > cap_mgaw(iommu->cap))
4087                 addr_width = cap_mgaw(iommu->cap);
4088
4089         if (dmar_domain->max_addr > (1LL << addr_width))
4090                 return -EINVAL;
4091         dmar_domain->gaw = addr_width;
4092
4093         /*
4094          * Knock out extra levels of page tables if necessary
4095          */
4096         while (iommu->agaw < dmar_domain->agaw) {
4097                 struct dma_pte *pte;
4098
4099                 pte = dmar_domain->pgd;
4100                 if (dma_pte_present(pte)) {
4101                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4102                         free_pgtable_page(pte);
4103                 }
4104                 dmar_domain->agaw--;
4105         }
4106
4107         return 0;
4108 }
4109
4110 static int intel_iommu_attach_device(struct iommu_domain *domain,
4111                                      struct device *dev)
4112 {
4113         struct device_domain_info *info = dev_iommu_priv_get(dev);
4114         int ret;
4115
4116         if (info->domain)
4117                 device_block_translation(dev);
4118
4119         ret = prepare_domain_attach_device(domain, dev);
4120         if (ret)
4121                 return ret;
4122
4123         return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4124 }
4125
4126 static int intel_iommu_map(struct iommu_domain *domain,
4127                            unsigned long iova, phys_addr_t hpa,
4128                            size_t size, int iommu_prot, gfp_t gfp)
4129 {
4130         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4131         u64 max_addr;
4132         int prot = 0;
4133
4134         if (iommu_prot & IOMMU_READ)
4135                 prot |= DMA_PTE_READ;
4136         if (iommu_prot & IOMMU_WRITE)
4137                 prot |= DMA_PTE_WRITE;
4138         if (dmar_domain->set_pte_snp)
4139                 prot |= DMA_PTE_SNP;
4140
4141         max_addr = iova + size;
4142         if (dmar_domain->max_addr < max_addr) {
4143                 u64 end;
4144
4145                 /* check if minimum agaw is sufficient for mapped address */
4146                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4147                 if (end < max_addr) {
4148                         pr_err("%s: iommu width (%d) is not "
4149                                "sufficient for the mapped address (%llx)\n",
4150                                __func__, dmar_domain->gaw, max_addr);
4151                         return -EFAULT;
4152                 }
4153                 dmar_domain->max_addr = max_addr;
4154         }
4155         /* Round up size to next multiple of PAGE_SIZE, if it and
4156            the low bits of hpa would take us onto the next page */
4157         size = aligned_nrpages(hpa, size);
4158         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4159                                 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4160 }
4161
4162 static int intel_iommu_map_pages(struct iommu_domain *domain,
4163                                  unsigned long iova, phys_addr_t paddr,
4164                                  size_t pgsize, size_t pgcount,
4165                                  int prot, gfp_t gfp, size_t *mapped)
4166 {
4167         unsigned long pgshift = __ffs(pgsize);
4168         size_t size = pgcount << pgshift;
4169         int ret;
4170
4171         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4172                 return -EINVAL;
4173
4174         if (!IS_ALIGNED(iova | paddr, pgsize))
4175                 return -EINVAL;
4176
4177         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4178         if (!ret && mapped)
4179                 *mapped = size;
4180
4181         return ret;
4182 }
4183
4184 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4185                                 unsigned long iova, size_t size,
4186                                 struct iommu_iotlb_gather *gather)
4187 {
4188         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4189         unsigned long start_pfn, last_pfn;
4190         int level = 0;
4191
4192         /* Cope with horrid API which requires us to unmap more than the
4193            size argument if it happens to be a large-page mapping. */
4194         if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4195                                      &level, GFP_ATOMIC)))
4196                 return 0;
4197
4198         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4199                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4200
4201         start_pfn = iova >> VTD_PAGE_SHIFT;
4202         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4203
4204         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4205
4206         if (dmar_domain->max_addr == iova + size)
4207                 dmar_domain->max_addr = iova;
4208
4209         /*
4210          * We do not use page-selective IOTLB invalidation in the flush queue,
4211          * so there is no need to track the pages and sync the IOTLB here.
4212          */
4213         if (!iommu_iotlb_gather_queued(gather))
4214                 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4215
4216         return size;
4217 }
4218
4219 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4220                                       unsigned long iova,
4221                                       size_t pgsize, size_t pgcount,
4222                                       struct iommu_iotlb_gather *gather)
4223 {
4224         unsigned long pgshift = __ffs(pgsize);
4225         size_t size = pgcount << pgshift;
4226
4227         return intel_iommu_unmap(domain, iova, size, gather);
4228 }
4229
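/*
 * Flush the IOTLB of every IOMMU the domain is attached to for the range
 * collected in @gather, then release the page-table pages queued for
 * freeing.
 */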
4230 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4231                                  struct iommu_iotlb_gather *gather)
4232 {
4233         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4234         unsigned long iova_pfn = IOVA_PFN(gather->start);
4235         size_t size = gather->end - gather->start;
4236         struct iommu_domain_info *info;
4237         unsigned long start_pfn;
4238         unsigned long nrpages;
4239         unsigned long i;
4240
4241         nrpages = aligned_nrpages(gather->start, size);
4242         start_pfn = mm_to_dma_pfn_start(iova_pfn);
4243
4244         xa_for_each(&dmar_domain->iommu_array, i, info)
4245                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4246                                       start_pfn, nrpages,
4247                                       list_empty(&gather->freelist), 0);
4248
4249         put_pages_list(&gather->freelist);
4250 }
4251
4252 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4253                                             dma_addr_t iova)
4254 {
4255         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4256         struct dma_pte *pte;
4257         int level = 0;
4258         u64 phys = 0;
4259
4260         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4261                              GFP_ATOMIC);
4262         if (pte && dma_pte_present(pte))
4263                 phys = dma_pte_addr(pte) +
4264                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4265                                                 VTD_PAGE_SHIFT) - 1));
4266
4267         return phys;
4268 }
4269
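/*
 * Check whether every IOMMU with a device attached to this domain
 * supports snoop control; force snooping can only be enforced if they
 * all do. Caller must hold the domain lock.
 */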
4270 static bool domain_support_force_snooping(struct dmar_domain *domain)
4271 {
4272         struct device_domain_info *info;
4273         bool support = true;
4274
4275         assert_spin_locked(&domain->lock);
4276         list_for_each_entry(info, &domain->devices, link) {
4277                 if (!ecap_sc_support(info->iommu->ecap)) {
4278                         support = false;
4279                         break;
4280                 }
4281         }
4282
4283         return support;
4284 }
4285
4286 static void domain_set_force_snooping(struct dmar_domain *domain)
4287 {
4288         struct device_domain_info *info;
4289
4290         assert_spin_locked(&domain->lock);
4291         /*
4292          * Second level page table supports per-PTE snoop control. The
4293          * iommu_map() interface will handle this by setting SNP bit.
4294          */
4295         if (!domain->use_first_level) {
4296                 domain->set_pte_snp = true;
4297                 return;
4298         }
4299
4300         list_for_each_entry(info, &domain->devices, link)
4301                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4302                                                      IOMMU_NO_PASID);
4303 }
4304
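/*
 * Enforce cache coherency (force snooping) for the domain. This only
 * succeeds if every attached IOMMU supports snoop control and, for
 * second-level domains, if nothing has been mapped yet, because the SNP
 * bit is only applied to second-level PTEs at map time.
 */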
4305 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4306 {
4307         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4308         unsigned long flags;
4309
4310         if (dmar_domain->force_snooping)
4311                 return true;
4312
4313         spin_lock_irqsave(&dmar_domain->lock, flags);
4314         if (!domain_support_force_snooping(dmar_domain) ||
4315             (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4316                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4317                 return false;
4318         }
4319
4320         domain_set_force_snooping(dmar_domain);
4321         dmar_domain->force_snooping = true;
4322         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4323
4324         return true;
4325 }
4326
4327 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4328 {
4329         struct device_domain_info *info = dev_iommu_priv_get(dev);
4330
4331         switch (cap) {
4332         case IOMMU_CAP_CACHE_COHERENCY:
4333         case IOMMU_CAP_DEFERRED_FLUSH:
4334                 return true;
4335         case IOMMU_CAP_PRE_BOOT_PROTECTION:
4336                 return dmar_platform_optin();
4337         case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4338                 return ecap_sc_support(info->iommu->ecap);
4339         default:
4340                 return false;
4341         }
4342 }
4343
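/*
 * Per-device probe hook: allocate and populate the device_domain_info
 * (source-id, ATS/PRI/PASID capabilities) and, in scalable mode, the
 * device's PASID table.
 */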
4344 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4345 {
4346         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4347         struct device_domain_info *info;
4348         struct intel_iommu *iommu;
4349         u8 bus, devfn;
4350         int ret;
4351
4352         iommu = device_to_iommu(dev, &bus, &devfn);
4353         if (!iommu || !iommu->iommu.ops)
4354                 return ERR_PTR(-ENODEV);
4355
4356         info = kzalloc(sizeof(*info), GFP_KERNEL);
4357         if (!info)
4358                 return ERR_PTR(-ENOMEM);
4359
4360         if (dev_is_real_dma_subdevice(dev)) {
4361                 info->bus = pdev->bus->number;
4362                 info->devfn = pdev->devfn;
4363                 info->segment = pci_domain_nr(pdev->bus);
4364         } else {
4365                 info->bus = bus;
4366                 info->devfn = devfn;
4367                 info->segment = iommu->segment;
4368         }
4369
4370         info->dev = dev;
4371         info->iommu = iommu;
4372         if (dev_is_pci(dev)) {
4373                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4374                     pci_ats_supported(pdev) &&
4375                     dmar_ats_supported(pdev, iommu)) {
4376                         info->ats_supported = 1;
4377                         info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4378
4379                         /*
4380                          * For an IOMMU that supports device IOTLB throttling
4381                          * (DIT), we assign a PFSID to the invalidation
4382                          * descriptors of a VF so that the IOMMU HW can gauge
4383                          * queue depth at the PF level. If DIT is not
4384                          * supported, the PFSID field is reserved and must be 0.
4385                          */
4386                         if (ecap_dit(iommu->ecap))
4387                                 info->pfsid = pci_dev_id(pci_physfn(pdev));
4388                         info->ats_qdep = pci_ats_queue_depth(pdev);
4389                 }
4390                 if (sm_supported(iommu)) {
4391                         if (pasid_supported(iommu)) {
4392                                 int features = pci_pasid_features(pdev);
4393
4394                                 if (features >= 0)
4395                                         info->pasid_supported = features | 1;
4396                         }
4397
4398                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4399                             pci_pri_supported(pdev))
4400                                 info->pri_supported = 1;
4401                 }
4402         }
4403
4404         dev_iommu_priv_set(dev, info);
4405
4406         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4407                 ret = intel_pasid_alloc_table(dev);
4408                 if (ret) {
4409                         dev_err(dev, "PASID table allocation failed\n");
4410                         dev_iommu_priv_set(dev, NULL);
4411                         kfree(info);
4412                         return ERR_PTR(ret);
4413                 }
4414         }
4415
4416         return &iommu->iommu;
4417 }
4418
4419 static void intel_iommu_release_device(struct device *dev)
4420 {
4421         struct device_domain_info *info = dev_iommu_priv_get(dev);
4422
4423         dmar_remove_one_dev_info(dev);
4424         intel_pasid_free_table(dev);
4425         dev_iommu_priv_set(dev, NULL);
4426         kfree(info);
4427         set_dma_ops(dev, NULL);
4428 }
4429
4430 static void intel_iommu_probe_finalize(struct device *dev)
4431 {
4432         set_dma_ops(dev, NULL);
4433         iommu_setup_dma_ops(dev, 0, U64_MAX);
4434 }
4435
4436 static void intel_iommu_get_resv_regions(struct device *device,
4437                                          struct list_head *head)
4438 {
4439         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4440         struct iommu_resv_region *reg;
4441         struct dmar_rmrr_unit *rmrr;
4442         struct device *i_dev;
4443         int i;
4444
4445         rcu_read_lock();
4446         for_each_rmrr_units(rmrr) {
4447                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4448                                           i, i_dev) {
4449                         struct iommu_resv_region *resv;
4450                         enum iommu_resv_type type;
4451                         size_t length;
4452
4453                         if (i_dev != device &&
4454                             !is_downstream_to_pci_bridge(device, i_dev))
4455                                 continue;
4456
4457                         length = rmrr->end_address - rmrr->base_address + 1;
4458
4459                         type = device_rmrr_is_relaxable(device) ?
4460                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4461
4462                         resv = iommu_alloc_resv_region(rmrr->base_address,
4463                                                        length, prot, type,
4464                                                        GFP_ATOMIC);
4465                         if (!resv)
4466                                 break;
4467
4468                         list_add_tail(&resv->list, head);
4469                 }
4470         }
4471         rcu_read_unlock();
4472
4473 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4474         if (dev_is_pci(device)) {
4475                 struct pci_dev *pdev = to_pci_dev(device);
4476
4477                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4478                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4479                                         IOMMU_RESV_DIRECT_RELAXABLE,
4480                                         GFP_KERNEL);
4481                         if (reg)
4482                                 list_add_tail(&reg->list, head);
4483                 }
4484         }
4485 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4486
4487         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4488                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4489                                       0, IOMMU_RESV_MSI, GFP_KERNEL);
4490         if (!reg)
4491                 return;
4492         list_add_tail(&reg->list, head);
4493 }
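
/*
 * Editor's note, illustrative only (not part of the driver): this callback
 * is reached through the generic iommu core helpers.  Assuming those
 * helpers, a caller could walk the reserved regions like this:
 *
 *	LIST_HEAD(resv_regions);
 *	struct iommu_resv_region *region;
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	list_for_each_entry(region, &resv_regions, list)
 *		pr_info("resv [%pa + 0x%zx] type %d\n",
 *			&region->start, region->length, region->type);
 *	iommu_put_resv_regions(dev, &resv_regions);
 */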
4494
4495 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4496 {
4497         if (dev_is_pci(dev))
4498                 return pci_device_group(dev);
4499         return generic_device_group(dev);
4500 }
4501
4502 static int intel_iommu_enable_sva(struct device *dev)
4503 {
4504         struct device_domain_info *info = dev_iommu_priv_get(dev);
4505         struct intel_iommu *iommu;
4506
4507         if (!info || dmar_disabled)
4508                 return -EINVAL;
4509
4510         iommu = info->iommu;
4511         if (!iommu)
4512                 return -EINVAL;
4513
4514         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4515                 return -ENODEV;
4516
4517         if (!info->pasid_enabled || !info->ats_enabled)
4518                 return -EINVAL;
4519
4520         /*
4521          * Devices with device-specific I/O fault handling should not
4522          * support PCI/PRI. The IOMMU side has no means to check the
4523          * capability of device-specific IOPF.  Therefore, the IOMMU can only
4524          * assume that if the device driver enables SVA on a non-PRI
4525          * device, it will handle IOPF in its own way.
4526          */
4527         if (!info->pri_supported)
4528                 return 0;
4529
4530         /* Devices supporting PRI should have it enabled. */
4531         if (!info->pri_enabled)
4532                 return -EINVAL;
4533
4534         return 0;
4535 }
4536
4537 static int intel_iommu_enable_iopf(struct device *dev)
4538 {
4539         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4540         struct device_domain_info *info = dev_iommu_priv_get(dev);
4541         struct intel_iommu *iommu;
4542         int ret;
4543
4544         if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4545                 return -ENODEV;
4546
4547         if (info->pri_enabled)
4548                 return -EBUSY;
4549
4550         iommu = info->iommu;
4551         if (!iommu)
4552                 return -EINVAL;
4553
4554         /* PASID is required in PRG Response Message. */
4555         if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4556                 return -EINVAL;
4557
4558         ret = pci_reset_pri(pdev);
4559         if (ret)
4560                 return ret;
4561
4562         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4563         if (ret)
4564                 return ret;
4565
4566         ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4567         if (ret)
4568                 goto iopf_remove_device;
4569
4570         ret = pci_enable_pri(pdev, PRQ_DEPTH);
4571         if (ret)
4572                 goto iopf_unregister_handler;
4573         info->pri_enabled = 1;
4574
4575         return 0;
4576
4577 iopf_unregister_handler:
4578         iommu_unregister_device_fault_handler(dev);
4579 iopf_remove_device:
4580         iopf_queue_remove_device(iommu->iopf_queue, dev);
4581
4582         return ret;
4583 }
4584
4585 static int intel_iommu_disable_iopf(struct device *dev)
4586 {
4587         struct device_domain_info *info = dev_iommu_priv_get(dev);
4588         struct intel_iommu *iommu = info->iommu;
4589
4590         if (!info->pri_enabled)
4591                 return -EINVAL;
4592
4593         /*
4594          * PCIe spec states that by clearing PRI enable bit, the Page
4595          * Request Interface will not issue new page requests, but may still
4596          * have outstanding page requests that have been transmitted or are
4597          * queued for transmission. This is supposed to be called after
4598          * the device driver has stopped DMA, all PASIDs have been
4599          * unbound and the outstanding PRQs have been drained.
4600          */
4601         pci_disable_pri(to_pci_dev(dev));
4602         info->pri_enabled = 0;
4603
4604         /*
4605          * With PRI disabled and outstanding PRQs drained, unregistering
4606          * fault handler and removing device from iopf queue should never
4607          * fail.
4608          */
4609         WARN_ON(iommu_unregister_device_fault_handler(dev));
4610         WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4611
4612         return 0;
4613 }
4614
4615 static int
4616 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4617 {
4618         switch (feat) {
4619         case IOMMU_DEV_FEAT_IOPF:
4620                 return intel_iommu_enable_iopf(dev);
4621
4622         case IOMMU_DEV_FEAT_SVA:
4623                 return intel_iommu_enable_sva(dev);
4624
4625         default:
4626                 return -ENODEV;
4627         }
4628 }
4629
4630 static int
4631 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4632 {
4633         switch (feat) {
4634         case IOMMU_DEV_FEAT_IOPF:
4635                 return intel_iommu_disable_iopf(dev);
4636
4637         case IOMMU_DEV_FEAT_SVA:
4638                 return 0;
4639
4640         default:
4641                 return -ENODEV;
4642         }
4643 }
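
/*
 * Editor's note, illustrative only (not part of the driver): device drivers
 * reach the two callbacks above through the generic feature API.  For a
 * PRI-capable device the assumed ordering is IOPF before SVA, torn down in
 * reverse; a device with device-specific I/O fault handling would enable
 * only the SVA feature.
 *
 *	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *	if (!ret)
 *		ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	...
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 */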
4644
4645 static bool intel_iommu_is_attach_deferred(struct device *dev)
4646 {
4647         struct device_domain_info *info = dev_iommu_priv_get(dev);
4648
4649         return translation_pre_enabled(info->iommu) && !info->domain;
4650 }
4651
4652 /*
4653  * Check that the device does not live on an external-facing PCI port that is
4654  * marked as untrusted. Such devices should not be able to apply quirks and
4655  * thus not be able to bypass the IOMMU restrictions.
4656  */
4657 static bool risky_device(struct pci_dev *pdev)
4658 {
4659         if (pdev->untrusted) {
4660                 pci_info(pdev,
4661                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4662                          pdev->vendor, pdev->device);
4663                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4664                 return true;
4665         }
4666         return false;
4667 }
4668
4669 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4670                                        unsigned long iova, size_t size)
4671 {
4672         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4673         unsigned long pages = aligned_nrpages(iova, size);
4674         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4675         struct iommu_domain_info *info;
4676         unsigned long i;
4677
4678         xa_for_each(&dmar_domain->iommu_array, i, info)
4679                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4680 }
4681
4682 static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4683 {
4684         struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4685         struct dev_pasid_info *curr, *dev_pasid = NULL;
4686         struct dmar_domain *dmar_domain;
4687         struct iommu_domain *domain;
4688         unsigned long flags;
4689
4690         domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4691         if (WARN_ON_ONCE(!domain))
4692                 goto out_tear_down;
4693
4694         /*
4695          * The SVA implementation needs to handle its own things such as the mm
4696          * notification. Before consolidating that code into iommu core, let
4697          * the intel sva code handle it.
4698          */
4699         if (domain->type == IOMMU_DOMAIN_SVA) {
4700                 intel_svm_remove_dev_pasid(dev, pasid);
4701                 goto out_tear_down;
4702         }
4703
4704         dmar_domain = to_dmar_domain(domain);
4705         spin_lock_irqsave(&dmar_domain->lock, flags);
4706         list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4707                 if (curr->dev == dev && curr->pasid == pasid) {
4708                         list_del(&curr->link_domain);
4709                         dev_pasid = curr;
4710                         break;
4711                 }
4712         }
4713         WARN_ON_ONCE(!dev_pasid);
4714         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4715
4716         domain_detach_iommu(dmar_domain, iommu);
4717         kfree(dev_pasid);
4718 out_tear_down:
4719         intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4720         intel_drain_pasid_prq(dev, pasid);
4721 }
4722
4723 static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4724                                      struct device *dev, ioasid_t pasid)
4725 {
4726         struct device_domain_info *info = dev_iommu_priv_get(dev);
4727         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4728         struct intel_iommu *iommu = info->iommu;
4729         struct dev_pasid_info *dev_pasid;
4730         unsigned long flags;
4731         int ret;
4732
4733         if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4734                 return -EOPNOTSUPP;
4735
4736         if (context_copied(iommu, info->bus, info->devfn))
4737                 return -EBUSY;
4738
4739         ret = prepare_domain_attach_device(domain, dev);
4740         if (ret)
4741                 return ret;
4742
4743         dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4744         if (!dev_pasid)
4745                 return -ENOMEM;
4746
4747         ret = domain_attach_iommu(dmar_domain, iommu);
4748         if (ret)
4749                 goto out_free;
4750
4751         if (domain_type_is_si(dmar_domain))
4752                 ret = intel_pasid_setup_pass_through(iommu, dmar_domain,
4753                                                      dev, pasid);
4754         else if (dmar_domain->use_first_level)
4755                 ret = domain_setup_first_level(iommu, dmar_domain,
4756                                                dev, pasid);
4757         else
4758                 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4759                                                      dev, pasid);
4760         if (ret)
4761                 goto out_detach_iommu;
4762
4763         dev_pasid->dev = dev;
4764         dev_pasid->pasid = pasid;
4765         spin_lock_irqsave(&dmar_domain->lock, flags);
4766         list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4767         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4768
4769         return 0;
4770 out_detach_iommu:
4771         domain_detach_iommu(dmar_domain, iommu);
4772 out_free:
4773         kfree(dev_pasid);
4774         return ret;
4775 }
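
/*
 * Editor's note, illustrative only (not part of the driver): the
 * set_dev_pasid and remove_dev_pasid callbacks above are normally driven
 * by the iommu core PASID attach API.  Assuming those generic helpers, the
 * expected call pattern is roughly:
 *
 *	ret = iommu_attach_device_pasid(domain, dev, pasid);
 *	if (ret)
 *		return ret;
 *	...
 *	iommu_detach_device_pasid(domain, dev, pasid);
 */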
4776
4777 static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4778 {
4779         struct device_domain_info *info = dev_iommu_priv_get(dev);
4780         struct intel_iommu *iommu = info->iommu;
4781         struct iommu_hw_info_vtd *vtd;
4782
4783         vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4784         if (!vtd)
4785                 return ERR_PTR(-ENOMEM);
4786
4787         vtd->cap_reg = iommu->cap;
4788         vtd->ecap_reg = iommu->ecap;
4789         *length = sizeof(*vtd);
4790         *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4791         return vtd;
4792 }
4793
4794 const struct iommu_ops intel_iommu_ops = {
4795         .capable                = intel_iommu_capable,
4796         .hw_info                = intel_iommu_hw_info,
4797         .domain_alloc           = intel_iommu_domain_alloc,
4798         .probe_device           = intel_iommu_probe_device,
4799         .probe_finalize         = intel_iommu_probe_finalize,
4800         .release_device         = intel_iommu_release_device,
4801         .get_resv_regions       = intel_iommu_get_resv_regions,
4802         .device_group           = intel_iommu_device_group,
4803         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4804         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4805         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4806         .def_domain_type        = device_def_domain_type,
4807         .remove_dev_pasid       = intel_iommu_remove_dev_pasid,
4808         .pgsize_bitmap          = SZ_4K,
4809 #ifdef CONFIG_INTEL_IOMMU_SVM
4810         .page_response          = intel_svm_page_response,
4811 #endif
4812         .default_domain_ops = &(const struct iommu_domain_ops) {
4813                 .attach_dev             = intel_iommu_attach_device,
4814                 .set_dev_pasid          = intel_iommu_set_dev_pasid,
4815                 .map_pages              = intel_iommu_map_pages,
4816                 .unmap_pages            = intel_iommu_unmap_pages,
4817                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4818                 .flush_iotlb_all        = intel_flush_iotlb_all,
4819                 .iotlb_sync             = intel_iommu_tlb_sync,
4820                 .iova_to_phys           = intel_iommu_iova_to_phys,
4821                 .free                   = intel_iommu_domain_free,
4822                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4823         }
4824 };
4825
4826 static void quirk_iommu_igfx(struct pci_dev *dev)
4827 {
4828         if (risky_device(dev))
4829                 return;
4830
4831         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4832         dmar_map_gfx = 0;
4833 }
4834
4835 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4836 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4837 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4838 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4843
4844 /* Broadwell igfx malfunctions with dmar */
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4846 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4847 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4848 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4849 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4850 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4851 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4852 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4857 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4858 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4859 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4860 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4861 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4866 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4867 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4868 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4869
4870 static void quirk_iommu_rwbf(struct pci_dev *dev)
4871 {
4872         if (risky_device(dev))
4873                 return;
4874
4875         /*
4876          * Mobile 4 Series Chipset neglects to set RWBF capability,
4877          * but needs it. Same seems to hold for the desktop versions.
4878          */
4879         pci_info(dev, "Forcing write-buffer flush capability\n");
4880         rwbf_quirk = 1;
4881 }
4882
4883 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4884 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4885 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4887 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4888 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4889 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4890
4891 #define GGC 0x52
4892 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4893 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4894 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4895 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4896 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4897 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4898 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4899 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4900
4901 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4902 {
4903         unsigned short ggc;
4904
4905         if (risky_device(dev))
4906                 return;
4907
4908         if (pci_read_config_word(dev, GGC, &ggc))
4909                 return;
4910
4911         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4912                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4913                 dmar_map_gfx = 0;
4914         } else if (dmar_map_gfx) {
4915                 /* we have to ensure the gfx device is idle before we flush */
4916                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4917                 iommu_set_dma_strict();
4918         }
4919 }
4920 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4921 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4922 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4923 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
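
/*
 * Editor's note, illustrative only (not part of the driver): the GGC
 * defines above decode the stolen-memory size field in bits 8-11 of the
 * Graphics Control register.  Assuming a raw value read with
 * pci_read_config_word(dev, GGC, &ggc), the field can be compared against
 * the masks directly, e.g.:
 *
 *	if ((ggc & GGC_MEMORY_SIZE_MASK) == GGC_MEMORY_SIZE_2M_VT)
 *		// 2M encoding with VT enabled, per the macro naming above
 *		...
 */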
4924
4925 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4926 {
4927         unsigned short ver;
4928
4929         if (!IS_GFX_DEVICE(dev))
4930                 return;
4931
4932         ver = (dev->device >> 8) & 0xff;
4933         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4934             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4935             ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4936                 return;
4937
4938         if (risky_device(dev))
4939                 return;
4940
4941         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4942         iommu_skip_te_disable = 1;
4943 }
4944 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4945
4946 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4947    ISOCH DMAR unit for the Azalia sound device, but not give it any
4948    TLB entries, which causes it to deadlock. Check for that.  We do
4949    this in a function called from init_dmars(), instead of in a PCI
4950    quirk, because we don't want to print the obnoxious "BIOS broken"
4951    message if VT-d is actually disabled.
4952 */
4953 static void __init check_tylersburg_isoch(void)
4954 {
4955         struct pci_dev *pdev;
4956         uint32_t vtisochctrl;
4957
4958         /* If there's no Azalia in the system anyway, forget it. */
4959         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4960         if (!pdev)
4961                 return;
4962
4963         if (risky_device(pdev)) {
4964                 pci_dev_put(pdev);
4965                 return;
4966         }
4967
4968         pci_dev_put(pdev);
4969
4970         /* System Management Registers. Might be hidden, in which case
4971            we can't do the sanity check. But that's OK, because the
4972            known-broken BIOSes _don't_ actually hide it, so far. */
4973         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4974         if (!pdev)
4975                 return;
4976
4977         if (risky_device(pdev)) {
4978                 pci_dev_put(pdev);
4979                 return;
4980         }
4981
4982         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4983                 pci_dev_put(pdev);
4984                 return;
4985         }
4986
4987         pci_dev_put(pdev);
4988
4989         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4990         if (vtisochctrl & 1)
4991                 return;
4992
4993         /* Drop all bits other than the number of TLB entries */
4994         vtisochctrl &= 0x1c;
4995
4996         /* If we have the recommended number of TLB entries (16), fine. */
4997         if (vtisochctrl == 0x10)
4998                 return;
4999
5000         /* Zero TLB entries? You get to ride the short bus to school. */
5001         if (!vtisochctrl) {
5002                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5003                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5004                      dmi_get_system_info(DMI_BIOS_VENDOR),
5005                      dmi_get_system_info(DMI_BIOS_VERSION),
5006                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5007                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5008                 return;
5009         }
5010
5011         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5012                vtisochctrl);
5013 }
5014
5015 /*
5016  * Here we deal with a device TLB defect where a device may inadvertently issue ATS
5017  * invalidation completion before posted writes initiated with translated address
5018  * that utilized translations matching the invalidation address range, violating
5019  * the invalidation completion ordering.
5020  * Therefore, any use cases that cannot guarantee DMA is stopped before unmap are
5021  * vulnerable to this defect. In other words, any dTLB invalidation initiated not
5022  * under the control of the trusted/privileged host device driver must use this
5023  * quirk.
5024  * Device TLBs are invalidated under the following six conditions:
5025  * 1. Device driver does DMA API unmap IOVA
5026  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
5027  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
5028  *    exit_mmap() due to crash
5029  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
5030  *    VM has to free pages that were unmapped
5031  * 5. Userspace driver unmaps a DMA buffer
5032  * 6. Cache invalidation in vSVA usage (upcoming)
5033  *
5034  * For #1 and #2, device drivers are responsible for stopping DMA traffic
5035  * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5036  * invalidate TLB the same way as normal user unmap which will use this quirk.
5037  * The dTLB invalidation after PASID cache flush does not need this quirk.
5038  *
5039  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5040  */
5041 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5042                                unsigned long address, unsigned long mask,
5043                                u32 pasid, u16 qdep)
5044 {
5045         u16 sid;
5046
5047         if (likely(!info->dtlb_extra_inval))
5048                 return;
5049
5050         sid = PCI_DEVID(info->bus, info->devfn);
5051         if (pasid == IOMMU_NO_PASID) {
5052                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5053                                    qdep, address, mask);
5054         } else {
5055                 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5056                                          pasid, qdep, address, mask);
5057         }
5058 }
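
/*
 * Editor's note, illustrative only (not part of the driver): callers in the
 * invalidation paths pass the same (address, mask) pair they hand to
 * qi_flush_dev_iotlb(), where mask is assumed to be the size order of the
 * range in pages.  Under that assumption, flushing a single 4KB IOVA for a
 * device without a PASID would look roughly like:
 *
 *	quirk_extra_dev_tlb_flush(info, iova, 0, IOMMU_NO_PASID,
 *				  info->ats_qdep);
 */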
5059
5060 #define ecmd_get_status_code(res)       (((res) & 0xff) >> 1)
5061
5062 /*
5063  * Function to submit a command to the enhanced command interface. The
5064  * valid enhanced command descriptions are defined in Table 47 of the
5065  * VT-d spec. The VT-d hardware implementation may support some but not
5066  * all commands, which can be determined by checking the Enhanced
5067  * Command Capability Register.
5068  *
5069  * Return values:
5070  *  - 0: Command successful without any error;
5071  *  - Negative: software error value;
5072  *  - Nonzero positive: failure status code defined in Table 48.
5073  */
5074 int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5075 {
5076         unsigned long flags;
5077         u64 res;
5078         int ret;
5079
5080         if (!cap_ecmds(iommu->cap))
5081                 return -ENODEV;
5082
5083         raw_spin_lock_irqsave(&iommu->register_lock, flags);
5084
5085         res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5086         if (res & DMA_ECMD_ECRSP_IP) {
5087                 ret = -EBUSY;
5088                 goto err;
5089         }
5090
5091         /*
5092          * Unconditionally write operand B, because:
5093          * - There is no side effect if an ecmd doesn't require an
5094          *   operand B; we simply set the register to some value.
5095          * - This is not invoked in any critical path, so the extra MMIO
5096          *   write doesn't raise any performance concerns.
5097          */
5098         dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5099         dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5100
5101         IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5102                       !(res & DMA_ECMD_ECRSP_IP), res);
5103
5104         if (res & DMA_ECMD_ECRSP_IP) {
5105                 ret = -ETIMEDOUT;
5106                 goto err;
5107         }
5108
5109         ret = ecmd_get_status_code(res);
5110 err:
5111         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5112
5113         return ret;
5114 }
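
/*
 * Editor's note, illustrative only (not part of the driver): per the return
 * convention documented above, a caller is expected to separate software
 * errors from hardware status codes.  ECMD_EXAMPLE_OP is a placeholder
 * command number, not a real VT-d command definition.
 *
 *	ret = ecmd_submit_sync(iommu, ECMD_EXAMPLE_OP, 0, 0);
 *	if (ret < 0)
 *		return ret;		// software error (-ENODEV, -EBUSY, ...)
 *	if (ret > 0)
 *		return -EIO;		// failure status code from Table 48
 */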