iommu/vt-d: Fix possible recursive locking in intel_iommu_init()
[platform/kernel/linux-starfive.git] drivers/iommu/intel/iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-svm.h>
21 #include <linux/memory.h>
22 #include <linux/pci.h>
23 #include <linux/pci-ats.h>
24 #include <linux/spinlock.h>
25 #include <linux/syscore_ops.h>
26 #include <linux/tboot.h>
27
28 #include "iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
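/*
 * Worked example: width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2,
 * agaw_to_level(2) = 4 (a 4-level page table) and agaw_to_width(2) = 48;
 * a 39-bit width gives agaw 1 and a 3-level table.
 */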
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
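/*
 * Worked example: level_to_offset_bits(2) = 9, so level_size(2) and
 * lvl_to_nr_pages(2) are both 512, i.e. a level-2 (2MiB) superpage
 * spans 512 4KiB VT-d pages; at level 3 that becomes 2^18 pages (1GiB).
 */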
113
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
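/*
 * On x86, PAGE_SHIFT and VTD_PAGE_SHIFT are both 12, so these
 * conversions are identity operations; the helpers merely keep the
 * MM-pfn vs. DMA-pfn distinction explicit.
 */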
128
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131
132 /*
133  * Set to 1 to panic the kernel if VT-d can't be successfully enabled
134  * (used when the kernel is launched with TXT).
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148         if (!(re->lo & 1))
149                 return 0;
150
151         return re->lo & VTD_PAGE_MASK;
152 }
153
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160         if (!(re->hi & 1))
161                 return 0;
162
163         return re->hi & VTD_PAGE_MASK;
164 }
165
166 static inline void context_set_present(struct context_entry *context)
167 {
168         context->lo |= 1;
169 }
170
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173         context->lo &= (((u64)-1) << 2) | 1;
174 }
175
176 static inline void context_set_translation_type(struct context_entry *context,
177                                                 unsigned long value)
178 {
179         context->lo &= (((u64)-1) << 4) | 3;
180         context->lo |= (value & 3) << 2;
181 }
182
183 static inline void context_set_address_root(struct context_entry *context,
184                                             unsigned long value)
185 {
186         context->lo &= ~VTD_PAGE_MASK;
187         context->lo |= value & VTD_PAGE_MASK;
188 }
189
190 static inline void context_set_address_width(struct context_entry *context,
191                                              unsigned long value)
192 {
193         context->hi |= value & 7;
194 }
195
196 static inline void context_set_domain_id(struct context_entry *context,
197                                          unsigned long value)
198 {
199         context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201
202 static inline int context_domain_id(struct context_entry *c)
203 {
204         return((c->hi >> 8) & 0xffff);
205 }
206
207 static inline void context_clear_entry(struct context_entry *context)
208 {
209         context->lo = 0;
210         context->hi = 0;
211 }
212
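/*
 * The per-iommu copied_tables bitmap has one bit per 16-bit source-id
 * ((bus << 8) | devfn) and marks context entries inherited from a
 * previous kernel (e.g. across kexec/kdump) that have not been
 * replaced yet.
 */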
213 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
214 {
215         if (!iommu->copied_tables)
216                 return false;
217
218         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
219 }
220
221 static inline void
222 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
223 {
224         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
225 }
226
227 static inline void
228 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
229 {
230         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
231 }
232
233 /*
234  * This domain is a static identity mapping domain.
235  *      1. This domain creates a static 1:1 mapping to all usable memory.
236  *      2. It maps to each iommu if successful.
237  *      3. Each iommu maps to this domain if successful.
238  */
239 static struct dmar_domain *si_domain;
240 static int hw_pass_through = 1;
241
242 struct dmar_rmrr_unit {
243         struct list_head list;          /* list of rmrr units   */
244         struct acpi_dmar_header *hdr;   /* ACPI header          */
245         u64     base_address;           /* reserved base address*/
246         u64     end_address;            /* reserved end address */
247         struct dmar_dev_scope *devices; /* target devices */
248         int     devices_cnt;            /* target device count */
249 };
250
251 struct dmar_atsr_unit {
252         struct list_head list;          /* list of ATSR units */
253         struct acpi_dmar_header *hdr;   /* ACPI header */
254         struct dmar_dev_scope *devices; /* target devices */
255         int devices_cnt;                /* target device count */
256         u8 include_all:1;               /* include all ports */
257 };
258
259 struct dmar_satc_unit {
260         struct list_head list;          /* list of SATC units */
261         struct acpi_dmar_header *hdr;   /* ACPI header */
262         struct dmar_dev_scope *devices; /* target devices */
263         struct intel_iommu *iommu;      /* the corresponding iommu */
264         int devices_cnt;                /* target device count */
265         u8 atc_required:1;              /* ATS is required */
266 };
267
268 static LIST_HEAD(dmar_atsr_units);
269 static LIST_HEAD(dmar_rmrr_units);
270 static LIST_HEAD(dmar_satc_units);
271
272 #define for_each_rmrr_units(rmrr) \
273         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
274
275 static void dmar_remove_one_dev_info(struct device *dev);
276
277 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
278 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
279
280 int intel_iommu_enabled = 0;
281 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
282
283 static int dmar_map_gfx = 1;
284 static int intel_iommu_superpage = 1;
285 static int iommu_identity_mapping;
286 static int iommu_skip_te_disable;
287
288 #define IDENTMAP_GFX            2
289 #define IDENTMAP_AZALIA         4
290
291 const struct iommu_ops intel_iommu_ops;
292
293 static bool translation_pre_enabled(struct intel_iommu *iommu)
294 {
295         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
296 }
297
298 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
299 {
300         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
301 }
302
303 static void init_translation_status(struct intel_iommu *iommu)
304 {
305         u32 gsts;
306
307         gsts = readl(iommu->reg + DMAR_GSTS_REG);
308         if (gsts & DMA_GSTS_TES)
309                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
310 }
311
312 static int __init intel_iommu_setup(char *str)
313 {
314         if (!str)
315                 return -EINVAL;
316
317         while (*str) {
318                 if (!strncmp(str, "on", 2)) {
319                         dmar_disabled = 0;
320                         pr_info("IOMMU enabled\n");
321                 } else if (!strncmp(str, "off", 3)) {
322                         dmar_disabled = 1;
323                         no_platform_optin = 1;
324                         pr_info("IOMMU disabled\n");
325                 } else if (!strncmp(str, "igfx_off", 8)) {
326                         dmar_map_gfx = 0;
327                         pr_info("Disable GFX device mapping\n");
328                 } else if (!strncmp(str, "forcedac", 8)) {
329                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
330                         iommu_dma_forcedac = true;
331                 } else if (!strncmp(str, "strict", 6)) {
332                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
333                         iommu_set_dma_strict();
334                 } else if (!strncmp(str, "sp_off", 6)) {
335                         pr_info("Disable supported super page\n");
336                         intel_iommu_superpage = 0;
337                 } else if (!strncmp(str, "sm_on", 5)) {
338                         pr_info("Enable scalable mode if hardware supports\n");
339                         intel_iommu_sm = 1;
340                 } else if (!strncmp(str, "sm_off", 6)) {
341                         pr_info("Scalable mode is disallowed\n");
342                         intel_iommu_sm = 0;
343                 } else if (!strncmp(str, "tboot_noforce", 13)) {
344                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
345                         intel_iommu_tboot_noforce = 1;
346                 } else {
347                         pr_notice("Unknown option - '%s'\n", str);
348                 }
349
350                 str += strcspn(str, ",");
351                 while (*str == ',')
352                         str++;
353         }
354
355         return 1;
356 }
357 __setup("intel_iommu=", intel_iommu_setup);
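/*
 * The parser above splits on commas, so options can be combined on the
 * kernel command line, e.g. "intel_iommu=on,sm_on,igfx_off".
 */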
358
359 void *alloc_pgtable_page(int node)
360 {
361         struct page *page;
362         void *vaddr = NULL;
363
364         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
365         if (page)
366                 vaddr = page_address(page);
367         return vaddr;
368 }
369
370 void free_pgtable_page(void *vaddr)
371 {
372         free_page((unsigned long)vaddr);
373 }
374
375 static inline int domain_type_is_si(struct dmar_domain *domain)
376 {
377         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
378 }
379
380 static inline bool domain_use_first_level(struct dmar_domain *domain)
381 {
382         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
383 }
384
385 static inline int domain_pfn_supported(struct dmar_domain *domain,
386                                        unsigned long pfn)
387 {
388         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
389
390         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
391 }
392
393 /*
394  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
395  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
396  * the returned SAGAW.
397  */
398 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
399 {
400         unsigned long fl_sagaw, sl_sagaw;
401
402         fl_sagaw = BIT(2) | (cap_fl1gp_support(iommu->cap) ? BIT(3) : 0);
403         sl_sagaw = cap_sagaw(iommu->cap);
404
405         /* Second level only. */
406         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
407                 return sl_sagaw;
408
409         /* First level only. */
410         if (!ecap_slts(iommu->ecap))
411                 return fl_sagaw;
412
413         return fl_sagaw & sl_sagaw;
414 }
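/*
 * In the SAGAW encoding used above (VT-d spec 11.4.2), bit 1 means
 * 39-bit/3-level paging, bit 2 means 48-bit/4-level and bit 3 means
 * 57-bit/5-level; first-level translation always advertises 4-level
 * support (BIT(2)) and conditionally adds 5-level (BIT(3)).
 */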
415
416 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
417 {
418         unsigned long sagaw;
419         int agaw;
420
421         sagaw = __iommu_calculate_sagaw(iommu);
422         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
423                 if (test_bit(agaw, &sagaw))
424                         break;
425         }
426
427         return agaw;
428 }
429
430 /*
431  * Calculate max SAGAW for each iommu.
432  */
433 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
434 {
435         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
436 }
437
438 /*
439  * Calculate the agaw for each iommu.
440  * "SAGAW" may differ across iommus, so use a default agaw and fall back
441  * to a smaller supported agaw for iommus that don't support the default.
442  */
443 int iommu_calculate_agaw(struct intel_iommu *iommu)
444 {
445         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
446 }
447
448 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
449 {
450         return sm_supported(iommu) ?
451                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
452 }
453
454 static void domain_update_iommu_coherency(struct dmar_domain *domain)
455 {
456         struct iommu_domain_info *info;
457         struct dmar_drhd_unit *drhd;
458         struct intel_iommu *iommu;
459         bool found = false;
460         unsigned long i;
461
462         domain->iommu_coherency = true;
463         xa_for_each(&domain->iommu_array, i, info) {
464                 found = true;
465                 if (!iommu_paging_structure_coherency(info->iommu)) {
466                         domain->iommu_coherency = false;
467                         break;
468                 }
469         }
470         if (found)
471                 return;
472
473         /* No hardware attached; use lowest common denominator */
474         rcu_read_lock();
475         for_each_active_iommu(iommu, drhd) {
476                 if (!iommu_paging_structure_coherency(iommu)) {
477                         domain->iommu_coherency = false;
478                         break;
479                 }
480         }
481         rcu_read_unlock();
482 }
483
484 static int domain_update_iommu_superpage(struct dmar_domain *domain,
485                                          struct intel_iommu *skip)
486 {
487         struct dmar_drhd_unit *drhd;
488         struct intel_iommu *iommu;
489         int mask = 0x3;
490
491         if (!intel_iommu_superpage)
492                 return 0;
493
494         /* set iommu_superpage to the smallest common denominator */
495         rcu_read_lock();
496         for_each_active_iommu(iommu, drhd) {
497                 if (iommu != skip) {
498                         if (domain && domain_use_first_level(domain)) {
499                                 if (!cap_fl1gp_support(iommu->cap))
500                                         mask = 0x1;
501                         } else {
502                                 mask &= cap_super_page_val(iommu->cap);
503                         }
504
505                         if (!mask)
506                                 break;
507                 }
508         }
509         rcu_read_unlock();
510
511         return fls(mask);
512 }
513
514 static int domain_update_device_node(struct dmar_domain *domain)
515 {
516         struct device_domain_info *info;
517         int nid = NUMA_NO_NODE;
518         unsigned long flags;
519
520         spin_lock_irqsave(&domain->lock, flags);
521         list_for_each_entry(info, &domain->devices, link) {
522                 /*
523                  * There could be multiple device NUMA nodes, as devices within
524                  * the same domain may sit behind different IOMMUs. There is no
525                  * perfect answer in such a situation, so we pick the first node
526                  * we find (first come, first served).
527                  */
528                 nid = dev_to_node(info->dev);
529                 if (nid != NUMA_NO_NODE)
530                         break;
531         }
532         spin_unlock_irqrestore(&domain->lock, flags);
533
534         return nid;
535 }
536
537 static void domain_update_iotlb(struct dmar_domain *domain);
538
539 /* Return the super pagesize bitmap if supported. */
540 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
541 {
542         unsigned long bitmap = 0;
543
544         /*
545          * 1-level super page supports page size of 2MiB, 2-level super page
546          * supports page size of both 2MiB and 1GiB.
547          */
548         if (domain->iommu_superpage == 1)
549                 bitmap |= SZ_2M;
550         else if (domain->iommu_superpage == 2)
551                 bitmap |= SZ_2M | SZ_1G;
552
553         return bitmap;
554 }
555
556 /* Some capabilities may be different across iommus */
557 static void domain_update_iommu_cap(struct dmar_domain *domain)
558 {
559         domain_update_iommu_coherency(domain);
560         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
561
562         /*
563          * If RHSA is missing, we should default to the device numa domain
564          * as fall back.
565          */
566         if (domain->nid == NUMA_NO_NODE)
567                 domain->nid = domain_update_device_node(domain);
568
569         /*
570          * First-level translation restricts the input-address to a
571          * canonical address (i.e., address bits 63:N have the same
572          * value as address bit [N-1], where N is 48-bits with 4-level
573          * paging and 57-bits with 5-level paging). Hence, skip bit
574          * [N-1].
575          */
576         if (domain_use_first_level(domain))
577                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
578         else
579                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
580
581         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
582         domain_update_iotlb(domain);
583 }
584
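/*
 * Return the context entry for bus/devfn, optionally allocating the
 * context table on demand. In scalable mode the root entry carries two
 * context-table pointers (root->lo for devfn 0-127, root->hi for
 * 128-255) and each scalable-mode context entry is twice the legacy
 * size, which is why devfn is folded and doubled below.
 */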
585 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
586                                          u8 devfn, int alloc)
587 {
588         struct root_entry *root = &iommu->root_entry[bus];
589         struct context_entry *context;
590         u64 *entry;
591
592         /*
593          * Unless the caller explicitly requested allocation of a new entry,
594          * returning a copied context entry makes no sense.
595          */
596         if (!alloc && context_copied(iommu, bus, devfn))
597                 return NULL;
598
599         entry = &root->lo;
600         if (sm_supported(iommu)) {
601                 if (devfn >= 0x80) {
602                         devfn -= 0x80;
603                         entry = &root->hi;
604                 }
605                 devfn *= 2;
606         }
607         if (*entry & 1)
608                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
609         else {
610                 unsigned long phy_addr;
611                 if (!alloc)
612                         return NULL;
613
614                 context = alloc_pgtable_page(iommu->node);
615                 if (!context)
616                         return NULL;
617
618                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
619                 phy_addr = virt_to_phys((void *)context);
620                 *entry = phy_addr | 1;
621                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
622         }
623         return &context[devfn];
624 }
625
626 /**
627  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
628  *                               sub-hierarchy of a candidate PCI-PCI bridge
629  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
630  * @bridge: the candidate PCI-PCI bridge
631  *
632  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
633  */
634 static bool
635 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
636 {
637         struct pci_dev *pdev, *pbridge;
638
639         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
640                 return false;
641
642         pdev = to_pci_dev(dev);
643         pbridge = to_pci_dev(bridge);
644
645         if (pbridge->subordinate &&
646             pbridge->subordinate->number <= pdev->bus->number &&
647             pbridge->subordinate->busn_res.end >= pdev->bus->number)
648                 return true;
649
650         return false;
651 }
652
653 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
654 {
655         struct dmar_drhd_unit *drhd;
656         u32 vtbar;
657         int rc;
658
659         /* We know that this device on this chipset has its own IOMMU.
660          * If we find it under a different IOMMU, then the BIOS is lying
661          * to us. Hope that the IOMMU for this device is actually
662          * disabled, and it needs no translation...
663          */
664         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
665         if (rc) {
666                 /* "can't" happen */
667                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
668                 return false;
669         }
670         vtbar &= 0xffff0000;
671
672         /* we know that this iommu should be at offset 0xa000 from vtbar */
673         drhd = dmar_find_matched_drhd_unit(pdev);
674         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
675                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
676                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
677                 return true;
678         }
679
680         return false;
681 }
682
683 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
684 {
685         if (!iommu || iommu->drhd->ignored)
686                 return true;
687
688         if (dev_is_pci(dev)) {
689                 struct pci_dev *pdev = to_pci_dev(dev);
690
691                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
692                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
693                     quirk_ioat_snb_local_iommu(pdev))
694                         return true;
695         }
696
697         return false;
698 }
699
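/*
 * device_to_iommu - walk the DRHD device scope tables to find the IOMMU
 * that translates @dev. On a match, @bus and @devfn (if non-NULL) are
 * filled with the source-id that IOMMU will see for the device.
 */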
700 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
701 {
702         struct dmar_drhd_unit *drhd = NULL;
703         struct pci_dev *pdev = NULL;
704         struct intel_iommu *iommu;
705         struct device *tmp;
706         u16 segment = 0;
707         int i;
708
709         if (!dev)
710                 return NULL;
711
712         if (dev_is_pci(dev)) {
713                 struct pci_dev *pf_pdev;
714
715                 pdev = pci_real_dma_dev(to_pci_dev(dev));
716
717                 /* VFs aren't listed in scope tables; we need to look up
718                  * the PF instead to find the IOMMU. */
719                 pf_pdev = pci_physfn(pdev);
720                 dev = &pf_pdev->dev;
721                 segment = pci_domain_nr(pdev->bus);
722         } else if (has_acpi_companion(dev))
723                 dev = &ACPI_COMPANION(dev)->dev;
724
725         rcu_read_lock();
726         for_each_iommu(iommu, drhd) {
727                 if (pdev && segment != drhd->segment)
728                         continue;
729
730                 for_each_active_dev_scope(drhd->devices,
731                                           drhd->devices_cnt, i, tmp) {
732                         if (tmp == dev) {
733                                 /* For a VF use its original BDF# not that of the PF
734                                  * which we used for the IOMMU lookup. Strictly speaking
735                                  * we could do this for all PCI devices; we only need to
736                                  * get the BDF# from the scope table for ACPI matches. */
737                                 if (pdev && pdev->is_virtfn)
738                                         goto got_pdev;
739
740                                 if (bus && devfn) {
741                                         *bus = drhd->devices[i].bus;
742                                         *devfn = drhd->devices[i].devfn;
743                                 }
744                                 goto out;
745                         }
746
747                         if (is_downstream_to_pci_bridge(dev, tmp))
748                                 goto got_pdev;
749                 }
750
751                 if (pdev && drhd->include_all) {
752 got_pdev:
753                         if (bus && devfn) {
754                                 *bus = pdev->bus->number;
755                                 *devfn = pdev->devfn;
756                         }
757                         goto out;
758                 }
759         }
760         iommu = NULL;
761 out:
762         if (iommu_is_dummy(iommu, dev))
763                 iommu = NULL;
764
765         rcu_read_unlock();
766
767         return iommu;
768 }
769
770 static void domain_flush_cache(struct dmar_domain *domain,
771                                void *addr, int size)
772 {
773         if (!domain->iommu_coherency)
774                 clflush_cache_range(addr, size);
775 }
776
777 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
778 {
779         struct context_entry *context;
780         int ret = 0;
781
782         spin_lock(&iommu->lock);
783         context = iommu_context_addr(iommu, bus, devfn, 0);
784         if (context)
785                 ret = context_present(context);
786         spin_unlock(&iommu->lock);
787         return ret;
788 }
789
790 static void free_context_table(struct intel_iommu *iommu)
791 {
792         struct context_entry *context;
793         int i;
794
795         if (!iommu->root_entry)
796                 return;
797
798         for (i = 0; i < ROOT_ENTRY_NR; i++) {
799                 context = iommu_context_addr(iommu, i, 0, 0);
800                 if (context)
801                         free_pgtable_page(context);
802
803                 if (!sm_supported(iommu))
804                         continue;
805
806                 context = iommu_context_addr(iommu, i, 0x80, 0);
807                 if (context)
808                         free_pgtable_page(context);
809         }
810
811         free_pgtable_page(iommu->root_entry);
812         iommu->root_entry = NULL;
813 }
814
815 #ifdef CONFIG_DMAR_DEBUG
816 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
817                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
818 {
819         struct dma_pte *pte;
820         int offset;
821
822         while (1) {
823                 offset = pfn_level_offset(pfn, level);
824                 pte = &parent[offset];
825                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
826                         pr_info("PTE not present at level %d\n", level);
827                         break;
828                 }
829
830                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
831
832                 if (level == 1)
833                         break;
834
835                 parent = phys_to_virt(dma_pte_addr(pte));
836                 level--;
837         }
838 }
839
840 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
841                           unsigned long long addr, u32 pasid)
842 {
843         struct pasid_dir_entry *dir, *pde;
844         struct pasid_entry *entries, *pte;
845         struct context_entry *ctx_entry;
846         struct root_entry *rt_entry;
847         int i, dir_index, index, level;
848         u8 devfn = source_id & 0xff;
849         u8 bus = source_id >> 8;
850         struct dma_pte *pgtable;
851
852         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
853
854         /* root entry dump */
855         rt_entry = &iommu->root_entry[bus];
856         if (!rt_entry) {
857                 pr_info("root table entry is not present\n");
858                 return;
859         }
860
861         if (sm_supported(iommu))
862                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
863                         rt_entry->hi, rt_entry->lo);
864         else
865                 pr_info("root entry: 0x%016llx", rt_entry->lo);
866
867         /* context entry dump */
868         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
869         if (!ctx_entry) {
870                 pr_info("context table entry is not present\n");
871                 return;
872         }
873
874         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
875                 ctx_entry->hi, ctx_entry->lo);
876
877         /* legacy mode does not require PASID entries */
878         if (!sm_supported(iommu)) {
879                 level = agaw_to_level(ctx_entry->hi & 7);
880                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
881                 goto pgtable_walk;
882         }
883
884         /* get the pointer to pasid directory entry */
885         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886         if (!dir) {
887                 pr_info("pasid directory entry is not present\n");
888                 return;
889         }
890         /* For request-without-pasid, get the pasid from context entry */
891         if (intel_iommu_sm && pasid == INVALID_IOASID)
892                 pasid = PASID_RID2PASID;
893
894         dir_index = pasid >> PASID_PDE_SHIFT;
895         pde = &dir[dir_index];
896         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
897
898         /* get the pointer to the pasid table entry */
899         entries = get_pasid_table_from_pde(pde);
900         if (!entries) {
901                 pr_info("pasid table entry is not present\n");
902                 return;
903         }
904         index = pasid & PASID_PTE_MASK;
905         pte = &entries[index];
906         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
907                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
908
909         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
910                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
911                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
912         } else {
913                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
914                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
915         }
916
917 pgtable_walk:
918         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
919 }
920 #endif
921
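/*
 * Walk (and build, as needed) the page table down to the PTE mapping
 * @pfn. If *target_level is 0, stop at the first superpage or
 * non-present entry and report its level back through *target_level;
 * otherwise descend to exactly *target_level, allocating intermediate
 * tables along the way.
 */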
922 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
923                                       unsigned long pfn, int *target_level)
924 {
925         struct dma_pte *parent, *pte;
926         int level = agaw_to_level(domain->agaw);
927         int offset;
928
929         BUG_ON(!domain->pgd);
930
931         if (!domain_pfn_supported(domain, pfn))
932                 /* Address beyond IOMMU's addressing capabilities. */
933                 return NULL;
934
935         parent = domain->pgd;
936
937         while (1) {
938                 void *tmp_page;
939
940                 offset = pfn_level_offset(pfn, level);
941                 pte = &parent[offset];
942                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
943                         break;
944                 if (level == *target_level)
945                         break;
946
947                 if (!dma_pte_present(pte)) {
948                         uint64_t pteval;
949
950                         tmp_page = alloc_pgtable_page(domain->nid);
951
952                         if (!tmp_page)
953                                 return NULL;
954
955                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
956                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
957                         if (domain_use_first_level(domain)) {
958                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
959                                 if (iommu_is_dma_domain(&domain->domain))
960                                         pteval |= DMA_FL_PTE_ACCESS;
961                         }
962                         if (cmpxchg64(&pte->val, 0ULL, pteval))
963                                 /* Someone else set it while we were thinking; use theirs. */
964                                 free_pgtable_page(tmp_page);
965                         else
966                                 domain_flush_cache(domain, pte, sizeof(*pte));
967                 }
968                 if (level == 1)
969                         break;
970
971                 parent = phys_to_virt(dma_pte_addr(pte));
972                 level--;
973         }
974
975         if (!*target_level)
976                 *target_level = level;
977
978         return pte;
979 }
980
981 /* return address's pte at specific level */
982 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
983                                          unsigned long pfn,
984                                          int level, int *large_page)
985 {
986         struct dma_pte *parent, *pte;
987         int total = agaw_to_level(domain->agaw);
988         int offset;
989
990         parent = domain->pgd;
991         while (level <= total) {
992                 offset = pfn_level_offset(pfn, total);
993                 pte = &parent[offset];
994                 if (level == total)
995                         return pte;
996
997                 if (!dma_pte_present(pte)) {
998                         *large_page = total;
999                         break;
1000                 }
1001
1002                 if (dma_pte_superpage(pte)) {
1003                         *large_page = total;
1004                         return pte;
1005                 }
1006
1007                 parent = phys_to_virt(dma_pte_addr(pte));
1008                 total--;
1009         }
1010         return NULL;
1011 }
1012
1013 /* clear last level pte, a tlb flush should follow */
1014 static void dma_pte_clear_range(struct dmar_domain *domain,
1015                                 unsigned long start_pfn,
1016                                 unsigned long last_pfn)
1017 {
1018         unsigned int large_page;
1019         struct dma_pte *first_pte, *pte;
1020
1021         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1022         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1023         BUG_ON(start_pfn > last_pfn);
1024
1025         /* we don't need lock here; nobody else touches the iova range */
1026         do {
1027                 large_page = 1;
1028                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1029                 if (!pte) {
1030                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1031                         continue;
1032                 }
1033                 do {
1034                         dma_clear_pte(pte);
1035                         start_pfn += lvl_to_nr_pages(large_page);
1036                         pte++;
1037                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1038
1039                 domain_flush_cache(domain, first_pte,
1040                                    (void *)pte - (void *)first_pte);
1041
1042         } while (start_pfn && start_pfn <= last_pfn);
1043 }
1044
1045 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1046                                int retain_level, struct dma_pte *pte,
1047                                unsigned long pfn, unsigned long start_pfn,
1048                                unsigned long last_pfn)
1049 {
1050         pfn = max(start_pfn, pfn);
1051         pte = &pte[pfn_level_offset(pfn, level)];
1052
1053         do {
1054                 unsigned long level_pfn;
1055                 struct dma_pte *level_pte;
1056
1057                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1058                         goto next;
1059
1060                 level_pfn = pfn & level_mask(level);
1061                 level_pte = phys_to_virt(dma_pte_addr(pte));
1062
1063                 if (level > 2) {
1064                         dma_pte_free_level(domain, level - 1, retain_level,
1065                                            level_pte, level_pfn, start_pfn,
1066                                            last_pfn);
1067                 }
1068
1069                 /*
1070                  * Free the page table if we're below the level we want to
1071                  * retain and the range covers the entire table.
1072                  */
1073                 if (level < retain_level && !(start_pfn > level_pfn ||
1074                       last_pfn < level_pfn + level_size(level) - 1)) {
1075                         dma_clear_pte(pte);
1076                         domain_flush_cache(domain, pte, sizeof(*pte));
1077                         free_pgtable_page(level_pte);
1078                 }
1079 next:
1080                 pfn += level_size(level);
1081         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1082 }
1083
1084 /*
1085  * clear last level (leaf) ptes and free page table pages below the
1086  * level we wish to keep intact.
1087  */
1088 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1089                                    unsigned long start_pfn,
1090                                    unsigned long last_pfn,
1091                                    int retain_level)
1092 {
1093         dma_pte_clear_range(domain, start_pfn, last_pfn);
1094
1095         /* We don't need lock here; nobody else touches the iova range */
1096         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1097                            domain->pgd, 0, start_pfn, last_pfn);
1098
1099         /* free pgd */
1100         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1101                 free_pgtable_page(domain->pgd);
1102                 domain->pgd = NULL;
1103         }
1104 }
1105
1106 /* When a page at a given level is being unlinked from its parent, we don't
1107    need to *modify* it at all. All we need to do is make a list of all the
1108    pages which can be freed just as soon as we've flushed the IOTLB and we
1109    know the hardware page-walk will no longer touch them.
1110    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1111    be freed. */
1112 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1113                                     int level, struct dma_pte *pte,
1114                                     struct list_head *freelist)
1115 {
1116         struct page *pg;
1117
1118         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1119         list_add_tail(&pg->lru, freelist);
1120
1121         if (level == 1)
1122                 return;
1123
1124         pte = page_address(pg);
1125         do {
1126                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1127                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1128                 pte++;
1129         } while (!first_pte_in_page(pte));
1130 }
1131
1132 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1133                                 struct dma_pte *pte, unsigned long pfn,
1134                                 unsigned long start_pfn, unsigned long last_pfn,
1135                                 struct list_head *freelist)
1136 {
1137         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1138
1139         pfn = max(start_pfn, pfn);
1140         pte = &pte[pfn_level_offset(pfn, level)];
1141
1142         do {
1143                 unsigned long level_pfn = pfn & level_mask(level);
1144
1145                 if (!dma_pte_present(pte))
1146                         goto next;
1147
1148                 /* If range covers entire pagetable, free it */
1149                 if (start_pfn <= level_pfn &&
1150                     last_pfn >= level_pfn + level_size(level) - 1) {
1151                         /* These subordinate page tables are going away entirely. Don't
1152                            bother to clear them; we're just going to *free* them. */
1153                         if (level > 1 && !dma_pte_superpage(pte))
1154                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1155
1156                         dma_clear_pte(pte);
1157                         if (!first_pte)
1158                                 first_pte = pte;
1159                         last_pte = pte;
1160                 } else if (level > 1) {
1161                         /* Recurse down into a level that isn't *entirely* obsolete */
1162                         dma_pte_clear_level(domain, level - 1,
1163                                             phys_to_virt(dma_pte_addr(pte)),
1164                                             level_pfn, start_pfn, last_pfn,
1165                                             freelist);
1166                 }
1167 next:
1168                 pfn = level_pfn + level_size(level);
1169         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1170
1171         if (first_pte)
1172                 domain_flush_cache(domain, first_pte,
1173                                    (void *)++last_pte - (void *)first_pte);
1174 }
1175
1176 /* We can't just free the pages because the IOMMU may still be walking
1177    the page tables, and may have cached the intermediate levels. The
1178    pages can only be freed after the IOTLB flush has been done. */
1179 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1180                          unsigned long last_pfn, struct list_head *freelist)
1181 {
1182         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1183         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1184         BUG_ON(start_pfn > last_pfn);
1185
1186         /* we don't need lock here; nobody else touches the iova range */
1187         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1188                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1189
1190         /* free pgd */
1191         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1192                 struct page *pgd_page = virt_to_page(domain->pgd);
1193                 list_add_tail(&pgd_page->lru, freelist);
1194                 domain->pgd = NULL;
1195         }
1196 }
1197
1198 /* iommu handling */
1199 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1200 {
1201         struct root_entry *root;
1202
1203         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1204         if (!root) {
1205                 pr_err("Allocating root entry for %s failed\n",
1206                         iommu->name);
1207                 return -ENOMEM;
1208         }
1209
1210         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1211         iommu->root_entry = root;
1212
1213         return 0;
1214 }
1215
1216 static void iommu_set_root_entry(struct intel_iommu *iommu)
1217 {
1218         u64 addr;
1219         u32 sts;
1220         unsigned long flag;
1221
1222         addr = virt_to_phys(iommu->root_entry);
1223         if (sm_supported(iommu))
1224                 addr |= DMA_RTADDR_SMT;
1225
1226         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1228
1229         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1230
1231         /* Make sure hardware complete it */
1232         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233                       readl, (sts & DMA_GSTS_RTPS), sts);
1234
1235         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236
1237         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1238         if (sm_supported(iommu))
1239                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1240         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1241 }
1242
1243 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1244 {
1245         u32 val;
1246         unsigned long flag;
1247
1248         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1249                 return;
1250
1251         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1252         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1253
1254         /* Make sure hardware complete it */
1255         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1256                       readl, (!(val & DMA_GSTS_WBFS)), val);
1257
1258         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1259 }
1260
1261 /* return value determines if we need a write buffer flush */
1262 static void __iommu_flush_context(struct intel_iommu *iommu,
1263                                   u16 did, u16 source_id, u8 function_mask,
1264                                   u64 type)
1265 {
1266         u64 val = 0;
1267         unsigned long flag;
1268
1269         switch (type) {
1270         case DMA_CCMD_GLOBAL_INVL:
1271                 val = DMA_CCMD_GLOBAL_INVL;
1272                 break;
1273         case DMA_CCMD_DOMAIN_INVL:
1274                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1275                 break;
1276         case DMA_CCMD_DEVICE_INVL:
1277                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1278                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1279                 break;
1280         default:
1281                 BUG();
1282         }
1283         val |= DMA_CCMD_ICC;
1284
1285         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1286         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1287
1288         /* Make sure hardware complete it */
1289         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1290                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1291
1292         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1293 }
1294
1295 /* return value determines if we need a write buffer flush */
1296 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1297                                 u64 addr, unsigned int size_order, u64 type)
1298 {
1299         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1300         u64 val = 0, val_iva = 0;
1301         unsigned long flag;
1302
1303         switch (type) {
1304         case DMA_TLB_GLOBAL_FLUSH:
1305                 /* global flush doesn't need to set IVA_REG */
1306                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1307                 break;
1308         case DMA_TLB_DSI_FLUSH:
1309                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1310                 break;
1311         case DMA_TLB_PSI_FLUSH:
1312                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1313                 /* IH bit is passed in as part of address */
1314                 val_iva = size_order | addr;
1315                 break;
1316         default:
1317                 BUG();
1318         }
1319         /* Note: set drain read/write */
1320 #if 0
1321         /*
1322          * This is probably only here to be extra safe. Looks like we can
1323          * ignore it without any impact.
1324          */
1325         if (cap_read_drain(iommu->cap))
1326                 val |= DMA_TLB_READ_DRAIN;
1327 #endif
1328         if (cap_write_drain(iommu->cap))
1329                 val |= DMA_TLB_WRITE_DRAIN;
1330
1331         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1332         /* Note: Only uses first TLB reg currently */
1333         if (val_iva)
1334                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1335         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1336
1337         /* Make sure hardware complete it */
1338         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1339                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1340
1341         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1342
1343         /* check IOTLB invalidation granularity */
1344         if (DMA_TLB_IAIG(val) == 0)
1345                 pr_err("Flush IOTLB failed\n");
1346         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1347                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1348                         (unsigned long long)DMA_TLB_IIRG(type),
1349                         (unsigned long long)DMA_TLB_IAIG(val));
1350 }
1351
1352 static struct device_domain_info *
1353 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1354                         u8 bus, u8 devfn)
1355 {
1356         struct device_domain_info *info;
1357         unsigned long flags;
1358
1359         if (!iommu->qi)
1360                 return NULL;
1361
1362         spin_lock_irqsave(&domain->lock, flags);
1363         list_for_each_entry(info, &domain->devices, link) {
1364                 if (info->iommu == iommu && info->bus == bus &&
1365                     info->devfn == devfn) {
1366                         spin_unlock_irqrestore(&domain->lock, flags);
1367                         return info->ats_supported ? info : NULL;
1368                 }
1369         }
1370         spin_unlock_irqrestore(&domain->lock, flags);
1371
1372         return NULL;
1373 }
1374
1375 static void domain_update_iotlb(struct dmar_domain *domain)
1376 {
1377         struct device_domain_info *info;
1378         bool has_iotlb_device = false;
1379         unsigned long flags;
1380
1381         spin_lock_irqsave(&domain->lock, flags);
1382         list_for_each_entry(info, &domain->devices, link) {
1383                 if (info->ats_enabled) {
1384                         has_iotlb_device = true;
1385                         break;
1386                 }
1387         }
1388         domain->has_iotlb_device = has_iotlb_device;
1389         spin_unlock_irqrestore(&domain->lock, flags);
1390 }
1391
1392 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1393 {
1394         struct pci_dev *pdev;
1395
1396         if (!info || !dev_is_pci(info->dev))
1397                 return;
1398
1399         pdev = to_pci_dev(info->dev);
1400         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1401          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1402          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1403          * reserved, which should be set to 0.
1404          */
1405         if (!ecap_dit(info->iommu->ecap))
1406                 info->pfsid = 0;
1407         else {
1408                 struct pci_dev *pf_pdev;
1409
1410                 /* pdev will be returned if device is not a vf */
1411                 pf_pdev = pci_physfn(pdev);
1412                 info->pfsid = pci_dev_id(pf_pdev);
1413         }
1414
1415 #ifdef CONFIG_INTEL_IOMMU_SVM
1416         /* The PCIe spec, in its wisdom, declares that the behaviour of
1417            the device if you enable PASID support after ATS support is
1418            undefined. So always enable PASID support on devices which
1419            have it, even if we can't yet know if we're ever going to
1420            use it. */
1421         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1422                 info->pasid_enabled = 1;
1423
1424         if (info->pri_supported &&
1425             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1426             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1427                 info->pri_enabled = 1;
1428 #endif
1429         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1430             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1431                 info->ats_enabled = 1;
1432                 domain_update_iotlb(info->domain);
1433                 info->ats_qdep = pci_ats_queue_depth(pdev);
1434         }
1435 }
1436
1437 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1438 {
1439         struct pci_dev *pdev;
1440
1441         if (!dev_is_pci(info->dev))
1442                 return;
1443
1444         pdev = to_pci_dev(info->dev);
1445
1446         if (info->ats_enabled) {
1447                 pci_disable_ats(pdev);
1448                 info->ats_enabled = 0;
1449                 domain_update_iotlb(info->domain);
1450         }
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452         if (info->pri_enabled) {
1453                 pci_disable_pri(pdev);
1454                 info->pri_enabled = 0;
1455         }
1456         if (info->pasid_enabled) {
1457                 pci_disable_pasid(pdev);
1458                 info->pasid_enabled = 0;
1459         }
1460 #endif
1461 }
1462
1463 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1464                                     u64 addr, unsigned int mask)
1465 {
1466         u16 sid, qdep;
1467
1468         if (!info || !info->ats_enabled)
1469                 return;
1470
1471         sid = info->bus << 8 | info->devfn;
1472         qdep = info->ats_qdep;
1473         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1474                            qdep, addr, mask);
1475 }
1476
1477 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1478                                   u64 addr, unsigned mask)
1479 {
1480         struct device_domain_info *info;
1481         unsigned long flags;
1482
1483         if (!domain->has_iotlb_device)
1484                 return;
1485
1486         spin_lock_irqsave(&domain->lock, flags);
1487         list_for_each_entry(info, &domain->devices, link)
1488                 __iommu_flush_dev_iotlb(info, addr, mask);
1489         spin_unlock_irqrestore(&domain->lock, flags);
1490 }
1491
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493                                   struct dmar_domain *domain,
1494                                   unsigned long pfn, unsigned int pages,
1495                                   int ih, int map)
1496 {
1497         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1498         unsigned int mask = ilog2(aligned_pages);
1499         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1500         u16 did = domain_id_iommu(domain, iommu);
1501
1502         BUG_ON(pages == 0);
1503
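        /*
         * Encode the invalidation hint as bit 6; in the second-level path
         * below it is OR'ed directly into the flush address.
         */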
1504         if (ih)
1505                 ih = 1 << 6;
1506
1507         if (domain_use_first_level(domain)) {
1508                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1509         } else {
1510                 unsigned long bitmask = aligned_pages - 1;
1511
1512                 /*
1513                  * PSI masks the low order bits of the base address. If the
1514                  * address isn't aligned to the mask, then compute a mask value
1515                  * needed to ensure the target range is flushed.
1516                  */
1517                 if (unlikely(bitmask & pfn)) {
1518                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1519
1520                         /*
1521                          * Since end_pfn <= pfn + bitmask, the only way bits
1522                          * higher than bitmask can differ in pfn and end_pfn is
1523                          * by carrying. This means after masking out bitmask,
1524                          * high bits starting with the first set bit in
1525                          * shared_bits are all equal in both pfn and end_pfn.
1526                          */
1527                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1528                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1529                 }
1530
1531                 /*
1532                  * Fallback to domain selective flush if no PSI support or
1533                  * the size is too big.
1534                  */
1535                 if (!cap_pgsel_inv(iommu->cap) ||
1536                     mask > cap_max_amask_val(iommu->cap))
1537                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1538                                                         DMA_TLB_DSI_FLUSH);
1539                 else
1540                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1541                                                         DMA_TLB_PSI_FLUSH);
1542         }
1543
1544         /*
1545          * In caching mode, changing a page from non-present to present requires
1546          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1547          */
1548         if (!cap_caching_mode(iommu->cap) || !map)
1549                 iommu_flush_dev_iotlb(domain, addr, mask);
1550 }
1551
1552 /* Notification for newly created mappings */
1553 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1554                                         struct dmar_domain *domain,
1555                                         unsigned long pfn, unsigned int pages)
1556 {
1557         /*
1558          * It's a non-present to present mapping. Only flush if caching mode
1559          * and second level.
1560          */
1561         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1562                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1563         else
1564                 iommu_flush_write_buffer(iommu);
1565 }
1566
1567 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1568 {
1569         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1570         struct iommu_domain_info *info;
1571         unsigned long idx;
1572
1573         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1574                 struct intel_iommu *iommu = info->iommu;
1575                 u16 did = domain_id_iommu(dmar_domain, iommu);
1576
1577                 if (domain_use_first_level(dmar_domain))
1578                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1579                 else
1580                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1581                                                  DMA_TLB_DSI_FLUSH);
1582
1583                 if (!cap_caching_mode(iommu->cap))
1584                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1585         }
1586 }
1587
1588 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1589 {
1590         u32 pmen;
1591         unsigned long flags;
1592
1593         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1594                 return;
1595
1596         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1597         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1598         pmen &= ~DMA_PMEN_EPM;
1599         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1600
1601         /* wait for the protected region status bit to clear */
1602         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1603                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1604
1605         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1606 }
1607
1608 static void iommu_enable_translation(struct intel_iommu *iommu)
1609 {
1610         u32 sts;
1611         unsigned long flags;
1612
1613         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1614         iommu->gcmd |= DMA_GCMD_TE;
1615         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1616
1617         /* Make sure the hardware completes it */
1618         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1619                       readl, (sts & DMA_GSTS_TES), sts);
1620
1621         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1622 }
1623
1624 static void iommu_disable_translation(struct intel_iommu *iommu)
1625 {
1626         u32 sts;
1627         unsigned long flag;
1628
1629         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1630             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1631                 return;
1632
1633         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1634         iommu->gcmd &= ~DMA_GCMD_TE;
1635         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636
1637         /* Make sure the hardware completes it */
1638         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639                       readl, (!(sts & DMA_GSTS_TES)), sts);
1640
1641         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1642 }
1643
1644 static int iommu_init_domains(struct intel_iommu *iommu)
1645 {
1646         u32 ndomains;
1647
1648         ndomains = cap_ndoms(iommu->cap);
1649         pr_debug("%s: Number of Domains supported <%d>\n",
1650                  iommu->name, ndomains);
1651
1652         spin_lock_init(&iommu->lock);
1653
1654         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1655         if (!iommu->domain_ids)
1656                 return -ENOMEM;
1657
1658         /*
1659          * If Caching mode is set, then invalid translations are tagged
1660          * with domain-id 0, hence we need to pre-allocate it. We also
1661          * use domain-id 0 as a marker for non-allocated domain-id, so
1662          * make sure it is not used for a real domain.
1663          */
1664         set_bit(0, iommu->domain_ids);
1665
1666         /*
1667          * The VT-d spec rev3.0 (section 6.2.3.1) requires that each PASID
1668          * entry for first-level or pass-through translation modes be
1669          * programmed with a domain id different from those used for
1670          * second-level or nested translation. Reserve a domain id for
1671          * this purpose.
1672          */
1673         if (sm_supported(iommu))
1674                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1675
1676         return 0;
1677 }
1678
1679 static void disable_dmar_iommu(struct intel_iommu *iommu)
1680 {
1681         if (!iommu->domain_ids)
1682                 return;
1683
1684         /*
1685          * All iommu domains must have been detached from the devices,
1686          * hence there should be no domain IDs in use.
1687          */
1688         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1689                     > NUM_RESERVED_DID))
1690                 return;
1691
1692         if (iommu->gcmd & DMA_GCMD_TE)
1693                 iommu_disable_translation(iommu);
1694 }
1695
1696 static void free_dmar_iommu(struct intel_iommu *iommu)
1697 {
1698         if (iommu->domain_ids) {
1699                 bitmap_free(iommu->domain_ids);
1700                 iommu->domain_ids = NULL;
1701         }
1702
1703         if (iommu->copied_tables) {
1704                 bitmap_free(iommu->copied_tables);
1705                 iommu->copied_tables = NULL;
1706         }
1707
1708         /* free context mapping */
1709         free_context_table(iommu);
1710
1711 #ifdef CONFIG_INTEL_IOMMU_SVM
1712         if (pasid_supported(iommu)) {
1713                 if (ecap_prs(iommu->ecap))
1714                         intel_svm_finish_prq(iommu);
1715         }
1716         if (vccap_pasid(iommu->vccap))
1717                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1718
1719 #endif
1720 }
1721
1722 /*
1723  * Check and return whether first level is used by default for
1724  * DMA translation.
1725  */
1726 static bool first_level_by_default(unsigned int type)
1727 {
1728         /* Only SL is available in legacy mode */
1729         if (!scalable_mode_support())
1730                 return false;
1731
1732         /* Only one level (either FL or SL) is available, just use it */
1733         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1734                 return intel_cap_flts_sanity();
1735
1736         /* Both levels are available, decide based on the domain type */
1737         return type != IOMMU_DOMAIN_UNMANAGED;
1738 }
1739
1740 static struct dmar_domain *alloc_domain(unsigned int type)
1741 {
1742         struct dmar_domain *domain;
1743
1744         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1745         if (!domain)
1746                 return NULL;
1747
1748         domain->nid = NUMA_NO_NODE;
1749         if (first_level_by_default(type))
1750                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1751         domain->has_iotlb_device = false;
1752         INIT_LIST_HEAD(&domain->devices);
1753         spin_lock_init(&domain->lock);
1754         xa_init(&domain->iommu_array);
1755
1756         return domain;
1757 }
1758
1759 static int domain_attach_iommu(struct dmar_domain *domain,
1760                                struct intel_iommu *iommu)
1761 {
1762         struct iommu_domain_info *info, *curr;
1763         unsigned long ndomains;
1764         int num, ret = -ENOSPC;
1765
1766         info = kzalloc(sizeof(*info), GFP_KERNEL);
1767         if (!info)
1768                 return -ENOMEM;
1769
1770         spin_lock(&iommu->lock);
1771         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1772         if (curr) {
1773                 curr->refcnt++;
1774                 spin_unlock(&iommu->lock);
1775                 kfree(info);
1776                 return 0;
1777         }
1778
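        /* First attachment of this domain to the IOMMU: allocate a free domain ID. */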
1779         ndomains = cap_ndoms(iommu->cap);
1780         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1781         if (num >= ndomains) {
1782                 pr_err("%s: No free domain ids\n", iommu->name);
1783                 goto err_unlock;
1784         }
1785
1786         set_bit(num, iommu->domain_ids);
1787         info->refcnt    = 1;
1788         info->did       = num;
1789         info->iommu     = iommu;
1790         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1791                           NULL, info, GFP_ATOMIC);
1792         if (curr) {
1793                 ret = xa_err(curr) ? : -EBUSY;
1794                 goto err_clear;
1795         }
1796         domain_update_iommu_cap(domain);
1797
1798         spin_unlock(&iommu->lock);
1799         return 0;
1800
1801 err_clear:
1802         clear_bit(info->did, iommu->domain_ids);
1803 err_unlock:
1804         spin_unlock(&iommu->lock);
1805         kfree(info);
1806         return ret;
1807 }
1808
1809 static void domain_detach_iommu(struct dmar_domain *domain,
1810                                 struct intel_iommu *iommu)
1811 {
1812         struct iommu_domain_info *info;
1813
1814         spin_lock(&iommu->lock);
1815         info = xa_load(&domain->iommu_array, iommu->seq_id);
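        /*
         * Drop this domain's reference on the IOMMU and release the domain
         * ID once the last attached device is gone.
         */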
1816         if (--info->refcnt == 0) {
1817                 clear_bit(info->did, iommu->domain_ids);
1818                 xa_erase(&domain->iommu_array, iommu->seq_id);
1819                 domain->nid = NUMA_NO_NODE;
1820                 domain_update_iommu_cap(domain);
1821                 kfree(info);
1822         }
1823         spin_unlock(&iommu->lock);
1824 }
1825
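/*
 * Round the guest address width up to the next adjusted guest address width
 * supported by the page-table layout (12 + a multiple of the 9-bit level
 * stride), capped at 64 bits.
 */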
1826 static inline int guestwidth_to_adjustwidth(int gaw)
1827 {
1828         int agaw;
1829         int r = (gaw - 12) % 9;
1830
1831         if (r == 0)
1832                 agaw = gaw;
1833         else
1834                 agaw = gaw + 9 - r;
1835         if (agaw > 64)
1836                 agaw = 64;
1837         return agaw;
1838 }
1839
1840 static void domain_exit(struct dmar_domain *domain)
1841 {
1842         if (domain->pgd) {
1843                 LIST_HEAD(freelist);
1844
1845                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1846                 put_pages_list(&freelist);
1847         }
1848
1849         if (WARN_ON(!list_empty(&domain->devices)))
1850                 return;
1851
1852         kfree(domain);
1853 }
1854
1855 /*
1856  * Get the PASID directory size for a scalable mode context entry.
1857  * A value of X in the PDTS field of a scalable mode context entry
1858  * indicates a PASID directory with 2^(X + 7) entries.
1859  */
1860 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1861 {
1862         unsigned long pds, max_pde;
1863
1864         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1865         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1866         if (pds < 7)
1867                 return 0;
1868
1869         return pds - 7;
1870 }
1871
1872 /*
1873  * Set the RID_PASID field of a scalable mode context entry. The
1874  * IOMMU hardware will use the PASID value set in this field when
1875  * translating DMA requests that carry no PASID.
1876  */
1877 static inline void
1878 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1879 {
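        /* RID_PASID occupies the low 20 bits of the upper 64-bit word of the entry. */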
1880         context->hi |= pasid & ((1 << 20) - 1);
1881 }
1882
1883 /*
1884  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1885  * entry.
1886  */
1887 static inline void context_set_sm_dte(struct context_entry *context)
1888 {
1889         context->lo |= (1 << 2);
1890 }
1891
1892 /*
1893  * Set the PRE(Page Request Enable) field of a scalable mode context
1894  * entry.
1895  */
1896 static inline void context_set_sm_pre(struct context_entry *context)
1897 {
1898         context->lo |= (1 << 4);
1899 }
1900
1901 /* Convert value to context PASID directory size field coding. */
1902 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1903
1904 static int domain_context_mapping_one(struct dmar_domain *domain,
1905                                       struct intel_iommu *iommu,
1906                                       struct pasid_table *table,
1907                                       u8 bus, u8 devfn)
1908 {
1909         struct device_domain_info *info =
1910                         iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1911         u16 did = domain_id_iommu(domain, iommu);
1912         int translation = CONTEXT_TT_MULTI_LEVEL;
1913         struct context_entry *context;
1914         int ret;
1915
1916         WARN_ON(did == 0);
1917
1918         if (hw_pass_through && domain_type_is_si(domain))
1919                 translation = CONTEXT_TT_PASS_THROUGH;
1920
1921         pr_debug("Set context mapping for %02x:%02x.%d\n",
1922                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1923
1924         BUG_ON(!domain->pgd);
1925
1926         spin_lock(&iommu->lock);
1927         ret = -ENOMEM;
1928         context = iommu_context_addr(iommu, bus, devfn, 1);
1929         if (!context)
1930                 goto out_unlock;
1931
1932         ret = 0;
1933         if (context_present(context) && !context_copied(iommu, bus, devfn))
1934                 goto out_unlock;
1935
1936         /*
1937          * For kdump cases, old valid entries may be cached due to
1938          * in-flight DMA and the copied page tables, and they are never
1939          * explicitly unmapped, so an explicit cache flush is needed for
1940          * the newly-mapped device. By this point the device is expected
1941          * to have finished its reset at driver probe stage, so no
1942          * in-flight DMA exists and nothing further needs to be done
1943          * hereafter.
1944          */
1945         if (context_copied(iommu, bus, devfn)) {
1946                 u16 did_old = context_domain_id(context);
1947
1948                 if (did_old < cap_ndoms(iommu->cap)) {
1949                         iommu->flush.flush_context(iommu, did_old,
1950                                                    (((u16)bus) << 8) | devfn,
1951                                                    DMA_CCMD_MASK_NOBIT,
1952                                                    DMA_CCMD_DEVICE_INVL);
1953                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1954                                                  DMA_TLB_DSI_FLUSH);
1955                 }
1956
1957                 clear_context_copied(iommu, bus, devfn);
1958         }
1959
1960         context_clear_entry(context);
1961
1962         if (sm_supported(iommu)) {
1963                 unsigned long pds;
1964
1965                 WARN_ON(!table);
1966
1967                 /* Setup the PASID DIR pointer: */
1968                 pds = context_get_sm_pds(table);
1969                 context->lo = (u64)virt_to_phys(table->table) |
1970                                 context_pdts(pds);
1971
1972                 /* Setup the RID_PASID field: */
1973                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1974
1975                 /*
1976                  * Setup the Device-TLB enable bit and Page request
1977                  * Enable bit:
1978                  */
1979                 if (info && info->ats_supported)
1980                         context_set_sm_dte(context);
1981                 if (info && info->pri_supported)
1982                         context_set_sm_pre(context);
1983         } else {
1984                 struct dma_pte *pgd = domain->pgd;
1985                 int agaw;
1986
1987                 context_set_domain_id(context, did);
1988
1989                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1990                         /*
1991                          * Skip top levels of page tables for iommu which has
1992                          * less agaw than default. Unnecessary for PT mode.
1993                          */
1994                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1995                                 ret = -ENOMEM;
1996                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1997                                 if (!dma_pte_present(pgd))
1998                                         goto out_unlock;
1999                         }
2000
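                        /* Let the context advertise a Device-TLB when the device supports ATS. */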
2001                         if (info && info->ats_supported)
2002                                 translation = CONTEXT_TT_DEV_IOTLB;
2003                         else
2004                                 translation = CONTEXT_TT_MULTI_LEVEL;
2005
2006                         context_set_address_root(context, virt_to_phys(pgd));
2007                         context_set_address_width(context, agaw);
2008                 } else {
2009                         /*
2010                          * In pass through mode, AW must be programmed to
2011                          * indicate the largest AGAW value supported by
2012                          * hardware. And ASR is ignored by hardware.
2013                          */
2014                         context_set_address_width(context, iommu->msagaw);
2015                 }
2016
2017                 context_set_translation_type(context, translation);
2018         }
2019
2020         context_set_fault_enable(context);
2021         context_set_present(context);
2022         if (!ecap_coherent(iommu->ecap))
2023                 clflush_cache_range(context, sizeof(*context));
2024
2025         /*
2026          * It's a non-present to present mapping. If the hardware doesn't
2027          * cache non-present entries, we only need to flush the write-buffer.
2028          * If it _does_ cache non-present entries, then it does so in the
2029          * special domain #0, which we have to flush:
2030          */
2031         if (cap_caching_mode(iommu->cap)) {
2032                 iommu->flush.flush_context(iommu, 0,
2033                                            (((u16)bus) << 8) | devfn,
2034                                            DMA_CCMD_MASK_NOBIT,
2035                                            DMA_CCMD_DEVICE_INVL);
2036                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2037         } else {
2038                 iommu_flush_write_buffer(iommu);
2039         }
2040         iommu_enable_dev_iotlb(info);
2041
2042         ret = 0;
2043
2044 out_unlock:
2045         spin_unlock(&iommu->lock);
2046
2047         return ret;
2048 }
2049
2050 struct domain_context_mapping_data {
2051         struct dmar_domain *domain;
2052         struct intel_iommu *iommu;
2053         struct pasid_table *table;
2054 };
2055
2056 static int domain_context_mapping_cb(struct pci_dev *pdev,
2057                                      u16 alias, void *opaque)
2058 {
2059         struct domain_context_mapping_data *data = opaque;
2060
2061         return domain_context_mapping_one(data->domain, data->iommu,
2062                                           data->table, PCI_BUS_NUM(alias),
2063                                           alias & 0xff);
2064 }
2065
2066 static int
2067 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2068 {
2069         struct domain_context_mapping_data data;
2070         struct pasid_table *table;
2071         struct intel_iommu *iommu;
2072         u8 bus, devfn;
2073
2074         iommu = device_to_iommu(dev, &bus, &devfn);
2075         if (!iommu)
2076                 return -ENODEV;
2077
2078         table = intel_pasid_get_table(dev);
2079
2080         if (!dev_is_pci(dev))
2081                 return domain_context_mapping_one(domain, iommu, table,
2082                                                   bus, devfn);
2083
2084         data.domain = domain;
2085         data.iommu = iommu;
2086         data.table = table;
2087
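        /* Program a context entry for the device and each of its DMA aliases. */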
2088         return pci_for_each_dma_alias(to_pci_dev(dev),
2089                                       &domain_context_mapping_cb, &data);
2090 }
2091
2092 static int domain_context_mapped_cb(struct pci_dev *pdev,
2093                                     u16 alias, void *opaque)
2094 {
2095         struct intel_iommu *iommu = opaque;
2096
2097         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2098 }
2099
2100 static int domain_context_mapped(struct device *dev)
2101 {
2102         struct intel_iommu *iommu;
2103         u8 bus, devfn;
2104
2105         iommu = device_to_iommu(dev, &bus, &devfn);
2106         if (!iommu)
2107                 return -ENODEV;
2108
2109         if (!dev_is_pci(dev))
2110                 return device_context_mapped(iommu, bus, devfn);
2111
2112         return !pci_for_each_dma_alias(to_pci_dev(dev),
2113                                        domain_context_mapped_cb, iommu);
2114 }
2115
2116 /* Returns a number of VTD pages, but aligned to MM page size */
2117 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2118                                             size_t size)
2119 {
2120         host_addr &= ~PAGE_MASK;
2121         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2122 }
2123
2124 /* Return largest possible superpage level for a given mapping */
2125 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2126                                           unsigned long iov_pfn,
2127                                           unsigned long phy_pfn,
2128                                           unsigned long pages)
2129 {
2130         int support, level = 1;
2131         unsigned long pfnmerge;
2132
2133         support = domain->iommu_superpage;
2134
2135         /* To use a large page, the virtual *and* physical addresses
2136            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2137            of them will mean we have to use smaller pages. So just
2138            merge them and check both at once. */
2139         pfnmerge = iov_pfn | phy_pfn;
2140
2141         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2142                 pages >>= VTD_STRIDE_SHIFT;
2143                 if (!pages)
2144                         break;
2145                 pfnmerge >>= VTD_STRIDE_SHIFT;
2146                 level++;
2147                 support--;
2148         }
2149         return level;
2150 }
2151
2152 /*
2153  * Ensure that old small page tables are removed to make room for superpage(s).
2154  * We're going to add new large pages, so make sure we don't remove their parent
2155  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2156  */
2157 static void switch_to_super_page(struct dmar_domain *domain,
2158                                  unsigned long start_pfn,
2159                                  unsigned long end_pfn, int level)
2160 {
2161         unsigned long lvl_pages = lvl_to_nr_pages(level);
2162         struct iommu_domain_info *info;
2163         struct dma_pte *pte = NULL;
2164         unsigned long i;
2165
2166         while (start_pfn <= end_pfn) {
2167                 if (!pte)
2168                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2169
2170                 if (dma_pte_present(pte)) {
2171                         dma_pte_free_pagetable(domain, start_pfn,
2172                                                start_pfn + lvl_pages - 1,
2173                                                level + 1);
2174
2175                         xa_for_each(&domain->iommu_array, i, info)
2176                                 iommu_flush_iotlb_psi(info->iommu, domain,
2177                                                       start_pfn, lvl_pages,
2178                                                       0, 0);
2179                 }
2180
2181                 pte++;
2182                 start_pfn += lvl_pages;
2183                 if (first_pte_in_page(pte))
2184                         pte = NULL;
2185         }
2186 }
2187
2188 static int
2189 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2190                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2191 {
2192         struct dma_pte *first_pte = NULL, *pte = NULL;
2193         unsigned int largepage_lvl = 0;
2194         unsigned long lvl_pages = 0;
2195         phys_addr_t pteval;
2196         u64 attr;
2197
2198         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2199
2200         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2201                 return -EINVAL;
2202
2203         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2204         attr |= DMA_FL_PTE_PRESENT;
2205         if (domain_use_first_level(domain)) {
2206                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2207                 if (prot & DMA_PTE_WRITE)
2208                         attr |= DMA_FL_PTE_DIRTY;
2209         }
2210
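        /* Initial PTE value: physical address of the first page plus the attribute bits. */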
2211         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2212
2213         while (nr_pages > 0) {
2214                 uint64_t tmp;
2215
2216                 if (!pte) {
2217                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2218                                         phys_pfn, nr_pages);
2219
2220                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2221                         if (!pte)
2222                                 return -ENOMEM;
2223                         first_pte = pte;
2224
2225                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2226
2227                         /* It is a large page */
2228                         if (largepage_lvl > 1) {
2229                                 unsigned long end_pfn;
2230                                 unsigned long pages_to_remove;
2231
2232                                 pteval |= DMA_PTE_LARGE_PAGE;
2233                                 pages_to_remove = min_t(unsigned long, nr_pages,
2234                                                         nr_pte_to_next_page(pte) * lvl_pages);
2235                                 end_pfn = iov_pfn + pages_to_remove - 1;
2236                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2237                         } else {
2238                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2239                         }
2240
2241                 }
2242                 /* We don't need a lock here; nobody else
2243                  * touches this IOVA range.
2244                  */
2245                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2246                 if (tmp) {
2247                         static int dumps = 5;
2248                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2249                                 iov_pfn, tmp, (unsigned long long)pteval);
2250                         if (dumps) {
2251                                 dumps--;
2252                                 debug_dma_dump_mappings(NULL);
2253                         }
2254                         WARN_ON(1);
2255                 }
2256
2257                 nr_pages -= lvl_pages;
2258                 iov_pfn += lvl_pages;
2259                 phys_pfn += lvl_pages;
2260                 pteval += lvl_pages * VTD_PAGE_SIZE;
2261
2262                 /* If the next PTE would be the first in a new page, then we
2263                  * need to flush the cache on the entries we've just written.
2264                  * And then we'll need to recalculate 'pte', so clear it and
2265                  * let it get set again in the if (!pte) block above.
2266                  *
2267                  * If we're done (!nr_pages) we need to flush the cache too.
2268                  *
2269                  * Also if we've been setting superpages, we may need to
2270                  * recalculate 'pte' and switch back to smaller pages for the
2271                  * end of the mapping, if the trailing size is not enough to
2272                  * use another superpage (i.e. nr_pages < lvl_pages).
2273                  */
2274                 pte++;
2275                 if (!nr_pages || first_pte_in_page(pte) ||
2276                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2277                         domain_flush_cache(domain, first_pte,
2278                                            (void *)pte - (void *)first_pte);
2279                         pte = NULL;
2280                 }
2281         }
2282
2283         return 0;
2284 }
2285
2286 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2287 {
2288         struct intel_iommu *iommu = info->iommu;
2289         struct context_entry *context;
2290         u16 did_old;
2291
2292         if (!iommu)
2293                 return;
2294
2295         spin_lock(&iommu->lock);
2296         context = iommu_context_addr(iommu, bus, devfn, 0);
2297         if (!context) {
2298                 spin_unlock(&iommu->lock);
2299                 return;
2300         }
2301
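        /*
         * In scalable mode, take the old domain ID from the attached domain
         * (FLPT_DEFAULT_DID for hardware pass-through identity) rather than
         * from the context entry itself.
         */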
2302         if (sm_supported(iommu)) {
2303                 if (hw_pass_through && domain_type_is_si(info->domain))
2304                         did_old = FLPT_DEFAULT_DID;
2305                 else
2306                         did_old = domain_id_iommu(info->domain, iommu);
2307         } else {
2308                 did_old = context_domain_id(context);
2309         }
2310
2311         context_clear_entry(context);
2312         __iommu_flush_cache(iommu, context, sizeof(*context));
2313         spin_unlock(&iommu->lock);
2314         iommu->flush.flush_context(iommu,
2315                                    did_old,
2316                                    (((u16)bus) << 8) | devfn,
2317                                    DMA_CCMD_MASK_NOBIT,
2318                                    DMA_CCMD_DEVICE_INVL);
2319
2320         if (sm_supported(iommu))
2321                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2322
2323         iommu->flush.flush_iotlb(iommu,
2324                                  did_old,
2325                                  0,
2326                                  0,
2327                                  DMA_TLB_DSI_FLUSH);
2328
2329         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2330 }
2331
2332 static int domain_setup_first_level(struct intel_iommu *iommu,
2333                                     struct dmar_domain *domain,
2334                                     struct device *dev,
2335                                     u32 pasid)
2336 {
2337         struct dma_pte *pgd = domain->pgd;
2338         int agaw, level;
2339         int flags = 0;
2340
2341         /*
2342          * Skip top levels of page tables for iommu which has
2343          * less agaw than default. Unnecessary for PT mode.
2344          */
2345         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2346                 pgd = phys_to_virt(dma_pte_addr(pgd));
2347                 if (!dma_pte_present(pgd))
2348                         return -ENOMEM;
2349         }
2350
2351         level = agaw_to_level(agaw);
2352         if (level != 4 && level != 5)
2353                 return -EINVAL;
2354
2355         if (pasid != PASID_RID2PASID)
2356                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2357         if (level == 5)
2358                 flags |= PASID_FLAG_FL5LP;
2359
2360         if (domain->force_snooping)
2361                 flags |= PASID_FLAG_PAGE_SNOOP;
2362
2363         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2364                                              domain_id_iommu(domain, iommu),
2365                                              flags);
2366 }
2367
2368 static bool dev_is_real_dma_subdevice(struct device *dev)
2369 {
2370         return dev && dev_is_pci(dev) &&
2371                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2372 }
2373
2374 static int iommu_domain_identity_map(struct dmar_domain *domain,
2375                                      unsigned long first_vpfn,
2376                                      unsigned long last_vpfn)
2377 {
2378         /*
2379          * The RMRR range might overlap with the physical memory range,
2380          * so clear it first.
2381          */
2382         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2383
2384         return __domain_mapping(domain, first_vpfn,
2385                                 first_vpfn, last_vpfn - first_vpfn + 1,
2386                                 DMA_PTE_READ|DMA_PTE_WRITE);
2387 }
2388
2389 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2390
2391 static int __init si_domain_init(int hw)
2392 {
2393         struct dmar_rmrr_unit *rmrr;
2394         struct device *dev;
2395         int i, nid, ret;
2396
2397         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2398         if (!si_domain)
2399                 return -EFAULT;
2400
2401         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2402                 domain_exit(si_domain);
2403                 return -EFAULT;
2404         }
2405
2406         if (hw)
2407                 return 0;
2408
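        /* Identity map every usable physical memory range into the si_domain. */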
2409         for_each_online_node(nid) {
2410                 unsigned long start_pfn, end_pfn;
2411                 int i;
2412
2413                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2414                         ret = iommu_domain_identity_map(si_domain,
2415                                         mm_to_dma_pfn(start_pfn),
2416                                         mm_to_dma_pfn(end_pfn));
2417                         if (ret)
2418                                 return ret;
2419                 }
2420         }
2421
2422         /*
2423          * Identity map the RMRRs so that devices with RMRRs can also use
2424          * the si_domain.
2425          */
2426         for_each_rmrr_units(rmrr) {
2427                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2428                                           i, dev) {
2429                         unsigned long long start = rmrr->base_address;
2430                         unsigned long long end = rmrr->end_address;
2431
2432                         if (WARN_ON(end < start ||
2433                                     end >> agaw_to_width(si_domain->agaw)))
2434                                 continue;
2435
2436                         ret = iommu_domain_identity_map(si_domain,
2437                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2438                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2439                         if (ret)
2440                                 return ret;
2441                 }
2442         }
2443
2444         return 0;
2445 }
2446
2447 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2448 {
2449         struct device_domain_info *info = dev_iommu_priv_get(dev);
2450         struct intel_iommu *iommu;
2451         unsigned long flags;
2452         u8 bus, devfn;
2453         int ret;
2454
2455         iommu = device_to_iommu(dev, &bus, &devfn);
2456         if (!iommu)
2457                 return -ENODEV;
2458
2459         ret = domain_attach_iommu(domain, iommu);
2460         if (ret)
2461                 return ret;
2462         info->domain = domain;
2463         spin_lock_irqsave(&domain->lock, flags);
2464         list_add(&info->link, &domain->devices);
2465         spin_unlock_irqrestore(&domain->lock, flags);
2466
2467         /* PASID table is mandatory for a PCI device in scalable mode. */
2468         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2469                 ret = intel_pasid_alloc_table(dev);
2470                 if (ret) {
2471                         dev_err(dev, "PASID table allocation failed\n");
2472                         dmar_remove_one_dev_info(dev);
2473                         return ret;
2474                 }
2475
2476                 /* Setup the PASID entry for requests without PASID: */
2477                 if (hw_pass_through && domain_type_is_si(domain))
2478                         ret = intel_pasid_setup_pass_through(iommu, domain,
2479                                         dev, PASID_RID2PASID);
2480                 else if (domain_use_first_level(domain))
2481                         ret = domain_setup_first_level(iommu, domain, dev,
2482                                         PASID_RID2PASID);
2483                 else
2484                         ret = intel_pasid_setup_second_level(iommu, domain,
2485                                         dev, PASID_RID2PASID);
2486                 if (ret) {
2487                         dev_err(dev, "Setup RID2PASID failed\n");
2488                         dmar_remove_one_dev_info(dev);
2489                         return ret;
2490                 }
2491         }
2492
2493         ret = domain_context_mapping(domain, dev);
2494         if (ret) {
2495                 dev_err(dev, "Domain context map failed\n");
2496                 dmar_remove_one_dev_info(dev);
2497                 return ret;
2498         }
2499
2500         return 0;
2501 }
2502
2503 static bool device_has_rmrr(struct device *dev)
2504 {
2505         struct dmar_rmrr_unit *rmrr;
2506         struct device *tmp;
2507         int i;
2508
2509         rcu_read_lock();
2510         for_each_rmrr_units(rmrr) {
2511                 /*
2512                  * Return TRUE if this RMRR contains the device that
2513                  * is passed in.
2514                  */
2515                 for_each_active_dev_scope(rmrr->devices,
2516                                           rmrr->devices_cnt, i, tmp)
2517                         if (tmp == dev ||
2518                             is_downstream_to_pci_bridge(dev, tmp)) {
2519                                 rcu_read_unlock();
2520                                 return true;
2521                         }
2522         }
2523         rcu_read_unlock();
2524         return false;
2525 }
2526
2527 /**
2528  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2529  * is relaxable (i.e. may be left unenforced under some conditions)
2530  * @dev: device handle
2531  *
2532  * We assume that PCI USB devices with RMRRs have them largely
2533  * for historical reasons and that the RMRR space is not actively used post
2534  * boot.  This exclusion may change if vendors begin to abuse it.
2535  *
2536  * The same exception is made for graphics devices, with the requirement that
2537  * any use of the RMRR regions will be torn down before assigning the device
2538  * to a guest.
2539  *
2540  * Return: true if the RMRR is relaxable, false otherwise
2541  */
2542 static bool device_rmrr_is_relaxable(struct device *dev)
2543 {
2544         struct pci_dev *pdev;
2545
2546         if (!dev_is_pci(dev))
2547                 return false;
2548
2549         pdev = to_pci_dev(dev);
2550         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2551                 return true;
2552         else
2553                 return false;
2554 }
2555
2556 /*
2557  * There are a couple of cases where we need to restrict the functionality of
2558  * devices associated with RMRRs.  The first is when evaluating a device for
2559  * identity mapping because problems exist when devices are moved in and out
2560  * of domains and their respective RMRR information is lost.  This means that
2561  * a device with associated RMRRs will never be in a "passthrough" domain.
2562  * The second is use of the device through the IOMMU API.  This interface
2563  * expects to have full control of the IOVA space for the device.  We cannot
2564  * satisfy both the requirement that RMRR access is maintained and have an
2565  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2566  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2567  * We therefore prevent devices associated with an RMRR from participating in
2568  * the IOMMU API, which eliminates them from device assignment.
2569  *
2570  * In both cases, devices which have relaxable RMRRs are not concerned by this
2571  * restriction. See device_rmrr_is_relaxable comment.
2572  */
2573 static bool device_is_rmrr_locked(struct device *dev)
2574 {
2575         if (!device_has_rmrr(dev))
2576                 return false;
2577
2578         if (device_rmrr_is_relaxable(dev))
2579                 return false;
2580
2581         return true;
2582 }
2583
2584 /*
2585  * Return the required default domain type for a specific device.
2586  *
2587  * @dev: the device in query
2589  *
2590  * Returns:
2591  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2592  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2593  *  - 0: both identity and dynamic domains work for this device
2594  */
2595 static int device_def_domain_type(struct device *dev)
2596 {
2597         if (dev_is_pci(dev)) {
2598                 struct pci_dev *pdev = to_pci_dev(dev);
2599
2600                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2601                         return IOMMU_DOMAIN_IDENTITY;
2602
2603                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2604                         return IOMMU_DOMAIN_IDENTITY;
2605         }
2606
2607         return 0;
2608 }
2609
2610 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2611 {
2612         /*
2613          * Start from a sane IOMMU hardware state.
2614          * If queued invalidation has already been initialized by us
2615          * (for example, while enabling interrupt remapping), then
2616          * things are already rolling from a sane state.
2617          */
2618         if (!iommu->qi) {
2619                 /*
2620                  * Clear any previous faults.
2621                  */
2622                 dmar_fault(-1, iommu);
2623                 /*
2624                  * Disable queued invalidation if supported and already enabled
2625                  * before OS handover.
2626                  */
2627                 dmar_disable_qi(iommu);
2628         }
2629
2630         if (dmar_enable_qi(iommu)) {
2631                 /*
2632                  * Queued invalidation is not enabled, use register-based invalidation
2633                  */
2634                 iommu->flush.flush_context = __iommu_flush_context;
2635                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2636                 pr_info("%s: Using Register based invalidation\n",
2637                         iommu->name);
2638         } else {
2639                 iommu->flush.flush_context = qi_flush_context;
2640                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2641                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2642         }
2643 }
2644
2645 static int copy_context_table(struct intel_iommu *iommu,
2646                               struct root_entry *old_re,
2647                               struct context_entry **tbl,
2648                               int bus, bool ext)
2649 {
2650         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2651         struct context_entry *new_ce = NULL, ce;
2652         struct context_entry *old_ce = NULL;
2653         struct root_entry re;
2654         phys_addr_t old_ce_phys;
2655
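        /*
         * In extended (scalable) mode each bus has two context tables, lower
         * for devfn 0-127 and upper for devfn 128-255, and each entry is
         * twice the legacy size, hence the doubled indices below.
         */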
2656         tbl_idx = ext ? bus * 2 : bus;
2657         memcpy(&re, old_re, sizeof(re));
2658
2659         for (devfn = 0; devfn < 256; devfn++) {
2660                 /* First calculate the correct index */
2661                 idx = (ext ? devfn * 2 : devfn) % 256;
2662
2663                 if (idx == 0) {
2664                         /* First save what we may have and clean up */
2665                         if (new_ce) {
2666                                 tbl[tbl_idx] = new_ce;
2667                                 __iommu_flush_cache(iommu, new_ce,
2668                                                     VTD_PAGE_SIZE);
2669                                 pos = 1;
2670                         }
2671
2672                         if (old_ce)
2673                                 memunmap(old_ce);
2674
2675                         ret = 0;
2676                         if (devfn < 0x80)
2677                                 old_ce_phys = root_entry_lctp(&re);
2678                         else
2679                                 old_ce_phys = root_entry_uctp(&re);
2680
2681                         if (!old_ce_phys) {
2682                                 if (ext && devfn == 0) {
2683                                         /* No LCTP, try UCTP */
2684                                         devfn = 0x7f;
2685                                         continue;
2686                                 } else {
2687                                         goto out;
2688                                 }
2689                         }
2690
2691                         ret = -ENOMEM;
2692                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2693                                         MEMREMAP_WB);
2694                         if (!old_ce)
2695                                 goto out;
2696
2697                         new_ce = alloc_pgtable_page(iommu->node);
2698                         if (!new_ce)
2699                                 goto out_unmap;
2700
2701                         ret = 0;
2702                 }
2703
2704                 /* Now copy the context entry */
2705                 memcpy(&ce, old_ce + idx, sizeof(ce));
2706
2707                 if (!context_present(&ce))
2708                         continue;
2709
2710                 did = context_domain_id(&ce);
2711                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2712                         set_bit(did, iommu->domain_ids);
2713
2714                 set_context_copied(iommu, bus, devfn);
2715                 new_ce[idx] = ce;
2716         }
2717
2718         tbl[tbl_idx + pos] = new_ce;
2719
2720         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2721
2722 out_unmap:
2723         memunmap(old_ce);
2724
2725 out:
2726         return ret;
2727 }
2728
2729 static int copy_translation_tables(struct intel_iommu *iommu)
2730 {
2731         struct context_entry **ctxt_tbls;
2732         struct root_entry *old_rt;
2733         phys_addr_t old_rt_phys;
2734         int ctxt_table_entries;
2735         u64 rtaddr_reg;
2736         int bus, ret;
2737         bool new_ext, ext;
2738
2739         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2740         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2741         new_ext    = !!sm_supported(iommu);
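        /*
         * 'ext' tells whether the previous kernel programmed a scalable-mode
         * root table; 'new_ext' whether this kernel would.
         */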
2742
2743         /*
2744          * The RTT bit can only be changed when translation is disabled,
2745          * but disabling translation would open a window for data
2746          * corruption. So bail out and don't copy anything if we would
2747          * have to change the bit.
2748          */
2749         if (new_ext != ext)
2750                 return -EINVAL;
2751
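        /* One bit per possible (bus, devfn) pair: 256 * 256 = 2^16 bits. */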
2752         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2753         if (!iommu->copied_tables)
2754                 return -ENOMEM;
2755
2756         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2757         if (!old_rt_phys)
2758                 return -EINVAL;
2759
2760         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2761         if (!old_rt)
2762                 return -ENOMEM;
2763
2764         /* This is too big for the stack - allocate it from slab */
2765         ctxt_table_entries = ext ? 512 : 256;
2766         ret = -ENOMEM;
2767         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2768         if (!ctxt_tbls)
2769                 goto out_unmap;
2770
2771         for (bus = 0; bus < 256; bus++) {
2772                 ret = copy_context_table(iommu, &old_rt[bus],
2773                                          ctxt_tbls, bus, ext);
2774                 if (ret) {
2775                         pr_err("%s: Failed to copy context table for bus %d\n",
2776                                 iommu->name, bus);
2777                         continue;
2778                 }
2779         }
2780
2781         spin_lock(&iommu->lock);
2782
2783         /* Context tables are copied, now write them to the root_entry table */
2784         for (bus = 0; bus < 256; bus++) {
2785                 int idx = ext ? bus * 2 : bus;
2786                 u64 val;
2787
2788                 if (ctxt_tbls[idx]) {
2789                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2790                         iommu->root_entry[bus].lo = val;
2791                 }
2792
2793                 if (!ext || !ctxt_tbls[idx + 1])
2794                         continue;
2795
2796                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2797                 iommu->root_entry[bus].hi = val;
2798         }
2799
2800         spin_unlock(&iommu->lock);
2801
2802         kfree(ctxt_tbls);
2803
2804         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2805
2806         ret = 0;
2807
2808 out_unmap:
2809         memunmap(old_rt);
2810
2811         return ret;
2812 }
2813
2814 #ifdef CONFIG_INTEL_IOMMU_SVM
2815 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2816 {
2817         struct intel_iommu *iommu = data;
2818         ioasid_t ioasid;
2819
2820         if (!iommu)
2821                 return INVALID_IOASID;
2822         /*
2823          * The VT-d virtual command interface always uses the full 20-bit
2824          * PASID range. The host can partition the guest PASID range based
2825          * on policy, but that is out of the guest's control.
2826          */
2827         if (min < PASID_MIN || max > intel_pasid_max_id)
2828                 return INVALID_IOASID;
2829
2830         if (vcmd_alloc_pasid(iommu, &ioasid))
2831                 return INVALID_IOASID;
2832
2833         return ioasid;
2834 }
2835
2836 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2837 {
2838         struct intel_iommu *iommu = data;
2839
2840         if (!iommu)
2841                 return;
2842         /*
2843          * The sanity check of the ioasid owner is done at an upper layer,
2844          * e.g. VFIO. We can only free the PASID when all devices are unbound.
2845          */
2846         if (ioasid_find(NULL, ioasid, NULL)) {
2847                 pr_alert("Cannot free active IOASID %d\n", ioasid);
2848                 return;
2849         }
2850         vcmd_free_pasid(iommu, ioasid);
2851 }
2852
2853 static void register_pasid_allocator(struct intel_iommu *iommu)
2854 {
2855         /*
2856          * If we are running in the host, there is no need for a custom
2857          * allocator because PASIDs are allocated system-wide by the host.
2858          */
2859         if (!cap_caching_mode(iommu->cap))
2860                 return;
2861
2862         if (!sm_supported(iommu)) {
2863                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2864                 return;
2865         }
2866
2867         /*
2868          * Register a custom PASID allocator if we are running in a guest;
2869          * guest PASIDs must be obtained via the virtual command interface.
2870          * There can be multiple vIOMMUs in each guest but only one allocator
2871          * is active. All vIOMMU allocators will eventually be calling the same
2872          * host allocator.
2873          */
2874         if (!vccap_pasid(iommu->vccap))
2875                 return;
2876
2877         pr_info("Register custom PASID allocator\n");
2878         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2879         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2880         iommu->pasid_allocator.pdata = (void *)iommu;
2881         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2882                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2883                 /*
2884                  * Disable scalable mode on this IOMMU if there
2885                  * is no custom allocator. Mixing SM-capable and
2886                  * non-SM vIOMMUs is not supported.
2887                  */
2888                 intel_iommu_sm = 0;
2889         }
2890 }
2891 #endif
2892
2893 static int __init init_dmars(void)
2894 {
2895         struct dmar_drhd_unit *drhd;
2896         struct intel_iommu *iommu;
2897         int ret;
2898
2899         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2900         if (ret)
2901                 goto free_iommu;
2902
2903         for_each_iommu(iommu, drhd) {
2904                 if (drhd->ignored) {
2905                         iommu_disable_translation(iommu);
2906                         continue;
2907                 }
2908
2909                 /*
2910                  * Find the max PASID size of all IOMMUs in the system.
2911                  * We need to ensure the system PASID table is no bigger
2912                  * than the smallest supported size.
2913                  */
2914                 if (pasid_supported(iommu)) {
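                        /*
                         * ecap_pss() reports the supported PASID width minus
                         * one, so 2 << pss is the number of PASIDs supported.
                         */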
2915                         u32 temp = 2 << ecap_pss(iommu->ecap);
2916
2917                         intel_pasid_max_id = min_t(u32, temp,
2918                                                    intel_pasid_max_id);
2919                 }
2920
2921                 intel_iommu_init_qi(iommu);
2922
2923                 ret = iommu_init_domains(iommu);
2924                 if (ret)
2925                         goto free_iommu;
2926
2927                 init_translation_status(iommu);
2928
2929                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2930                         iommu_disable_translation(iommu);
2931                         clear_translation_pre_enabled(iommu);
2932                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2933                                 iommu->name);
2934                 }
2935
2936                 /*
2937                  * TBD:
2938                  * we could share the same root & context tables
2939                  * among all IOMMUs. Need to split it later.
2940                  */
2941                 ret = iommu_alloc_root_entry(iommu);
2942                 if (ret)
2943                         goto free_iommu;
2944
2945                 if (translation_pre_enabled(iommu)) {
2946                         pr_info("Translation already enabled - trying to copy translation structures\n");
2947
2948                         ret = copy_translation_tables(iommu);
2949                         if (ret) {
2950                                 /*
2951                                  * We found the IOMMU with translation
2952                                  * enabled - but failed to copy over the
2953                                  * old root-entry table. Try to proceed
2954                                  * by disabling translation now and
2955                                  * allocating a clean root-entry table.
2956                                  * This might cause DMAR faults, but
2957                                  * probably the dump will still succeed.
2958                                  */
2959                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2960                                        iommu->name);
2961                                 iommu_disable_translation(iommu);
2962                                 clear_translation_pre_enabled(iommu);
2963                         } else {
2964                                 pr_info("Copied translation tables from previous kernel for %s\n",
2965                                         iommu->name);
2966                         }
2967                 }
2968
2969                 if (!ecap_pass_through(iommu->ecap))
2970                         hw_pass_through = 0;
2971                 intel_svm_check(iommu);
2972         }
2973
2974         /*
2975          * Now that QI is enabled on all IOMMUs, set the root entry and flush
2976          * caches. This is required on some Intel X58 chipsets; otherwise the
2977          * flush_context function loops forever and the boot hangs.
2978          */
2979         for_each_active_iommu(iommu, drhd) {
2980                 iommu_flush_write_buffer(iommu);
2981 #ifdef CONFIG_INTEL_IOMMU_SVM
2982                 register_pasid_allocator(iommu);
2983 #endif
2984                 iommu_set_root_entry(iommu);
2985         }
2986
2987 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2988         dmar_map_gfx = 0;
2989 #endif
2990
2991         if (!dmar_map_gfx)
2992                 iommu_identity_mapping |= IDENTMAP_GFX;
2993
2994         check_tylersburg_isoch();
2995
2996         ret = si_domain_init(hw_pass_through);
2997         if (ret)
2998                 goto free_iommu;
2999
3000         /*
3001          * for each drhd
3002          *   enable fault log
3003          *   global invalidate context cache
3004          *   global invalidate iotlb
3005          *   enable translation
3006          */
3007         for_each_iommu(iommu, drhd) {
3008                 if (drhd->ignored) {
3009                         /*
3010                          * we always have to disable PMRs or DMA may fail on
3011                          * this device
3012                          */
3013                         if (force_on)
3014                                 iommu_disable_protect_mem_regions(iommu);
3015                         continue;
3016                 }
3017
3018                 iommu_flush_write_buffer(iommu);
3019
3020 #ifdef CONFIG_INTEL_IOMMU_SVM
3021                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3022                         ret = intel_svm_enable_prq(iommu);
3023                         if (ret)
3024                                 goto free_iommu;
3025                 }
3026 #endif
3027                 ret = dmar_set_interrupt(iommu);
3028                 if (ret)
3029                         goto free_iommu;
3030         }
3031
3032         return 0;
3033
3034 free_iommu:
3035         for_each_active_iommu(iommu, drhd) {
3036                 disable_dmar_iommu(iommu);
3037                 free_dmar_iommu(iommu);
3038         }
3039
3040         return ret;
3041 }
3042
3043 static void __init init_no_remapping_devices(void)
3044 {
3045         struct dmar_drhd_unit *drhd;
3046         struct device *dev;
3047         int i;
3048
3049         for_each_drhd_unit(drhd) {
3050                 if (!drhd->include_all) {
3051                         for_each_active_dev_scope(drhd->devices,
3052                                                   drhd->devices_cnt, i, dev)
3053                                 break;
3054                         /* ignore DMAR unit if no devices exist */
3055                         if (i == drhd->devices_cnt)
3056                                 drhd->ignored = 1;
3057                 }
3058         }
3059
3060         for_each_active_drhd_unit(drhd) {
3061                 if (drhd->include_all)
3062                         continue;
3063
3064                 for_each_active_dev_scope(drhd->devices,
3065                                           drhd->devices_cnt, i, dev)
3066                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3067                                 break;
3068                 if (i < drhd->devices_cnt)
3069                         continue;
3070
3071                 /* This IOMMU has *only* gfx devices. Mark it gfx-dedicated,
3072                    and bypass it entirely if gfx mapping is disabled. */
3073                 drhd->gfx_dedicated = 1;
3074                 if (!dmar_map_gfx)
3075                         drhd->ignored = 1;
3076         }
3077 }
3078
3079 #ifdef CONFIG_SUSPEND
3080 static int init_iommu_hw(void)
3081 {
3082         struct dmar_drhd_unit *drhd;
3083         struct intel_iommu *iommu = NULL;
3084
3085         for_each_active_iommu(iommu, drhd)
3086                 if (iommu->qi)
3087                         dmar_reenable_qi(iommu);
3088
3089         for_each_iommu(iommu, drhd) {
3090                 if (drhd->ignored) {
3091                         /*
3092                          * we always have to disable PMRs or DMA may fail on
3093                          * this device
3094                          */
3095                         if (force_on)
3096                                 iommu_disable_protect_mem_regions(iommu);
3097                         continue;
3098                 }
3099
3100                 iommu_flush_write_buffer(iommu);
3101                 iommu_set_root_entry(iommu);
3102                 iommu_enable_translation(iommu);
3103                 iommu_disable_protect_mem_regions(iommu);
3104         }
3105
3106         return 0;
3107 }
3108
3109 static void iommu_flush_all(void)
3110 {
3111         struct dmar_drhd_unit *drhd;
3112         struct intel_iommu *iommu;
3113
3114         for_each_active_iommu(iommu, drhd) {
3115                 iommu->flush.flush_context(iommu, 0, 0, 0,
3116                                            DMA_CCMD_GLOBAL_INVL);
3117                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3118                                          DMA_TLB_GLOBAL_FLUSH);
3119         }
3120 }
3121
3122 static int iommu_suspend(void)
3123 {
3124         struct dmar_drhd_unit *drhd;
3125         struct intel_iommu *iommu = NULL;
3126         unsigned long flag;
3127
3128         for_each_active_iommu(iommu, drhd) {
3129                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3130                                              GFP_KERNEL);
3131                 if (!iommu->iommu_state)
3132                         goto nomem;
3133         }
3134
3135         iommu_flush_all();
3136
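        /*
         * Only the fault-event programming (FECTL/FEDATA/FEADDR/FEUADDR) is
         * snapshotted below; iommu_resume() writes it back after
         * init_iommu_hw() has re-enabled queued invalidation and translation.
         */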
3137         for_each_active_iommu(iommu, drhd) {
3138                 iommu_disable_translation(iommu);
3139
3140                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3141
3142                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3143                         readl(iommu->reg + DMAR_FECTL_REG);
3144                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3145                         readl(iommu->reg + DMAR_FEDATA_REG);
3146                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3147                         readl(iommu->reg + DMAR_FEADDR_REG);
3148                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3149                         readl(iommu->reg + DMAR_FEUADDR_REG);
3150
3151                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3152         }
3153         return 0;
3154
3155 nomem:
3156         for_each_active_iommu(iommu, drhd)
3157                 kfree(iommu->iommu_state);
3158
3159         return -ENOMEM;
3160 }
3161
3162 static void iommu_resume(void)
3163 {
3164         struct dmar_drhd_unit *drhd;
3165         struct intel_iommu *iommu = NULL;
3166         unsigned long flag;
3167
3168         if (init_iommu_hw()) {
3169                 if (force_on)
3170                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3171                 else
3172                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3173                 return;
3174         }
3175
3176         for_each_active_iommu(iommu, drhd) {
3177
3178                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3179
3180                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3181                         iommu->reg + DMAR_FECTL_REG);
3182                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3183                         iommu->reg + DMAR_FEDATA_REG);
3184                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3185                         iommu->reg + DMAR_FEADDR_REG);
3186                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3187                         iommu->reg + DMAR_FEUADDR_REG);
3188
3189                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3190         }
3191
3192         for_each_active_iommu(iommu, drhd)
3193                 kfree(iommu->iommu_state);
3194 }
3195
3196 static struct syscore_ops iommu_syscore_ops = {
3197         .resume         = iommu_resume,
3198         .suspend        = iommu_suspend,
3199 };
3200
3201 static void __init init_iommu_pm_ops(void)
3202 {
3203         register_syscore_ops(&iommu_syscore_ops);
3204 }
3205
3206 #else
3207 static inline void init_iommu_pm_ops(void) {}
3208 #endif  /* CONFIG_SUSPEND */
3209
3210 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3211 {
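        /*
         * Illustrative example (made-up values): an RMRR reported as
         * [0x000e0000 - 0x000effff] is page-aligned with end > base and
         * passes; a region whose end_address is <= base_address, or whose
         * bounds are not page-aligned, is rejected and flagged as a
         * firmware bug by the caller.
         */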
3212         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3213             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3214             rmrr->end_address <= rmrr->base_address ||
3215             arch_rmrr_sanity_check(rmrr))
3216                 return -EINVAL;
3217
3218         return 0;
3219 }
3220
3221 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3222 {
3223         struct acpi_dmar_reserved_memory *rmrr;
3224         struct dmar_rmrr_unit *rmrru;
3225
3226         rmrr = (struct acpi_dmar_reserved_memory *)header;
3227         if (rmrr_sanity_check(rmrr)) {
3228                 pr_warn(FW_BUG
3229                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3230                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3231                            rmrr->base_address, rmrr->end_address,
3232                            dmi_get_system_info(DMI_BIOS_VENDOR),
3233                            dmi_get_system_info(DMI_BIOS_VERSION),
3234                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3235                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3236         }
3237
3238         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3239         if (!rmrru)
3240                 goto out;
3241
3242         rmrru->hdr = header;
3243
3244         rmrru->base_address = rmrr->base_address;
3245         rmrru->end_address = rmrr->end_address;
3246
3247         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3248                                 ((void *)rmrr) + rmrr->header.length,
3249                                 &rmrru->devices_cnt);
3250         if (rmrru->devices_cnt && rmrru->devices == NULL)
3251                 goto free_rmrru;
3252
3253         list_add(&rmrru->list, &dmar_rmrr_units);
3254
3255         return 0;
3256 free_rmrru:
3257         kfree(rmrru);
3258 out:
3259         return -ENOMEM;
3260 }
3261
3262 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3263 {
3264         struct dmar_atsr_unit *atsru;
3265         struct acpi_dmar_atsr *tmp;
3266
3267         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3268                                 dmar_rcu_check()) {
3269                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3270                 if (atsr->segment != tmp->segment)
3271                         continue;
3272                 if (atsr->header.length != tmp->header.length)
3273                         continue;
3274                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3275                         return atsru;
3276         }
3277
3278         return NULL;
3279 }
3280
3281 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3282 {
3283         struct acpi_dmar_atsr *atsr;
3284         struct dmar_atsr_unit *atsru;
3285
3286         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3287                 return 0;
3288
3289         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3290         atsru = dmar_find_atsr(atsr);
3291         if (atsru)
3292                 return 0;
3293
3294         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3295         if (!atsru)
3296                 return -ENOMEM;
3297
3298         /*
3299          * If the memory was allocated from the slab by an ACPI _DSM
3300          * method, copy its content because the memory buffer will be
3301          * freed on return.
3302          */
3303         atsru->hdr = (void *)(atsru + 1);
3304         memcpy(atsru->hdr, hdr, hdr->length);
3305         atsru->include_all = atsr->flags & 0x1;
3306         if (!atsru->include_all) {
3307                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3308                                 (void *)atsr + atsr->header.length,
3309                                 &atsru->devices_cnt);
3310                 if (atsru->devices_cnt && atsru->devices == NULL) {
3311                         kfree(atsru);
3312                         return -ENOMEM;
3313                 }
3314         }
3315
3316         list_add_rcu(&atsru->list, &dmar_atsr_units);
3317
3318         return 0;
3319 }
3320
3321 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3322 {
3323         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3324         kfree(atsru);
3325 }
3326
3327 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3328 {
3329         struct acpi_dmar_atsr *atsr;
3330         struct dmar_atsr_unit *atsru;
3331
3332         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3333         atsru = dmar_find_atsr(atsr);
3334         if (atsru) {
3335                 list_del_rcu(&atsru->list);
3336                 synchronize_rcu();
3337                 intel_iommu_free_atsr(atsru);
3338         }
3339
3340         return 0;
3341 }
3342
3343 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3344 {
3345         int i;
3346         struct device *dev;
3347         struct acpi_dmar_atsr *atsr;
3348         struct dmar_atsr_unit *atsru;
3349
3350         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3351         atsru = dmar_find_atsr(atsr);
3352         if (!atsru)
3353                 return 0;
3354
3355         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3356                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3357                                           i, dev)
3358                         return -EBUSY;
3359         }
3360
3361         return 0;
3362 }
3363
3364 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3365 {
3366         struct dmar_satc_unit *satcu;
3367         struct acpi_dmar_satc *tmp;
3368
3369         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3370                                 dmar_rcu_check()) {
3371                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3372                 if (satc->segment != tmp->segment)
3373                         continue;
3374                 if (satc->header.length != tmp->header.length)
3375                         continue;
3376                 if (memcmp(satc, tmp, satc->header.length) == 0)
3377                         return satcu;
3378         }
3379
3380         return NULL;
3381 }
3382
3383 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3384 {
3385         struct acpi_dmar_satc *satc;
3386         struct dmar_satc_unit *satcu;
3387
3388         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3389                 return 0;
3390
3391         satc = container_of(hdr, struct acpi_dmar_satc, header);
3392         satcu = dmar_find_satc(satc);
3393         if (satcu)
3394                 return 0;
3395
3396         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3397         if (!satcu)
3398                 return -ENOMEM;
3399
3400         satcu->hdr = (void *)(satcu + 1);
3401         memcpy(satcu->hdr, hdr, hdr->length);
3402         satcu->atc_required = satc->flags & 0x1;
3403         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3404                                               (void *)satc + satc->header.length,
3405                                               &satcu->devices_cnt);
3406         if (satcu->devices_cnt && !satcu->devices) {
3407                 kfree(satcu);
3408                 return -ENOMEM;
3409         }
3410         list_add_rcu(&satcu->list, &dmar_satc_units);
3411
3412         return 0;
3413 }
3414
3415 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3416 {
3417         int sp, ret;
3418         struct intel_iommu *iommu = dmaru->iommu;
3419
3420         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3421         if (ret)
3422                 goto out;
3423
3424         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3425                 pr_warn("%s: Doesn't support hardware pass through.\n",
3426                         iommu->name);
3427                 return -ENXIO;
3428         }
3429
3430         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3431         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3432                 pr_warn("%s: Doesn't support large page.\n",
3433                         iommu->name);
3434                 return -ENXIO;
3435         }
3436
3437         /*
3438          * Disable translation if already enabled prior to OS handover.
3439          */
3440         if (iommu->gcmd & DMA_GCMD_TE)
3441                 iommu_disable_translation(iommu);
3442
3443         ret = iommu_init_domains(iommu);
3444         if (ret == 0)
3445                 ret = iommu_alloc_root_entry(iommu);
3446         if (ret)
3447                 goto out;
3448
3449         intel_svm_check(iommu);
3450
3451         if (dmaru->ignored) {
3452                 /*
3453                  * we always have to disable PMRs or DMA may fail on this device
3454                  */
3455                 if (force_on)
3456                         iommu_disable_protect_mem_regions(iommu);
3457                 return 0;
3458         }
3459
3460         intel_iommu_init_qi(iommu);
3461         iommu_flush_write_buffer(iommu);
3462
3463 #ifdef CONFIG_INTEL_IOMMU_SVM
3464         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3465                 ret = intel_svm_enable_prq(iommu);
3466                 if (ret)
3467                         goto disable_iommu;
3468         }
3469 #endif
3470         ret = dmar_set_interrupt(iommu);
3471         if (ret)
3472                 goto disable_iommu;
3473
3474         iommu_set_root_entry(iommu);
3475         iommu_enable_translation(iommu);
3476
3477         iommu_disable_protect_mem_regions(iommu);
3478         return 0;
3479
3480 disable_iommu:
3481         disable_dmar_iommu(iommu);
3482 out:
3483         free_dmar_iommu(iommu);
3484         return ret;
3485 }
3486
3487 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3488 {
3489         int ret = 0;
3490         struct intel_iommu *iommu = dmaru->iommu;
3491
3492         if (!intel_iommu_enabled)
3493                 return 0;
3494         if (iommu == NULL)
3495                 return -EINVAL;
3496
3497         if (insert) {
3498                 ret = intel_iommu_add(dmaru);
3499         } else {
3500                 disable_dmar_iommu(iommu);
3501                 free_dmar_iommu(iommu);
3502         }
3503
3504         return ret;
3505 }
3506
3507 static void intel_iommu_free_dmars(void)
3508 {
3509         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3510         struct dmar_atsr_unit *atsru, *atsr_n;
3511         struct dmar_satc_unit *satcu, *satc_n;
3512
3513         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3514                 list_del(&rmrru->list);
3515                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3516                 kfree(rmrru);
3517         }
3518
3519         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3520                 list_del(&atsru->list);
3521                 intel_iommu_free_atsr(atsru);
3522         }
3523         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3524                 list_del(&satcu->list);
3525                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3526                 kfree(satcu);
3527         }
3528 }
3529
3530 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3531 {
3532         struct dmar_satc_unit *satcu;
3533         struct acpi_dmar_satc *satc;
3534         struct device *tmp;
3535         int i;
3536
3537         dev = pci_physfn(dev);
3538         rcu_read_lock();
3539
3540         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3541                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3542                 if (satc->segment != pci_domain_nr(dev->bus))
3543                         continue;
3544                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3545                         if (to_pci_dev(tmp) == dev)
3546                                 goto out;
3547         }
3548         satcu = NULL;
3549 out:
3550         rcu_read_unlock();
3551         return satcu;
3552 }
3553
3554 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3555 {
3556         int i, ret = 1;
3557         struct pci_bus *bus;
3558         struct pci_dev *bridge = NULL;
3559         struct device *tmp;
3560         struct acpi_dmar_atsr *atsr;
3561         struct dmar_atsr_unit *atsru;
3562         struct dmar_satc_unit *satcu;
3563
3564         dev = pci_physfn(dev);
3565         satcu = dmar_find_matched_satc_unit(dev);
3566         if (satcu)
3567                 /*
3568                  * This device supports ATS because it is listed in the
3569                  * SATC table. When the IOMMU is in legacy mode, ATS is
3570                  * enabled automatically by HW for devices that require
3571                  * it, so the OS must not enable ATS on this device to
3572                  * avoid duplicated TLB invalidations.
3573                  */
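                /*
                 * In other words, derived from the expression below:
                 *   atc_required && !sm_supported -> return 0 (HW enables ATS,
                 *                                    the OS must not)
                 *   any other combination         -> return 1 (OS may enable ATS)
                 */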
3574                 return !(satcu->atc_required && !sm_supported(iommu));
3575
3576         for (bus = dev->bus; bus; bus = bus->parent) {
3577                 bridge = bus->self;
3578                 /* If it's an integrated device, allow ATS */
3579                 if (!bridge)
3580                         return 1;
3581                 /* Connected via non-PCIe: no ATS */
3582                 if (!pci_is_pcie(bridge) ||
3583                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3584                         return 0;
3585                 /* If we found the root port, look it up in the ATSR */
3586                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3587                         break;
3588         }
3589
3590         rcu_read_lock();
3591         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3592                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3593                 if (atsr->segment != pci_domain_nr(dev->bus))
3594                         continue;
3595
3596                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3597                         if (tmp == &bridge->dev)
3598                                 goto out;
3599
3600                 if (atsru->include_all)
3601                         goto out;
3602         }
3603         ret = 0;
3604 out:
3605         rcu_read_unlock();
3606
3607         return ret;
3608 }
3609
3610 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3611 {
3612         int ret;
3613         struct dmar_rmrr_unit *rmrru;
3614         struct dmar_atsr_unit *atsru;
3615         struct dmar_satc_unit *satcu;
3616         struct acpi_dmar_atsr *atsr;
3617         struct acpi_dmar_reserved_memory *rmrr;
3618         struct acpi_dmar_satc *satc;
3619
3620         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3621                 return 0;
3622
3623         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3624                 rmrr = container_of(rmrru->hdr,
3625                                     struct acpi_dmar_reserved_memory, header);
3626                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3627                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3628                                 ((void *)rmrr) + rmrr->header.length,
3629                                 rmrr->segment, rmrru->devices,
3630                                 rmrru->devices_cnt);
3631                         if (ret < 0)
3632                                 return ret;
3633                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3634                         dmar_remove_dev_scope(info, rmrr->segment,
3635                                 rmrru->devices, rmrru->devices_cnt);
3636                 }
3637         }
3638
3639         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3640                 if (atsru->include_all)
3641                         continue;
3642
3643                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3644                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3645                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3646                                         (void *)atsr + atsr->header.length,
3647                                         atsr->segment, atsru->devices,
3648                                         atsru->devices_cnt);
3649                         if (ret > 0)
3650                                 break;
3651                         else if (ret < 0)
3652                                 return ret;
3653                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3654                         if (dmar_remove_dev_scope(info, atsr->segment,
3655                                         atsru->devices, atsru->devices_cnt))
3656                                 break;
3657                 }
3658         }
3659         list_for_each_entry(satcu, &dmar_satc_units, list) {
3660                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3661                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3662                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3663                                         (void *)satc + satc->header.length,
3664                                         satc->segment, satcu->devices,
3665                                         satcu->devices_cnt);
3666                         if (ret > 0)
3667                                 break;
3668                         else if (ret < 0)
3669                                 return ret;
3670                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3671                         if (dmar_remove_dev_scope(info, satc->segment,
3672                                         satcu->devices, satcu->devices_cnt))
3673                                 break;
3674                 }
3675         }
3676
3677         return 0;
3678 }
3679
3680 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3681                                        unsigned long val, void *v)
3682 {
3683         struct memory_notify *mhp = v;
3684         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3685         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3686                         mhp->nr_pages - 1);
3687
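        /*
         * Sizing note (assuming 4 KiB pages): hot-plugging a 128 MiB memory
         * block means mhp->nr_pages == 32768, so the identity map below
         * covers the inclusive range [start_vpfn, start_vpfn + 32767].
         */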
3688         switch (val) {
3689         case MEM_GOING_ONLINE:
3690                 if (iommu_domain_identity_map(si_domain,
3691                                               start_vpfn, last_vpfn)) {
3692                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3693                                 start_vpfn, last_vpfn);
3694                         return NOTIFY_BAD;
3695                 }
3696                 break;
3697
3698         case MEM_OFFLINE:
3699         case MEM_CANCEL_ONLINE:
3700                 {
3701                         struct dmar_drhd_unit *drhd;
3702                         struct intel_iommu *iommu;
3703                         LIST_HEAD(freelist);
3704
3705                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3706
3707                         rcu_read_lock();
3708                         for_each_active_iommu(iommu, drhd)
3709                                 iommu_flush_iotlb_psi(iommu, si_domain,
3710                                         start_vpfn, mhp->nr_pages,
3711                                         list_empty(&freelist), 0);
3712                         rcu_read_unlock();
3713                         put_pages_list(&freelist);
3714                 }
3715                 break;
3716         }
3717
3718         return NOTIFY_OK;
3719 }
3720
3721 static struct notifier_block intel_iommu_memory_nb = {
3722         .notifier_call = intel_iommu_memory_notifier,
3723         .priority = 0
3724 };
3725
3726 static void intel_disable_iommus(void)
3727 {
3728         struct intel_iommu *iommu = NULL;
3729         struct dmar_drhd_unit *drhd;
3730
3731         for_each_iommu(iommu, drhd)
3732                 iommu_disable_translation(iommu);
3733 }
3734
3735 void intel_iommu_shutdown(void)
3736 {
3737         struct dmar_drhd_unit *drhd;
3738         struct intel_iommu *iommu = NULL;
3739
3740         if (no_iommu || dmar_disabled)
3741                 return;
3742
3743         down_write(&dmar_global_lock);
3744
3745         /* Disable PMRs explicitly here. */
3746         for_each_iommu(iommu, drhd)
3747                 iommu_disable_protect_mem_regions(iommu);
3748
3749         /* Make sure the IOMMUs are switched off */
3750         intel_disable_iommus();
3751
3752         up_write(&dmar_global_lock);
3753 }
3754
3755 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3756 {
3757         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3758
3759         return container_of(iommu_dev, struct intel_iommu, iommu);
3760 }
3761
3762 static ssize_t version_show(struct device *dev,
3763                             struct device_attribute *attr, char *buf)
3764 {
3765         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3766         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3767         return sprintf(buf, "%d:%d\n",
3768                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3769 }
3770 static DEVICE_ATTR_RO(version);
3771
3772 static ssize_t address_show(struct device *dev,
3773                             struct device_attribute *attr, char *buf)
3774 {
3775         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776         return sprintf(buf, "%llx\n", iommu->reg_phys);
3777 }
3778 static DEVICE_ATTR_RO(address);
3779
3780 static ssize_t cap_show(struct device *dev,
3781                         struct device_attribute *attr, char *buf)
3782 {
3783         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784         return sprintf(buf, "%llx\n", iommu->cap);
3785 }
3786 static DEVICE_ATTR_RO(cap);
3787
3788 static ssize_t ecap_show(struct device *dev,
3789                          struct device_attribute *attr, char *buf)
3790 {
3791         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792         return sprintf(buf, "%llx\n", iommu->ecap);
3793 }
3794 static DEVICE_ATTR_RO(ecap);
3795
3796 static ssize_t domains_supported_show(struct device *dev,
3797                                       struct device_attribute *attr, char *buf)
3798 {
3799         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3801 }
3802 static DEVICE_ATTR_RO(domains_supported);
3803
3804 static ssize_t domains_used_show(struct device *dev,
3805                                  struct device_attribute *attr, char *buf)
3806 {
3807         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3808         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3809                                                   cap_ndoms(iommu->cap)));
3810 }
3811 static DEVICE_ATTR_RO(domains_used);
3812
3813 static struct attribute *intel_iommu_attrs[] = {
3814         &dev_attr_version.attr,
3815         &dev_attr_address.attr,
3816         &dev_attr_cap.attr,
3817         &dev_attr_ecap.attr,
3818         &dev_attr_domains_supported.attr,
3819         &dev_attr_domains_used.attr,
3820         NULL,
3821 };
3822
3823 static struct attribute_group intel_iommu_group = {
3824         .name = "intel-iommu",
3825         .attrs = intel_iommu_attrs,
3826 };
3827
3828 const struct attribute_group *intel_iommu_groups[] = {
3829         &intel_iommu_group,
3830         NULL,
3831 };
3832
3833 static inline bool has_external_pci(void)
3834 {
3835         struct pci_dev *pdev = NULL;
3836
3837         for_each_pci_dev(pdev)
3838                 if (pdev->external_facing)
3839                         return true;
3840
3841         return false;
3842 }
3843
3844 static int __init platform_optin_force_iommu(void)
3845 {
3846         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3847                 return 0;
3848
3849         if (no_iommu || dmar_disabled)
3850                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3851
3852         /*
3853          * If Intel-IOMMU is disabled by default, we will apply identity
3854          * map for all devices except those marked as being untrusted.
3855          */
3856         if (dmar_disabled)
3857                 iommu_set_default_passthrough(false);
3858
3859         dmar_disabled = 0;
3860         no_iommu = 0;
3861
3862         return 1;
3863 }
3864
3865 static int __init probe_acpi_namespace_devices(void)
3866 {
3867         struct dmar_drhd_unit *drhd;
3868         /* To avoid a -Wunused-but-set-variable warning. */
3869         struct intel_iommu *iommu __maybe_unused;
3870         struct device *dev;
3871         int i, ret = 0;
3872
3873         for_each_active_iommu(iommu, drhd) {
3874                 for_each_active_dev_scope(drhd->devices,
3875                                           drhd->devices_cnt, i, dev) {
3876                         struct acpi_device_physical_node *pn;
3877                         struct iommu_group *group;
3878                         struct acpi_device *adev;
3879
3880                         if (dev->bus != &acpi_bus_type)
3881                                 continue;
3882
3883                         adev = to_acpi_device(dev);
3884                         mutex_lock(&adev->physical_node_lock);
3885                         list_for_each_entry(pn,
3886                                             &adev->physical_node_list, node) {
3887                                 group = iommu_group_get(pn->dev);
3888                                 if (group) {
3889                                         iommu_group_put(group);
3890                                         continue;
3891                                 }
3892
3893                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
3894                                 ret = iommu_probe_device(pn->dev);
3895                                 if (ret)
3896                                         break;
3897                         }
3898                         mutex_unlock(&adev->physical_node_lock);
3899
3900                         if (ret)
3901                                 return ret;
3902                 }
3903         }
3904
3905         return 0;
3906 }
3907
3908 static __init int tboot_force_iommu(void)
3909 {
3910         if (!tboot_enabled())
3911                 return 0;
3912
3913         if (no_iommu || dmar_disabled)
3914                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3915
3916         dmar_disabled = 0;
3917         no_iommu = 0;
3918
3919         return 1;
3920 }
3921
3922 int __init intel_iommu_init(void)
3923 {
3924         int ret = -ENODEV;
3925         struct dmar_drhd_unit *drhd;
3926         struct intel_iommu *iommu;
3927
3928         /*
3929          * Intel IOMMU is required for a TXT/tboot launch or platform
3930          * opt in, so enforce that.
3931          */
3932         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3933                     platform_optin_force_iommu();
3934
3935         if (dmar_table_init()) {
3936                 if (force_on)
3937                         panic("tboot: Failed to initialize DMAR table\n");
3938                 goto out_free_dmar;
3939         }
3940
3941         if (dmar_dev_scope_init() < 0) {
3942                 if (force_on)
3943                         panic("tboot: Failed to initialize DMAR device scope\n");
3944                 goto out_free_dmar;
3945         }
3946
3947         if (!no_iommu)
3948                 intel_iommu_debugfs_init();
3949
3950         if (no_iommu || dmar_disabled) {
3951                 /*
3952                  * We exit the function here to ensure the IOMMU's remapping
3953                  * and mempool aren't set up, which means the IOMMU's PMRs
3954                  * won't be disabled via the call to init_dmars(). So disable
3955                  * them explicitly here. The PMRs were set up by tboot prior
3956                  * to calling SENTER, but the kernel is expected to reset/tear
3957                  * down the PMRs.
3958                  */
3959                 if (intel_iommu_tboot_noforce) {
3960                         for_each_iommu(iommu, drhd)
3961                                 iommu_disable_protect_mem_regions(iommu);
3962                 }
3963
3964                 /*
3965                  * Make sure the IOMMUs are switched off, even when we
3966                  * boot into a kexec kernel and the previous kernel left
3967                  * them enabled
3968                  */
3969                 intel_disable_iommus();
3970                 goto out_free_dmar;
3971         }
3972
3973         if (list_empty(&dmar_rmrr_units))
3974                 pr_info("No RMRR found\n");
3975
3976         if (list_empty(&dmar_atsr_units))
3977                 pr_info("No ATSR found\n");
3978
3979         if (list_empty(&dmar_satc_units))
3980                 pr_info("No SATC found\n");
3981
3982         init_no_remapping_devices();
3983
3984         ret = init_dmars();
3985         if (ret) {
3986                 if (force_on)
3987                         panic("tboot: Failed to initialize DMARs\n");
3988                 pr_err("Initialization failed\n");
3989                 goto out_free_dmar;
3990         }
3991
3992         init_iommu_pm_ops();
3993
3994         for_each_active_iommu(iommu, drhd) {
3995                 /*
3996                  * The flush queue implementation does not perform
3997                  * page-selective invalidations that are required for efficient
3998                  * TLB flushes in virtual environments.  The benefit of batching
3999                  * is likely to be much lower than the overhead of synchronizing
4000                  * the virtual and physical IOMMU page-tables.
4001                  */
4002                 if (cap_caching_mode(iommu->cap)) {
4003                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4004                         iommu_set_dma_strict();
4005                 }
4006                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4007                                        intel_iommu_groups,
4008                                        "%s", iommu->name);
4009                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4010         }
4011
4012         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4013         if (si_domain && !hw_pass_through)
4014                 register_memory_notifier(&intel_iommu_memory_nb);
4015
4016         if (probe_acpi_namespace_devices())
4017                 pr_warn("ACPI name space devices didn't probe correctly\n");
4018
4019         /* Finally, we enable the DMA remapping hardware. */
4020         for_each_iommu(iommu, drhd) {
4021                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4022                         iommu_enable_translation(iommu);
4023
4024                 iommu_disable_protect_mem_regions(iommu);
4025         }
4026
4027         intel_iommu_enabled = 1;
4028         dmar_register_bus_notifier();
4029         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4030
4031         return 0;
4032
4033 out_free_dmar:
4034         intel_iommu_free_dmars();
4035         return ret;
4036 }
4037
4038 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4039 {
4040         struct device_domain_info *info = opaque;
4041
4042         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4043         return 0;
4044 }
4045
4046 /*
4047  * NB - intel-iommu lacks any sort of reference counting for the users of
4048  * dependent devices.  If multiple endpoints have intersecting dependent
4049  * devices, unbinding the driver from any one of them will possibly leave
4050  * the others unable to operate.
4051  */
4052 static void domain_context_clear(struct device_domain_info *info)
4053 {
4054         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4055                 return;
4056
4057         pci_for_each_dma_alias(to_pci_dev(info->dev),
4058                                &domain_context_clear_one_cb, info);
4059 }
4060
4061 static void dmar_remove_one_dev_info(struct device *dev)
4062 {
4063         struct device_domain_info *info = dev_iommu_priv_get(dev);
4064         struct dmar_domain *domain = info->domain;
4065         struct intel_iommu *iommu = info->iommu;
4066         unsigned long flags;
4067
4068         if (!dev_is_real_dma_subdevice(info->dev)) {
4069                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4070                         intel_pasid_tear_down_entry(iommu, info->dev,
4071                                         PASID_RID2PASID, false);
4072
4073                 iommu_disable_dev_iotlb(info);
4074                 domain_context_clear(info);
4075                 intel_pasid_free_table(info->dev);
4076         }
4077
4078         spin_lock_irqsave(&domain->lock, flags);
4079         list_del(&info->link);
4080         spin_unlock_irqrestore(&domain->lock, flags);
4081
4082         domain_detach_iommu(domain, iommu);
4083         info->domain = NULL;
4084 }
4085
4086 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4087 {
4088         int adjust_width;
4089
4090         /* calculate AGAW */
4091         domain->gaw = guest_width;
4092         adjust_width = guestwidth_to_adjustwidth(guest_width);
4093         domain->agaw = width_to_agaw(adjust_width);
4094
4095         domain->iommu_coherency = false;
4096         domain->iommu_superpage = 0;
4097         domain->max_addr = 0;
4098
4099         /* always allocate the top pgd */
4100         domain->pgd = alloc_pgtable_page(domain->nid);
4101         if (!domain->pgd)
4102                 return -ENOMEM;
4103         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4104         return 0;
4105 }
4106
4107 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4108 {
4109         struct dmar_domain *dmar_domain;
4110         struct iommu_domain *domain;
4111
4112         switch (type) {
4113         case IOMMU_DOMAIN_DMA:
4114         case IOMMU_DOMAIN_DMA_FQ:
4115         case IOMMU_DOMAIN_UNMANAGED:
4116                 dmar_domain = alloc_domain(type);
4117                 if (!dmar_domain) {
4118                         pr_err("Can't allocate dmar_domain\n");
4119                         return NULL;
4120                 }
4121                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4122                         pr_err("Domain initialization failed\n");
4123                         domain_exit(dmar_domain);
4124                         return NULL;
4125                 }
4126
4127                 domain = &dmar_domain->domain;
4128                 domain->geometry.aperture_start = 0;
4129                 domain->geometry.aperture_end   =
4130                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4131                 domain->geometry.force_aperture = true;
4132
4133                 return domain;
4134         case IOMMU_DOMAIN_IDENTITY:
4135                 return &si_domain->domain;
4136         default:
4137                 return NULL;
4138         }
4139
4140         return NULL;
4141 }
4142
4143 static void intel_iommu_domain_free(struct iommu_domain *domain)
4144 {
4145         if (domain != &si_domain->domain)
4146                 domain_exit(to_dmar_domain(domain));
4147 }
4148
4149 static int prepare_domain_attach_device(struct iommu_domain *domain,
4150                                         struct device *dev)
4151 {
4152         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4153         struct intel_iommu *iommu;
4154         int addr_width;
4155
4156         iommu = device_to_iommu(dev, NULL, NULL);
4157         if (!iommu)
4158                 return -ENODEV;
4159
4160         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4161                 return -EOPNOTSUPP;
4162
4163         /* check if this iommu agaw is sufficient for max mapped address */
4164         addr_width = agaw_to_width(iommu->agaw);
4165         if (addr_width > cap_mgaw(iommu->cap))
4166                 addr_width = cap_mgaw(iommu->cap);
4167
4168         if (dmar_domain->max_addr > (1LL << addr_width)) {
4169                 dev_err(dev, "%s: iommu width (%d) is not "
4170                         "sufficient for the mapped address (%llx)\n",
4171                         __func__, addr_width, dmar_domain->max_addr);
4172                 return -EFAULT;
4173         }
4174         dmar_domain->gaw = addr_width;
4175
4176         /*
4177          * Knock out extra levels of page tables if necessary
4178          */
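        /*
         * Hedged example (AGAW encoding per the VT-d spec): if the domain was
         * sized for a 48-bit, 4-level table (agaw == 2) but this IOMMU only
         * supports 39-bit, 3-level walks (agaw == 1), one top-level table is
         * freed and dmar_domain->agaw is decremented until the two match.
         */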
4179         while (iommu->agaw < dmar_domain->agaw) {
4180                 struct dma_pte *pte;
4181
4182                 pte = dmar_domain->pgd;
4183                 if (dma_pte_present(pte)) {
4184                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4185                         free_pgtable_page(pte);
4186                 }
4187                 dmar_domain->agaw--;
4188         }
4189
4190         return 0;
4191 }
4192
4193 static int intel_iommu_attach_device(struct iommu_domain *domain,
4194                                      struct device *dev)
4195 {
4196         int ret;
4197
4198         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4199             device_is_rmrr_locked(dev)) {
4200                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4201                 return -EPERM;
4202         }
4203
4204         /* normally dev is not mapped */
4205         if (unlikely(domain_context_mapped(dev))) {
4206                 struct device_domain_info *info = dev_iommu_priv_get(dev);
4207
4208                 if (info->domain)
4209                         dmar_remove_one_dev_info(dev);
4210         }
4211
4212         ret = prepare_domain_attach_device(domain, dev);
4213         if (ret)
4214                 return ret;
4215
4216         return domain_add_dev_info(to_dmar_domain(domain), dev);
4217 }
4218
4219 static void intel_iommu_detach_device(struct iommu_domain *domain,
4220                                       struct device *dev)
4221 {
4222         dmar_remove_one_dev_info(dev);
4223 }
4224
4225 static int intel_iommu_map(struct iommu_domain *domain,
4226                            unsigned long iova, phys_addr_t hpa,
4227                            size_t size, int iommu_prot, gfp_t gfp)
4228 {
4229         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4230         u64 max_addr;
4231         int prot = 0;
4232
4233         if (iommu_prot & IOMMU_READ)
4234                 prot |= DMA_PTE_READ;
4235         if (iommu_prot & IOMMU_WRITE)
4236                 prot |= DMA_PTE_WRITE;
4237         if (dmar_domain->set_pte_snp)
4238                 prot |= DMA_PTE_SNP;
4239
4240         max_addr = iova + size;
4241         if (dmar_domain->max_addr < max_addr) {
4242                 u64 end;
4243
4244                 /* check if minimum agaw is sufficient for mapped address */
4245                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4246                 if (end < max_addr) {
4247                         pr_err("%s: iommu width (%d) is not "
4248                                "sufficient for the mapped address (%llx)\n",
4249                                __func__, dmar_domain->gaw, max_addr);
4250                         return -EFAULT;
4251                 }
4252                 dmar_domain->max_addr = max_addr;
4253         }
4254         /* Round size up to the next multiple of PAGE_SIZE if it, combined
4255            with the low bits of hpa, would take us onto the next page. */
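        /*
         * Worked example (assuming 4 KiB VT-d pages): hpa == 0x2800 with
         * size == 0x1000 ends at 0x37ff and so crosses the 0x3000 page
         * boundary; aligned_nrpages() therefore yields 2 pages rather than 1.
         */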
4256         size = aligned_nrpages(hpa, size);
4257         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4258                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4259 }
4260
4261 static int intel_iommu_map_pages(struct iommu_domain *domain,
4262                                  unsigned long iova, phys_addr_t paddr,
4263                                  size_t pgsize, size_t pgcount,
4264                                  int prot, gfp_t gfp, size_t *mapped)
4265 {
4266         unsigned long pgshift = __ffs(pgsize);
4267         size_t size = pgcount << pgshift;
4268         int ret;
4269
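        /*
         * Usage note (illustrative numbers): callers pass one native VT-d
         * page size plus a count, e.g. pgsize == SZ_2M and pgcount == 4 maps
         * 8 MiB; pgshift is then 21 and size == 4 << 21.
         */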
4270         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4271                 return -EINVAL;
4272
4273         if (!IS_ALIGNED(iova | paddr, pgsize))
4274                 return -EINVAL;
4275
4276         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4277         if (!ret && mapped)
4278                 *mapped = size;
4279
4280         return ret;
4281 }
4282
4283 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4284                                 unsigned long iova, size_t size,
4285                                 struct iommu_iotlb_gather *gather)
4286 {
4287         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4288         unsigned long start_pfn, last_pfn;
4289         int level = 0;
4290
4291         /* Cope with horrid API which requires us to unmap more than the
4292            size argument if it happens to be a large-page mapping. */
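        /* Example of that rounding: if iova falls inside a 2 MiB superpage,
           pfn_to_dma_pte() returns level 2 and size below is bumped to
           VTD_PAGE_SIZE << 9 == 2 MiB, even if the caller asked for 4 KiB. */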
4293         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4294
4295         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4296                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4297
4298         start_pfn = iova >> VTD_PAGE_SHIFT;
4299         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4300
4301         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4302
4303         if (dmar_domain->max_addr == iova + size)
4304                 dmar_domain->max_addr = iova;
4305
4306         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4307
4308         return size;
4309 }
4310
4311 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4312                                       unsigned long iova,
4313                                       size_t pgsize, size_t pgcount,
4314                                       struct iommu_iotlb_gather *gather)
4315 {
4316         unsigned long pgshift = __ffs(pgsize);
4317         size_t size = pgcount << pgshift;
4318
4319         return intel_iommu_unmap(domain, iova, size, gather);
4320 }
4321
4322 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4323                                  struct iommu_iotlb_gather *gather)
4324 {
4325         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4326         unsigned long iova_pfn = IOVA_PFN(gather->start);
4327         size_t size = gather->end - gather->start;
4328         struct iommu_domain_info *info;
4329         unsigned long start_pfn;
4330         unsigned long nrpages;
4331         unsigned long i;
4332
4333         nrpages = aligned_nrpages(gather->start, size);
4334         start_pfn = mm_to_dma_pfn(iova_pfn);
4335
4336         xa_for_each(&dmar_domain->iommu_array, i, info)
4337                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4338                                       start_pfn, nrpages,
4339                                       list_empty(&gather->freelist), 0);
4340
4341         put_pages_list(&gather->freelist);
4342 }
4343
4344 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4345                                             dma_addr_t iova)
4346 {
4347         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4348         struct dma_pte *pte;
4349         int level = 0;
4350         u64 phys = 0;
4351
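        /*
         * Hedged worked example: for an IOVA covered by a 2 MiB superpage
         * (level 2), the low 21 bits of the IOVA are the offset into that
         * page, so a PTE pointing at 0x40000000 plus an IOVA offset of
         * 0x1234 yields a physical address of 0x40001234.
         */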
4352         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4353         if (pte && dma_pte_present(pte))
4354                 phys = dma_pte_addr(pte) +
4355                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4356                                                 VTD_PAGE_SHIFT) - 1));
4357
4358         return phys;
4359 }
4360
4361 static bool domain_support_force_snooping(struct dmar_domain *domain)
4362 {
4363         struct device_domain_info *info;
4364         bool support = true;
4365
4366         assert_spin_locked(&domain->lock);
4367         list_for_each_entry(info, &domain->devices, link) {
4368                 if (!ecap_sc_support(info->iommu->ecap)) {
4369                         support = false;
4370                         break;
4371                 }
4372         }
4373
4374         return support;
4375 }
4376
4377 static void domain_set_force_snooping(struct dmar_domain *domain)
4378 {
4379         struct device_domain_info *info;
4380
4381         assert_spin_locked(&domain->lock);
4382         /*
4383          * Second-level page tables support per-PTE snoop control. The
4384          * iommu_map() interface will handle this by setting the SNP bit.
4385          */
4386         if (!domain_use_first_level(domain)) {
4387                 domain->set_pte_snp = true;
4388                 return;
4389         }
4390
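        /*
         * First-level tables have no per-PTE snoop bit, so for such domains
         * snoop control is instead programmed into each attached device's
         * PASID table entry via intel_pasid_setup_page_snoop_control() below.
         */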
4391         list_for_each_entry(info, &domain->devices, link)
4392                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4393                                                      PASID_RID2PASID);
4394 }
4395
4396 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4397 {
4398         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4399         unsigned long flags;
4400
4401         if (dmar_domain->force_snooping)
4402                 return true;
4403
4404         spin_lock_irqsave(&dmar_domain->lock, flags);
4405         if (!domain_support_force_snooping(dmar_domain)) {
4406                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4407                 return false;
4408         }
4409
4410         domain_set_force_snooping(dmar_domain);
4411         dmar_domain->force_snooping = true;
4412         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4413
4414         return true;
4415 }
4416
4417 static bool intel_iommu_capable(enum iommu_cap cap)
4418 {
4419         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4420                 return true;
4421         if (cap == IOMMU_CAP_INTR_REMAP)
4422                 return irq_remapping_enabled == 1;
4423         if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4424                 return dmar_platform_optin();
4425
4426         return false;
4427 }
4428
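/*
 * Allocate and initialize the per-device device_domain_info: record the
 * bus/devfn/segment under which the device is visible to its IOMMU, probe
 * for ATS, PASID and PRI support, and return the IOMMU instance to the core.
 */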
4429 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4430 {
4431         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4432         struct device_domain_info *info;
4433         struct intel_iommu *iommu;
4434         u8 bus, devfn;
4435
4436         iommu = device_to_iommu(dev, &bus, &devfn);
4437         if (!iommu)
4438                 return ERR_PTR(-ENODEV);
4439
4440         info = kzalloc(sizeof(*info), GFP_KERNEL);
4441         if (!info)
4442                 return ERR_PTR(-ENOMEM);
4443
4444         if (dev_is_real_dma_subdevice(dev)) {
4445                 info->bus = pdev->bus->number;
4446                 info->devfn = pdev->devfn;
4447                 info->segment = pci_domain_nr(pdev->bus);
4448         } else {
4449                 info->bus = bus;
4450                 info->devfn = devfn;
4451                 info->segment = iommu->segment;
4452         }
4453
4454         info->dev = dev;
4455         info->iommu = iommu;
4456         if (dev_is_pci(dev)) {
4457                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4458                     pci_ats_supported(pdev) &&
4459                     dmar_ats_supported(pdev, iommu))
4460                         info->ats_supported = 1;
4461
4462                 if (sm_supported(iommu)) {
4463                         if (pasid_supported(iommu)) {
4464                                 int features = pci_pasid_features(pdev);
4465
4466                                 if (features >= 0)
4467                                         info->pasid_supported = features | 1;
4468                         }
4469
4470                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4471                             pci_pri_supported(pdev))
4472                                 info->pri_supported = 1;
4473                 }
4474         }
4475
4476         dev_iommu_priv_set(dev, info);
4477
4478         return &iommu->iommu;
4479 }
4480
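/* Undo probe_device: detach the device and free its per-device info. */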
4481 static void intel_iommu_release_device(struct device *dev)
4482 {
4483         struct device_domain_info *info = dev_iommu_priv_get(dev);
4484
4485         dmar_remove_one_dev_info(dev);
4486         dev_iommu_priv_set(dev, NULL);
4487         kfree(info);
4488         set_dma_ops(dev, NULL);
4489 }
4490
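/*
 * Clear any stale DMA ops and let the DMA-IOMMU layer install the proper
 * ones for the device's current domain.
 */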
4491 static void intel_iommu_probe_finalize(struct device *dev)
4492 {
4493         set_dma_ops(dev, NULL);
4494         iommu_setup_dma_ops(dev, 0, U64_MAX);
4495 }
4496
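/*
 * Report the device's reserved regions: any RMRR ranges that target it
 * (as direct-mapped regions), the legacy ISA window when the floppy
 * workaround is enabled, and the IOAPIC/MSI range.
 */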
4497 static void intel_iommu_get_resv_regions(struct device *device,
4498                                          struct list_head *head)
4499 {
4500         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4501         struct iommu_resv_region *reg;
4502         struct dmar_rmrr_unit *rmrr;
4503         struct device *i_dev;
4504         int i;
4505
4506         down_read(&dmar_global_lock);
4507         for_each_rmrr_units(rmrr) {
4508                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4509                                           i, i_dev) {
4510                         struct iommu_resv_region *resv;
4511                         enum iommu_resv_type type;
4512                         size_t length;
4513
4514                         if (i_dev != device &&
4515                             !is_downstream_to_pci_bridge(device, i_dev))
4516                                 continue;
4517
4518                         length = rmrr->end_address - rmrr->base_address + 1;
4519
4520                         type = device_rmrr_is_relaxable(device) ?
4521                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4522
4523                         resv = iommu_alloc_resv_region(rmrr->base_address,
4524                                                        length, prot, type);
4525                         if (!resv)
4526                                 break;
4527
4528                         list_add_tail(&resv->list, head);
4529                 }
4530         }
4531         up_read(&dmar_global_lock);
4532
4533 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4534         if (dev_is_pci(device)) {
4535                 struct pci_dev *pdev = to_pci_dev(device);
4536
4537                 if (IS_ISA_DEVICE(pdev)) {
4538                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4539                                                    IOMMU_RESV_DIRECT_RELAXABLE);
4540                         if (reg)
4541                                 list_add_tail(&reg->list, head);
4542                 }
4543         }
4544 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4545
4546         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4547                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4548                                       0, IOMMU_RESV_MSI);
4549         if (!reg)
4550                 return;
4551         list_add_tail(&reg->list, head);
4552 }
4553
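/*
 * Enable PASID support for the device: set the PASID-enable bit in its
 * context entry (flushing the context cache if the bit was not already set)
 * and, if not done yet, enable the PASID/ATS capabilities in the endpoint
 * itself.
 */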
4554 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4555 {
4556         struct device_domain_info *info = dev_iommu_priv_get(dev);
4557         struct context_entry *context;
4558         struct dmar_domain *domain;
4559         u64 ctx_lo;
4560         int ret;
4561
4562         domain = info->domain;
4563         if (!domain)
4564                 return -EINVAL;
4565
4566         spin_lock(&iommu->lock);
4567         ret = -EINVAL;
4568         if (!info->pasid_supported)
4569                 goto out;
4570
4571         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4572         if (WARN_ON(!context))
4573                 goto out;
4574
4575         ctx_lo = context[0].lo;
4576
4577         if (!(ctx_lo & CONTEXT_PASIDE)) {
4578                 ctx_lo |= CONTEXT_PASIDE;
4579                 context[0].lo = ctx_lo;
4580                 wmb();
4581                 iommu->flush.flush_context(iommu,
4582                                            domain_id_iommu(domain, iommu),
4583                                            PCI_DEVID(info->bus, info->devfn),
4584                                            DMA_CCMD_MASK_NOBIT,
4585                                            DMA_CCMD_DEVICE_INVL);
4586         }
4587
4588         /* Enable PASID support in the device, if it wasn't already */
4589         if (!info->pasid_enabled)
4590                 iommu_enable_dev_iotlb(info);
4591
4592         ret = 0;
4593
4594  out:
4595         spin_unlock(&iommu->lock);
4596
4597         return ret;
4598 }
4599
4600 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4601 {
4602         if (dev_is_pci(dev))
4603                 return pci_device_group(dev);
4604         return generic_device_group(dev);
4605 }
4606
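/*
 * Enable Shared Virtual Addressing for the device. The IOMMU must be
 * SVM-capable and the device must have PASID, PRI and ATS enabled; the
 * device is then added to the IOMMU's I/O page fault queue and the generic
 * I/O page fault handler is registered for it.
 */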
4607 static int intel_iommu_enable_sva(struct device *dev)
4608 {
4609         struct device_domain_info *info = dev_iommu_priv_get(dev);
4610         struct intel_iommu *iommu;
4611         int ret;
4612
4613         if (!info || dmar_disabled)
4614                 return -EINVAL;
4615
4616         iommu = info->iommu;
4617         if (!iommu)
4618                 return -EINVAL;
4619
4620         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4621                 return -ENODEV;
4622
4623         if (intel_iommu_enable_pasid(iommu, dev))
4624                 return -ENODEV;
4625
4626         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4627                 return -EINVAL;
4628
4629         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4630         if (!ret)
4631                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4632
4633         return ret;
4634 }
4635
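/*
 * Tear down SVA for the device: unregister the fault handler and remove the
 * device from the IOMMU's I/O page fault queue.
 */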
4636 static int intel_iommu_disable_sva(struct device *dev)
4637 {
4638         struct device_domain_info *info = dev_iommu_priv_get(dev);
4639         struct intel_iommu *iommu = info->iommu;
4640         int ret;
4641
4642         ret = iommu_unregister_device_fault_handler(dev);
4643         if (!ret)
4644                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4645
4646         return ret;
4647 }
4648
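/* I/O page faults can only be enabled on devices that support PRI. */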
4649 static int intel_iommu_enable_iopf(struct device *dev)
4650 {
4651         struct device_domain_info *info = dev_iommu_priv_get(dev);
4652
4653         if (info && info->pri_supported)
4654                 return 0;
4655
4656         return -ENODEV;
4657 }
4658
4659 static int
4660 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4661 {
4662         switch (feat) {
4663         case IOMMU_DEV_FEAT_IOPF:
4664                 return intel_iommu_enable_iopf(dev);
4665
4666         case IOMMU_DEV_FEAT_SVA:
4667                 return intel_iommu_enable_sva(dev);
4668
4669         default:
4670                 return -ENODEV;
4671         }
4672 }
4673
4674 static int
4675 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4676 {
4677         switch (feat) {
4678         case IOMMU_DEV_FEAT_IOPF:
4679                 return 0;
4680
4681         case IOMMU_DEV_FEAT_SVA:
4682                 return intel_iommu_disable_sva(dev);
4683
4684         default:
4685                 return -ENODEV;
4686         }
4687 }
4688
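/*
 * Defer the domain attach when the IOMMU came up with translation already
 * enabled (e.g. page tables inherited from a previous kernel) and no domain
 * has been assigned to the device yet.
 */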
4689 static bool intel_iommu_is_attach_deferred(struct device *dev)
4690 {
4691         struct device_domain_info *info = dev_iommu_priv_get(dev);
4692
4693         return translation_pre_enabled(info->iommu) && !info->domain;
4694 }
4695
4696 /*
4697  * Check that the device does not live on an external-facing PCI port that is
4698  * marked as untrusted. Quirks must not be applied to such devices, since a
4699  * quirk could allow them to bypass the IOMMU restrictions.
4700  */
4701 static bool risky_device(struct pci_dev *pdev)
4702 {
4703         if (pdev->untrusted) {
4704                 pci_info(pdev,
4705                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4706                          pdev->vendor, pdev->device);
4707                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4708                 return true;
4709         }
4710         return false;
4711 }
4712
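/*
 * Notify each IOMMU serving the domain that the given IOVA range has just
 * been mapped, so that any invalidation or write-buffer flush the hardware
 * requires for new mappings is issued.
 */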
4713 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4714                                        unsigned long iova, size_t size)
4715 {
4716         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4717         unsigned long pages = aligned_nrpages(iova, size);
4718         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4719         struct iommu_domain_info *info;
4720         unsigned long i;
4721
4722         xa_for_each(&dmar_domain->iommu_array, i, info)
4723                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4724 }
4725
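/* Intel VT-d implementation of the generic IOMMU API. */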
4726 const struct iommu_ops intel_iommu_ops = {
4727         .capable                = intel_iommu_capable,
4728         .domain_alloc           = intel_iommu_domain_alloc,
4729         .probe_device           = intel_iommu_probe_device,
4730         .probe_finalize         = intel_iommu_probe_finalize,
4731         .release_device         = intel_iommu_release_device,
4732         .get_resv_regions       = intel_iommu_get_resv_regions,
4733         .device_group           = intel_iommu_device_group,
4734         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4735         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4736         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4737         .def_domain_type        = device_def_domain_type,
4738         .pgsize_bitmap          = SZ_4K,
4739 #ifdef CONFIG_INTEL_IOMMU_SVM
4740         .sva_bind               = intel_svm_bind,
4741         .sva_unbind             = intel_svm_unbind,
4742         .sva_get_pasid          = intel_svm_get_pasid,
4743         .page_response          = intel_svm_page_response,
4744 #endif
4745         .default_domain_ops = &(const struct iommu_domain_ops) {
4746                 .attach_dev             = intel_iommu_attach_device,
4747                 .detach_dev             = intel_iommu_detach_device,
4748                 .map_pages              = intel_iommu_map_pages,
4749                 .unmap_pages            = intel_iommu_unmap_pages,
4750                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4751                 .flush_iotlb_all        = intel_flush_iotlb_all,
4752                 .iotlb_sync             = intel_iommu_tlb_sync,
4753                 .iova_to_phys           = intel_iommu_iova_to_phys,
4754                 .free                   = intel_iommu_domain_free,
4755                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4756         }
4757 };
4758
4759 static void quirk_iommu_igfx(struct pci_dev *dev)
4760 {
4761         if (risky_device(dev))
4762                 return;
4763
4764         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4765         dmar_map_gfx = 0;
4766 }
4767
4768 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4769 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4770 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4771 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4772 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4773 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4774 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4775 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4776
4777 /* Broadwell igfx malfunctions with dmar */
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4785 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4786 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4802
4803 static void quirk_iommu_rwbf(struct pci_dev *dev)
4804 {
4805         if (risky_device(dev))
4806                 return;
4807
4808         /*
4809          * Mobile 4 Series Chipset neglects to set RWBF capability,
4810          * but needs it. Same seems to hold for the desktop versions.
4811          */
4812         pci_info(dev, "Forcing write-buffer flush capability\n");
4813         rwbf_quirk = 1;
4814 }
4815
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4823
4824 #define GGC 0x52
4825 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4826 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4827 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4828 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4829 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4830 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4831 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4832 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4833
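/*
 * Calpella/Ironlake quirk: if the BIOS has not allocated a VT-enabled shadow
 * GTT (the GGC check below), disable the IOMMU for graphics; otherwise force
 * strict IOTLB flushing, since batched flushing requires the graphics device
 * to be idle.
 */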
4834 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4835 {
4836         unsigned short ggc;
4837
4838         if (risky_device(dev))
4839                 return;
4840
4841         if (pci_read_config_word(dev, GGC, &ggc))
4842                 return;
4843
4844         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4845                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4846                 dmar_map_gfx = 0;
4847         } else if (dmar_map_gfx) {
4848                 /* we have to ensure the gfx device is idle before we flush */
4849                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4850                 iommu_set_dma_strict();
4851         }
4852 }
4853 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4854 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4855 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4856 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4857
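/*
 * For the integrated graphics devices matched below, skip disabling DMA
 * translation on the graphics-dedicated DMAR unit (iommu_skip_te_disable);
 * clearing the TE bit on these parts is reported to cause problems.
 */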
4858 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4859 {
4860         unsigned short ver;
4861
4862         if (!IS_GFX_DEVICE(dev))
4863                 return;
4864
4865         ver = (dev->device >> 8) & 0xff;
4866         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4867             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4868             ver != 0x9a && ver != 0xa7)
4869                 return;
4870
4871         if (risky_device(dev))
4872                 return;
4873
4874         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4875         iommu_skip_te_disable = 1;
4876 }
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4878
4879 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4880    ISOCH DMAR unit for the Azalia sound device, but not give it any
4881    TLB entries, which causes it to deadlock. Check for that.  We do
4882    this in a function called from init_dmars(), instead of in a PCI
4883    quirk, because we don't want to print the obnoxious "BIOS broken"
4884    message if VT-d is actually disabled.
4885 */
4886 static void __init check_tylersburg_isoch(void)
4887 {
4888         struct pci_dev *pdev;
4889         uint32_t vtisochctrl;
4890
4891         /* If there's no Azalia in the system anyway, forget it. */
4892         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4893         if (!pdev)
4894                 return;
4895
4896         if (risky_device(pdev)) {
4897                 pci_dev_put(pdev);
4898                 return;
4899         }
4900
4901         pci_dev_put(pdev);
4902
4903         /* System Management Registers. Might be hidden, in which case
4904            we can't do the sanity check. But that's OK, because the
4905            known-broken BIOSes _don't_ actually hide it, so far. */
4906         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4907         if (!pdev)
4908                 return;
4909
4910         if (risky_device(pdev)) {
4911                 pci_dev_put(pdev);
4912                 return;
4913         }
4914
4915         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4916                 pci_dev_put(pdev);
4917                 return;
4918         }
4919
4920         pci_dev_put(pdev);
4921
4922         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4923         if (vtisochctrl & 1)
4924                 return;
4925
4926         /* Drop all bits other than the number of TLB entries */
4927         vtisochctrl &= 0x1c;
4928
4929         /* If we have the recommended number of TLB entries (16), fine. */
4930         if (vtisochctrl == 0x10)
4931                 return;
4932
4933         /* Zero TLB entries? Warn and fall back to identity-mapping the Azalia device. */
4934         if (!vtisochctrl) {
4935                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4936                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4937                      dmi_get_system_info(DMI_BIOS_VENDOR),
4938                      dmi_get_system_info(DMI_BIOS_VERSION),
4939                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4940                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4941                 return;
4942         }
4943
4944         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4945                vtisochctrl);
4946 }