64d30895a4c8f4eb652721598729cfe0970ca075
[platform/kernel/linux-starfive.git] / drivers / iommu / intel / iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dma-iommu.h>
19 #include <linux/dmi.h>
20 #include <linux/intel-svm.h>
21 #include <linux/memory.h>
22 #include <linux/pci.h>
23 #include <linux/pci-ats.h>
24 #include <linux/spinlock.h>
25 #include <linux/syscore_ops.h>
26 #include <linux/tboot.h>
27
28 #include "iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33
34 #define ROOT_SIZE               VTD_PAGE_SIZE
35 #define CONTEXT_SIZE            VTD_PAGE_SIZE
36
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42 #define IOAPIC_RANGE_START      (0xfee00000)
43 #define IOAPIC_RANGE_END        (0xfeefffff)
44 #define IOVA_START_ADDR         (0x1000)
45
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
57                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN          (1)
62
63 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
64
65 /* page table handling */
66 #define LEVEL_STRIDE            (9)
67 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
68
69 static inline int agaw_to_level(int agaw)
70 {
71         return agaw + 2;
72 }
73
74 static inline int agaw_to_width(int agaw)
75 {
76         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77 }
78
79 static inline int width_to_agaw(int width)
80 {
81         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82 }
83
84 static inline unsigned int level_to_offset_bits(int level)
85 {
86         return (level - 1) * LEVEL_STRIDE;
87 }
88
89 static inline int pfn_level_offset(u64 pfn, int level)
90 {
91         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92 }
93
94 static inline u64 level_mask(int level)
95 {
96         return -1ULL << level_to_offset_bits(level);
97 }
98
99 static inline u64 level_size(int level)
100 {
101         return 1ULL << level_to_offset_bits(level);
102 }
103
104 static inline u64 align_to_level(u64 pfn, int level)
105 {
106         return (pfn + level_size(level) - 1) & level_mask(level);
107 }
108
109 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110 {
111         return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112 }
113
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
116 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117 {
118         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119 }
120 static inline unsigned long page_to_dma_pfn(struct page *pg)
121 {
122         return mm_to_dma_pfn(page_to_pfn(pg));
123 }
124 static inline unsigned long virt_to_dma_pfn(void *p)
125 {
126         return page_to_dma_pfn(virt_to_page(p));
127 }
128
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
146 static phys_addr_t root_entry_lctp(struct root_entry *re)
147 {
148         if (!(re->lo & 1))
149                 return 0;
150
151         return re->lo & VTD_PAGE_MASK;
152 }
153
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
158 static phys_addr_t root_entry_uctp(struct root_entry *re)
159 {
160         if (!(re->hi & 1))
161                 return 0;
162
163         return re->hi & VTD_PAGE_MASK;
164 }
165
166 static inline void context_set_present(struct context_entry *context)
167 {
168         context->lo |= 1;
169 }
170
171 static inline void context_set_fault_enable(struct context_entry *context)
172 {
173         context->lo &= (((u64)-1) << 2) | 1;
174 }
175
176 static inline void context_set_translation_type(struct context_entry *context,
177                                                 unsigned long value)
178 {
179         context->lo &= (((u64)-1) << 4) | 3;
180         context->lo |= (value & 3) << 2;
181 }
182
183 static inline void context_set_address_root(struct context_entry *context,
184                                             unsigned long value)
185 {
186         context->lo &= ~VTD_PAGE_MASK;
187         context->lo |= value & VTD_PAGE_MASK;
188 }
189
190 static inline void context_set_address_width(struct context_entry *context,
191                                              unsigned long value)
192 {
193         context->hi |= value & 7;
194 }
195
196 static inline void context_set_domain_id(struct context_entry *context,
197                                          unsigned long value)
198 {
199         context->hi |= (value & ((1 << 16) - 1)) << 8;
200 }
201
202 static inline int context_domain_id(struct context_entry *c)
203 {
204         return((c->hi >> 8) & 0xffff);
205 }
206
207 static inline void context_clear_entry(struct context_entry *context)
208 {
209         context->lo = 0;
210         context->hi = 0;
211 }
212
213 static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
214 {
215         if (!iommu->copied_tables)
216                 return false;
217
218         return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
219 }
220
221 static inline void
222 set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
223 {
224         set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
225 }
226
227 static inline void
228 clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
229 {
230         clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
231 }
232
233 /*
234  * This domain is a statically identity mapping domain.
235  *      1. This domain creats a static 1:1 mapping to all usable memory.
236  *      2. It maps to each iommu if successful.
237  *      3. Each iommu mapps to this domain if successful.
238  */
239 static struct dmar_domain *si_domain;
240 static int hw_pass_through = 1;
241
242 struct dmar_rmrr_unit {
243         struct list_head list;          /* list of rmrr units   */
244         struct acpi_dmar_header *hdr;   /* ACPI header          */
245         u64     base_address;           /* reserved base address*/
246         u64     end_address;            /* reserved end address */
247         struct dmar_dev_scope *devices; /* target devices */
248         int     devices_cnt;            /* target device count */
249 };
250
251 struct dmar_atsr_unit {
252         struct list_head list;          /* list of ATSR units */
253         struct acpi_dmar_header *hdr;   /* ACPI header */
254         struct dmar_dev_scope *devices; /* target devices */
255         int devices_cnt;                /* target device count */
256         u8 include_all:1;               /* include all ports */
257 };
258
259 struct dmar_satc_unit {
260         struct list_head list;          /* list of SATC units */
261         struct acpi_dmar_header *hdr;   /* ACPI header */
262         struct dmar_dev_scope *devices; /* target devices */
263         struct intel_iommu *iommu;      /* the corresponding iommu */
264         int devices_cnt;                /* target device count */
265         u8 atc_required:1;              /* ATS is required */
266 };
267
268 static LIST_HEAD(dmar_atsr_units);
269 static LIST_HEAD(dmar_rmrr_units);
270 static LIST_HEAD(dmar_satc_units);
271
272 #define for_each_rmrr_units(rmrr) \
273         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
274
275 static void dmar_remove_one_dev_info(struct device *dev);
276
277 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
278 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
279
280 int intel_iommu_enabled = 0;
281 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
282
283 static int dmar_map_gfx = 1;
284 static int intel_iommu_superpage = 1;
285 static int iommu_identity_mapping;
286 static int iommu_skip_te_disable;
287
288 #define IDENTMAP_GFX            2
289 #define IDENTMAP_AZALIA         4
290
291 const struct iommu_ops intel_iommu_ops;
292
293 static bool translation_pre_enabled(struct intel_iommu *iommu)
294 {
295         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
296 }
297
298 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
299 {
300         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
301 }
302
303 static void init_translation_status(struct intel_iommu *iommu)
304 {
305         u32 gsts;
306
307         gsts = readl(iommu->reg + DMAR_GSTS_REG);
308         if (gsts & DMA_GSTS_TES)
309                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
310 }
311
312 static int __init intel_iommu_setup(char *str)
313 {
314         if (!str)
315                 return -EINVAL;
316
317         while (*str) {
318                 if (!strncmp(str, "on", 2)) {
319                         dmar_disabled = 0;
320                         pr_info("IOMMU enabled\n");
321                 } else if (!strncmp(str, "off", 3)) {
322                         dmar_disabled = 1;
323                         no_platform_optin = 1;
324                         pr_info("IOMMU disabled\n");
325                 } else if (!strncmp(str, "igfx_off", 8)) {
326                         dmar_map_gfx = 0;
327                         pr_info("Disable GFX device mapping\n");
328                 } else if (!strncmp(str, "forcedac", 8)) {
329                         pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
330                         iommu_dma_forcedac = true;
331                 } else if (!strncmp(str, "strict", 6)) {
332                         pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
333                         iommu_set_dma_strict();
334                 } else if (!strncmp(str, "sp_off", 6)) {
335                         pr_info("Disable supported super page\n");
336                         intel_iommu_superpage = 0;
337                 } else if (!strncmp(str, "sm_on", 5)) {
338                         pr_info("Enable scalable mode if hardware supports\n");
339                         intel_iommu_sm = 1;
340                 } else if (!strncmp(str, "sm_off", 6)) {
341                         pr_info("Scalable mode is disallowed\n");
342                         intel_iommu_sm = 0;
343                 } else if (!strncmp(str, "tboot_noforce", 13)) {
344                         pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
345                         intel_iommu_tboot_noforce = 1;
346                 } else {
347                         pr_notice("Unknown option - '%s'\n", str);
348                 }
349
350                 str += strcspn(str, ",");
351                 while (*str == ',')
352                         str++;
353         }
354
355         return 1;
356 }
357 __setup("intel_iommu=", intel_iommu_setup);
358
359 void *alloc_pgtable_page(int node)
360 {
361         struct page *page;
362         void *vaddr = NULL;
363
364         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
365         if (page)
366                 vaddr = page_address(page);
367         return vaddr;
368 }
369
370 void free_pgtable_page(void *vaddr)
371 {
372         free_page((unsigned long)vaddr);
373 }
374
375 static inline int domain_type_is_si(struct dmar_domain *domain)
376 {
377         return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
378 }
379
380 static inline bool domain_use_first_level(struct dmar_domain *domain)
381 {
382         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
383 }
384
385 static inline int domain_pfn_supported(struct dmar_domain *domain,
386                                        unsigned long pfn)
387 {
388         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
389
390         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
391 }
392
393 /*
394  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
395  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
396  * the returned SAGAW.
397  */
398 static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
399 {
400         unsigned long fl_sagaw, sl_sagaw;
401
402         fl_sagaw = BIT(2) | (cap_fl1gp_support(iommu->cap) ? BIT(3) : 0);
403         sl_sagaw = cap_sagaw(iommu->cap);
404
405         /* Second level only. */
406         if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
407                 return sl_sagaw;
408
409         /* First level only. */
410         if (!ecap_slts(iommu->ecap))
411                 return fl_sagaw;
412
413         return fl_sagaw & sl_sagaw;
414 }
415
416 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
417 {
418         unsigned long sagaw;
419         int agaw;
420
421         sagaw = __iommu_calculate_sagaw(iommu);
422         for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
423                 if (test_bit(agaw, &sagaw))
424                         break;
425         }
426
427         return agaw;
428 }
429
430 /*
431  * Calculate max SAGAW for each iommu.
432  */
433 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
434 {
435         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
436 }
437
438 /*
439  * calculate agaw for each iommu.
440  * "SAGAW" may be different across iommus, use a default agaw, and
441  * get a supported less agaw for iommus that don't support the default agaw.
442  */
443 int iommu_calculate_agaw(struct intel_iommu *iommu)
444 {
445         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
446 }
447
448 static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
449 {
450         return sm_supported(iommu) ?
451                         ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
452 }
453
454 static void domain_update_iommu_coherency(struct dmar_domain *domain)
455 {
456         struct iommu_domain_info *info;
457         struct dmar_drhd_unit *drhd;
458         struct intel_iommu *iommu;
459         bool found = false;
460         unsigned long i;
461
462         domain->iommu_coherency = true;
463         xa_for_each(&domain->iommu_array, i, info) {
464                 found = true;
465                 if (!iommu_paging_structure_coherency(info->iommu)) {
466                         domain->iommu_coherency = false;
467                         break;
468                 }
469         }
470         if (found)
471                 return;
472
473         /* No hardware attached; use lowest common denominator */
474         rcu_read_lock();
475         for_each_active_iommu(iommu, drhd) {
476                 if (!iommu_paging_structure_coherency(iommu)) {
477                         domain->iommu_coherency = false;
478                         break;
479                 }
480         }
481         rcu_read_unlock();
482 }
483
484 static int domain_update_iommu_superpage(struct dmar_domain *domain,
485                                          struct intel_iommu *skip)
486 {
487         struct dmar_drhd_unit *drhd;
488         struct intel_iommu *iommu;
489         int mask = 0x3;
490
491         if (!intel_iommu_superpage)
492                 return 0;
493
494         /* set iommu_superpage to the smallest common denominator */
495         rcu_read_lock();
496         for_each_active_iommu(iommu, drhd) {
497                 if (iommu != skip) {
498                         if (domain && domain_use_first_level(domain)) {
499                                 if (!cap_fl1gp_support(iommu->cap))
500                                         mask = 0x1;
501                         } else {
502                                 mask &= cap_super_page_val(iommu->cap);
503                         }
504
505                         if (!mask)
506                                 break;
507                 }
508         }
509         rcu_read_unlock();
510
511         return fls(mask);
512 }
513
514 static int domain_update_device_node(struct dmar_domain *domain)
515 {
516         struct device_domain_info *info;
517         int nid = NUMA_NO_NODE;
518         unsigned long flags;
519
520         spin_lock_irqsave(&domain->lock, flags);
521         list_for_each_entry(info, &domain->devices, link) {
522                 /*
523                  * There could possibly be multiple device numa nodes as devices
524                  * within the same domain may sit behind different IOMMUs. There
525                  * isn't perfect answer in such situation, so we select first
526                  * come first served policy.
527                  */
528                 nid = dev_to_node(info->dev);
529                 if (nid != NUMA_NO_NODE)
530                         break;
531         }
532         spin_unlock_irqrestore(&domain->lock, flags);
533
534         return nid;
535 }
536
537 static void domain_update_iotlb(struct dmar_domain *domain);
538
539 /* Return the super pagesize bitmap if supported. */
540 static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
541 {
542         unsigned long bitmap = 0;
543
544         /*
545          * 1-level super page supports page size of 2MiB, 2-level super page
546          * supports page size of both 2MiB and 1GiB.
547          */
548         if (domain->iommu_superpage == 1)
549                 bitmap |= SZ_2M;
550         else if (domain->iommu_superpage == 2)
551                 bitmap |= SZ_2M | SZ_1G;
552
553         return bitmap;
554 }
555
556 /* Some capabilities may be different across iommus */
557 static void domain_update_iommu_cap(struct dmar_domain *domain)
558 {
559         domain_update_iommu_coherency(domain);
560         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
561
562         /*
563          * If RHSA is missing, we should default to the device numa domain
564          * as fall back.
565          */
566         if (domain->nid == NUMA_NO_NODE)
567                 domain->nid = domain_update_device_node(domain);
568
569         /*
570          * First-level translation restricts the input-address to a
571          * canonical address (i.e., address bits 63:N have the same
572          * value as address bit [N-1], where N is 48-bits with 4-level
573          * paging and 57-bits with 5-level paging). Hence, skip bit
574          * [N-1].
575          */
576         if (domain_use_first_level(domain))
577                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
578         else
579                 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
580
581         domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
582         domain_update_iotlb(domain);
583 }
584
585 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
586                                          u8 devfn, int alloc)
587 {
588         struct root_entry *root = &iommu->root_entry[bus];
589         struct context_entry *context;
590         u64 *entry;
591
592         /*
593          * Except that the caller requested to allocate a new entry,
594          * returning a copied context entry makes no sense.
595          */
596         if (!alloc && context_copied(iommu, bus, devfn))
597                 return NULL;
598
599         entry = &root->lo;
600         if (sm_supported(iommu)) {
601                 if (devfn >= 0x80) {
602                         devfn -= 0x80;
603                         entry = &root->hi;
604                 }
605                 devfn *= 2;
606         }
607         if (*entry & 1)
608                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
609         else {
610                 unsigned long phy_addr;
611                 if (!alloc)
612                         return NULL;
613
614                 context = alloc_pgtable_page(iommu->node);
615                 if (!context)
616                         return NULL;
617
618                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
619                 phy_addr = virt_to_phys((void *)context);
620                 *entry = phy_addr | 1;
621                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
622         }
623         return &context[devfn];
624 }
625
626 /**
627  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
628  *                               sub-hierarchy of a candidate PCI-PCI bridge
629  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
630  * @bridge: the candidate PCI-PCI bridge
631  *
632  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
633  */
634 static bool
635 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
636 {
637         struct pci_dev *pdev, *pbridge;
638
639         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
640                 return false;
641
642         pdev = to_pci_dev(dev);
643         pbridge = to_pci_dev(bridge);
644
645         if (pbridge->subordinate &&
646             pbridge->subordinate->number <= pdev->bus->number &&
647             pbridge->subordinate->busn_res.end >= pdev->bus->number)
648                 return true;
649
650         return false;
651 }
652
653 static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
654 {
655         struct dmar_drhd_unit *drhd;
656         u32 vtbar;
657         int rc;
658
659         /* We know that this device on this chipset has its own IOMMU.
660          * If we find it under a different IOMMU, then the BIOS is lying
661          * to us. Hope that the IOMMU for this device is actually
662          * disabled, and it needs no translation...
663          */
664         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
665         if (rc) {
666                 /* "can't" happen */
667                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
668                 return false;
669         }
670         vtbar &= 0xffff0000;
671
672         /* we know that the this iommu should be at offset 0xa000 from vtbar */
673         drhd = dmar_find_matched_drhd_unit(pdev);
674         if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
675                 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
676                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
677                 return true;
678         }
679
680         return false;
681 }
682
683 static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
684 {
685         if (!iommu || iommu->drhd->ignored)
686                 return true;
687
688         if (dev_is_pci(dev)) {
689                 struct pci_dev *pdev = to_pci_dev(dev);
690
691                 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
692                     pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
693                     quirk_ioat_snb_local_iommu(pdev))
694                         return true;
695         }
696
697         return false;
698 }
699
700 struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
701 {
702         struct dmar_drhd_unit *drhd = NULL;
703         struct pci_dev *pdev = NULL;
704         struct intel_iommu *iommu;
705         struct device *tmp;
706         u16 segment = 0;
707         int i;
708
709         if (!dev)
710                 return NULL;
711
712         if (dev_is_pci(dev)) {
713                 struct pci_dev *pf_pdev;
714
715                 pdev = pci_real_dma_dev(to_pci_dev(dev));
716
717                 /* VFs aren't listed in scope tables; we need to look up
718                  * the PF instead to find the IOMMU. */
719                 pf_pdev = pci_physfn(pdev);
720                 dev = &pf_pdev->dev;
721                 segment = pci_domain_nr(pdev->bus);
722         } else if (has_acpi_companion(dev))
723                 dev = &ACPI_COMPANION(dev)->dev;
724
725         rcu_read_lock();
726         for_each_iommu(iommu, drhd) {
727                 if (pdev && segment != drhd->segment)
728                         continue;
729
730                 for_each_active_dev_scope(drhd->devices,
731                                           drhd->devices_cnt, i, tmp) {
732                         if (tmp == dev) {
733                                 /* For a VF use its original BDF# not that of the PF
734                                  * which we used for the IOMMU lookup. Strictly speaking
735                                  * we could do this for all PCI devices; we only need to
736                                  * get the BDF# from the scope table for ACPI matches. */
737                                 if (pdev && pdev->is_virtfn)
738                                         goto got_pdev;
739
740                                 if (bus && devfn) {
741                                         *bus = drhd->devices[i].bus;
742                                         *devfn = drhd->devices[i].devfn;
743                                 }
744                                 goto out;
745                         }
746
747                         if (is_downstream_to_pci_bridge(dev, tmp))
748                                 goto got_pdev;
749                 }
750
751                 if (pdev && drhd->include_all) {
752 got_pdev:
753                         if (bus && devfn) {
754                                 *bus = pdev->bus->number;
755                                 *devfn = pdev->devfn;
756                         }
757                         goto out;
758                 }
759         }
760         iommu = NULL;
761 out:
762         if (iommu_is_dummy(iommu, dev))
763                 iommu = NULL;
764
765         rcu_read_unlock();
766
767         return iommu;
768 }
769
770 static void domain_flush_cache(struct dmar_domain *domain,
771                                void *addr, int size)
772 {
773         if (!domain->iommu_coherency)
774                 clflush_cache_range(addr, size);
775 }
776
777 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
778 {
779         struct context_entry *context;
780         int ret = 0;
781
782         spin_lock(&iommu->lock);
783         context = iommu_context_addr(iommu, bus, devfn, 0);
784         if (context)
785                 ret = context_present(context);
786         spin_unlock(&iommu->lock);
787         return ret;
788 }
789
790 static void free_context_table(struct intel_iommu *iommu)
791 {
792         struct context_entry *context;
793         int i;
794
795         if (!iommu->root_entry)
796                 return;
797
798         for (i = 0; i < ROOT_ENTRY_NR; i++) {
799                 context = iommu_context_addr(iommu, i, 0, 0);
800                 if (context)
801                         free_pgtable_page(context);
802
803                 if (!sm_supported(iommu))
804                         continue;
805
806                 context = iommu_context_addr(iommu, i, 0x80, 0);
807                 if (context)
808                         free_pgtable_page(context);
809         }
810
811         free_pgtable_page(iommu->root_entry);
812         iommu->root_entry = NULL;
813 }
814
815 #ifdef CONFIG_DMAR_DEBUG
816 static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
817                          u8 bus, u8 devfn, struct dma_pte *parent, int level)
818 {
819         struct dma_pte *pte;
820         int offset;
821
822         while (1) {
823                 offset = pfn_level_offset(pfn, level);
824                 pte = &parent[offset];
825                 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
826                         pr_info("PTE not present at level %d\n", level);
827                         break;
828                 }
829
830                 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
831
832                 if (level == 1)
833                         break;
834
835                 parent = phys_to_virt(dma_pte_addr(pte));
836                 level--;
837         }
838 }
839
840 void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
841                           unsigned long long addr, u32 pasid)
842 {
843         struct pasid_dir_entry *dir, *pde;
844         struct pasid_entry *entries, *pte;
845         struct context_entry *ctx_entry;
846         struct root_entry *rt_entry;
847         int i, dir_index, index, level;
848         u8 devfn = source_id & 0xff;
849         u8 bus = source_id >> 8;
850         struct dma_pte *pgtable;
851
852         pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
853
854         /* root entry dump */
855         rt_entry = &iommu->root_entry[bus];
856         if (!rt_entry) {
857                 pr_info("root table entry is not present\n");
858                 return;
859         }
860
861         if (sm_supported(iommu))
862                 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
863                         rt_entry->hi, rt_entry->lo);
864         else
865                 pr_info("root entry: 0x%016llx", rt_entry->lo);
866
867         /* context entry dump */
868         ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
869         if (!ctx_entry) {
870                 pr_info("context table entry is not present\n");
871                 return;
872         }
873
874         pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
875                 ctx_entry->hi, ctx_entry->lo);
876
877         /* legacy mode does not require PASID entries */
878         if (!sm_supported(iommu)) {
879                 level = agaw_to_level(ctx_entry->hi & 7);
880                 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
881                 goto pgtable_walk;
882         }
883
884         /* get the pointer to pasid directory entry */
885         dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886         if (!dir) {
887                 pr_info("pasid directory entry is not present\n");
888                 return;
889         }
890         /* For request-without-pasid, get the pasid from context entry */
891         if (intel_iommu_sm && pasid == INVALID_IOASID)
892                 pasid = PASID_RID2PASID;
893
894         dir_index = pasid >> PASID_PDE_SHIFT;
895         pde = &dir[dir_index];
896         pr_info("pasid dir entry: 0x%016llx\n", pde->val);
897
898         /* get the pointer to the pasid table entry */
899         entries = get_pasid_table_from_pde(pde);
900         if (!entries) {
901                 pr_info("pasid table entry is not present\n");
902                 return;
903         }
904         index = pasid & PASID_PTE_MASK;
905         pte = &entries[index];
906         for (i = 0; i < ARRAY_SIZE(pte->val); i++)
907                 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
908
909         if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
910                 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
911                 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
912         } else {
913                 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
914                 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
915         }
916
917 pgtable_walk:
918         pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
919 }
920 #endif
921
922 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
923                                       unsigned long pfn, int *target_level)
924 {
925         struct dma_pte *parent, *pte;
926         int level = agaw_to_level(domain->agaw);
927         int offset;
928
929         BUG_ON(!domain->pgd);
930
931         if (!domain_pfn_supported(domain, pfn))
932                 /* Address beyond IOMMU's addressing capabilities. */
933                 return NULL;
934
935         parent = domain->pgd;
936
937         while (1) {
938                 void *tmp_page;
939
940                 offset = pfn_level_offset(pfn, level);
941                 pte = &parent[offset];
942                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
943                         break;
944                 if (level == *target_level)
945                         break;
946
947                 if (!dma_pte_present(pte)) {
948                         uint64_t pteval;
949
950                         tmp_page = alloc_pgtable_page(domain->nid);
951
952                         if (!tmp_page)
953                                 return NULL;
954
955                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
956                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
957                         if (domain_use_first_level(domain)) {
958                                 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
959                                 if (iommu_is_dma_domain(&domain->domain))
960                                         pteval |= DMA_FL_PTE_ACCESS;
961                         }
962                         if (cmpxchg64(&pte->val, 0ULL, pteval))
963                                 /* Someone else set it while we were thinking; use theirs. */
964                                 free_pgtable_page(tmp_page);
965                         else
966                                 domain_flush_cache(domain, pte, sizeof(*pte));
967                 }
968                 if (level == 1)
969                         break;
970
971                 parent = phys_to_virt(dma_pte_addr(pte));
972                 level--;
973         }
974
975         if (!*target_level)
976                 *target_level = level;
977
978         return pte;
979 }
980
981 /* return address's pte at specific level */
982 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
983                                          unsigned long pfn,
984                                          int level, int *large_page)
985 {
986         struct dma_pte *parent, *pte;
987         int total = agaw_to_level(domain->agaw);
988         int offset;
989
990         parent = domain->pgd;
991         while (level <= total) {
992                 offset = pfn_level_offset(pfn, total);
993                 pte = &parent[offset];
994                 if (level == total)
995                         return pte;
996
997                 if (!dma_pte_present(pte)) {
998                         *large_page = total;
999                         break;
1000                 }
1001
1002                 if (dma_pte_superpage(pte)) {
1003                         *large_page = total;
1004                         return pte;
1005                 }
1006
1007                 parent = phys_to_virt(dma_pte_addr(pte));
1008                 total--;
1009         }
1010         return NULL;
1011 }
1012
1013 /* clear last level pte, a tlb flush should be followed */
1014 static void dma_pte_clear_range(struct dmar_domain *domain,
1015                                 unsigned long start_pfn,
1016                                 unsigned long last_pfn)
1017 {
1018         unsigned int large_page;
1019         struct dma_pte *first_pte, *pte;
1020
1021         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1022         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1023         BUG_ON(start_pfn > last_pfn);
1024
1025         /* we don't need lock here; nobody else touches the iova range */
1026         do {
1027                 large_page = 1;
1028                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1029                 if (!pte) {
1030                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1031                         continue;
1032                 }
1033                 do {
1034                         dma_clear_pte(pte);
1035                         start_pfn += lvl_to_nr_pages(large_page);
1036                         pte++;
1037                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1038
1039                 domain_flush_cache(domain, first_pte,
1040                                    (void *)pte - (void *)first_pte);
1041
1042         } while (start_pfn && start_pfn <= last_pfn);
1043 }
1044
1045 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1046                                int retain_level, struct dma_pte *pte,
1047                                unsigned long pfn, unsigned long start_pfn,
1048                                unsigned long last_pfn)
1049 {
1050         pfn = max(start_pfn, pfn);
1051         pte = &pte[pfn_level_offset(pfn, level)];
1052
1053         do {
1054                 unsigned long level_pfn;
1055                 struct dma_pte *level_pte;
1056
1057                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1058                         goto next;
1059
1060                 level_pfn = pfn & level_mask(level);
1061                 level_pte = phys_to_virt(dma_pte_addr(pte));
1062
1063                 if (level > 2) {
1064                         dma_pte_free_level(domain, level - 1, retain_level,
1065                                            level_pte, level_pfn, start_pfn,
1066                                            last_pfn);
1067                 }
1068
1069                 /*
1070                  * Free the page table if we're below the level we want to
1071                  * retain and the range covers the entire table.
1072                  */
1073                 if (level < retain_level && !(start_pfn > level_pfn ||
1074                       last_pfn < level_pfn + level_size(level) - 1)) {
1075                         dma_clear_pte(pte);
1076                         domain_flush_cache(domain, pte, sizeof(*pte));
1077                         free_pgtable_page(level_pte);
1078                 }
1079 next:
1080                 pfn += level_size(level);
1081         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1082 }
1083
1084 /*
1085  * clear last level (leaf) ptes and free page table pages below the
1086  * level we wish to keep intact.
1087  */
1088 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1089                                    unsigned long start_pfn,
1090                                    unsigned long last_pfn,
1091                                    int retain_level)
1092 {
1093         dma_pte_clear_range(domain, start_pfn, last_pfn);
1094
1095         /* We don't need lock here; nobody else touches the iova range */
1096         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1097                            domain->pgd, 0, start_pfn, last_pfn);
1098
1099         /* free pgd */
1100         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1101                 free_pgtable_page(domain->pgd);
1102                 domain->pgd = NULL;
1103         }
1104 }
1105
1106 /* When a page at a given level is being unlinked from its parent, we don't
1107    need to *modify* it at all. All we need to do is make a list of all the
1108    pages which can be freed just as soon as we've flushed the IOTLB and we
1109    know the hardware page-walk will no longer touch them.
1110    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1111    be freed. */
1112 static void dma_pte_list_pagetables(struct dmar_domain *domain,
1113                                     int level, struct dma_pte *pte,
1114                                     struct list_head *freelist)
1115 {
1116         struct page *pg;
1117
1118         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1119         list_add_tail(&pg->lru, freelist);
1120
1121         if (level == 1)
1122                 return;
1123
1124         pte = page_address(pg);
1125         do {
1126                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1127                         dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1128                 pte++;
1129         } while (!first_pte_in_page(pte));
1130 }
1131
1132 static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1133                                 struct dma_pte *pte, unsigned long pfn,
1134                                 unsigned long start_pfn, unsigned long last_pfn,
1135                                 struct list_head *freelist)
1136 {
1137         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1138
1139         pfn = max(start_pfn, pfn);
1140         pte = &pte[pfn_level_offset(pfn, level)];
1141
1142         do {
1143                 unsigned long level_pfn = pfn & level_mask(level);
1144
1145                 if (!dma_pte_present(pte))
1146                         goto next;
1147
1148                 /* If range covers entire pagetable, free it */
1149                 if (start_pfn <= level_pfn &&
1150                     last_pfn >= level_pfn + level_size(level) - 1) {
1151                         /* These suborbinate page tables are going away entirely. Don't
1152                            bother to clear them; we're just going to *free* them. */
1153                         if (level > 1 && !dma_pte_superpage(pte))
1154                                 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1155
1156                         dma_clear_pte(pte);
1157                         if (!first_pte)
1158                                 first_pte = pte;
1159                         last_pte = pte;
1160                 } else if (level > 1) {
1161                         /* Recurse down into a level that isn't *entirely* obsolete */
1162                         dma_pte_clear_level(domain, level - 1,
1163                                             phys_to_virt(dma_pte_addr(pte)),
1164                                             level_pfn, start_pfn, last_pfn,
1165                                             freelist);
1166                 }
1167 next:
1168                 pfn = level_pfn + level_size(level);
1169         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1170
1171         if (first_pte)
1172                 domain_flush_cache(domain, first_pte,
1173                                    (void *)++last_pte - (void *)first_pte);
1174 }
1175
1176 /* We can't just free the pages because the IOMMU may still be walking
1177    the page tables, and may have cached the intermediate levels. The
1178    pages can only be freed after the IOTLB flush has been done. */
1179 static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1180                          unsigned long last_pfn, struct list_head *freelist)
1181 {
1182         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1183         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1184         BUG_ON(start_pfn > last_pfn);
1185
1186         /* we don't need lock here; nobody else touches the iova range */
1187         dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1188                             domain->pgd, 0, start_pfn, last_pfn, freelist);
1189
1190         /* free pgd */
1191         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1192                 struct page *pgd_page = virt_to_page(domain->pgd);
1193                 list_add_tail(&pgd_page->lru, freelist);
1194                 domain->pgd = NULL;
1195         }
1196 }
1197
1198 /* iommu handling */
1199 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1200 {
1201         struct root_entry *root;
1202
1203         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1204         if (!root) {
1205                 pr_err("Allocating root entry for %s failed\n",
1206                         iommu->name);
1207                 return -ENOMEM;
1208         }
1209
1210         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1211         iommu->root_entry = root;
1212
1213         return 0;
1214 }
1215
1216 static void iommu_set_root_entry(struct intel_iommu *iommu)
1217 {
1218         u64 addr;
1219         u32 sts;
1220         unsigned long flag;
1221
1222         addr = virt_to_phys(iommu->root_entry);
1223         if (sm_supported(iommu))
1224                 addr |= DMA_RTADDR_SMT;
1225
1226         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1228
1229         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1230
1231         /* Make sure hardware complete it */
1232         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233                       readl, (sts & DMA_GSTS_RTPS), sts);
1234
1235         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236
1237         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1238         if (sm_supported(iommu))
1239                 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1240         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1241 }
1242
1243 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1244 {
1245         u32 val;
1246         unsigned long flag;
1247
1248         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1249                 return;
1250
1251         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1252         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1253
1254         /* Make sure hardware complete it */
1255         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1256                       readl, (!(val & DMA_GSTS_WBFS)), val);
1257
1258         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1259 }
1260
1261 /* return value determine if we need a write buffer flush */
1262 static void __iommu_flush_context(struct intel_iommu *iommu,
1263                                   u16 did, u16 source_id, u8 function_mask,
1264                                   u64 type)
1265 {
1266         u64 val = 0;
1267         unsigned long flag;
1268
1269         switch (type) {
1270         case DMA_CCMD_GLOBAL_INVL:
1271                 val = DMA_CCMD_GLOBAL_INVL;
1272                 break;
1273         case DMA_CCMD_DOMAIN_INVL:
1274                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1275                 break;
1276         case DMA_CCMD_DEVICE_INVL:
1277                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1278                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1279                 break;
1280         default:
1281                 BUG();
1282         }
1283         val |= DMA_CCMD_ICC;
1284
1285         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1286         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1287
1288         /* Make sure hardware complete it */
1289         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1290                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1291
1292         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1293 }
1294
1295 /* return value determine if we need a write buffer flush */
1296 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1297                                 u64 addr, unsigned int size_order, u64 type)
1298 {
1299         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1300         u64 val = 0, val_iva = 0;
1301         unsigned long flag;
1302
1303         switch (type) {
1304         case DMA_TLB_GLOBAL_FLUSH:
1305                 /* global flush doesn't need set IVA_REG */
1306                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1307                 break;
1308         case DMA_TLB_DSI_FLUSH:
1309                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1310                 break;
1311         case DMA_TLB_PSI_FLUSH:
1312                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1313                 /* IH bit is passed in as part of address */
1314                 val_iva = size_order | addr;
1315                 break;
1316         default:
1317                 BUG();
1318         }
1319         /* Note: set drain read/write */
1320 #if 0
1321         /*
1322          * This is probably to be super secure.. Looks like we can
1323          * ignore it without any impact.
1324          */
1325         if (cap_read_drain(iommu->cap))
1326                 val |= DMA_TLB_READ_DRAIN;
1327 #endif
1328         if (cap_write_drain(iommu->cap))
1329                 val |= DMA_TLB_WRITE_DRAIN;
1330
1331         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1332         /* Note: Only uses first TLB reg currently */
1333         if (val_iva)
1334                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1335         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1336
1337         /* Make sure hardware complete it */
1338         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1339                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1340
1341         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1342
1343         /* check IOTLB invalidation granularity */
1344         if (DMA_TLB_IAIG(val) == 0)
1345                 pr_err("Flush IOTLB failed\n");
1346         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1347                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1348                         (unsigned long long)DMA_TLB_IIRG(type),
1349                         (unsigned long long)DMA_TLB_IAIG(val));
1350 }
1351
1352 static struct device_domain_info *
1353 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1354                         u8 bus, u8 devfn)
1355 {
1356         struct device_domain_info *info;
1357         unsigned long flags;
1358
1359         if (!iommu->qi)
1360                 return NULL;
1361
1362         spin_lock_irqsave(&domain->lock, flags);
1363         list_for_each_entry(info, &domain->devices, link) {
1364                 if (info->iommu == iommu && info->bus == bus &&
1365                     info->devfn == devfn) {
1366                         spin_unlock_irqrestore(&domain->lock, flags);
1367                         return info->ats_supported ? info : NULL;
1368                 }
1369         }
1370         spin_unlock_irqrestore(&domain->lock, flags);
1371
1372         return NULL;
1373 }
1374
1375 static void domain_update_iotlb(struct dmar_domain *domain)
1376 {
1377         struct device_domain_info *info;
1378         bool has_iotlb_device = false;
1379         unsigned long flags;
1380
1381         spin_lock_irqsave(&domain->lock, flags);
1382         list_for_each_entry(info, &domain->devices, link) {
1383                 if (info->ats_enabled) {
1384                         has_iotlb_device = true;
1385                         break;
1386                 }
1387         }
1388         domain->has_iotlb_device = has_iotlb_device;
1389         spin_unlock_irqrestore(&domain->lock, flags);
1390 }
1391
1392 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1393 {
1394         struct pci_dev *pdev;
1395
1396         if (!info || !dev_is_pci(info->dev))
1397                 return;
1398
1399         pdev = to_pci_dev(info->dev);
1400         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1401          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1402          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1403          * reserved, which should be set to 0.
1404          */
1405         if (!ecap_dit(info->iommu->ecap))
1406                 info->pfsid = 0;
1407         else {
1408                 struct pci_dev *pf_pdev;
1409
1410                 /* pdev will be returned if device is not a vf */
1411                 pf_pdev = pci_physfn(pdev);
1412                 info->pfsid = pci_dev_id(pf_pdev);
1413         }
1414
1415 #ifdef CONFIG_INTEL_IOMMU_SVM
1416         /* The PCIe spec, in its wisdom, declares that the behaviour of
1417            the device if you enable PASID support after ATS support is
1418            undefined. So always enable PASID support on devices which
1419            have it, even if we can't yet know if we're ever going to
1420            use it. */
1421         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1422                 info->pasid_enabled = 1;
1423
1424         if (info->pri_supported &&
1425             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1426             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1427                 info->pri_enabled = 1;
1428 #endif
1429         if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1430             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1431                 info->ats_enabled = 1;
1432                 domain_update_iotlb(info->domain);
1433                 info->ats_qdep = pci_ats_queue_depth(pdev);
1434         }
1435 }
1436
1437 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1438 {
1439         struct pci_dev *pdev;
1440
1441         if (!dev_is_pci(info->dev))
1442                 return;
1443
1444         pdev = to_pci_dev(info->dev);
1445
1446         if (info->ats_enabled) {
1447                 pci_disable_ats(pdev);
1448                 info->ats_enabled = 0;
1449                 domain_update_iotlb(info->domain);
1450         }
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452         if (info->pri_enabled) {
1453                 pci_disable_pri(pdev);
1454                 info->pri_enabled = 0;
1455         }
1456         if (info->pasid_enabled) {
1457                 pci_disable_pasid(pdev);
1458                 info->pasid_enabled = 0;
1459         }
1460 #endif
1461 }
1462
1463 static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1464                                     u64 addr, unsigned int mask)
1465 {
1466         u16 sid, qdep;
1467
1468         if (!info || !info->ats_enabled)
1469                 return;
1470
1471         sid = info->bus << 8 | info->devfn;
1472         qdep = info->ats_qdep;
1473         qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1474                            qdep, addr, mask);
1475 }
1476
1477 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1478                                   u64 addr, unsigned mask)
1479 {
1480         struct device_domain_info *info;
1481         unsigned long flags;
1482
1483         if (!domain->has_iotlb_device)
1484                 return;
1485
1486         spin_lock_irqsave(&domain->lock, flags);
1487         list_for_each_entry(info, &domain->devices, link)
1488                 __iommu_flush_dev_iotlb(info, addr, mask);
1489         spin_unlock_irqrestore(&domain->lock, flags);
1490 }
1491
1492 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1493                                   struct dmar_domain *domain,
1494                                   unsigned long pfn, unsigned int pages,
1495                                   int ih, int map)
1496 {
1497         unsigned int aligned_pages = __roundup_pow_of_two(pages);
1498         unsigned int mask = ilog2(aligned_pages);
1499         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1500         u16 did = domain_id_iommu(domain, iommu);
1501
1502         BUG_ON(pages == 0);
1503
1504         if (ih)
1505                 ih = 1 << 6;
1506
1507         if (domain_use_first_level(domain)) {
1508                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1509         } else {
1510                 unsigned long bitmask = aligned_pages - 1;
1511
1512                 /*
1513                  * PSI masks the low order bits of the base address. If the
1514                  * address isn't aligned to the mask, then compute a mask value
1515                  * needed to ensure the target range is flushed.
1516                  */
1517                 if (unlikely(bitmask & pfn)) {
1518                         unsigned long end_pfn = pfn + pages - 1, shared_bits;
1519
1520                         /*
1521                          * Since end_pfn <= pfn + bitmask, the only way bits
1522                          * higher than bitmask can differ in pfn and end_pfn is
1523                          * by carrying. This means after masking out bitmask,
1524                          * high bits starting with the first set bit in
1525                          * shared_bits are all equal in both pfn and end_pfn.
1526                          */
1527                         shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1528                         mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1529                 }
1530
1531                 /*
1532                  * Fallback to domain selective flush if no PSI support or
1533                  * the size is too big.
1534                  */
1535                 if (!cap_pgsel_inv(iommu->cap) ||
1536                     mask > cap_max_amask_val(iommu->cap))
1537                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1538                                                         DMA_TLB_DSI_FLUSH);
1539                 else
1540                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1541                                                         DMA_TLB_PSI_FLUSH);
1542         }
1543
1544         /*
1545          * In caching mode, changes of pages from non-present to present require
1546          * flush. However, device IOTLB doesn't need to be flushed in this case.
1547          */
1548         if (!cap_caching_mode(iommu->cap) || !map)
1549                 iommu_flush_dev_iotlb(domain, addr, mask);
1550 }
1551
1552 /* Notification for newly created mappings */
1553 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1554                                         struct dmar_domain *domain,
1555                                         unsigned long pfn, unsigned int pages)
1556 {
1557         /*
1558          * It's a non-present to present mapping. Only flush if caching mode
1559          * and second level.
1560          */
1561         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1562                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1563         else
1564                 iommu_flush_write_buffer(iommu);
1565 }
1566
1567 static void intel_flush_iotlb_all(struct iommu_domain *domain)
1568 {
1569         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1570         struct iommu_domain_info *info;
1571         unsigned long idx;
1572
1573         xa_for_each(&dmar_domain->iommu_array, idx, info) {
1574                 struct intel_iommu *iommu = info->iommu;
1575                 u16 did = domain_id_iommu(dmar_domain, iommu);
1576
1577                 if (domain_use_first_level(dmar_domain))
1578                         qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1579                 else
1580                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1581                                                  DMA_TLB_DSI_FLUSH);
1582
1583                 if (!cap_caching_mode(iommu->cap))
1584                         iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1585         }
1586 }
1587
1588 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1589 {
1590         u32 pmen;
1591         unsigned long flags;
1592
1593         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1594                 return;
1595
1596         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1597         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1598         pmen &= ~DMA_PMEN_EPM;
1599         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1600
1601         /* wait for the protected region status bit to clear */
1602         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1603                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1604
1605         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1606 }
1607
1608 static void iommu_enable_translation(struct intel_iommu *iommu)
1609 {
1610         u32 sts;
1611         unsigned long flags;
1612
1613         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1614         iommu->gcmd |= DMA_GCMD_TE;
1615         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1616
1617         /* Make sure hardware complete it */
1618         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1619                       readl, (sts & DMA_GSTS_TES), sts);
1620
1621         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1622 }
1623
1624 static void iommu_disable_translation(struct intel_iommu *iommu)
1625 {
1626         u32 sts;
1627         unsigned long flag;
1628
1629         if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1630             (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1631                 return;
1632
1633         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1634         iommu->gcmd &= ~DMA_GCMD_TE;
1635         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1636
1637         /* Make sure hardware complete it */
1638         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639                       readl, (!(sts & DMA_GSTS_TES)), sts);
1640
1641         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1642 }
1643
1644 static int iommu_init_domains(struct intel_iommu *iommu)
1645 {
1646         u32 ndomains;
1647
1648         ndomains = cap_ndoms(iommu->cap);
1649         pr_debug("%s: Number of Domains supported <%d>\n",
1650                  iommu->name, ndomains);
1651
1652         spin_lock_init(&iommu->lock);
1653
1654         iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1655         if (!iommu->domain_ids)
1656                 return -ENOMEM;
1657
1658         /*
1659          * If Caching mode is set, then invalid translations are tagged
1660          * with domain-id 0, hence we need to pre-allocate it. We also
1661          * use domain-id 0 as a marker for non-allocated domain-id, so
1662          * make sure it is not used for a real domain.
1663          */
1664         set_bit(0, iommu->domain_ids);
1665
1666         /*
1667          * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1668          * entry for first-level or pass-through translation modes should
1669          * be programmed with a domain id different from those used for
1670          * second-level or nested translation. We reserve a domain id for
1671          * this purpose.
1672          */
1673         if (sm_supported(iommu))
1674                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1675
1676         return 0;
1677 }
1678
1679 static void disable_dmar_iommu(struct intel_iommu *iommu)
1680 {
1681         if (!iommu->domain_ids)
1682                 return;
1683
1684         /*
1685          * All iommu domains must have been detached from the devices,
1686          * hence there should be no domain IDs in use.
1687          */
1688         if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1689                     > NUM_RESERVED_DID))
1690                 return;
1691
1692         if (iommu->gcmd & DMA_GCMD_TE)
1693                 iommu_disable_translation(iommu);
1694 }
1695
1696 static void free_dmar_iommu(struct intel_iommu *iommu)
1697 {
1698         if (iommu->domain_ids) {
1699                 bitmap_free(iommu->domain_ids);
1700                 iommu->domain_ids = NULL;
1701         }
1702
1703         if (iommu->copied_tables) {
1704                 bitmap_free(iommu->copied_tables);
1705                 iommu->copied_tables = NULL;
1706         }
1707
1708         /* free context mapping */
1709         free_context_table(iommu);
1710
1711 #ifdef CONFIG_INTEL_IOMMU_SVM
1712         if (pasid_supported(iommu)) {
1713                 if (ecap_prs(iommu->ecap))
1714                         intel_svm_finish_prq(iommu);
1715         }
1716         if (vccap_pasid(iommu->vccap))
1717                 ioasid_unregister_allocator(&iommu->pasid_allocator);
1718
1719 #endif
1720 }
1721
1722 /*
1723  * Check and return whether first level is used by default for
1724  * DMA translation.
1725  */
1726 static bool first_level_by_default(unsigned int type)
1727 {
1728         /* Only SL is available in legacy mode */
1729         if (!scalable_mode_support())
1730                 return false;
1731
1732         /* Only level (either FL or SL) is available, just use it */
1733         if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1734                 return intel_cap_flts_sanity();
1735
1736         /* Both levels are available, decide it based on domain type */
1737         return type != IOMMU_DOMAIN_UNMANAGED;
1738 }
1739
1740 static struct dmar_domain *alloc_domain(unsigned int type)
1741 {
1742         struct dmar_domain *domain;
1743
1744         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1745         if (!domain)
1746                 return NULL;
1747
1748         domain->nid = NUMA_NO_NODE;
1749         if (first_level_by_default(type))
1750                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1751         domain->has_iotlb_device = false;
1752         INIT_LIST_HEAD(&domain->devices);
1753         spin_lock_init(&domain->lock);
1754         xa_init(&domain->iommu_array);
1755
1756         return domain;
1757 }
1758
1759 static int domain_attach_iommu(struct dmar_domain *domain,
1760                                struct intel_iommu *iommu)
1761 {
1762         struct iommu_domain_info *info, *curr;
1763         unsigned long ndomains;
1764         int num, ret = -ENOSPC;
1765
1766         info = kzalloc(sizeof(*info), GFP_KERNEL);
1767         if (!info)
1768                 return -ENOMEM;
1769
1770         spin_lock(&iommu->lock);
1771         curr = xa_load(&domain->iommu_array, iommu->seq_id);
1772         if (curr) {
1773                 curr->refcnt++;
1774                 spin_unlock(&iommu->lock);
1775                 kfree(info);
1776                 return 0;
1777         }
1778
1779         ndomains = cap_ndoms(iommu->cap);
1780         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1781         if (num >= ndomains) {
1782                 pr_err("%s: No free domain ids\n", iommu->name);
1783                 goto err_unlock;
1784         }
1785
1786         set_bit(num, iommu->domain_ids);
1787         info->refcnt    = 1;
1788         info->did       = num;
1789         info->iommu     = iommu;
1790         curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1791                           NULL, info, GFP_ATOMIC);
1792         if (curr) {
1793                 ret = xa_err(curr) ? : -EBUSY;
1794                 goto err_clear;
1795         }
1796         domain_update_iommu_cap(domain);
1797
1798         spin_unlock(&iommu->lock);
1799         return 0;
1800
1801 err_clear:
1802         clear_bit(info->did, iommu->domain_ids);
1803 err_unlock:
1804         spin_unlock(&iommu->lock);
1805         kfree(info);
1806         return ret;
1807 }
1808
1809 static void domain_detach_iommu(struct dmar_domain *domain,
1810                                 struct intel_iommu *iommu)
1811 {
1812         struct iommu_domain_info *info;
1813
1814         spin_lock(&iommu->lock);
1815         info = xa_load(&domain->iommu_array, iommu->seq_id);
1816         if (--info->refcnt == 0) {
1817                 clear_bit(info->did, iommu->domain_ids);
1818                 xa_erase(&domain->iommu_array, iommu->seq_id);
1819                 domain->nid = NUMA_NO_NODE;
1820                 domain_update_iommu_cap(domain);
1821                 kfree(info);
1822         }
1823         spin_unlock(&iommu->lock);
1824 }
1825
1826 static inline int guestwidth_to_adjustwidth(int gaw)
1827 {
1828         int agaw;
1829         int r = (gaw - 12) % 9;
1830
1831         if (r == 0)
1832                 agaw = gaw;
1833         else
1834                 agaw = gaw + 9 - r;
1835         if (agaw > 64)
1836                 agaw = 64;
1837         return agaw;
1838 }
1839
1840 static void domain_exit(struct dmar_domain *domain)
1841 {
1842         if (domain->pgd) {
1843                 LIST_HEAD(freelist);
1844
1845                 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1846                 put_pages_list(&freelist);
1847         }
1848
1849         if (WARN_ON(!list_empty(&domain->devices)))
1850                 return;
1851
1852         kfree(domain);
1853 }
1854
1855 /*
1856  * Get the PASID directory size for scalable mode context entry.
1857  * Value of X in the PDTS field of a scalable mode context entry
1858  * indicates PASID directory with 2^(X + 7) entries.
1859  */
1860 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1861 {
1862         unsigned long pds, max_pde;
1863
1864         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1865         pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1866         if (pds < 7)
1867                 return 0;
1868
1869         return pds - 7;
1870 }
1871
1872 /*
1873  * Set the RID_PASID field of a scalable mode context entry. The
1874  * IOMMU hardware will use the PASID value set in this field for
1875  * DMA translations of DMA requests without PASID.
1876  */
1877 static inline void
1878 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1879 {
1880         context->hi |= pasid & ((1 << 20) - 1);
1881 }
1882
1883 /*
1884  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1885  * entry.
1886  */
1887 static inline void context_set_sm_dte(struct context_entry *context)
1888 {
1889         context->lo |= (1 << 2);
1890 }
1891
1892 /*
1893  * Set the PRE(Page Request Enable) field of a scalable mode context
1894  * entry.
1895  */
1896 static inline void context_set_sm_pre(struct context_entry *context)
1897 {
1898         context->lo |= (1 << 4);
1899 }
1900
1901 /* Convert value to context PASID directory size field coding. */
1902 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1903
1904 static int domain_context_mapping_one(struct dmar_domain *domain,
1905                                       struct intel_iommu *iommu,
1906                                       struct pasid_table *table,
1907                                       u8 bus, u8 devfn)
1908 {
1909         struct device_domain_info *info =
1910                         iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1911         u16 did = domain_id_iommu(domain, iommu);
1912         int translation = CONTEXT_TT_MULTI_LEVEL;
1913         struct context_entry *context;
1914         int ret;
1915
1916         WARN_ON(did == 0);
1917
1918         if (hw_pass_through && domain_type_is_si(domain))
1919                 translation = CONTEXT_TT_PASS_THROUGH;
1920
1921         pr_debug("Set context mapping for %02x:%02x.%d\n",
1922                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1923
1924         BUG_ON(!domain->pgd);
1925
1926         spin_lock(&iommu->lock);
1927         ret = -ENOMEM;
1928         context = iommu_context_addr(iommu, bus, devfn, 1);
1929         if (!context)
1930                 goto out_unlock;
1931
1932         ret = 0;
1933         if (context_present(context) && !context_copied(iommu, bus, devfn))
1934                 goto out_unlock;
1935
1936         /*
1937          * For kdump cases, old valid entries may be cached due to the
1938          * in-flight DMA and copied pgtable, but there is no unmapping
1939          * behaviour for them, thus we need an explicit cache flush for
1940          * the newly-mapped device. For kdump, at this point, the device
1941          * is supposed to finish reset at its driver probe stage, so no
1942          * in-flight DMA will exist, and we don't need to worry anymore
1943          * hereafter.
1944          */
1945         if (context_copied(iommu, bus, devfn)) {
1946                 u16 did_old = context_domain_id(context);
1947
1948                 if (did_old < cap_ndoms(iommu->cap)) {
1949                         iommu->flush.flush_context(iommu, did_old,
1950                                                    (((u16)bus) << 8) | devfn,
1951                                                    DMA_CCMD_MASK_NOBIT,
1952                                                    DMA_CCMD_DEVICE_INVL);
1953                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1954                                                  DMA_TLB_DSI_FLUSH);
1955                 }
1956
1957                 clear_context_copied(iommu, bus, devfn);
1958         }
1959
1960         context_clear_entry(context);
1961
1962         if (sm_supported(iommu)) {
1963                 unsigned long pds;
1964
1965                 WARN_ON(!table);
1966
1967                 /* Setup the PASID DIR pointer: */
1968                 pds = context_get_sm_pds(table);
1969                 context->lo = (u64)virt_to_phys(table->table) |
1970                                 context_pdts(pds);
1971
1972                 /* Setup the RID_PASID field: */
1973                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1974
1975                 /*
1976                  * Setup the Device-TLB enable bit and Page request
1977                  * Enable bit:
1978                  */
1979                 if (info && info->ats_supported)
1980                         context_set_sm_dte(context);
1981                 if (info && info->pri_supported)
1982                         context_set_sm_pre(context);
1983         } else {
1984                 struct dma_pte *pgd = domain->pgd;
1985                 int agaw;
1986
1987                 context_set_domain_id(context, did);
1988
1989                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1990                         /*
1991                          * Skip top levels of page tables for iommu which has
1992                          * less agaw than default. Unnecessary for PT mode.
1993                          */
1994                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1995                                 ret = -ENOMEM;
1996                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1997                                 if (!dma_pte_present(pgd))
1998                                         goto out_unlock;
1999                         }
2000
2001                         if (info && info->ats_supported)
2002                                 translation = CONTEXT_TT_DEV_IOTLB;
2003                         else
2004                                 translation = CONTEXT_TT_MULTI_LEVEL;
2005
2006                         context_set_address_root(context, virt_to_phys(pgd));
2007                         context_set_address_width(context, agaw);
2008                 } else {
2009                         /*
2010                          * In pass through mode, AW must be programmed to
2011                          * indicate the largest AGAW value supported by
2012                          * hardware. And ASR is ignored by hardware.
2013                          */
2014                         context_set_address_width(context, iommu->msagaw);
2015                 }
2016
2017                 context_set_translation_type(context, translation);
2018         }
2019
2020         context_set_fault_enable(context);
2021         context_set_present(context);
2022         if (!ecap_coherent(iommu->ecap))
2023                 clflush_cache_range(context, sizeof(*context));
2024
2025         /*
2026          * It's a non-present to present mapping. If hardware doesn't cache
2027          * non-present entry we only need to flush the write-buffer. If the
2028          * _does_ cache non-present entries, then it does so in the special
2029          * domain #0, which we have to flush:
2030          */
2031         if (cap_caching_mode(iommu->cap)) {
2032                 iommu->flush.flush_context(iommu, 0,
2033                                            (((u16)bus) << 8) | devfn,
2034                                            DMA_CCMD_MASK_NOBIT,
2035                                            DMA_CCMD_DEVICE_INVL);
2036                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2037         } else {
2038                 iommu_flush_write_buffer(iommu);
2039         }
2040         iommu_enable_dev_iotlb(info);
2041
2042         ret = 0;
2043
2044 out_unlock:
2045         spin_unlock(&iommu->lock);
2046
2047         return ret;
2048 }
2049
2050 struct domain_context_mapping_data {
2051         struct dmar_domain *domain;
2052         struct intel_iommu *iommu;
2053         struct pasid_table *table;
2054 };
2055
2056 static int domain_context_mapping_cb(struct pci_dev *pdev,
2057                                      u16 alias, void *opaque)
2058 {
2059         struct domain_context_mapping_data *data = opaque;
2060
2061         return domain_context_mapping_one(data->domain, data->iommu,
2062                                           data->table, PCI_BUS_NUM(alias),
2063                                           alias & 0xff);
2064 }
2065
2066 static int
2067 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2068 {
2069         struct domain_context_mapping_data data;
2070         struct pasid_table *table;
2071         struct intel_iommu *iommu;
2072         u8 bus, devfn;
2073
2074         iommu = device_to_iommu(dev, &bus, &devfn);
2075         if (!iommu)
2076                 return -ENODEV;
2077
2078         table = intel_pasid_get_table(dev);
2079
2080         if (!dev_is_pci(dev))
2081                 return domain_context_mapping_one(domain, iommu, table,
2082                                                   bus, devfn);
2083
2084         data.domain = domain;
2085         data.iommu = iommu;
2086         data.table = table;
2087
2088         return pci_for_each_dma_alias(to_pci_dev(dev),
2089                                       &domain_context_mapping_cb, &data);
2090 }
2091
2092 static int domain_context_mapped_cb(struct pci_dev *pdev,
2093                                     u16 alias, void *opaque)
2094 {
2095         struct intel_iommu *iommu = opaque;
2096
2097         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2098 }
2099
2100 static int domain_context_mapped(struct device *dev)
2101 {
2102         struct intel_iommu *iommu;
2103         u8 bus, devfn;
2104
2105         iommu = device_to_iommu(dev, &bus, &devfn);
2106         if (!iommu)
2107                 return -ENODEV;
2108
2109         if (!dev_is_pci(dev))
2110                 return device_context_mapped(iommu, bus, devfn);
2111
2112         return !pci_for_each_dma_alias(to_pci_dev(dev),
2113                                        domain_context_mapped_cb, iommu);
2114 }
2115
2116 /* Returns a number of VTD pages, but aligned to MM page size */
2117 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2118                                             size_t size)
2119 {
2120         host_addr &= ~PAGE_MASK;
2121         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2122 }
2123
2124 /* Return largest possible superpage level for a given mapping */
2125 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2126                                           unsigned long iov_pfn,
2127                                           unsigned long phy_pfn,
2128                                           unsigned long pages)
2129 {
2130         int support, level = 1;
2131         unsigned long pfnmerge;
2132
2133         support = domain->iommu_superpage;
2134
2135         /* To use a large page, the virtual *and* physical addresses
2136            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2137            of them will mean we have to use smaller pages. So just
2138            merge them and check both at once. */
2139         pfnmerge = iov_pfn | phy_pfn;
2140
2141         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2142                 pages >>= VTD_STRIDE_SHIFT;
2143                 if (!pages)
2144                         break;
2145                 pfnmerge >>= VTD_STRIDE_SHIFT;
2146                 level++;
2147                 support--;
2148         }
2149         return level;
2150 }
2151
2152 /*
2153  * Ensure that old small page tables are removed to make room for superpage(s).
2154  * We're going to add new large pages, so make sure we don't remove their parent
2155  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2156  */
2157 static void switch_to_super_page(struct dmar_domain *domain,
2158                                  unsigned long start_pfn,
2159                                  unsigned long end_pfn, int level)
2160 {
2161         unsigned long lvl_pages = lvl_to_nr_pages(level);
2162         struct iommu_domain_info *info;
2163         struct dma_pte *pte = NULL;
2164         unsigned long i;
2165
2166         while (start_pfn <= end_pfn) {
2167                 if (!pte)
2168                         pte = pfn_to_dma_pte(domain, start_pfn, &level);
2169
2170                 if (dma_pte_present(pte)) {
2171                         dma_pte_free_pagetable(domain, start_pfn,
2172                                                start_pfn + lvl_pages - 1,
2173                                                level + 1);
2174
2175                         xa_for_each(&domain->iommu_array, i, info)
2176                                 iommu_flush_iotlb_psi(info->iommu, domain,
2177                                                       start_pfn, lvl_pages,
2178                                                       0, 0);
2179                 }
2180
2181                 pte++;
2182                 start_pfn += lvl_pages;
2183                 if (first_pte_in_page(pte))
2184                         pte = NULL;
2185         }
2186 }
2187
2188 static int
2189 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2190                  unsigned long phys_pfn, unsigned long nr_pages, int prot)
2191 {
2192         struct dma_pte *first_pte = NULL, *pte = NULL;
2193         unsigned int largepage_lvl = 0;
2194         unsigned long lvl_pages = 0;
2195         phys_addr_t pteval;
2196         u64 attr;
2197
2198         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2199
2200         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2201                 return -EINVAL;
2202
2203         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2204         attr |= DMA_FL_PTE_PRESENT;
2205         if (domain_use_first_level(domain)) {
2206                 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2207                 if (prot & DMA_PTE_WRITE)
2208                         attr |= DMA_FL_PTE_DIRTY;
2209         }
2210
2211         pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2212
2213         while (nr_pages > 0) {
2214                 uint64_t tmp;
2215
2216                 if (!pte) {
2217                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2218                                         phys_pfn, nr_pages);
2219
2220                         pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2221                         if (!pte)
2222                                 return -ENOMEM;
2223                         first_pte = pte;
2224
2225                         lvl_pages = lvl_to_nr_pages(largepage_lvl);
2226
2227                         /* It is large page*/
2228                         if (largepage_lvl > 1) {
2229                                 unsigned long end_pfn;
2230                                 unsigned long pages_to_remove;
2231
2232                                 pteval |= DMA_PTE_LARGE_PAGE;
2233                                 pages_to_remove = min_t(unsigned long, nr_pages,
2234                                                         nr_pte_to_next_page(pte) * lvl_pages);
2235                                 end_pfn = iov_pfn + pages_to_remove - 1;
2236                                 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2237                         } else {
2238                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2239                         }
2240
2241                 }
2242                 /* We don't need lock here, nobody else
2243                  * touches the iova range
2244                  */
2245                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2246                 if (tmp) {
2247                         static int dumps = 5;
2248                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2249                                 iov_pfn, tmp, (unsigned long long)pteval);
2250                         if (dumps) {
2251                                 dumps--;
2252                                 debug_dma_dump_mappings(NULL);
2253                         }
2254                         WARN_ON(1);
2255                 }
2256
2257                 nr_pages -= lvl_pages;
2258                 iov_pfn += lvl_pages;
2259                 phys_pfn += lvl_pages;
2260                 pteval += lvl_pages * VTD_PAGE_SIZE;
2261
2262                 /* If the next PTE would be the first in a new page, then we
2263                  * need to flush the cache on the entries we've just written.
2264                  * And then we'll need to recalculate 'pte', so clear it and
2265                  * let it get set again in the if (!pte) block above.
2266                  *
2267                  * If we're done (!nr_pages) we need to flush the cache too.
2268                  *
2269                  * Also if we've been setting superpages, we may need to
2270                  * recalculate 'pte' and switch back to smaller pages for the
2271                  * end of the mapping, if the trailing size is not enough to
2272                  * use another superpage (i.e. nr_pages < lvl_pages).
2273                  */
2274                 pte++;
2275                 if (!nr_pages || first_pte_in_page(pte) ||
2276                     (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2277                         domain_flush_cache(domain, first_pte,
2278                                            (void *)pte - (void *)first_pte);
2279                         pte = NULL;
2280                 }
2281         }
2282
2283         return 0;
2284 }
2285
2286 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2287 {
2288         struct intel_iommu *iommu = info->iommu;
2289         struct context_entry *context;
2290         u16 did_old;
2291
2292         if (!iommu)
2293                 return;
2294
2295         spin_lock(&iommu->lock);
2296         context = iommu_context_addr(iommu, bus, devfn, 0);
2297         if (!context) {
2298                 spin_unlock(&iommu->lock);
2299                 return;
2300         }
2301
2302         if (sm_supported(iommu)) {
2303                 if (hw_pass_through && domain_type_is_si(info->domain))
2304                         did_old = FLPT_DEFAULT_DID;
2305                 else
2306                         did_old = domain_id_iommu(info->domain, iommu);
2307         } else {
2308                 did_old = context_domain_id(context);
2309         }
2310
2311         context_clear_entry(context);
2312         __iommu_flush_cache(iommu, context, sizeof(*context));
2313         spin_unlock(&iommu->lock);
2314         iommu->flush.flush_context(iommu,
2315                                    did_old,
2316                                    (((u16)bus) << 8) | devfn,
2317                                    DMA_CCMD_MASK_NOBIT,
2318                                    DMA_CCMD_DEVICE_INVL);
2319
2320         if (sm_supported(iommu))
2321                 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2322
2323         iommu->flush.flush_iotlb(iommu,
2324                                  did_old,
2325                                  0,
2326                                  0,
2327                                  DMA_TLB_DSI_FLUSH);
2328
2329         __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2330 }
2331
2332 static int domain_setup_first_level(struct intel_iommu *iommu,
2333                                     struct dmar_domain *domain,
2334                                     struct device *dev,
2335                                     u32 pasid)
2336 {
2337         struct dma_pte *pgd = domain->pgd;
2338         int agaw, level;
2339         int flags = 0;
2340
2341         /*
2342          * Skip top levels of page tables for iommu which has
2343          * less agaw than default. Unnecessary for PT mode.
2344          */
2345         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2346                 pgd = phys_to_virt(dma_pte_addr(pgd));
2347                 if (!dma_pte_present(pgd))
2348                         return -ENOMEM;
2349         }
2350
2351         level = agaw_to_level(agaw);
2352         if (level != 4 && level != 5)
2353                 return -EINVAL;
2354
2355         if (pasid != PASID_RID2PASID)
2356                 flags |= PASID_FLAG_SUPERVISOR_MODE;
2357         if (level == 5)
2358                 flags |= PASID_FLAG_FL5LP;
2359
2360         if (domain->force_snooping)
2361                 flags |= PASID_FLAG_PAGE_SNOOP;
2362
2363         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2364                                              domain_id_iommu(domain, iommu),
2365                                              flags);
2366 }
2367
2368 static bool dev_is_real_dma_subdevice(struct device *dev)
2369 {
2370         return dev && dev_is_pci(dev) &&
2371                pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2372 }
2373
2374 static int iommu_domain_identity_map(struct dmar_domain *domain,
2375                                      unsigned long first_vpfn,
2376                                      unsigned long last_vpfn)
2377 {
2378         /*
2379          * RMRR range might have overlap with physical memory range,
2380          * clear it first
2381          */
2382         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2383
2384         return __domain_mapping(domain, first_vpfn,
2385                                 first_vpfn, last_vpfn - first_vpfn + 1,
2386                                 DMA_PTE_READ|DMA_PTE_WRITE);
2387 }
2388
2389 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2390
2391 static int __init si_domain_init(int hw)
2392 {
2393         struct dmar_rmrr_unit *rmrr;
2394         struct device *dev;
2395         int i, nid, ret;
2396
2397         si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2398         if (!si_domain)
2399                 return -EFAULT;
2400
2401         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2402                 domain_exit(si_domain);
2403                 return -EFAULT;
2404         }
2405
2406         if (hw)
2407                 return 0;
2408
2409         for_each_online_node(nid) {
2410                 unsigned long start_pfn, end_pfn;
2411                 int i;
2412
2413                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2414                         ret = iommu_domain_identity_map(si_domain,
2415                                         mm_to_dma_pfn(start_pfn),
2416                                         mm_to_dma_pfn(end_pfn));
2417                         if (ret)
2418                                 return ret;
2419                 }
2420         }
2421
2422         /*
2423          * Identity map the RMRRs so that devices with RMRRs could also use
2424          * the si_domain.
2425          */
2426         for_each_rmrr_units(rmrr) {
2427                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2428                                           i, dev) {
2429                         unsigned long long start = rmrr->base_address;
2430                         unsigned long long end = rmrr->end_address;
2431
2432                         if (WARN_ON(end < start ||
2433                                     end >> agaw_to_width(si_domain->agaw)))
2434                                 continue;
2435
2436                         ret = iommu_domain_identity_map(si_domain,
2437                                         mm_to_dma_pfn(start >> PAGE_SHIFT),
2438                                         mm_to_dma_pfn(end >> PAGE_SHIFT));
2439                         if (ret)
2440                                 return ret;
2441                 }
2442         }
2443
2444         return 0;
2445 }
2446
2447 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2448 {
2449         struct device_domain_info *info = dev_iommu_priv_get(dev);
2450         struct intel_iommu *iommu;
2451         unsigned long flags;
2452         u8 bus, devfn;
2453         int ret;
2454
2455         iommu = device_to_iommu(dev, &bus, &devfn);
2456         if (!iommu)
2457                 return -ENODEV;
2458
2459         ret = domain_attach_iommu(domain, iommu);
2460         if (ret)
2461                 return ret;
2462         info->domain = domain;
2463         spin_lock_irqsave(&domain->lock, flags);
2464         list_add(&info->link, &domain->devices);
2465         spin_unlock_irqrestore(&domain->lock, flags);
2466
2467         /* PASID table is mandatory for a PCI device in scalable mode. */
2468         if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2469                 ret = intel_pasid_alloc_table(dev);
2470                 if (ret) {
2471                         dev_err(dev, "PASID table allocation failed\n");
2472                         dmar_remove_one_dev_info(dev);
2473                         return ret;
2474                 }
2475
2476                 /* Setup the PASID entry for requests without PASID: */
2477                 if (hw_pass_through && domain_type_is_si(domain))
2478                         ret = intel_pasid_setup_pass_through(iommu, domain,
2479                                         dev, PASID_RID2PASID);
2480                 else if (domain_use_first_level(domain))
2481                         ret = domain_setup_first_level(iommu, domain, dev,
2482                                         PASID_RID2PASID);
2483                 else
2484                         ret = intel_pasid_setup_second_level(iommu, domain,
2485                                         dev, PASID_RID2PASID);
2486                 if (ret) {
2487                         dev_err(dev, "Setup RID2PASID failed\n");
2488                         dmar_remove_one_dev_info(dev);
2489                         return ret;
2490                 }
2491         }
2492
2493         ret = domain_context_mapping(domain, dev);
2494         if (ret) {
2495                 dev_err(dev, "Domain context map failed\n");
2496                 dmar_remove_one_dev_info(dev);
2497                 return ret;
2498         }
2499
2500         return 0;
2501 }
2502
2503 static bool device_has_rmrr(struct device *dev)
2504 {
2505         struct dmar_rmrr_unit *rmrr;
2506         struct device *tmp;
2507         int i;
2508
2509         rcu_read_lock();
2510         for_each_rmrr_units(rmrr) {
2511                 /*
2512                  * Return TRUE if this RMRR contains the device that
2513                  * is passed in.
2514                  */
2515                 for_each_active_dev_scope(rmrr->devices,
2516                                           rmrr->devices_cnt, i, tmp)
2517                         if (tmp == dev ||
2518                             is_downstream_to_pci_bridge(dev, tmp)) {
2519                                 rcu_read_unlock();
2520                                 return true;
2521                         }
2522         }
2523         rcu_read_unlock();
2524         return false;
2525 }
2526
2527 /**
2528  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2529  * is relaxable (ie. is allowed to be not enforced under some conditions)
2530  * @dev: device handle
2531  *
2532  * We assume that PCI USB devices with RMRRs have them largely
2533  * for historical reasons and that the RMRR space is not actively used post
2534  * boot.  This exclusion may change if vendors begin to abuse it.
2535  *
2536  * The same exception is made for graphics devices, with the requirement that
2537  * any use of the RMRR regions will be torn down before assigning the device
2538  * to a guest.
2539  *
2540  * Return: true if the RMRR is relaxable, false otherwise
2541  */
2542 static bool device_rmrr_is_relaxable(struct device *dev)
2543 {
2544         struct pci_dev *pdev;
2545
2546         if (!dev_is_pci(dev))
2547                 return false;
2548
2549         pdev = to_pci_dev(dev);
2550         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2551                 return true;
2552         else
2553                 return false;
2554 }
2555
2556 /*
2557  * There are a couple cases where we need to restrict the functionality of
2558  * devices associated with RMRRs.  The first is when evaluating a device for
2559  * identity mapping because problems exist when devices are moved in and out
2560  * of domains and their respective RMRR information is lost.  This means that
2561  * a device with associated RMRRs will never be in a "passthrough" domain.
2562  * The second is use of the device through the IOMMU API.  This interface
2563  * expects to have full control of the IOVA space for the device.  We cannot
2564  * satisfy both the requirement that RMRR access is maintained and have an
2565  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2566  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2567  * We therefore prevent devices associated with an RMRR from participating in
2568  * the IOMMU API, which eliminates them from device assignment.
2569  *
2570  * In both cases, devices which have relaxable RMRRs are not concerned by this
2571  * restriction. See device_rmrr_is_relaxable comment.
2572  */
2573 static bool device_is_rmrr_locked(struct device *dev)
2574 {
2575         if (!device_has_rmrr(dev))
2576                 return false;
2577
2578         if (device_rmrr_is_relaxable(dev))
2579                 return false;
2580
2581         return true;
2582 }
2583
2584 /*
2585  * Return the required default domain type for a specific device.
2586  *
2587  * @dev: the device in query
2588  * @startup: true if this is during early boot
2589  *
2590  * Returns:
2591  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2592  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2593  *  - 0: both identity and dynamic domains work for this device
2594  */
2595 static int device_def_domain_type(struct device *dev)
2596 {
2597         if (dev_is_pci(dev)) {
2598                 struct pci_dev *pdev = to_pci_dev(dev);
2599
2600                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2601                         return IOMMU_DOMAIN_IDENTITY;
2602
2603                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2604                         return IOMMU_DOMAIN_IDENTITY;
2605         }
2606
2607         return 0;
2608 }
2609
2610 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2611 {
2612         /*
2613          * Start from the sane iommu hardware state.
2614          * If the queued invalidation is already initialized by us
2615          * (for example, while enabling interrupt-remapping) then
2616          * we got the things already rolling from a sane state.
2617          */
2618         if (!iommu->qi) {
2619                 /*
2620                  * Clear any previous faults.
2621                  */
2622                 dmar_fault(-1, iommu);
2623                 /*
2624                  * Disable queued invalidation if supported and already enabled
2625                  * before OS handover.
2626                  */
2627                 dmar_disable_qi(iommu);
2628         }
2629
2630         if (dmar_enable_qi(iommu)) {
2631                 /*
2632                  * Queued Invalidate not enabled, use Register Based Invalidate
2633                  */
2634                 iommu->flush.flush_context = __iommu_flush_context;
2635                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2636                 pr_info("%s: Using Register based invalidation\n",
2637                         iommu->name);
2638         } else {
2639                 iommu->flush.flush_context = qi_flush_context;
2640                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2641                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2642         }
2643 }
2644
2645 static int copy_context_table(struct intel_iommu *iommu,
2646                               struct root_entry *old_re,
2647                               struct context_entry **tbl,
2648                               int bus, bool ext)
2649 {
2650         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2651         struct context_entry *new_ce = NULL, ce;
2652         struct context_entry *old_ce = NULL;
2653         struct root_entry re;
2654         phys_addr_t old_ce_phys;
2655
2656         tbl_idx = ext ? bus * 2 : bus;
2657         memcpy(&re, old_re, sizeof(re));
2658
2659         for (devfn = 0; devfn < 256; devfn++) {
2660                 /* First calculate the correct index */
2661                 idx = (ext ? devfn * 2 : devfn) % 256;
2662
2663                 if (idx == 0) {
2664                         /* First save what we may have and clean up */
2665                         if (new_ce) {
2666                                 tbl[tbl_idx] = new_ce;
2667                                 __iommu_flush_cache(iommu, new_ce,
2668                                                     VTD_PAGE_SIZE);
2669                                 pos = 1;
2670                         }
2671
2672                         if (old_ce)
2673                                 memunmap(old_ce);
2674
2675                         ret = 0;
2676                         if (devfn < 0x80)
2677                                 old_ce_phys = root_entry_lctp(&re);
2678                         else
2679                                 old_ce_phys = root_entry_uctp(&re);
2680
2681                         if (!old_ce_phys) {
2682                                 if (ext && devfn == 0) {
2683                                         /* No LCTP, try UCTP */
2684                                         devfn = 0x7f;
2685                                         continue;
2686                                 } else {
2687                                         goto out;
2688                                 }
2689                         }
2690
2691                         ret = -ENOMEM;
2692                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2693                                         MEMREMAP_WB);
2694                         if (!old_ce)
2695                                 goto out;
2696
2697                         new_ce = alloc_pgtable_page(iommu->node);
2698                         if (!new_ce)
2699                                 goto out_unmap;
2700
2701                         ret = 0;
2702                 }
2703
2704                 /* Now copy the context entry */
2705                 memcpy(&ce, old_ce + idx, sizeof(ce));
2706
2707                 if (!context_present(&ce))
2708                         continue;
2709
2710                 did = context_domain_id(&ce);
2711                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2712                         set_bit(did, iommu->domain_ids);
2713
2714                 set_context_copied(iommu, bus, devfn);
2715                 new_ce[idx] = ce;
2716         }
2717
2718         tbl[tbl_idx + pos] = new_ce;
2719
2720         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2721
2722 out_unmap:
2723         memunmap(old_ce);
2724
2725 out:
2726         return ret;
2727 }
2728
2729 static int copy_translation_tables(struct intel_iommu *iommu)
2730 {
2731         struct context_entry **ctxt_tbls;
2732         struct root_entry *old_rt;
2733         phys_addr_t old_rt_phys;
2734         int ctxt_table_entries;
2735         u64 rtaddr_reg;
2736         int bus, ret;
2737         bool new_ext, ext;
2738
2739         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2740         ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2741         new_ext    = !!sm_supported(iommu);
2742
2743         /*
2744          * The RTT bit can only be changed when translation is disabled,
2745          * but disabling translation means to open a window for data
2746          * corruption. So bail out and don't copy anything if we would
2747          * have to change the bit.
2748          */
2749         if (new_ext != ext)
2750                 return -EINVAL;
2751
2752         iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2753         if (!iommu->copied_tables)
2754                 return -ENOMEM;
2755
2756         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2757         if (!old_rt_phys)
2758                 return -EINVAL;
2759
2760         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2761         if (!old_rt)
2762                 return -ENOMEM;
2763
2764         /* This is too big for the stack - allocate it from slab */
2765         ctxt_table_entries = ext ? 512 : 256;
2766         ret = -ENOMEM;
2767         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2768         if (!ctxt_tbls)
2769                 goto out_unmap;
2770
2771         for (bus = 0; bus < 256; bus++) {
2772                 ret = copy_context_table(iommu, &old_rt[bus],
2773                                          ctxt_tbls, bus, ext);
2774                 if (ret) {
2775                         pr_err("%s: Failed to copy context table for bus %d\n",
2776                                 iommu->name, bus);
2777                         continue;
2778                 }
2779         }
2780
2781         spin_lock(&iommu->lock);
2782
2783         /* Context tables are copied, now write them to the root_entry table */
2784         for (bus = 0; bus < 256; bus++) {
2785                 int idx = ext ? bus * 2 : bus;
2786                 u64 val;
2787
2788                 if (ctxt_tbls[idx]) {
2789                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
2790                         iommu->root_entry[bus].lo = val;
2791                 }
2792
2793                 if (!ext || !ctxt_tbls[idx + 1])
2794                         continue;
2795
2796                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2797                 iommu->root_entry[bus].hi = val;
2798         }
2799
2800         spin_unlock(&iommu->lock);
2801
2802         kfree(ctxt_tbls);
2803
2804         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2805
2806         ret = 0;
2807
2808 out_unmap:
2809         memunmap(old_rt);
2810
2811         return ret;
2812 }
2813
2814 #ifdef CONFIG_INTEL_IOMMU_SVM
2815 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2816 {
2817         struct intel_iommu *iommu = data;
2818         ioasid_t ioasid;
2819
2820         if (!iommu)
2821                 return INVALID_IOASID;
2822         /*
2823          * VT-d virtual command interface always uses the full 20 bit
2824          * PASID range. Host can partition guest PASID range based on
2825          * policies but it is out of guest's control.
2826          */
2827         if (min < PASID_MIN || max > intel_pasid_max_id)
2828                 return INVALID_IOASID;
2829
2830         if (vcmd_alloc_pasid(iommu, &ioasid))
2831                 return INVALID_IOASID;
2832
2833         return ioasid;
2834 }
2835
2836 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2837 {
2838         struct intel_iommu *iommu = data;
2839
2840         if (!iommu)
2841                 return;
2842         /*
2843          * Sanity check the ioasid owner is done at upper layer, e.g. VFIO
2844          * We can only free the PASID when all the devices are unbound.
2845          */
2846         if (ioasid_find(NULL, ioasid, NULL)) {
2847                 pr_alert("Cannot free active IOASID %d\n", ioasid);
2848                 return;
2849         }
2850         vcmd_free_pasid(iommu, ioasid);
2851 }
2852
2853 static void register_pasid_allocator(struct intel_iommu *iommu)
2854 {
2855         /*
2856          * If we are running in the host, no need for custom allocator
2857          * in that PASIDs are allocated from the host system-wide.
2858          */
2859         if (!cap_caching_mode(iommu->cap))
2860                 return;
2861
2862         if (!sm_supported(iommu)) {
2863                 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2864                 return;
2865         }
2866
2867         /*
2868          * Register a custom PASID allocator if we are running in a guest,
2869          * guest PASID must be obtained via virtual command interface.
2870          * There can be multiple vIOMMUs in each guest but only one allocator
2871          * is active. All vIOMMU allocators will eventually be calling the same
2872          * host allocator.
2873          */
2874         if (!vccap_pasid(iommu->vccap))
2875                 return;
2876
2877         pr_info("Register custom PASID allocator\n");
2878         iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2879         iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2880         iommu->pasid_allocator.pdata = (void *)iommu;
2881         if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2882                 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2883                 /*
2884                  * Disable scalable mode on this IOMMU if there
2885                  * is no custom allocator. Mixing SM capable vIOMMU
2886                  * and non-SM vIOMMU are not supported.
2887                  */
2888                 intel_iommu_sm = 0;
2889         }
2890 }
2891 #endif
2892
2893 static int __init init_dmars(void)
2894 {
2895         struct dmar_drhd_unit *drhd;
2896         struct intel_iommu *iommu;
2897         int ret;
2898
2899         ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2900         if (ret)
2901                 goto free_iommu;
2902
2903         for_each_iommu(iommu, drhd) {
2904                 if (drhd->ignored) {
2905                         iommu_disable_translation(iommu);
2906                         continue;
2907                 }
2908
2909                 /*
2910                  * Find the max pasid size of all IOMMU's in the system.
2911                  * We need to ensure the system pasid table is no bigger
2912                  * than the smallest supported.
2913                  */
2914                 if (pasid_supported(iommu)) {
2915                         u32 temp = 2 << ecap_pss(iommu->ecap);
2916
2917                         intel_pasid_max_id = min_t(u32, temp,
2918                                                    intel_pasid_max_id);
2919                 }
2920
2921                 intel_iommu_init_qi(iommu);
2922
2923                 ret = iommu_init_domains(iommu);
2924                 if (ret)
2925                         goto free_iommu;
2926
2927                 init_translation_status(iommu);
2928
2929                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2930                         iommu_disable_translation(iommu);
2931                         clear_translation_pre_enabled(iommu);
2932                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2933                                 iommu->name);
2934                 }
2935
2936                 /*
2937                  * TBD:
2938                  * we could share the same root & context tables
2939                  * among all IOMMU's. Need to Split it later.
2940                  */
2941                 ret = iommu_alloc_root_entry(iommu);
2942                 if (ret)
2943                         goto free_iommu;
2944
2945                 if (translation_pre_enabled(iommu)) {
2946                         pr_info("Translation already enabled - trying to copy translation structures\n");
2947
2948                         ret = copy_translation_tables(iommu);
2949                         if (ret) {
2950                                 /*
2951                                  * We found the IOMMU with translation
2952                                  * enabled - but failed to copy over the
2953                                  * old root-entry table. Try to proceed
2954                                  * by disabling translation now and
2955                                  * allocating a clean root-entry table.
2956                                  * This might cause DMAR faults, but
2957                                  * probably the dump will still succeed.
2958                                  */
2959                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2960                                        iommu->name);
2961                                 iommu_disable_translation(iommu);
2962                                 clear_translation_pre_enabled(iommu);
2963                         } else {
2964                                 pr_info("Copied translation tables from previous kernel for %s\n",
2965                                         iommu->name);
2966                         }
2967                 }
2968
2969                 if (!ecap_pass_through(iommu->ecap))
2970                         hw_pass_through = 0;
2971                 intel_svm_check(iommu);
2972         }
2973
2974         /*
2975          * Now that qi is enabled on all iommus, set the root entry and flush
2976          * caches. This is required on some Intel X58 chipsets, otherwise the
2977          * flush_context function will loop forever and the boot hangs.
2978          */
2979         for_each_active_iommu(iommu, drhd) {
2980                 iommu_flush_write_buffer(iommu);
2981 #ifdef CONFIG_INTEL_IOMMU_SVM
2982                 register_pasid_allocator(iommu);
2983 #endif
2984                 iommu_set_root_entry(iommu);
2985         }
2986
2987 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2988         dmar_map_gfx = 0;
2989 #endif
2990
2991         if (!dmar_map_gfx)
2992                 iommu_identity_mapping |= IDENTMAP_GFX;
2993
2994         check_tylersburg_isoch();
2995
2996         ret = si_domain_init(hw_pass_through);
2997         if (ret)
2998                 goto free_iommu;
2999
3000         /*
3001          * for each drhd
3002          *   enable fault log
3003          *   global invalidate context cache
3004          *   global invalidate iotlb
3005          *   enable translation
3006          */
3007         for_each_iommu(iommu, drhd) {
3008                 if (drhd->ignored) {
3009                         /*
3010                          * we always have to disable PMRs or DMA may fail on
3011                          * this device
3012                          */
3013                         if (force_on)
3014                                 iommu_disable_protect_mem_regions(iommu);
3015                         continue;
3016                 }
3017
3018                 iommu_flush_write_buffer(iommu);
3019
3020 #ifdef CONFIG_INTEL_IOMMU_SVM
3021                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3022                         /*
3023                          * Call dmar_alloc_hwirq() with dmar_global_lock held,
3024                          * could cause possible lock race condition.
3025                          */
3026                         up_write(&dmar_global_lock);
3027                         ret = intel_svm_enable_prq(iommu);
3028                         down_write(&dmar_global_lock);
3029                         if (ret)
3030                                 goto free_iommu;
3031                 }
3032 #endif
3033                 ret = dmar_set_interrupt(iommu);
3034                 if (ret)
3035                         goto free_iommu;
3036         }
3037
3038         return 0;
3039
3040 free_iommu:
3041         for_each_active_iommu(iommu, drhd) {
3042                 disable_dmar_iommu(iommu);
3043                 free_dmar_iommu(iommu);
3044         }
3045
3046         return ret;
3047 }
3048
3049 static void __init init_no_remapping_devices(void)
3050 {
3051         struct dmar_drhd_unit *drhd;
3052         struct device *dev;
3053         int i;
3054
3055         for_each_drhd_unit(drhd) {
3056                 if (!drhd->include_all) {
3057                         for_each_active_dev_scope(drhd->devices,
3058                                                   drhd->devices_cnt, i, dev)
3059                                 break;
3060                         /* ignore DMAR unit if no devices exist */
3061                         if (i == drhd->devices_cnt)
3062                                 drhd->ignored = 1;
3063                 }
3064         }
3065
3066         for_each_active_drhd_unit(drhd) {
3067                 if (drhd->include_all)
3068                         continue;
3069
3070                 for_each_active_dev_scope(drhd->devices,
3071                                           drhd->devices_cnt, i, dev)
3072                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3073                                 break;
3074                 if (i < drhd->devices_cnt)
3075                         continue;
3076
3077                 /* This IOMMU has *only* gfx devices. Either bypass it or
3078                    set the gfx_mapped flag, as appropriate */
3079                 drhd->gfx_dedicated = 1;
3080                 if (!dmar_map_gfx)
3081                         drhd->ignored = 1;
3082         }
3083 }
3084
3085 #ifdef CONFIG_SUSPEND
3086 static int init_iommu_hw(void)
3087 {
3088         struct dmar_drhd_unit *drhd;
3089         struct intel_iommu *iommu = NULL;
3090
3091         for_each_active_iommu(iommu, drhd)
3092                 if (iommu->qi)
3093                         dmar_reenable_qi(iommu);
3094
3095         for_each_iommu(iommu, drhd) {
3096                 if (drhd->ignored) {
3097                         /*
3098                          * we always have to disable PMRs or DMA may fail on
3099                          * this device
3100                          */
3101                         if (force_on)
3102                                 iommu_disable_protect_mem_regions(iommu);
3103                         continue;
3104                 }
3105
3106                 iommu_flush_write_buffer(iommu);
3107                 iommu_set_root_entry(iommu);
3108                 iommu_enable_translation(iommu);
3109                 iommu_disable_protect_mem_regions(iommu);
3110         }
3111
3112         return 0;
3113 }
3114
3115 static void iommu_flush_all(void)
3116 {
3117         struct dmar_drhd_unit *drhd;
3118         struct intel_iommu *iommu;
3119
3120         for_each_active_iommu(iommu, drhd) {
3121                 iommu->flush.flush_context(iommu, 0, 0, 0,
3122                                            DMA_CCMD_GLOBAL_INVL);
3123                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3124                                          DMA_TLB_GLOBAL_FLUSH);
3125         }
3126 }
3127
3128 static int iommu_suspend(void)
3129 {
3130         struct dmar_drhd_unit *drhd;
3131         struct intel_iommu *iommu = NULL;
3132         unsigned long flag;
3133
3134         for_each_active_iommu(iommu, drhd) {
3135                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3136                                              GFP_KERNEL);
3137                 if (!iommu->iommu_state)
3138                         goto nomem;
3139         }
3140
3141         iommu_flush_all();
3142
3143         for_each_active_iommu(iommu, drhd) {
3144                 iommu_disable_translation(iommu);
3145
3146                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3147
3148                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3149                         readl(iommu->reg + DMAR_FECTL_REG);
3150                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3151                         readl(iommu->reg + DMAR_FEDATA_REG);
3152                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3153                         readl(iommu->reg + DMAR_FEADDR_REG);
3154                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3155                         readl(iommu->reg + DMAR_FEUADDR_REG);
3156
3157                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3158         }
3159         return 0;
3160
3161 nomem:
3162         for_each_active_iommu(iommu, drhd)
3163                 kfree(iommu->iommu_state);
3164
3165         return -ENOMEM;
3166 }
3167
3168 static void iommu_resume(void)
3169 {
3170         struct dmar_drhd_unit *drhd;
3171         struct intel_iommu *iommu = NULL;
3172         unsigned long flag;
3173
3174         if (init_iommu_hw()) {
3175                 if (force_on)
3176                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3177                 else
3178                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3179                 return;
3180         }
3181
3182         for_each_active_iommu(iommu, drhd) {
3183
3184                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3185
3186                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3187                         iommu->reg + DMAR_FECTL_REG);
3188                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3189                         iommu->reg + DMAR_FEDATA_REG);
3190                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3191                         iommu->reg + DMAR_FEADDR_REG);
3192                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3193                         iommu->reg + DMAR_FEUADDR_REG);
3194
3195                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3196         }
3197
3198         for_each_active_iommu(iommu, drhd)
3199                 kfree(iommu->iommu_state);
3200 }
3201
3202 static struct syscore_ops iommu_syscore_ops = {
3203         .resume         = iommu_resume,
3204         .suspend        = iommu_suspend,
3205 };
3206
3207 static void __init init_iommu_pm_ops(void)
3208 {
3209         register_syscore_ops(&iommu_syscore_ops);
3210 }
3211
3212 #else
3213 static inline void init_iommu_pm_ops(void) {}
3214 #endif  /* CONFIG_PM */
3215
3216 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3217 {
3218         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3219             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3220             rmrr->end_address <= rmrr->base_address ||
3221             arch_rmrr_sanity_check(rmrr))
3222                 return -EINVAL;
3223
3224         return 0;
3225 }
3226
3227 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3228 {
3229         struct acpi_dmar_reserved_memory *rmrr;
3230         struct dmar_rmrr_unit *rmrru;
3231
3232         rmrr = (struct acpi_dmar_reserved_memory *)header;
3233         if (rmrr_sanity_check(rmrr)) {
3234                 pr_warn(FW_BUG
3235                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3236                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3237                            rmrr->base_address, rmrr->end_address,
3238                            dmi_get_system_info(DMI_BIOS_VENDOR),
3239                            dmi_get_system_info(DMI_BIOS_VERSION),
3240                            dmi_get_system_info(DMI_PRODUCT_VERSION));
3241                 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3242         }
3243
3244         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3245         if (!rmrru)
3246                 goto out;
3247
3248         rmrru->hdr = header;
3249
3250         rmrru->base_address = rmrr->base_address;
3251         rmrru->end_address = rmrr->end_address;
3252
3253         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3254                                 ((void *)rmrr) + rmrr->header.length,
3255                                 &rmrru->devices_cnt);
3256         if (rmrru->devices_cnt && rmrru->devices == NULL)
3257                 goto free_rmrru;
3258
3259         list_add(&rmrru->list, &dmar_rmrr_units);
3260
3261         return 0;
3262 free_rmrru:
3263         kfree(rmrru);
3264 out:
3265         return -ENOMEM;
3266 }
3267
3268 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3269 {
3270         struct dmar_atsr_unit *atsru;
3271         struct acpi_dmar_atsr *tmp;
3272
3273         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3274                                 dmar_rcu_check()) {
3275                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3276                 if (atsr->segment != tmp->segment)
3277                         continue;
3278                 if (atsr->header.length != tmp->header.length)
3279                         continue;
3280                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3281                         return atsru;
3282         }
3283
3284         return NULL;
3285 }
3286
3287 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3288 {
3289         struct acpi_dmar_atsr *atsr;
3290         struct dmar_atsr_unit *atsru;
3291
3292         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3293                 return 0;
3294
3295         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3296         atsru = dmar_find_atsr(atsr);
3297         if (atsru)
3298                 return 0;
3299
3300         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3301         if (!atsru)
3302                 return -ENOMEM;
3303
3304         /*
3305          * If memory is allocated from slab by ACPI _DSM method, we need to
3306          * copy the memory content because the memory buffer will be freed
3307          * on return.
3308          */
3309         atsru->hdr = (void *)(atsru + 1);
3310         memcpy(atsru->hdr, hdr, hdr->length);
3311         atsru->include_all = atsr->flags & 0x1;
3312         if (!atsru->include_all) {
3313                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3314                                 (void *)atsr + atsr->header.length,
3315                                 &atsru->devices_cnt);
3316                 if (atsru->devices_cnt && atsru->devices == NULL) {
3317                         kfree(atsru);
3318                         return -ENOMEM;
3319                 }
3320         }
3321
3322         list_add_rcu(&atsru->list, &dmar_atsr_units);
3323
3324         return 0;
3325 }
3326
3327 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3328 {
3329         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3330         kfree(atsru);
3331 }
3332
3333 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3334 {
3335         struct acpi_dmar_atsr *atsr;
3336         struct dmar_atsr_unit *atsru;
3337
3338         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3339         atsru = dmar_find_atsr(atsr);
3340         if (atsru) {
3341                 list_del_rcu(&atsru->list);
3342                 synchronize_rcu();
3343                 intel_iommu_free_atsr(atsru);
3344         }
3345
3346         return 0;
3347 }
3348
3349 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3350 {
3351         int i;
3352         struct device *dev;
3353         struct acpi_dmar_atsr *atsr;
3354         struct dmar_atsr_unit *atsru;
3355
3356         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3357         atsru = dmar_find_atsr(atsr);
3358         if (!atsru)
3359                 return 0;
3360
3361         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3362                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3363                                           i, dev)
3364                         return -EBUSY;
3365         }
3366
3367         return 0;
3368 }
3369
3370 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3371 {
3372         struct dmar_satc_unit *satcu;
3373         struct acpi_dmar_satc *tmp;
3374
3375         list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3376                                 dmar_rcu_check()) {
3377                 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3378                 if (satc->segment != tmp->segment)
3379                         continue;
3380                 if (satc->header.length != tmp->header.length)
3381                         continue;
3382                 if (memcmp(satc, tmp, satc->header.length) == 0)
3383                         return satcu;
3384         }
3385
3386         return NULL;
3387 }
3388
3389 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3390 {
3391         struct acpi_dmar_satc *satc;
3392         struct dmar_satc_unit *satcu;
3393
3394         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3395                 return 0;
3396
3397         satc = container_of(hdr, struct acpi_dmar_satc, header);
3398         satcu = dmar_find_satc(satc);
3399         if (satcu)
3400                 return 0;
3401
3402         satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3403         if (!satcu)
3404                 return -ENOMEM;
3405
3406         satcu->hdr = (void *)(satcu + 1);
3407         memcpy(satcu->hdr, hdr, hdr->length);
3408         satcu->atc_required = satc->flags & 0x1;
3409         satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3410                                               (void *)satc + satc->header.length,
3411                                               &satcu->devices_cnt);
3412         if (satcu->devices_cnt && !satcu->devices) {
3413                 kfree(satcu);
3414                 return -ENOMEM;
3415         }
3416         list_add_rcu(&satcu->list, &dmar_satc_units);
3417
3418         return 0;
3419 }
3420
3421 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3422 {
3423         int sp, ret;
3424         struct intel_iommu *iommu = dmaru->iommu;
3425
3426         ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3427         if (ret)
3428                 goto out;
3429
3430         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3431                 pr_warn("%s: Doesn't support hardware pass through.\n",
3432                         iommu->name);
3433                 return -ENXIO;
3434         }
3435
3436         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3437         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3438                 pr_warn("%s: Doesn't support large page.\n",
3439                         iommu->name);
3440                 return -ENXIO;
3441         }
3442
3443         /*
3444          * Disable translation if already enabled prior to OS handover.
3445          */
3446         if (iommu->gcmd & DMA_GCMD_TE)
3447                 iommu_disable_translation(iommu);
3448
3449         ret = iommu_init_domains(iommu);
3450         if (ret == 0)
3451                 ret = iommu_alloc_root_entry(iommu);
3452         if (ret)
3453                 goto out;
3454
3455         intel_svm_check(iommu);
3456
3457         if (dmaru->ignored) {
3458                 /*
3459                  * we always have to disable PMRs or DMA may fail on this device
3460                  */
3461                 if (force_on)
3462                         iommu_disable_protect_mem_regions(iommu);
3463                 return 0;
3464         }
3465
3466         intel_iommu_init_qi(iommu);
3467         iommu_flush_write_buffer(iommu);
3468
3469 #ifdef CONFIG_INTEL_IOMMU_SVM
3470         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3471                 ret = intel_svm_enable_prq(iommu);
3472                 if (ret)
3473                         goto disable_iommu;
3474         }
3475 #endif
3476         ret = dmar_set_interrupt(iommu);
3477         if (ret)
3478                 goto disable_iommu;
3479
3480         iommu_set_root_entry(iommu);
3481         iommu_enable_translation(iommu);
3482
3483         iommu_disable_protect_mem_regions(iommu);
3484         return 0;
3485
3486 disable_iommu:
3487         disable_dmar_iommu(iommu);
3488 out:
3489         free_dmar_iommu(iommu);
3490         return ret;
3491 }
3492
3493 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3494 {
3495         int ret = 0;
3496         struct intel_iommu *iommu = dmaru->iommu;
3497
3498         if (!intel_iommu_enabled)
3499                 return 0;
3500         if (iommu == NULL)
3501                 return -EINVAL;
3502
3503         if (insert) {
3504                 ret = intel_iommu_add(dmaru);
3505         } else {
3506                 disable_dmar_iommu(iommu);
3507                 free_dmar_iommu(iommu);
3508         }
3509
3510         return ret;
3511 }
3512
3513 static void intel_iommu_free_dmars(void)
3514 {
3515         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3516         struct dmar_atsr_unit *atsru, *atsr_n;
3517         struct dmar_satc_unit *satcu, *satc_n;
3518
3519         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3520                 list_del(&rmrru->list);
3521                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3522                 kfree(rmrru);
3523         }
3524
3525         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3526                 list_del(&atsru->list);
3527                 intel_iommu_free_atsr(atsru);
3528         }
3529         list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3530                 list_del(&satcu->list);
3531                 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3532                 kfree(satcu);
3533         }
3534 }
3535
3536 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3537 {
3538         struct dmar_satc_unit *satcu;
3539         struct acpi_dmar_satc *satc;
3540         struct device *tmp;
3541         int i;
3542
3543         dev = pci_physfn(dev);
3544         rcu_read_lock();
3545
3546         list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3547                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3548                 if (satc->segment != pci_domain_nr(dev->bus))
3549                         continue;
3550                 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3551                         if (to_pci_dev(tmp) == dev)
3552                                 goto out;
3553         }
3554         satcu = NULL;
3555 out:
3556         rcu_read_unlock();
3557         return satcu;
3558 }
3559
3560 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3561 {
3562         int i, ret = 1;
3563         struct pci_bus *bus;
3564         struct pci_dev *bridge = NULL;
3565         struct device *tmp;
3566         struct acpi_dmar_atsr *atsr;
3567         struct dmar_atsr_unit *atsru;
3568         struct dmar_satc_unit *satcu;
3569
3570         dev = pci_physfn(dev);
3571         satcu = dmar_find_matched_satc_unit(dev);
3572         if (satcu)
3573                 /*
3574                  * This device supports ATS as it is in SATC table.
3575                  * When IOMMU is in legacy mode, enabling ATS is done
3576                  * automatically by HW for the device that requires
3577                  * ATS, hence OS should not enable this device ATS
3578                  * to avoid duplicated TLB invalidation.
3579                  */
3580                 return !(satcu->atc_required && !sm_supported(iommu));
3581
3582         for (bus = dev->bus; bus; bus = bus->parent) {
3583                 bridge = bus->self;
3584                 /* If it's an integrated device, allow ATS */
3585                 if (!bridge)
3586                         return 1;
3587                 /* Connected via non-PCIe: no ATS */
3588                 if (!pci_is_pcie(bridge) ||
3589                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3590                         return 0;
3591                 /* If we found the root port, look it up in the ATSR */
3592                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3593                         break;
3594         }
3595
3596         rcu_read_lock();
3597         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3598                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3599                 if (atsr->segment != pci_domain_nr(dev->bus))
3600                         continue;
3601
3602                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3603                         if (tmp == &bridge->dev)
3604                                 goto out;
3605
3606                 if (atsru->include_all)
3607                         goto out;
3608         }
3609         ret = 0;
3610 out:
3611         rcu_read_unlock();
3612
3613         return ret;
3614 }
3615
3616 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3617 {
3618         int ret;
3619         struct dmar_rmrr_unit *rmrru;
3620         struct dmar_atsr_unit *atsru;
3621         struct dmar_satc_unit *satcu;
3622         struct acpi_dmar_atsr *atsr;
3623         struct acpi_dmar_reserved_memory *rmrr;
3624         struct acpi_dmar_satc *satc;
3625
3626         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3627                 return 0;
3628
3629         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3630                 rmrr = container_of(rmrru->hdr,
3631                                     struct acpi_dmar_reserved_memory, header);
3632                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3633                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3634                                 ((void *)rmrr) + rmrr->header.length,
3635                                 rmrr->segment, rmrru->devices,
3636                                 rmrru->devices_cnt);
3637                         if (ret < 0)
3638                                 return ret;
3639                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3640                         dmar_remove_dev_scope(info, rmrr->segment,
3641                                 rmrru->devices, rmrru->devices_cnt);
3642                 }
3643         }
3644
3645         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3646                 if (atsru->include_all)
3647                         continue;
3648
3649                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3650                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3651                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3652                                         (void *)atsr + atsr->header.length,
3653                                         atsr->segment, atsru->devices,
3654                                         atsru->devices_cnt);
3655                         if (ret > 0)
3656                                 break;
3657                         else if (ret < 0)
3658                                 return ret;
3659                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3660                         if (dmar_remove_dev_scope(info, atsr->segment,
3661                                         atsru->devices, atsru->devices_cnt))
3662                                 break;
3663                 }
3664         }
3665         list_for_each_entry(satcu, &dmar_satc_units, list) {
3666                 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3667                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3668                         ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3669                                         (void *)satc + satc->header.length,
3670                                         satc->segment, satcu->devices,
3671                                         satcu->devices_cnt);
3672                         if (ret > 0)
3673                                 break;
3674                         else if (ret < 0)
3675                                 return ret;
3676                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3677                         if (dmar_remove_dev_scope(info, satc->segment,
3678                                         satcu->devices, satcu->devices_cnt))
3679                                 break;
3680                 }
3681         }
3682
3683         return 0;
3684 }
3685
3686 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3687                                        unsigned long val, void *v)
3688 {
3689         struct memory_notify *mhp = v;
3690         unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3691         unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3692                         mhp->nr_pages - 1);
3693
3694         switch (val) {
3695         case MEM_GOING_ONLINE:
3696                 if (iommu_domain_identity_map(si_domain,
3697                                               start_vpfn, last_vpfn)) {
3698                         pr_warn("Failed to build identity map for [%lx-%lx]\n",
3699                                 start_vpfn, last_vpfn);
3700                         return NOTIFY_BAD;
3701                 }
3702                 break;
3703
3704         case MEM_OFFLINE:
3705         case MEM_CANCEL_ONLINE:
3706                 {
3707                         struct dmar_drhd_unit *drhd;
3708                         struct intel_iommu *iommu;
3709                         LIST_HEAD(freelist);
3710
3711                         domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3712
3713                         rcu_read_lock();
3714                         for_each_active_iommu(iommu, drhd)
3715                                 iommu_flush_iotlb_psi(iommu, si_domain,
3716                                         start_vpfn, mhp->nr_pages,
3717                                         list_empty(&freelist), 0);
3718                         rcu_read_unlock();
3719                         put_pages_list(&freelist);
3720                 }
3721                 break;
3722         }
3723
3724         return NOTIFY_OK;
3725 }
3726
3727 static struct notifier_block intel_iommu_memory_nb = {
3728         .notifier_call = intel_iommu_memory_notifier,
3729         .priority = 0
3730 };
3731
3732 static void intel_disable_iommus(void)
3733 {
3734         struct intel_iommu *iommu = NULL;
3735         struct dmar_drhd_unit *drhd;
3736
3737         for_each_iommu(iommu, drhd)
3738                 iommu_disable_translation(iommu);
3739 }
3740
3741 void intel_iommu_shutdown(void)
3742 {
3743         struct dmar_drhd_unit *drhd;
3744         struct intel_iommu *iommu = NULL;
3745
3746         if (no_iommu || dmar_disabled)
3747                 return;
3748
3749         down_write(&dmar_global_lock);
3750
3751         /* Disable PMRs explicitly here. */
3752         for_each_iommu(iommu, drhd)
3753                 iommu_disable_protect_mem_regions(iommu);
3754
3755         /* Make sure the IOMMUs are switched off */
3756         intel_disable_iommus();
3757
3758         up_write(&dmar_global_lock);
3759 }
3760
3761 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3762 {
3763         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3764
3765         return container_of(iommu_dev, struct intel_iommu, iommu);
3766 }
3767
3768 static ssize_t version_show(struct device *dev,
3769                             struct device_attribute *attr, char *buf)
3770 {
3771         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3772         u32 ver = readl(iommu->reg + DMAR_VER_REG);
3773         return sprintf(buf, "%d:%d\n",
3774                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3775 }
3776 static DEVICE_ATTR_RO(version);
3777
3778 static ssize_t address_show(struct device *dev,
3779                             struct device_attribute *attr, char *buf)
3780 {
3781         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3782         return sprintf(buf, "%llx\n", iommu->reg_phys);
3783 }
3784 static DEVICE_ATTR_RO(address);
3785
3786 static ssize_t cap_show(struct device *dev,
3787                         struct device_attribute *attr, char *buf)
3788 {
3789         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3790         return sprintf(buf, "%llx\n", iommu->cap);
3791 }
3792 static DEVICE_ATTR_RO(cap);
3793
3794 static ssize_t ecap_show(struct device *dev,
3795                          struct device_attribute *attr, char *buf)
3796 {
3797         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3798         return sprintf(buf, "%llx\n", iommu->ecap);
3799 }
3800 static DEVICE_ATTR_RO(ecap);
3801
3802 static ssize_t domains_supported_show(struct device *dev,
3803                                       struct device_attribute *attr, char *buf)
3804 {
3805         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3806         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3807 }
3808 static DEVICE_ATTR_RO(domains_supported);
3809
3810 static ssize_t domains_used_show(struct device *dev,
3811                                  struct device_attribute *attr, char *buf)
3812 {
3813         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3814         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3815                                                   cap_ndoms(iommu->cap)));
3816 }
3817 static DEVICE_ATTR_RO(domains_used);
3818
3819 static struct attribute *intel_iommu_attrs[] = {
3820         &dev_attr_version.attr,
3821         &dev_attr_address.attr,
3822         &dev_attr_cap.attr,
3823         &dev_attr_ecap.attr,
3824         &dev_attr_domains_supported.attr,
3825         &dev_attr_domains_used.attr,
3826         NULL,
3827 };
3828
3829 static struct attribute_group intel_iommu_group = {
3830         .name = "intel-iommu",
3831         .attrs = intel_iommu_attrs,
3832 };
3833
3834 const struct attribute_group *intel_iommu_groups[] = {
3835         &intel_iommu_group,
3836         NULL,
3837 };
3838
3839 static inline bool has_external_pci(void)
3840 {
3841         struct pci_dev *pdev = NULL;
3842
3843         for_each_pci_dev(pdev)
3844                 if (pdev->external_facing)
3845                         return true;
3846
3847         return false;
3848 }
3849
3850 static int __init platform_optin_force_iommu(void)
3851 {
3852         if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3853                 return 0;
3854
3855         if (no_iommu || dmar_disabled)
3856                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3857
3858         /*
3859          * If Intel-IOMMU is disabled by default, we will apply identity
3860          * map for all devices except those marked as being untrusted.
3861          */
3862         if (dmar_disabled)
3863                 iommu_set_default_passthrough(false);
3864
3865         dmar_disabled = 0;
3866         no_iommu = 0;
3867
3868         return 1;
3869 }
3870
3871 static int __init probe_acpi_namespace_devices(void)
3872 {
3873         struct dmar_drhd_unit *drhd;
3874         /* To avoid a -Wunused-but-set-variable warning. */
3875         struct intel_iommu *iommu __maybe_unused;
3876         struct device *dev;
3877         int i, ret = 0;
3878
3879         for_each_active_iommu(iommu, drhd) {
3880                 for_each_active_dev_scope(drhd->devices,
3881                                           drhd->devices_cnt, i, dev) {
3882                         struct acpi_device_physical_node *pn;
3883                         struct iommu_group *group;
3884                         struct acpi_device *adev;
3885
3886                         if (dev->bus != &acpi_bus_type)
3887                                 continue;
3888
3889                         adev = to_acpi_device(dev);
3890                         mutex_lock(&adev->physical_node_lock);
3891                         list_for_each_entry(pn,
3892                                             &adev->physical_node_list, node) {
3893                                 group = iommu_group_get(pn->dev);
3894                                 if (group) {
3895                                         iommu_group_put(group);
3896                                         continue;
3897                                 }
3898
3899                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
3900                                 ret = iommu_probe_device(pn->dev);
3901                                 if (ret)
3902                                         break;
3903                         }
3904                         mutex_unlock(&adev->physical_node_lock);
3905
3906                         if (ret)
3907                                 return ret;
3908                 }
3909         }
3910
3911         return 0;
3912 }
3913
3914 static __init int tboot_force_iommu(void)
3915 {
3916         if (!tboot_enabled())
3917                 return 0;
3918
3919         if (no_iommu || dmar_disabled)
3920                 pr_warn("Forcing Intel-IOMMU to enabled\n");
3921
3922         dmar_disabled = 0;
3923         no_iommu = 0;
3924
3925         return 1;
3926 }
3927
3928 int __init intel_iommu_init(void)
3929 {
3930         int ret = -ENODEV;
3931         struct dmar_drhd_unit *drhd;
3932         struct intel_iommu *iommu;
3933
3934         /*
3935          * Intel IOMMU is required for a TXT/tboot launch or platform
3936          * opt in, so enforce that.
3937          */
3938         force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3939                     platform_optin_force_iommu();
3940
3941         down_write(&dmar_global_lock);
3942         if (dmar_table_init()) {
3943                 if (force_on)
3944                         panic("tboot: Failed to initialize DMAR table\n");
3945                 goto out_free_dmar;
3946         }
3947
3948         if (dmar_dev_scope_init() < 0) {
3949                 if (force_on)
3950                         panic("tboot: Failed to initialize DMAR device scope\n");
3951                 goto out_free_dmar;
3952         }
3953
3954         up_write(&dmar_global_lock);
3955
3956         /*
3957          * The bus notifier takes the dmar_global_lock, so lockdep will
3958          * complain later when we register it under the lock.
3959          */
3960         dmar_register_bus_notifier();
3961
3962         down_write(&dmar_global_lock);
3963
3964         if (!no_iommu)
3965                 intel_iommu_debugfs_init();
3966
3967         if (no_iommu || dmar_disabled) {
3968                 /*
3969                  * We exit the function here to ensure IOMMU's remapping and
3970                  * mempool aren't setup, which means that the IOMMU's PMRs
3971                  * won't be disabled via the call to init_dmars(). So disable
3972                  * it explicitly here. The PMRs were setup by tboot prior to
3973                  * calling SENTER, but the kernel is expected to reset/tear
3974                  * down the PMRs.
3975                  */
3976                 if (intel_iommu_tboot_noforce) {
3977                         for_each_iommu(iommu, drhd)
3978                                 iommu_disable_protect_mem_regions(iommu);
3979                 }
3980
3981                 /*
3982                  * Make sure the IOMMUs are switched off, even when we
3983                  * boot into a kexec kernel and the previous kernel left
3984                  * them enabled
3985                  */
3986                 intel_disable_iommus();
3987                 goto out_free_dmar;
3988         }
3989
3990         if (list_empty(&dmar_rmrr_units))
3991                 pr_info("No RMRR found\n");
3992
3993         if (list_empty(&dmar_atsr_units))
3994                 pr_info("No ATSR found\n");
3995
3996         if (list_empty(&dmar_satc_units))
3997                 pr_info("No SATC found\n");
3998
3999         init_no_remapping_devices();
4000
4001         ret = init_dmars();
4002         if (ret) {
4003                 if (force_on)
4004                         panic("tboot: Failed to initialize DMARs\n");
4005                 pr_err("Initialization failed\n");
4006                 goto out_free_dmar;
4007         }
4008         up_write(&dmar_global_lock);
4009
4010         init_iommu_pm_ops();
4011
4012         down_read(&dmar_global_lock);
4013         for_each_active_iommu(iommu, drhd) {
4014                 /*
4015                  * The flush queue implementation does not perform
4016                  * page-selective invalidations that are required for efficient
4017                  * TLB flushes in virtual environments.  The benefit of batching
4018                  * is likely to be much lower than the overhead of synchronizing
4019                  * the virtual and physical IOMMU page-tables.
4020                  */
4021                 if (cap_caching_mode(iommu->cap)) {
4022                         pr_info_once("IOMMU batching disallowed due to virtualization\n");
4023                         iommu_set_dma_strict();
4024                 }
4025                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4026                                        intel_iommu_groups,
4027                                        "%s", iommu->name);
4028                 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4029         }
4030         up_read(&dmar_global_lock);
4031
4032         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4033         if (si_domain && !hw_pass_through)
4034                 register_memory_notifier(&intel_iommu_memory_nb);
4035
4036         down_read(&dmar_global_lock);
4037         if (probe_acpi_namespace_devices())
4038                 pr_warn("ACPI name space devices didn't probe correctly\n");
4039
4040         /* Finally, we enable the DMA remapping hardware. */
4041         for_each_iommu(iommu, drhd) {
4042                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4043                         iommu_enable_translation(iommu);
4044
4045                 iommu_disable_protect_mem_regions(iommu);
4046         }
4047         up_read(&dmar_global_lock);
4048
4049         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4050
4051         intel_iommu_enabled = 1;
4052
4053         return 0;
4054
4055 out_free_dmar:
4056         intel_iommu_free_dmars();
4057         up_write(&dmar_global_lock);
4058         return ret;
4059 }
4060
4061 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4062 {
4063         struct device_domain_info *info = opaque;
4064
4065         domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4066         return 0;
4067 }
4068
4069 /*
4070  * NB - intel-iommu lacks any sort of reference counting for the users of
4071  * dependent devices.  If multiple endpoints have intersecting dependent
4072  * devices, unbinding the driver from any one of them will possibly leave
4073  * the others unable to operate.
4074  */
4075 static void domain_context_clear(struct device_domain_info *info)
4076 {
4077         if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4078                 return;
4079
4080         pci_for_each_dma_alias(to_pci_dev(info->dev),
4081                                &domain_context_clear_one_cb, info);
4082 }
4083
4084 static void dmar_remove_one_dev_info(struct device *dev)
4085 {
4086         struct device_domain_info *info = dev_iommu_priv_get(dev);
4087         struct dmar_domain *domain = info->domain;
4088         struct intel_iommu *iommu = info->iommu;
4089         unsigned long flags;
4090
4091         if (!dev_is_real_dma_subdevice(info->dev)) {
4092                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4093                         intel_pasid_tear_down_entry(iommu, info->dev,
4094                                         PASID_RID2PASID, false);
4095
4096                 iommu_disable_dev_iotlb(info);
4097                 domain_context_clear(info);
4098                 intel_pasid_free_table(info->dev);
4099         }
4100
4101         spin_lock_irqsave(&domain->lock, flags);
4102         list_del(&info->link);
4103         spin_unlock_irqrestore(&domain->lock, flags);
4104
4105         domain_detach_iommu(domain, iommu);
4106         info->domain = NULL;
4107 }
4108
4109 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4110 {
4111         int adjust_width;
4112
4113         /* calculate AGAW */
4114         domain->gaw = guest_width;
4115         adjust_width = guestwidth_to_adjustwidth(guest_width);
4116         domain->agaw = width_to_agaw(adjust_width);
4117
4118         domain->iommu_coherency = false;
4119         domain->iommu_superpage = 0;
4120         domain->max_addr = 0;
4121
4122         /* always allocate the top pgd */
4123         domain->pgd = alloc_pgtable_page(domain->nid);
4124         if (!domain->pgd)
4125                 return -ENOMEM;
4126         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4127         return 0;
4128 }
4129
4130 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4131 {
4132         struct dmar_domain *dmar_domain;
4133         struct iommu_domain *domain;
4134
4135         switch (type) {
4136         case IOMMU_DOMAIN_DMA:
4137         case IOMMU_DOMAIN_DMA_FQ:
4138         case IOMMU_DOMAIN_UNMANAGED:
4139                 dmar_domain = alloc_domain(type);
4140                 if (!dmar_domain) {
4141                         pr_err("Can't allocate dmar_domain\n");
4142                         return NULL;
4143                 }
4144                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4145                         pr_err("Domain initialization failed\n");
4146                         domain_exit(dmar_domain);
4147                         return NULL;
4148                 }
4149
4150                 domain = &dmar_domain->domain;
4151                 domain->geometry.aperture_start = 0;
4152                 domain->geometry.aperture_end   =
4153                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4154                 domain->geometry.force_aperture = true;
4155
4156                 return domain;
4157         case IOMMU_DOMAIN_IDENTITY:
4158                 return &si_domain->domain;
4159         default:
4160                 return NULL;
4161         }
4162
4163         return NULL;
4164 }
4165
4166 static void intel_iommu_domain_free(struct iommu_domain *domain)
4167 {
4168         if (domain != &si_domain->domain)
4169                 domain_exit(to_dmar_domain(domain));
4170 }
4171
4172 static int prepare_domain_attach_device(struct iommu_domain *domain,
4173                                         struct device *dev)
4174 {
4175         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4176         struct intel_iommu *iommu;
4177         int addr_width;
4178
4179         iommu = device_to_iommu(dev, NULL, NULL);
4180         if (!iommu)
4181                 return -ENODEV;
4182
4183         if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4184                 return -EOPNOTSUPP;
4185
4186         /* check if this iommu agaw is sufficient for max mapped address */
4187         addr_width = agaw_to_width(iommu->agaw);
4188         if (addr_width > cap_mgaw(iommu->cap))
4189                 addr_width = cap_mgaw(iommu->cap);
4190
4191         if (dmar_domain->max_addr > (1LL << addr_width)) {
4192                 dev_err(dev, "%s: iommu width (%d) is not "
4193                         "sufficient for the mapped address (%llx)\n",
4194                         __func__, addr_width, dmar_domain->max_addr);
4195                 return -EFAULT;
4196         }
4197         dmar_domain->gaw = addr_width;
4198
4199         /*
4200          * Knock out extra levels of page tables if necessary
4201          */
4202         while (iommu->agaw < dmar_domain->agaw) {
4203                 struct dma_pte *pte;
4204
4205                 pte = dmar_domain->pgd;
4206                 if (dma_pte_present(pte)) {
4207                         dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4208                         free_pgtable_page(pte);
4209                 }
4210                 dmar_domain->agaw--;
4211         }
4212
4213         return 0;
4214 }
4215
4216 static int intel_iommu_attach_device(struct iommu_domain *domain,
4217                                      struct device *dev)
4218 {
4219         int ret;
4220
4221         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4222             device_is_rmrr_locked(dev)) {
4223                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4224                 return -EPERM;
4225         }
4226
4227         /* normally dev is not mapped */
4228         if (unlikely(domain_context_mapped(dev))) {
4229                 struct device_domain_info *info = dev_iommu_priv_get(dev);
4230
4231                 if (info->domain)
4232                         dmar_remove_one_dev_info(dev);
4233         }
4234
4235         ret = prepare_domain_attach_device(domain, dev);
4236         if (ret)
4237                 return ret;
4238
4239         return domain_add_dev_info(to_dmar_domain(domain), dev);
4240 }
4241
4242 static void intel_iommu_detach_device(struct iommu_domain *domain,
4243                                       struct device *dev)
4244 {
4245         dmar_remove_one_dev_info(dev);
4246 }
4247
4248 static int intel_iommu_map(struct iommu_domain *domain,
4249                            unsigned long iova, phys_addr_t hpa,
4250                            size_t size, int iommu_prot, gfp_t gfp)
4251 {
4252         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4253         u64 max_addr;
4254         int prot = 0;
4255
4256         if (iommu_prot & IOMMU_READ)
4257                 prot |= DMA_PTE_READ;
4258         if (iommu_prot & IOMMU_WRITE)
4259                 prot |= DMA_PTE_WRITE;
4260         if (dmar_domain->set_pte_snp)
4261                 prot |= DMA_PTE_SNP;
4262
4263         max_addr = iova + size;
4264         if (dmar_domain->max_addr < max_addr) {
4265                 u64 end;
4266
4267                 /* check if minimum agaw is sufficient for mapped address */
4268                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4269                 if (end < max_addr) {
4270                         pr_err("%s: iommu width (%d) is not "
4271                                "sufficient for the mapped address (%llx)\n",
4272                                __func__, dmar_domain->gaw, max_addr);
4273                         return -EFAULT;
4274                 }
4275                 dmar_domain->max_addr = max_addr;
4276         }
4277         /* Round up size to next multiple of PAGE_SIZE, if it and
4278            the low bits of hpa would take us onto the next page */
4279         size = aligned_nrpages(hpa, size);
4280         return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4281                                 hpa >> VTD_PAGE_SHIFT, size, prot);
4282 }
4283
4284 static int intel_iommu_map_pages(struct iommu_domain *domain,
4285                                  unsigned long iova, phys_addr_t paddr,
4286                                  size_t pgsize, size_t pgcount,
4287                                  int prot, gfp_t gfp, size_t *mapped)
4288 {
4289         unsigned long pgshift = __ffs(pgsize);
4290         size_t size = pgcount << pgshift;
4291         int ret;
4292
4293         if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4294                 return -EINVAL;
4295
4296         if (!IS_ALIGNED(iova | paddr, pgsize))
4297                 return -EINVAL;
4298
4299         ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4300         if (!ret && mapped)
4301                 *mapped = size;
4302
4303         return ret;
4304 }
4305
4306 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4307                                 unsigned long iova, size_t size,
4308                                 struct iommu_iotlb_gather *gather)
4309 {
4310         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4311         unsigned long start_pfn, last_pfn;
4312         int level = 0;
4313
4314         /* Cope with horrid API which requires us to unmap more than the
4315            size argument if it happens to be a large-page mapping. */
4316         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4317
4318         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4319                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4320
4321         start_pfn = iova >> VTD_PAGE_SHIFT;
4322         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4323
4324         domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4325
4326         if (dmar_domain->max_addr == iova + size)
4327                 dmar_domain->max_addr = iova;
4328
4329         iommu_iotlb_gather_add_page(domain, gather, iova, size);
4330
4331         return size;
4332 }
4333
4334 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4335                                       unsigned long iova,
4336                                       size_t pgsize, size_t pgcount,
4337                                       struct iommu_iotlb_gather *gather)
4338 {
4339         unsigned long pgshift = __ffs(pgsize);
4340         size_t size = pgcount << pgshift;
4341
4342         return intel_iommu_unmap(domain, iova, size, gather);
4343 }
4344
4345 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4346                                  struct iommu_iotlb_gather *gather)
4347 {
4348         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4349         unsigned long iova_pfn = IOVA_PFN(gather->start);
4350         size_t size = gather->end - gather->start;
4351         struct iommu_domain_info *info;
4352         unsigned long start_pfn;
4353         unsigned long nrpages;
4354         unsigned long i;
4355
4356         nrpages = aligned_nrpages(gather->start, size);
4357         start_pfn = mm_to_dma_pfn(iova_pfn);
4358
4359         xa_for_each(&dmar_domain->iommu_array, i, info)
4360                 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4361                                       start_pfn, nrpages,
4362                                       list_empty(&gather->freelist), 0);
4363
4364         put_pages_list(&gather->freelist);
4365 }
4366
4367 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4368                                             dma_addr_t iova)
4369 {
4370         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4371         struct dma_pte *pte;
4372         int level = 0;
4373         u64 phys = 0;
4374
4375         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4376         if (pte && dma_pte_present(pte))
4377                 phys = dma_pte_addr(pte) +
4378                         (iova & (BIT_MASK(level_to_offset_bits(level) +
4379                                                 VTD_PAGE_SHIFT) - 1));
4380
4381         return phys;
4382 }
4383
4384 static bool domain_support_force_snooping(struct dmar_domain *domain)
4385 {
4386         struct device_domain_info *info;
4387         bool support = true;
4388
4389         assert_spin_locked(&domain->lock);
4390         list_for_each_entry(info, &domain->devices, link) {
4391                 if (!ecap_sc_support(info->iommu->ecap)) {
4392                         support = false;
4393                         break;
4394                 }
4395         }
4396
4397         return support;
4398 }
4399
4400 static void domain_set_force_snooping(struct dmar_domain *domain)
4401 {
4402         struct device_domain_info *info;
4403
4404         assert_spin_locked(&domain->lock);
4405         /*
4406          * Second level page table supports per-PTE snoop control. The
4407          * iommu_map() interface will handle this by setting SNP bit.
4408          */
4409         if (!domain_use_first_level(domain)) {
4410                 domain->set_pte_snp = true;
4411                 return;
4412         }
4413
4414         list_for_each_entry(info, &domain->devices, link)
4415                 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4416                                                      PASID_RID2PASID);
4417 }
4418
4419 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4420 {
4421         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4422         unsigned long flags;
4423
4424         if (dmar_domain->force_snooping)
4425                 return true;
4426
4427         spin_lock_irqsave(&dmar_domain->lock, flags);
4428         if (!domain_support_force_snooping(dmar_domain)) {
4429                 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4430                 return false;
4431         }
4432
4433         domain_set_force_snooping(dmar_domain);
4434         dmar_domain->force_snooping = true;
4435         spin_unlock_irqrestore(&dmar_domain->lock, flags);
4436
4437         return true;
4438 }
4439
4440 static bool intel_iommu_capable(enum iommu_cap cap)
4441 {
4442         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4443                 return true;
4444         if (cap == IOMMU_CAP_INTR_REMAP)
4445                 return irq_remapping_enabled == 1;
4446         if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4447                 return dmar_platform_optin();
4448
4449         return false;
4450 }
4451
4452 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4453 {
4454         struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4455         struct device_domain_info *info;
4456         struct intel_iommu *iommu;
4457         u8 bus, devfn;
4458
4459         iommu = device_to_iommu(dev, &bus, &devfn);
4460         if (!iommu)
4461                 return ERR_PTR(-ENODEV);
4462
4463         info = kzalloc(sizeof(*info), GFP_KERNEL);
4464         if (!info)
4465                 return ERR_PTR(-ENOMEM);
4466
4467         if (dev_is_real_dma_subdevice(dev)) {
4468                 info->bus = pdev->bus->number;
4469                 info->devfn = pdev->devfn;
4470                 info->segment = pci_domain_nr(pdev->bus);
4471         } else {
4472                 info->bus = bus;
4473                 info->devfn = devfn;
4474                 info->segment = iommu->segment;
4475         }
4476
4477         info->dev = dev;
4478         info->iommu = iommu;
4479         if (dev_is_pci(dev)) {
4480                 if (ecap_dev_iotlb_support(iommu->ecap) &&
4481                     pci_ats_supported(pdev) &&
4482                     dmar_ats_supported(pdev, iommu))
4483                         info->ats_supported = 1;
4484
4485                 if (sm_supported(iommu)) {
4486                         if (pasid_supported(iommu)) {
4487                                 int features = pci_pasid_features(pdev);
4488
4489                                 if (features >= 0)
4490                                         info->pasid_supported = features | 1;
4491                         }
4492
4493                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
4494                             pci_pri_supported(pdev))
4495                                 info->pri_supported = 1;
4496                 }
4497         }
4498
4499         dev_iommu_priv_set(dev, info);
4500
4501         return &iommu->iommu;
4502 }
4503
4504 static void intel_iommu_release_device(struct device *dev)
4505 {
4506         struct device_domain_info *info = dev_iommu_priv_get(dev);
4507
4508         dmar_remove_one_dev_info(dev);
4509         dev_iommu_priv_set(dev, NULL);
4510         kfree(info);
4511         set_dma_ops(dev, NULL);
4512 }
4513
4514 static void intel_iommu_probe_finalize(struct device *dev)
4515 {
4516         set_dma_ops(dev, NULL);
4517         iommu_setup_dma_ops(dev, 0, U64_MAX);
4518 }
4519
4520 static void intel_iommu_get_resv_regions(struct device *device,
4521                                          struct list_head *head)
4522 {
4523         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4524         struct iommu_resv_region *reg;
4525         struct dmar_rmrr_unit *rmrr;
4526         struct device *i_dev;
4527         int i;
4528
4529         down_read(&dmar_global_lock);
4530         for_each_rmrr_units(rmrr) {
4531                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4532                                           i, i_dev) {
4533                         struct iommu_resv_region *resv;
4534                         enum iommu_resv_type type;
4535                         size_t length;
4536
4537                         if (i_dev != device &&
4538                             !is_downstream_to_pci_bridge(device, i_dev))
4539                                 continue;
4540
4541                         length = rmrr->end_address - rmrr->base_address + 1;
4542
4543                         type = device_rmrr_is_relaxable(device) ?
4544                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4545
4546                         resv = iommu_alloc_resv_region(rmrr->base_address,
4547                                                        length, prot, type);
4548                         if (!resv)
4549                                 break;
4550
4551                         list_add_tail(&resv->list, head);
4552                 }
4553         }
4554         up_read(&dmar_global_lock);
4555
4556 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4557         if (dev_is_pci(device)) {
4558                 struct pci_dev *pdev = to_pci_dev(device);
4559
4560                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4561                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4562                                                    IOMMU_RESV_DIRECT_RELAXABLE);
4563                         if (reg)
4564                                 list_add_tail(&reg->list, head);
4565                 }
4566         }
4567 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4568
4569         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4570                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4571                                       0, IOMMU_RESV_MSI);
4572         if (!reg)
4573                 return;
4574         list_add_tail(&reg->list, head);
4575 }
4576
4577 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
4578 {
4579         struct device_domain_info *info = dev_iommu_priv_get(dev);
4580         struct context_entry *context;
4581         struct dmar_domain *domain;
4582         u64 ctx_lo;
4583         int ret;
4584
4585         domain = info->domain;
4586         if (!domain)
4587                 return -EINVAL;
4588
4589         spin_lock(&iommu->lock);
4590         ret = -EINVAL;
4591         if (!info->pasid_supported)
4592                 goto out;
4593
4594         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
4595         if (WARN_ON(!context))
4596                 goto out;
4597
4598         ctx_lo = context[0].lo;
4599
4600         if (!(ctx_lo & CONTEXT_PASIDE)) {
4601                 ctx_lo |= CONTEXT_PASIDE;
4602                 context[0].lo = ctx_lo;
4603                 wmb();
4604                 iommu->flush.flush_context(iommu,
4605                                            domain_id_iommu(domain, iommu),
4606                                            PCI_DEVID(info->bus, info->devfn),
4607                                            DMA_CCMD_MASK_NOBIT,
4608                                            DMA_CCMD_DEVICE_INVL);
4609         }
4610
4611         /* Enable PASID support in the device, if it wasn't already */
4612         if (!info->pasid_enabled)
4613                 iommu_enable_dev_iotlb(info);
4614
4615         ret = 0;
4616
4617  out:
4618         spin_unlock(&iommu->lock);
4619
4620         return ret;
4621 }
4622
4623 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4624 {
4625         if (dev_is_pci(dev))
4626                 return pci_device_group(dev);
4627         return generic_device_group(dev);
4628 }
4629
4630 static int intel_iommu_enable_sva(struct device *dev)
4631 {
4632         struct device_domain_info *info = dev_iommu_priv_get(dev);
4633         struct intel_iommu *iommu;
4634         int ret;
4635
4636         if (!info || dmar_disabled)
4637                 return -EINVAL;
4638
4639         iommu = info->iommu;
4640         if (!iommu)
4641                 return -EINVAL;
4642
4643         if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4644                 return -ENODEV;
4645
4646         if (intel_iommu_enable_pasid(iommu, dev))
4647                 return -ENODEV;
4648
4649         if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4650                 return -EINVAL;
4651
4652         ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4653         if (!ret)
4654                 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4655
4656         return ret;
4657 }
4658
4659 static int intel_iommu_disable_sva(struct device *dev)
4660 {
4661         struct device_domain_info *info = dev_iommu_priv_get(dev);
4662         struct intel_iommu *iommu = info->iommu;
4663         int ret;
4664
4665         ret = iommu_unregister_device_fault_handler(dev);
4666         if (!ret)
4667                 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4668
4669         return ret;
4670 }
4671
4672 static int intel_iommu_enable_iopf(struct device *dev)
4673 {
4674         struct device_domain_info *info = dev_iommu_priv_get(dev);
4675
4676         if (info && info->pri_supported)
4677                 return 0;
4678
4679         return -ENODEV;
4680 }
4681
4682 static int
4683 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4684 {
4685         switch (feat) {
4686         case IOMMU_DEV_FEAT_IOPF:
4687                 return intel_iommu_enable_iopf(dev);
4688
4689         case IOMMU_DEV_FEAT_SVA:
4690                 return intel_iommu_enable_sva(dev);
4691
4692         default:
4693                 return -ENODEV;
4694         }
4695 }
4696
4697 static int
4698 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4699 {
4700         switch (feat) {
4701         case IOMMU_DEV_FEAT_IOPF:
4702                 return 0;
4703
4704         case IOMMU_DEV_FEAT_SVA:
4705                 return intel_iommu_disable_sva(dev);
4706
4707         default:
4708                 return -ENODEV;
4709         }
4710 }
4711
4712 static bool intel_iommu_is_attach_deferred(struct device *dev)
4713 {
4714         struct device_domain_info *info = dev_iommu_priv_get(dev);
4715
4716         return translation_pre_enabled(info->iommu) && !info->domain;
4717 }
4718
4719 /*
4720  * Check that the device does not live on an external facing PCI port that is
4721  * marked as untrusted. Such devices should not be able to apply quirks and
4722  * thus not be able to bypass the IOMMU restrictions.
4723  */
4724 static bool risky_device(struct pci_dev *pdev)
4725 {
4726         if (pdev->untrusted) {
4727                 pci_info(pdev,
4728                          "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4729                          pdev->vendor, pdev->device);
4730                 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4731                 return true;
4732         }
4733         return false;
4734 }
4735
4736 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4737                                        unsigned long iova, size_t size)
4738 {
4739         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4740         unsigned long pages = aligned_nrpages(iova, size);
4741         unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4742         struct iommu_domain_info *info;
4743         unsigned long i;
4744
4745         xa_for_each(&dmar_domain->iommu_array, i, info)
4746                 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4747 }
4748
4749 const struct iommu_ops intel_iommu_ops = {
4750         .capable                = intel_iommu_capable,
4751         .domain_alloc           = intel_iommu_domain_alloc,
4752         .probe_device           = intel_iommu_probe_device,
4753         .probe_finalize         = intel_iommu_probe_finalize,
4754         .release_device         = intel_iommu_release_device,
4755         .get_resv_regions       = intel_iommu_get_resv_regions,
4756         .device_group           = intel_iommu_device_group,
4757         .dev_enable_feat        = intel_iommu_dev_enable_feat,
4758         .dev_disable_feat       = intel_iommu_dev_disable_feat,
4759         .is_attach_deferred     = intel_iommu_is_attach_deferred,
4760         .def_domain_type        = device_def_domain_type,
4761         .pgsize_bitmap          = SZ_4K,
4762 #ifdef CONFIG_INTEL_IOMMU_SVM
4763         .sva_bind               = intel_svm_bind,
4764         .sva_unbind             = intel_svm_unbind,
4765         .sva_get_pasid          = intel_svm_get_pasid,
4766         .page_response          = intel_svm_page_response,
4767 #endif
4768         .default_domain_ops = &(const struct iommu_domain_ops) {
4769                 .attach_dev             = intel_iommu_attach_device,
4770                 .detach_dev             = intel_iommu_detach_device,
4771                 .map_pages              = intel_iommu_map_pages,
4772                 .unmap_pages            = intel_iommu_unmap_pages,
4773                 .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
4774                 .flush_iotlb_all        = intel_flush_iotlb_all,
4775                 .iotlb_sync             = intel_iommu_tlb_sync,
4776                 .iova_to_phys           = intel_iommu_iova_to_phys,
4777                 .free                   = intel_iommu_domain_free,
4778                 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4779         }
4780 };
4781
4782 static void quirk_iommu_igfx(struct pci_dev *dev)
4783 {
4784         if (risky_device(dev))
4785                 return;
4786
4787         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4788         dmar_map_gfx = 0;
4789 }
4790
4791 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4799
4800 /* Broadwell igfx malfunctions with dmar */
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4811 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4812 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4813 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4814 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4815 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4816 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4817 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4818 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4819 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4820 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4821 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4822 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4823 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4824 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4825
4826 static void quirk_iommu_rwbf(struct pci_dev *dev)
4827 {
4828         if (risky_device(dev))
4829                 return;
4830
4831         /*
4832          * Mobile 4 Series Chipset neglects to set RWBF capability,
4833          * but needs it. Same seems to hold for the desktop versions.
4834          */
4835         pci_info(dev, "Forcing write-buffer flush capability\n");
4836         rwbf_quirk = 1;
4837 }
4838
4839 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4840 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4841 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4842 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4843 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4844 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4845 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4846
4847 #define GGC 0x52
4848 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4849 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4850 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4851 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4852 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4853 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4854 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4855 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4856
4857 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4858 {
4859         unsigned short ggc;
4860
4861         if (risky_device(dev))
4862                 return;
4863
4864         if (pci_read_config_word(dev, GGC, &ggc))
4865                 return;
4866
4867         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4868                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4869                 dmar_map_gfx = 0;
4870         } else if (dmar_map_gfx) {
4871                 /* we have to ensure the gfx device is idle before we flush */
4872                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4873                 iommu_set_dma_strict();
4874         }
4875 }
4876 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4877 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4878 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4879 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4880
4881 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4882 {
4883         unsigned short ver;
4884
4885         if (!IS_GFX_DEVICE(dev))
4886                 return;
4887
4888         ver = (dev->device >> 8) & 0xff;
4889         if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4890             ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4891             ver != 0x9a && ver != 0xa7)
4892                 return;
4893
4894         if (risky_device(dev))
4895                 return;
4896
4897         pci_info(dev, "Skip IOMMU disabling for graphics\n");
4898         iommu_skip_te_disable = 1;
4899 }
4900 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4901
4902 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4903    ISOCH DMAR unit for the Azalia sound device, but not give it any
4904    TLB entries, which causes it to deadlock. Check for that.  We do
4905    this in a function called from init_dmars(), instead of in a PCI
4906    quirk, because we don't want to print the obnoxious "BIOS broken"
4907    message if VT-d is actually disabled.
4908 */
4909 static void __init check_tylersburg_isoch(void)
4910 {
4911         struct pci_dev *pdev;
4912         uint32_t vtisochctrl;
4913
4914         /* If there's no Azalia in the system anyway, forget it. */
4915         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4916         if (!pdev)
4917                 return;
4918
4919         if (risky_device(pdev)) {
4920                 pci_dev_put(pdev);
4921                 return;
4922         }
4923
4924         pci_dev_put(pdev);
4925
4926         /* System Management Registers. Might be hidden, in which case
4927            we can't do the sanity check. But that's OK, because the
4928            known-broken BIOSes _don't_ actually hide it, so far. */
4929         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4930         if (!pdev)
4931                 return;
4932
4933         if (risky_device(pdev)) {
4934                 pci_dev_put(pdev);
4935                 return;
4936         }
4937
4938         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4939                 pci_dev_put(pdev);
4940                 return;
4941         }
4942
4943         pci_dev_put(pdev);
4944
4945         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4946         if (vtisochctrl & 1)
4947                 return;
4948
4949         /* Drop all bits other than the number of TLB entries */
4950         vtisochctrl &= 0x1c;
4951
4952         /* If we have the recommended number of TLB entries (16), fine. */
4953         if (vtisochctrl == 0x10)
4954                 return;
4955
4956         /* Zero TLB entries? You get to ride the short bus to school. */
4957         if (!vtisochctrl) {
4958                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4959                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4960                      dmi_get_system_info(DMI_BIOS_VENDOR),
4961                      dmi_get_system_info(DMI_BIOS_VERSION),
4962                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4963                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4964                 return;
4965         }
4966
4967         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4968                vtisochctrl);
4969 }